library(data.table)
options(datatable.print.class = TRUE)
37 Reshape
37.1 Long to wide using dcast()
37.1.1 Example 1: key-value pairs
Using the same example seen in Chapter 26:
dat_long <- data.frame(
Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004),
Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671),
Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED"),
Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
"Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
"Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"),
Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238)
)
dat_long
Account_ID Age Admission Lab_key Lab_value
1 8001 67.80170 ED RBC 4.634493
2 8002 42.91985 Planned RBC 3.349686
3 8003 46.23018 Planned RBC 4.270372
4 8004 39.66598 ED RBC 4.938977
5 8001 67.80170 ED WBC 8374.228878
6 8002 42.91985 Planned WBC 7612.373805
7 8003 46.23018 Planned WBC 8759.278555
8 8004 39.66598 ED WBC 6972.280962
9 8001 67.80170 ED Hematocrit 36.272693
10 8002 42.91985 Planned Hematocrit 40.571632
11 8003 46.23018 Planned Hematocrit 39.988862
12 8004 39.66598 ED Hematocrit 39.878688
13 8001 67.80170 ED Hemoglobin 12.618844
14 8002 42.91985 Planned Hemoglobin 12.173975
15 8003 46.23018 Planned Hemoglobin 15.129343
16 8004 39.66598 ED Hemoglobin 14.888570
data.table
’s long to wide procedure is defined with a convenient formula notation:
dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
Account_ID + Age + Admission ~ Lab_key,
value.var = "Lab_value")
dat_long2wide_dt
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
Instead of listing all variables you can use ...
, which corresponds to all variables not otherwise mentioned in the formula or in value.var
:
dcast(dat_long_dt,
... ~ Lab_key,
value.var = "Lab_value")
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
37.1.2 Example 2: Timepoints
Another simple synthetic dataset:
dt_long <- data.table(
ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L),
Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B",
"Timepoint_C"), class = "factor"),
Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dcast(dt_long, ID ~ Timepoint,
value.var = "Score")
Key: <ID>
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
37.1.3 dcast()
+ aggregate
If your ID ~ Timepoint
combination does not define a unique row in your input dataset, you need to specify an aggregate function.
For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.
dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
Var1 = rnorm(24, mean = 10),
Var2 = rnorm(24, mean = 20))
dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2
ID Timepoint Var1 Var2
<char> <char> <num> <num>
1: A AM 9.095061 20.55069
2: A AM 10.024434 NA
3: A AM 10.280247 19.98988
4: A PM NA 20.32035
5: A PM 11.505311 19.72577
6: A PM 10.045226 20.43301
7: B AM 10.258153 19.77664
8: B AM 9.781122 NA
9: B AM 10.901976 19.13577
10: B PM NA 20.83391
11: B PM 10.208148 21.79704
12: B PM 10.442472 20.17209
13: C AM 9.835870 17.95787
14: C AM 9.952885 19.22704
15: C AM 11.462364 19.60074
16: C PM NA 20.48730
17: C PM 7.973092 15.95908
18: C PM 10.835813 19.39142
19: D AM 8.211689 20.02134
20: D AM 11.016285 NA
21: D AM NA 19.64777
22: D PM 10.905356 18.92603
23: D PM 9.431838 NA
24: D PM 8.143880 20.31228
ID Timepoint Var1 Var2
If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate
argument:
Key: <ID>
ID Var1_AM Var1_PM Var2_AM Var2_PM
<char> <num> <num> <num> <num>
1: A 9.799914 10.775268 20.27029 20.15971
2: B 10.313751 10.325310 19.45621 20.93434
3: C 10.417040 9.404453 18.92855 18.61260
4: D 9.613987 9.493691 19.83456 19.61916
You can apply multiple aggregating functions by passing a list to fun.aggregate
:
dcast(dt_long2,
ID ~ Timepoint,
value.var = c("Var1", "Var2"),
fun.aggregate = list(mean, max, min), na.rm = TRUE)
Key: <ID>
ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
<char> <num> <num> <num> <num> <num>
1: A 9.799914 10.775268 20.27029 20.15971 10.28025
2: B 10.313751 10.325310 19.45621 20.93434 10.90198
3: C 10.417040 9.404453 18.92855 18.61260 11.46236
4: D 9.613987 9.493691 19.83456 19.61916 11.01628
Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
<num> <num> <num> <num> <num> <num>
1: 11.50531 20.55069 20.43301 9.095061 10.045226 19.98988
2: 10.44247 19.77664 21.79704 9.781122 10.208148 19.13577
3: 10.83581 19.60074 20.48730 9.835870 7.973092 17.95787
4: 10.90536 20.02134 20.31228 8.211689 8.143880 19.64777
Var2_min_PM
<num>
1: 19.72577
2: 20.17209
3: 15.95908
4: 18.92603
Note how na.rm = TRUE
was successfully applied to all aggregating functions
37.2 Wide to long: melt()
37.2.1 Example 1
dt_wide <- data.table(
ID = 1:4,
Timepoint_A = 11:14,
Timepoint_B = 21:24,
Timepoint_C = 51:54)
dt_wide
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
dt_long <- melt(
dt_wide,
id.vars = "ID",
measure.vars = 2:4, # defaults to all non-id columns
variable.name = "Timepoint",
value.name = c("Score"))
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
37.2.2 Example 2
Using the same synthetic data as in Chapter 26:
dat_wide_dt <- data.table(
Account_ID = c(8001, 8002, 8003, 8004),
Age = rnorm(4, mean = 57, sd = 12),
RBC = rnorm(4, mean = 4.8, sd = 0.5),
WBC = rnorm(4, mean = 7250, sd = 1500),
Hematocrit = rnorm(4, mean = 40.2, sd = 4),
Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt
Account_ID Age RBC WBC Hematocrit Hemoglobin Admission
<num> <num> <num> <num> <num> <num> <char>
1: 8001 58.45752 4.675691 7776.085 35.61951 13.76775 Planned
2: 8002 38.29222 5.344828 8158.330 41.50580 13.91031 Planned
3: 8003 60.75944 4.482781 7182.376 42.62882 13.85600 ED
4: 8004 63.58479 5.968185 5453.067 44.84574 13.89564 ED
dat_wide2long_dt <- melt(dat_wide_dt,
id.vars = c(1:2, 7),
measure.vars = 3:6,
variable.name = "Lab_key",
value.name = "Lab_value")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 58.45752 Planned RBC 4.675691
2: 8002 38.29222 Planned RBC 5.344828
3: 8003 60.75944 ED RBC 4.482781
4: 8004 63.58479 ED RBC 5.968185
5: 8001 58.45752 Planned WBC 7776.085457
6: 8002 38.29222 Planned WBC 8158.330363
7: 8003 60.75944 ED WBC 7182.375695
8: 8004 63.58479 ED WBC 5453.066686
9: 8001 58.45752 Planned Hematocrit 35.619509
10: 8002 38.29222 Planned Hematocrit 41.505801
11: 8003 60.75944 ED Hematocrit 42.628816
12: 8004 63.58479 ED Hematocrit 44.845745
13: 8001 58.45752 Planned Hemoglobin 13.767750
14: 8002 38.29222 Planned Hemoglobin 13.910312
15: 8003 60.75944 ED Hemoglobin 13.856005
16: 8004 63.58479 ED Hemoglobin 13.895635
If desired, you can set the ID column as they key, which will sort the data.table by its values:
setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 58.45752 Planned RBC 4.675691
2: 8001 58.45752 Planned WBC 7776.085457
3: 8001 58.45752 Planned Hematocrit 35.619509
4: 8001 58.45752 Planned Hemoglobin 13.767750
5: 8002 38.29222 Planned RBC 5.344828
6: 8002 38.29222 Planned WBC 8158.330363
7: 8002 38.29222 Planned Hematocrit 41.505801
8: 8002 38.29222 Planned Hemoglobin 13.910312
9: 8003 60.75944 ED RBC 4.482781
10: 8003 60.75944 ED WBC 7182.375695
11: 8003 60.75944 ED Hematocrit 42.628816
12: 8003 60.75944 ED Hemoglobin 13.856005
13: 8004 63.58479 ED RBC 5.968185
14: 8004 63.58479 ED WBC 5453.066686
15: 8004 63.58479 ED Hematocrit 44.845745
16: 8004 63.58479 ED Hemoglobin 13.895635