library(data.table)
options(datatable.print.class = TRUE)
37 Reshape
37.1 Long to wide using dcast()
37.1.1 Example 1: key-value pairs
Using the same example seen in Chapter 26:
dat_long <- data.frame(
Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004),
Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671),
Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED"),
Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
"Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
"Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"),
Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238)
)
dat_long
Account_ID Age Admission Lab_key Lab_value
1 8001 67.80170 ED RBC 4.634493
2 8002 42.91985 Planned RBC 3.349686
3 8003 46.23018 Planned RBC 4.270372
4 8004 39.66598 ED RBC 4.938977
5 8001 67.80170 ED WBC 8374.228878
6 8002 42.91985 Planned WBC 7612.373805
7 8003 46.23018 Planned WBC 8759.278555
8 8004 39.66598 ED WBC 6972.280962
9 8001 67.80170 ED Hematocrit 36.272693
10 8002 42.91985 Planned Hematocrit 40.571632
11 8003 46.23018 Planned Hematocrit 39.988862
12 8004 39.66598 ED Hematocrit 39.878688
13 8001 67.80170 ED Hemoglobin 12.618844
14 8002 42.91985 Planned Hemoglobin 12.173975
15 8003 46.23018 Planned Hemoglobin 15.129343
16 8004 39.66598 ED Hemoglobin 14.888570
data.table
’s long to wide procedure is defined with a convenient formula notation:
dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
Account_ID + Age + Admission ~ Lab_key,
value.var = "Lab_value")
dat_long2wide_dt
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
Instead of listing all variables you can use ...
, which corresponds to all variables not otherwise mentioned in the formula or in value.var
:
dcast(dat_long_dt,
... ~ Lab_key,
value.var = "Lab_value")
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
37.1.2 Example 2: Timepoints
Another simple synthetic dataset:
dt_long <- data.table(
ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L),
Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B",
"Timepoint_C"), class = "factor"),
Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dcast(dt_long, ID ~ Timepoint,
value.var = "Score")
Key: <ID>
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
37.1.3 dcast()
+ aggregate
If your ID ~ Timepoint
combination does not define a unique row in your input dataset, you need to specify an aggregate function.
For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.
dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
Var1 = rnorm(24, mean = 10),
Var2 = rnorm(24, mean = 20))
dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2
ID Timepoint Var1 Var2
<char> <char> <num> <num>
1: A AM NA 19.23404
2: A AM 9.409035 NA
3: A AM 9.778383 20.79869
4: A PM 10.801012 19.83813
5: A PM 10.569462 19.76169
6: A PM NA 19.97912
7: B AM 8.938075 20.77894
8: B AM 10.537467 18.38159
9: B AM 9.491706 19.28348
10: B PM NA 20.99647
11: B PM 10.672218 19.12882
12: B PM 11.692662 19.84949
13: C AM 9.799844 19.10500
14: C AM 9.718182 19.62712
15: C AM 10.780138 NA
16: C PM 10.945768 NA
17: C PM 10.438192 20.62939
18: C PM 10.863070 18.68233
19: D AM 11.101189 18.89342
20: D AM 10.479316 20.65663
21: D AM NA 19.85799
22: D PM 10.226233 22.01497
23: D PM 12.121823 NA
24: D PM 11.819867 19.83612
ID Timepoint Var1 Var2
If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate
argument:
Key: <ID>
ID Var1_AM Var1_PM Var2_AM Var2_PM
<char> <num> <num> <num> <num>
1: A 9.593709 10.68524 20.01636 19.85965
2: B 9.655749 11.18244 19.48133 19.99159
3: C 10.099388 10.74901 19.36606 19.65586
4: D 10.790252 11.38931 19.80268 20.92554
You can apply multiple aggregating functions by passing a list to fun.aggregate
:
dcast(dt_long2,
ID ~ Timepoint,
value.var = c("Var1", "Var2"),
fun.aggregate = list(mean, max, min), na.rm = TRUE)
Key: <ID>
ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
<char> <num> <num> <num> <num> <num>
1: A 9.593709 10.68524 20.01636 19.85965 9.778383
2: B 9.655749 11.18244 19.48133 19.99159 10.537467
3: C 10.099388 10.74901 19.36606 19.65586 10.780138
4: D 10.790252 11.38931 19.80268 20.92554 11.101189
Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
<num> <num> <num> <num> <num> <num>
1: 10.80101 20.79869 19.97912 9.409035 10.56946 19.23404
2: 11.69266 20.77894 20.99647 8.938075 10.67222 18.38159
3: 10.94577 19.62712 20.62939 9.718182 10.43819 19.10500
4: 12.12182 20.65663 22.01497 10.479316 10.22623 18.89342
Var2_min_PM
<num>
1: 19.76169
2: 19.12882
3: 18.68233
4: 19.83612
Note how na.rm = TRUE
was successfully applied to all aggregating functions
37.2 Wide to long: melt()
37.2.1 Example 1
dt_wide <- data.table(
ID = 1:4,
Timepoint_A = 11:14,
Timepoint_B = 21:24,
Timepoint_C = 51:54)
dt_wide
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
dt_long <- melt(
dt_wide,
id.vars = "ID",
measure.vars = 2:4, # defaults to all non-id columns
variable.name = "Timepoint",
value.name = c("Score"))
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
37.2.2 Example 2
Using the same synthetic data as in Chapter 26:
dat_wide_dt <- data.table(
Account_ID = c(8001, 8002, 8003, 8004),
Age = rnorm(4, mean = 57, sd = 12),
RBC = rnorm(4, mean = 4.8, sd = 0.5),
WBC = rnorm(4, mean = 7250, sd = 1500),
Hematocrit = rnorm(4, mean = 40.2, sd = 4),
Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt
Account_ID Age RBC WBC Hematocrit Hemoglobin Admission
<num> <num> <num> <num> <num> <num> <char>
1: 8001 67.97607 4.993220 6401.030 38.92769 13.06050 ED
2: 8002 37.92261 4.632340 8240.622 31.44702 13.68199 ED
3: 8003 76.86221 5.015397 6448.790 41.20295 13.01755 Planned
4: 8004 68.14413 5.265801 8179.193 38.84197 14.42528 Planned
dat_wide2long_dt <- melt(dat_wide_dt,
id.vars = c(1:2, 7),
measure.vars = 3:6,
variable.name = "Lab_key",
value.name = "Lab_value")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 67.97607 ED RBC 4.993220
2: 8002 37.92261 ED RBC 4.632340
3: 8003 76.86221 Planned RBC 5.015397
4: 8004 68.14413 Planned RBC 5.265801
5: 8001 67.97607 ED WBC 6401.030145
6: 8002 37.92261 ED WBC 8240.622414
7: 8003 76.86221 Planned WBC 6448.789949
8: 8004 68.14413 Planned WBC 8179.192677
9: 8001 67.97607 ED Hematocrit 38.927686
10: 8002 37.92261 ED Hematocrit 31.447017
11: 8003 76.86221 Planned Hematocrit 41.202953
12: 8004 68.14413 Planned Hematocrit 38.841967
13: 8001 67.97607 ED Hemoglobin 13.060502
14: 8002 37.92261 ED Hemoglobin 13.681989
15: 8003 76.86221 Planned Hemoglobin 13.017553
16: 8004 68.14413 Planned Hemoglobin 14.425278
If desired, you can set the ID column as they key, which will sort the data.table by its values:
setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 67.97607 ED RBC 4.993220
2: 8001 67.97607 ED WBC 6401.030145
3: 8001 67.97607 ED Hematocrit 38.927686
4: 8001 67.97607 ED Hemoglobin 13.060502
5: 8002 37.92261 ED RBC 4.632340
6: 8002 37.92261 ED WBC 8240.622414
7: 8002 37.92261 ED Hematocrit 31.447017
8: 8002 37.92261 ED Hemoglobin 13.681989
9: 8003 76.86221 Planned RBC 5.015397
10: 8003 76.86221 Planned WBC 6448.789949
11: 8003 76.86221 Planned Hematocrit 41.202953
12: 8003 76.86221 Planned Hemoglobin 13.017553
13: 8004 68.14413 Planned RBC 5.265801
14: 8004 68.14413 Planned WBC 8179.192677
15: 8004 68.14413 Planned Hematocrit 38.841967
16: 8004 68.14413 Planned Hemoglobin 14.425278