37  Reshape

library(data.table)
options(datatable.print.class = TRUE)

37.1 Long to wide using dcast()

37.1.1 Example 1: key-value pairs

Using the same example seen in Chapter 26:

dat_long <- data.frame(
    Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004), 
    Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671), 
    Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED"), 
    Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC", 
            "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit", 
            "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"), 
    Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238)
)
dat_long
   Account_ID      Age Admission    Lab_key   Lab_value
1        8001 67.80170        ED        RBC    4.634493
2        8002 42.91985   Planned        RBC    3.349686
3        8003 46.23018   Planned        RBC    4.270372
4        8004 39.66598        ED        RBC    4.938977
5        8001 67.80170        ED        WBC 8374.228878
6        8002 42.91985   Planned        WBC 7612.373805
7        8003 46.23018   Planned        WBC 8759.278555
8        8004 39.66598        ED        WBC 6972.280962
9        8001 67.80170        ED Hematocrit   36.272693
10       8002 42.91985   Planned Hematocrit   40.571632
11       8003 46.23018   Planned Hematocrit   39.988862
12       8004 39.66598        ED Hematocrit   39.878688
13       8001 67.80170        ED Hemoglobin   12.618844
14       8002 42.91985   Planned Hemoglobin   12.173975
15       8003 46.23018   Planned Hemoglobin   15.129343
16       8004 39.66598        ED Hemoglobin   14.888570

data.table’s long to wide procedure is defined with a convenient formula notation:

dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
                          Account_ID + Age + Admission ~ Lab_key,
                          value.var = "Lab_value")
dat_long2wide_dt
Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

Instead of listing all variables you can use ..., which corresponds to all variables not otherwise mentioned in the formula or in value.var:

dcast(dat_long_dt,
      ... ~ Lab_key,
      value.var = "Lab_value")
Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

37.1.2 Example 2: Timepoints

Another simple synthetic dataset:

dt_long <- data.table(
        ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), 
        Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
        2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B", 
        "Timepoint_C"), class = "factor"), 
        Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54
dcast(dt_long, ID ~ Timepoint,
      value.var = "Score")
Key: <ID>
      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54

37.1.3 dcast() + aggregate

If your ID ~ Timepoint combination does not define a unique row in your input dataset, you need to specify an aggregate function.

For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.

dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
                      Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
                      Var1 = rnorm(24, mean = 10),
                      Var2 = rnorm(24, mean = 20))

dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2
        ID Timepoint      Var1     Var2
    <char>    <char>     <num>    <num>
 1:      A        AM        NA 19.23404
 2:      A        AM  9.409035       NA
 3:      A        AM  9.778383 20.79869
 4:      A        PM 10.801012 19.83813
 5:      A        PM 10.569462 19.76169
 6:      A        PM        NA 19.97912
 7:      B        AM  8.938075 20.77894
 8:      B        AM 10.537467 18.38159
 9:      B        AM  9.491706 19.28348
10:      B        PM        NA 20.99647
11:      B        PM 10.672218 19.12882
12:      B        PM 11.692662 19.84949
13:      C        AM  9.799844 19.10500
14:      C        AM  9.718182 19.62712
15:      C        AM 10.780138       NA
16:      C        PM 10.945768       NA
17:      C        PM 10.438192 20.62939
18:      C        PM 10.863070 18.68233
19:      D        AM 11.101189 18.89342
20:      D        AM 10.479316 20.65663
21:      D        AM        NA 19.85799
22:      D        PM 10.226233 22.01497
23:      D        PM 12.121823       NA
24:      D        PM 11.819867 19.83612
        ID Timepoint      Var1     Var2

If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate argument:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = mean, na.rm = TRUE)
Key: <ID>
       ID   Var1_AM  Var1_PM  Var2_AM  Var2_PM
   <char>     <num>    <num>    <num>    <num>
1:      A  9.593709 10.68524 20.01636 19.85965
2:      B  9.655749 11.18244 19.48133 19.99159
3:      C 10.099388 10.74901 19.36606 19.65586
4:      D 10.790252 11.38931 19.80268 20.92554

You can apply multiple aggregating functions by passing a list to fun.aggregate:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = list(mean, max, min), na.rm = TRUE)
Key: <ID>
       ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
   <char>        <num>        <num>        <num>        <num>       <num>
1:      A     9.593709     10.68524     20.01636     19.85965    9.778383
2:      B     9.655749     11.18244     19.48133     19.99159   10.537467
3:      C    10.099388     10.74901     19.36606     19.65586   10.780138
4:      D    10.790252     11.38931     19.80268     20.92554   11.101189
   Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
         <num>       <num>       <num>       <num>       <num>       <num>
1:    10.80101    20.79869    19.97912    9.409035    10.56946    19.23404
2:    11.69266    20.77894    20.99647    8.938075    10.67222    18.38159
3:    10.94577    19.62712    20.62939    9.718182    10.43819    19.10500
4:    12.12182    20.65663    22.01497   10.479316    10.22623    18.89342
   Var2_min_PM
         <num>
1:    19.76169
2:    19.12882
3:    18.68233
4:    19.83612

Note how na.rm = TRUE was successfully applied to all aggregating functions

37.2 Wide to long: melt()

37.2.1 Example 1

dt_wide <- data.table(
        ID = 1:4,
        Timepoint_A = 11:14,
        Timepoint_B = 21:24,
        Timepoint_C = 51:54)
dt_wide
      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54
dt_long <- melt(
        dt_wide, 
        id.vars = "ID",
        measure.vars = 2:4, # defaults to all non-id columns
        variable.name = "Timepoint",
        value.name = c("Score"))
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54

37.2.2 Example 2

Using the same synthetic data as in Chapter 26:

dat_wide_dt <- data.table(
    Account_ID = c(8001, 8002, 8003, 8004),
    Age = rnorm(4, mean = 57, sd = 12),
    RBC = rnorm(4, mean = 4.8, sd = 0.5),
    WBC = rnorm(4, mean = 7250, sd = 1500),
    Hematocrit = rnorm(4, mean = 40.2, sd = 4),
    Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
    Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt
   Account_ID      Age      RBC      WBC Hematocrit Hemoglobin Admission
        <num>    <num>    <num>    <num>      <num>      <num>    <char>
1:       8001 67.97607 4.993220 6401.030   38.92769   13.06050        ED
2:       8002 37.92261 4.632340 8240.622   31.44702   13.68199        ED
3:       8003 76.86221 5.015397 6448.790   41.20295   13.01755   Planned
4:       8004 68.14413 5.265801 8179.193   38.84197   14.42528   Planned
dat_wide2long_dt <- melt(dat_wide_dt,
                         id.vars = c(1:2, 7),
                         measure.vars = 3:6,
                         variable.name = "Lab_key",
                         value.name = "Lab_value")
dat_wide2long_dt
    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 67.97607        ED        RBC    4.993220
 2:       8002 37.92261        ED        RBC    4.632340
 3:       8003 76.86221   Planned        RBC    5.015397
 4:       8004 68.14413   Planned        RBC    5.265801
 5:       8001 67.97607        ED        WBC 6401.030145
 6:       8002 37.92261        ED        WBC 8240.622414
 7:       8003 76.86221   Planned        WBC 6448.789949
 8:       8004 68.14413   Planned        WBC 8179.192677
 9:       8001 67.97607        ED Hematocrit   38.927686
10:       8002 37.92261        ED Hematocrit   31.447017
11:       8003 76.86221   Planned Hematocrit   41.202953
12:       8004 68.14413   Planned Hematocrit   38.841967
13:       8001 67.97607        ED Hemoglobin   13.060502
14:       8002 37.92261        ED Hemoglobin   13.681989
15:       8003 76.86221   Planned Hemoglobin   13.017553
16:       8004 68.14413   Planned Hemoglobin   14.425278

If desired, you can set the ID column as they key, which will sort the data.table by its values:

setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt
    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 67.97607        ED        RBC    4.993220
 2:       8001 67.97607        ED        WBC 6401.030145
 3:       8001 67.97607        ED Hematocrit   38.927686
 4:       8001 67.97607        ED Hemoglobin   13.060502
 5:       8002 37.92261        ED        RBC    4.632340
 6:       8002 37.92261        ED        WBC 8240.622414
 7:       8002 37.92261        ED Hematocrit   31.447017
 8:       8002 37.92261        ED Hemoglobin   13.681989
 9:       8003 76.86221   Planned        RBC    5.015397
10:       8003 76.86221   Planned        WBC 6448.789949
11:       8003 76.86221   Planned Hematocrit   41.202953
12:       8003 76.86221   Planned Hemoglobin   13.017553
13:       8004 68.14413   Planned        RBC    5.265801
14:       8004 68.14413   Planned        WBC 8179.192677
15:       8004 68.14413   Planned Hematocrit   38.841967
16:       8004 68.14413   Planned Hemoglobin   14.425278

37.3 Resources

37.4 See also