37  Reshape

library(data.table)
options(datatable.print.class = TRUE)

37.1 Long to wide using dcast()

37.1.1 Example 1: key-value pairs

Using the same example seen in Chapter 26:

dat_long <- data.frame(
    Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004), 
    Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671), 
    Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED"), 
    Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC", 
            "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit", 
            "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"), 
    Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238)
)
dat_long
   Account_ID      Age Admission    Lab_key   Lab_value
1        8001 67.80170        ED        RBC    4.634493
2        8002 42.91985   Planned        RBC    3.349686
3        8003 46.23018   Planned        RBC    4.270372
4        8004 39.66598        ED        RBC    4.938977
5        8001 67.80170        ED        WBC 8374.228878
6        8002 42.91985   Planned        WBC 7612.373805
7        8003 46.23018   Planned        WBC 8759.278555
8        8004 39.66598        ED        WBC 6972.280962
9        8001 67.80170        ED Hematocrit   36.272693
10       8002 42.91985   Planned Hematocrit   40.571632
11       8003 46.23018   Planned Hematocrit   39.988862
12       8004 39.66598        ED Hematocrit   39.878688
13       8001 67.80170        ED Hemoglobin   12.618844
14       8002 42.91985   Planned Hemoglobin   12.173975
15       8003 46.23018   Planned Hemoglobin   15.129343
16       8004 39.66598        ED Hemoglobin   14.888570

data.table’s long to wide procedure is defined with a convenient formula notation:

dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
                          Account_ID + Age + Admission ~ Lab_key,
                          value.var = "Lab_value")
dat_long2wide_dt
Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

Instead of listing all variables you can use ..., which corresponds to all variables not otherwise mentioned in the formula or in value.var:

dcast(dat_long_dt,
      ... ~ Lab_key,
      value.var = "Lab_value")
Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

37.1.2 Example 2: Timepoints

Another simple synthetic dataset:

dt_long <- data.table(
        ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), 
        Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
        2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B", 
        "Timepoint_C"), class = "factor"), 
        Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54
dcast(dt_long, ID ~ Timepoint,
      value.var = "Score")
Key: <ID>
      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54

37.1.3 dcast() + aggregate

If your ID ~ Timepoint combination does not define a unique row in your input dataset, you need to specify an aggregate function.

For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.

dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
                      Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
                      Var1 = rnorm(24, mean = 10),
                      Var2 = rnorm(24, mean = 20))

dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2
        ID Timepoint      Var1     Var2
    <char>    <char>     <num>    <num>
 1:      A        AM  9.095061 20.55069
 2:      A        AM 10.024434       NA
 3:      A        AM 10.280247 19.98988
 4:      A        PM        NA 20.32035
 5:      A        PM 11.505311 19.72577
 6:      A        PM 10.045226 20.43301
 7:      B        AM 10.258153 19.77664
 8:      B        AM  9.781122       NA
 9:      B        AM 10.901976 19.13577
10:      B        PM        NA 20.83391
11:      B        PM 10.208148 21.79704
12:      B        PM 10.442472 20.17209
13:      C        AM  9.835870 17.95787
14:      C        AM  9.952885 19.22704
15:      C        AM 11.462364 19.60074
16:      C        PM        NA 20.48730
17:      C        PM  7.973092 15.95908
18:      C        PM 10.835813 19.39142
19:      D        AM  8.211689 20.02134
20:      D        AM 11.016285       NA
21:      D        AM        NA 19.64777
22:      D        PM 10.905356 18.92603
23:      D        PM  9.431838       NA
24:      D        PM  8.143880 20.31228
        ID Timepoint      Var1     Var2

If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate argument:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = mean, na.rm = TRUE)
Key: <ID>
       ID   Var1_AM   Var1_PM  Var2_AM  Var2_PM
   <char>     <num>     <num>    <num>    <num>
1:      A  9.799914 10.775268 20.27029 20.15971
2:      B 10.313751 10.325310 19.45621 20.93434
3:      C 10.417040  9.404453 18.92855 18.61260
4:      D  9.613987  9.493691 19.83456 19.61916

You can apply multiple aggregating functions by passing a list to fun.aggregate:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = list(mean, max, min), na.rm = TRUE)
Key: <ID>
       ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
   <char>        <num>        <num>        <num>        <num>       <num>
1:      A     9.799914    10.775268     20.27029     20.15971    10.28025
2:      B    10.313751    10.325310     19.45621     20.93434    10.90198
3:      C    10.417040     9.404453     18.92855     18.61260    11.46236
4:      D     9.613987     9.493691     19.83456     19.61916    11.01628
   Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
         <num>       <num>       <num>       <num>       <num>       <num>
1:    11.50531    20.55069    20.43301    9.095061   10.045226    19.98988
2:    10.44247    19.77664    21.79704    9.781122   10.208148    19.13577
3:    10.83581    19.60074    20.48730    9.835870    7.973092    17.95787
4:    10.90536    20.02134    20.31228    8.211689    8.143880    19.64777
   Var2_min_PM
         <num>
1:    19.72577
2:    20.17209
3:    15.95908
4:    18.92603

Note how na.rm = TRUE was successfully applied to all aggregating functions

37.2 Wide to long: melt()

37.2.1 Example 1

dt_wide <- data.table(
        ID = 1:4,
        Timepoint_A = 11:14,
        Timepoint_B = 21:24,
        Timepoint_C = 51:54)
dt_wide
      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54
dt_long <- melt(
        dt_wide, 
        id.vars = "ID",
        measure.vars = 2:4, # defaults to all non-id columns
        variable.name = "Timepoint",
        value.name = c("Score"))
dt_long
       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54

37.2.2 Example 2

Using the same synthetic data as in Chapter 26:

dat_wide_dt <- data.table(
    Account_ID = c(8001, 8002, 8003, 8004),
    Age = rnorm(4, mean = 57, sd = 12),
    RBC = rnorm(4, mean = 4.8, sd = 0.5),
    WBC = rnorm(4, mean = 7250, sd = 1500),
    Hematocrit = rnorm(4, mean = 40.2, sd = 4),
    Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
    Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt
   Account_ID      Age      RBC      WBC Hematocrit Hemoglobin Admission
        <num>    <num>    <num>    <num>      <num>      <num>    <char>
1:       8001 58.45752 4.675691 7776.085   35.61951   13.76775   Planned
2:       8002 38.29222 5.344828 8158.330   41.50580   13.91031   Planned
3:       8003 60.75944 4.482781 7182.376   42.62882   13.85600        ED
4:       8004 63.58479 5.968185 5453.067   44.84574   13.89564        ED
dat_wide2long_dt <- melt(dat_wide_dt,
                         id.vars = c(1:2, 7),
                         measure.vars = 3:6,
                         variable.name = "Lab_key",
                         value.name = "Lab_value")
dat_wide2long_dt
    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 58.45752   Planned        RBC    4.675691
 2:       8002 38.29222   Planned        RBC    5.344828
 3:       8003 60.75944        ED        RBC    4.482781
 4:       8004 63.58479        ED        RBC    5.968185
 5:       8001 58.45752   Planned        WBC 7776.085457
 6:       8002 38.29222   Planned        WBC 8158.330363
 7:       8003 60.75944        ED        WBC 7182.375695
 8:       8004 63.58479        ED        WBC 5453.066686
 9:       8001 58.45752   Planned Hematocrit   35.619509
10:       8002 38.29222   Planned Hematocrit   41.505801
11:       8003 60.75944        ED Hematocrit   42.628816
12:       8004 63.58479        ED Hematocrit   44.845745
13:       8001 58.45752   Planned Hemoglobin   13.767750
14:       8002 38.29222   Planned Hemoglobin   13.910312
15:       8003 60.75944        ED Hemoglobin   13.856005
16:       8004 63.58479        ED Hemoglobin   13.895635

If desired, you can set the ID column as they key, which will sort the data.table by its values:

setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt
    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 58.45752   Planned        RBC    4.675691
 2:       8001 58.45752   Planned        WBC 7776.085457
 3:       8001 58.45752   Planned Hematocrit   35.619509
 4:       8001 58.45752   Planned Hemoglobin   13.767750
 5:       8002 38.29222   Planned        RBC    5.344828
 6:       8002 38.29222   Planned        WBC 8158.330363
 7:       8002 38.29222   Planned Hematocrit   41.505801
 8:       8002 38.29222   Planned Hemoglobin   13.910312
 9:       8003 60.75944        ED        RBC    4.482781
10:       8003 60.75944        ED        WBC 7182.375695
11:       8003 60.75944        ED Hematocrit   42.628816
12:       8003 60.75944        ED Hemoglobin   13.856005
13:       8004 63.58479        ED        RBC    5.968185
14:       8004 63.58479        ED        WBC 5453.066686
15:       8004 63.58479        ED Hematocrit   44.845745
16:       8004 63.58479        ED Hemoglobin   13.895635

37.3 See also

37.4 Resources