14  Reshape

import pandas as pd

14.1 Long to wide

Example 1:

dat_long = pd.DataFrame({"ID":[1, 2, 3, 1, 2, 3, 1, 2, 3],
                         "Fruit":["mango", "mango", "mango", 
                                  "banana", "banana", "banana", 
                                  "tangerine", "tangerine", "tangerine"],
                         "Score":[1.1, 2.1, 3.1, 1.2, 2.2, 3.2, 1.3, 2.3, 3.3]})
dat_long
ID Fruit Score
0 1 mango 1.1
1 2 mango 2.1
2 3 mango 3.1
3 1 banana 1.2
4 2 banana 2.2
5 3 banana 3.2
6 1 tangerine 1.3
7 2 tangerine 2.3
8 3 tangerine 3.3

Usage: pd.pivot(df, index, columns, values)

pd.pivot(dat_long, 
         index = "ID", 
         columns = "Fruit", 
         values = "Score")
Fruit banana mango tangerine
ID
1 1.2 1.1 1.3
2 2.2 2.1 2.3
3 3.2 3.1 3.3

Example 2:

dat_long = pd.DataFrame({
    "Account_ID": [8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004], 
    "Age": [67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671], 
    "Admission": ["ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", 
            "Planned", "Planned", "ED"], 
    "Lab_key": ["RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC", 
            "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit", 
            "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"], 
    "Lab_value": [4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238]
})
dat_long
Account_ID Age Admission Lab_key Lab_value
0 8001 67.801704 ED RBC 4.634493
1 8002 42.919851 Planned RBC 3.349686
2 8003 46.230176 Planned RBC 4.270372
3 8004 39.665983 ED RBC 4.938977
4 8001 67.801704 ED WBC 8374.228878
5 8002 42.919851 Planned WBC 7612.373805
6 8003 46.230176 Planned WBC 8759.278555
7 8004 39.665983 ED WBC 6972.280962
8 8001 67.801704 ED Hematocrit 36.272693
9 8002 42.919851 Planned Hematocrit 40.571632
10 8003 46.230176 Planned Hematocrit 39.988862
11 8004 39.665983 ED Hematocrit 39.878688
12 8001 67.801704 ED Hemoglobin 12.618844
13 8002 42.919851 Planned Hemoglobin 12.173975
14 8003 46.230176 Planned Hemoglobin 15.129343
15 8004 39.665983 ED Hemoglobin 14.888570

Usage: pd.pivot(df, index, columns, values)

pd.pivot(dat_long,
         index = ["Account_ID", "Age", "Admission"],
         columns = "Lab_key",
         values = "Lab_value")
Lab_key Hematocrit Hemoglobin RBC WBC
Account_ID Age Admission
8001 67.801704 ED 36.272693 12.618844 4.634493 8374.228878
8002 42.919851 Planned 40.571632 12.173975 3.349686 7612.373805
8003 46.230176 Planned 39.988862 15.129343 4.270372 8759.278555
8004 39.665983 ED 39.878688 14.888570 4.938977 6972.280962

14.2 Wide to Long

dat_wide = pd.DataFrame({"ID":[1, 2, 3],
                          "mango":[1.1, 2.1, 3.1],
                          "banana":[1.2, 2.2, 3.2],
                          "tangerine":[1.3, 2.3, 3.3]})
dat_wide
ID mango banana tangerine
0 1 1.1 1.2 1.3
1 2 2.1 2.2 2.3
2 3 3.1 3.2 3.3

Usage: pd.melt(df, id_vars, value_vars, var_name, value_name)

pd.melt(dat_wide, 
        id_vars = "ID", 
        value_vars = ['mango', "banana", "tangerine"],
        var_name = "Fruit",
        value_name = "Value")
ID Fruit Value
0 1 mango 1.1
1 2 mango 2.1
2 3 mango 3.1
3 1 banana 1.2
4 2 banana 2.2
5 3 banana 3.2
6 1 tangerine 1.3
7 2 tangerine 2.3
8 3 tangerine 3.3

In the above example, value_vars can be ommitted: all columns other than those defined as ids will be used.

wide_to_long() uses melt() under the hood, and supposed to be more user-friendly - is not.

Example 2:

dat_wide = pd.DataFrame({
    "Account_ID": [8001, 8002, 8003, 8004], 
    "Age": [67.8017038366664, 42.9198507293701, 46.2301756642422, 
        39.665983196671], 
    "RBC": [4.63449321082268, 3.34968550627897, 4.27037213597765, 
        4.93897736897793], 
    "WBC": [8374.22887757195, 7612.37380499927, 8759.27855519425, 
        6972.28096216548], 
    "Hematocrit": [36.272693147236, 40.5716317809522, 39.9888624177955, 
        39.8786884058422], 
    "Hemoglobin": [12.618444991545, 12.1739747363806, 15.1293426442183, 
        14.8885696185238], 
    "Admission": ["ED", "Planned", "Planned", "Planned"]
})
dat_wide
Account_ID Age RBC WBC Hematocrit Hemoglobin Admission
0 8001 67.801704 4.634493 8374.228878 36.272693 12.618445 ED
1 8002 42.919851 3.349686 7612.373805 40.571632 12.173975 Planned
2 8003 46.230176 4.270372 8759.278555 39.988862 15.129343 Planned
3 8004 39.665983 4.938977 6972.280962 39.878688 14.888570 Planned

Usage: pd.melt(df, id_vars, value_vars, var_name, value_name)

pd.melt(dat_wide,
        id_vars = ["Account_ID", "Age", "Admission"],
        value_vars = ["RBC", "WBC", "Hematocrit", "Hemoglobin"],
        var_name = "Lab",
        value_name = "Value")
Account_ID Age Admission Lab Value
0 8001 67.801704 ED RBC 4.634493
1 8002 42.919851 Planned RBC 3.349686
2 8003 46.230176 Planned RBC 4.270372
3 8004 39.665983 Planned RBC 4.938977
4 8001 67.801704 ED WBC 8374.228878
5 8002 42.919851 Planned WBC 7612.373805
6 8003 46.230176 Planned WBC 8759.278555
7 8004 39.665983 Planned WBC 6972.280962
8 8001 67.801704 ED Hematocrit 36.272693
9 8002 42.919851 Planned Hematocrit 40.571632
10 8003 46.230176 Planned Hematocrit 39.988862
11 8004 39.665983 Planned Hematocrit 39.878688
12 8001 67.801704 ED Hemoglobin 12.618445
13 8002 42.919851 Planned Hemoglobin 12.173975
14 8003 46.230176 Planned Hemoglobin 15.129343
15 8004 39.665983 Planned Hemoglobin 14.888570