18  Reshape

import dask.dataframe as dd
import pandas as pd

18.1 Long to wide

Example 1:

dat_long = pd.DataFrame({'ID':[1, 2, 3, 1, 2, 3, 1, 2, 3],
                         'Fruit':['mango', 'mango', 'mango', 
                                  'banana', 'banana', 'banana', 
                                  'tangerine', 'tangerine', 'tangerine'],
                         'Score':[1.1, 2.1, 3.1, 1.2, 2.2, 3.2, 1.3, 2.3, 3.3]})
dat_long['Fruit'] = dat_long['Fruit'].astype('category')
dat_long = dd.from_pandas(dat_long, npartitions=1)
dat_long
Dask DataFrame Structure:
ID Fruit Score
npartitions=1
0 int64 category[known] float64
8 ... ... ...
Dask Name: from_pandas, 1 graph layer

Usage: dd.pivot(df, index, columns, values)

dat_long.pivot_table(
         index = 'ID', 
         columns = 'Fruit', 
         values = 'Score').compute()
Fruit banana mango tangerine
ID
1 1.2 1.1 1.3
2 2.2 2.1 2.3
3 3.2 3.1 3.3

Example 2:

dat_long = pd.DataFrame({
    'Account_ID': [8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004], 
    'Age': [67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671], 
    'Admission': ['ED', 'Planned', 'Planned', 'ED', 'ED', 'Planned', 
            'Planned', 'ED', 'ED', 'Planned', 'Planned', 'ED', 'ED', 
            'Planned', 'Planned', 'ED'], 
    'Lab_key': ['RBC', 'RBC', 'RBC', 'RBC', 'WBC', 'WBC', 'WBC', 'WBC', 
            'Hematocrit', 'Hematocrit', 'Hematocrit', 'Hematocrit', 
            'Hemoglobin', 'Hemoglobin', 'Hemoglobin', 'Hemoglobin'], 
    'Lab_value': [4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238]
})
dat_long.Lab_key = dat_long.Lab_key.astype('category')
dat_long = dd.from_pandas(dat_long, npartitions=1)
dat_long
Dask DataFrame Structure:
Account_ID Age Admission Lab_key Lab_value
npartitions=1
0 int64 float64 object category[known] float64
15 ... ... ... ... ...
Dask Name: from_pandas, 1 graph layer

Usage: df.pivot_table(index, columns, values)

Unlike Pandas, the index needs to be a single column of type ‘category’

dat_long.pivot_table(
         index = 'Account_ID',
         columns = 'Lab_key',
         values = 'Lab_value')
Dask DataFrame Structure:
Hematocrit Hemoglobin RBC WBC
npartitions=1
float64 float64 float64 float64
... ... ... ...
Dask Name: truediv, 6 graph layers

18.2 Wide to Long

dat_wide = pd.DataFrame({'ID':[1, 2, 3],
                          'mango':[1.1, 2.1, 3.1],
                          'banana':[1.2, 2.2, 3.2],
                          'tangerine':[1.3, 2.3, 3.3]})
dat_wide = dd.from_pandas(dat_wide, npartitions=1)
dat_wide.compute()
ID mango banana tangerine
0 1 1.1 1.2 1.3
1 2 2.1 2.2 2.3
2 3 3.1 3.2 3.3

Usage: pdd.melt(id_vars, value_vars, var_name, value_name)

dat_wide.melt(
        id_vars = 'ID', 
        value_vars = ['mango', 'banana', 'tangerine'],
        var_name = 'Fruit',
        value_name = 'Value').compute()
ID Fruit Value
0 1 mango 1.1
1 2 mango 2.1
2 3 mango 3.1
3 1 banana 1.2
4 2 banana 2.2
5 3 banana 3.2
6 1 tangerine 1.3
7 2 tangerine 2.3
8 3 tangerine 3.3

Sam as in Pandas, in the above example, value_vars can be ommitted: all columns other than those defined as ids will be used.

Example 2:

dat_wide = pd.DataFrame({
    'Account_ID': [8001, 8002, 8003, 8004], 
    'Age': [67.8017038366664, 42.9198507293701, 46.2301756642422, 
        39.665983196671], 
    'RBC': [4.63449321082268, 3.34968550627897, 4.27037213597765, 
        4.93897736897793], 
    'WBC': [8374.22887757195, 7612.37380499927, 8759.27855519425, 
        6972.28096216548], 
    'Hematocrit': [36.272693147236, 40.5716317809522, 39.9888624177955, 
        39.8786884058422], 
    'Hemoglobin': [12.618444991545, 12.1739747363806, 15.1293426442183, 
        14.8885696185238], 
    'Admission': ['ED', 'Planned', 'Planned', 'Planned']
})
dat_wide = dd.from_pandas(dat_wide, npartitions=1)
dat_wide.compute()
Account_ID Age RBC WBC Hematocrit Hemoglobin Admission
0 8001 67.801704 4.634493 8374.228878 36.272693 12.618445 ED
1 8002 42.919851 3.349686 7612.373805 40.571632 12.173975 Planned
2 8003 46.230176 4.270372 8759.278555 39.988862 15.129343 Planned
3 8004 39.665983 4.938977 6972.280962 39.878688 14.888570 Planned

Usage: pd.melt(df, id_vars, value_vars, var_name, value_name)

dat_wide.melt(
        id_vars = ['Account_ID', 'Age', 'Admission'],
        value_vars = ['RBC', 'WBC', 'Hematocrit', 'Hemoglobin'],
        var_name = 'Lab',
        value_name = 'Value').compute()
Account_ID Age Admission Lab Value
0 8001 67.801704 ED RBC 4.634493
1 8002 42.919851 Planned RBC 3.349686
2 8003 46.230176 Planned RBC 4.270372
3 8004 39.665983 Planned RBC 4.938977
4 8001 67.801704 ED WBC 8374.228878
5 8002 42.919851 Planned WBC 7612.373805
6 8003 46.230176 Planned WBC 8759.278555
7 8004 39.665983 Planned WBC 6972.280962
8 8001 67.801704 ED Hematocrit 36.272693
9 8002 42.919851 Planned Hematocrit 40.571632
10 8003 46.230176 Planned Hematocrit 39.988862
11 8004 39.665983 Planned Hematocrit 39.878688
12 8001 67.801704 ED Hemoglobin 12.618445
13 8002 42.919851 Planned Hemoglobin 12.173975
14 8003 46.230176 Planned Hemoglobin 15.129343
15 8004 39.665983 Planned Hemoglobin 14.888570