Groups and pivot tables with Pandas¶

Feng Li

School of Statistics and Mathematics

Central University of Finance and Economics

feng.li@cufe.edu.cn

https://feng.li/python

In [4]:
import pandas as pd
ledger_df_info = pd.ExcelFile('data/Q1Sales.xlsx')
ledger_df_info.sheet_names
ledger_df = pd.read_excel('data/Q1Sales.xlsx') # read the first sheet
In [5]:
columns = ['ProductID', 'Product Name', 'Channel', 'Unit Price', 'Quantity', 'Total']
sample_df = ledger_df[columns].tail(10)
In [6]:
sample_df
Out[6]:
ProductID Product Name Channel Unit Price Quantity Total
14044 MI/SEN-01085 Sennheiser EW 112P G3-B omni-directional EW sy... Understock.com 18.58 6 111.48
14045 I&S/WIH-08645 Wiha 26598 Nut Driver Set, Metric In Canvas Po... Shoppe.com 16.56 62 1026.72
14046 H&K/KIK-91404 Kikkerland Magnetic Decision Maker iBay.com 3.64 15 54.60
14047 T&G/YU--76445 Yu-Gi-Oh! - Light-Imprisoning Mirror (GLAS-EN0... Understock.com 4.50 4 18.00
14048 T&G/LAU-88048 Lauri Toddler Tote iBay.com 14.46 1 14.46
14049 E/AC-63975 AC Adapter/Power Supply&Cord for Lenovo 3000 G... Bullseye 28.72 8 229.76
14050 E/CIS-74992 Cisco Systems Gigabit VPN Router (RV320K9NA) Bullseye 33.39 1 33.39
14051 E/PHI-08100 Philips AJ3116M/37 Digital Tuning Clock Radio ... Understock.com 4.18 1 4.18
14052 E/POL-61164 NaN iBay.com 4.78 25 119.50
14053 E/SIR-83381 Sirius Satellite Radio XADH2 Home Access Kit f... Understock.com 33.16 2 66.32
In [7]:
sample_df.groupby('Channel').agg({'Total': 'sum'})
Out[7]:
Total
Channel
Bullseye 263.15
Shoppe.com 1026.72
Understock.com 199.98
iBay.com 188.56

The pandas group object¶

In [8]:
groups = sample_df.groupby('Channel')
In [9]:
type(groups)
Out[9]:
pandas.core.groupby.generic.DataFrameGroupBy
In [10]:
groups.groups
Out[10]:
{'Bullseye': [14049, 14050], 'Shoppe.com': [14045], 'Understock.com': [14044, 14047, 14051, 14053], 'iBay.com': [14046, 14048, 14052]}
In [11]:
groups.get_group('Bullseye')
Out[11]:
ProductID Product Name Channel Unit Price Quantity Total
14049 E/AC-63975 AC Adapter/Power Supply&Cord for Lenovo 3000 G... Bullseye 28.72 8 229.76
14050 E/CIS-74992 Cisco Systems Gigabit VPN Router (RV320K9NA) Bullseye 33.39 1 33.39
In [12]:
groups['Total'].get_group('Bullseye')
Out[12]:
14049    229.76
14050     33.39
Name: Total, dtype: float64
In [13]:
groups[['Quantity', 'Total']].get_group('Bullseye')
Out[13]:
Quantity Total
14049 8 229.76
14050 1 33.39
In [14]:
groups['Total'].sum()
Out[14]:
Channel
Bullseye           263.15
Shoppe.com        1026.72
Understock.com     199.98
iBay.com           188.56
Name: Total, dtype: float64
In [15]:
groups[['Quantity', 'Total']].sum()
Out[15]:
Quantity Total
Channel
Bullseye 9 263.15
Shoppe.com 62 1026.72
Understock.com 13 199.98
iBay.com 41 188.56
In [16]:
sample_df.groupby('Channel')['Total'].sum()
Out[16]:
Channel
Bullseye           263.15
Shoppe.com        1026.72
Understock.com     199.98
iBay.com           188.56
Name: Total, dtype: float64

Aggregating group functions¶

In [17]:
# this works as well
# sample_df.groupby('Channel')['Total'].agg('sum')
sample_df.groupby('Channel').agg({'Total': 'sum'})
Out[17]:
Total
Channel
Bullseye 263.15
Shoppe.com 1026.72
Understock.com 199.98
iBay.com 188.56
In [18]:
(sample_df
    .groupby('Channel').agg({
        'Total': ['sum', 'mean'],
        'Quantity': ['sum', 'max', 'min']
    })
) 
Out[18]:
Total Quantity
sum mean sum max min
Channel
Bullseye 263.15 131.575000 9 8 1
Shoppe.com 1026.72 1026.720000 62 62 62
Understock.com 199.98 49.995000 13 6 1
iBay.com 188.56 62.853333 41 25 1
In [19]:
# assigns the output above
# to another variable 
aggregate_df = (
  sample_df
    .groupby('Channel').agg({
        'Total': ['sum', 'mean'],
        'Quantity': ['sum', 'max', 'min']
    })
) 

# selects the max column under Quantity
aggregate_df.loc[:, ('Quantity', 'max')]
Out[19]:
Channel
Bullseye           8
Shoppe.com        62
Understock.com     6
iBay.com          25
Name: (Quantity, max), dtype: int64
In [20]:
aggregate_df.loc[:, 'Quantity']
Out[20]:
sum max min
Channel
Bullseye 9 8 1
Shoppe.com 62 62 62
Understock.com 13 6 1
iBay.com 41 25 1

Custom aggregating functions¶

In [21]:
def total_diff(column):
    return column.max() - column.min()    

sample_df.groupby('Channel').agg({'Total': ['min', 'max', total_diff]})
Out[21]:
Total
min max total_diff
Channel
Bullseye 33.39 229.76 196.37
Shoppe.com 1026.72 1026.72 0.00
Understock.com 4.18 111.48 107.30
iBay.com 14.46 119.50 105.04

Overthinking: Other group functions¶

In [23]:
def filter_group(group_df):
    return group_df if group_df['Total'].sum() > 200 else None

sample_df.groupby('Channel').apply(filter_group)
Out[23]:
ProductID Product Name Channel Unit Price Quantity Total
Channel
Bullseye 14049 E/AC-63975 AC Adapter/Power Supply&Cord for Lenovo 3000 G... Bullseye 28.72 8 229.76
14050 E/CIS-74992 Cisco Systems Gigabit VPN Router (RV320K9NA) Bullseye 33.39 1 33.39
Shoppe.com 14045 I&S/WIH-08645 Wiha 26598 Nut Driver Set, Metric In Canvas Po... Shoppe.com 16.56 62 1026.72
In [24]:
sample_df.groupby('Channel').apply(filter_group).reset_index(drop=True)
Out[24]:
ProductID Product Name Channel Unit Price Quantity Total
0 E/AC-63975 AC Adapter/Power Supply&Cord for Lenovo 3000 G... Bullseye 28.72 8 229.76
1 E/CIS-74992 Cisco Systems Gigabit VPN Router (RV320K9NA) Bullseye 33.39 1 33.39
2 I&S/WIH-08645 Wiha 26598 Nut Driver Set, Metric In Canvas Po... Shoppe.com 16.56 62 1026.72
In [25]:
def percent_group_total(group_df):    
    group_df['% Group Total'] = group_df['Total'] / group_df['Total'].sum() * 100
    group_df['% Group Total'] = group_df['% Group Total'].round(2)
    
    return group_df

sample_df.groupby('Channel').apply(percent_group_total)
Out[25]:
ProductID Product Name Channel Unit Price Quantity Total % Group Total
14044 MI/SEN-01085 Sennheiser EW 112P G3-B omni-directional EW sy... Understock.com 18.58 6 111.48 55.75
14045 I&S/WIH-08645 Wiha 26598 Nut Driver Set, Metric In Canvas Po... Shoppe.com 16.56 62 1026.72 100.00
14046 H&K/KIK-91404 Kikkerland Magnetic Decision Maker iBay.com 3.64 15 54.60 28.96
14047 T&G/YU--76445 Yu-Gi-Oh! - Light-Imprisoning Mirror (GLAS-EN0... Understock.com 4.50 4 18.00 9.00
14048 T&G/LAU-88048 Lauri Toddler Tote iBay.com 14.46 1 14.46 7.67
14049 E/AC-63975 AC Adapter/Power Supply&Cord for Lenovo 3000 G... Bullseye 28.72 8 229.76 87.31
14050 E/CIS-74992 Cisco Systems Gigabit VPN Router (RV320K9NA) Bullseye 33.39 1 33.39 12.69
14051 E/PHI-08100 Philips AJ3116M/37 Digital Tuning Clock Radio ... Understock.com 4.18 1 4.18 2.09
14052 E/POL-61164 NaN iBay.com 4.78 25 119.50 63.38
14053 E/SIR-83381 Sirius Satellite Radio XADH2 Home Access Kit f... Understock.com 33.16 2 66.32 33.16

Stacking and unstacking¶

In [26]:
columns = ['ProductID', 'Channel', 'Total']
sample_df = ledger_df[columns].head()

sample_df
Out[26]:
ProductID Channel Total
0 T&G/CAN-97509 Shoppe.com 281.54
1 T&G/LEG-37777 Walcart 6.70
2 T&G/PET-14209 Bullseye 58.35
3 T&G/TRA-20170 Bullseye 80.76
4 T&G/TRA-20170 Bullseye 80.76
In [27]:
sample_df.stack()
Out[27]:
0  ProductID    T&G/CAN-97509
   Channel         Shoppe.com
   Total               281.54
1  ProductID    T&G/LEG-37777
   Channel            Walcart
   Total                  6.7
2  ProductID    T&G/PET-14209
   Channel           Bullseye
   Total                58.35
3  ProductID    T&G/TRA-20170
   Channel           Bullseye
   Total                80.76
4  ProductID    T&G/TRA-20170
   Channel           Bullseye
   Total                80.76
dtype: object
In [28]:
stacked_sample = sample_df.stack() 

stacked_sample.unstack()
Out[28]:
ProductID Channel Total
0 T&G/CAN-97509 Shoppe.com 281.54
1 T&G/LEG-37777 Walcart 6.7
2 T&G/PET-14209 Bullseye 58.35
3 T&G/TRA-20170 Bullseye 80.76
4 T&G/TRA-20170 Bullseye 80.76

Pivot tables¶

In [31]:
ledger_df = pd.read_excel('data/Q1Sales.xlsx')

ledger_df['Deadline'] = pd.to_datetime(ledger_df['Deadline'])
ledger_df['Deadline Quarter'] = ledger_df['Deadline'].dt.to_period(freq='Q-DEC')

ledger_df
Out[31]:
InvoiceNo Channel Product Name ProductID Account AccountNo Date Deadline Currency Unit Price Quantity Total Deadline Quarter
0 1532 Shoppe.com Cannon Water Bomb Balloons 100 Pack T&G/CAN-97509 Sales 5004 2020-01-01 2019-11-23 USD 20.11 14 281.54 2019Q4
1 1533 Walcart LEGO Ninja Turtles Stealth Shell in Pursuit 79102 T&G/LEG-37777 Sales 5004 2020-01-01 2020-06-15 USD 6.70 1 6.70 2020Q2
2 1534 Bullseye NaN T&G/PET-14209 Sales 5004 2020-01-01 2020-05-07 USD 11.67 5 58.35 2020Q2
3 1535 Bullseye Transformers Age of Extinction Generations Del... T&G/TRA-20170 Sales 5004 2020-01-01 2019-12-22 USD 13.46 6 80.76 2019Q4
4 1535 Bullseye Transformers Age of Extinction Generations Del... T&G/TRA-20170 Sales 5004 2020-01-01 2019-12-22 USD 13.46 6 80.76 2019Q4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
14049 15581 Bullseye AC Adapter/Power Supply&Cord for Lenovo 3000 G... E/AC-63975 Sales 5004 2020-01-31 2020-02-23 USD 28.72 8 229.76 2020Q1
14050 15582 Bullseye Cisco Systems Gigabit VPN Router (RV320K9NA) E/CIS-74992 Sales 5004 2020-01-31 2020-01-21 USD 33.39 1 33.39 2020Q1
14051 15583 Understock.com Philips AJ3116M/37 Digital Tuning Clock Radio ... E/PHI-08100 Sales 5004 2020-01-31 2020-03-22 USD 4.18 1 4.18 2020Q1
14052 15584 iBay.com NaN E/POL-61164 Sales 5004 2020-01-31 2020-06-25 USD 4.78 25 119.50 2020Q2
14053 15585 Understock.com Sirius Satellite Radio XADH2 Home Access Kit f... E/SIR-83381 Sales 5004 2020-01-31 2020-02-01 USD 33.16 2 66.32 2020Q1

14054 rows × 13 columns

In [32]:
ledger_df.groupby(['Channel', 'Deadline Quarter']).agg({'Quantity': 'sum'})
Out[32]:
Quantity
Channel Deadline Quarter
Bullseye 2019Q4 2212
2020Q1 3413
2020Q2 3034
2020Q3 377
Shoppe.com 2019Q4 11424
2020Q1 9969
2020Q2 11212
2020Q3 1757
Understock.com 2019Q4 18518
2020Q1 21614
2020Q2 23222
2020Q3 3484
Walcart 2019Q4 3816
2020Q1 5174
2020Q2 3921
2020Q3 411
iBay.com 2019Q4 9711
2020Q1 11525
2020Q2 11995
2020Q3 1317
In [33]:
ledger_df.groupby(['Channel', 'Deadline Quarter']).agg({'Quantity': 'sum'}).unstack()
Out[33]:
Quantity
Deadline Quarter 2019Q4 2020Q1 2020Q2 2020Q3
Channel
Bullseye 2212 3413 3034 377
Shoppe.com 11424 9969 11212 1757
Understock.com 18518 21614 23222 3484
Walcart 3816 5174 3921 411
iBay.com 9711 11525 11995 1317
In [34]:
pd.pivot_table(ledger_df, 
               index='Channel', 
               columns='Deadline Quarter', 
               values='Quantity',
               aggfunc='sum')
Out[34]:
Deadline Quarter 2019Q4 2020Q1 2020Q2 2020Q3
Channel
Bullseye 2212 3413 3034 377
Shoppe.com 11424 9969 11212 1757
Understock.com 18518 21614 23222 3484
Walcart 3816 5174 3921 411
iBay.com 9711 11525 11995 1317
In [35]:
pd.pivot_table(ledger_df, 
               index='Channel', 
               columns='Deadline Quarter', 
               values='Quantity', 
               aggfunc='sum',
               margins=True,
               margins_name='TOTAL')
Out[35]:
Deadline Quarter 2019Q4 2020Q1 2020Q2 2020Q3 TOTAL
Channel
Bullseye 2212 3413 3034 377 9036
Shoppe.com 11424 9969 11212 1757 34362
Understock.com 18518 21614 23222 3484 66838
Walcart 3816 5174 3921 411 13322
iBay.com 9711 11525 11995 1317 34548
TOTAL 45681 51695 53384 7346 158106