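# Install or upgrade pandas. The leading `!` runs the command in a shell from inside the
# notebook; when typing it directly in a terminal, drop the `!`.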
# ! pip3 install pandas -U 
! pip install pandas -U

Looking in indexes: https://mirrors.163.com/pypi/simple/
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (1.3.4)
Collecting pandas
  Using cached https://mirrors.163.com/pypi/packages/48/b4/1081d66b71c4dfc1bc1e19d6f2abbf93ed42f69df7703eb323742d45423e/pandas-1.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
  Using cached https://mirrors.163.com/pypi/packages/03/ea/98d488a4047b3fd8075b5c1e00469ad42d715e2c1e4fd15fa1ffaef8d635/pandas-1.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas) (2.8.1)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.9/dist-packages (from pandas) (1.21.2)


import pandas as pd


# Build a Series from a plain list. No index is given, so pandas supplies the default
# integer positions 0..n-1 as labels.
counts = pd.Series(data=[632, 1638, 569, 115])
counts

0     632
1    1638
2     569
3     115
dtype: int64


# The data held by the Series, exposed as a NumPy array.
counts.values

array([ 632, 1638,  569,  115])


# The index object holding the labels; here it is the default RangeIndex because none was supplied.
counts.index

RangeIndex(start=0, stop=4, step=1)


# Same counts as before, but each value is labelled with a phylum name instead of the
# default integer position.
bacteria = pd.Series(
    data=[632, 1638, 569, 115],
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'],
)

bacteria

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64


# Look up a single value by its index label.
bacteria['Actinobacteria']

569


# The index now holds the string labels that were passed at construction.
bacteria.index

Index(['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'], dtype='object')


# Build a boolean mask (a plain Python list) that is True for every label ending in 'bacteria'.
[name.endswith('bacteria') for name in bacteria.index]

[False, True, True, False]


# Keep only the entries whose label ends in 'bacteria'. The mask is computed with the
# index's vectorised string accessor rather than a Python-level loop.
bacteria[bacteria.index.str.endswith('bacteria')]

Proteobacteria    1638
Actinobacteria     569
dtype: int64


# Select the first element by POSITION. Use .iloc for this: plain `bacteria[0]` on a
# label-indexed Series relies on an integer-key fallback that newer pandas deprecates.
bacteria.iloc[0]

632


# Give the Series and its index descriptive names; both show up in the printed repr.
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
bacteria

phylum
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
Name: counts, dtype: int64


# Boolean indexing: the comparison yields a True/False Series, which keeps only the
# entries whose count exceeds 1000.
bacteria[bacteria>1000]

phylum
Proteobacteria    1638
Name: counts, dtype: int64


# A dict maps naturally onto a Series: keys become the index labels, values the data.
bacteria_dict = {
    'Firmicutes': 632,
    'Proteobacteria': 1638,
    'Actinobacteria': 569,
    'Bacteroidetes': 115,
}
pd.Series(bacteria_dict)

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64


# An explicit index picks and orders the dict entries by label. A label with no matching
# key ('Cyanobacteria') becomes NaN, which is why the resulting dtype is float.
bacteria2 = pd.Series(
    bacteria_dict,
    index=['Cyanobacteria', 'Firmicutes', 'Proteobacteria', 'Actinobacteria'],
)
bacteria2

Cyanobacteria        NaN
Firmicutes         632.0
Proteobacteria    1638.0
Actinobacteria     569.0
dtype: float64


# Element-wise test for missing values: True where the entry is NaN.
bacteria2.isnull()

Cyanobacteria      True
Firmicutes        False
Proteobacteria    False
Actinobacteria    False
dtype: bool


# Arithmetic between Series aligns on index labels, not on position. A label present in
# only one operand (or whose value is NaN) produces NaN in the result.
bacteria + bacteria2

Actinobacteria    1138.0
Bacteroidetes        NaN
Cyanobacteria        NaN
Firmicutes        1264.0
Proteobacteria    3276.0
dtype: float64


# Build a DataFrame from a dict of equal-length columns: one row per (patient, phylum)
# measurement. The four phyla repeat once for each of the two patients.
data = pd.DataFrame({
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
    'patient': [1] * 4 + [2] * 4,
    'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'] * 2,
})
data


# Indexing with a list of column names returns a DataFrame whose columns follow the order
# of the list, which makes this a simple way to reorder columns for display.
data[['phylum','value','patient']]


# The column labels, stored as an Index object.
data.columns

Index(['value', 'patient', 'phylum'], dtype='object')


# Dict-style access with a single column name returns that column as a Series.
data['value']

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64


# Attribute-style access returns the same column as data['value'].
data.value

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64


# A single column accessed this way is a Series.
type(data.value)

pandas.core.series.Series


# Indexing with a LIST of names returns a DataFrame, even when the list has one element.
type(data[['value']])

pandas.core.frame.DataFrame


# .iloc selects by integer position. A single integer returns that row as a Series whose
# index is the column names.
data.iloc[0]

value             632
patient             1
phylum     Firmicutes
Name: 0, dtype: object


# Positional slice: rows at positions 2, 3 and 4 (the end position is excluded).
data.iloc[2:5]


# A callable receives the DataFrame and returns the indexer; this one builds a boolean
# mask that is True where the index label is even.
data.iloc[lambda x: x.index % 2 == 0]


# A list of integer positions: the second and third rows.
data.iloc[[1,2]]


# A boolean list with one entry per row keeps the rows marked True, i.e. every other row
# starting with the first.
data.iloc[[True, False] * 4]


# Two positions (row, column) return a single cell: first row, second column ('patient').
data.iloc[0, 1] # With scalar integers

1


# Rows at positions 0 and 4, columns at positions 0 and 2.
data.iloc[[0, 4], [0, 2]] # With lists of integers.


# Slices on both axes: row positions 1-2 and column positions 0-1 (ends excluded).
data.iloc[1:3, 0:2]


# A small frame with string row labels, used to demonstrate label-based selection.
# Each inner list is one row; `columns` names the two values in it.
df = pd.DataFrame(
    data=[
        [1, 2],  # cobra
        [4, 5],  # viper
        [7, 8],  # sidewinder
    ],
    index=['cobra', 'viper', 'sidewinder'],
    columns=['max_speed', 'shield'],
)
df


# .loc selects by index LABEL rather than by position.
df.loc['viper']  # Single label returns the row as a Series.

max_speed    4
shield       5
Name: viper, dtype: int64


# Label slices with .loc include BOTH endpoints (unlike positional slices); the second
# argument picks a single column, so the result is a Series.
df.loc['cobra':'viper', 'max_speed']

cobra    1
viper    4
Name: max_speed, dtype: int64


# Selecting several rows at once by label.
df.loc[['viper', 'sidewinder']] # List of labels returns a DataFrame.


# A boolean list with one entry per row keeps the rows marked True (only the last row here).
df.loc[[False, False, True]]


# A boolean Series works the same way: rows whose 'shield' value exceeds 6.
df.loc[df['shield'] > 6]


# Switch back to `data`, whose index is the default integers 0..7.
data


# With .loc, 3:5 is a slice over index LABELS and includes both ends (labels 3, 4 and 5).
data.loc[3:5]


# Set value for all items matching the list of labels
# (rows 'viper' and 'sidewinder', column 'shield').
df.loc[['viper', 'sidewinder'], ['shield']] = 50
df


# A scalar assigned to a single row label is broadcast across every column of that row.
df.loc['cobra'] = 10 # Set value for an entire row

df


# `:` selects all rows, so the scalar is broadcast down the whole 'max_speed' column.
df.loc[:, 'max_speed'] = 30 # Set value for an entire column
df


# Bind the 'value' column to its own name. This does not make an independent copy of the
# data, so `vals` stays tied to `data`.
vals = data.value
vals

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64


# Writing through `vals` is the classic view-versus-copy pitfall: in the recorded run
# pandas emits SettingWithCopyWarning, and the 0 also shows up in `data` afterwards.
vals[5] = 0
vals

/tmp/ipykernel_229928/1693880163.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vals[5] = 0

0     632
1    1638
2     569
3     115
4     433
5       0
6     754
7     555
Name: value, dtype: int64


# Re-display the frame to check what the assignment through `vals` did to it.
data


# Working on an explicit copy leaves `data` untouched: the 1000 written here never
# appears in the frame's later outputs.
vals = data.value.copy()
vals[5] = 1000
data


# Modify a single cell with one .loc call (row label 3, column 'value'). The chained form
# `data.value[3] = 14` writes through an intermediate Series, which triggers
# SettingWithCopyWarning and is not guaranteed to update `data`.
data.loc[3, 'value'] = 14
data

/tmp/ipykernel_229928/2998967180.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.value[3] = 14


# Item assignment with a new name creates the column; a scalar is broadcast to every row.
data['year'] = 2013
data


# Pitfall: attribute assignment does NOT add a column. It only sets an ordinary Python
# attribute on the DataFrame object, so the displayed frame is unchanged.
data.treatment = 1
data


# The attribute simply holds the scalar that was assigned, not a column.
data.treatment

1


# Six treatment flags: deliberately shorter than the eight-row frame, to show how
# assigning a Series to a column aligns on the index.
treatment = pd.Series([0, 0, 0, 0, 1, 1])
treatment

0    0
1    0
2    0
3    0
4    1
5    1
dtype: int64


# Creating a column requires item assignment. A Series is aligned on the index, so rows
# with no matching label (6 and 7, since `treatment` has only six entries) receive NaN.
data['treatment'] = treatment
data


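# Unlike a Series, a plain list is not aligned on the index: its length must match the
# number of rows. Assigning this 4-element list to the 8-row frame raises a ValueError,
# so the cell is left commented out.
# month = ['Jan', 'Feb', 'Mar', 'Apr']
# data['month'] = month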


# The frame's contents as a NumPy array. Because the columns have mixed types, the
# array falls back to dtype=object.
data.values

array([[632, 1, 'Firmicutes', 2013, 0.0],
       [1638, 1, 'Proteobacteria', 2013, 0.0],
       [569, 1, 'Actinobacteria', 2013, 0.0],
       [14, 1, 'Bacteroidetes', 2013, 0.0],
       [433, 2, 'Firmicutes', 2013, 1.0],
       [0, 2, 'Proteobacteria', 2013, 1.0],
       [754, 2, 'Actinobacteria', 2013, nan],
       [555, 2, 'Bacteroidetes', 2013, nan]], dtype=object)

Data Wrangling with Pandas

Introduction to Pandas

Key features of Pandas:

Pandas Data Structures: Series

Pandas Data Structures: DataFrame

Indexing

Set values

Lab

	value	patient	phylum
0	632	1	Firmicutes
1	1638	1	Proteobacteria
2	569	1	Actinobacteria
3	115	1	Bacteroidetes
4	433	2	Firmicutes
5	1130	2	Proteobacteria
6	754	2	Actinobacteria
7	555	2	Bacteroidetes

	value	patient	phylum	year
0	632	1	Firmicutes	2013
1	1638	1	Proteobacteria	2013
2	569	1	Actinobacteria	2013
3	14	1	Bacteroidetes	2013
4	433	2	Firmicutes	2013
5	0	2	Proteobacteria	2013
6	754	2	Actinobacteria	2013
7	555	2	Bacteroidetes	2013