import pandas as pd
import numpy as np
import re
12 Basics
12.1 Construct DataFrame by column
You can create a DataFrame from a dictionary of arrays/lists, i.e. by inputting data column by column:
= pd.DataFrame({"Fruit":["mango", "banana", "tangerine"],
dat1 "Rating":[8, 9, 7],
"Cost":[5, 2, 3]})
dat1
Fruit | Rating | Cost | |
---|---|---|---|
0 | mango | 8 | 5 |
1 | banana | 9 | 2 |
2 | tangerine | 7 | 3 |
type(dat1)
pandas.core.frame.DataFrame
12.2 Construct DataFrame by row
You can create a pandas DataFrame from a list of lists, by inputting data case by case:
= pd.DataFrame([["mango", 8, 5],
dat2 "banana", 9, 2],
["tangerine", 7, 3]],
[= ["Fruit", "Rating", "Cost"])
columns dat2
Fruit | Rating | Cost | |
---|---|---|---|
0 | mango | 8 | 5 |
1 | banana | 9 | 2 |
2 | tangerine | 7 | 3 |
12.3 Read csv
= pd.read_csv("/Users/egenn/icloud/Data/iris.csv")
dat dat
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
12.4 Get dimensions: shape
dat.shape
(150, 5)
12.5 Show first n rows: head()
defaults to first 5 rows
dat.head()
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
3) dat.head(
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
Hide last n
rows with negative indexing:
-145) dat.head(
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
12.6 Get column names: columns
dat.columns
Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
'Species'],
dtype='object')
12.7 Change column names
= [re.sub("\.", "_", col) for col in list(dat.columns)]
dat.columns dat.columns
<>:1: SyntaxWarning:
invalid escape sequence '\.'
<>:1: SyntaxWarning:
invalid escape sequence '\.'
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_52386/1385048352.py:1: SyntaxWarning:
invalid escape sequence '\.'
Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
'Species'],
dtype='object')
12.8 Get row names: index.values
dat.index.values
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 145, 146, 147, 148, 149])
12.9 Get column data types: dtypes
dat.dtypes
Sepal_Length float64
Sepal_Width float64
Petal_Length float64
Petal_Width float64
Species object
dtype: object
12.10 index by integer location: iloc[]
0-based indexing
0, 0] dat.iloc[
np.float64(5.1)
Note: The first element of a range is included, the last is excluded.
0:3, 0:4] dat.iloc[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
12.11 Index by name: loc[]
3:9] dat.loc[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
6 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
8 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
12.12 iloc
vs. loc
2:5] dat.iloc[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
2:5] dat.loc[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
Indexing with
iloc
vs. loc
When you use iloc
with a range, the first element is included and the last is excluded as with any numeric range.
When you use loc
the range refers to row names and both first and last elements are included.
12.13 Select column by name: ["name"]
"Species"] dat[
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
...
145 virginica
146 virginica
147 virginica
148 virginica
149 virginica
Name: Species, Length: 150, dtype: object
12.14 Filter cases: [...]
"Sepal_Length"] > dat["Sepal_Length"].mean()] dat[dat[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
50 | 7.0 | 3.2 | 4.7 | 1.4 | versicolor |
51 | 6.4 | 3.2 | 4.5 | 1.5 | versicolor |
52 | 6.9 | 3.1 | 4.9 | 1.5 | versicolor |
54 | 6.5 | 2.8 | 4.6 | 1.5 | versicolor |
56 | 6.3 | 3.3 | 4.7 | 1.6 | versicolor |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
70 rows × 5 columns
"Species"].isin(["versicolor"])].head() dat[dat[
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
50 | 7.0 | 3.2 | 4.7 | 1.4 | versicolor |
51 | 6.4 | 3.2 | 4.5 | 1.5 | versicolor |
52 | 6.9 | 3.1 | 4.9 | 1.5 | versicolor |
53 | 5.5 | 2.3 | 4.0 | 1.3 | versicolor |
54 | 6.5 | 2.8 | 4.6 | 1.5 | versicolor |
"Sepal_Length"].min() dat[
np.float64(4.3)