import polars as pl
import re
21 Basics
21.1 Construct DataFrame by column
You can create a polars DataFrame from a dictionary of arrays/lists, i.e. by inputting column by column - just like in pandas:
dat1 = pl.DataFrame({"Fruit":["mango", "banana", "tangerine"],
                     "Rating":[8, 9, 7],
                     "Cost":[5, 2, 3]})
dat1
shape: (3, 3)
Fruit | Rating | Cost |
---|---|---|
str | i64 | i64 |
"mango" | 8 | 5 |
"banana" | 9 | 2 |
"tangerine" | 7 | 3 |
type(dat1)
polars.internals.dataframe.frame.DataFrame
21.2 Read csv
dat = pl.read_csv("/Users/egenn/icloud/Data/iris.csv",
                  dtype={'Species': pl.Categorical})
dat
/var/folders/rb/99nqfz7s2rb6d_p0d6yxtbxc0000gn/T/ipykernel_40505/3554946109.py:1: DeprecationWarning:
`dtype` is deprecated as an argument to `read_csv`; use `dtypes` instead.
shape: (150, 5)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
5.4 | 3.9 | 1.7 | 0.4 | "setosa" |
4.6 | 3.4 | 1.4 | 0.3 | "setosa" |
5.0 | 3.4 | 1.5 | 0.2 | "setosa" |
4.4 | 2.9 | 1.4 | 0.2 | "setosa" |
4.9 | 3.1 | 1.5 | 0.1 | "setosa" |
5.4 | 3.7 | 1.5 | 0.2 | "setosa" |
4.8 | 3.4 | 1.6 | 0.2 | "setosa" |
... | ... | ... | ... | ... |
6.0 | 3.0 | 4.8 | 1.8 | "virginica" |
6.9 | 3.1 | 5.4 | 2.1 | "virginica" |
6.7 | 3.1 | 5.6 | 2.4 | "virginica" |
6.9 | 3.1 | 5.1 | 2.3 | "virginica" |
5.8 | 2.7 | 5.1 | 1.9 | "virginica" |
6.8 | 3.2 | 5.9 | 2.3 | "virginica" |
6.7 | 3.3 | 5.7 | 2.5 | "virginica" |
6.7 | 3.0 | 5.2 | 2.3 | "virginica" |
6.3 | 2.5 | 5.0 | 1.9 | "virginica" |
6.5 | 3.0 | 5.2 | 2.0 | "virginica" |
6.2 | 3.4 | 5.4 | 2.3 | "virginica" |
5.9 | 3.0 | 5.1 | 1.8 | "virginica" |
21.3 Get dimensions: shape
dat.shape
(150, 5)
21.4 Show first n rows: head()
defaults to first 5 rows
dat.head()
shape: (5, 5)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
dat.head(3)
shape: (3, 5)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
21.5 Get & set column names: df.columns
dat.columns
['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
dat.columns = [re.sub("\.", "_", col) for col in list(dat.columns)]
dat.columns
['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']
21.6 Get column data types: df.dtypes
dat.dtypes
[polars.datatypes.Float64,
polars.datatypes.Float64,
polars.datatypes.Float64,
polars.datatypes.Float64,
polars.datatypes.Categorical]
21.7 Get names and types: df.schema
dat.schema
{'Sepal_Length': polars.datatypes.Float64,
'Sepal_Width': polars.datatypes.Float64,
'Petal_Length': polars.datatypes.Float64,
'Petal_Width': polars.datatypes.Float64,
'Species': polars.datatypes.Categorical}
21.8 Indexing
Polars favors using `df.select()` and `df.filter()` for indexing columns and rows. Square bracket indexing is also available (may be removed).
21.9 Select
dat.select("Species")
shape: (150, 1)
Species |
---|
cat |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
"setosa" |
... |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"virginica" |
"Sepal_Length", "Species"]) dat.select([
shape: (150, 2)
Sepal_Length | Species |
---|---|
f64 | cat |
5.1 | "setosa" |
4.9 | "setosa" |
4.7 | "setosa" |
4.6 | "setosa" |
5.0 | "setosa" |
5.4 | "setosa" |
4.6 | "setosa" |
5.0 | "setosa" |
4.4 | "setosa" |
4.9 | "setosa" |
5.4 | "setosa" |
4.8 | "setosa" |
... | ... |
6.0 | "virginica" |
6.9 | "virginica" |
6.7 | "virginica" |
6.9 | "virginica" |
5.8 | "virginica" |
6.8 | "virginica" |
6.7 | "virginica" |
6.7 | "virginica" |
6.3 | "virginica" |
6.5 | "virginica" |
6.2 | "virginica" |
5.9 | "virginica" |
You can (for now?) also index using brackets:
dat[10:15, "Sepal_Length":"Petal_Length"]
shape: (5, 3)
Sepal_Length | Sepal_Width | Petal_Length |
---|---|---|
f64 | f64 | f64 |
5.4 | 3.7 | 1.5 |
4.8 | 3.4 | 1.6 |
4.8 | 3.0 | 1.4 |
4.3 | 3.0 | 1.1 |
5.8 | 4.0 | 1.2 |
dat[0:10, 2:5]
shape: (10, 3)
Petal_Length | Petal_Width | Species |
---|---|---|
f64 | f64 | cat |
1.4 | 0.2 | "setosa" |
1.4 | 0.2 | "setosa" |
1.3 | 0.2 | "setosa" |
1.5 | 0.2 | "setosa" |
1.4 | 0.2 | "setosa" |
1.7 | 0.4 | "setosa" |
1.4 | 0.3 | "setosa" |
1.5 | 0.2 | "setosa" |
1.4 | 0.2 | "setosa" |
1.5 | 0.1 | "setosa" |
21.10 Filter
dat.filter(pl.col("Species") == "versicolor")
shape: (50, 5)
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
7.0 | 3.2 | 4.7 | 1.4 | "versicolor" |
6.4 | 3.2 | 4.5 | 1.5 | "versicolor" |
6.9 | 3.1 | 4.9 | 1.5 | "versicolor" |
5.5 | 2.3 | 4.0 | 1.3 | "versicolor" |
6.5 | 2.8 | 4.6 | 1.5 | "versicolor" |
5.7 | 2.8 | 4.5 | 1.3 | "versicolor" |
6.3 | 3.3 | 4.7 | 1.6 | "versicolor" |
4.9 | 2.4 | 3.3 | 1.0 | "versicolor" |
6.6 | 2.9 | 4.6 | 1.3 | "versicolor" |
5.2 | 2.7 | 3.9 | 1.4 | "versicolor" |
5.0 | 2.0 | 3.5 | 1.0 | "versicolor" |
5.9 | 3.0 | 4.2 | 1.5 | "versicolor" |
... | ... | ... | ... | ... |
5.6 | 3.0 | 4.1 | 1.3 | "versicolor" |
5.5 | 2.5 | 4.0 | 1.3 | "versicolor" |
5.5 | 2.6 | 4.4 | 1.2 | "versicolor" |
6.1 | 3.0 | 4.6 | 1.4 | "versicolor" |
5.8 | 2.6 | 4.0 | 1.2 | "versicolor" |
5.0 | 2.3 | 3.3 | 1.0 | "versicolor" |
5.6 | 2.7 | 4.2 | 1.3 | "versicolor" |
5.7 | 3.0 | 4.2 | 1.2 | "versicolor" |
5.7 | 2.9 | 4.2 | 1.3 | "versicolor" |
6.2 | 2.9 | 4.3 | 1.3 | "versicolor" |
5.1 | 2.5 | 3.0 | 1.1 | "versicolor" |
5.7 | 2.8 | 4.1 | 1.3 | "versicolor" |
dat.filter(pl.col("Species").cast(pl.Utf8).is_in(['versicolor', 'virginica']))
shape: (100, 5)
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
7.0 | 3.2 | 4.7 | 1.4 | "versicolor" |
6.4 | 3.2 | 4.5 | 1.5 | "versicolor" |
6.9 | 3.1 | 4.9 | 1.5 | "versicolor" |
5.5 | 2.3 | 4.0 | 1.3 | "versicolor" |
6.5 | 2.8 | 4.6 | 1.5 | "versicolor" |
5.7 | 2.8 | 4.5 | 1.3 | "versicolor" |
6.3 | 3.3 | 4.7 | 1.6 | "versicolor" |
4.9 | 2.4 | 3.3 | 1.0 | "versicolor" |
6.6 | 2.9 | 4.6 | 1.3 | "versicolor" |
5.2 | 2.7 | 3.9 | 1.4 | "versicolor" |
5.0 | 2.0 | 3.5 | 1.0 | "versicolor" |
5.9 | 3.0 | 4.2 | 1.5 | "versicolor" |
... | ... | ... | ... | ... |
6.0 | 3.0 | 4.8 | 1.8 | "virginica" |
6.9 | 3.1 | 5.4 | 2.1 | "virginica" |
6.7 | 3.1 | 5.6 | 2.4 | "virginica" |
6.9 | 3.1 | 5.1 | 2.3 | "virginica" |
5.8 | 2.7 | 5.1 | 1.9 | "virginica" |
6.8 | 3.2 | 5.9 | 2.3 | "virginica" |
6.7 | 3.3 | 5.7 | 2.5 | "virginica" |
6.7 | 3.0 | 5.2 | 2.3 | "virginica" |
6.3 | 2.5 | 5.0 | 1.9 | "virginica" |
6.5 | 3.0 | 5.2 | 2.0 | "virginica" |
6.2 | 3.4 | 5.4 | 2.3 | "virginica" |
5.9 | 3.0 | 5.1 | 1.8 | "virginica" |
dat.filter(pl.col("Sepal_Length") < 4.5)
shape: (4, 5)
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
4.4 | 2.9 | 1.4 | 0.2 | "setosa" |
4.3 | 3.0 | 1.1 | 0.1 | "setosa" |
4.4 | 3.0 | 1.3 | 0.2 | "setosa" |
4.4 | 3.2 | 1.3 | 0.2 | "setosa" |
dat.filter(pl.col("Petal_Length") < dat["Petal_Length"].mean())
shape: (57, 5)
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | cat |
5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
5.4 | 3.9 | 1.7 | 0.4 | "setosa" |
4.6 | 3.4 | 1.4 | 0.3 | "setosa" |
5.0 | 3.4 | 1.5 | 0.2 | "setosa" |
4.4 | 2.9 | 1.4 | 0.2 | "setosa" |
4.9 | 3.1 | 1.5 | 0.1 | "setosa" |
5.4 | 3.7 | 1.5 | 0.2 | "setosa" |
4.8 | 3.4 | 1.6 | 0.2 | "setosa" |
... | ... | ... | ... | ... |
4.8 | 3.0 | 1.4 | 0.3 | "setosa" |
5.1 | 3.8 | 1.6 | 0.2 | "setosa" |
4.6 | 3.2 | 1.4 | 0.2 | "setosa" |
5.3 | 3.7 | 1.5 | 0.2 | "setosa" |
5.0 | 3.3 | 1.4 | 0.2 | "setosa" |
4.9 | 2.4 | 3.3 | 1.0 | "versicolor" |
5.0 | 2.0 | 3.5 | 1.0 | "versicolor" |
5.6 | 2.9 | 3.6 | 1.3 | "versicolor" |
5.7 | 2.6 | 3.5 | 1.0 | "versicolor" |
5.5 | 2.4 | 3.7 | 1.0 | "versicolor" |
5.0 | 2.3 | 3.3 | 1.0 | "versicolor" |
5.1 | 2.5 | 3.0 | 1.1 | "versicolor" |
df = pl.DataFrame({
    'ID': [1, 3, 5, 7 ],
    'Age': [45, 43, 23, 76]
})
df
shape: (4, 2)
ID | Age |
---|---|
i64 | i64 |
1 | 45 |
3 | 43 |
5 | 23 |
7 | 76 |
df.filter(pl.col("ID").is_in([3, 5]))
shape: (2, 2)
ID | Age |
---|---|
i64 | i64 |
3 | 43 |
5 | 23 |