import polars as pl
import numpy as np
import re
iris = pl.read_csv("/Users/egenn/icloud/Data/iris.csv",
dtypes = {"Species": pl.Categorical})
iris
shape: (150, 5)
Sepal.Length
|
Sepal.Width
|
Petal.Length
|
Petal.Width
|
Species
|
f64
|
f64
|
f64
|
f64
|
cat
|
5.1
|
3.5
|
1.4
|
0.2
|
"setosa"
|
4.9
|
3.0
|
1.4
|
0.2
|
"setosa"
|
4.7
|
3.2
|
1.3
|
0.2
|
"setosa"
|
4.6
|
3.1
|
1.5
|
0.2
|
"setosa"
|
5.0
|
3.6
|
1.4
|
0.2
|
"setosa"
|
5.4
|
3.9
|
1.7
|
0.4
|
"setosa"
|
4.6
|
3.4
|
1.4
|
0.3
|
"setosa"
|
5.0
|
3.4
|
1.5
|
0.2
|
"setosa"
|
4.4
|
2.9
|
1.4
|
0.2
|
"setosa"
|
4.9
|
3.1
|
1.5
|
0.1
|
"setosa"
|
5.4
|
3.7
|
1.5
|
0.2
|
"setosa"
|
4.8
|
3.4
|
1.6
|
0.2
|
"setosa"
|
...
|
...
|
...
|
...
|
...
|
6.0
|
3.0
|
4.8
|
1.8
|
"virginica"
|
6.9
|
3.1
|
5.4
|
2.1
|
"virginica"
|
6.7
|
3.1
|
5.6
|
2.4
|
"virginica"
|
6.9
|
3.1
|
5.1
|
2.3
|
"virginica"
|
5.8
|
2.7
|
5.1
|
1.9
|
"virginica"
|
6.8
|
3.2
|
5.9
|
2.3
|
"virginica"
|
6.7
|
3.3
|
5.7
|
2.5
|
"virginica"
|
6.7
|
3.0
|
5.2
|
2.3
|
"virginica"
|
6.3
|
2.5
|
5.0
|
1.9
|
"virginica"
|
6.5
|
3.0
|
5.2
|
2.0
|
"virginica"
|
6.2
|
3.4
|
5.4
|
2.3
|
"virginica"
|
5.9
|
3.0
|
5.1
|
1.8
|
"virginica"
|
Clean column names
iris.columns = [re.sub("\.", "_", col) for col in iris.columns]
iris.columns
['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']
Groupby + Mean
Get mean Sepal_Length by Species
iris.groupby('Species').agg(
pl.col('Sepal_Length').mean().alias('Sepal Length (mean)'))
shape: (3, 2)
Species
|
Sepal Length (mean)
|
cat
|
f64
|
"virginica"
|
6.588
|
"versicolor"
|
5.936
|
"setosa"
|
5.006
|
To perform the above with lazy evaluation, just add df.lazy()
and end with df.collect()
:
iris.lazy().groupby('Species').agg(
pl.col('Sepal_Length').mean().alias('Sepal Length (mean)')).collect()
shape: (3, 2)
Species
|
Sepal Length (mean)
|
cat
|
f64
|
"versicolor"
|
5.936
|
"setosa"
|
5.006
|
"virginica"
|
6.588
|
q = (
iris
.lazy()
.groupby("Species")
.agg(
pl.col("Sepal_Length").mean().alias("Sepal Length (mean)")
)
)
q.collect()
shape: (3, 2)
Species
|
Sepal Length (mean)
|
cat
|
f64
|
"setosa"
|
5.006
|
"virginica"
|
6.588
|
"versicolor"
|
5.936
|
Groupby + multiple metrics
You can get multiple summary measure and sort results:
iris.groupby("Species").agg(
[
pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
]
).sort('Species')
shape: (3, 3)
Species
|
Sepal_Length (Mean)
|
Petal_Length (Mean)
|
cat
|
f64
|
f64
|
"setosa"
|
5.006
|
1.462
|
"versicolor"
|
5.936
|
4.26
|
"virginica"
|
6.588
|
5.552
|
with lazy evaluation:
iris.lazy().groupby('Species').agg(
[
pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
]
).sort('Species')
NAIVE QUERY PLAN
run LazyFrame.show_graph() to see the optimized version
or build a query and collect()
it:
q = (
iris
.lazy()
.groupby("Species")
.agg(
[
pl.col("Sepal_Length").mean().alias("Sepal_Length (Mean)"),
pl.col("Petal_Length").mean().alias("Petal_Length (Mean)")
]
)
.sort("Species")
)
q.collect()
shape: (3, 3)
Species
|
Sepal_Length (Mean)
|
Petal_Length (Mean)
|
cat
|
f64
|
f64
|
"setosa"
|
5.006
|
1.462
|
"versicolor"
|
5.936
|
4.26
|
"virginica"
|
6.588
|
5.552
|
Groupby counts
iris.groupby("Species").agg(pl.count())
shape: (3, 2)
Species
|
count
|
cat
|
u32
|
"virginica"
|
50
|
"setosa"
|
50
|
"versicolor"
|
50
|
Remove duplicates and repeat
iris.groupby("Species").agg(pl.count())
shape: (3, 2)
Species
|
count
|
cat
|
u32
|
"versicolor"
|
50
|
"virginica"
|
49
|
"setosa"
|
50
|
One virginica row was removed
iris.groupby("Species").agg(
[
pl.count().alias("Count"),
pl.max('Sepal_Length').alias("Sepal Length (max)"),
pl.min('Sepal_Length').alias("Sepal Length (min)")
]
)
shape: (3, 4)
Species
|
Count
|
Sepal Length (max)
|
Sepal Length (min)
|
cat
|
u32
|
f64
|
f64
|
"setosa"
|
50
|
5.8
|
4.3
|
"versicolor"
|
50
|
7.0
|
4.9
|
"virginica"
|
49
|
7.9
|
4.9
|
the same with lazy evaluation
iris.lazy().groupby("Species").agg(
[
pl.count().alias("Count"),
pl.max('Sepal_Length').alias("Sepal Length (max)"),
pl.min('Sepal_Length').alias("Sepal Length (min)")
]
).collect()
shape: (3, 4)
Species
|
Count
|
Sepal Length (max)
|
Sepal Length (min)
|
cat
|
u32
|
f64
|
f64
|
"versicolor"
|
50
|
7.0
|
4.9
|
"virginica"
|
49
|
7.9
|
4.9
|
"setosa"
|
50
|
5.8
|
4.3
|
or
q = (
iris
.lazy()
.groupby('Species')
.agg(
[
pl.count().alias("Count"),
pl.max('Sepal_Length').alias("Sepal Length (max)"),
pl.min('Sepal_Length').alias("Sepal Length (min)")
]
)
)
q.collect()
shape: (3, 4)
Species
|
Count
|
Sepal Length (max)
|
Sepal Length (min)
|
cat
|
u32
|
f64
|
f64
|
"versicolor"
|
50
|
7.0
|
4.9
|
"virginica"
|
49
|
7.9
|
4.9
|
"setosa"
|
50
|
5.8
|
4.3
|
Columnwise operations
iris.select(pl.col(iris.columns[0:4]))
shape: (149, 4)
Sepal_Length
|
Sepal_Width
|
Petal_Length
|
Petal_Width
|
f64
|
f64
|
f64
|
f64
|
5.1
|
3.5
|
1.4
|
0.2
|
4.9
|
3.0
|
1.4
|
0.2
|
4.7
|
3.2
|
1.3
|
0.2
|
4.6
|
3.1
|
1.5
|
0.2
|
5.0
|
3.6
|
1.4
|
0.2
|
5.4
|
3.9
|
1.7
|
0.4
|
4.6
|
3.4
|
1.4
|
0.3
|
5.0
|
3.4
|
1.5
|
0.2
|
4.4
|
2.9
|
1.4
|
0.2
|
4.9
|
3.1
|
1.5
|
0.1
|
5.4
|
3.7
|
1.5
|
0.2
|
4.8
|
3.4
|
1.6
|
0.2
|
...
|
...
|
...
|
...
|
6.4
|
3.1
|
5.5
|
1.8
|
6.0
|
3.0
|
4.8
|
1.8
|
6.9
|
3.1
|
5.4
|
2.1
|
6.7
|
3.1
|
5.6
|
2.4
|
6.9
|
3.1
|
5.1
|
2.3
|
6.8
|
3.2
|
5.9
|
2.3
|
6.7
|
3.3
|
5.7
|
2.5
|
6.7
|
3.0
|
5.2
|
2.3
|
6.3
|
2.5
|
5.0
|
1.9
|
6.5
|
3.0
|
5.2
|
2.0
|
6.2
|
3.4
|
5.4
|
2.3
|
5.9
|
3.0
|
5.1
|
1.8
|