26  DuckDB

import duckdb
import polars as pl

26.1 Setup Connection

By default, connect()’s database argument defaults to :memory:, creating a non-persistent, in-memory object.

con = duckdb.connect()

26.2 Settings

con.execute(
    """
    PRAGMA enable_progress_bar;
    """
)
<duckdb.DuckDBPyConnection at 0x12344c770>

26.3 Read CSV

con.execute(
    "SELECT * FROM read_csv_auto('~/icloud/Data/iris.csv');"
)
<duckdb.DuckDBPyConnection at 0x12344c770>
con.execute(
    "CREATE TABLE iris AS SELECT * FROM '~/icloud/Data/iris.csv';"
)
<duckdb.DuckDBPyConnection at 0x12344c770>
con.execute(
    """
    CREATE TABLE ir AS SELECT * FROM 
    read_csv_auto('/Users/egenn/icloud/Data/iris.csv', 
    delim=',', header=True);
    """
)
<duckdb.DuckDBPyConnection at 0x12344c770>

26.3.1 Fetch data to pandas DataFrame:

iris = con.execute(
    "SELECT * from iris"
).fetchdf()
iris.head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
type(iris)
pandas.core.frame.DataFrame

26.3.2 Fetch data to polars DataFrame:

iris = pl.from_arrow(
    con.execute(
        "SELECT * from iris"
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
f64 f64 f64 f64 str
5.1 3.5 1.4 0.2 "setosa"
4.9 3.0 1.4 0.2 "setosa"
4.7 3.2 1.3 0.2 "setosa"
4.6 3.1 1.5 0.2 "setosa"
5.0 3.6 1.4 0.2 "setosa"
type(iris)
polars.internals.dataframe.frame.DataFrame

26.4 Read CSV specifying schema

con.execute(
    """
    CREATE TABLE iris2(
        "Sepal.Length" DOUBLE,
        "Sepal.Width" DOUBLE, 
        "Petal.Length" DOUBLE,
        "Petal.Width" DOUBLE, 
        Species VARCHAR);
    COPY iris2 FROM '/Users/egenn/icloud/Data/iris.csv' (AUTO_DETECT TRUE);
    """
)
con.execute("SELECT * FROM iris2").df().head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

26.4.1 DuckDBPyRelation

ir = duckdb.from_csv_auto('/Users/egenn/icloud/Data/iris.csv')
ir
---------------------
--- Relation Tree ---
---------------------
read_csv_auto(/Users/egenn/icloud/Data/iris.csv)

---------------------
-- Result Columns  --
---------------------
- Sepal.Length (DOUBLE)
- Sepal.Width (DOUBLE)
- Petal.Length (DOUBLE)
- Petal.Width (DOUBLE)
- Species (VARCHAR)

---------------------
-- Result Preview  --
---------------------
Sepal.Length    Sepal.Width Petal.Length    Petal.Width Species 
DOUBLE  DOUBLE  DOUBLE  DOUBLE  VARCHAR 
[ Rows: 10]
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
4.6 3.4 1.4 0.3 setosa
5.0 3.4 1.5 0.2 setosa
4.4 2.9 1.4 0.2 setosa
4.9 3.1 1.5 0.1 setosa
type(ir)
duckdb.DuckDBPyRelation

26.5 Select at read

Note: Sepal.Length below must have double quotes, while Species may have double or none:

iris = pl.from_arrow(
    con.execute(
        """
        SELECT "Sepal.Length", Species
        FROM read_csv_auto('/Users/egenn/icloud/Data/iris.csv');
        """
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 2)
Sepal.Length Species
f64 str
5.1 "setosa"
4.9 "setosa"
4.7 "setosa"
4.6 "setosa"
5.0 "setosa"

Concatenate a string to select multiple columns, instead of passing a list or tuple:

cols = tuple(["Sepal.Length", "Species"])
cols = '"' + '", "'.join(cols) + '"'
iris = pl.from_arrow(
    con.execute(
        f"""
        SELECT {cols}
        FROM read_csv_auto('/Users/egenn/icloud/Data/iris.csv');
        """
    ).fetch_arrow_table()
)
iris.head()
shape: (5, 2)
Sepal.Length Species
f64 str
5.1 "setosa"
4.9 "setosa"
4.7 "setosa"
4.6 "setosa"
5.0 "setosa"

26.6 Filter at read

26.6.1 Read entire table

ids = tuple([21, 22, 23, 24, 25])
con.execute(
    f"""
    CREATE TABLE synth1 AS
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    """
)
con.execute("SELECT * FROM synth1").df()
ID Age Group
0 1 49.001420 D
1 2 28.266542 B
2 3 31.025146 A
3 4 25.554986 C
4 5 36.689864 B
... ... ... ...
495 496 36.633847 A
496 497 34.457815 C
497 498 49.096331 A
498 499 31.708578 A
499 500 53.850253 A

500 rows × 3 columns

26.6.2 Read only specific IDs

ids = tuple([21, 22, 23, 24, 25])
con.execute(
    f"""
    CREATE TABLE synth1f AS
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    WHERE ID in {ids};
    """
)
con.execute("SELECT * FROM synth1f").df()
ID Age Group
0 21 43.644608 A
1 22 43.836510 A
2 23 51.134057 C
3 24 52.115098 B
4 25 36.516745 C

26.7 Using Queries with the Python API

ids = tuple([21, 22, 23, 24, 25])
sq = duckdb.query(
    f"""
    SELECT * FROM read_csv_auto('/Users/egenn/icloud/Data/synth1.csv')
    WHERE ID in {ids};
    """
)
sq
---------------------
--- Relation Tree ---
---------------------
Subquery

---------------------
-- Result Columns  --
---------------------
- ID (INTEGER)
- Age (DOUBLE)
- Group (VARCHAR)

---------------------
-- Result Preview  --
---------------------
ID  Age Group   
INTEGER DOUBLE  VARCHAR 
[ Rows: 5]
21  43.6446078379911    A
22  43.8365100056002    A
23  51.1340571729247    C
24  52.1150978723465    B
25  36.516745383723 C

You can execute the query at any time:

sq.execute().df()
ID Age Group
0 21 43.644608 A
1 22 43.836510 A
2 23 51.134057 C
3 24 52.115098 B
4 25 36.516745 C

26.8 Compose queries

How to specify path, select, and filter programmatically

26.8.1 Select

distinct = "DISTINCT" # set to "" to not filter unique rows
sep = ","
path = "/Users/egenn/icloud/Data/synth1.csv"
cols = tuple(["ID", "Age"])
cols = '"' + '", "'.join(cols) + '"'
ignore_errors = True
# cols = "*" # to select all programmatically
sq = duckdb.query(
    f"""
    SELECT {distinct + cols} FROM 
    read_csv_auto('{path}', sep = '{sep}', ignore_errors = {ignore_errors});
    """
)
sq.execute().df()
ID Age
0 1 49.001420
1 2 28.266542
2 3 31.025146
3 4 25.554986
4 5 36.689864
... ... ...
495 411 45.938594
496 416 30.819571
497 209 33.385111
498 384 42.311528
499 435 31.042009

500 rows × 2 columns

26.8.2 Filter

Suppose you have a list of IDs in a list, convert to tuple first

ids = list(range(80, 91))
filter_col = "ID"
sq = sq.filter(f"{filter_col} in {tuple(ids)}")
sq.execute().df()
ID Age
0 80 48.873065
1 81 29.840911
2 82 58.708691
3 83 50.761984
4 84 29.255924
5 85 18.044240
6 86 45.345446
7 87 53.437334
8 88 53.850035
9 89 67.469268
10 90 39.540553

26.9 Resources