duckreg
  • Home
  • Ibis
  • Compression
  • Linear
  • Panel
  • DML
  • GLMs
  • Fisher Scoring
  • Ridge
  • Inference
  • Examples
  • Performance
  1. Executed Examples
  • duckreg
  • Ibis Backends
  • Compression and Estimator Lifecycle
  • Linear Regression API
  • Panel Estimators
  • Compressed Double Machine Learning
  • Generalized Linear Models
  • Fisher Scoring and Multinomial GLMs
  • Compressed Ridge Regression
  • Inference and Variance Estimation
  • Executed Examples
  • Performance Comparisons

On this page

  • Setup
  • Linear Regression
  • Panel Mundlak
  • Double Demeaning
  • DML With Discrete Controls
  • GLMs
  • Multinomial Labels
  • Many-Label Counts

Executed Examples

This page runs compact versions of the repository notebooks. The data are synthetic and small enough for fast documentation renders, but the estimators execute the real package code.

Setup

Code
import os
import tempfile

import ibis
import numpy as np
import pandas as pd

from duckreg import (
    DBDML,
    DBDoubleDemeaning,
    DBLogisticRegression,
    DBMultinomialLogisticRegression,
    DBMundlak,
    DBPoissonMultinomialRegression,
    DBPoissonRegression,
    DBRegression,
)

pd.set_option("display.precision", 4)
rng = np.random.default_rng(2026)
tmpdir = tempfile.mkdtemp(prefix="duckreg_docs_")


def duckdb_backend(name, table, df):
    path = os.path.join(tmpdir, name)
    con = ibis.duckdb.connect(path)
    con.create_table(table, df, overwrite=True)
    return con

Linear Regression

n = 8_000
linear = pd.DataFrame(
    {
        "rowid": np.arange(n),
        "D": rng.integers(0, 2, n).astype(float),
        "f1": rng.integers(0, 20, n).astype(float),
        "f2": rng.integers(0, 8, n).astype(float),
        "cluster": rng.integers(0, 250, n),
    }
)
linear["Y"] = (
    1.0
    + 2.0 * linear["D"]
    + 0.10 * linear["f1"]
    - 0.05 * linear["f2"]
    + rng.normal(0, 1, n)
)

con = duckdb_backend("linear.db", "data", linear)

ols = DBRegression(
    db_name=None,
    connection=con,
    table_name="data",
    formula="Y ~ D + f1 + f2",
    cluster_col=None,
    seed=42,
    n_bootstraps=0,
)
ols.fit()
ols.fit_vcov()

pd.DataFrame(
    {
        "term": ["Intercept", "D", "f1", "f2"],
        "estimate": ols.summary()["point_estimate"],
        "std_error": ols.summary()["standard_error"],
    }
)
term estimate std_error
0 Intercept 1.0098 0.0303
1 D 2.0179 0.0223
2 f1 0.1004 0.0019
3 f2 -0.0512 0.0049

Compression reduces 8,000 raw rows to one row per unique covariate cell.

pd.DataFrame(
    {
        "raw_rows": [len(linear)],
        "compressed_rows": [len(ols.df_compressed)],
        "compression_ratio": [len(linear) / len(ols.df_compressed)],
    }
)
raw_rows compressed_rows compression_ratio
0 8000 320 25.0

Panel Mundlak

units, periods = 160, 6
unit = np.repeat(np.arange(units), periods)
time = np.tile(np.arange(periods), units)
unit_fe = rng.normal(0, 1, units)
time_fe = rng.normal(0, 0.4, periods)
D = rng.normal(size=len(unit)) + 0.4 * unit_fe[unit] + 0.2 * time_fe[time]
X = rng.normal(size=len(unit))
Y = 1.5 * D - 0.5 * X + unit_fe[unit] + time_fe[time] + rng.normal(0, 0.5, len(unit))
panel = pd.DataFrame({"unit": unit, "time": time, "D": D, "X": X, "Y": Y})

panel_con = duckdb_backend("panel.db", "panel_data", panel)
mundlak = DBMundlak(
    db_name=None,
    connection=panel_con,
    table_name="panel_data",
    outcome_var="Y",
    covariates=["D", "X"],
    unit_col="unit",
    time_col="time",
    seed=42,
    n_bootstraps=0,
)
mundlak.fit()

pd.DataFrame(
    mundlak.point_estimate,
    index=["Intercept"] + mundlak.rhs,
    columns=["estimate"],
).head(8)
estimate
Intercept 0.0156
D 1.4844
X -0.5138
avg_D_unit 1.3289
avg_X_unit -0.1031
avg_D_time 1.8313
avg_X_time -1.8936

Double Demeaning

twfe = DBDoubleDemeaning(
    db_name=None,
    connection=panel_con,
    table_name="panel_data",
    outcome_var="Y",
    treatment_var="D",
    unit_col="unit",
    time_col="time",
    seed=42,
    n_bootstraps=0,
)
twfe.fit()

pd.DataFrame(
    twfe.point_estimate,
    index=["Intercept", "ddot_D"],
    columns=["estimate"],
)
estimate
Intercept -0.0941
ddot_D 1.4960

DML With Discrete Controls

dml_data = linear.copy()
dml_data["X"] = rng.normal(size=len(dml_data))
dml_data["Y"] = (
    0.8 * dml_data["D"]
    + 0.6 * dml_data["X"]
    + dml_data.groupby(["f1", "f2"])["f1"].transform("mean") * 0.02
    + rng.normal(0, 1, len(dml_data))
)
dml_con = duckdb_backend("dml.db", "data", dml_data)

dml = DBDML(
    db_name=None,
    connection=dml_con,
    table_name="data",
    outcome_var="Y",
    treatment_var=["D", "X"],
    discrete_covars=["f1", "f2"],
    seed=42,
    n_bootstraps=0,
)
dml.fit()
pd.DataFrame({"term": ["D", "X"], "estimate": dml.point_estimate})
term estimate
0 D 0.7762
1 X 0.5858

GLMs

glm = linear[["D", "f1", "f2"]].copy()
eta = -1.0 + 1.1 * glm["D"] + 0.04 * glm["f1"] - 0.08 * glm["f2"]
glm["y_binary"] = rng.binomial(1, 1 / (1 + np.exp(-eta)))
mu = np.exp(0.2 + 0.25 * glm["D"] + 0.02 * glm["f1"] - 0.03 * glm["f2"])
glm["y_count"] = rng.poisson(mu)
glm_con = duckdb_backend("glm.db", "glm_data", glm)

logit = DBLogisticRegression(
    db_name=None,
    connection=glm_con,
    table_name="glm_data",
    formula="y_binary ~ D + f1 + f2",
    seed=42,
    method="irls",
    n_bootstraps=0,
)
logit.fit()
logit.fit_vcov()

poisson = DBPoissonRegression(
    db_name=None,
    connection=glm_con,
    table_name="glm_data",
    formula="y_count ~ D + f1 + f2",
    seed=42,
    method="irls",
    n_bootstraps=0,
)
poisson.fit()
poisson.fit_vcov()

pd.DataFrame(
    {
        "term": ["Intercept", "D", "f1", "f2"],
        "logit": logit.point_estimate,
        "poisson": poisson.point_estimate,
    }
)
term logit poisson
0 Intercept -0.9177 0.1877
1 D 1.1555 0.2786
2 f1 0.0356 0.0196
3 f2 -0.0880 -0.0318

Multinomial Labels

X_design = np.c_[np.ones(len(glm)), glm[["D", "f1", "f2"]].to_numpy()]
beta = np.array([[0.2, 0.5, 0.03, -0.05], [-0.3, -0.2, 0.01, 0.06]])
eta = np.c_[X_design @ beta.T, np.zeros(len(glm))]
prob = np.exp(eta - eta.max(axis=1, keepdims=True))
prob = prob / prob.sum(axis=1, keepdims=True)
labels = np.array(["a", "b", "c"])
glm["label"] = [rng.choice(labels, p=p) for p in prob]

glm_con.create_table("label_data", glm, overwrite=True)
multi = DBMultinomialLogisticRegression(
    db_name=None,
    connection=glm_con,
    table_name="label_data",
    formula="label ~ D + f1 + f2",
    labels=["a", "b", "c"],
    baseline="c",
    seed=42,
    n_bootstraps=0,
)
multi.fit()
multi.fit_vcov()

pd.DataFrame(
    multi.point_estimate,
    index=multi.labels[:-1],
    columns=["Intercept", "D", "f1", "f2"],
)
Intercept D f1 f2
a 0.2191 0.5537 0.0276 -0.0673
b -0.2284 -0.0836 0.0052 0.0313

Many-Label Counts

rows = []
count_labels = [f"label_{j}" for j in range(6)]
for obs_id in range(500):
    x1 = rng.integers(0, 4)
    x2 = rng.integers(0, 3)
    for j, label in enumerate(count_labels):
        mean = np.exp(-1.1 + 0.08 * j + 0.12 * x1 - 0.06 * x2)
        rows.append((obs_id, label, rng.poisson(mean), x1, x2))

counts = pd.DataFrame(rows, columns=["obs_id", "label", "count", "x1", "x2"])
count_con = duckdb_backend("counts.db", "token_counts", counts)

pm = DBPoissonMultinomialRegression(
    db_name=None,
    connection=count_con,
    table_name="token_counts",
    count_col="count",
    label_col="label",
    covars=["x1", "x2"],
    seed=42,
    n_bootstraps=0,
)
pm.fit()
pm.summary()["point_estimate"].head()
Intercept x1 x2
label_0 -1.0286 0.0702 -0.0045
label_1 -1.0493 0.1140 -0.0824
label_2 -0.9425 0.0935 -0.0225
label_3 -0.9250 0.2507 -0.1838
label_4 -0.6342 0.0618 -0.1070