Crosstab(incsv: Path, row_headers: tuple, col_headers: tuple, value_cols: tuple, outxlsx: Path | None = None, fill: str | None = None, keep_src: bool = False, keep_duckdb: bool = False)
Rearrange a normalized CSV into a crosstabulated XLSX workbook.
The crosstab is computed by DuckDB in a single pass: the CSV is read
via read_csv (with all_varchar=True to preserve string semantics),
duplicate row/column key combinations are detected with a
GROUP BY ... HAVING COUNT(*) > 1 query, and the pivoted long-format
result is materialized with one query before being written to the output
workbook with xlsxwriter.
The output workbook contains:
- Crosstab — the pivoted table. Row-header values appear on the
left; column-header values fan out across the top, grouped per
value-column.
autofilter is applied to the row-header row and
the column-header rows are frozen.
- Source Data (optional) — a verbatim copy of the input CSV,
included when
keep_src=True.
Parameters
incsv:
Path to the input CSV file.
row_headers:
Tuple of column names whose values become row keys.
col_headers:
Tuple of column names whose value combinations become column keys.
value_cols:
Tuple of column names whose values fill the crosstab cells.
outxlsx:
Output path. Defaults to <incsv stem>_crosstab.xlsx next to the
input.
fill:
Value to write into empty cells. None (default) leaves them
blank.
keep_src:
Append a "Source Data" sheet containing the verbatim input CSV.
keep_duckdb:
Persist the DuckDB database to disk at <incsv stem>.duckdb so
the data can be queried again later.
Source code in src/crosstab/crosstab.py
| def __init__(
self: Crosstab,
incsv: Path,
row_headers: tuple,
col_headers: tuple,
value_cols: tuple,
outxlsx: Path | None = None,
fill: str | None = None,
keep_src: bool = False,
keep_duckdb: bool = False,
) -> None:
self.incsv = Path(incsv)
if outxlsx is None:
outxlsx = self.incsv.with_name(self.incsv.stem + "_crosstab.xlsx")
self.outxlsx = Path(outxlsx)
self.row_headers = tuple(row_headers)
self.col_headers = tuple(col_headers)
self.value_cols = tuple(value_cols)
self.fill = fill
self.keep_src = keep_src
self.keep_duckdb = keep_duckdb
logger.debug(self)
self._validate_args()
if self.keep_duckdb:
duckdb_path = self.incsv.with_suffix(".duckdb")
if duckdb_path.exists():
duckdb_path.unlink()
self._con = duckdb.connect(str(duckdb_path))
else:
self._con = duckdb.connect()
self.csv_columns = self._read_columns()
self._validate_csv_headers()
|
crosstab
Compute the crosstab and write it to self.outxlsx.
Source code in src/crosstab/crosstab.py
| def crosstab(self: Crosstab) -> None:
"""Compute the crosstab and write it to ``self.outxlsx``."""
logger.info(f"Creating crosstab table from {self.incsv}.")
# Stage the CSV as a DuckDB table named ``input_data``. We materialize
# via CREATE TABLE (rather than register-a-view) so the data persists
# in the on-disk database when ``keep_duckdb=True``.
self._con.execute("DROP TABLE IF EXISTS input_data")
self._con.execute(
"CREATE TABLE input_data AS SELECT * FROM read_csv(?, all_varchar=true)",
[str(self.incsv)],
)
rh_q = [_quote_ident(h) for h in self.row_headers]
ch_q = [_quote_ident(h) for h in self.col_headers]
vc_q = [_quote_ident(v) for v in self.value_cols]
rh_list = ", ".join(rh_q)
ch_list = ", ".join(ch_q)
all_keys = ", ".join(rh_q + ch_q)
# Strict mode: any (row_key, col_key) appearing more than once is a
# fatal error — the crosstab cannot represent it without losing
# information. The caller is expected to pre-aggregate the input
# (with DuckDB, pandas, polars, etc.) if the source data contains
# duplicates that should be combined.
logger.debug("Checking for duplicate row/column key combinations.")
dup_sql = f"SELECT {all_keys} FROM input_data GROUP BY {all_keys} HAVING COUNT(*) > 1 LIMIT 1"
if self._con.execute(dup_sql).fetchone():
raise ValueError("Multiple values found for the row/column combination(s).")
# Distinct row keys and col keys, sorted for deterministic output.
logger.debug("Fetching distinct row-header values.")
row_keys = self._con.execute(
f"SELECT DISTINCT {rh_list} FROM input_data ORDER BY {rh_list}",
).fetchall()
logger.debug("Fetching distinct column-header values.")
col_keys = self._con.execute(
f"SELECT DISTINCT {ch_list} FROM input_data ORDER BY {ch_list}",
).fetchall()
# Long-format result: one row per (row_key, col_key) with value
# columns. The duplicate check above has already verified each group
# has exactly one input row, so ``any_value()`` simply reads back
# that single value — nothing is collapsed.
logger.debug("Fetching value rows.")
agg_select = ", ".join(
rh_q + ch_q + [f"any_value({v}) AS {v}" for v in vc_q],
)
agg_rows = self._con.execute(
f"SELECT {agg_select} FROM input_data GROUP BY {all_keys}",
).fetchall()
n_rh = len(self.row_headers)
n_ch = len(self.col_headers)
cells: dict[tuple[tuple, tuple], tuple] = {}
for r in agg_rows:
row_key = r[:n_rh]
col_key = r[n_rh : n_rh + n_ch]
cells[(row_key, col_key)] = r[n_rh + n_ch :]
self._write_xlsx(row_keys, col_keys, cells)
|
close
Release the DuckDB connection.
Source code in src/crosstab/crosstab.py
| def close(self: Crosstab) -> None:
"""Release the DuckDB connection."""
self._con.close()
|