Crosstab¶

All of crosstab's functionality can be accessed by the Crosstab object, which includes all the methods and attributes mentioned in the sections below.

crosstab.Crosstab ¶

Crosstab(incsv: Path, row_headers: tuple, col_headers: tuple, value_cols: tuple, outxlsx: Path | None = None, fill: str | None = None, keep_src: bool = False, keep_duckdb: bool = False)

Rearrange a normalized CSV into a crosstabulated XLSX workbook.

The crosstab is computed by DuckDB in a single pass: the CSV is read via read_csv (with all_varchar=True to preserve string semantics), duplicate row/column key combinations are detected with a GROUP BY ... HAVING COUNT(*) > 1 query, and the pivoted long-format result is materialized with one query before being written to the output workbook with xlsxwriter.

The output workbook contains:

Crosstab — the pivoted table. Row-header values appear on the left; column-header values fan out across the top, grouped per value-column. autofilter is applied to the row-header row and the column-header rows are frozen.
Source Data (optional) — a verbatim copy of the input CSV, included when keep_src=True.

Parameters¶

incsv: Path to the input CSV file. row_headers: Tuple of column names whose values become row keys. col_headers: Tuple of column names whose value combinations become column keys. value_cols: Tuple of column names whose values fill the crosstab cells. outxlsx: Output path. Defaults to <incsv stem>_crosstab.xlsx next to the input. fill: Value to write into empty cells. None (default) leaves them blank. keep_src: Append a "Source Data" sheet containing the verbatim input CSV. keep_duckdb: Persist the DuckDB database to disk at <incsv stem>.duckdb so the data can be queried again later.

Source code in src/crosstab/crosstab.py

def __init__(
    self: Crosstab,
    incsv: Path,
    row_headers: tuple,
    col_headers: tuple,
    value_cols: tuple,
    outxlsx: Path | None = None,
    fill: str | None = None,
    keep_src: bool = False,
    keep_duckdb: bool = False,
) -> None:
    self.incsv = Path(incsv)
    if outxlsx is None:
        outxlsx = self.incsv.with_name(self.incsv.stem + "_crosstab.xlsx")
    self.outxlsx = Path(outxlsx)
    self.row_headers = tuple(row_headers)
    self.col_headers = tuple(col_headers)
    self.value_cols = tuple(value_cols)
    self.fill = fill
    self.keep_src = keep_src
    self.keep_duckdb = keep_duckdb
    logger.debug(self)
    self._validate_args()

    if self.keep_duckdb:
        duckdb_path = self.incsv.with_suffix(".duckdb")
        if duckdb_path.exists():
            duckdb_path.unlink()
        self._con = duckdb.connect(str(duckdb_path))
    else:
        self._con = duckdb.connect()

    self.csv_columns = self._read_columns()
    self._validate_csv_headers()

crosstab ¶

crosstab() -> None

Compute the crosstab and write it to self.outxlsx.

Source code in src/crosstab/crosstab.py

def crosstab(self: Crosstab) -> None:
    """Compute the crosstab and write it to ``self.outxlsx``."""
    logger.info(f"Creating crosstab table from {self.incsv}.")

    # Stage the CSV as a DuckDB table named ``input_data``. We materialize
    # via CREATE TABLE (rather than register-a-view) so the data persists
    # in the on-disk database when ``keep_duckdb=True``.
    self._con.execute("DROP TABLE IF EXISTS input_data")
    self._con.execute(
        "CREATE TABLE input_data AS SELECT * FROM read_csv(?, all_varchar=true)",
        [str(self.incsv)],
    )

    rh_q = [_quote_ident(h) for h in self.row_headers]
    ch_q = [_quote_ident(h) for h in self.col_headers]
    vc_q = [_quote_ident(v) for v in self.value_cols]

    rh_list = ", ".join(rh_q)
    ch_list = ", ".join(ch_q)
    all_keys = ", ".join(rh_q + ch_q)

    # Strict mode: any (row_key, col_key) appearing more than once is a
    # fatal error — the crosstab cannot represent it without losing
    # information. The caller is expected to pre-aggregate the input
    # (with DuckDB, pandas, polars, etc.) if the source data contains
    # duplicates that should be combined.
    logger.debug("Checking for duplicate row/column key combinations.")
    dup_sql = f"SELECT {all_keys} FROM input_data GROUP BY {all_keys} HAVING COUNT(*) > 1 LIMIT 1"
    if self._con.execute(dup_sql).fetchone():
        raise ValueError("Multiple values found for the row/column combination(s).")

    # Distinct row keys and col keys, sorted for deterministic output.
    logger.debug("Fetching distinct row-header values.")
    row_keys = self._con.execute(
        f"SELECT DISTINCT {rh_list} FROM input_data ORDER BY {rh_list}",
    ).fetchall()
    logger.debug("Fetching distinct column-header values.")
    col_keys = self._con.execute(
        f"SELECT DISTINCT {ch_list} FROM input_data ORDER BY {ch_list}",
    ).fetchall()

    # Long-format result: one row per (row_key, col_key) with value
    # columns. The duplicate check above has already verified each group
    # has exactly one input row, so ``any_value()`` simply reads back
    # that single value — nothing is collapsed.
    logger.debug("Fetching value rows.")
    agg_select = ", ".join(
        rh_q + ch_q + [f"any_value({v}) AS {v}" for v in vc_q],
    )
    agg_rows = self._con.execute(
        f"SELECT {agg_select} FROM input_data GROUP BY {all_keys}",
    ).fetchall()

    n_rh = len(self.row_headers)
    n_ch = len(self.col_headers)
    cells: dict[tuple[tuple, tuple], tuple] = {}
    for r in agg_rows:
        row_key = r[:n_rh]
        col_key = r[n_rh : n_rh + n_ch]
        cells[(row_key, col_key)] = r[n_rh + n_ch :]

    self._write_xlsx(row_keys, col_keys, cells)

close ¶

close() -> None

Release the DuckDB connection.

Source code in src/crosstab/crosstab.py

def close(self: Crosstab) -> None:
    """Release the DuckDB connection."""
    self._con.close()