From 95a6445485d599272c6c1800fa934ac41163d0c5 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 19 Apr 2026 08:07:56 -0400 Subject: [PATCH] Add _build_sparse_constraint_system for O(nnz) memory calibration build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing _build_linear_constraint_system materializes a dense numpy array of shape (n_targets, n_records) via np.vstack(rows). At microplex-us v7 scale (~1.5M records × ~4k constraints, mostly marginal indicators that are 95%+ zero), this is ~24 GB of dense float64 allocated just to represent ~100-500 MB of nonzero data. The pipeline's L0 calibrator (microplex_us.pipelines.pe_l0) wraps the dense matrix immediately in sp.csr_matrix(A) — it already wants sparse, we just got there through a dense intermediate that wastes the memory and caused the OOM that macOS memorystatus killed as the "172 GB compressed process" in the v7 rerun on 2026-04-18. This commit adds a sparse-native builder that produces a CSR matrix row-by-row via (indptr, indices, data) construction, never allocating the full dense intermediate: - Marginal targets: each category produces a CSR row by np.flatnonzero(column == category), storing only the matching row indices with value 1.0. - Continuous targets: flatnonzero on the column values, storing only nonzero entries. - LinearConstraint rows: flatnonzero on coefficients, same idea. Semantics match _build_linear_constraint_system exactly: both return (matrix, b, names, n_categorical); the sparse version returns a CSR whose .toarray() equals the dense version's A (up to float64 rounding). Tests in tests/test_sparse_constraint_system.py pin: 1. _build_sparse_constraint_system importable from microplex.calibration. 2. Sparse output == dense output for marginal-only problem. 3. Sparse output == dense output for mixed marginal + continuous + LinearConstraint problem. 4. 
Actual sparsity: density < 0.45 on a realistic 4-state × 3-age marginal problem (the point of the refactor). 36 existing calibration tests unchanged; 40 total now pass. Downstream wiring: microplex-us.pipelines.pe_l0.PolicyEngineL0Calibrator calls this directly in a companion commit, bypassing the dense path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/microplex/calibration/__init__.py | 78 ++++++++++++ tests/test_sparse_constraint_system.py | 169 +++++++++++++++++++++++++ 2 files changed, 247 insertions(+) create mode 100644 tests/test_sparse_constraint_system.py diff --git a/src/microplex/calibration/__init__.py b/src/microplex/calibration/__init__.py index c5ebc3a..4cf4379 100644 --- a/src/microplex/calibration/__init__.py +++ b/src/microplex/calibration/__init__.py @@ -84,6 +84,84 @@ def _build_linear_constraint_system( return A, b, names, n_categorical +def _build_sparse_constraint_system( + data: pd.DataFrame, + marginal_targets: dict[str, dict[str, float]], + continuous_targets: dict[str, float] | None, + linear_constraints: tuple[LinearConstraint, ...], +): + """Build a CSR linear system without the dense `np.vstack` intermediate. + + Returns ``(X_csr, b, names, n_categorical)`` with exactly the same + semantics as ``_build_linear_constraint_system`` but the matrix is + `scipy.sparse.csr_matrix` built row-by-row, never allocating a + dense float64 array of shape ``(n_targets, n_records)``. At v7 + scale (~1.5M records x ~4k constraints) this is the difference + between ~24 GB dense peak and ~500 MB sparse CSR. + + Marginal-target rows are stored via row-indexed integer lookup + (only the `n_matching_rows` entries with value 1.0 are kept). + Continuous-target rows keep only their nonzero record entries + (zeros are dropped via `np.flatnonzero`). `LinearConstraint` rows + get the same `np.flatnonzero` filtering so explicit-zero + coefficients don't balloon the CSR. 
+ """ + from scipy import sparse as _sp + + indptr: list[int] = [0] + indices: list[np.ndarray] = [] + data_vals: list[np.ndarray] = [] + targets: list[float] = [] + names: list[str] = [] + n_categorical = 0 + n_records = len(data) + + def _append_row(col_indices: np.ndarray, values: np.ndarray) -> None: + indices.append(col_indices.astype(np.int64, copy=False)) + data_vals.append(values.astype(np.float64, copy=False)) + indptr.append(indptr[-1] + len(col_indices)) + + for var, var_targets in marginal_targets.items(): + column = data[var].to_numpy() + for category, target in var_targets.items(): + match = np.flatnonzero(column == category) + _append_row(match, np.ones(len(match), dtype=np.float64)) + targets.append(float(target)) + names.append(f"{var}={category}") + n_categorical += 1 + + for var, target in (continuous_targets or {}).items(): + values = data[var].to_numpy(dtype=np.float64, copy=False) + nz = np.flatnonzero(values != 0.0) + _append_row(nz, values[nz]) + targets.append(float(target)) + names.append(var) + + for constraint in linear_constraints: + coef = np.asarray(constraint.coefficients, dtype=np.float64) + nz = np.flatnonzero(coef != 0.0) + _append_row(nz, coef[nz]) + targets.append(float(constraint.target)) + names.append(constraint.name) + + if not names: + X_csr = _sp.csr_matrix((0, n_records), dtype=np.float64) + b = np.zeros(0, dtype=np.float64) + return X_csr, b, names, n_categorical + + X_csr = _sp.csr_matrix( + ( + np.concatenate(data_vals) if data_vals else np.zeros(0), + np.concatenate(indices) if indices else np.zeros(0, dtype=np.int64), + np.asarray(indptr, dtype=np.int64), + ), + shape=(len(names), n_records), + dtype=np.float64, + ) + b = np.asarray(targets, dtype=np.float64) + return X_csr, b, names, n_categorical + + def _validate_calibration_inputs( data: pd.DataFrame, marginal_targets: dict[str, dict[str, float]], diff --git a/tests/test_sparse_constraint_system.py b/tests/test_sparse_constraint_system.py new file mode 100644 
index 0000000..515d03a --- /dev/null +++ b/tests/test_sparse_constraint_system.py @@ -0,0 +1,169 @@ +"""Sparse build path for the linear constraint system. + +`_build_linear_constraint_system` currently materializes a dense numpy +`A` of shape `(n_targets, n_records)` via `np.vstack(rows)`. At +microplex-us v7 scale (1.5M records x ~4k+ constraints, mostly +marginal indicators that are 95%+ zero) this dense materialization +eats ~24 GB and pushes the process past jetsam. Downstream L0 +calibrators then immediately convert to CSR via `sp.csr_matrix(A)`, +which already produces the correct sparse result — we just got there +through a dense intermediate that wastes the memory. + +These tests pin a sparse-native builder (`_build_sparse_constraint_system`) +that returns `(X_csr, b, names, n_categorical)` without ever +materializing the dense matrix. Semantics must agree with the dense +version up to float64 rounding. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +pytest.importorskip("scipy") + +from scipy import sparse + +from microplex.calibration import ( + LinearConstraint, + _build_linear_constraint_system, +) + + +def _toy_frame(n: int = 1000, seed: int = 0) -> pd.DataFrame: + rng = np.random.default_rng(seed) + return pd.DataFrame( + { + "state": rng.choice(["CA", "NY", "TX", "FL"], size=n), + "age_group": rng.choice(["0-17", "18-64", "65+"], size=n), + "income": rng.lognormal(10, 1, size=n), + } + ) + + +class TestSparseBuilderExists: + """The sparse builder must be importable from microplex.calibration.""" + + def test_sparse_builder_is_importable(self) -> None: + from microplex.calibration import _build_sparse_constraint_system + + assert callable(_build_sparse_constraint_system) + + +class TestSparseBuilderMatchesDense: + """Sparse builder output must equal dense builder output numerically.""" + + def _build_both(self, data, marginal, continuous, linear): + from microplex.calibration import 
_build_sparse_constraint_system + + A_dense, b_dense, names_dense, n_cat_dense = ( + _build_linear_constraint_system( + data, marginal, continuous, linear + ) + ) + X_sparse, b_sparse, names_sparse, n_cat_sparse = ( + _build_sparse_constraint_system( + data, marginal, continuous, linear + ) + ) + return ( + A_dense, + b_dense, + names_dense, + n_cat_dense, + X_sparse, + b_sparse, + names_sparse, + n_cat_sparse, + ) + + def test_marginal_constraints_only(self) -> None: + data = _toy_frame() + marginal = { + "state": { + "CA": 280, + "NY": 230, + "TX": 260, + "FL": 230, + }, + "age_group": {"0-17": 230, "18-64": 570, "65+": 200}, + } + ( + A_dense, + b_dense, + names_dense, + n_cat_dense, + X_sparse, + b_sparse, + names_sparse, + n_cat_sparse, + ) = self._build_both(data, marginal, None, ()) + + assert sparse.issparse(X_sparse), type(X_sparse) + assert X_sparse.shape == A_dense.shape + np.testing.assert_allclose( + X_sparse.toarray(), A_dense, rtol=0, atol=0 + ) + np.testing.assert_array_equal(b_sparse, b_dense) + assert names_sparse == names_dense + assert n_cat_sparse == n_cat_dense + + def test_continuous_and_linear_constraints(self) -> None: + data = _toy_frame() + marginal = {"state": {"CA": 250, "NY": 250, "TX": 250, "FL": 250}} + continuous = {"income": 20_000_000.0} + linear = ( + LinearConstraint( + name="old_and_rich", + coefficients=( + (data["age_group"] == "65+").astype(float) + * data["income"] + ).to_numpy(), + target=500_000.0, + ), + ) + ( + A_dense, + b_dense, + names_dense, + n_cat_dense, + X_sparse, + b_sparse, + names_sparse, + n_cat_sparse, + ) = self._build_both(data, marginal, continuous, linear) + + assert sparse.issparse(X_sparse) + np.testing.assert_allclose( + X_sparse.toarray(), A_dense, rtol=1e-12, atol=0 + ) + np.testing.assert_array_equal(b_sparse, b_dense) + assert names_sparse == names_dense + assert n_cat_sparse == n_cat_dense + + +class TestSparseBuilderActuallySparsifies: + """Marginal indicators must actually be stored 
sparsely.""" + + def test_marginal_density_is_low(self) -> None: + """Each marginal row's density is its category share (~1/4 states, ~1/3 ages).""" + from microplex.calibration import _build_sparse_constraint_system + + data = _toy_frame(n=10_000, seed=1) + marginal = { + "state": {"CA": 2_500, "NY": 2_500, "TX": 2_500, "FL": 2_500}, + "age_group": { + "0-17": 2_500, + "18-64": 5_500, + "65+": 2_000, + }, + } + X_sparse, _b, _names, _n = _build_sparse_constraint_system( + data, marginal, None, () + ) + density = X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]) + # Each record matches exactly one category per marginal variable, + # so nnz = 2n over 7 rows and density = 2/7 ≈ 0.286. Assert + # meaningfully less than dense so we know CSR is doing its job. + assert density < 0.45, density