Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 63 additions & 18 deletions src/microplex/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,68 @@
Core data models for microplex.

This module provides the foundational data structures for microdata representation:
- Entity types (Person, TaxUnit, Household, Family, SPMUnit, Record)
- Entity types (Person, TaxUnit, Household, Family, BenefitUnit, SPMUnit, Record)
- Variable definitions with legal references
- Period arithmetic
- Multi-resolution dataset generation
"""

from microplex.core.entities import (
BenefitUnit,
Entity,
EntityType,
Family,
FilingStatus,
RecordType,
Entity,
Person,
TaxUnit,
Household,
Family,
SPMUnit,
Person,
Record,
)
from microplex.core.variables import (
DataType,
VariableRole,
LegalReference,
Variable,
VariableRegistry,
RecordType,
SPMUnit,
TaxUnit,
)
from microplex.core.periods import (
PeriodType,
Period,
PeriodType,
)
from microplex.core.resolution import (
ResolutionLevel,
ResolutionConfig,
HardConcreteGate,
ResolutionConfig,
ResolutionLevel,
compress_dataset,
for_browser,
for_api,
for_browser,
for_research,
)
from microplex.core.source_manifests import (
SourceColumnManifest,
SourceColumnValueType,
SourceManifest,
SourceObservationManifest,
load_source_manifest,
)
from microplex.core.sources import (
EntityObservation,
EntityRelationship,
ObservationFrame,
RelationshipCardinality,
Shareability,
SourceAdapter,
SourceArchetype,
SourceDescriptor,
SourceProvider,
SourceQuery,
SourceVariableCapability,
StaticSourceProvider,
TimeStructure,
apply_source_query,
)
from microplex.core.variables import (
DataType,
LegalReference,
Variable,
VariableRegistry,
VariableRole,
)

__all__ = [
# Entities
Expand All @@ -51,8 +75,24 @@
"TaxUnit",
"Household",
"Family",
"BenefitUnit",
"SPMUnit",
"Record",
# Sources
"TimeStructure",
"Shareability",
"SourceArchetype",
"RelationshipCardinality",
"EntityObservation",
"SourceVariableCapability",
"SourceDescriptor",
"EntityRelationship",
"ObservationFrame",
"SourceQuery",
"SourceProvider",
"StaticSourceProvider",
"apply_source_query",
"SourceAdapter",
# Variables
"DataType",
"VariableRole",
Expand All @@ -66,6 +106,11 @@
"ResolutionLevel",
"ResolutionConfig",
"HardConcreteGate",
"SourceColumnValueType",
"SourceColumnManifest",
"SourceObservationManifest",
"SourceManifest",
"load_source_manifest",
"compress_dataset",
"for_browser",
"for_api",
Expand Down
38 changes: 29 additions & 9 deletions src/microplex/core/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
Entities represent the hierarchical structure of tax-benefit microdata:
- Person: Individual-level attributes
- TaxUnit: Tax filing unit (IRS perspective)
- Household: Census household (housing costs, geography)
- Family: SPM family unit (poverty calculation)
- Household: Residential unit (housing costs, geography)
- Family: Family grouping used by some systems
- BenefitUnit: Benefit assessment unit (UK-style family benefit unit)
- SPMUnit: Supplemental Poverty Measure unit
- Record: Sub-person records (W-2s, K-1s, 1099s, etc.)
"""
Expand All @@ -18,28 +19,32 @@
class EntityType(Enum):
    """Types of entities in the microdata hierarchy.

    Each member's value is the lowercase string spelling of the entity
    (for example ``EntityType("tax_unit") is EntityType.TAX_UNIT``).
    """

    RECORD = "record"
    PERSON = "person"
    TAX_UNIT = "tax_unit"
    HOUSEHOLD = "household"
    FAMILY = "family"
    BENEFIT_UNIT = "benefit_unit"
    SPM_UNIT = "spm_unit"

    @property
    def level(self) -> int:
        """Hierarchy level (0 = lowest/most granular).

        Records sit at level 0, persons at level 1, and every grouping
        entity shares level 2.
        """
        # One entry per member; the superseded 0/1 numbering (persons at 0,
        # groups at 1) is intentionally gone so no key is listed twice.
        levels = {
            EntityType.RECORD: 0,
            EntityType.PERSON: 1,
            EntityType.TAX_UNIT: 2,
            EntityType.HOUSEHOLD: 2,
            EntityType.FAMILY: 2,
            EntityType.BENEFIT_UNIT: 2,
            EntityType.SPM_UNIT: 2,
        }
        return levels[self]

    @property
    def is_group(self) -> bool:
        """Whether this entity groups persons.

        Both ``RECORD`` and ``PERSON`` are non-group entities; everything
        else is a group.
        """
        return self not in (EntityType.RECORD, EntityType.PERSON)


class FilingStatus(Enum):
Expand Down Expand Up @@ -207,6 +212,21 @@ def entity_type(self) -> EntityType:
return EntityType.FAMILY


class BenefitUnit(Entity):
    """Unit at which benefits are assessed.

    Some tax-benefit systems determine eligibility and resources for a
    family benefit unit rather than for a household or a tax unit; this
    entity models that grain.
    """

    # Starts out as a fresh empty list for every instance.
    # presumably the IDs of the persons in the unit -- confirm against callers
    member_ids: list[str] = Field(default_factory=list)
    # Optional; None when no head is recorded.
    head_id: str | None = None

    @property
    def entity_type(self) -> EntityType:
        """Return the entity type tag for benefit units."""
        return EntityType.BENEFIT_UNIT


class SPMUnit(Entity):
"""Supplemental Poverty Measure unit."""

Expand Down Expand Up @@ -275,4 +295,4 @@ class Record(Entity):

@property
def entity_type(self) -> EntityType:
    """Return ``EntityType.RECORD``, the dedicated type for sub-person records."""
    # The earlier person-level return is dropped: it ran first and made the
    # RECORD return unreachable.
    return EntityType.RECORD
117 changes: 117 additions & 0 deletions src/microplex/core/source_manifests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Typed source-manifest loader for externalized provider specs."""

from __future__ import annotations

import json
from collections.abc import Iterable, Mapping
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path

from microplex.core.entities import EntityType
from microplex.core.sources import SourceArchetype


class SourceColumnValueType(Enum):
    """Coercion applied to a raw source column during canonical mapping.

    Members are looked up by their lowercase string value, e.g.
    ``SourceColumnValueType("numeric")``.
    """

    NUMERIC = "numeric"
    CATEGORICAL = "categorical"


@dataclass(frozen=True)
class SourceColumnManifest:
    """A single mapping from a raw source column to its canonical name.

    Instances are immutable (frozen dataclass).
    """

    # Column name as it appears in the raw source.
    raw_column: str
    # Name the column takes after canonical mapping.
    canonical_name: str
    # Coercion to apply; numeric unless stated otherwise.
    value_type: SourceColumnValueType = SourceColumnValueType.NUMERIC


@dataclass(frozen=True)
class SourceObservationManifest:
    """Describes one observed entity table of a source.

    Instances are immutable (frozen dataclass).
    """

    # Entity the table observes.
    entity: EntityType
    # Column holding the entity key; never reported as an observed variable.
    key_column: str
    table_name: str | None = None
    # Optional weight / period columns; reserved when set.
    weight_column: str | None = None
    period_column: str | None = None
    # Further column names to leave out of the observed variables.
    excluded_columns: tuple[str, ...] = ()
    aliases: Mapping[str, str] = field(default_factory=dict)
    # Explicit column mappings; empty means "take the columns from the frame".
    columns: tuple[SourceColumnManifest, ...] = ()

    def observed_variable_names(
        self,
        frame_columns: Iterable[str] | None = None,
    ) -> tuple[str, ...]:
        """List the observed variable names for this entity table.

        Args:
            frame_columns: Column names of the loaded frame. Only consulted
                when the manifest declares no explicit ``columns``.

        Returns:
            With explicit ``columns``: their canonical names, in declaration
            order. Otherwise: ``frame_columns`` in iteration order. In both
            cases the key, weight, period and excluded column names are
            filtered out.

        Raises:
            ValueError: If the manifest has no explicit ``columns`` and
                ``frame_columns`` is ``None``.
        """
        # Names that must never surface as observed variables.
        skipped = {self.key_column, *self.excluded_columns}
        for optional_name in (self.weight_column, self.period_column):
            if optional_name is not None:
                skipped.add(optional_name)

        if not self.columns:
            # Implicit manifest: fall back to whatever the frame carries.
            if frame_columns is None:
                raise ValueError(
                    "frame_columns must be provided when manifest columns are implicit"
                )
            return tuple(name for name in frame_columns if name not in skipped)

        canonical_names = [spec.canonical_name for spec in self.columns]
        return tuple(name for name in canonical_names if name not in skipped)


@dataclass(frozen=True)
class SourceManifest:
    """Typed manifest describing one family of source providers.

    Instances are immutable (frozen dataclass).
    """

    name: str
    archetype: SourceArchetype
    population: str | None = None
    description: str | None = None
    # One entry per observed entity table; empty by default.
    observations: tuple[SourceObservationManifest, ...] = ()

    def observation_for(self, entity: EntityType) -> SourceObservationManifest:
        """Look up the observation manifest declared for ``entity``.

        Entities are matched by identity and the first match in
        ``observations`` wins.

        Raises:
            KeyError: If no observation is declared for ``entity``.
        """
        found = next(
            (entry for entry in self.observations if entry.entity is entity),
            None,
        )
        if found is None:
            raise KeyError(f"Manifest '{self.name}' has no entity '{entity.value}'")
        return found


def _column_from_payload(column_payload: Mapping[str, str]) -> SourceColumnManifest:
    """Build one column mapping from its decoded JSON object."""
    return SourceColumnManifest(
        raw_column=column_payload["raw_column"],
        canonical_name=column_payload["canonical_name"],
        # Columns that omit "value_type" are treated as numeric.
        value_type=SourceColumnValueType(column_payload.get("value_type", "numeric")),
    )


def _observation_from_payload(
    observation_payload: Mapping[str, object],
) -> SourceObservationManifest:
    """Build one observation manifest from its decoded JSON object."""
    return SourceObservationManifest(
        entity=EntityType(observation_payload["entity"]),
        key_column=observation_payload["key_column"],
        table_name=observation_payload.get("table_name"),
        weight_column=observation_payload.get("weight_column"),
        period_column=observation_payload.get("period_column"),
        excluded_columns=tuple(observation_payload.get("excluded_columns", ())),
        # Copy so the manifest does not share a dict with the decoded payload.
        aliases=dict(observation_payload.get("aliases", {})),
        columns=tuple(
            _column_from_payload(column_payload)
            for column_payload in observation_payload.get("columns", ())
        ),
    )


def load_source_manifest(path: str | Path) -> SourceManifest:
    """Load a typed source manifest from a JSON file.

    The file is always decoded as UTF-8 rather than with the platform's
    locale encoding, so the same manifest loads identically everywhere.

    Args:
        path: Location of the JSON manifest.

    Returns:
        The manifest described by the file.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing: ``name``, ``archetype`` or
            ``observations`` at the top level, ``entity`` or ``key_column``
            in an observation, ``raw_column`` or ``canonical_name`` in a
            column.
        ValueError: If ``entity`` or ``value_type`` is not a known enum
            value (presumably the same holds for ``archetype`` -- confirm
            against ``SourceArchetype``).
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    observations = tuple(
        _observation_from_payload(observation_payload)
        for observation_payload in payload["observations"]
    )
    return SourceManifest(
        name=payload["name"],
        archetype=SourceArchetype(payload["archetype"]),
        population=payload.get("population"),
        description=payload.get("description"),
        observations=observations,
    )
Loading
Loading