diff --git a/pyproject.toml b/pyproject.toml index 1229c73..f92b47e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ ] requires-python = ">=3.13" dependencies = [ - "microplex[calibrate] @ git+https://github.com/PolicyEngine/microplex.git@1e0627182f9df40aacd7043c96956c2895bf9d30", + "microplex[calibrate] @ git+https://github.com/PolicyEngine/microplex.git@90f21d2b2048ed810cde9240f8d03d5bfc1565fc", "duckdb>=1.2", "h5py>=3.10", "requests>=2.31", @@ -35,7 +35,7 @@ hf = [ "huggingface_hub>=0.24", ] policyengine = [ - "microimpute==3.1.1; python_full_version >= '3.12' and python_full_version < '3.15'", + "microimpute @ git+https://github.com/PolicyEngine/microimpute.git@90be828eb442c48ee86bb91bb83a75da4b0f0f89 ; python_full_version >= '3.12' and python_full_version < '3.15'", "policyengine-us==1.715.2; python_version >= '3.11' and python_version < '3.15'", "spm-calculator>=0.3.1", # Standalone tax-unit construction engine (the extraction of eCPS's @@ -85,6 +85,7 @@ allow-direct-references = true [tool.hatch.build.targets.wheel.force-include] "src/microplex_us/pipelines/pe_native_scores.py" = "microplex_us/pipelines/pe_native_scores.py" "src/microplex_us/pipelines/ecps_export_contract.json" = "microplex_us/pipelines/ecps_export_contract.json" +"src/microplex_us/specs/us-2024.yaml" = "microplex_us/specs/us-2024.yaml" [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/microplex_us/specs/__init__.py b/src/microplex_us/specs/__init__.py new file mode 100644 index 0000000..42b391c --- /dev/null +++ b/src/microplex_us/specs/__init__.py @@ -0,0 +1 @@ +"""Packaged Microplex-US declarative specs.""" diff --git a/src/microplex_us/specs/us-2024.yaml b/src/microplex_us/specs/us-2024.yaml new file mode 100644 index 0000000..7402937 --- /dev/null +++ b/src/microplex_us/specs/us-2024.yaml @@ -0,0 +1,261 @@ +meta: + country: us + model_year: 2024 + policyengine_model: policyengine-us + +sources: + cps_asec: { dataset: cps_asec_2025_calendar_2024, role: spine } + puf: { dataset: puf_2024, role: donor } + acs: { dataset: acs_2024, role: donor } + sipp: { dataset: sipp_2023, role: donor } + scf: { dataset: scf_2022, role: donor } + +spine: + base: cps_asec + method: clone + clone: { seed: 20260529 } + halves: + - { name: cps_keep, keep: all } + - { name: synthetic_puf, strip_to: [demographics] } + +imputation: + - onto: synthetic_puf + from: puf + vars: + - employment_income + - partnership_s_corp_income + - social_security + - taxable_pension_income + - interest_deduction + - tax_exempt_pension_income + - long_term_capital_gains + - unreimbursed_business_employee_expenses + - pre_tax_contributions + - taxable_ira_distributions + - self_employment_income + - w2_wages_from_qualified_business + - unadjusted_basis_qualified_property + - business_is_sstb + - sstb_self_employment_income_before_lsr + - sstb_self_employment_income + - sstb_self_employment_income_would_be_qualified + - sstb_w2_wages_from_qualified_business + - sstb_unadjusted_basis_qualified_property + - short_term_capital_gains + - qualified_dividend_income + - charitable_cash_donations + - self_employed_pension_contribution_ald + - unrecaptured_section_1250_gain + - taxable_unemployment_compensation + - taxable_interest_income + - domestic_production_ald + - self_employed_health_insurance_ald + - rental_income + - non_qualified_dividend_income + - cdcc_relevant_expenses + - tax_exempt_interest_income + - salt_refund_income + - foreign_tax_credit + - estate_income + - charitable_non_cash_donations + - american_opportunity_credit + - miscellaneous_income + - alimony_expense + - farm_income + - partnership_se_income + - alimony_income + - health_savings_account_ald + - non_sch_d_capital_gains + - general_business_credit + - energy_efficient_home_improvement_credit + - traditional_ira_contributions + - amt_foreign_tax_credit + - excess_withheld_payroll_tax + - savers_credit + - student_loan_interest + - investment_income_elected_form_4952 + - early_withdrawal_penalty + - prior_year_minimum_tax_credit + - farm_rent_income + - qualified_tuition_expenses + - educator_expense + - long_term_capital_gains_on_collectibles + - other_credits + - casualty_loss + - unreported_payroll_tax + - recapture_of_investment_credit + - deductible_mortgage_interest + - home_mortgage_interest + - investment_interest_expense + - other_health_insurance_premiums + - qualified_reit_and_ptp_income + - qualified_bdc_income + - farm_operations_income + - estate_income_would_be_qualified + - farm_operations_income_would_be_qualified + - farm_rent_income_would_be_qualified + - partnership_s_corp_income_would_be_qualified + - rental_income_would_be_qualified + - self_employment_income_would_be_qualified + - weeks_unemployed + condition_on: [demographics] + order: spine_first + synthesize: true + + - onto: cps_keep + from: puf + vars: + - employment_income + - partnership_s_corp_income + - social_security + - taxable_pension_income + - interest_deduction + - tax_exempt_pension_income + - long_term_capital_gains + - unreimbursed_business_employee_expenses + - pre_tax_contributions + - taxable_ira_distributions + - self_employment_income + - w2_wages_from_qualified_business + - unadjusted_basis_qualified_property + - business_is_sstb + - sstb_self_employment_income_before_lsr + - sstb_self_employment_income + - sstb_self_employment_income_would_be_qualified + - sstb_w2_wages_from_qualified_business + - sstb_unadjusted_basis_qualified_property + - short_term_capital_gains + - qualified_dividend_income + - charitable_cash_donations + - self_employed_pension_contribution_ald + - unrecaptured_section_1250_gain + - taxable_unemployment_compensation + - taxable_interest_income + - domestic_production_ald + - self_employed_health_insurance_ald + - rental_income + - non_qualified_dividend_income + - cdcc_relevant_expenses + - tax_exempt_interest_income + - salt_refund_income + - foreign_tax_credit + - estate_income + - charitable_non_cash_donations + - american_opportunity_credit + - miscellaneous_income + - alimony_expense + - farm_income + - partnership_se_income + - alimony_income + - health_savings_account_ald + - non_sch_d_capital_gains + - general_business_credit + - energy_efficient_home_improvement_credit + - traditional_ira_contributions + - amt_foreign_tax_credit + - excess_withheld_payroll_tax + - savers_credit + - student_loan_interest + - investment_income_elected_form_4952 + - early_withdrawal_penalty + - prior_year_minimum_tax_credit + - farm_rent_income + - qualified_tuition_expenses + - educator_expense + - long_term_capital_gains_on_collectibles + - other_credits + - casualty_loss + - unreported_payroll_tax + - recapture_of_investment_credit + - deductible_mortgage_interest + - home_mortgage_interest + - investment_interest_expense + - other_health_insurance_premiums + - qualified_reit_and_ptp_income + - qualified_bdc_income + - farm_operations_income + - estate_income_would_be_qualified + - farm_operations_income_would_be_qualified + - farm_rent_income_would_be_qualified + - partnership_s_corp_income_would_be_qualified + - rental_income_would_be_qualified + - self_employment_income_would_be_qualified + - weeks_unemployed + condition_on: [demographics] + order: spine_first + + - onto: cps_keep + from: puf + vars: + - partnership_s_corp_income + - interest_deduction + - unreimbursed_business_employee_expenses + - pre_tax_contributions + - w2_wages_from_qualified_business + - unadjusted_basis_qualified_property + - business_is_sstb + - sstb_self_employment_income_before_lsr + - sstb_self_employment_income + - sstb_self_employment_income_would_be_qualified + - sstb_w2_wages_from_qualified_business + - sstb_unadjusted_basis_qualified_property + - charitable_cash_donations + - self_employed_pension_contribution_ald + - unrecaptured_section_1250_gain + - taxable_unemployment_compensation + - domestic_production_ald + - self_employed_health_insurance_ald + - cdcc_relevant_expenses + - salt_refund_income + - foreign_tax_credit + - estate_income + - charitable_non_cash_donations + - american_opportunity_credit + - miscellaneous_income + - alimony_expense + - health_savings_account_ald + - non_sch_d_capital_gains + - general_business_credit + - energy_efficient_home_improvement_credit + - amt_foreign_tax_credit + - excess_withheld_payroll_tax + - savers_credit + - student_loan_interest + - investment_income_elected_form_4952 + - early_withdrawal_penalty + - prior_year_minimum_tax_credit + - farm_rent_income + - qualified_tuition_expenses + - educator_expense + - long_term_capital_gains_on_collectibles + - other_credits + - casualty_loss + - unreported_payroll_tax + - recapture_of_investment_credit + - deductible_mortgage_interest + - home_mortgage_interest + - investment_interest_expense + - other_health_insurance_premiums + - qualified_reit_and_ptp_income + - qualified_bdc_income + - farm_operations_income + - estate_income_would_be_qualified + - farm_operations_income_would_be_qualified + - farm_rent_income_would_be_qualified + - partnership_s_corp_income_would_be_qualified + - rental_income_would_be_qualified + - self_employment_income_would_be_qualified + condition_on: [demographics] + order: spine_first + synthesize: true + +targets: + arch: + country: us + model_year: 2024 + target_profile: pe_native_broad + calibration_target_profile: pe_native_broad_source_backed + +calibrate: + loss: pe_native_bucketed_huber_v1 + method: apg diff --git a/tests/specs/test_us_2024_spec.py b/tests/specs/test_us_2024_spec.py new file mode 100644 index 0000000..3311a78 --- /dev/null +++ b/tests/specs/test_us_2024_spec.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from importlib.resources import files +from pathlib import Path + +from microplex.spec import DEMOGRAPHICS_TOKEN, ImputationOrder, SpineMethod, load_spec + +from microplex_us.pipelines.us import ( + PUF_SUPPORT_CLONE_IMPUTED_VARIABLES, + PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES, + PUF_SUPPORT_CLONE_SPECIAL_VARIABLES, +) +from microplex_us.variables import PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS + +SPEC_PATH = Path(str(files("microplex_us.specs").joinpath("us-2024.yaml"))) + + +def _spec(): + return load_spec(SPEC_PATH) + + +def test_us_2024_spec_loads_and_names_release_surface() -> None: + spec = _spec() + + assert spec.meta.country == "us" + assert spec.meta.model_year == 2024 + assert spec.meta.policyengine_model == "policyengine-us" + assert spec.sources["cps_asec"].dataset == "cps_asec_2025_calendar_2024" + assert spec.sources["puf"].dataset == "puf_2024" + assert set(spec.sources) == {"cps_asec", "puf", "acs", "sipp", "scf"} + + assert spec.targets is not None + assert spec.targets.arch.country == "us" + assert spec.targets.arch.model_year == 2024 + assert spec.targets.arch.target_profile == "pe_native_broad" + assert ( + spec.targets.arch.resolved_calibration_target_profile + == "pe_native_broad_source_backed" + ) + assert spec.calibrate is not None + assert spec.calibrate.loss == "pe_native_bucketed_huber_v1" + assert spec.calibrate.method.value == "apg" + + +def test_us_2024_spec_declares_ecps_clone_spine() -> None: + spec = _spec() + + assert spec.spine.base == "cps_asec" + assert spec.spine.method is SpineMethod.CLONE + assert spec.spine.clone.seed == 20260529 + assert spec.spine.passthrough_half.name == "cps_keep" + assert spec.spine.passthrough_half.keep == "all" + assert spec.spine.synthetic_half.name == "synthetic_puf" + assert spec.spine.synthetic_half.strip_to == [DEMOGRAPHICS_TOKEN] + + +def test_us_2024_spec_declares_demographic_only_puf_synthesis() -> None: + spec = _spec() + all_puf_vars = list( + PUF_SUPPORT_CLONE_IMPUTED_VARIABLES + PUF_SUPPORT_CLONE_SPECIAL_VARIABLES + ) + + synthetic, cps_fill, cps_override = spec.imputation + + assert synthetic.onto == "synthetic_puf" + assert synthetic.from_ == "puf" + assert synthetic.vars == all_puf_vars + assert synthetic.condition_on == [DEMOGRAPHICS_TOKEN] + assert synthetic.order is ImputationOrder.SPINE_FIRST + assert synthetic.synthesize is True + + assert cps_fill.onto == "cps_keep" + assert cps_fill.from_ == "puf" + assert cps_fill.vars == all_puf_vars + assert cps_fill.condition_on == [DEMOGRAPHICS_TOKEN] + assert cps_fill.synthesize is False + + assert cps_override.onto == "cps_keep" + assert cps_override.from_ == "puf" + assert cps_override.vars == list(PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES) + assert cps_override.condition_on == [DEMOGRAPHICS_TOKEN] + assert cps_override.synthesize is True + + assert set(PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES).issubset( + PUF_SUPPORT_CLONE_IMPUTED_VARIABLES + ) + assert "employment_income" in synthetic.vars + assert "employment_income" not in cps_override.vars + assert "employment_income" not in synthetic.condition_on + assert tuple(PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS) == ( + "age", + "is_male", + "tax_unit_is_joint", + "tax_unit_count_dependents", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + ) + + +def test_us_2024_spec_keeps_forbes_out_of_replication_baseline() -> None: + assert "forbes" not in SPEC_PATH.read_text(encoding="utf-8").lower() diff --git a/uv.lock b/uv.lock index c61afca..9df2fae 100644 --- a/uv.lock +++ b/uv.lock @@ -1125,7 +1125,7 @@ wheels = [ [[package]] name = "microimpute" version = "3.1.1" -source = { registry = "https://pypi.org/simple" } +source = { git = "https://github.com/PolicyEngine/microimpute.git?rev=90be828eb442c48ee86bb91bb83a75da4b0f0f89#90be828eb442c48ee86bb91bb83a75da4b0f0f89" } dependencies = [ { name = "joblib" }, { name = "numpy" }, @@ -1141,18 +1141,15 @@ dependencies = [ { name = "statsmodels" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a0/a0/15e25e78b7fa48d100f52d210290f2ba5820ebc47e4859748a7d89a3cae9/microimpute-3.1.1.tar.gz", hash = "sha256:70aa5bd28e7cef254695b8317c0f88e11e39ea204e0f6362cb33a94163438c3e", size = 146197, upload-time = "2026-06-06T09:54:07.848Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/d9/2b1ae246461f88388e2a43ff29f0f3477ac89cc52b11abaf775089b7553d/microimpute-3.1.1-py3-none-any.whl", hash = "sha256:c6a8fcb2ab129486fce48299cf89901b12632d69c0483251bae5f8d68a0d326b", size = 127432, upload-time = "2026-06-06T09:54:06.717Z" }, -] [[package]] name = "microplex" version = "0.2.0" -source = { git = "https://github.com/PolicyEngine/microplex.git?rev=1e0627182f9df40aacd7043c96956c2895bf9d30#1e0627182f9df40aacd7043c96956c2895bf9d30" } +source = { git = "https://github.com/PolicyEngine/microplex.git?rev=90f21d2b2048ed810cde9240f8d03d5bfc1565fc#90f21d2b2048ed810cde9240f8d03d5bfc1565fc" } dependencies = [ { name = "httpx" }, { name = "huggingface-hub" }, + { name = "microimpute", marker = "python_full_version < '3.15'" }, { name = "numpy" }, { name = "pandas" }, { name = "polars" }, @@ -1211,8 +1208,8 @@ requires-dist = [ { name = "h5py", specifier = ">=3.10" }, { name = "huggingface-hub", marker = "extra == 'hf'", specifier = ">=0.24" }, { name = "jupyter-book", marker = "extra == 'docs'", specifier = ">=0.15,<0.16" }, - { name = "microimpute", marker = "python_full_version >= '3.12' and python_full_version < '3.15' and extra == 'policyengine'", specifier = "==3.1.1" }, - { name = "microplex", extras = ["calibrate"], git = "https://github.com/PolicyEngine/microplex.git?rev=1e0627182f9df40aacd7043c96956c2895bf9d30" }, + { name = "microimpute", marker = "python_full_version >= '3.12' and python_full_version < '3.15' and extra == 'policyengine'", git = "https://github.com/PolicyEngine/microimpute.git?rev=90be828eb442c48ee86bb91bb83a75da4b0f0f89" }, + { name = "microplex", extras = ["calibrate"], git = "https://github.com/PolicyEngine/microplex.git?rev=90f21d2b2048ed810cde9240f8d03d5bfc1565fc" }, { name = "microunit", marker = "extra == 'policyengine'", specifier = ">=0.1.0" }, { name = "policyengine-us", marker = "python_full_version >= '3.11' and python_full_version < '3.15' and extra == 'policyengine'", specifier = "==1.715.2" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },