diff --git a/changelog.d/deterministic-property-purchased.fixed.md b/changelog.d/deterministic-property-purchased.fixed.md new file mode 100644 index 00000000..6e8918e8 --- /dev/null +++ b/changelog.d/deterministic-property-purchased.fixed.md @@ -0,0 +1 @@ +Make the FRS dataset build deterministic. Three assignments — property_purchased (which households are charged stamp duty), capital gains imputation quantiles (CGT revenue), and BRMA assignment (LHA/housing-benefit geography) — drew from the unseeded global NumPy RNG, so otherwise-identical builds produced different datasets. Each now draws from its own seeded generator, so the same inputs always produce the same dataset. diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index fbfea69f..58e2ac6b 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -1251,9 +1251,12 @@ def determine_education_level(fted_val, typeed2_val, age_val): lha_category = sim.calculate("LHA_category", year) brma = np.empty(len(region), dtype=object) - # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA + # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA. + # Use a seeded generator so the assignment is reproducible across builds; + # pandas .sample() otherwise draws from the unseeded global numpy RNG. 
lha_list_of_rents = pd.read_csv(STORAGE_FOLDER / "lha_list_of_rents.csv.gz") lha_list_of_rents = lha_list_of_rents.copy() + brma_rng = np.random.default_rng(0) for possible_region in lha_list_of_rents.region.unique(): for possible_lha_category in lha_list_of_rents.lha_category.unique(): @@ -1262,7 +1265,7 @@ def determine_education_level(fted_val, typeed2_val, age_val): ) mask = (region == possible_region) & (lha_category == possible_lha_category) brma[mask] = lha_list_of_rents[lor_mask].brma.sample( - n=len(region[mask]), replace=True + n=len(region[mask]), replace=True, random_state=brma_rng ) # Convert benunit-level BRMAs to household-level BRMAs (pick a random one) @@ -1276,7 +1279,9 @@ def determine_education_level(fted_val, typeed2_val, age_val): } ) - df = df.groupby("household_id").brma.aggregate(lambda x: x.sample(n=1).iloc[0]) + df = df.groupby("household_id").brma.aggregate( + lambda x: x.sample(n=1, random_state=brma_rng).iloc[0] + ) brmas = df[sim.calculate("household_id")].values pe_household["brma"] = brmas @@ -1430,9 +1435,15 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray: pe_benunit["is_married"] = frs["benunit"].famtypb2.isin([5, 7]) - # Stochastically set property_purchased based on UK housing transaction rate. - # Previously defaulted to True in policyengine-uk, causing all households - # to be charged SDLT as if they just bought their property (£370bn total). + # Assign property_purchased to a share of households matching the UK + # housing transaction rate, so only genuine purchasers are charged SDLT. + # + # This MUST be deterministic: a rules engine's inputs have to be + # reproducible across builds. Use a seeded Generator (not global + # np.random, whose state depends on whatever ran earlier in the build) + # so the same FRS input always yields the same assignment. An unseeded + # draw previously made the build non-reproducible and intermittently + # spiked the first decile's effective tax rate. 
# # Sources: # - Transactions: HMRC 2024 - 1.1m/year @@ -1443,11 +1454,13 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray: # # Verification against official SDLT revenue (2024-25): # - Official SDLT: £13.9bn (https://www.gov.uk/government/statistics/uk-stamp-tax-statistics) - # - With fix (3.85%): £15.7bn (close to official) - # - Without fix (100%): £370bn (26x too high) + # - With 3.85% purchasers: £15.7bn (close to official) + # - With every household a purchaser: £370bn (26x too high) PROPERTY_PURCHASE_RATE = 0.0385 + PROPERTY_PURCHASE_SEED = 0 + purchase_rng = np.random.default_rng(PROPERTY_PURCHASE_SEED) pe_household["property_purchased"] = ( - np.random.random(len(pe_household)) < PROPERTY_PURCHASE_RATE + purchase_rng.random(len(pe_household)) < PROPERTY_PURCHASE_RATE ) if not include_internal_disability_reported_amounts: diff --git a/policyengine_uk_data/datasets/imputations/capital_gains.py b/policyengine_uk_data/datasets/imputations/capital_gains.py index 9a4790cd..22fe38d0 100644 --- a/policyengine_uk_data/datasets/imputations/capital_gains.py +++ b/policyengine_uk_data/datasets/imputations/capital_gains.py @@ -117,6 +117,11 @@ def loss(blend_factor): logging.info("Imputing capital gains among those with gains") + # Draw imputation quantiles from a seeded generator so the build is + # reproducible: an unseeded global np.random made capital gains (and hence + # CGT revenue) differ between otherwise identical builds. + cg_rng = np.random.default_rng(0) + for i in range(len(capital_gains)): row = capital_gains.iloc[i] spline = UnivariateSpline( @@ -128,7 +133,7 @@ def loss(blend_factor): upper = row.maximum_total_income ti_in_range = (ti >= lower) * (ti < upper) in_target_range = has_cg * ti_in_range > 0 - quantiles = np.random.random(int(in_target_range.sum())) + quantiles = cg_rng.random(int(in_target_range.sum())) pred_capital_gains = spline(quantiles) new_cg[in_target_range] = pred_capital_gains