Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/deterministic-property-purchased.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make the FRS dataset build deterministic. Several assignments drew from the unseeded global NumPy RNG, so otherwise-identical builds produced different datasets. The affected assignments were `property_purchased` (which households are charged stamp duty), the capital gains imputation quantiles (which drive CGT revenue), and BRMA assignment (LHA/housing-benefit geography). Each now draws from its own seeded generator, so the same inputs always produce the same dataset.
31 changes: 22 additions & 9 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,9 +1251,12 @@ def determine_education_level(fted_val, typeed2_val, age_val):
lha_category = sim.calculate("LHA_category", year)
brma = np.empty(len(region), dtype=object)

# Sample from a random BRMA in the region, weighted by the number of observations in each BRMA
# Sample from a random BRMA in the region, weighted by the number of observations in each BRMA.
# Use a seeded generator so the assignment is reproducible across builds;
# pandas .sample() otherwise draws from the unseeded global numpy RNG.
lha_list_of_rents = pd.read_csv(STORAGE_FOLDER / "lha_list_of_rents.csv.gz")
lha_list_of_rents = lha_list_of_rents.copy()
brma_rng = np.random.default_rng(0)

for possible_region in lha_list_of_rents.region.unique():
for possible_lha_category in lha_list_of_rents.lha_category.unique():
Expand All @@ -1262,7 +1265,7 @@ def determine_education_level(fted_val, typeed2_val, age_val):
)
mask = (region == possible_region) & (lha_category == possible_lha_category)
brma[mask] = lha_list_of_rents[lor_mask].brma.sample(
n=len(region[mask]), replace=True
n=len(region[mask]), replace=True, random_state=brma_rng
)

# Convert benunit-level BRMAs to household-level BRMAs (pick a random one)
Expand All @@ -1276,7 +1279,9 @@ def determine_education_level(fted_val, typeed2_val, age_val):
}
)

df = df.groupby("household_id").brma.aggregate(lambda x: x.sample(n=1).iloc[0])
df = df.groupby("household_id").brma.aggregate(
lambda x: x.sample(n=1, random_state=brma_rng).iloc[0]
)
brmas = df[sim.calculate("household_id")].values

pe_household["brma"] = brmas
Expand Down Expand Up @@ -1430,9 +1435,15 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray:

pe_benunit["is_married"] = frs["benunit"].famtypb2.isin([5, 7])

# Stochastically set property_purchased based on UK housing transaction rate.
# Previously defaulted to True in policyengine-uk, causing all households
# to be charged SDLT as if they just bought their property (£370bn total).
# Assign property_purchased to a share of households matching the UK
# housing transaction rate, so only genuine purchasers are charged SDLT.
#
# This MUST be deterministic: a rules engine's inputs have to be
# reproducible across builds. Use a seeded Generator (not global
# np.random, whose state depends on whatever ran earlier in the build)
# so the same FRS input always yields the same assignment. An unseeded
# draw previously made the build non-reproducible and intermittently
# spiked the first decile's effective tax rate.
#
# Sources:
# - Transactions: HMRC 2024 - 1.1m/year
Expand All @@ -1443,11 +1454,13 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray:
#
# Verification against official SDLT revenue (2024-25):
# - Official SDLT: £13.9bn (https://www.gov.uk/government/statistics/uk-stamp-tax-statistics)
# - With fix (3.85%): £15.7bn (close to official)
# - Without fix (100%): £370bn (26x too high)
# - With 3.85% purchasers: £15.7bn (close to official)
# - With every household a purchaser: £370bn (26x too high)
PROPERTY_PURCHASE_RATE = 0.0385
PROPERTY_PURCHASE_SEED = 0
purchase_rng = np.random.default_rng(PROPERTY_PURCHASE_SEED)
pe_household["property_purchased"] = (
np.random.random(len(pe_household)) < PROPERTY_PURCHASE_RATE
purchase_rng.random(len(pe_household)) < PROPERTY_PURCHASE_RATE
)

if not include_internal_disability_reported_amounts:
Expand Down
7 changes: 6 additions & 1 deletion policyengine_uk_data/datasets/imputations/capital_gains.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ def loss(blend_factor):

logging.info("Imputing capital gains among those with gains")

# Draw imputation quantiles from a seeded generator so the build is
# reproducible: drawing from the unseeded global np.random state made imputed
# capital gains (and hence CGT revenue) differ between otherwise identical builds.
cg_rng = np.random.default_rng(0)

for i in range(len(capital_gains)):
row = capital_gains.iloc[i]
spline = UnivariateSpline(
Expand All @@ -128,7 +133,7 @@ def loss(blend_factor):
upper = row.maximum_total_income
ti_in_range = (ti >= lower) * (ti < upper)
in_target_range = has_cg * ti_in_range > 0
quantiles = np.random.random(int(in_target_range.sum()))
quantiles = cg_rng.random(int(in_target_range.sum()))
pred_capital_gains = spline(quantiles)
new_cg[in_target_range] = pred_capital_gains

Expand Down