Skip to content
1 change: 1 addition & 0 deletions changelog.d/impute-cps-clone-features.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Donor-impute race, Hispanic status, sex, and occupation-based CPS features onto the PUF clone half of the extended CPS so subgroup analyses and overtime-eligibility inputs better align with PUF-imputed incomes.
20 changes: 20 additions & 0 deletions policyengine_us_data/calibration/source_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@

import numpy as np
import pandas as pd
from policyengine_us_data.datasets.cps.tipped_occupation import (
derive_any_treasury_tipped_occupation_code,
derive_is_tipped_occupation,
)

from policyengine_us_data.datasets.org import (
ORG_BOOL_VARIABLES,
Expand Down Expand Up @@ -80,6 +84,7 @@
"age",
"count_under_18",
"count_under_6",
"is_tipped_occupation",
]

SIPP_ASSETS_PREDICTORS = [
Expand Down Expand Up @@ -112,6 +117,8 @@
"NONE": 0,
}

# Per-job occupation column names, TJB1_OCC through TJB7_OCC (seven job slots).
SIPP_JOB_OCCUPATION_COLUMNS = [
    f"TJB{job_number}_OCC" for job_number in range(1, 8)
]


def _encode_tenure_type(df: pd.DataFrame) -> pd.DataFrame:
"""Convert tenure_type enum strings to numeric codes."""
Expand Down Expand Up @@ -384,6 +391,12 @@ def _impute_sipp(
sipp_df["age"] = sipp_df.TAGE
sipp_df["household_weight"] = sipp_df.WPFINWGT
sipp_df["household_id"] = sipp_df.SSUID
sipp_df["treasury_tipped_occupation_code"] = (
derive_any_treasury_tipped_occupation_code(sipp_df[SIPP_JOB_OCCUPATION_COLUMNS])
)
sipp_df["is_tipped_occupation"] = derive_is_tipped_occupation(
sipp_df["treasury_tipped_occupation_code"]
)

sipp_df["is_under_18"] = sipp_df.TAGE < 18
sipp_df["is_under_6"] = sipp_df.TAGE < 6
Expand All @@ -401,6 +414,7 @@ def _impute_sipp(
"count_under_18",
"count_under_6",
"age",
"is_tipped_occupation",
"household_weight",
]
tip_train = sipp_df[tip_cols].dropna()
Expand Down Expand Up @@ -431,6 +445,12 @@ def _impute_sipp(
else:
cps_tip_df["count_under_18"] = 0.0
cps_tip_df["count_under_6"] = 0.0
if "treasury_tipped_occupation_code" in data:
cps_tip_df["is_tipped_occupation"] = derive_is_tipped_occupation(
data["treasury_tipped_occupation_code"][time_period]
).astype(np.float32)
else:
cps_tip_df["is_tipped_occupation"] = 0.0

qrf = QRF()
logger.info(
Expand Down
10 changes: 10 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
)
from policyengine_us_data.utils.downsample import downsample_dataset_arrays
from policyengine_us_data.utils.randomness import seeded_rng
from policyengine_us_data.datasets.cps.tipped_occupation import (
derive_treasury_tipped_occupation_code,
derive_is_tipped_occupation,
)


class CPS(Dataset):
Expand Down Expand Up @@ -466,6 +470,9 @@ def children_per_parent(col: str) -> pd.DataFrame:
cps["is_full_time_college_student"] = person.A_HSCOL == 2

cps["detailed_occupation_recode"] = person.POCCU2
cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
person.PEIOOCC
)
add_overtime_occupation(cps, person)


Expand Down Expand Up @@ -1767,6 +1774,9 @@ def add_tips(self, cps: h5py.File):
raw_data = self.raw_cps(require=True).load()
raw_person = raw_data["person"]
cps["is_married"] = raw_person.A_MARITL.isin([1, 2]).values
cps["is_tipped_occupation"] = derive_is_tipped_occupation(
derive_treasury_tipped_occupation_code(raw_person.PEIOOCC)
)
raw_data.close()

cps["is_under_18"] = cps.age < 18
Expand Down
Loading
Loading