Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog.d/fix-state-income-tax-stc.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix the state income tax ETL to parse the official FY2023 Census STC `T40`
row instead of using a mismatched hardcoded table, correcting Washington,
New Hampshire, Tennessee, California, and other state targets.
2 changes: 1 addition & 1 deletion policyengine_us_data/db/DATABASE_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ make promote-database # Copy DB + raw inputs to HuggingFace clone
| 4 | `etl_age.py` | Census ACS 1-year | Age distribution: 18 bins x 488 geographies |
| 5 | `etl_medicaid.py` | Census ACS + CMS | Medicaid enrollment (admin state-level, survey district-level) |
| 6 | `etl_snap.py` | USDA FNS + Census ACS | SNAP participation (admin state-level, survey district-level) |
| 7 | `etl_state_income_tax.py` | No | State income tax collections (Census STC FY2023, hardcoded) |
| 7 | `etl_state_income_tax.py` | Census STC | State income tax collections (Census STC FY2023 `T40`, downloaded and cached) |
| 8 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata |
| 9 | `etl_pregnancy.py` | CDC VSRR + Census ACS | Pregnancy prevalence by state (provisional birth counts) |
| 10 | `validate_database.py` | No | Checks all target variables exist in policyengine-us |
Expand Down
115 changes: 35 additions & 80 deletions policyengine_us_data/db/etl_state_income_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"""

import logging

import pandas as pd
from sqlmodel import Session, create_engine

Expand All @@ -28,19 +29,11 @@

logger = logging.getLogger(__name__)


# States without individual income tax (these will have $0 target)
NO_INCOME_TAX_STATES = {
"AK", # Alaska
"FL", # Florida
"NV", # Nevada
"SD", # South Dakota
"TX", # Texas
"WA", # Washington (has capital gains tax only, modeled separately)
"WY", # Wyoming
"NH", # New Hampshire (phased out interest/dividends tax)
"TN", # Tennessee (phased out Hall income tax)
CENSUS_STC_FLAT_FILE_URLS = {
2023: "https://www2.census.gov/programs-surveys/stc/datasets/2023/FY2023-Flat-File.txt",
}
CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM = "T40"
CENSUS_STC_NOT_AVAILABLE = "X"

STATE_FIPS_TO_ABBREV = {
"01": "AL",
Expand Down Expand Up @@ -103,87 +96,50 @@ def extract_state_income_tax_data(year: int = 2023) -> pd.DataFrame:
"""
Extract state individual income tax collections from Census STC.

Uses hardcoded FY2023 values from Census Bureau's Annual Survey of
State Government Tax Collections. These values are derived from
Census STC Table 1: State Government Tax Collections by Category.

Source: https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
Parses the official FY2023 Census STC flat file and extracts item
``T40`` (Individual Income Taxes). Census reports amounts in
thousands of dollars, so the returned values are converted to
dollars. Cells marked ``X`` in the source are treated as 0.

Args:
year: Fiscal year for the data (currently only 2023 supported)

Returns:
DataFrame with state_fips, state_abbrev, and income_tax_collections
"""
cache_file = f"census_stc_individual_income_tax_{year}.json"
if year not in CENSUS_STC_FLAT_FILE_URLS:
raise ValueError(
f"Only years {sorted(CENSUS_STC_FLAT_FILE_URLS)} are supported, got {year}"
)

# Use a distinct cache key so existing bad hardcoded JSON cannot survive
# the switch to the official Census T40 download.
cache_file = f"census_stc_t40_individual_income_tax_{year}.json"

if is_cached(cache_file):
logger.info(f"Using cached {cache_file}")
data = load_json(cache_file)
return pd.DataFrame(data)

logger.info(f"Building Census STC individual income tax data for FY{year}")

# FY2023 values in dollars from Census STC
# Source: Census STC Table 1 - State Government Tax Collections by Category
# https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
stc_2023_individual_income_tax = {
"AL": 5_881_000_000,
"AK": 0,
"AZ": 5_424_000_000,
"AR": 4_352_000_000,
"CA": 115_845_000_000,
"CO": 13_671_000_000,
"CT": 10_716_000_000,
"DE": 1_747_000_000,
"DC": 3_456_000_000,
"FL": 0,
"GA": 15_297_000_000,
"HI": 2_725_000_000,
"ID": 2_593_000_000,
"IL": 21_453_000_000,
"IN": 8_098_000_000,
"IA": 5_243_000_000,
"KS": 4_304_000_000,
"KY": 6_163_000_000,
"LA": 4_088_000_000,
"ME": 2_246_000_000,
"MD": 11_635_000_000,
"MA": 18_645_000_000,
"MI": 12_139_000_000,
"MN": 14_239_000_000,
"MS": 2_477_000_000,
"MO": 9_006_000_000,
"MT": 1_718_000_000,
"NE": 3_248_000_000,
"NV": 0,
"NH": 0,
"NJ": 17_947_000_000,
"NM": 2_224_000_000,
"NY": 63_247_000_000,
"NC": 17_171_000_000,
"ND": 534_000_000,
"OH": 9_520_000_000, # Confirmed with Policy Matters Ohio
"OK": 4_253_000_000,
"OR": 11_583_000_000,
"PA": 16_898_000_000,
"RI": 1_739_000_000,
"SC": 6_367_000_000,
"SD": 0,
"TN": 0,
"TX": 0,
"UT": 5_464_000_000,
"VT": 1_035_000_000,
"VA": 17_934_000_000,
"WA": 0, # WA has capital gains tax but no broad income tax
"WV": 2_163_000_000,
"WI": 10_396_000_000,
"WY": 0,
}
stc_df = pd.read_csv(CENSUS_STC_FLAT_FILE_URLS[year], dtype=str)
item_rows = stc_df.loc[stc_df["ITEM"] == CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM]
if len(item_rows) != 1:
raise ValueError(
f"Expected exactly one Census STC row for item "
f"{CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM}, found {len(item_rows)}"
)
item_row = item_rows.iloc[0]

rows = []
for abbrev, value in stc_2023_individual_income_tax.items():
for abbrev in STATE_ABBREV_TO_FIPS:
fips = STATE_ABBREV_TO_FIPS[abbrev]
raw_value = item_row[abbrev]
value = (
0
if pd.isna(raw_value) or raw_value == CENSUS_STC_NOT_AVAILABLE
else int(raw_value) * 1000
)
rows.append(
{
"state_fips": fips,
Expand Down Expand Up @@ -318,15 +274,14 @@ def main():

# Print summary
total_collections = transformed_df["income_tax_collections"].sum()
states_with_tax = len(
[s for s in transformed_df["state_abbrev"] if s not in NO_INCOME_TAX_STATES]
)
states_with_tax = int((transformed_df["income_tax_collections"] > 0).sum())
states_without_tax = len(transformed_df) - states_with_tax

logger.info(
f"State Income Tax Targets Summary:\n"
f" Total states loaded: {len(stratum_lookup)}\n"
f" States with income tax: {states_with_tax}\n"
f" States without income tax: {len(NO_INCOME_TAX_STATES)}\n"
f" States without income tax: {states_without_tax}\n"
f" Total collections: ${total_collections / 1e9:.1f}B"
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,10 @@ def test_inactive_targets_are_excluded(self):
self.assertEqual(float(baseline_rows.iloc[0]["value"]), 10000.0)

def test_legacy_target_overview_without_reform_id(self):
b = self._make_builder()
_create_legacy_target_overview(self.engine)
try:
b = self._make_builder()
b._target_overview_columns = None
df = b._query_targets({"domain_variables": ["aca_ptc"]})
self.assertGreater(len(df), 0)
self.assertIn("reform_id", df.columns)
Expand Down
18 changes: 13 additions & 5 deletions policyengine_us_data/tests/test_database_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def test_jct_tax_expenditure_targets_have_distinct_reform_ids(built_db):


def test_state_income_tax_targets(built_db):
"""State income tax targets should cover all income-tax states."""
"""State income tax targets should match the official FY2023 Census T40 row."""
conn = sqlite3.connect(str(built_db))
rows = conn.execute("""
SELECT sc.value, t.value
Expand All @@ -185,12 +185,20 @@ def test_state_income_tax_targets(built_db):
n = len(state_totals)
assert n >= 42, f"Expected >= 42 state income tax targets, got {n}"

# California should be the largest, over $100B.
# Values come from Census STC FY2023 Table 1 / item T40
# (Individual Income Taxes), reported in thousands of dollars.
ca_val = state_totals.get("06") or state_totals.get("6")
assert ca_val is not None, "California (FIPS 06) target missing"
assert ca_val > 100e9, (
f"California income tax should be > $100B, got ${ca_val / 1e9:.1f}B"
)
assert ca_val == 96_379_294_000

wa_val = state_totals.get("53")
assert wa_val == 846_835_000

nh_val = state_totals.get("33")
assert nh_val == 149_485_000

tn_val = state_totals.get("47")
assert tn_val == 2_926_000


def test_congressional_district_strata(built_db):
Expand Down
66 changes: 66 additions & 0 deletions policyengine_us_data/tests/test_etl_state_income_tax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pandas as pd
import pytest

from policyengine_us_data.db import etl_state_income_tax as stc_module


def test_extract_state_income_tax_data_parses_census_t40(monkeypatch):
    """Check extraction against a stubbed Census STC flat file.

    The module's state mappings, cache probe, JSON writer and
    ``pd.read_csv`` are all replaced so no network or disk access happens.
    The test expects the ``T40`` row to be selected, numeric cells to be
    multiplied by 1000, an ``X`` cell to become 0, and the result to be
    saved under the T40-specific cache filename.
    """
    fips_to_abbrev = {
        "02": "AK",
        "06": "CA",
        "33": "NH",
        "47": "TN",
        "53": "WA",
    }
    # Inverse lookup, built in the same insertion order as the forward map.
    abbrev_to_fips = dict(zip(fips_to_abbrev.values(), fips_to_abbrev.keys()))

    # Captures whatever the extractor hands to save_json.
    saved = {}

    def fake_save_json(filename, data):
        saved.update(filename=filename, data=data)

    individual_income_tax_row = {
        "ITEM": "T40",
        "AK": "X",
        "CA": "96379294",
        "NH": "149485",
        "TN": "2926",
        "WA": "846835",
    }

    def fake_read_csv(url, dtype=str):
        # A decoy T00 row precedes the T40 row so row selection is exercised.
        return pd.DataFrame([{"ITEM": "T00"}, individual_income_tax_row])

    monkeypatch.setattr(stc_module, "STATE_FIPS_TO_ABBREV", fips_to_abbrev)
    monkeypatch.setattr(stc_module, "STATE_ABBREV_TO_FIPS", abbrev_to_fips)
    # Force the download/parse path instead of a cached result.
    monkeypatch.setattr(stc_module, "is_cached", lambda _: False)
    monkeypatch.setattr(stc_module, "save_json", fake_save_json)
    monkeypatch.setattr(stc_module.pd, "read_csv", fake_read_csv)

    df = stc_module.extract_state_income_tax_data(2023)

    collections_by_state = {
        abbrev: amount
        for abbrev, amount in zip(df["state_abbrev"], df["income_tax_collections"])
    }
    expected_collections = {
        "AK": 0,
        "CA": 96_379_294_000,
        "NH": 149_485_000,
        "TN": 2_926_000,
        "WA": 846_835_000,
    }

    assert collections_by_state == expected_collections
    assert saved["filename"] == "census_stc_t40_individual_income_tax_2023.json"
    assert saved["data"] == df.to_dict(orient="records")


def test_extract_state_income_tax_data_rejects_unsupported_year():
    """Expect a ``ValueError`` mentioning "Only years" for fiscal year 2022."""
    unsupported_year = 2022
    extract = stc_module.extract_state_income_tax_data

    with pytest.raises(ValueError, match="Only years"):
        extract(unsupported_year)
Loading