-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathMakefile
More file actions
306 lines (245 loc) · 10.9 KB
/
Makefile
File metadata and controls
306 lines (245 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# Declare every command-style target as phony so a same-named file can never
# mask it. (Adds previously missing entries: changelog, documentation-build,
# documentation-serve, documentation-dev, data-legacy, validate-package,
# publish, paper-content, paper-tables, paper-results.)
.PHONY: all format test test-unit test-integration install changelog download upload docker \
    documentation documentation-build documentation-serve documentation-dev \
    data data-legacy validate-data calibrate calibrate-build validate-package \
    publish-local-area upload-calibration upload-dataset push-to-modal \
    build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both \
    stage-h5s stage-national-h5 stage-all-h5s pipeline \
    validate-staging validate-staging-full upload-validation check-staging check-sanity \
    clean build publish paper paper-content paper-tables paper-results clean-paper \
    presentations database database-refresh promote-dataset promote \
    build-h5s validate-local refresh-soi-targets push-pr-branch
# ---- Tunable parameters (override on the command line, e.g. `make YEAR=2025`) ----

# Source/target years for refreshing IRS SOI calibration targets
# (consumed by `refresh-soi-targets`). Previously declared twice; deduplicated.
SOI_SOURCE_YEAR ?= 2021
SOI_TARGET_YEAR ?= 2023
# Data year used by the database/ETL targets.
YEAR ?= 2024
# Modal GPU type and training epochs for the local-area calibration run.
GPU ?= T4
EPOCHS ?= 1000
# Modal GPU type and training epochs for the national calibration run.
NATIONAL_GPU ?= T4
NATIONAL_EPOCHS ?= 1000
# Current git branch. NOTE: `?=` is recursive, so git runs each time BRANCH is
# expanded — acceptable here since each target expands it only a few times.
BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD)
# Parallelism for the Modal h5-staging job.
NUM_WORKERS ?= 8
# Number of household clones used when building local-area h5s.
N_CLONES ?= 430
# Release version for `make promote`; auto-detected from pyproject.toml when empty.
VERSION ?=
# Local clone of the HuggingFace dataset repository.
HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
# Default goal: build all datasets, then run the full test suite.
all: data test

# Auto-format Python (ruff) and markdown docs (mdformat, wrapped at 100 cols).
format:
	ruff format .
	mdformat --wrap 100 docs/

# Run the entire pytest suite.
test:
	pytest

# Run only the unit tests.
test-unit:
	pytest tests/unit/

# Run only the integration tests.
test-integration:
	pytest tests/integration/

# Install the package in editable mode plus dev extras.
install:
	pip install policyengine-us
	pip install -e ".[dev]" --config-settings editable_mode=compat

# Bump the version and build the changelog; the embedded python one-liner
# reads the freshly bumped version back out of pyproject.toml for towncrier.
changelog:
	python .github/bump_version.py
	towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
# Fetch private input files required before building datasets.
download:
	python -m policyengine_us_data.storage.download_private_prerequisites

# Push finished datasets to remote storage.
upload:
	python -m policyengine_us_data.storage.upload_completed_datasets

# Build the linux/amd64 Docker image.
docker:
	docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest

# Clean and build the HTML documentation with MyST.
# NOTE: each recipe line runs in its own shell, so the second line re-enters
# docs/ before dropping .nojekyll into the built site (|| true keeps this
# best-effort if the build directory is missing).
documentation:
	cd docs && \
	rm -rf _build .jupyter_cache && \
	rm -f _toc.yml && \
	myst clean && \
	myst build --html
	cd docs && test -d _build/html && touch _build/html/.nojekyll || true
# Clean build of the HTML docs (same as `documentation` minus the .nojekyll step).
documentation-build:
	cd docs && \
	rm -rf _build .jupyter_cache && \
	rm -f _toc.yml && \
	myst clean && \
	myst build --html

# Serve the previously built docs site locally on port 8080.
documentation-serve:
	cd docs/_build/site && python3 -m http.server 8080

# Clean, then run the MyST live-reload dev server.
documentation-dev:
	cd docs && \
	rm -rf _build .jupyter_cache && \
	rm -f _toc.yml && \
	myst clean && \
	myst start
# Year the calibration database is built for. Defaults to the global YEAR
# (previously this variable was declared but never used — the recipes below
# hard-referenced $(YEAR); they now honor DATABASE_YEAR, whose default keeps
# behavior identical).
DATABASE_YEAR ?= $(YEAR)

# Rebuild the calibration SQLite database from scratch: drop the old file,
# create the schema, run every ETL loader for $(DATABASE_YEAR), then validate.
database:
	rm -f policyengine_us_data/storage/calibration/policy_data.db
	python -m policyengine_us_data.db.create_database_tables
	python -m policyengine_us_data.db.create_initial_strata --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_national_targets --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_age --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_medicaid --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_snap --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_tanf --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_state_income_tax --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_irs_soi --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_aca_agi_state_targets --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_aca_marketplace --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.etl_pregnancy --year $(DATABASE_YEAR)
	python -m policyengine_us_data.db.validate_database

# Hard refresh: also wipe cached raw inputs, then rebuild the database.
# Uses $(MAKE) (not `make`) so flags and the jobserver propagate.
database-refresh:
	rm -f policyengine_us_data/storage/calibration/policy_data.db
	rm -rf policyengine_us_data/storage/calibration/raw_inputs/
	$(MAKE) database
# Promote the built stratified CPS h5 to its published path on HuggingFace.
# NOTE(review): this recipe is identical to `upload-dataset` below — presumably
# kept separate so promotion and routine uploads can diverge later; confirm.
promote-dataset:
	python -c "from policyengine_us_data.utils.huggingface import upload; \
	upload('policyengine_us_data/storage/source_imputed_stratified_extended_cps_2024.h5', \
	'policyengine/policyengine-us-data', \
	'calibration/source_imputed_stratified_extended_cps.h5')"
	@echo "Dataset promoted to HF."
# Build the full dataset chain: prerequisites + database first, then each
# dataset module in dependency order (ACS, CPS, PUF, extended CPS,
# stratified/source-imputed CPS).
data: download database
	python -m policyengine_us_data.utils.uprating
	python -m policyengine_us_data.datasets.acs.acs
	python -m policyengine_us_data.datasets.cps.cps
	python -m policyengine_us_data.datasets.puf.irs_puf
	python -m policyengine_us_data.datasets.puf.puf
	python -m policyengine_us_data.datasets.cps.extended_cps
	python -m policyengine_us_data.calibration.create_stratified_cps
	python -m policyengine_us_data.calibration.create_source_imputed_cps

# Additionally build the legacy enhanced-CPS datasets on top of `data`.
data-legacy: data
	python -m policyengine_us_data.datasets.cps.enhanced_cps
	python -m policyengine_us_data.datasets.cps.small_enhanced_cps
# Run the unified calibration locally (requires the data build first).
calibrate: data
	python -m policyengine_us_data.calibration.unified_calibration \
	--target-config policyengine_us_data/calibration/target_config.yaml \
	--log-freq 100

# Build the calibration inputs/matrices only, without fitting weights.
calibrate-build: data
	python -m policyengine_us_data.calibration.unified_calibration \
	--target-config policyengine_us_data/calibration/target_config.yaml \
	--build-only

# Validate the assembled calibration package.
validate-package:
	python -m policyengine_us_data.calibration.validate_package
# Promote locally built area h5 files from local_area_build/.
publish-local-area:
	python -m policyengine_us_data.calibration.promote_local_h5s --local-dir local_area_build

# Build state-level h5 files from the fitted calibration weights.
# FIX: use $(N_CLONES) instead of a hard-coded 430 so the clone count stays in
# sync with the Modal targets (default is unchanged: N_CLONES ?= 430).
build-h5s:
	python -m policyengine_us_data.calibration.publish_local_area \
	--weights-path policyengine_us_data/storage/calibration/calibration_weights.npy \
	--dataset-path policyengine_us_data/storage/source_imputed_stratified_extended_cps_2024.h5 \
	--n-clones $(N_CLONES) \
	--seed 42 \
	--states-only

# Validate locally built state h5s; writes validation_results.csv.
validate-local:
	python -m policyengine_us_data.calibration.validate_staging \
	--hf-prefix local_area_build \
	--area-type states --output validation_results.csv
# Sanity-check every completed dataset before upload.
validate-data:
	python -c "from policyengine_us_data.storage.upload_completed_datasets import validate_all_datasets; validate_all_datasets()"

# Regenerate IRS SOI table targets, projecting SOI_SOURCE_YEAR data onto
# SOI_TARGET_YEAR (both overridable on the command line).
refresh-soi-targets:
	python -m policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets \
	--source-year $(SOI_SOURCE_YEAR) \
	--target-year $(SOI_TARGET_YEAR)
# Push the current branch to the `upstream` remote as a PR branch.
# Guard: refuses to run when the checked-out branch is main.
push-pr-branch:
	@if [ "$(BRANCH)" = "main" ]; then \
	echo "Refusing to push main as a PR branch."; \
	exit 1; \
	fi
	@git push -u upstream $(BRANCH)
# Upload calibration artifacts (weights, logs, etc.) to HuggingFace.
upload-calibration:
	python -c "from policyengine_us_data.utils.huggingface import upload_calibration_artifacts; \
	upload_calibration_artifacts()"

# Upload the stratified CPS h5 to its HuggingFace path.
upload-dataset:
	python -c "from policyengine_us_data.utils.huggingface import upload; \
	upload('policyengine_us_data/storage/source_imputed_stratified_extended_cps_2024.h5', \
	'policyengine/policyengine-us-data', \
	'calibration/source_imputed_stratified_extended_cps.h5')"
	@echo "Dataset uploaded to HF."
# Push the three pipeline artifacts (weights, database, dataset h5) to the
# Modal `pipeline-artifacts` volume, overwriting any existing copies.
push-to-modal:
	modal volume put pipeline-artifacts \
	policyengine_us_data/storage/calibration/calibration_weights.npy \
	artifacts/calibration_weights.npy --force
	modal volume put pipeline-artifacts \
	policyengine_us_data/storage/calibration/policy_data.db \
	artifacts/policy_data.db --force
	modal volume put pipeline-artifacts \
	policyengine_us_data/storage/source_imputed_stratified_extended_cps_2024.h5 \
	artifacts/source_imputed_stratified_extended_cps.h5 --force
	@echo "All pipeline artifacts pushed to Modal volume."
# Build the county-level calibration package remotely on Modal (detached).
build-matrices:
	modal run --detach modal_app/remote_calibration_runner.py::build_package \
	--branch $(BRANCH) --county-level --n-clones $(N_CLONES)

# Run local-area calibration on Modal with the GPU/EPOCHS knobs above.
calibrate-modal:
	modal run --detach modal_app/remote_calibration_runner.py::main \
	--branch $(BRANCH) --gpu $(GPU) --epochs $(EPOCHS) \
	--beta 0.65 --lambda-l0 1e-7 --lambda-l2 1e-8 --log-freq 100 \
	--target-config policyengine_us_data/calibration/target_config.yaml \
	--push-results

# Run the national calibration on Modal (note the different regularization
# strengths vs. the local-area run).
calibrate-modal-national:
	modal run --detach modal_app/remote_calibration_runner.py::main \
	--branch $(BRANCH) --gpu $(NATIONAL_GPU) \
	--epochs $(NATIONAL_EPOCHS) \
	--beta 0.65 --lambda-l0 2e-2 --lambda-l2 1e-12 --log-freq 100 \
	--target-config policyengine_us_data/calibration/target_config.yaml \
	--push-results --national

# Launch both calibration jobs concurrently in one shell line and wait.
calibrate-both:
	$(MAKE) calibrate-modal & $(MAKE) calibrate-modal-national & wait

# Stage per-area h5 files on Modal using NUM_WORKERS parallel workers.
stage-h5s:
	modal run --detach modal_app/local_area.py::main \
	--branch $(BRANCH) --num-workers $(NUM_WORKERS) --n-clones $(N_CLONES)

# Stage the national h5 on Modal.
stage-national-h5:
	modal run --detach modal_app/local_area.py::main_national \
	--branch $(BRANCH) --n-clones $(N_CLONES)

# Stage area and national h5s concurrently and wait for both.
stage-all-h5s:
	$(MAKE) stage-h5s & $(MAKE) stage-national-h5 & wait
# Run the full Modal promote pipeline after interactive confirmation.
# `$$confirm` escapes the shell variable from make; the $(eval ...) resolves
# VERSION from pyproject.toml only if it was not provided on the command line.
promote:
	@echo "This will run the full Modal promote pipeline (local_area.py::main_promote)."
	@read -p "Are you sure? [y/N] " confirm && [ "$$confirm" = "y" ] || (echo "Aborted."; exit 1)
	$(eval VERSION := $(or $(VERSION),$(shell python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")))
	modal run --detach modal_app/local_area.py::main_promote \
	--branch $(BRANCH) --version $(VERSION)
# Validate staged state h5s; pass RUN_ID=<id> to target a specific run
# ($(if ...) drops the flag entirely when RUN_ID is unset).
validate-staging:
	python -m policyengine_us_data.calibration.validate_staging \
	--area-type states --output validation_results.csv \
	$(if $(RUN_ID),--run-id $(RUN_ID))

# Same validation, but covering states AND congressional districts.
validate-staging-full:
	python -m policyengine_us_data.calibration.validate_staging \
	--area-type states,districts --output validation_results.csv \
	$(if $(RUN_ID),--run-id $(RUN_ID))

# Upload the validation results CSV to the HuggingFace logs folder.
upload-validation:
	python -c "from policyengine_us_data.utils.huggingface import upload; \
	upload('validation_results.csv', \
	'policyengine/policyengine-us-data', \
	'calibration/logs/validation_results.csv')"

# Check aggregate sums of the staged h5s (optionally for a specific RUN_ID).
check-staging:
	python -m policyengine_us_data.calibration.check_staging_sums \
	$(if $(RUN_ID),--run-id $(RUN_ID))

# Quick sanity pass on a single state (NC) only.
check-sanity:
	python -m policyengine_us_data.calibration.validate_staging \
	--sanity-only --area-type states --areas NC \
	$(if $(RUN_ID),--run-id $(RUN_ID))
# Build the datasets remotely on Modal, uploading results and skipping tests.
build-data-modal:
	modal run --detach modal_app/data_build.py::main --branch $(BRANCH) --upload --skip-tests

# Run the end-to-end pipeline on Modal with all tuning knobs.
# NOTE(review): this uses the dotted module path `modal_app.pipeline` while
# every other target uses a file path like `modal_app/local_area.py` —
# presumably intentional (modal accepts both forms); verify.
pipeline:
	modal run --detach modal_app.pipeline::main \
	--action run --branch $(BRANCH) --gpu $(GPU) \
	--epochs $(EPOCHS) --national-gpu $(NATIONAL_GPU) \
	--national-epochs $(NATIONAL_EPOCHS) \
	--num-workers $(NUM_WORKERS) --n-clones $(N_CLONES)
# Remove generated artifacts: h5/db outputs, git-ignored CSVs (excluding
# virtualenv/node_modules trees; xargs -r avoids running rm on empty input),
# and the docs build directory.
clean:
	rm -f policyengine_us_data/storage/*.h5
	rm -f policyengine_us_data/storage/*.db
	git ls-files --others --ignored --exclude-standard -- '*.csv' | grep -Ev '(^|/)(\.venv|venv|env|\.tox|\.nox|node_modules)/' | xargs -r rm -f
	rm -rf policyengine_us_data/docs/_build

# Build the sdist/wheel with PEP 517 `build`.
build:
	python -m build

# Upload the built distributions to PyPI.
publish:
	twine upload dist/*
# Regenerate paper sections and docs from the unified content source.
paper-content:
	@echo "Building paper sections and docs from unified content..."
	python paper/scripts/build_from_content.py

# Regenerate every LaTeX table used by the paper.
paper-tables:
	@echo "Generating all LaTeX tables..."
	python paper/scripts/generate_all_tables.py

# Regenerate validation/distributional results (tables first, via prereq).
paper-results: paper-tables
	@echo "Generating paper results tables and figures..."
	python paper/scripts/generate_validation_metrics.py
	python paper/scripts/calculate_distributional_metrics.py
	@echo "Paper results generated in paper/results/"
# Build the full paper: regenerate content and results, then the PDF.
paper: paper-content paper-results paper/woodruff_ghenis_2024_enhanced_cps.pdf

# FIX: GNU make's $(wildcard) does not recurse on `**` (it behaves like a
# single `*`), so section .tex files nested deeper than one directory were
# invisible to the dependency list. Collect them with find instead.
# `:=` expands the shell call once at parse time.
PAPER_SECTION_TEX := $(shell find paper/sections -name '*.tex' 2>/dev/null)

# Two trailing pdflatex passes resolve cross-references and the bibliography.
paper/woodruff_ghenis_2024_enhanced_cps.pdf: $(PAPER_SECTION_TEX) $(wildcard paper/bibliography/*.bib) paper/main.tex paper/macros.tex
	cd paper && \
	BIBINPUTS=./bibliography pdflatex main && \
	BIBINPUTS=./bibliography bibtex main && \
	pdflatex -jobname=woodruff_ghenis_2024_enhanced_cps main && \
	pdflatex -jobname=woodruff_ghenis_2024_enhanced_cps main

# Remove LaTeX intermediates and the PDF.
# FIX: the original `paper/sections/**/*.aux` shell glob only recurses under
# bash with globstar enabled; plain sh treats ** as *. Use find for all depths.
clean-paper:
	rm -f paper/*.aux paper/*.bbl paper/*.blg paper/*.log paper/*.out paper/*.toc paper/*.pdf
	find paper/sections -name '*.aux' -delete 2>/dev/null || true
# Build the NTA 2024 slide deck (two pdflatex passes for cross-references).
presentations: presentations/nta_2024_11/nta_2024_slides.pdf

presentations/nta_2024_11/nta_2024_slides.pdf: presentations/nta_2024_11/main.tex
	cd presentations/nta_2024_11 && \
	pdflatex -jobname=nta_2024_slides main && \
	pdflatex -jobname=nta_2024_slides main