diff --git a/input/InitialPopulations/compile/RegressionEstimates/00_master_conditions.do b/input/InitialPopulations/compile/RegressionEstimates/00_master_conditions.do new file mode 100644 index 000000000..20ee28b3b --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/00_master_conditions.do @@ -0,0 +1,219 @@ +/******************************************************************************* +* PROJECT: SimPaths UK +* DO-FILE NAME: 00_master_conditions.do +* DESCRIPTION: Sets out the assumptions and conditions imposed in the +* creation of the unique dataset and the if conditions +* imposed when estimating the processes for SimPaths. +******************************************************************************** +* COUNTRY: UK +* AUTHORS: Daria Popova +* LAST UPDATE: 6 May 2026 DP +******************************************************************************** +* ----------------------------------------------------------------------------- +* Assumptions imposed to align the initial populations with simulation rules +* ----------------------------------------------------------------------------- +* +* - Retirement: +* - Treated as an absorbing state +* - Must retire by a specified maximum age +* - Cannot retire before a specified minimum age +* +* - Education: +* - Leave education no earlier than a specified minimum age +* - Must leave the initial education spell by a specified maximum age +* - Cannot return to education after retirement +* +* - Work: +* - Can work from a specified minimum age +* - Activity status and hours of work populated consistently: +* → Assume not working if report hours = 0 +* → Assume hours = 0 if not working +* - If missing partial information, don't assume the missing is 0 and +* impute (hot-deck) +* +* - Leaving the parental home: +* - Can leave from a specified minimum age +* - Become the effective head of hh even when living with parents when +* parents retire or reach state retirement age +* +* - Home ownership: 
+* - Can own a home from a specified minimum age +* +* - Partnership formation: +* - Can form a partnership from a specified minimum age +* +* - Disability: +* - Treated as a subsample of the not-employed population +* +* The relevant age thresholds are defined in globals defined in "DEFINE +* PARAMETERS" section below. +* Throughout also construct relevant flags and produce an Excel file "flag_descriptves" to +* see the extent of the adjustments to the raw data. +* +* ----------------------------------------------------------------------- +* Additional notes on implementation: +* ----------------------------------------------------------------------- +* Current imputations : +* - Self-rated health status (ordered probit model) +* - Subjective well-being (linear regression) +* - Mental and physical component summaries (linear regression) +* - Impute highest parental education status (ordered probit model) +* - Impute education status using lagged observation and generalized ordered logit +* - Impute working hours if missing but the person is in work (panel based imputation + hot-deck) +* - Impute observed hourly wages if missing but the person is in work (panel based imputation + hot-deck) +* +* ----------------------------------------------------------------------- +* Remaining disparities between initial populations and simulation rules: +* ----------------------------------------------------------------------- +* - Ages at which females can have a child. [Be informed by the sample?] +* Permit teenage mothers in this script (deal with in 03_ ) +* - A few higher/older education spells (30+) that last multiple years, whilst +* in the simulation can only return to education for single year spells. +* - Should we have people becoming adults at 18 or 16 for income/number of +* children purposes? +* Considered a child if live with parents until 18 and in ft education? +* - Don't impose monotonicity on reported educational attainment information. 
+* - Number of children vars (all ages or 0-2) don't account for feasibility +* of age at birth of the mother. +*******************************************************************************/ + +/******************************************************************************* +* DEFINE PARAMETERS +*******************************************************************************/ + +global country "UK" + +global first_sim_year "2010" + +global last_sim_year "2025" + + + +* Globals used for all processes (estimation weight, region dummies, ethnicity dummies) +global weight "dwt" + +//global regions "UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN" //UKI is London (reference) +global regions "demRgnUKC demRgnUKD demRgnUKE demRgnUKF demRgnUKG demRgnUKH demRgnUKJ demRgnUKK demRgnUKL demRgnUKM demRgnUKN" //demRgnUKI is London (reference) + +//global ethnicity "Ethn_Asian Ethn_Black Ethn_Other" //White is reference. Mixed race & undefined are in Other category +global ethnicity "demEthnC4Asian demEthnC4Black demEthnC4Other" //White is reference. Mixed race & undefined are in Other category + +* Define threshold ages +/* +Ages used for specifying samples. 
+ENSURE THE SAME AS THE GLOBALS USED IN THE INITIAL POPULATIONS MASTER FILE +*/ + +* Age become an adult in various dimensions +global age_becomes_responsible 18 + +global age_becomes_semi_responsible 16 + +global age_seek_employment 16 + +global age_leave_school 16 + +global age_form_partnership 18 + +global age_have_child_min 18 + +global age_leave_parental_home 18 + +global age_own_home 18 + +* Age can/must/cannot make various transitions +global age_max_dep_child 17 + +global age_adult 18 + +global age_can_retire 50 + +global age_force_retire 75 + +global age_force_leave_spell1_edu 30 + +global age_have_child_max 49 // allow this to be led by the data + + +/******************************************************************************* +* PROCESS IF CONDITIONS +*******************************************************************************/ + +* Education +global e1a_if_condition "dag >= ${age_leave_school} & dag < ${age_force_leave_spell1_edu} & l.les_c4 == 2" + +global e1b_if_condition "dag >= ${age_leave_school} & l.les_c4 != 4 & l.les_c4 != 2" + +global e2_if_condition "dag >= ${age_leave_school} & l.les_c4 == 2 & les_c4 != 2" + +* Leave the parental home +global p1_if_condition "ded == 0 & dag >= ${age_leave_parental_home}" + +* Partnership +global u1_if_condition "dag >= ${age_form_partnership} & ssscp != 1" + +global u2_if_condition "dgn == 0 & dag >= ${age_form_partnership} & l.ssscp != 1" + +* Fertility +global f1_if_condition "dag >= ${age_have_child_min} & dag <= ${age_have_child_max} & dgn == 0" + +* Health +global h1_if_condition "dag >= ${age_becomes_semi_responsible} & flag_dhe_imp == 0" + +global h2_if_condition "dag >= ${age_becomes_semi_responsible} & ded == 0" + +* Home ownership +global ho1_if_condition "dag >= ${age_own_home}" + +* Retirement +global r1a_if_condition "dcpst == 2 & dag >= ${age_can_retire}" + +global r1b_if_condition "ssscp != 1 & dcpst == 1 & dag >= ${age_can_retire}" + + +* WAGES +global wages_f_no_prev_if_condition "dgn == 
0 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 0 & deh_c4>0" + +global wages_m_no_prev_if_condition "dgn == 1 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 0 & deh_c4>0" + +global wages_f_prev_if_condition "dgn == 0 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 1 & deh_c4>0" + +global wages_m_prev_if_condition "dgn == 1 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 1 & deh_c4>0" + + +* CAPITAL INCOME +global i1a_if_condition "dag >= ${age_becomes_semi_responsible}" + +global i1b_if_condition "dag >= ${age_becomes_semi_responsible} & receives_ypncp == 1" + +* PRIVATE PENSION INCOME +global i2b_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd==1 & receives_ypnoab==1" + +global i3a_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2" + +global i3b_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 & receives_ypnoab==1" + + +* SOCIAL CARE +global s2a_if_condition "dag > 64 & stm >= 15 & stm <= 22" // Need care + +global s2b_if_condition "dag > 64 & stm >= 16 & stm <= 21" // Receive care + +global s2c_if_condition "dag > 64 & receive_care & stm >= 16 & stm <= 21" // Care mix received + +global s2d_if_condition "dag > 64 & receive_informal_care & stm >= 16 & stm <= 21" // Informal care hours received + +global s2e_if_condition "dag > 64 & receive_formal_care & stm >= 16 & stm <= 21" // Formal care hours received + + +global s3a_if_condition "Single & stm >= 15" // Provide care, Singles + +global s3b_if_condition "Partnered & stm >= 15" // Provide care, Partnered + +global s3c_if_condition "provide_informal_care & Single & stm >= 15" // Informal care hours provided, Singles + +global s3d_if_condition "provide_informal_care & Partnered & stm >= 15" // Informal care hours provided, Partnered + + +* Financial distress and health processes +* TO ADD 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/00_master_regression_estimates.do b/input/InitialPopulations/compile/RegressionEstimates/00_master_regression_estimates.do new file mode 100644 index 000000000..d9a4681cc --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/00_master_regression_estimates.do @@ -0,0 +1,127 @@ + +*************************************************************************************** +* PROJECT: SimPaths UK: regression estimates for SimPaths using UKHLS data +* DO-FILE NAME: 00_master_regression_estimates.do +* DESCRIPTION: Main do-file to set the main parameters (country, paths) and call sub-scripts +*************************************************************************************** +* COUNTRY: UK +* DATA: UKHLS EUL version - UKDA-6614-stata [to wave o] +* +* AUTHORS: Daria Popova, Justin van de Ven +* LAST UPDATE: 6 May 2026 DP +*************************************************************************************** + +*************************************************************************************** +* General comments: +* - Note that in the following scripts some standard commands may be +* abbreviated: (gen)erate, (tab)ulate, (sum)marize, (di)splay, +* (cap)ture, (qui)etly, (noi)sily + +*Stata packages to install +*ssc install fre +*ssc install tsspell +*ssc install carryforward +*ssc install outreg2 +*ssc install oparallel +*ssc install gologit2 +*ssc install winsor +*ssc install reghdfe +*ssc install ftools +*ssc install require +* +* NOTES: +* The income and union parameter do-files must be run after +* the wage estimates are obtained because they use +* predicted wages. The order of the remaining files is +* arbitrary. 
+*************************************************************************************** +*************************************************************************************** + +clear all +set more off +set type double +set maxvar 30000 +set matsize 1000 + + +/************************************************************************************** +* DEFINE DIRECTORIES +**************************************************************************************/ + +* Working directory (machine-specific absolute path: edit before running on another machine) + +global path "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\input_processing" + +global dir_work "${path}\regression_estimates" + +* Directory which contains do files +global dir_do "${dir_work}\do" + +* Directory which contains log files +global dir_log "${dir_work}\log" + +* Directory which contains raw output: Excel and Word tables +global dir_raw_results "${dir_work}\raw_results" + +* Directory which contains final Excel files read by the model +global dir_results "${dir_work}\results" + +* Pooled dataset for estimates +global estimation_sample "${path}\initial_populations\data\ukhls_pooled_ipop.dta" + +* Pooled dataset with predicted wages after Heckman +global estimation_sample2 "${path}\initial_populations\data\UKHLS_pooled_ipop2.dta" + +* Directory containing external data used for the estimates (e.g. 
fertility rates, wage growth) +global dir_external_data "${dir_work}/external_data" + +* Directory to save data for internal validation +global dir_validation_data "${dir_work}/internal_validation/data" + +/******************************************************************************* +* DEFINE PARAMETERS & PROCESS IF CONDITIONS - NOTE(review): read from the path root, whereas every other do-file is read from dir_do - confirm location +*******************************************************************************/ + +do "${path}\00_master_conditions.do" + + +/******************************************************************************* +* ESTIMATION FILES +*******************************************************************************/ +/* +Two additional do-files are called from each of these do-files +- variable_update.do refactors variable names +- programs.do contains Stata programs to process the output of regressions and create Excel files with results used by SimPaths + NOTE(review): the header NOTES say the union do-file must run after the wage estimates, yet 03_reg_partnership.do runs before 08_reg_wages.do below - confirm the intended order */ + +do "${dir_do}/01_reg_education.do" + +do "${dir_do}/02_reg_leave_parental_home.do" + +do "${dir_do}/03_reg_partnership.do" + +do "${dir_do}/04_reg_fertility.do" + +do "${dir_do}/05_reg_health.do" + +do "${dir_do}/06_reg_home_ownership.do" + +do "${dir_do}/07_reg_retirement.do" + +do "${dir_do}/08_reg_wages.do" + +do "${dir_do}/09_reg_income.do" + +do "${dir_do}/10_reg_socialcare.do" + +/*Note that the do-files below are not yet refactored */ +do "${dir_do}/11_reg_financial_distress.do" + +do "${dir_do}/12_reg_health_mental.do" + +do "${dir_do}/13_reg_health_wellbeing.do" + + +/************************************************************************************** +* END OF FILE +**************************************************************************************/ diff --git a/input/InitialPopulations/compile/RegressionEstimates/01_reg_education.do b/input/InitialPopulations/compile/RegressionEstimates/01_reg_education.do new file mode 100644 index 000000000..e90ce2676 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/01_reg_education.do @@ -0,0 +1,136 @@ 
+/******************************************************************************* +* PROJECT: SimPaths UK +* SECTION: Education +* OBJECT: Final Probit & Generalised Logit Models - Weighted +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven, +* Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: Expects the globals set by the master do-files (dir_log, dir_results, dir_do, +* estimation_sample, weight, regions, ethnicity and the e1a/e1b/e2 if-conditions) +*******************************************************************************/ + +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_education.log", replace + + +/******************************* SET EXCEL FILE *******************************/ + +putexcel set "$dir_results/reg_education", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters governing projection of education status" +putexcel A2 = "Authors:" +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit:" +putexcel B3 = "15 April 2026 (DP)" + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold + +putexcel A6 = "E1a" +putexcel B6 = "Prob. remain in education" + +putexcel A7 = "E1b" +putexcel B7 = "Prob. return to education" + +putexcel A8 = "E2" +putexcel B8 = "Educational attainment when leave education" + +putexcel A9 = "E2_raw" +putexcel B9 = "Raw attainment results" + +putexcel A11 = "Notes:", bold +putexcel B11 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B12 = "Conditions for processes are defined as globals in master.do" +//putexcel B13 = "E1a: Compared to the previous version, where age and age squared were used, age is now centered (at age 23) and its effect is allowed to change after age 18." 
+ +putexcel set "$dir_results/reg_education", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* PREPARE DATA *******************************/ + +use "${estimation_sample}", clear + +* Set data (declare the person-wave panel; the l. lag operators in the if-conditions rely on it) +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + + +/********************************** ESTIMATION ********************************/ + +* Load the Stata programs (process_regression, process_gologit) that export results to Excel +do "${dir_do}/programs.do" + + +/****************** E1a: PROBABILITY OF REMAINING IN EDUCATION ****************/ +display "${e1a_if_condition}" + + +probit Dst /// + demMaleFlag demAge demAgeSq eduSampleFlagL1 /// + eduHighestParentC3MediumL1 eduHighestParentC3LowL1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${e1a_if_condition} [pw=${weight}], vce(robust) + + +process_regression, domain("education") process("E1a") sheet("E1a") /// + title("Process E1a: Prob. remain in education") /// + gofrow(3) goflabel("E1a - Remain in education") /// + ifcond("${e1a_if_condition}") probit + + + +/****************** E1b: PROBABILITY OF RETURNING TO EDUCATION ****************/ +display "${e1b_if_condition}" + +probit der /// + demMaleFlag demAge demAgeSq demPartnerStatusPartneredL1 /// + eduHighestC4HighL1 eduHighestC4LowL1 /// + eduHighestParentC3MediumL1 eduHighestParentC3LowL1 /// + labStatusC3NotEmployedL1 /*labStatusC3EmployedL1*/ /// + demNChildL1 demNChild0to2L1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${e1b_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("education") process("E1b") sheet("E1b") /// + title("Process E1b: Prob. 
return to education") /// + gofrow(7) goflabel("E1b - Return to education") /// + ifcond("${e1b_if_condition}") probit + + +/****************** E2: EDUCATION ATTAINMENT WHEN LEAVE SCHOOL ****************/ +display "${e2_if_condition}" + +gologit2 deh_c3_recoded /// + demMaleFlag demAge demAgeSq /// + eduHighestParentC3MediumL1 eduHighestParentC3LowL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${e2_if_condition} [pw=${weight}] , autofit + + +process_gologit, domain("education") process("E2") sheet("E2") /// + title("Process E2: Educational Attainment When Leave School") /// + gofrow(11) goflabel("E2 - Education attainment") /// + outcomes(3) /// + ifcond("${e2_if_condition}") + + +display "Education analysis complete!" + + +capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/02_reg_leave_parental_home.do b/input/InitialPopulations/compile/RegressionEstimates/02_reg_leave_parental_home.do new file mode 100644 index 000000000..7084edb6d --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/02_reg_leave_parental_home.do @@ -0,0 +1,87 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Leaving Parental Home +* OBJECT: Final Probit Regression Model +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: Expects the globals set by the master do-files (dir_log, dir_results, dir_do, estimation_sample, weight, regions, ethnicity, p1_if_condition) +********************************************************************************** + +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_leave_parental_home.log", replace + + +/********************************* SET EXCEL FILE *****************************/ + +putexcel set "$dir_results/reg_leave_parental_home", sheet("Info") replace +putexcel A1 = "Description:", bold 
+putexcel B1 = "Model parameters governing leaving parental home" +putexcel A2 = "Authors:" +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP)" + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "P1" +putexcel B6 = "Prob. leave the parental home, transitioning out of adult child status" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B11 = "Conditions for processes are defined as globals in master.do" + +putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data (declare the person-wave panel) +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + + +/********************************** ESTIMATION ********************************/ + +* Load the Stata programs (process_regression) that export results to Excel +do "${dir_do}/programs.do" + + +/**************** P1: PROBABILITY OF LEAVING THE PARENTAL HOME ****************/ +display "${p1_if_condition}" + +probit dlftphm /// + demMaleFlag demAge demAgeSq /// + eduHighestC4NaL1 eduHighestC4MediumL1 eduHighestC4LowL1 /// + labStatusC3StudentL1 labStatusC3NotEmployedL1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${p1_if_condition} [pw=${weight}], vce(robust) + + +process_regression, domain("leave_parental_home") process("P1") sheet("P1") /// + title("Process P1: Prob. leave parental home") /// + gofrow(3) goflabel("P1 - Leave parental home") /// + ifcond("${p1_if_condition}") probit + + +display "Leaving parental home analysis complete!" 
+ +cap log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/03_reg_partnership.do b/input/InitialPopulations/compile/RegressionEstimates/03_reg_partnership.do new file mode 100644 index 000000000..4d45c7c05 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/03_reg_partnership.do @@ -0,0 +1,115 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Unions +* OBJECT: Final Probit Models +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: +* Combined former a and b processes (U1a and U1b are now the single process U1). +******************************************************************************** + +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_partnership.log", replace + + +/********************************* SET EXCEL FILE *****************************/ + +putexcel set "$dir_results/reg_partnership", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters for relationship status projection" +putexcel A2 = "Authors:" +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP)" + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "U1 " +putexcel B6 = "Prob enter partnership" +putexcel A7 = "U2" +putexcel B7 = "Prob exit partnership" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B11 = "Conditions for processes are defined as globals in master.do" +putexcel B12 = "Combined former processes U1a and U1b" + +putexcel set "$dir_results/reg_partnership", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + 
+/********************************* PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data (declare the person-wave panel; u2_if_condition uses the l. lag operator) +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + +* Load the Stata programs (process_regression) that export results to Excel +do "${dir_do}/programs.do" + +/********************************** ESTIMATION ********************************/ + +/******************** U1: PROBABILITY FORMING PARTNERSHIP *********************/ +display "${u1_if_condition}" + +probit dcpen /// + eduSampleFlag demMaleFlag demAge demAgeSq /// + demNChildL1 demNChild0to2L1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + eduSampleFlag_demMaleFlag eduSampleFlag_demNChildL1 eduSampleFlag_demNChild0to2L1 /// + eduSampleFlag_Q2L1 eduSampleFlag_Q3L1 eduSampleFlag_Q4L1 eduSampleFlag_Q5L1 /// + eduHighestC4NaL1 eduHighestC4HighL1 eduHighestC4MediumL1 eduHighestC4LowL1 /// NOTE(review): all four C4 education dummies are included here, whereas E1b/P1/F1/H1/H2 each leave at least one out - check for collinearity with the constant + labStatusC4EmployedL1 labStatusC4StudentL1 labStatusC4RetiredL1 /// + labStatusC4EmployedL1_Male labStatusC4StudentL1_Male labStatusC4RetiredL1_Male /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${u1_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("partnership") process("U1") sheet("U1") /// + title("Process U1: Prob. 
form partnership") /// + gofrow(3) goflabel("U1 - Form partnership") /// + ifcond("${u1_if_condition}") probit + + +/******************* U2: PROBABILITY TERMINATE PARTNERSHIP ********************/ +display "${u2_if_condition}" + +probit dcpex /// + eduSampleFlag demMaleFlag demAge demAgeSq /// NOTE(review): u2_if_condition restricts the sample to dgn == 0; if demMaleFlag is derived from dgn it is constant here - confirm + eduHighestC4NaL1 eduHighestC4HighL1 eduHighestC4MediumL1 eduHighestC4LowL1 /// NOTE(review): all four C4 education dummies are included - check for collinearity with the constant + eduHighestPartnerC3MediumL1 eduHighestPartnerC3LowL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + healthPhysicalPartnerPcsL1 healthMentalPartnerMcsL1 /// + demPartnerNYearL1 demEnterPartnerFlagL1 demAgePartnerDiffL1 /// + demNChildL1 demNChild0to2L1 /// + labStatusPartnerAndOwnC42L1 labStatusPartnerAndOwnC43L1 labStatusPartnerAndOwnC44L1 /// + yNonBenPersGrossMonthL1 yPersAndPartnerGrossDiffMonthL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${u2_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("partnership") process("U2") sheet("U2") /// + title("Process U2: Prob. end partnership") /// + gofrow(7) goflabel("U2 - End partnership") /// + ifcond("${u2_if_condition}") probit + + +display "Partnership analysis complete!" + + +capture log close + diff --git a/input/InitialPopulations/compile/RegressionEstimates/04_reg_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/04_reg_fertility.do new file mode 100644 index 000000000..2525db426 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/04_reg_fertility.do @@ -0,0 +1,95 @@ +********************************************************************************* +* PROJECT: SimPaths UK +* SECTION: Fertility +* OBJECT: Final Probit Models +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: +* Combined former a and b processes. 
+******************************************************************************** +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_fertility.log", replace + + +/******************************* SET EXCEL FILE *******************************/ + +putexcel set "$dir_results/reg_fertility", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters governing projection of fertility" +putexcel A2 = "Authors:" +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "F1" +putexcel B6 = "Prob have a child for women" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B11 = "Conditions for processes are defined as globals in master.do" +putexcel B12 = "Combined former processes F1a and F1b" + +putexcel set "$dir_results/reg_fertility", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data (declare the person-wave panel) +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + +* Any-children dummy (dchpd collapsing): values 2-5 are recoded to 1 +replace dchpd = 1 if inlist(dchpd, 2, 3, 4, 5) +fre dchpd + + +/********************************* ESTIMATION *********************************/ + +* Load the Stata programs (process_regression) that export results to Excel +do "${dir_do}/programs.do" + +/*********************** F1: PROBABILITY OF HAVING A CHILD ********************/ +display "${f1_if_condition}" + +probit dchpd /// + eduSampleFlag demMaleFlag demAge demAgeSq /// NOTE(review): f1_if_condition restricts the sample to dgn == 0; if demMaleFlag is derived from dgn it is constant here - confirm + healthPhysicalPcsL1 healthMentalMcsL1 /// 
+ demPartnerStatusSingle demPartnerStatusSingleL1 /// + eduSampleFlag_Single /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + demNChildL1 demNChild0to2L1 /// + eduHighestC4HighL1 eduHighestC4MediumL1 eduHighestC4LowL1 /// + fertilityRate /// NOTE(review): presumably the external fertility-rate series merged in upstream - confirm source + /*labStatusC3StudentL1*/ labStatusC3NotEmployedL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// +if ${f1_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("fertility") process("F1") sheet("F1") /// + title("Process F1: Prob. have a child") /// + gofrow(3) goflabel("F1 - Have child") /// + ifcond("${f1_if_condition}") probit + + +display "Fertility analysis complete!" + + +capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/05_reg_health.do b/input/InitialPopulations/compile/RegressionEstimates/05_reg_health.do new file mode 100644 index 000000000..c7f2686a2 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/05_reg_health.do @@ -0,0 +1,122 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Health +* OBJECT: Health status and Disability +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj, +* Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: Combined former a and b processes (H1a and H1b are now the single process H1). 
+* +******************************************************************************** +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_health.log", replace + + +/******************************* SET EXCEL FILE *******************************/ + +putexcel set "$dir_results/reg_health", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters governing projection self-reported health status" +putexcel A2 = "Authors:" +putexcel B2 = "Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold + +putexcel A6 = "H1" +putexcel B6 = "Self rated health (5 cat)" +putexcel B7 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table" +putexcel B8 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name." + +putexcel A9 = "H1_raw" +putexcel B9 = "Self rated health (5 cat) - unformatted output" + +putexcel A10 = "H2" +putexcel B10 = "Prob. 
long-term sick or disabled" + +putexcel A15 = "Notes:", bold +putexcel B15 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B16 = "Conditions for processes are defined as globals in master.do" +putexcel B17 = "Combined former processes H1a and H1b" + +putexcel set "$dir_results/reg_health", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data (declare the person-wave panel) +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + + +/********************************** ESTIMATION ********************************/ + +* Load the Stata programs (process_regression, process_gologit) that export results to Excel +do "${dir_do}/programs.do" + + +/********************** H1: SELF-REPORTED HEALTH STATUS ***********************/ +display "${h1_if_condition}" + +gologit2 dhe /// + eduSampleFlag demMaleFlag demAge demAgeSq /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + eduHighestC4NaL1 eduHighestC4MediumL1 eduHighestC4LowL1 /// + /*labStatusC4StudentL1*/ labStatusC4EmployedL1 labStatusC4RetiredL1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// NOTE(review): demCompHhC4L1SingleChL1 carries an extra L1 compared with its siblings - confirm it matches the name created in variable_update.do + healthDsblLongtermFlagL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${h1_if_condition} [pw=${weight}], autofit + +/* +Note: In gologit2, the coefficients show how covariates affect the log-odds of +being above a certain category vs. at or below it. 
+*/ + +process_gologit, domain("health") process("H1") sheet("H1") /// + title("Process H1: Self Rated Health") /// + gofrow(3) goflabel("H1 - Self-rated health") /// + outcomes(5) /// + ifcond("${h1_if_condition}") + +/**************** H2: PROBABILITY LONG-TERM SICK OR DISABLED ******************/ +display "${h2_if_condition}" + +probit dlltsd01 demMaleFlag demAge demAgeSq /// + eduHighestC4Medium eduHighestC4Low eduHighestC4Na /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + healthDsblLongtermFlagL1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${h2_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("health") process("H2") sheet("H2") /// + title("Process H2: Prob.disabled or long term sick") /// + gofrow(7) goflabel("H2 - Disabled or long term sick") /// + ifcond("${h2_if_condition}") probit + + +display "Self-rated health analysis complete!" 
+ + +capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/06_reg_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/06_reg_home_ownership.do new file mode 100644 index 000000000..88bc29f80 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/06_reg_home_ownership.do @@ -0,0 +1,133 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Home ownership +* OBJECT: Final Regresion Models - Weighted +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: Re-estimated process at benefit unit level to be consistent with SimPaths +* +******************************************************************************** +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_home_ownership.log", replace + + +/********************************* SET EXCEL FILE *****************************/ + +putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters governing projection of home ownership" +putexcel A2 = "Authors: " +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "HO1" +putexcel B6 = "Prob. 
of being a home owner" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B11 = "Conditions for processes are defined as globals in master.do" +putexcel B12 = "Re-estimated process at benefit unit level to be consistent with SimPaths" + +putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + + +* Create sample at benefit unit head + +* Keep adults (18+) +keep if dag >= 18 + +* Count unique benefit-unit–wave combinations BEFORE head selection +egen tag_bu_wave = tag(idbenefitunit swv) +count if tag_bu_wave +local n_bu_before = r(N) +display "Number of benefit unit–wave combinations BEFORE selecting head: `n_bu_before'" + +* Sort benefit unit members within each wave: +* 1. Highest non-benefit income (ypnbihs_dv) +* 2. Highest age (dag) +* 3. 
Lowest idperson (idperson) +gsort idbenefitunit swv -ypnbihs_dv -dag idperson + +* Tag the first person (the "head") per benefit unit and wave. NOTE(review): relies on bysort keeping the gsort order within groups - confirm no re-sort occurs +bysort idbenefitunit swv: gen benunit_head = (_n == 1) + +* Keep only benefit unit heads +keep if benunit_head == 1 + +* Count unique benefit-unit–wave combinations AFTER head selection +drop tag_bu_wave +egen tag_bu_wave = tag(idbenefitunit swv) +count if tag_bu_wave +local n_bu_after = r(N) +display "Number of benefit unit–wave combinations AFTER selecting head: `n_bu_after'" + +* Ensure benefit unit–wave counts match before and after head selection +assert `n_bu_before' == `n_bu_after' + +* Verify only one head per benefit unit per wave +by idbenefitunit swv, sort: gen n=_N +assert n==1 + +sort idperson swv + + +/********************************** ESTIMATION ********************************/ + +* Run Stata programs to produce Excel file +do "${dir_do}/programs.do" + + +/********************** HO1: PROBABILITY OF OWNING HOME ***********************/ +display "${ho1_if_condition}" + +probit dhh_owned /// + demMaleFlag demAge demAgeSq /// + demCompHhC82L1 demCompHhC83L1 demCompHhC84L1 demCompHhC85L1 demCompHhC86L1 demCompHhC87L1 demCompHhC88L1 /// + labStatusC4StudentL1 labStatusC4NotEmployedL1 labStatusC4RetiredL1 /// + eduHighestC4MediumL1 eduHighestC4LowL1 eduHighestC4NaL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + yMiscPersGrossMonthL1 /// + wealthPrptyFlagL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${ho1_if_condition} [pw=dwt], vce(cluster idperson) + +process_regression, domain("home_ownership") process("HO1") sheet("HO1") /// + title("Process HO1: Prob. own home") /// + gofrow(3) goflabel("HO1 - Own home") /// + ifcond("${ho1_if_condition}") probit + + +display "Home ownership analysis complete!" 
+ + +capture log close + + \ No newline at end of file diff --git a/input/InitialPopulations/compile/RegressionEstimates/07_reg_retirement.do b/input/InitialPopulations/compile/RegressionEstimates/07_reg_retirement.do new file mode 100644 index 000000000..b4c136110 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/07_reg_retirement.do @@ -0,0 +1,116 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Retirement +* OBJECT: Probit Regresion Models +* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: +* +******************************************************************************** +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ + +cap log close +log using "${dir_log}/reg_retirement.log", replace + + +/********************************* SET EXCEL FILE *****************************/ + +putexcel set "$dir_results/reg_retirement", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "Model parameters governing projection of retirement" +putexcel A2 = "Authors: " +putexcel B2 = "Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold + +putexcel A6 = "R1a" +putexcel B6 = "Prob of retiring, singles" + +putexcel A7 = "R1b" +putexcel B7 = "Prob of retiring, partnered" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" +putexcel B11 = "Conditions for processes are defined as globals in master.do" + +putexcel set "$dir_results/reg_retirement", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +/********************************* 
PREPARE DATA *******************************/ + +* Load data +use "${estimation_sample}", clear + +* Set data +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" + + +/********************************** ESTIMATION ********************************/ + +* Run Stata programs to produce Excel file +do "${dir_do}/programs.do" + + +/****************** R1a: PROBABILITY OF RETIREMENT, SINGLE ********************/ +display "${r1a_if_condition}" + +probit drtren /// + demMaleFlag demAge demAgeSq /// + eduHighestC4MediumL1 eduHighestC4LowL1 eduHighestC4NaL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + demPensAgeFlag /// + labStatusC3NotEmployedL1 /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + healthDsblLongtermFlagL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${r1a_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("retirement") process("R1a") sheet("R1a") /// + title("Process R1a: Prob. retire, singles") /// + gofrow(3) goflabel("R1a - Retire, singles") /// + ifcond("${r1a_if_condition}") probit + + + +/***************** R1b: PROBABILITY OF RETIREMENT, PARTNERED ******************/ +display "${r1b_if_condition}" + +probit drtren /// + demMaleFlag demAge demAgeSq /// + eduHighestC4MediumL1 eduHighestC4LowL1 eduHighestC4NaL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + demPensAgeFlag demPensAgeFlag_NotEmployed /// + labStatusC3NotEmployedL1 labStatusPartnerC3NotEmplL1 /// + demPensPartnerAgeFlag /// + yHhQuintilesMonthC5Q2L1 yHhQuintilesMonthC5Q3L1 yHhQuintilesMonthC5Q4L1 yHhQuintilesMonthC5Q5L1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${r1b_if_condition} [pw=${weight}], vce(robust) + +process_regression, domain("retirement") process("R1b") sheet("R1b") /// + title("Process R1b: Prob. 
retire, partnered") /// + gofrow(7) goflabel("R1b - Retire, partnered") /// + ifcond("${r1b_if_condition}") probit + + +display "Retirement analysis complete!" + + +capture log close + diff --git a/input/InitialPopulations/compile/RegressionEstimates/08_reg_wages.do b/input/InitialPopulations/compile/RegressionEstimates/08_reg_wages.do new file mode 100644 index 000000000..1afdb6b14 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/08_reg_wages.do @@ -0,0 +1,361 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: Wage regression +* OBJECT: Heckman regressions +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven, +* Aleksandra Kolndrekaj, Ashley Burdett +* LAST UPDATE: 28 April 2026 (DP) +******************************************************************************** +******************************************************************************** +* NOTES: Strategy: +* 1) Heckman estimated on the sub-sample of individuals +* who are not observed working in previous period. +* => Wage equation does not control for lagged wage +* 2) Heckman estimated on the sub-sample of individuals who +* are observed working in previous period. +* => Wage equation controls for lagged wage +* Selection equation is the same in the two samples, except that +* lagged labour-status dummies enter only in sample 1) +* +* Import labour cost index to create a measure of wage growth. +* Make sure loaded into the external_data subfolder. 
+* +*******************************************************************************/ +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + +******************************************************************* +cap log close +log using "${dir_log}/reg_wages.log", replace +******************************************************************* + +* Load helper programs (NOTE(review): programs.do is run again before estimation - confirm the repeat is harmless) +do "${dir_do}/programs.do" + +******************************************************************************** +* Set Excel file +* Info sheet - first stage +putexcel set "$dir_results/reg_employment_selection", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "This file contains regression estimates from the first stage of the Heckman selection model used to estimate wages." +putexcel A2 = "Authors:", bold +putexcel B2 = "Patryk Bronka, Justin Van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 28 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "W1fa-sel" +putexcel B6 = "First stage Heckman selection estimates for women that do not have an observed wage in the previous year" +putexcel A7 = "W1ma-sel" +putexcel B7 = "First stage Heckman selection estimates for men that do not have an observed wage in the previous year" +putexcel A8 = "W1fb-sel" +putexcel B8 = "First stage Heckman selection estimates for women that have an observed wage in the previous year" +putexcel A9 = "W1mb-sel" +putexcel B9 = "First stage Heckman selection estimates for men that have an observed wage in the previous year" + +putexcel A11 = "Notes:", bold +putexcel B11 = "Estimated on panel data unlike the labour supply estimates" +putexcel B12 = "Predicted wages used as input into union parameters and income process estimates" +putexcel B13 = "Two-step Heckman command is used which does not permit weights" + +* Info sheet - second stage +putexcel set 
"$dir_results/reg_wages", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "This file contains regression estimates used to calculate potential wages for males and females in the simulation." +putexcel A2 = "Authors:", bold +putexcel B2 = "Patryk Bronka, Justin Van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 28 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold +putexcel A6 = "W1fa" +putexcel B6 = "Second stage Heckman selection estimates using women that do not have an observed wage in the previous year" +putexcel A7 = "W1ma" +putexcel B7 = "Second stage Heckman selection estimates using men that do not have an observed wage in the previous year" +putexcel A8 = "W1fb" +putexcel B8 = "Second stage Heckman selection estimates using women that have an observed wage in the previous year" +putexcel A9 = "W1mb" +putexcel B9 = "Second stage Heckman selection estimates using men that have an observed wage in the previous year" + +putexcel A11 = "Notes:", bold +putexcel B11 = "Estimation sample: UK_ipop.dta. 
Two-step Heckman command is used which does not permit weights" +putexcel B12 = "Conditions for processes are defined as globals in master.do" +putexcel B13 = "Predicted wages are saved in dataset UK_ipop2.dta and used as input into union parameters and income process estimates" + +/********************************* PREPARE DATA *******************************/ + +* Prepare data on real growth of wages +import excel "$dir_external_data/time_series_factor.xlsx", /// + sheet("UK_wage_growth") firstrow clear + +rename Year stm +rename Value real_wage_growth + +replace stm = stm - 2000 + +sum real_wage_growth if stm == 15 +gen base = r(mean) +replace real_wage_growth = real_wage_growth / base +drop base + +save "$dir_external_data/growth_rates", replace + +* Load data +use "${estimation_sample}", clear + +* Adjust variables +do "${dir_do}/variable_update.do" + +* Merge in real growth index +merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen /// + keepusing(real_wage_growth) +rename real_wage_growth realWageGrowth + +* Set data +xtset idperson swv +sort idperson swv + +* Hours work per week +gen hours = 0 +replace hours = lhw if ((lhw > 0) & (lhw < .)) +label var hours "Hours worked per week" + +* Hourly wage +gen wage_hour = obs_earnings_hourly + +* Trim (not winsorize): non-positive wages and wages at/above the 99th percentile are set to missing +sum wage_hour, det +replace wage_hour = . if wage_hour <= 0 +replace wage_hour = . if wage_hour >= r(p99) + +gen lwage_hour = ln(wage_hour) +label var lwage_hour "Log gross hourly wage" + +gen lwage_hour_2 = lwage_hour^2 +label var lwage_hour_2 "Squared log gross hourly wage" + +gen labWageHrlyLog = lwage_hour +gen labWageHrlyLogL1 = l.lwage_hour + + +* Flag to identify observations to be included in the estimation sample +bys idperson (swv): gen obs_count_ttl = _N +bys idperson (swv): gen obs_count = _n + +gen in_sample = (obs_count_ttl > 1 & obs_count > 1) +replace in_sample = 0 if swv != swv[_n-1] +1 & idperson == idperson[_n-1] +replace in_sample = 0 if les_c3 == . | obs_earning == . 
+fre in_sample + +* Flag to distinguish the two samples (prev work and not) +capture drop previouslyWorking +gen previouslyWorking = (L1.lwage_hour != .) +replace previouslyWorking = . if in_sample == 0 +fre previouslyWorking + +* Prep storage +capture drop lwage_hour_hat wage_hour_hat esample +gen lwage_hour_hat = . +gen wage_hour_hat = . +gen esample = . +gen pred_hourly_wage = . + + +/********************************** ESTIMATION ********************************/ + +* Run Stata programs to produce Excel file +do "${dir_do}/programs.do" + +/******************** WAGES: WOMEN, NO PREV WAGE OBSERVED *********************/ +#delimit ; +global wage_eqn +lwage_hour +demAge +demAgeSq +eduHighestC4LowL1 +eduHighestC4MediumL1 +eduHighestC4HighL1 +eduHighestC4LowL1_demAge +eduHighestC4MediumL1_demAge +eduHighestC4HighL1_demAge +eduHighestParentC3Medium +eduHighestParentC3High +healthDsblLongtermFlag +healthPhysicalPcsL1 +healthMentalMcsL1 +labPt +realWageGrowth +$regions +demYear2020 +demYear2021 +$ethnicity +; +#delimit cr + +#delimit ; +global seln_eqn +labStatusC3StudentL1 +labStatusC3NotEmployedL1 +demAge +demAgeSq +eduHighestC4LowL1 +eduHighestC4MediumL1 +eduHighestC4HighL1 +eduHighestC4LowL1_demAge +eduHighestC4MediumL1_demAge +eduHighestC4HighL1_demAge +eduHighestParentC3Medium +eduHighestParentC3High +healthDsblLongtermFlag +healthPhysicalPcsL1 +healthMentalMcsL1 +demPartnerStatusPartnered +demNChild +$regions +demYear2020 +demYear2021 +$ethnicity +; +#delimit cr + +local filter = "${wages_f_no_prev_if_condition}" + +heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) + +process_heckman, /// + process("W1fa") /// + ifcond("`filter'") /// + savefile("Female_NPW_sample") /// + graphsubtitle("Females, No previously observed wage") /// + wordfile("$dir_raw_results/wages/Output_NWW.doc") /// + wordtitle("Heckman-corrected wage equation: women not in employment last year") /// + wordctitle("Not working women") /// + sheet2("W1fa") sheet1("W1fa-sel") 
/// + rmserow(2) + + +/********************** WAGES: MEN, NO PREV WAGE OBSERVED *********************/ +* globals are the same as for women + +local filter = "${wages_m_no_prev_if_condition}" + +heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) + +process_heckman, /// + process("W1ma") /// + ifcond("`filter'") /// + savefile("Male_NPW_sample") /// + graphsubtitle("Males, No previously observed wage") /// + wordfile("$dir_raw_results/wages/Output_NWM.doc") /// + wordtitle("Heckman-corrected wage equation: men not in employment last year") /// + wordctitle("Not working men") /// + sheet2("W1ma") sheet1("W1ma-sel") /// + rmserow(3) + + +/********************** WAGES: WOMEN, PREV WAGE OBSERVED **********************/ + +#delimit ; +global wage_eqn2 +lwage_hour +labWageHrlyLogL1 +demAge +demAgeSq +eduHighestC4LowL1 +eduHighestC4MediumL1 +eduHighestC4HighL1 +eduHighestC4LowL1_demAge +eduHighestC4MediumL1_demAge +eduHighestC4HighL1_demAge +eduHighestParentC3Medium +eduHighestParentC3High +healthDsblLongtermFlag +healthPhysicalPcsL1 +healthMentalMcsL1 +labPt +realWageGrowth +$regions +demYear2020 +demYear2021 +$ethnicity +; +#delimit cr + +#delimit ; +global seln_eqn2 +demAge +demAgeSq +eduHighestC4LowL1 +eduHighestC4MediumL1 +eduHighestC4HighL1 +eduHighestC4LowL1_demAge +eduHighestC4MediumL1_demAge +eduHighestC4HighL1_demAge +eduHighestParentC3Medium +eduHighestParentC3High +healthDsblLongtermFlag +healthPhysicalPcsL1 +healthMentalMcsL1 +demPartnerStatusPartnered +demNChild +$regions +demYear2020 +demYear2021 +$ethnicity +; +#delimit cr + +local filter = "${wages_f_prev_if_condition}" + +heckman $wage_eqn2 if `filter', select($seln_eqn2) twostep mills(lambda) + +process_heckman, /// + process("W1fb") /// + ifcond("`filter'") /// + savefile("Female_PW_sample") /// + graphsubtitle("Females, Previously observed wage") /// + wordfile("$dir_raw_results/wages/Output_WW.doc") /// + wordtitle("Heckman-corrected wage equation: women in employment last year") 
/// + wordctitle("Working women") /// + sheet2("W1fb") sheet1("W1fb-sel") /// + rmserow(4) + + +/********************** WAGES: MEN, PREV WAGE OBSERVED ************************/ +* globals are the same as for women + +local filter = "${wages_m_prev_if_condition}" + +heckman $wage_eqn2 if `filter', select($seln_eqn2) twostep mills(lambda) + +process_heckman, /// + process("W1mb") /// + ifcond("`filter'") /// + savefile("Male_PW_sample") /// + graphsubtitle("Males, Previously observed wage") /// + wordfile("$dir_raw_results/wages/Output_WM.doc") /// + wordtitle("Heckman-corrected wage equation: men in employment last year") /// + wordctitle("Working men") /// + sheet2("W1mb") sheet1("W1mb-sel") /// + rmserow(5) + + +* Save predicted wages to dataset + +* Use predicted wage for all; fall back to observed wage for those with no +* prediction (first observation for an individual) +replace pred_hourly_wage = exp(lwage_hour) if missing(pred_hourly_wage) + +gen labWageHrly = pred_hourly_wage + + +save "${estimation_sample2}", replace + +display "Wage analysis complete!" + +capture log close + diff --git a/input/InitialPopulations/compile/RegressionEstimates/09_reg_income.do b/input/InitialPopulations/compile/RegressionEstimates/09_reg_income.do new file mode 100644 index 000000000..09b16f0ad --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/09_reg_income.do @@ -0,0 +1,275 @@ +/******************************************************************************* +* PROJECT: SimPaths UK +* SECTION: Non-employment/non-benefit income +* OBJECT: Final Regression Models +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK + +* NOTES: Models for split income variable +* - Capital returns +* - Private pension income +* +* The income do file must be run after +* reg_wages.do because it uses predicted wages. 
+******************************************************************************/ + +clear all +set more off +set mem 200m +set type double +//set maxvar 120000 +set maxvar 30000 + + +/********************************* SET LOG FILE *******************************/ +cap log close +log using "${dir_log}/reg_income.log", replace + + +/********************************* SET EXCEL FILE *****************************/ + +putexcel set "$dir_results/reg_income", sheet("Info") replace +putexcel A1 = "Description:", bold +putexcel B1 = "This file contains regression estimates used by processes I1 (capital income), I2 (private pension, retired last year), I3 (private pension income, not retired last year) " +putexcel A2 = "Authors:" +putexcel B2 = "Patryk Bronka, Justin Van de Ven, Daria Popova, Aleksandra Kolndrekaj, Ashley Burdett" +putexcel A3 = "Last edit: 15 April 2026 (DP) " + +putexcel A5 = "Process:", bold +putexcel B5 = "Description:", bold + +putexcel A7 = "Process I1a" +putexcel B7 = "Prob. receive capital income " + +putexcel A8 = "Process I1b" +putexcel B8 = "Capital income amount" + +putexcel A9 = "Process I2b" +putexcel B9 = "Private pension income amount" + +putexcel A10 = "Process I3a" +putexcel B10 = "Prob. receive private pension income" + +putexcel A11 = "Process I3b" +putexcel B11 = "Private pension income amount" + + +putexcel A17 = "Notes:", bold +putexcel B17 = "Estimation sample: UK_ipop2.dta with grossing up weight dwt" +putexcel B18 = "Conditions for processes are defined as globals in master.do" +putexcel B19 = "Combined former capital income processes I3a and I3b and renamed as I1a and I1b" +putexcel B20 = "Income variables are IHS transformed." 
+ +putexcel set "$dir_results/reg_income", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + +/********************************* PREPARE DATA *******************************/ + +* Prepare data on real growth of wages (disabled: growth_rates is merged in below from the saved file) +/* +import excel "${dir_external_data}/time_series_factor.xlsx", /// + sheet("UK_gdp") firstrow clear // Import real growth index + +rename Year stm +rename Value growth +gen base_val = growth if stm == 2015 +sum base_val +replace base_val = r(mean) +replace growth= growth/base_val +drop base_val +replace stm = stm - 2000 + +save "$dir_external_data\growth_rates", replace +*/ + +* Load data +use "${estimation_sample2}", clear //panel with predicted wages + +* Merge in growth rates +merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen /// + keepusing(real_wage_growth) + +* Set data +xtset idperson swv +sort idperson swv + + +* Lag of predicted wage (created only when labWageHrly is present in the data) +capture confirm variable labWageHrly +if _rc == 0 { + gen labWageHrlyL1 = l.labWageHrly +} + + +cap drop in_sample +cap drop p + +/********************************** ESTIMATION ********************************/ + +* Run Stata programs to produce Excel file +do "${dir_do}/programs.do" + +/**************** I1a: PROBABILITY OF RECEIVING CAPITAL INCOME ****************/ + +display "${i1a_if_condition}" + +logit receives_ypncp /// + eduSampleFlag demMaleFlag demAge demAgeSq /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + yCapitalPersMonthL1 yEmpPersGrossMonthL1 /// + yCapitalPersMonthL2 yEmpPersGrossMonthL2 /// + eduSampleFlag_Male /// + eduSampleFlag_PcsL1 eduSampleFlag_McsL1 /// + eduSampleFlag_yCapitalPersL1 eduSampleFlag_yCapitalPersL2 eduSampleFlag_yEmpPersGrossL1 eduSampleFlag_yEmpPersGrossL2 /// + eduHighestC4LowL1 eduHighestC4MediumL1 eduHighestC4HighL1 /// + labStatusC4StudentL1 labStatusC4NotEmployedL1 labStatusC4RetiredL1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if 
${i1a_if_condition} [pw=${weight}], vce(cluster idperson) base + +process_regression, domain("income") process("I1a") sheet("I1a") /// + title("Process I1a: Prob. receive capital income") /// + gofrow(3) goflabel("I1a - Receive capital income ") /// + ifcond("${i1a_if_condition}") probit // NOTE(review): the model above is a logit; confirm the probit flag is intended + + +/********************** I1b: AMOUNT OF CAPITAL INCOME *************************/ + +* DV: ypncp = Inverse hyperbolic sine (IHS) of gross capital income +display "${i1b_if_condition}" + +reg ypncp /// + demMaleFlag demAge demAgeSq /// + eduHighestC4LowL1 eduHighestC4MediumL1 eduHighestC4HighL1 /// + labStatusC4StudentL1 labStatusC4NotEmployedL1 labStatusC4RetiredL1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + yCapitalPersMonthL1 yCapitalPersMonthL2 /// + yEmpPersGrossMonthL1 yEmpPersGrossMonthL2 /// + eduSampleFlag_Male /// + eduSampleFlag_yCapitalPersL1 eduSampleFlag_yCapitalPersL2 eduSampleFlag_yEmpPersGrossL1 eduSampleFlag_yEmpPersGrossL2 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${i1b_if_condition} [pw=${weight}], vce(cluster idperson) + +process_regression, domain("income") process("I1b") sheet("I1b") /// + title("Process I1b: Amount of capital income") /// + gofrow(7) goflabel("I1b - Amount of capital income") /// + ifcond("${i1b_if_condition}") + + +* Calculate RMSE over recipients (receives_ypncp == 1), weighted by dwt +cap drop residuals squared_residuals +predict residuals , residuals +gen squared_residuals = residuals^2 + +preserve +keep if receives_ypncp == 1 +sum squared_residuals [w = dwt] +di "RMSE for Amount of capital income" sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A6 = ("I1b") B6 = (sqrt(r(mean))) +restore + + + +/****************** I2b: AMOUNT OF PENSION INCOME, RETIRED L1 *****************/ + +*Sample: Retired individuals who were retired in the previous year. 
+*ypnoab = Inverse hyperbolic sine transformation of Gross personal private +* pension income + +display "${i2b_if_condition}" + +reg ypnoab /// + demAge demAgeSq /// + eduHighestC4High eduHighestC4Medium eduHighestC4Na /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + yPensPersGrossMonthL1 yPensPersGrossMonthL2 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${i2b_if_condition} [pw=${weight}], vce(cluster idperson) + +process_regression, domain("income") process("I2b") sheet("I2b") /// + title("Process I2b: Amount of private pension income, retired L1") /// + gofrow(11) goflabel("I2b - Amount of private pension income") /// + ifcond("${i2b_if_condition}") + +* Calculate RMSE +cap drop residuals squared_residuals +predict residuals , residuals +gen squared_residuals = residuals^2 + +preserve +keep if receives_ypnoab == 1 +sum squared_residuals [w = dwt] +di "RMSE for Amount of private pension income" sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A7 = ("I2b") B7 = (sqrt(r(mean))) +restore + + + +/**** I3a: PROBABILITY OF RECEIVING PRIVATE PENSION INCOME, NOT RETIRED L1 ****/ + +*Sample: Retired individuals who were not retired in the previous year. 
+ +display "${i3a_if_condition}" + +logit receives_ypnoab /// + demMaleFlag demPensAgeFlag /// + eduHighestC4High eduHighestC4Medium eduHighestC4Na /// + labStatusC4NotEmployedL1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + labWageHrlyL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${i3a_if_condition} [pw=${weight}], vce(cluster idperson) base + +process_regression, domain("income") process("I3a") sheet("I3a") /// + title("Process I3a: Prob. receive private pension income, not retired L1") /// + gofrow(15) goflabel("I3a - Receive private pension income ") /// + ifcond("${i3a_if_condition}") probit // NOTE(review): the model above is a logit; confirm the probit flag is intended + + +/******************* I3b: AMOUNT PRIVATE PENSION, NOT RETIRED L1 **************/ + +*Sample: Retired individuals who were not retired in the previous year. +*ypnoab = Inverse hyperbolic sine transformation of Gross personal private +*pension income + +display "${i3b_if_condition}" + +reg ypnoab /// + demMaleFlag demAge /// + eduHighestC4High eduHighestC4Medium eduHighestC4Na /// + labStatusC4NotEmployedL1 /// + demCompHhC4CoupleChL1 demCompHhC4SingleNoChL1 demCompHhC4L1SingleChL1 /// + healthPhysicalPcsL1 healthMentalMcsL1 /// + labWageHrlyL1 /// + $regions demYear demYear2020 demYear2021 $ethnicity /// + if ${i3b_if_condition} [pw=${weight}], vce(cluster idperson) + +process_regression, domain("income") process("I3b") sheet("I3b") /// + title("Process I3b: Amount of private pension income, not retired L1") /// + gofrow(19) goflabel("I3b - Amount of private pension income") /// + ifcond("${i3b_if_condition}") + +* Calculate RMSE over recipients (receives_ypnoab == 1), weighted by dwt +cap drop residuals squared_residuals +predict residuals , residuals +gen squared_residuals = residuals^2 + +preserve +keep if receives_ypnoab == 1 +sum squared_residuals [w = dwt] +di "RMSE for Amount of private pension income" sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A8 = ("I3b") B8 = (sqrt(r(mean))) 
+restore + + +display "Income analysis complete!" + + +capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/10_reg_socialcare.do b/input/InitialPopulations/compile/RegressionEstimates/10_reg_socialcare.do new file mode 100644 index 000000000..d8add75dc --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/10_reg_socialcare.do @@ -0,0 +1,397 @@ +******************************************************************************** +* PROJECT: SimPaths UK +* SECTION: SOCIAL CARE RECEIPT +* AUTHORS: Justin van de Ven, Matteo Richiardi, Daria Popova, Ashley Burdett +* LAST UPDATE: 15 April 2026 (DP) +* COUNTRY: UK +* +* NOTES: +* PROGRAM TO EVALUATE SOCIAL CARE RECEIPT FROM UKHLS DATA +* ANALYSIS BASED ON THE SOCIAL CARE MODULE OF UKHLS +* First version: Justin van de Ven, 28 Aug 2023 +* Refactored version: Matteo Richiardi, 16 Feb 2026 +* Integration into the pipeline: Daria Popova 18 Feb 2026 DP +* +*******************************************************************************/ + +/* ANALYTICAL STRATEGY +We analyse/simulate the following variables: +- NeedCare +- ReceiveCare +- CareMarket: Formal, Informal, Mixed +- HrsReceivedFormalIHS +- HrsReceivedInformalIHS +- ProvideCare +- HrsProvidedInformalIHS +(IHS stands for Inverse Hyperbolic Sine transformation) + +The most complicated case is for Partnered, as an issue of consistency arises (care between partners is most common): + +=========================================== + Partner B: Receiving informal care? +___________________________________________ +Partner A: | No Yes +providing | No | (1) (2) +informal care | Yes | (3) (4) +=========================================== + +In the analysis we do not distinguish whom care is received from, and to whom care is provided. 
+However, the cases above imply:
+(1) No hrs received, no hrs provided
+(2) All hrs received are from non-partner
+(3) All hrs provided are to non-partner
+(4) At least some of the hrs received/provided are from/to partner
+
+==========================================================================================
+RMK: We first analyse care receipt, and then care provision. This order must be preserved.
+==========================================================================================
+*/
+
+* CRITICAL: Clear all FIRST
+clear all
+set more off
+set mem 200m
+set type double
+//set maxvar 120000
+set maxvar 30000
+
+
+/********************************* SET LOG FILE *******************************/
+cap log close
+log using "${dir_log}/reg_socialcare.log", replace
+
+
+/********************************* SET EXCEL FILE *****************************/
+* The "Info" sheet documents the workbook; "replace" starts a fresh file.
+
+putexcel set "$dir_results/reg_socialcare", sheet("Info") replace
+putexcel A1 = "Description:", bold
+putexcel B1 = "Model parameters for social care module"
+putexcel A2 = "Authors:"
+putexcel B2 = "Justin van de Ven, Ashley Burdett, Matteo Richiardi, Daria Popova"
+putexcel A3 = "Last edit:"
+putexcel B3 = "15 April 2026 (DP)"
+
+putexcel A5 = "Process:", bold
+putexcel B5 = "Description:", bold
+
+putexcel A6 = "S2a" B6 = "Prob. need care"
+putexcel A7 = "S2b" B7 = "Prob. receive care"
+putexcel A8 = "S2c" B8 = "Prob. receive Formal/informal care"
+putexcel A9 = "S2d" B9 = "Informal care hours received"
+putexcel A10 = "S2e" B10 = "Hours of formal care received"
+
+putexcel A11 = "S3a" B11 = "Prob. provide care, Singles"
+putexcel A12 = "S3b" B12 = "Prob. provide care, Partnered"
+putexcel A13 = "S3c" B13 = "Hours of informal care provided, Singles"
+putexcel A14 = "S3d" B14 = "Hours of informal care provided, Partnered"
+
+putexcel A20 = "Notes:", bold
+putexcel B20 = "Estimation sample: UK_ipop.dta with grossing up weight dwt"
+putexcel B21 = "Conditions for processes are defined as globals in master.do"
+
+putexcel set "$dir_results/reg_socialcare", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold
+
+
+/********************************* PREPARE DATA *******************************/
+
+use "${estimation_sample}", clear
+
+* Time series structure
+gsort idperson stm
+xtset idperson stm
+
+* Adjust variables
+do "${dir_do}/variable_update.do"
+
+
+/********************************** ESTIMATION ********************************/
+
+* Run Stata programs to produce Excel file
+do "${dir_do}/programs.do"
+
+* Helper: weighted RMSE of the most recent regression, exported to reg_RMSE.xlsx.
+*   ifcond()  - sample restriction applied when averaging squared residuals
+*   label()   - text used in the log message
+*   process() - process code written to column A
+*   row()     - row of sheet "UK" that receives the code (A) and the RMSE (B)
+* Leaves variables residuals / squared_residuals and scalar rmse behind.
+* Defined after programs.do is loaded so that file cannot drop it.
+capture program drop sc_store_rmse
+program define sc_store_rmse
+    syntax , ifcond(string asis) label(string) process(string) row(integer)
+    capture drop residuals squared_residuals
+    predict residuals, residuals
+    generate squared_residuals = residuals^2
+    * An if-qualifier gives the same mean as preserve / keep if / restore
+    summarize squared_residuals if `ifcond' [w=${weight}], meanonly
+    scalar rmse = sqrt(r(mean))
+    display "RMSE for `label': " rmse
+    putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+    putexcel A`row' = ("`process'") B`row' = (rmse)
+end
+
+* Stats for if conditions
+/*
+table stm, stat (count NeedCare) stat (mean NeedCare) // [2015, 2022]
+table stm, stat (count ReceiveCare) stat (mean ReceiveCare) // [2016, 2021] but with significant decrease in 2020 and 2021
+table stm, stat (count receive_formal_care) stat (mean receive_formal_care) // [2016, 2021] but with significant decrease in 2020 and 2021
+table stm, stat (count receive_informal_care) stat (mean receive_informal_care) // [2016, 2021] but with significant decrease in 2020 and 2021
+table stm, stat (count provide_informal_care) stat (mean provide_informal_care) // [2015, 2024] also 2014, but fewer hours
+*/
+/*
+table stm, c(count NeedCare mean NeedCare)
+table stm, c(count ReceiveCare mean ReceiveCare)
+table stm, c(count receive_formal_care mean receive_formal_care)
+table stm, c(count receive_informal_care mean receive_informal_care)
+table stm, c(count provide_informal_care mean provide_informal_care)
+*/
+
+/* Age variables (for experimenting -> copy and paste in the specification)
+    Dag Dagsq ///
+    Age67to68 Age69to70 Age71to72 Age73to74 Age75to76 ///
+    Age77to78 Age79to80 Age81to82 Age83to84 Age85plus ///
+*/
+
+
+/************************ Probit need care (S2a) ******************************/
+
+probit careNeedFlag ///
+    careNeedFlagL1 ///
+    demMaleFlag ///
+    demAge67to68 demAge69to70 demAge71to72 demAge73to74 demAge75to76 ///
+    demAge77to78 demAge79to80 demAge81to82 demAge83to84 demAge85plus ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    demPartnerStatusPartnered ///
+    eduHighestC4Low eduHighestC4Medium ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s2a_if_condition} [pw=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S2a") sheet("S2a") ///
+    title("Process S2a: Prob. need care") ///
+    gofrow(3) goflabel("S2a - Need care") ///
+    ifcond("${s2a_if_condition}") probit
+
+
+/************************ Probit receive care (S2b) ***************************/
+
+probit careReceivedFlag ///
+    careReceivedFlagL1 ///
+    demMaleFlag ///
+    demAge67to68 demAge69to70 demAge71to72 demAge73to74 demAge75to76 ///
+    demAge77to78 demAge79to80 demAge81to82 demAge83to84 demAge85plus ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    demPartnerStatusPartnered ///
+    eduHighestC4Low eduHighestC4Medium ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s2b_if_condition} [pw=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S2b") sheet("S2b") ///
+    title("Process S2b: Prob. receive care") ///
+    gofrow(7) goflabel("S2b - Receive care") ///
+    ifcond("${s2b_if_condition}") probit
+
+
+/************************ Mlogit formal/informal (S2c) ************************/
+/*
+    Informal is base outcome
+    Mixed is 1st outcome
+    Formal is 2nd outcomes
+    NOTE(review): the base category is selected with base(2); confirm that
+    CareMarket == 2 is coded as Informal.
+*/
+
+mlogit CareMarket ///
+    careMarketFormalL1 careMarketInformalL1 careMarketMixedL1 ///
+    demMaleFlag ///
+    demAge67to68 demAge69to70 demAge71to72 demAge73to74 demAge75to76 ///
+    demAge77to78 demAge79to80 demAge81to82 demAge83to84 demAge85plus ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    demPartnerStatusPartnered ///
+    eduHighestC4Low eduHighestC4Medium ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s2c_if_condition} [pw=${weight}], vce(r) base(2)
+
+process_gologit, domain("socialcare") process("S2c") sheet("S2c") ///
+    title("Process S2c: Formal vs Informal") ///
+    gofrow(11) goflabel("S2c - Formal vs Informal") ///
+    outcomes(3) ///
+    ifcond("${s2c_if_condition}")
+
+
+/******************** OLS informal care hours received (S2d) ******************/
+* $ethnicity is deliberately commented out of this specification.
+
+reg careHrsInformalIhs ///
+    careHrsInformalIhsL1 ///
+    careMarketMixed ///
+    demMaleFlag ///
+    demAge demAgeSq ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    demPartnerStatusPartnered ///
+    eduHighestC4Low eduHighestC4Medium ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 /*$ethnicity*/ ///
+    if ${s2d_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S2d") sheet("S2d") ///
+    title("Process S2d: Informal care hours received") ///
+    gofrow(15) goflabel("S2d - Hours of informal care received") ///
+    ifcond("${s2d_if_condition}")
+
+* Calculate RMSE
+sc_store_rmse, ifcond(${s2d_if_condition}) ///
+    label("Informal care hours received") process("S2d") row(9)
+
+
+/********************* OLS formal care hours received (S2e) *******************/
+
+reg careHrsFormalIhs ///
+    careHrsFormalIhsL1 ///
+    careMarketMixed ///
+    demMaleFlag ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    demPartnerStatusPartnered ///
+    eduHighestC4Low eduHighestC4Medium ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s2e_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S2e") sheet("S2e") ///
+    title("Process S2e: Formal care hours received") ///
+    gofrow(19) goflabel("S2e - Hours of formal care received") ///
+    ifcond("${s2e_if_condition}")
+
+* Calculate RMSE
+sc_store_rmse, ifcond(${s2e_if_condition}) ///
+    label("Formal care hours received") process("S2e") row(10)
+
+
+/***************** Probit provide care, Singles (S3a) *************************/
+
+probit careProvidedFlag ///
+    careProvidedFlagL1 ///
+    careNeedFlag careReceivedFlag ///
+    demMaleFlag ///
+    demAge30to34 demAge35to39 demAge40to44 demAge45to49 demAge50to54 ///
+    demAge55to59 demAge60to64 demAge65to69 demAge70to74 demAge75to79 demAge80to84 demAge85plus ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    eduHighestC4Low eduHighestC4Medium eduHighestC4High ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s3a_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S3a") sheet("S3a") ///
+    title("Process S3a: Prob. provide care, Singles") ///
+    gofrow(23) goflabel("S3a - Provide care, Singles") ///
+    ifcond("${s3a_if_condition}") probit
+
+
+
+/***************** Probit provide care, Partnered (S3b) ***********************/
+/*
+tab CareMarket ProvideCare if ${s3b_if_condition}
+tab deh_c4 ProvideCare if ${s3b_if_condition}
+deh_c4 =0 is excluded because there's just 1 obs providing care and probit would not converge
+*/
+* NOTE(review): careMarketMixedPsrtner looks like a misspelling of
+*   careMarketMixedPartner; it is kept as-is because the variable is created
+*   outside this file - confirm against variable_update.do.
+
+//capture drop in_sample p
+probit careProvidedFlag ///
+    careProvidedFlagL1 ///
+    careNeedFlag careReceivedFlag ///
+    demMaleFlag ///
+    careReceivedPartnerFlag careMarketFormalPartner careMarketInformalPartner careMarketMixedPsrtner ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    healthPartnerSelfRatedFair healthPartnerSelfRatedGood healthPartnerSelfRatedVeryGood healthPartnerSelfRatedExcellent ///
+    eduHighestC4Medium eduHighestC4High ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s3b_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S3b") sheet("S3b") ///
+    title("Process S3b: Prob. provide care, Partnered") ///
+    gofrow(27) goflabel("S3b - Provide care, Partnered") ///
+    ifcond("${s3b_if_condition}") probit
+
+
+
+
+/******************* OLS care hours provided, Singles (S3c) ******************/
+
+reg careHrsProvidedWeekIhs ///
+    careHrsProvidedWeekIhsL1 ///
+    demMaleFlag ///
+    demAge20to24 demAge25to29 demAge30to34 demAge35to39 demAge40to44 demAge45to49 demAge50to54 ///
+    demAge55to59 demAge60to64 demAge65to69 demAge70to74 demAge75to79 demAge80to84 demAge85plus ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    eduHighestC4Low eduHighestC4Medium eduHighestC4High ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s3c_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S3c") sheet("S3c") ///
+    title("Process S3c: Informal care hours provided, Singles") ///
+    gofrow(31) goflabel("S3c - Hours of informal care provided, Singles") ///
+    ifcond("${s3c_if_condition}")
+
+* Calculate RMSE
+sc_store_rmse, ifcond(${s3c_if_condition}) ///
+    label("Informal care hours provided, Singles") process("S3c") row(11)
+
+
+/****************** OLS care hours provided, Partnered (S3d) *****************/
+* NOTE(review): careMarketMixedPsrtner - same spelling question as in S3b.
+
+reg careHrsProvidedWeekIhs ///
+    careHrsProvidedWeekIhsL1 ///
+    demMaleFlag ///
+    demAge20to24 demAge25to29 demAge30to34 demAge35to39 demAge40to44 demAge45to49 demAge50to54 ///
+    demAge55to59 demAge60to64 demAge65to69 demAge70to74 demAge75to79 demAge80to84 demAge85plus ///
+    careReceivedPartnerFlag careMarketFormalPartner careMarketInformalPartner careMarketMixedPsrtner ///
+    healthSelfRatedFair healthSelfRatedGood healthSelfRatedVeryGood healthSelfRatedExcellent ///
+    healthPartnerSelfRatedFair healthPartnerSelfRatedGood healthPartnerSelfRatedVeryGood healthPartnerSelfRatedExcellent ///
+    eduHighestC4Low eduHighestC4Medium eduHighestC4High ///
+    yHhQuintilesMonthC5Q2 yHhQuintilesMonthC5Q3 yHhQuintilesMonthC5Q4 yHhQuintilesMonthC5Q5 ///
+    $regions demYear2020 demYear2021 $ethnicity ///
+    if ${s3d_if_condition} [pweight=${weight}], vce(r)
+
+process_regression, domain("socialcare") process("S3d") sheet("S3d") ///
+    title("Process S3d: Informal care hours provided, Partnered") ///
+    gofrow(35) goflabel("S3d - Hours of informal care provided, Partnered") ///
+    ifcond("${s3d_if_condition}")
+
+* Calculate RMSE
+sc_store_rmse, ifcond(${s3d_if_condition}) ///
+    label("Informal care hours provided, Partnered") process("S3d") row(12)
+
+
+display "Social care analysis complete!"
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_financial_distress.do b/input/InitialPopulations/compile/RegressionEstimates/11_reg_financial_distress.do similarity index 97% rename from input/InitialPopulations/compile/RegressionEstimates/reg_financial_distress.do rename to input/InitialPopulations/compile/RegressionEstimates/11_reg_financial_distress.do index e17bc4469..d21395387 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_financial_distress.do +++ b/input/InitialPopulations/compile/RegressionEstimates/11_reg_financial_distress.do @@ -1,234 +1,234 @@ -******************************************************************************** -* PROJECT: UC and mental health -* SECTION: Health and wellbeing -* OBJECT: Financial distress -* AUTHORS: Andy Baxter, Erik Igelström -* LAST UPDATE: 17 Feb 2026 -* COUNTRY: UK -* -* NOTES: -******************************************************************************** -clear all -set more off -set mem 200m -set maxvar 30000 - - -******************************************************************* -cap log close -log using "${dir_log}/reg_financial_distress.log", replace -******************************************************************* - - -/********************************* PREPARE DATA *******************************/ - -use ${estimation_sample}, clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" -/* DP: Household income/poverty/employment transition variables are moved to variable_update.do */ - -* Remove children -drop if dag < 16 - -********************************************************************** -* HM1_L: GHQ12 score 0-36 of all working-age adults - baseline effects * -********************************************************************** - -logit financial_distress /// -ib11.exp_emp i.lhw_c5 D.log_income i.exp_incchange ib0.exp_poverty L.ypncp L.ypnoab /// -L.i.econ_benefits L.i.dhh_owned L.i.dcpst L.dnc 
L.dhe_pcs L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.financial_distress /// -i.dgn L.dag L.dagsq i.deh_c3 i.dot stm /// -[pweight=${weight}] /// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/financial_distress/financial_distress", sheet("UK") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/financial_distress", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/financial_distress/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/financial_distress/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_financial_distress", sheet("UK") modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_financial_distress", sheet("UK") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" // 13.exp_emp -putexcel A3 = "UnemployedToEmployed" // 31.exp_emp -putexcel A4 = "PersistentUnemployed" // 33.exp_emp -putexcel A5 = "Lhw_10" // 10.lhw_c5 -putexcel A6 = "Lhw_20" // 20.lhw_c5 -putexcel A7 = "Lhw_30" // 30.lhw_c5 -putexcel A8 = "Lhw_40" // 40.lhw_c5 -putexcel A9 = "RealIncomeChange" // D.log_income -putexcel A10 = "RealIncomeDecrease_D" // 1.exp_incchange -putexcel A11 = "NonPovertyToPoverty" // 1.exp_poverty -putexcel A12 = "PovertyToNonPoverty" // 2.exp_poverty -putexcel A13 = "PersistentPoverty" // 3.exp_poverty -putexcel A14 = "Ypncp_L1" // L.ypncp -putexcel A15 = "Ypnoab_L1" // L.ypnoab -putexcel A16 = "D_Econ_benefits" // 1L.econ_benefits -putexcel A17 = "D_Home_owner_L1" // 1L.dhh_owned -putexcel A18 = "Dcpst_Single_L1" // 2L.dcpst -putexcel A19 = "Dnc_L1" // L.dnc -putexcel A20 = "Dhe_pcs_L1" // L.dhe_pcs -putexcel A21 = "Dhe_mcs_L1" // L.dhe_mcs -putexcel A22 = "UKC" // 1L.drgn1 -putexcel A23 = "UKD" // 2L.drgn1 -putexcel A24 = "UKE" // 4L.drgn1 -putexcel A25 = "UKF" // 5L.drgn1 -putexcel A26 = "UKG" // 6L.drgn1 -putexcel A27 = "UKH" // 7L.drgn1 -putexcel A28 = "UKJ" // 9L.drgn1 -putexcel A29 = "UKK" // 10L.drgn1 -putexcel A30 = "UKL" // 11L.drgn1 -putexcel A31 = "UKM" // 12L.drgn1 -putexcel A32 = "UKN" // 13L.drgn1 -putexcel A33 = "Ydses_c5_Q2_L1" // 2L.ydses_c5 -putexcel A34 = "Ydses_c5_Q3_L1" // 3L.ydses_c5 -putexcel A35 = "Ydses_c5_Q4_L1" // 4L.ydses_c5 -putexcel A36 = "Ydses_c5_Q5_L1" // 5L.ydses_c5 -putexcel A37 = "Dlltsd01_L1" // L.dlltsd01 -putexcel A38 = "FinancialDistress" // L.financial_distress -putexcel A39 = 
"Dgn" // 1.dgn -putexcel A40 = "Dag_L1" // L.dag -putexcel A41 = "Dag_sq_L1" // L.dagsq -putexcel A42 = "Deh_c3_Medium" // 2.deh_c3 -putexcel A43 = "Deh_c3_Low" // 3.deh_c3 -putexcel A44 = "EthnicityAsian" // 2.dot -putexcel A45 = "EthnicityBlack" // 3.dot -putexcel A46 = "EthnicityOther" // 4.dot -putexcel A47 = "Year_transformed" // stm -putexcel A48 = "Constant" // _cons - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" // 13.exp_emp -putexcel D1 = "UnemployedToEmployed" // 31.exp_emp -putexcel E1 = "PersistentUnemployed" // 33.exp_emp -putexcel F1 = "Lhw_10" // 10.lhw_c5 -putexcel G1 = "Lhw_20" // 20.lhw_c5 -putexcel H1 = "Lhw_30" // 30.lhw_c5 -putexcel I1 = "Lhw_40" // 40.lhw_c5 -putexcel J1 = "RealIncomeChange" // D.log_income -putexcel K1 = "RealIncomeDecrease_D" // 1.exp_incchange -putexcel L1 = "NonPovertyToPoverty" // 1.exp_poverty -putexcel M1 = "PovertyToNonPoverty" // 2.exp_poverty -putexcel N1 = "PersistentPoverty" // 3.exp_poverty -putexcel O1 = "Ypncp_L1" // L.ypncp -putexcel P1 = "Ypnoab_L1" // L.ypnoab -putexcel Q1 = "D_Econ_benefits" // 1L.econ_benefits -putexcel R1 = "D_Home_owner_L1" // 1L.dhh_owned -putexcel S1 = "Dcpst_Single_L1" // 2L.dcpst -putexcel T1 = "Dnc_L1" // L.dnc -putexcel U1 = "Dhe_pcs_L1" // L.dhe_pcs -putexcel V1 = "Dhe_mcs_L1" // L.dhe_mcs -putexcel W1 = "UKC" // 1L.drgn1 -putexcel X1 = "UKD" // 2L.drgn1 -putexcel Y1 = "UKE" // 4L.drgn1 -putexcel Z1 = "UKF" // 5L.drgn1 -putexcel AA1 = "UKG" // 6L.drgn1 -putexcel AB1 = "UKH" // 7L.drgn1 -putexcel AC1 = "UKJ" // 9L.drgn1 -putexcel AD1 = "UKK" // 10L.drgn1 -putexcel AE1 = "UKL" // 11L.drgn1 -putexcel AF1 = "UKM" // 12L.drgn1 -putexcel AG1 = "UKN" // 13L.drgn1 -putexcel AH1 = "Ydses_c5_Q2_L1" // 2L.ydses_c5 -putexcel AI1 = "Ydses_c5_Q3_L1" // 3L.ydses_c5 -putexcel AJ1 = "Ydses_c5_Q4_L1" // 4L.ydses_c5 -putexcel AK1 = "Ydses_c5_Q5_L1" // 5L.ydses_c5 -putexcel AL1 = "Dlltsd01_L1" // L.dlltsd01 -putexcel AM1 = "FinancialDistress" // L.financial_distress -putexcel 
AN1 = "Dgn" // 1.dgn
-putexcel AO1 = "Dag_L1" // L.dag
-putexcel AP1 = "Dag_sq_L1" // L.dagsq
-putexcel AQ1 = "Deh_c3_Medium" // 2.deh_c3
-putexcel AR1 = "Deh_c3_Low" // 3.deh_c3
-putexcel AS1 = "EthnicityAsian" // 2.dot
-putexcel AT1 = "EthnicityBlack" // 3.dot
-putexcel AU1 = "EthnicityOther" // 4.dot
-putexcel AV1 = "Year_transformed" // stm
-putexcel AW1 = "Constant" // _cons
-
-drop in_sample p
-scalar drop r2_p N chi2 ll
+********************************************************************************
+* PROJECT: UC and mental health
+* SECTION: Health and wellbeing
+* OBJECT: Financial distress
+* AUTHORS: Andy Baxter, Erik Igelström
+* LAST UPDATE: 17 Feb 2026
+* COUNTRY: UK
+*
+* NOTES:
+********************************************************************************
+clear all
+set more off
+set mem 200m
+set maxvar 30000
+
+
+*******************************************************************
+cap log close
+log using "${dir_log}/reg_financial_distress.log", replace
+*******************************************************************
+
+
+/********************************* PREPARE DATA *******************************/
+
+* Quoted so that a path containing spaces still loads
+use "${estimation_sample}", clear
+
+* Set data
+xtset idperson swv
+sort idperson swv
+
+* Adjust variables
+do "${dir_do}/variable_update.do"
+/* DP: Household income/poverty/employment transition variables are moved to variable_update.do */
+
+* Remove children
+drop if dag < 16
+
+**********************************************************************
+* Financial distress: logit on everyone aged 16+ kept above          *
+**********************************************************************
+
+logit financial_distress ///
+ib11.exp_emp i.lhw_c5 D.log_income i.exp_incchange ib0.exp_poverty L.ypncp L.ypnoab ///
+L.i.econ_benefits L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.financial_distress ///
+i.dgn L.dag L.dagsq i.deh_c3 i.dot stm ///
+[pweight=${weight}] ///
+, vce(cluster idperson)
+
+* save raw results
+* (first six rows of r(table), transposed so each coefficient is a row)
+matrix results = r(table)
+matrix results = results[1..6,1...]'
+putexcel set "$dir_raw_results/financial_distress/financial_distress", sheet("UK") replace
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/financial_distress", replace
+
+
+* NOTE(review): logit does not store e(rmse), so scalar rmse is set to
+*   missing here; it is kept only to leave the scalar environment unchanged.
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zeros values are eliminated
+
+matrix b = e(b)
+matrix V = e(V)
+
+
+* Store variance-covariance matrix
+* V is round-tripped through a scratch workbook so that all-zero rows and
+* columns (omitted / base levels) can be dropped with dataset commands.
+
+preserve
+
+putexcel set "$dir_raw_results/financial_distress/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/financial_distress/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'
+
+* Two passes: drop zero rows, transpose, drop zero rows again, transpose back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_financial_distress", sheet("UK") modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_financial_distress", sheet("UK") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+
+* Labelling
+* One ordered list of regressor labels, written both down column A (rows 2-48)
+* and across row 1 (columns C-AW). Order follows the non-zero coefficients:
+*   13/31/33.exp_emp, 10-40.lhw_c5, D.log_income, 1.exp_incchange,
+*   1-3.exp_poverty, L.ypncp, L.ypnoab, 1L.econ_benefits, 1L.dhh_owned,
+*   2L.dcpst, L.dnc, L.dhe_pcs, L.dhe_mcs, L.drgn (1 2 4 5 6 7 9 10 11 12 13),
+*   2-5L.ydses_c5, L.dlltsd01, L.financial_distress, 1.dgn, L.dag, L.dagsq,
+*   2-3.deh_c3, 2-4.dot, stm, _cons
+local labels ///
+    EmployedToUnemployed UnemployedToEmployed PersistentUnemployed ///
+    Lhw_10 Lhw_20 Lhw_30 Lhw_40 ///
+    RealIncomeChange RealIncomeDecrease_D ///
+    NonPovertyToPoverty PovertyToNonPoverty PersistentPoverty ///
+    Ypncp_L1 Ypnoab_L1 ///
+    D_Econ_benefits D_Home_owner_L1 Dcpst_Single_L1 ///
+    Dnc_L1 Dhe_pcs_L1 Dhe_mcs_L1 ///
+    UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+    Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 ///
+    Dlltsd01_L1 FinancialDistress Dgn Dag_L1 Dag_sq_L1 ///
+    Deh_c3_Medium Deh_c3_Low ///
+    EthnicityAsian EthnicityBlack EthnicityOther ///
+    Year_transformed Constant
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local k = 1
+foreach lab of local labels {
+    * k-th label goes to row k+1 of column A and to column k+2 of row 1
+    local row = `k' + 1
+    mata: st_local("col", numtobase26(`k' + 2))
+    putexcel A`row' = "`lab'"
+    putexcel `col'1 = "`lab'"
+    local ++k
+}
+
+drop in_sample p
+scalar drop r2_p N chi2 ll
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_health_mental.do b/input/InitialPopulations/compile/RegressionEstimates/12_reg_health_mental.do
similarity index 96%
rename from input/InitialPopulations/compile/RegressionEstimates/reg_health_mental.do
rename to input/InitialPopulations/compile/RegressionEstimates/12_reg_health_mental.do
index 0b3fd33aa..831e52eae 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_health_mental.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/12_reg_health_mental.do
@@ -1,982 +1,982 @@
-********************************************************************************
-* PROJECT: UC and mental health
-* SECTION: Health and wellbeing
-* OBJECT: Health status and Disability
-* AUTHORS: Andy Baxter
-* LAST UPDATE: 17 Feb 2026
-* COUNTRY: UK
-*
-* NOTES:
-* - This file updates GHQ12 Level (0-36) and Caseness (0-12) variables
-********************************************************************************
-clear all
-set more off
-set mem 200m
-set maxvar 30000
-
-
-*******************************************************************
-cap log close
-log using "${dir_log}/reg_health_mental.log", replace
-*******************************************************************
-
-/********************************* PREPARE DATA *******************************/
-
-use ${estimation_sample}, clear
-
-* Set data
-xtset idperson swv
-sort idperson swv
-
-* Adjust variables
-do "${dir_do}/variable_update.do"
-/* DP: Household income/poverty/employment transition variables are 
moved to variable_update.do */ - -* Remove children -drop if dag < 16 - -********************************************************************** -* HM1_L: GHQ12 score 0-36 of all working-age adults - baseline effects * -********************************************************************** - -reg dhm /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm /// -L.dag L.dagsq i.deh_c3 i.dot i.dgn stm /// -[pweight=${weight}] /// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM1_L") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM1_L_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM1_L", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM1_L") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "D_Home_owner_L1" -putexcel A3 = "Dcpst_Single_L1" -putexcel A4 = "Dnc_L1" -putexcel A5 = "Dhe_pcs_L1" -putexcel A6 = "UKC" -putexcel A7 = "UKD" -putexcel A8 = "UKE" -putexcel A9 = "UKF" -putexcel A10 = "UKG" -putexcel A11 = "UKH" -putexcel A12 = "UKJ" -putexcel A13 = "UKK" -putexcel A14 = "UKL" -putexcel A15 = "UKM" -putexcel A16 = "UKN" -putexcel A17 = "Ydses_c5_Q2_L1" -putexcel A18 = "Ydses_c5_Q3_L1" -putexcel A19 = "Ydses_c5_Q4_L1" -putexcel A20 = "Ydses_c5_Q5_L1" -putexcel A21 = "Dlltsd01_L1" -putexcel A22 = "Dhm_L1" -putexcel A23 = "Dag_L1" -putexcel A24 = "Dag_sq_L1" -putexcel A25 = "Deh_c3_Medium" -putexcel A26 = "Deh_c3_Low" -putexcel A27 = "EthnicityAsian" -putexcel A28 = "EthnicityBlack" -putexcel A29 = "EthnicityOther" -putexcel A30 = "Dgn" -putexcel A31 = "Year_transformed" -putexcel A32 = "Constant" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "D_Home_owner_L1" -putexcel D1 = "Dcpst_Single_L1" -putexcel E1 = "Dnc_L1" -putexcel F1 = "Dhe_pcs_L1" -putexcel G1 = "UKC" -putexcel H1 = "UKD" -putexcel I1 = "UKE" -putexcel J1 = "UKF" -putexcel K1 = "UKG" -putexcel L1 = "UKH" -putexcel M1 = "UKJ" -putexcel N1 = "UKK" -putexcel O1 = "UKL" -putexcel P1 = "UKM" -putexcel Q1 = "UKN" -putexcel R1 = "Ydses_c5_Q2_L1" -putexcel S1 = "Ydses_c5_Q3_L1" -putexcel T1 = "Ydses_c5_Q4_L1" -putexcel U1 = "Ydses_c5_Q5_L1" -putexcel V1 = "Dlltsd01_L1" -putexcel W1 = "Dhm_L1" -putexcel X1 = "Dag_L1" -putexcel Y1 = "Dag_sq_L1" -putexcel Z1 = "Deh_c3_Medium" -putexcel AA1 = "Deh_c3_Low" -putexcel AB1 = "EthnicityAsian" -putexcel AC1 = "EthnicityBlack" -putexcel AD1 = "EthnicityOther" 
-putexcel AE1 = "Dgn" -putexcel AF1 = "Year_transformed" -putexcel AG1 = "Constant" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A13 = ("HM1_L") B13 = rmse - -drop in_sample p - -scalar drop r2_p N chi2 ll - -*************************************************************** -* HM2_Females_L: GHQ12 Score 0-36 - causal employment effects * -*************************************************************** - -*Stage 2 -*Female -reghdfe dhm /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==0 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM2_Females_L") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM2_Females_L_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_L", replace) 
modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_L") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A14 = ("HM2_Females_L") B14 = rmse - -drop in_sample p -scalar drop r2_p N chi2 ll - -*************************************************************** -* HM2_Males_L: GHQ12 Score 0-36 - causal employment effects * 
-*************************************************************** - - -*Stage 2 -*Male -reghdfe dhm /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==1 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM2_Males_L") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM2_Males_L_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_L", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row 
vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_L") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A15 = ("HM2_Males_L") B15 = rmse - -drop in_sample p -scalar drop r2_p N chi2 ll - - -********************************************************************** -* HM1_C: GHQ12 score 0-12 of all working-age adults - baseline effects * -********************************************************************** - -* New ordered logistic regression model, reflecting observed distributions - -ologit scghq2_dv /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.scghq2_dv /// -L.dag L.dagsq i.deh_c3 i.dot i.dgn stm /// -if stm!=20 & stm!=21 & dag>=25 & dag<=64 & swv!=12 /// -[pweight=${weight}] 
/// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM1_C") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM1_C_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM1_C", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM1_C") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "D_Home_owner_L1" -putexcel A3 = "Dcpst_Single_L1" -putexcel A4 = "Dnc_L1" -putexcel A5 = "Dhe_pcs_L1" -putexcel A6 = "UKC" -putexcel A7 = "UKD" -putexcel A8 = "UKE" -putexcel A9 = "UKF" -putexcel A10 = "UKG" -putexcel A11 = "UKH" -putexcel A12 = "UKJ" -putexcel A13 = "UKK" -putexcel A14 = "UKL" -putexcel A15 = "UKM" -putexcel A16 = "UKN" -putexcel A17 = "Ydses_c5_Q2_L1" -putexcel A18 = "Ydses_c5_Q3_L1" -putexcel A19 = "Ydses_c5_Q4_L1" -putexcel A20 = "Ydses_c5_Q5_L1" -putexcel A21 = "Dlltsd01_L1" -putexcel A22 = "Dhm_L1" -putexcel A23 = "Dag_L1" -putexcel A24 = "Dag_sq_L1" -putexcel A25 = "Deh_c3_Medium" -putexcel A26 = "Deh_c3_Low" -putexcel A27 = "EthnicityAsian" -putexcel A28 = "EthnicityBlack" -putexcel A29 = "EthnicityOther" -putexcel A30 = "Dgn" -putexcel A31 = "Year_transformed" -putexcel A32 = "Cut1" -putexcel A33 = "Cut2" -putexcel A34 = "Cut3" -putexcel A35 = "Cut4" -putexcel A36 = "Cut5" -putexcel A37 = "Cut6" -putexcel A38 = "Cut7" -putexcel A39 = "Cut8" -putexcel A40 = "Cut9" -putexcel A41 = "Cut10" -putexcel A42 = "Cut11" -putexcel A43 = "Cut12" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "D_Home_owner_L1" -putexcel D1 = "Dcpst_Single_L1" -putexcel E1 = "Dnc_L1" -putexcel F1 = "Dhe_pcs_L1" -putexcel G1 = "UKC" -putexcel H1 = "UKD" -putexcel I1 = "UKE" -putexcel J1 = "UKF" -putexcel K1 = "UKG" -putexcel L1 = "UKH" -putexcel M1 = "UKJ" -putexcel N1 = "UKK" -putexcel O1 = "UKL" -putexcel P1 = "UKM" -putexcel Q1 = "UKN" -putexcel R1 = "Ydses_c5_Q2_L1" -putexcel S1 = "Ydses_c5_Q3_L1" -putexcel T1 = "Ydses_c5_Q4_L1" -putexcel U1 = "Ydses_c5_Q5_L1" -putexcel V1 = 
"Dlltsd01_L1" -putexcel W1 = "Dhm_L1" -putexcel X1 = "Dag_L1" -putexcel Y1 = "Dag_sq_L1" -putexcel Z1 = "Deh_c3_Medium" -putexcel AA1 = "Deh_c3_Low" -putexcel AB1 = "EthnicityAsian" -putexcel AC1 = "EthnicityBlack" -putexcel AD1 = "EthnicityOther" -putexcel AE1 = "Dgn" -putexcel AF1 = "Year_transformed" -putexcel AG1 = "Cut1" -putexcel AH1 = "Cut2" -putexcel AI1 = "Cut3" -putexcel AJ1 = "Cut4" -putexcel AK1 = "Cut5" -putexcel AL1 = "Cut6" -putexcel AM1 = "Cut7" -putexcel AN1 = "Cut8" -putexcel AO1 = "Cut9" -putexcel AP1 = "Cut10" -putexcel AQ1 = "Cut11" -putexcel AR1 = "Cut12" - -/* save RMSE - not strictly needed for ologit predictions -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A16 = ("HM1_C") B16 = rmse -*/ - -drop in_sample p -scalar drop r2_p N chi2 ll - -*************************************************************** -* HM2_Females_C: GHQ12 Score 0-12 - causal employment effects * -*************************************************************** - -* Kept as linear as adding an 'additional' causal effect on baseline - -gen RealIncomeDecrease_D = log_income - L.log_income -gen scghq2_dv_L1 = L.scghq2_dv - -*Stage 2 -*Female -reghdfe scghq2_dv /// -ib11.exp_emp i.exp_poverty i.exp_incchange RealIncomeDecrease_D financial_distress /// -y2020 y2021 /// -i.dhh_owned i.dcpst dnc dhe_pcs ib8.drgn i.ydses_c5 dlltsd01 /// -dag dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==0 /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM2_Females_C") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM2_Females_C_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are 
eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_C", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_C") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A16 = ("HM2_Females_C") B16 = rmse - -drop in_sample p -scalar drop r2_p N chi2 ll - -*************************************************************** -* HM2_Males_C: GHQ12 Score 0-12 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Male -reghdfe scghq2_dv /// -ib11.exp_emp i.exp_poverty i.exp_incchange RealIncomeDecrease_D financial_distress /// -y2020 y2021 /// -i.dhh_owned i.dcpst dnc dhe_pcs ib8.drgn i.ydses_c5 dlltsd01 /// -dag dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==1 /// -, absorb(idperson) vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set 
"$dir_raw_results/health_mental/health_mental", sheet("HM2_Males_C") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/HM2_Males_C_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_C", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_C") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A17 = ("HM2_Males_C") B17 = rmse - -drop in_sample p -scalar drop r2_p N chi2 ll +******************************************************************************** +* PROJECT: UC and mental health +* SECTION: Health and wellbeing +* OBJECT: Health status and Disability +* AUTHORS: Andy Baxter +* LAST UPDATE: 17 Feb 2026 +* COUNTRY: UK +* +* NOTES: +* - This file updates GHQ12 Level (0-36) and Caseness (0-12) variables +******************************************************************************** +clear all +set more off +set mem 200m +set maxvar 30000 + + +******************************************************************* +cap log close +log using "${dir_log}/reg_health_mental.log", replace 
+******************************************************************* + +/********************************* PREPARE DATA *******************************/ + +use ${estimation_sample}, clear + +* Set data +xtset idperson swv +sort idperson swv + +* Adjust variables +do "${dir_do}/variable_update.do" +/* DP: Household income/poverty/employment transition variables are moved to variable_update.do */ + +* Remove children +drop if dag < 16 + +********************************************************************** +* HM1_L: GHQ12 score 0-36 of all working-age adults - baseline effects * +********************************************************************** + +reg dhm /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm /// +L.dag L.dagsq i.deh_c3 i.dot i.dgn stm /// +[pweight=${weight}] /// +, vce(cluster idperson) + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1...]' +putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM1_L") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/HM1_L_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_health_mental", sheet("HM1_L", replace) modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for 
non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_health_mental", sheet("HM1_L") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "D_Home_owner_L1" +putexcel A3 = "Dcpst_Single_L1" +putexcel A4 = "Dnc_L1" +putexcel A5 = "Dhe_pcs_L1" +putexcel A6 = "UKC" +putexcel A7 = "UKD" +putexcel A8 = "UKE" +putexcel A9 = "UKF" +putexcel A10 = "UKG" +putexcel A11 = "UKH" +putexcel A12 = "UKJ" +putexcel A13 = "UKK" +putexcel A14 = "UKL" +putexcel A15 = "UKM" +putexcel A16 = "UKN" +putexcel A17 = "Ydses_c5_Q2_L1" +putexcel A18 = "Ydses_c5_Q3_L1" +putexcel A19 = "Ydses_c5_Q4_L1" +putexcel A20 = "Ydses_c5_Q5_L1" +putexcel A21 = "Dlltsd01_L1" +putexcel A22 = "Dhm_L1" +putexcel A23 = "Dag_L1" +putexcel A24 = "Dag_sq_L1" +putexcel A25 = "Deh_c3_Medium" +putexcel A26 = "Deh_c3_Low" +putexcel A27 = "EthnicityAsian" +putexcel A28 = "EthnicityBlack" +putexcel A29 = "EthnicityOther" +putexcel A30 = "Dgn" +putexcel A31 = "Year_transformed" +putexcel A32 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "D_Home_owner_L1" +putexcel D1 = "Dcpst_Single_L1" +putexcel E1 = "Dnc_L1" +putexcel F1 = "Dhe_pcs_L1" +putexcel G1 = "UKC" +putexcel H1 = "UKD" +putexcel I1 = "UKE" +putexcel J1 = "UKF" +putexcel K1 = "UKG" +putexcel L1 = "UKH" +putexcel M1 = "UKJ" +putexcel N1 = "UKK" +putexcel O1 = "UKL" +putexcel P1 = "UKM" +putexcel Q1 = "UKN" +putexcel R1 = "Ydses_c5_Q2_L1" 
+putexcel S1 = "Ydses_c5_Q3_L1" +putexcel T1 = "Ydses_c5_Q4_L1" +putexcel U1 = "Ydses_c5_Q5_L1" +putexcel V1 = "Dlltsd01_L1" +putexcel W1 = "Dhm_L1" +putexcel X1 = "Dag_L1" +putexcel Y1 = "Dag_sq_L1" +putexcel Z1 = "Deh_c3_Medium" +putexcel AA1 = "Deh_c3_Low" +putexcel AB1 = "EthnicityAsian" +putexcel AC1 = "EthnicityBlack" +putexcel AD1 = "EthnicityOther" +putexcel AE1 = "Dgn" +putexcel AF1 = "Year_transformed" +putexcel AG1 = "Constant" + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A13 = ("HM1_L") B13 = rmse + +drop in_sample p + +scalar drop r2_p N chi2 ll + +*************************************************************** +* HM2_Females_L: GHQ12 Score 0-36 - causal employment effects * +*************************************************************** + +*Stage 2 +*Female +reghdfe dhm /// +ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// +y2020 y2021 /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm /// +L.dag L.dagsq i.deh_c3 stm /// +if dag>=25 & dag<=64 & dgn==0 /// +[pweight=${weight}] /// +, absorb(idperson) vce(cluster idperson) + + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1..10]' +putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM2_Females_L") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/HM2_Females_L_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) +matrix V = V[1..14,1..14] + +forvalues i = 1/14 { + forvalues j = 1/14 { + if `i' == `j' { + continue + } + matrix V[`i',`j'] = 0 + } +} + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") 
replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'
+
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0
+	drop row_sum
+	xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_L", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_health_mental", sheet("HM2_Females_L") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "EmployedToUnemployed"
+putexcel A3 = "UnemployedToEmployed"
+putexcel A4 = "PersistentUnemployed"
+putexcel A5 = "NonPovertyToPoverty"
+putexcel A6 = "PovertyToNonPoverty"
+putexcel A7 = "PersistentPoverty"
+putexcel A8 = "RealIncomeChange"
+putexcel A9 = "RealIncomeDecrease_D"
+putexcel A10 = "FinancialDistress"
+putexcel A11 = "Covid_2020_D"
+putexcel A12 = "Covid_2021_D"
+
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "EmployedToUnemployed"
+putexcel D1 = "UnemployedToEmployed"
+putexcel E1 = "PersistentUnemployed"
+putexcel F1 = "NonPovertyToPoverty"
+putexcel G1 = "PovertyToNonPoverty"
+putexcel H1 = "PersistentPoverty"
+putexcel I1 = "RealIncomeChange"
+putexcel J1 = "RealIncomeDecrease_D"
+putexcel K1 = "FinancialDistress"
+putexcel L1 =
"Covid_2020_D"
+putexcel M1 = "Covid_2021_D"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A14 = ("HM2_Females_L") B14 = rmse
+
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+***************************************************************
+* HM2_Males_L: GHQ12 Score 0-36 - causal employment effects *
+***************************************************************
+*
+* Stage 2, males (dgn==1), ages 25-64: regression of dhm on the employment,
+* poverty and income-change exposure terms, financial distress and the
+* 2020/2021 dummies, with lagged controls and individual (idperson) effects
+* absorbed. Standard errors are clustered on idperson.
+*
+* Outputs:
+*   - raw results        : health_mental workbook, sheet HM2_Males_L
+*   - estimation sample  : HM2_Males_L_sample (dataset with in_sample and p)
+*   - model parameters   : reg_health_mental workbook, sheet HM2_Males_L
+*                          (coefficients in column B, variances from C2)
+*   - RMSE               : reg_RMSE.xlsx, sheet UK, row 15
+
+* Only the first n_causal columns of e(b) and e(V) are exported. Eleven
+* labels are written below, so three of these columns are expected to hold
+* zero coefficients (presumably factor base levels - TODO confirm).
+local n_causal = 14
+
+reghdfe dhm ///
+    ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress ///
+    y2020 y2021 ///
+    L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhm ///
+    L.dag L.dagsq i.deh_c3 stm ///
+    if dag>=25 & dag<=64 & dgn==1 ///
+    [pweight=${weight}] ///
+    , absorb(idperson) vce(cluster idperson)
+
+* Save raw results.
+* sheet(name, replace) together with modify refreshes this sheet only. The
+* workbook-level replace option used before rewrote the whole workbook and
+* discarded the sheets saved for the other models.
+* The raw table now covers the same n_causal columns as the exported block
+* (it previously stopped at column 10).
+matrix results = r(table)
+matrix results = results[1..6, 1..`n_causal']'
+putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM2_Males_L", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Keep the estimation sample flag and the prediction for validation
+gen in_sample = e(sample)
+predict p
+save "$dir_validation_data/HM2_Males_L_sample", replace
+
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+* Results: coefficient vector and the variances of the exported block.
+* Off-diagonal covariances are set to zero, as in the original loop.
+matrix b = e(b)
+matrix V = e(V)
+matrix V = V[1..`n_causal', 1..`n_causal']
+matrix V = diag(vecdiag(V))
+
+* Store variance-covariance matrix.
+* V goes through a scratch workbook so that all-zero rows and columns can be
+* dropped with dataset commands; the data in memory are protected by
+* preserve/restore.
+preserve
+
+putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'
+
+* First pass drops all-zero rows, the transpose turns columns into rows,
+* second pass drops what were all-zero columns and transposes back.
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_L", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+* Store estimated coefficients: copy the non-zero entries among the first
+* no_vars elements of b into nonzero_b, preserving their order.
+local non_zero_count = 0
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_health_mental", sheet("HM2_Males_L") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling: each regressor name is written once down column A (from row 2)
+* and once along row 1 (from column C), overwriting the default matrix names.
+local labels EmployedToUnemployed UnemployedToEmployed PersistentUnemployed ///
+    NonPovertyToPoverty PovertyToNonPoverty PersistentPoverty ///
+    RealIncomeChange RealIncomeDecrease_D FinancialDistress ///
+    Covid_2020_D Covid_2021_D
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local k = 0
+foreach lab of local labels {
+    local ++k
+    local row = `k' + 1
+    * numtobase26() converts a column number to its Excel letter (3 -> C)
+    mata: st_local("col", numtobase26(`k' + 2))
+    putexcel A`row' = "`lab'"
+    putexcel `col'1 = "`lab'"
+}
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A15 = ("HM2_Males_L") B15 = rmse
+
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+
+**********************************************************************
+* HM1_C: GHQ12 score 0-12 of all working-age adults - baseline effects *
+**********************************************************************
+
+* New ordered logistic regression model, reflecting observed distributions
+*
+* Ordered logit of scghq2_dv on lagged controls, for ages 25-64, excluding
+* stm values 20 and 21 and wave swv 12. Standard errors are clustered on
+* idperson.
+*
+* Outputs:
+*   - raw results        : health_mental workbook, sheet HM1_C
+*   - estimation sample  : HM1_C_sample (dataset with in_sample and p)
+*   - model parameters   : reg_health_mental workbook, sheet HM1_C
+*                          (coefficients in column B, covariance from C2)
+
+ologit scghq2_dv ///
+    L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.scghq2_dv ///
+    L.dag L.dagsq i.deh_c3 i.dot i.dgn stm ///
+    if stm!=20 & stm!=21 & dag>=25 & dag<=64 & swv!=12 ///
+    [pweight=${weight}] ///
+    , vce(cluster idperson)
+
+* Save raw results.
+* sheet(name, replace) together with modify refreshes this sheet only. The
+* workbook-level replace option used before rewrote the whole workbook and
+* discarded the sheets saved for the other models.
+matrix results = r(table)
+matrix results = results[1..6,1...]'
+putexcel set "$dir_raw_results/health_mental/health_mental", sheet("HM1_C", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Keep the estimation sample flag and the prediction for validation
+gen in_sample = e(sample)
+predict p
+save "$dir_validation_data/HM1_C_sample", replace
+
+* ologit does not post e(rmse), so rmse is missing for this model; the RMSE
+* export further down is disabled accordingly.
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+* Results: full coefficient vector and covariance matrix
+matrix b = e(b)
+matrix V = e(V)
+
+* Store variance-covariance matrix.
+* V goes through a scratch workbook so that all-zero rows and columns can be
+* dropped with dataset commands; the data in memory are protected by
+* preserve/restore.
+preserve
+
+putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'
+
+* First pass drops all-zero rows, the transpose turns columns into rows,
+* second pass drops what were all-zero columns and transposes back.
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_mental", sheet("HM1_C", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+* Store estimated coefficients: copy the non-zero entries among the first
+* no_vars elements of b into nonzero_b, preserving their order.
+local non_zero_count = 0
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_health_mental", sheet("HM1_C") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling: each regressor name is written once down column A (from row 2)
+* and once along row 1 (from column C), overwriting the default matrix names.
+local labels D_Home_owner_L1 Dcpst_Single_L1 Dnc_L1 Dhe_pcs_L1 ///
+    UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+    Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 ///
+    Dlltsd01_L1 Dhm_L1 Dag_L1 Dag_sq_L1 ///
+    Deh_c3_Medium Deh_c3_Low ///
+    EthnicityAsian EthnicityBlack EthnicityOther ///
+    Dgn Year_transformed ///
+    Cut1 Cut2 Cut3 Cut4 Cut5 Cut6 Cut7 Cut8 Cut9 Cut10 Cut11 Cut12
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local k = 0
+foreach lab of local labels {
+    local ++k
+    local row = `k' + 1
+    * numtobase26() converts a column number to its Excel letter (3 -> C)
+    mata: st_local("col", numtobase26(`k' + 2))
+    putexcel A`row' = "`lab'"
+    putexcel `col'1 = "`lab'"
+}
+
+/* save RMSE - not strictly needed for ologit predictions
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A16 = ("HM1_C") B16 = rmse
+*/
+
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+***************************************************************
+* HM2_Females_C / HM2_Males_C: GHQ12 Score 0-12 - causal employment effects *
+***************************************************************
+
+* Kept as linear as adding an 'additional' causal effect on baseline
+*
+* Stage 2, ages 25-64, estimated separately for females (dgn==0) and males
+* (dgn==1): regression of scghq2_dv on the exposure terms with individual
+* (idperson) effects absorbed and standard errors clustered on idperson.
+* Both models share every step, so they are run from one loop.
+*
+* NOTE(review): unlike HM2_Males_L, these regressions carry no pweight and use
+* contemporaneous rather than lagged controls - confirm this is intended.
+
+* Despite its name, RealIncomeDecrease_D holds the change in log income
+* (the same quantity as D.log_income).
+gen RealIncomeDecrease_D = log_income - L.log_income
+* scghq2_dv_L1 is not used by the regressions in this section
+gen scghq2_dv_L1 = L.scghq2_dv
+
+* Only the first n_causal columns of e(b) and e(V) are exported. Eleven
+* labels are written below, so three of these columns are expected to hold
+* zero coefficients (presumably factor base levels - TODO confirm).
+local n_causal = 14
+
+local labels EmployedToUnemployed UnemployedToEmployed PersistentUnemployed ///
+    NonPovertyToPoverty PovertyToNonPoverty PersistentPoverty ///
+    RealIncomeChange RealIncomeDecrease_D FinancialDistress ///
+    Covid_2020_D Covid_2021_D
+
+* dgn_val is the dgn code of the current group; rmse_row is its row in the
+* RMSE workbook (16 for females, 17 for males).
+local dgn_val = 0
+local rmse_row = 16
+
+foreach sex in Females Males {
+
+    local model HM2_`sex'_C
+
+    reghdfe scghq2_dv ///
+        ib11.exp_emp i.exp_poverty i.exp_incchange RealIncomeDecrease_D financial_distress ///
+        y2020 y2021 ///
+        i.dhh_owned i.dcpst dnc dhe_pcs ib8.drgn i.ydses_c5 dlltsd01 ///
+        dag dagsq i.deh_c3 stm ///
+        if dag>=25 & dag<=64 & dgn==`dgn_val' ///
+        , absorb(idperson) vce(cluster idperson)
+
+    * Save raw results.
+    * sheet(name, replace) together with modify refreshes this sheet only;
+    * the workbook-level replace option used before rewrote the whole
+    * workbook and discarded the sheets saved for the other models.
+    * The raw table now covers the same n_causal columns as the exported
+    * block (it previously stopped at column 10).
+    matrix results = r(table)
+    matrix results = results[1..6, 1..`n_causal']'
+    putexcel set "$dir_raw_results/health_mental/health_mental", sheet("`model'", replace) modify
+    putexcel A3 = matrix(results), names nformat(number_d2)
+    putexcel J4 = matrix(e(V))
+
+    * Keep the estimation sample flag and the prediction for validation
+    gen in_sample = e(sample)
+    predict p
+    save "$dir_validation_data/`model'_sample", replace
+
+    scalar r2_p = e(r2_p)
+    scalar N = e(N)
+    scalar rmse = e(rmse)
+    scalar chi2 = e(chi2)
+    scalar ll = e(ll)
+
+    * Results: coefficient vector and the variances of the exported block.
+    * Off-diagonal covariances are set to zero, as in the original loop.
+    matrix b = e(b)
+    matrix V = e(V)
+    matrix V = V[1..`n_causal', 1..`n_causal']
+    matrix V = diag(vecdiag(V))
+
+    * Store variance-covariance matrix via a scratch workbook so that
+    * all-zero rows and columns can be dropped with dataset commands.
+    preserve
+
+    putexcel set "$dir_raw_results/health_mental/var_cov", sheet("var_cov") replace
+    putexcel A1 = matrix(V)
+
+    import excel "$dir_raw_results/health_mental/var_cov", sheet("var_cov") clear
+
+    describe
+    local no_vars = `r(k)'
+
+    forvalues i = 1/2 {
+        egen row_sum = rowtotal(*)
+        drop if row_sum == 0
+        drop row_sum
+        xpose, clear
+    }
+
+    mkmat v*, matrix(var)
+    putexcel set "$dir_results/reg_health_mental", sheet("`model'", replace) modify
+    putexcel C2 = matrix(var)
+
+    restore
+
+    * Store estimated coefficients: copy the non-zero entries among the
+    * first no_vars elements of b into nonzero_b, preserving their order.
+    local non_zero_count = 0
+    forvalues i = 1/`no_vars' {
+        if (b[1, `i'] != 0) {
+            local non_zero_count = `non_zero_count' + 1
+        }
+    }
+
+    matrix nonzero_b = J(1, `non_zero_count', .)
+
+    local index = 1
+    forvalues i = 1/`no_vars' {
+        if (b[1, `i'] != 0) {
+            matrix nonzero_b[1, `index'] = b[1, `i']
+            local index = `index' + 1
+        }
+    }
+
+    putexcel set "$dir_results/reg_health_mental", sheet("`model'") modify
+    putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+    * Labelling: names go down column A (from row 2) and along row 1
+    * (from column C), overwriting the default matrix names.
+    putexcel A1 = "REGRESSOR"
+    putexcel B1 = "COEFFICIENT"
+
+    local k = 0
+    foreach lab of local labels {
+        local ++k
+        local row = `k' + 1
+        * numtobase26() converts a column number to its Excel letter
+        mata: st_local("col", numtobase26(`k' + 2))
+        putexcel A`row' = "`lab'"
+        putexcel `col'1 = "`lab'"
+    }
+
+    * save RMSE
+    putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+    putexcel A`rmse_row' = ("`model'") B`rmse_row' = rmse
+
+    drop in_sample p
+    scalar drop r2_p N chi2 ll
+
+    local ++dgn_val
+    local ++rmse_row
+}
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_health_wellbeing.do b/input/InitialPopulations/compile/RegressionEstimates/13_reg_health_wellbeing.do
similarity index 96%
rename from input/InitialPopulations/compile/RegressionEstimates/reg_health_wellbeing.do
rename to input/InitialPopulations/compile/RegressionEstimates/13_reg_health_wellbeing.do
index f213c975f..b3319bc94 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_health_wellbeing.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/13_reg_health_wellbeing.do
@@ -1,1431 +1,1431 @@
-******************************************************************************** -* PROJECT: UC and mental health -* SECTION: Health and wellbeing -* OBJECT: Health status and Disability -* AUTHORS: Andy Baxter -* LAST UPDATE: 17 Feb 2026 -* COUNTRY: UK -* -* NOTES: -* - This file updates SF12 MCS and PCS, and Life Satisfaction (7 levels) -******************************************************************************** -clear all -set more off -set mem 200m -set maxvar 30000 - - -******************************************************************* -cap log close -log using "${dir_log}/reg_health_wellbeing.log", replace -******************************************************************* - -/********************************* PREPARE DATA *******************************/ - -use ${estimation_sample}, clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" -/* DP: Household income/poverty/employment transition variables are moved to variable_update.do */ - -* Remove children -drop if dag < 16 - - -******************************************************************************** -* DHE_MCS1 - SF12 MCS score 0-100 of all working-age adults - baseline effects * -******************************************************************************** - -reg dhe_mcs /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs /// -L.dag L.dagsq i.deh_c3 i.dot i.dgn stm /// -[pweight=${weight}] /// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_MCS1_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* 
Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS1", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS1") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "D_Home_owner_L1" -putexcel A3 = "Dcpst_Single_L1" -putexcel A4 = "Dnc_L1" -putexcel A5 = "Dhe_pcs_L1" -putexcel A6 = "UKC" -putexcel A7 = "UKD" -putexcel A8 = "UKE" -putexcel A9 = "UKF" -putexcel A10 = "UKG" -putexcel A11 = "UKH" -putexcel A12 = "UKJ" -putexcel A13 = "UKK" -putexcel A14 = "UKL" -putexcel A15 = "UKM" -putexcel A16 = "UKN" -putexcel A17 = "Ydses_c5_Q2_L1" -putexcel A18 = "Ydses_c5_Q3_L1" -putexcel A19 = "Ydses_c5_Q4_L1" -putexcel A20 = "Ydses_c5_Q5_L1" -putexcel A21 = "Dlltsd01_L1" -putexcel A22 = "Dhe_mcs_L1" -putexcel A23 = "Dag_L1" -putexcel A24 = "Dag_sq_L1" -putexcel A25 = "Deh_c3_Medium" -putexcel A26 = "Deh_c3_Low" -putexcel A27 = "EthnicityAsian" -putexcel A28 = "EthnicityBlack" -putexcel A29 = "EthnicityOther" -putexcel A30 = "Dgn" -putexcel A31 = "Year_transformed" -putexcel A32 = "Constant" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "D_Home_owner_L1" -putexcel D1 = "Dcpst_Single_L1" -putexcel E1 = "Dnc_L1" -putexcel F1 = "Dhe_pcs_L1" -putexcel G1 = "UKC" -putexcel H1 = "UKD" -putexcel I1 = "UKE" -putexcel J1 = "UKF" -putexcel K1 = "UKG" -putexcel L1 = "UKH" -putexcel M1 = "UKJ" -putexcel N1 = "UKK" -putexcel O1 = "UKL" -putexcel P1 = "UKM" -putexcel Q1 = "UKN" -putexcel R1 = "Ydses_c5_Q2_L1" -putexcel S1 = "Ydses_c5_Q3_L1" -putexcel T1 = "Ydses_c5_Q4_L1" -putexcel U1 = "Ydses_c5_Q5_L1" -putexcel V1 = "Dlltsd01_L1" -putexcel W1 = "Dhe_mcs_L1" -putexcel X1 = "Dag_L1" -putexcel Y1 = "Dag_sq_L1" -putexcel Z1 = "Deh_c3_Medium" -putexcel AA1 = "Deh_c3_Low" -putexcel AB1 = "EthnicityAsian" -putexcel AC1 = "EthnicityBlack" -putexcel AD1 = 
"EthnicityOther" -putexcel AE1 = "Dgn" -putexcel AF1 = "Year_transformed" -putexcel AG1 = "Constant" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A18 = ("DHE_MCS1") B18 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - -*************************************************************** -* DHE_MCS2_Females: SF12 MCS score 0-100 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Female -reghdfe dhe_mcs /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==0 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS2_Females") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_MCS2_Females_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set 
"$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Females", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Females") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A19 = ("DHE_MCS2_Females") B19 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - 
-*************************************************************** -* DHE_MCS2_Males: SF12 MCS score 0-100 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Male -reghdfe dhe_mcs /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==1 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS2_Males") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_MCS2_Males_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Males", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element 
in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Males") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A20 = ("DHE_MCS2_Males") B20 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - -******************************************************************************* -* DHE_PCS1 - SF12 PCS score 0-100 of all working-age adults - baseline effects * -******************************************************************************** - -reg dhe_pcs /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs /// 
-L.dag L.dagsq i.deh_c3 i.dot i.dgn stm /// -[pweight=${weight}] /// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_PCS1_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS1", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS1") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "D_Home_owner_L1" -putexcel A3 = "Dcpst_Single_L1" -putexcel A4 = "Dnc_L1" -putexcel A5 = "Dhe_mcs_L1" -putexcel A6 = "UKC" -putexcel A7 = "UKD" -putexcel A8 = "UKE" -putexcel A9 = "UKF" -putexcel A10 = "UKG" -putexcel A11 = "UKH" -putexcel A12 = "UKJ" -putexcel A13 = "UKK" -putexcel A14 = "UKL" -putexcel A15 = "UKM" -putexcel A16 = "UKN" -putexcel A17 = "Ydses_c5_Q2_L1" -putexcel A18 = "Ydses_c5_Q3_L1" -putexcel A19 = "Ydses_c5_Q4_L1" -putexcel A20 = "Ydses_c5_Q5_L1" -putexcel A21 = "Dlltsd01_L1" -putexcel A22 = "Dhe_pcs_L1" -putexcel A23 = "Dag_L1" -putexcel A24 = "Dag_sq_L1" -putexcel A25 = "Deh_c3_Medium" -putexcel A26 = "Deh_c3_Low" -putexcel A27 = "EthnicityAsian" -putexcel A28 = "EthnicityBlack" -putexcel A29 = "EthnicityOther" -putexcel A30 = "Dgn" -putexcel A31 = "Year_transformed" -putexcel A32 = "Constant" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "D_Home_owner_L1" -putexcel D1 = "Dcpst_Single_L1" -putexcel E1 = "Dnc_L1" -putexcel F1 = "Dhe_mcs_L1" -putexcel G1 = "UKC" -putexcel H1 = "UKD" -putexcel I1 = "UKE" -putexcel J1 = "UKF" -putexcel K1 = "UKG" -putexcel L1 = "UKH" -putexcel M1 = "UKJ" -putexcel N1 = "UKK" -putexcel O1 = "UKL" -putexcel P1 = "UKM" -putexcel Q1 = "UKN" -putexcel R1 = "Ydses_c5_Q2_L1" -putexcel S1 = "Ydses_c5_Q3_L1" -putexcel T1 = "Ydses_c5_Q4_L1" -putexcel U1 = "Ydses_c5_Q5_L1" -putexcel V1 = "Dlltsd01_L1" -putexcel W1 = "Dhe_pcs_L1" -putexcel X1 = "Dag_L1" -putexcel Y1 = "Dag_sq_L1" -putexcel Z1 = "Deh_c3_Medium" -putexcel AA1 = "Deh_c3_Low" -putexcel AB1 = "EthnicityAsian" -putexcel AC1 = "EthnicityBlack" -putexcel AD1 = 
"EthnicityOther" -putexcel AE1 = "Dgn" -putexcel AF1 = "Year_transformed" -putexcel AG1 = "Constant" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A21 = ("DHE_PCS1") B21 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - -*************************************************************** -* DHE_PCS2_Females: SF12 PCS score 0-100 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Female -reghdfe dhe_pcs /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==0 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS2_Females") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_PCS2_Females_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set 
"$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Females", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Females") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A22 = ("DHE_PCS2_Females") B22 = rmse - - - -drop in_sample p -scalar drop r2_p N chi2 ll - 
-*************************************************************** -* DHE_PCS2_Males: SF12 PCS score 0-100 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Male -reghdfe dhe_pcs /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==1 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS2_Males") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DHE_PCS2_Males_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Males", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element 
in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Males") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A23 = ("DHE_PCS2_Males") B23 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - -******************************************************************************* -* DLS1 - Life Satisfaction 1-7 of all working-age adults - baseline effects * -******************************************************************************** - -reg dls /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// -L.dag 
L.dagsq i.deh_c3 i.dot i.dgn stm /// -[pweight=${weight}] /// -, vce(cluster idperson) - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DLS1_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) - - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS1", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS1") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "D_Home_owner_L1" -putexcel A3 = "Dcpst_Single_L1" -putexcel A4 = "Dnc_L1" -putexcel A5 = "Dhe_pcs_L1" -putexcel A6 = "UKC" -putexcel A7 = "UKD" -putexcel A8 = "UKE" -putexcel A9 = "UKF" -putexcel A10 = "UKG" -putexcel A11 = "UKH" -putexcel A12 = "UKJ" -putexcel A13 = "UKK" -putexcel A14 = "UKL" -putexcel A15 = "UKM" -putexcel A16 = "UKN" -putexcel A17 = "Ydses_c5_Q2_L1" -putexcel A18 = "Ydses_c5_Q3_L1" -putexcel A19 = "Ydses_c5_Q4_L1" -putexcel A20 = "Ydses_c5_Q5_L1" -putexcel A21 = "Dlltsd01_L1" -putexcel A22 = "Dls_L1" -putexcel A23 = "Dag_L1" -putexcel A24 = "Dag_sq_L1" -putexcel A25 = "Deh_c3_Medium" -putexcel A26 = "Deh_c3_Low" -putexcel A27 = "EthnicityAsian" -putexcel A28 = "EthnicityBlack" -putexcel A29 = "EthnicityOther" -putexcel A30 = "Dgn" -putexcel A31 = "Year_transformed" -putexcel A32 = "Constant" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "D_Home_owner_L1" -putexcel D1 = "Dcpst_Single_L1" -putexcel E1 = "Dnc_L1" -putexcel F1 = "Dhe_pcs_L1" -putexcel G1 = "UKC" -putexcel H1 = "UKD" -putexcel I1 = "UKE" -putexcel J1 = "UKF" -putexcel K1 = "UKG" -putexcel L1 = "UKH" -putexcel M1 = "UKJ" -putexcel N1 = "UKK" -putexcel O1 = "UKL" -putexcel P1 = "UKM" -putexcel Q1 = "UKN" -putexcel R1 = "Ydses_c5_Q2_L1" -putexcel S1 = "Ydses_c5_Q3_L1" -putexcel T1 = "Ydses_c5_Q4_L1" -putexcel U1 = "Ydses_c5_Q5_L1" -putexcel V1 = "Dlltsd01_L1" -putexcel W1 = "Dls_L1" -putexcel X1 = "Dag_L1" -putexcel Y1 = "Dag_sq_L1" -putexcel Z1 = "Deh_c3_Medium" -putexcel AA1 = "Deh_c3_Low" -putexcel AB1 = "EthnicityAsian" -putexcel AC1 = "EthnicityBlack" -putexcel AD1 = "EthnicityOther" 
-putexcel AE1 = "Dgn" -putexcel AF1 = "Year_transformed" -putexcel AG1 = "Constant" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A24 = ("DLS1") B24 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - -*************************************************************** -* DLS2_Females: Life Satisfaction 1-7 - causal employment effects * -*************************************************************** - - -*Stage 2 -*Female -reghdfe dls /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==0 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS2_Females") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DLS2_Females_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", 
sheet("DLS2_Females", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Females") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A25 = ("DLS2_Females") B25 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - -*************************************************************** -* DLS2_Males: Life Satisfaction 1-7 - causal 
employment effects * -*************************************************************** - - -*Stage 2 -*Male -reghdfe dls /// -ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// -y2020 y2021 /// -L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// -L.dag L.dagsq i.deh_c3 stm /// -if dag>=25 & dag<=64 & dgn==1 /// -[pweight=${weight}] /// -, absorb(idperson) vce(cluster idperson) - - - * save raw results -matrix results = r(table) -matrix results = results[1..6,1..10]' -putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS2_Males") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -gen in_sample = e(sample) - -predict p - -save "$dir_validation_data/DLS2_Males_sample", replace - - -scalar r2_p = e(r2_p) -scalar N = e(N) -scalar rmse = e(rmse) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Results - -* Note: Zeros values are eliminated - -matrix b = e(b) -matrix V = e(V) -matrix V = V[1..14,1..14] - -forvalues i = 1/14 { - forvalues j = 1/14 { - if `i' == `j' { - continue - } - matrix V[`i',`j'] = 0 - } -} - -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Males", replace) modify -putexcel C2 = matrix(var) - -restore - - -* Store estimated coefficients - -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 
1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Males") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "EmployedToUnemployed" -putexcel A3 = "UnemployedToEmployed" -putexcel A4 = "PersistentUnemployed" -putexcel A5 = "NonPovertyToPoverty" -putexcel A6 = "PovertyToNonPoverty" -putexcel A7 = "PersistentPoverty" -putexcel A8 = "RealIncomeChange" -putexcel A9 = "RealIncomeDecrease_D" -putexcel A10 = "FinancialDistress" -putexcel A11 = "Covid_2020_D" -putexcel A12 = "Covid_2021_D" - - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "EmployedToUnemployed" -putexcel D1 = "UnemployedToEmployed" -putexcel E1 = "PersistentUnemployed" -putexcel F1 = "NonPovertyToPoverty" -putexcel G1 = "PovertyToNonPoverty" -putexcel H1 = "PersistentPoverty" -putexcel I1 = "RealIncomeChange" -putexcel J1 = "RealIncomeDecrease_D" -putexcel K1 = "FinancialDistress" -putexcel L1 = "Covid_2020_D" -putexcel M1 = "Covid_2021_D" - -* save RMSE -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A26 = ("DLS2_Males") B26 = rmse - - -drop in_sample p -scalar drop r2_p N chi2 ll - - +******************************************************************************** +* PROJECT: UC and mental health +* SECTION: Health and wellbeing +* OBJECT: Health status and Disability +* AUTHORS: Andy Baxter +* LAST UPDATE: 17 Feb 2026 +* COUNTRY: UK +* +* NOTES: +* - This file updates SF12 MCS and PCS, and Life Satisfaction (7 levels) +******************************************************************************** +clear all +set more off +set mem 200m +set maxvar 30000 + + 
+*******************************************************************
+* Open the log for this script (closing any log left open by a previous run)
+cap log close
+log using "${dir_log}/reg_health_wellbeing.log", replace
+*******************************************************************
+
+/********************************* PREPARE DATA *******************************/
+
+* Path is quoted, like every other path in this script, so that a directory
+* name containing spaces does not break the command
+use "${estimation_sample}", clear
+
+* Set data
+* Declare the panel (person id, wave) so that the L. and D. operators work
+xtset idperson swv
+sort idperson swv
+
+* Adjust variables
+do "${dir_do}/variable_update.do"
+/* DP: Household income/poverty/employment transition variables are moved to variable_update.do */
+
+* Remove children
+drop if dag < 16
+
+
+********************************************************************************
+* DHE_MCS1 - SF12 MCS score 0-100 of all working-age adults - baseline effects *
+********************************************************************************
+* NOTE(review): no age restriction other than dag >= 16 (above) is applied to
+* this regression, although the title says "working-age" - confirm intended
+
+* Weighted linear regression with standard errors clustered on the person
+reg dhe_mcs ///
+L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs ///
+L.dag L.dagsq i.deh_c3 i.dot i.dgn stm ///
+[pweight=${weight}] ///
+, vce(cluster idperson)
+
+    * save raw results
+* First 6 rows of r(table), transposed so that each regressor is a row
+matrix results = r(table)
+matrix results = results[1..6,1...]'
+* The file-level "replace" option recreates the whole raw-results workbook,
+* not just this sheet, so the workbook starts empty at this point
+putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS1") replace
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Flag the estimation sample and store predictions, then save the data set
+* (including both new variables) for validation
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/DHE_MCS1_sample", replace
+
+
+* Copy estimation results into scalars; rmse is written to reg_RMSE.xlsx below
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zero values are eliminated
+
+matrix b = e(b)
+matrix V = e(V)
+
+
+* Store variance-covariance matrix
+* V is written to a scratch workbook and read back as a data set so that
+* all-zero rows and columns can be dropped. The in-memory data are protected
+* by preserve/restore.
+
+preserve
+
+putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear
+
+* Number of imported columns; used below as the loop bound over b
+describe
+local no_vars = `r(k)'
+
+* Pass 1 drops all-zero rows and transposes; pass 2 does the same for what
+* were the columns and transposes back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+* Write the reduced matrix to the results workbook, starting this model's
+* sheet afresh
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS1", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+* Coefficients are written as a column from A1 with matrix names; the names
+* are then overwritten by the labels below
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS1") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+
+* Labelling
+* Labels are assigned by position and must follow the order of the non-zero
+* coefficients in b
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "D_Home_owner_L1"
+putexcel A3 = "Dcpst_Single_L1"
+putexcel A4 = "Dnc_L1"
+putexcel A5 = "Dhe_pcs_L1"
+putexcel A6 = "UKC"
+putexcel A7 = "UKD"
+putexcel A8 = "UKE"
+putexcel A9 = "UKF"
+putexcel A10 = "UKG"
+putexcel A11 = "UKH"
+putexcel A12 = "UKJ"
+putexcel A13 = "UKK"
+putexcel A14 = "UKL"
+putexcel A15 = "UKM"
+putexcel A16 = "UKN"
+putexcel A17 = "Ydses_c5_Q2_L1"
+putexcel A18 = "Ydses_c5_Q3_L1"
+putexcel A19 = "Ydses_c5_Q4_L1"
+putexcel A20 = "Ydses_c5_Q5_L1"
+putexcel A21 = "Dlltsd01_L1"
+putexcel A22 = "Dhe_mcs_L1"
+putexcel A23 = "Dag_L1"
+putexcel A24 = "Dag_sq_L1"
+putexcel A25 = "Deh_c3_Medium"
+putexcel A26 = "Deh_c3_Low"
+putexcel A27 = "EthnicityAsian"
+putexcel A28 = "EthnicityBlack"
+putexcel A29 = "EthnicityOther"
+putexcel A30 = "Dgn"
+putexcel A31 = "Year_transformed"
+putexcel A32 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "D_Home_owner_L1"
+putexcel D1 = "Dcpst_Single_L1"
+putexcel E1 = "Dnc_L1"
+putexcel F1 = "Dhe_pcs_L1"
+putexcel G1 = "UKC"
+putexcel H1 = "UKD"
+* Remaining column headers (row 1) of the DHE_MCS1 sheet
+putexcel I1 = "UKE"
+putexcel J1 = "UKF"
+putexcel K1 = "UKG"
+putexcel L1 = "UKH"
+putexcel M1 = "UKJ"
+putexcel N1 = "UKK"
+putexcel O1 = "UKL"
+putexcel P1 = "UKM"
+putexcel Q1 = "UKN"
+putexcel R1 = "Ydses_c5_Q2_L1"
+putexcel S1 = "Ydses_c5_Q3_L1"
+putexcel T1 = "Ydses_c5_Q4_L1"
+putexcel U1 = "Ydses_c5_Q5_L1"
+putexcel V1 = "Dlltsd01_L1"
+putexcel W1 = "Dhe_mcs_L1"
+putexcel X1 = "Dag_L1"
+putexcel Y1 = "Dag_sq_L1"
+putexcel Z1 = "Deh_c3_Medium"
+putexcel AA1 = "Deh_c3_Low"
+putexcel AB1 = "EthnicityAsian"
+putexcel AC1 = "EthnicityBlack"
+putexcel AD1 = "EthnicityOther"
+putexcel AE1 = "Dgn"
+putexcel AF1 = "Year_transformed"
+putexcel AG1 = "Constant"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A18 = ("DHE_MCS1") B18 = rmse
+
+
+* Remove the per-model variables and scalars before the next model
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+
+***************************************************************
+* DHE_MCS2_Females: SF12 MCS score 0-100 - causal employment effects *
+***************************************************************
+
+
+*Stage 2
+*Female
+* Person fixed effects (absorb), ages 25-64, dgn==0, weighted, standard
+* errors clustered on the person
+reghdfe dhe_mcs ///
+ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress ///
+y2020 y2021 ///
+L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs ///
+L.dag L.dagsq i.deh_c3 stm ///
+if dag>=25 & dag<=64 & dgn==0 ///
+[pweight=${weight}] ///
+, absorb(idperson) vce(cluster idperson)
+
+
+    * save raw results
+* NOTE(review): only columns 1..10 of r(table) are exported here, whereas the
+* leading 14 columns of e(V) are kept further down - confirm 10 is intended
+matrix results = r(table)
+matrix results = results[1..6,1..10]'
+* Use modify with a sheet-level replace: a file-level "replace" would
+* overwrite the whole workbook and delete the sheets of earlier models
+putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS2_Females", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Flag the estimation sample and store predictions, then save the data set
+* (including both new variables) for validation
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/DHE_MCS2_Females_sample", replace
+
+
+* Copy estimation results into scalars; rmse is written to reg_RMSE.xlsx later
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zero values are eliminated
+
+* Keep only the leading 14x14 block of the variance matrix
+matrix b = e(b)
+matrix V = e(V)
+matrix V = V[1..14,1..14]
+
+* Set every off-diagonal element to zero (only the variances are kept)
+forvalues i = 1/14 {
+    forvalues j = 1/14 {
+        if `i' == `j' {
+            continue
+        }
+        matrix V[`i',`j'] = 0
+    }
+}
+
+* Store variance-covariance matrix
+* V is written to a scratch workbook and read back as a data set so that
+* all-zero rows and columns can be dropped. The in-memory data are protected
+* by preserve/restore.
+
+preserve
+
+putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear
+
+* Number of imported columns; used below as the loop bound over b
+describe
+local no_vars = `r(k)'
+
+* Pass 1 drops all-zero rows and transposes; pass 2 does the same for what
+* were the columns and transposes back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Females", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+* Coefficients are written as a column from A1 with matrix names; the names
+* are then overwritten by the labels below
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Females") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling
+* NOTE(review): labels are assigned by position and assume the non-zero
+* coefficients appear in regressor order - confirm each name matches its row
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "EmployedToUnemployed"
+putexcel A3 = "UnemployedToEmployed"
+putexcel A4 = "PersistentUnemployed"
+putexcel A5 = "NonPovertyToPoverty"
+putexcel A6 = "PovertyToNonPoverty"
+putexcel A7 = "PersistentPoverty"
+putexcel A8 = "RealIncomeChange"
+putexcel A9 = "RealIncomeDecrease_D"
+putexcel A10 = "FinancialDistress"
+putexcel A11 = "Covid_2020_D"
+putexcel A12 = "Covid_2021_D"
+
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "EmployedToUnemployed"
+putexcel D1 = "UnemployedToEmployed"
+putexcel E1 = "PersistentUnemployed"
+putexcel F1 = "NonPovertyToPoverty"
+putexcel G1 = "PovertyToNonPoverty"
+putexcel H1 = "PersistentPoverty"
+putexcel I1 = "RealIncomeChange"
+putexcel J1 = "RealIncomeDecrease_D"
+putexcel K1 = "FinancialDistress"
+putexcel L1 = "Covid_2020_D"
+putexcel M1 = "Covid_2021_D"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A19 = ("DHE_MCS2_Females") B19 = rmse
+
+
+* Remove the per-model variables and scalars before the next model
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+***************************************************************
+* DHE_MCS2_Males: SF12 MCS score 0-100 - causal employment effects *
+***************************************************************
+
+
+*Stage 2
+*Male
+* Person fixed effects (absorb), ages 25-64, dgn==1, weighted, standard
+* errors clustered on the person
+reghdfe dhe_mcs ///
+ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress ///
+y2020 y2021 ///
+L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_mcs ///
+L.dag L.dagsq i.deh_c3 stm ///
+if dag>=25 & dag<=64 & dgn==1 ///
+[pweight=${weight}] ///
+, absorb(idperson) vce(cluster idperson)
+
+
+    * save raw results
+* NOTE(review): only columns 1..10 of r(table) are exported here, whereas the
+* leading 14 columns of e(V) are kept further down - confirm 10 is intended
+matrix results = r(table)
+matrix results = results[1..6,1..10]'
+* Use modify with a sheet-level replace: a file-level "replace" would
+* overwrite the whole workbook and delete the sheets of earlier models
+putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_MCS2_Males", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Flag the estimation sample and store predictions, then save the data set
+* (including both new variables) for validation
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/DHE_MCS2_Males_sample", replace
+
+
+* Copy estimation results into scalars; rmse is written to reg_RMSE.xlsx later
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zero values are eliminated
+
+* Keep only the leading 14x14 block of the variance matrix
+matrix b = e(b)
+matrix V = e(V)
+matrix V = V[1..14,1..14]
+
+* Set every off-diagonal element to zero (only the variances are kept)
+forvalues i = 1/14 {
+    forvalues j = 1/14 {
+        if `i' == `j' {
+            continue
+        }
+        matrix V[`i',`j'] = 0
+    }
+}
+
+* Store variance-covariance matrix
+* V is written to a scratch workbook and read back as a data set so that
+* all-zero rows and columns can be dropped. The in-memory data are protected
+* by preserve/restore.
+
+preserve
+
+putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear
+
+* Number of imported columns; used below as the loop bound over b
+describe
+local no_vars = `r(k)'
+
+* Pass 1 drops all-zero rows and transposes; pass 2 does the same for what
+* were the columns and transposes back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Males", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+* Coefficients are written as a column from A1 with matrix names; the names
+* are then overwritten by the labels below
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_MCS2_Males") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling
+* NOTE(review): labels are assigned by position and assume the non-zero
+* coefficients appear in regressor order - confirm each name matches its row
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "EmployedToUnemployed"
+putexcel A3 = "UnemployedToEmployed"
+putexcel A4 = "PersistentUnemployed"
+putexcel A5 = "NonPovertyToPoverty"
+putexcel A6 = "PovertyToNonPoverty"
+putexcel A7 = "PersistentPoverty"
+putexcel A8 = "RealIncomeChange"
+putexcel A9 = "RealIncomeDecrease_D"
+putexcel A10 = "FinancialDistress"
+putexcel A11 = "Covid_2020_D"
+putexcel A12 = "Covid_2021_D"
+
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "EmployedToUnemployed"
+putexcel D1 = "UnemployedToEmployed"
+putexcel E1 = "PersistentUnemployed"
+putexcel F1 = "NonPovertyToPoverty"
+putexcel G1 = "PovertyToNonPoverty"
+putexcel H1 = "PersistentPoverty"
+putexcel I1 = "RealIncomeChange"
+putexcel J1 = "RealIncomeDecrease_D"
+putexcel K1 = "FinancialDistress"
+putexcel L1 = "Covid_2020_D"
+putexcel M1 = "Covid_2021_D"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A20 = ("DHE_MCS2_Males") B20 = rmse
+
+
+* Remove the per-model variables and scalars before the next model
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+
+********************************************************************************
+* DHE_PCS1 - SF12 PCS score 0-100 of all working-age adults - baseline effects *
+********************************************************************************
+* NOTE(review): this regression has no if-condition, although the title says
+* "working-age" - confirm intended
+
+* Weighted linear regression with standard errors clustered on the person
+reg dhe_pcs ///
+L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs ///
+L.dag L.dagsq i.deh_c3 i.dot i.dgn stm ///
+[pweight=${weight}] ///
+, vce(cluster idperson)
+
+    * save raw results
+* First 6 rows of r(table), transposed so that each regressor is a row
+matrix results = r(table)
+matrix results = results[1..6,1...]'
+* Use modify with a sheet-level replace: a file-level "replace" would
+* overwrite the whole workbook and delete the sheets of earlier models
+putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS1", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Flag the estimation sample and store predictions, then save the data set
+* (including both new variables) for validation
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/DHE_PCS1_sample", replace
+
+
+* Copy estimation results into scalars; rmse is written to reg_RMSE.xlsx later
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zero values are eliminated
+
+matrix b = e(b)
+matrix V = e(V)
+
+
+* Store variance-covariance matrix
+* V is written to a scratch workbook and read back as a data set so that
+* all-zero rows and columns can be dropped. The in-memory data are protected
+* by preserve/restore.
+
+preserve
+
+putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear
+
+* Number of imported columns; used below as the loop bound over b
+describe
+local no_vars = `r(k)'
+
+* Pass 1 drops all-zero rows and transposes; pass 2 does the same for what
+* were the columns and transposes back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS1", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+* Coefficients are written as a column from A1 with matrix names; the names
+* are then overwritten by the labels below
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS1") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+
+* Labelling
+* Labels are assigned by position and must follow the order of the non-zero
+* coefficients in b
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "D_Home_owner_L1"
+putexcel A3 = "Dcpst_Single_L1"
+putexcel A4 = "Dnc_L1"
+putexcel A5 = "Dhe_mcs_L1"
+putexcel A6 = "UKC"
+putexcel A7 = "UKD"
+putexcel A8 = "UKE"
+putexcel A9 = "UKF"
+putexcel A10 = "UKG"
+putexcel A11 = "UKH"
+putexcel A12 = "UKJ"
+putexcel A13 = "UKK"
+putexcel A14 = "UKL"
+putexcel A15 = "UKM"
+putexcel A16 = "UKN"
+putexcel A17 = "Ydses_c5_Q2_L1"
+putexcel A18 = "Ydses_c5_Q3_L1"
+putexcel A19 = "Ydses_c5_Q4_L1"
+putexcel A20 = "Ydses_c5_Q5_L1"
+putexcel A21 = "Dlltsd01_L1"
+putexcel A22 = "Dhe_pcs_L1"
+putexcel A23 = "Dag_L1"
+putexcel A24 = "Dag_sq_L1"
+putexcel A25 = "Deh_c3_Medium"
+putexcel A26 = "Deh_c3_Low"
+putexcel A27 = "EthnicityAsian"
+putexcel A28 = "EthnicityBlack"
+putexcel A29 = "EthnicityOther"
+putexcel A30 = "Dgn"
+putexcel A31 = "Year_transformed"
+putexcel A32 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "D_Home_owner_L1"
+putexcel D1 = "Dcpst_Single_L1"
+putexcel E1 = "Dnc_L1"
+putexcel F1 = "Dhe_mcs_L1"
+putexcel G1 = "UKC"
+putexcel H1 = "UKD"
+putexcel I1 = "UKE"
+putexcel J1 = "UKF"
+putexcel K1 = "UKG"
+putexcel L1 = "UKH"
+putexcel M1 = "UKJ"
+putexcel N1 = "UKK"
+putexcel O1 = "UKL"
+putexcel P1 = "UKM"
+putexcel Q1 = "UKN"
+putexcel R1 = "Ydses_c5_Q2_L1"
+putexcel S1 = "Ydses_c5_Q3_L1"
+putexcel T1 = "Ydses_c5_Q4_L1"
+putexcel U1 = "Ydses_c5_Q5_L1"
+putexcel V1 = "Dlltsd01_L1"
+putexcel W1 = "Dhe_pcs_L1"
+putexcel X1 = "Dag_L1"
+putexcel Y1 = "Dag_sq_L1"
+putexcel Z1 = "Deh_c3_Medium"
+putexcel AA1 = "Deh_c3_Low"
+putexcel AB1 = "EthnicityAsian"
+putexcel AC1 = "EthnicityBlack"
+putexcel AD1 = "EthnicityOther"
+putexcel AE1 = "Dgn"
+putexcel AF1 = "Year_transformed"
+putexcel AG1 = "Constant"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A21 = ("DHE_PCS1") B21 = rmse
+
+
+* Remove the per-model variables and scalars before the next model
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+
+***************************************************************
+* DHE_PCS2_Females: SF12 PCS score 0-100 - causal employment effects *
+***************************************************************
+
+
+*Stage 2
+*Female
+* Person fixed effects (absorb), ages 25-64, dgn==0, weighted, standard
+* errors clustered on the person
+reghdfe dhe_pcs ///
+ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress ///
+y2020 y2021 ///
+L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs ///
+L.dag L.dagsq i.deh_c3 stm ///
+if dag>=25 & dag<=64 & dgn==0 ///
+[pweight=${weight}] ///
+, absorb(idperson) vce(cluster idperson)
+
+
+    * save raw results
+* NOTE(review): only columns 1..10 of r(table) are exported here, whereas the
+* leading 14 columns of e(V) are kept further down - confirm 10 is intended
+matrix results = r(table)
+matrix results = results[1..6,1..10]'
+* Use modify with a sheet-level replace: a file-level "replace" would
+* overwrite the whole workbook and delete the sheets of earlier models
+putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS2_Females", replace) modify
+putexcel A3 = matrix(results), names nformat(number_d2)
+putexcel J4 = matrix(e(V))
+
+* Flag the estimation sample and store predictions, then save the data set
+* (including both new variables) for validation
+gen in_sample = e(sample)
+
+predict p
+
+save "$dir_validation_data/DHE_PCS2_Females_sample", replace
+
+
+* Copy estimation results into scalars; rmse is written to reg_RMSE.xlsx below
+scalar r2_p = e(r2_p)
+scalar N = e(N)
+scalar rmse = e(rmse)
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results
+
+* Note: Zero values are eliminated
+
+* Keep only the leading 14x14 block of the variance matrix
+matrix b = e(b)
+matrix V = e(V)
+matrix V = V[1..14,1..14]
+
+* Set every off-diagonal element to zero (only the variances are kept)
+forvalues i = 1/14 {
+    forvalues j = 1/14 {
+        if `i' == `j' {
+            continue
+        }
+        matrix V[`i',`j'] = 0
+    }
+}
+
+* Store variance-covariance matrix
+* V is written to a scratch workbook and read back as a data set so that
+* all-zero rows and columns can be dropped. The in-memory data are protected
+* by preserve/restore.
+
+preserve
+
+putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear
+
+* Number of imported columns; used below as the loop bound over b
+describe
+local no_vars = `r(k)'
+
+* Pass 1 drops all-zero rows and transposes; pass 2 does the same for what
+* were the columns and transposes back
+forvalues i = 1/2 {
+    egen row_sum = rowtotal(*)
+    drop if row_sum == 0
+    drop row_sum
+    xpose, clear
+}
+
+mkmat v*, matrix(var)
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Females", replace) modify
+putexcel C2 = matrix(var)
+
+restore
+
+
+* Store estimated coefficients
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+* Coefficients are written as a column from A1 with matrix names; the names
+* are then overwritten by the labels below
+putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Females") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+* Labelling
+* NOTE(review): labels are assigned by position and assume the non-zero
+* coefficients appear in regressor order - confirm each name matches its row
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "EmployedToUnemployed"
+putexcel A3 = "UnemployedToEmployed"
+putexcel A4 = "PersistentUnemployed"
+putexcel A5 = "NonPovertyToPoverty"
+putexcel A6 = "PovertyToNonPoverty"
+putexcel A7 = "PersistentPoverty"
+putexcel A8 = "RealIncomeChange"
+putexcel A9 = "RealIncomeDecrease_D"
+putexcel A10 = "FinancialDistress"
+putexcel A11 = "Covid_2020_D"
+putexcel A12 = "Covid_2021_D"
+
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "EmployedToUnemployed"
+putexcel D1 = "UnemployedToEmployed"
+putexcel E1 = "PersistentUnemployed"
+putexcel F1 = "NonPovertyToPoverty"
+putexcel G1 = "PovertyToNonPoverty"
+putexcel H1 = "PersistentPoverty"
+putexcel I1 = "RealIncomeChange"
+putexcel J1 = "RealIncomeDecrease_D"
+putexcel K1 = "FinancialDistress"
+putexcel L1 = "Covid_2020_D"
+putexcel M1 = "Covid_2021_D"
+
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A22 = ("DHE_PCS2_Females") B22 = rmse
+
+
+
+* Remove the per-model variables and scalars before the next model
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+*************************************************************** +* DHE_PCS2_Males: SF12 PCS score 0-100 - causal employment effects * +*************************************************************** + + +*Stage 2 +*Male +reghdfe dhe_pcs /// +ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// +y2020 y2021 /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_mcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dhe_pcs /// +L.dag L.dagsq i.deh_c3 stm /// +if dag>=25 & dag<=64 & dgn==1 /// +[pweight=${weight}] /// +, absorb(idperson) vce(cluster idperson) + + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1..10]' +putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DHE_PCS2_Males") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/DHE_PCS2_Males_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) +matrix V = V[1..14,1..14] + +forvalues i = 1/14 { + forvalues j = 1/14 { + if `i' == `j' { + continue + } + matrix V[`i',`j'] = 0 + } +} + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Males", replace) modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element 
in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_health_wellbeing", sheet("DHE_PCS2_Males") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "EmployedToUnemployed" +putexcel A3 = "UnemployedToEmployed" +putexcel A4 = "PersistentUnemployed" +putexcel A5 = "NonPovertyToPoverty" +putexcel A6 = "PovertyToNonPoverty" +putexcel A7 = "PersistentPoverty" +putexcel A8 = "RealIncomeChange" +putexcel A9 = "RealIncomeDecrease_D" +putexcel A10 = "FinancialDistress" +putexcel A11 = "Covid_2020_D" +putexcel A12 = "Covid_2021_D" + + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "EmployedToUnemployed" +putexcel D1 = "UnemployedToEmployed" +putexcel E1 = "PersistentUnemployed" +putexcel F1 = "NonPovertyToPoverty" +putexcel G1 = "PovertyToNonPoverty" +putexcel H1 = "PersistentPoverty" +putexcel I1 = "RealIncomeChange" +putexcel J1 = "RealIncomeDecrease_D" +putexcel K1 = "FinancialDistress" +putexcel L1 = "Covid_2020_D" +putexcel M1 = "Covid_2021_D" + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A23 = ("DHE_PCS2_Males") B23 = rmse + + +drop in_sample p +scalar drop r2_p N chi2 ll + + +******************************************************************************* +* DLS1 - Life Satisfaction 1-7 of all working-age adults - baseline effects * +******************************************************************************** + +reg dls /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// +L.dag 
L.dagsq i.deh_c3 i.dot i.dgn stm /// +[pweight=${weight}] /// +, vce(cluster idperson) + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1...]' +putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS1") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/DLS1_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS1", replace) modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS1") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "D_Home_owner_L1" +putexcel A3 = "Dcpst_Single_L1" +putexcel A4 = "Dnc_L1" +putexcel A5 = "Dhe_pcs_L1" +putexcel A6 = "UKC" +putexcel A7 = "UKD" +putexcel A8 = "UKE" +putexcel A9 = "UKF" +putexcel A10 = "UKG" +putexcel A11 = "UKH" +putexcel A12 = "UKJ" +putexcel A13 = "UKK" +putexcel A14 = "UKL" +putexcel A15 = "UKM" +putexcel A16 = "UKN" +putexcel A17 = "Ydses_c5_Q2_L1" +putexcel A18 = "Ydses_c5_Q3_L1" +putexcel A19 = "Ydses_c5_Q4_L1" +putexcel A20 = "Ydses_c5_Q5_L1" +putexcel A21 = "Dlltsd01_L1" +putexcel A22 = "Dls_L1" +putexcel A23 = "Dag_L1" +putexcel A24 = "Dag_sq_L1" +putexcel A25 = "Deh_c3_Medium" +putexcel A26 = "Deh_c3_Low" +putexcel A27 = "EthnicityAsian" +putexcel A28 = "EthnicityBlack" +putexcel A29 = "EthnicityOther" +putexcel A30 = "Dgn" +putexcel A31 = "Year_transformed" +putexcel A32 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "D_Home_owner_L1" +putexcel D1 = "Dcpst_Single_L1" +putexcel E1 = "Dnc_L1" +putexcel F1 = "Dhe_pcs_L1" +putexcel G1 = "UKC" +putexcel H1 = "UKD" +putexcel I1 = "UKE" +putexcel J1 = "UKF" +putexcel K1 = "UKG" +putexcel L1 = "UKH" +putexcel M1 = "UKJ" +putexcel N1 = "UKK" +putexcel O1 = "UKL" +putexcel P1 = "UKM" +putexcel Q1 = "UKN" +putexcel R1 = "Ydses_c5_Q2_L1" +putexcel S1 = "Ydses_c5_Q3_L1" +putexcel T1 = "Ydses_c5_Q4_L1" +putexcel U1 = "Ydses_c5_Q5_L1" +putexcel V1 = "Dlltsd01_L1" +putexcel W1 = "Dls_L1" +putexcel X1 = "Dag_L1" +putexcel Y1 = "Dag_sq_L1" +putexcel Z1 = "Deh_c3_Medium" +putexcel AA1 = "Deh_c3_Low" +putexcel AB1 = "EthnicityAsian" +putexcel AC1 = "EthnicityBlack" +putexcel AD1 = "EthnicityOther" 
+putexcel AE1 = "Dgn" +putexcel AF1 = "Year_transformed" +putexcel AG1 = "Constant" + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A24 = ("DLS1") B24 = rmse + + +drop in_sample p +scalar drop r2_p N chi2 ll + + +*************************************************************** +* DLS2_Females: Life Satisfaction 1-7 - causal employment effects * +*************************************************************** + + +*Stage 2 +*Female +reghdfe dls /// +ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// +y2020 y2021 /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// +L.dag L.dagsq i.deh_c3 stm /// +if dag>=25 & dag<=64 & dgn==0 /// +[pweight=${weight}] /// +, absorb(idperson) vce(cluster idperson) + + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1..10]' +putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS2_Females") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/DLS2_Females_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) +matrix V = V[1..14,1..14] + +forvalues i = 1/14 { + forvalues j = 1/14 { + if `i' == `j' { + continue + } + matrix V[`i',`j'] = 0 + } +} + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_health_wellbeing", 
sheet("DLS2_Females", replace) modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Females") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "EmployedToUnemployed" +putexcel A3 = "UnemployedToEmployed" +putexcel A4 = "PersistentUnemployed" +putexcel A5 = "NonPovertyToPoverty" +putexcel A6 = "PovertyToNonPoverty" +putexcel A7 = "PersistentPoverty" +putexcel A8 = "RealIncomeChange" +putexcel A9 = "RealIncomeDecrease_D" +putexcel A10 = "FinancialDistress" +putexcel A11 = "Covid_2020_D" +putexcel A12 = "Covid_2021_D" + + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "EmployedToUnemployed" +putexcel D1 = "UnemployedToEmployed" +putexcel E1 = "PersistentUnemployed" +putexcel F1 = "NonPovertyToPoverty" +putexcel G1 = "PovertyToNonPoverty" +putexcel H1 = "PersistentPoverty" +putexcel I1 = "RealIncomeChange" +putexcel J1 = "RealIncomeDecrease_D" +putexcel K1 = "FinancialDistress" +putexcel L1 = "Covid_2020_D" +putexcel M1 = "Covid_2021_D" + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A25 = ("DLS2_Females") B25 = rmse + + +drop in_sample p +scalar drop r2_p N chi2 ll + +*************************************************************** +* DLS2_Males: Life Satisfaction 1-7 - causal 
employment effects * +*************************************************************** + + +*Stage 2 +*Male +reghdfe dls /// +ib11.exp_emp i.exp_poverty i.exp_incchange D.log_income financial_distress /// +y2020 y2021 /// +L.i.dhh_owned L.i.dcpst L.dnc L.dhe_pcs L.ib8.drgn L.i.ydses_c5 L.dlltsd01 L.dls /// +L.dag L.dagsq i.deh_c3 stm /// +if dag>=25 & dag<=64 & dgn==1 /// +[pweight=${weight}] /// +, absorb(idperson) vce(cluster idperson) + + + * save raw results +matrix results = r(table) +matrix results = results[1..6,1..10]' +putexcel set "$dir_raw_results/health_wellbeing/health_wellbeing", sheet("DLS2_Males") replace +putexcel A3 = matrix(results), names nformat(number_d2) +putexcel J4 = matrix(e(V)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/DLS2_Males_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar rmse = e(rmse) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results + +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) +matrix V = V[1..14,1..14] + +forvalues i = 1/14 { + forvalues j = 1/14 { + if `i' == `j' { + continue + } + matrix V[`i',`j'] = 0 + } +} + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/health_wellbeing/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Males", replace) modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 
1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_health_wellbeing", sheet("DLS2_Males") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "EmployedToUnemployed" +putexcel A3 = "UnemployedToEmployed" +putexcel A4 = "PersistentUnemployed" +putexcel A5 = "NonPovertyToPoverty" +putexcel A6 = "PovertyToNonPoverty" +putexcel A7 = "PersistentPoverty" +putexcel A8 = "RealIncomeChange" +putexcel A9 = "RealIncomeDecrease_D" +putexcel A10 = "FinancialDistress" +putexcel A11 = "Covid_2020_D" +putexcel A12 = "Covid_2021_D" + + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "EmployedToUnemployed" +putexcel D1 = "UnemployedToEmployed" +putexcel E1 = "PersistentUnemployed" +putexcel F1 = "NonPovertyToPoverty" +putexcel G1 = "PovertyToNonPoverty" +putexcel H1 = "PersistentPoverty" +putexcel I1 = "RealIncomeChange" +putexcel J1 = "RealIncomeDecrease_D" +putexcel K1 = "FinancialDistress" +putexcel L1 = "Covid_2020_D" +putexcel M1 = "Covid_2021_D" + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A26 = ("DLS2_Males") B26 = rmse + + +drop in_sample p +scalar drop r2_p N chi2 ll + + diff --git a/input/InitialPopulations/compile/RegressionEstimates/master.do b/input/InitialPopulations/compile/RegressionEstimates/master.do deleted file mode 100644 index e9d83a6cf..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/master.do +++ /dev/null @@ -1,281 +0,0 @@ - -*************************************************************************************** -* PROJECT: SimPaths UK: regression estimates for SimPaths using UKHLS data -* DO-FILE NAME: master.do -* 
DESCRIPTION: Main do-file to set the main parameters (country, paths) and call sub-scripts -*************************************************************************************** -* COUNTRY: UK -* DATA: UKHLS EUL version - UKDA-6614-stata [to wave o] -* -* AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 18 Feb 2026 DP -*************************************************************************************** - -*************************************************************************************** -* General comments: -* - Note that in the following scripts some standard commands may be -* abbreviated: (gen)erate, (tab)ulate, (sum)marize, (di)splay, -* (cap)ture, (qui)etly, (noi)sily - -*Stata packages to install -*ssc install fre -*ssc install tsspell -*ssc install carryforward -*ssc install outreg2 -*ssc install oparallel -*ssc install gologit2 -*ssc install winsor -*ssc install reghdfe -*ssc install ftools -*ssc install require -* -* NOTES: -* The income and union parameter do file must be run after -* the wage estimates are obtained because they use -* predicted wages. The order of the remaining files is -* arbitrary. 
-*************************************************************************************** -*************************************************************************************** - -clear all -set more off -set type double -set maxvar 30000 -set matsize 1000 - - -/************************************************************************************** -* DEFINE DIRECTORIES -**************************************************************************************/ - -* Working directory -global dir_work "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains raw output: Excel and Word tables -global dir_raw_results "${dir_work}/raw_results" - -* Directory which contains final Excel files read by the model -global dir_results "${dir_work}/results" - -* Pooled dataset for estimates -global estimation_sample "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\initial_populations\data\UKHLS_pooled_ipop.dta" - -* Pooled dataset with predicted wages after Heckman -global estimation_sample2 "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\initial_populations\data\UKHLS_pooled_ipop2.dta" - -* Directory containing external input data -global dir_external_data "$dir_work/external_data" - -* Directory containing results of comparison of various weights -global weight_checks "${dir_work}/weight_checks" - -*********************Internal validation**************************************** -* Directory to save data for internal validation -global dir_validation_data "${dir_work}/internal_validation/data" - -* Directory for internal validation do-files -global dir_do_validation "${dir_work}/internal_validation/do_files" - -* Directory for internal validation do-files -global dir_validation_graphs "${dir_work}/internal_validation/graphs" - -global country "UK" - -global first_sim_year "2010" - -global last_sim_year "2025" - -* Globals 
used for all processes - -global weight "dwt" - -global regions "UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN" //UKI is London (reference) - -global ethnicity "Ethn_Asian Ethn_Black Ethn_Other" //White is reference. Mixed race & undefined are in Other category - - -* Define threshold ages -/* -Ages used for specifying samples. -ENSURE THE SAME AS THE GLOBALS USED IN THE INTIIAL POPULATIONS MASTER FILE -*/ - -* Age become an adult in various dimensions -global age_becomes_responsible 18 - -global age_becomes_semi_responsible 16 - -global age_seek_employment 16 - -global age_leave_school 16 - -global age_form_partnership 18 - -global age_have_child_min 18 - -global age_leave_parental_home 18 - -global age_own_home 18 - -* Age can/must/cannot make various transitions -global age_max_dep_child 17 - -global age_adult 18 - -global age_can_retire 50 - -global age_force_retire 75 - -global age_force_leave_spell1_edu 30 - -global age_have_child_max 49 // allow this to be led by the data - - -/******************************************************************************* -* PROCESS IF CONDITIONS -*******************************************************************************/ - -* Education -global e1a_if_condition "dag >= ${age_leave_school} & dag < ${age_force_leave_spell1_edu} & l.les_c4 == 2" - -global e1b_if_condition "dag >= ${age_leave_school} & l.les_c4 != 4 & l.les_c4 != 2" - -global e2_if_condition "dag >= ${age_leave_school} & l.les_c4 == 2 & les_c4 != 2" - -* Leave the parental home -global p1_if_condition "ded == 0 & dag >= ${age_leave_parental_home}" - -* Partnership -global u1_if_condition "dag >= ${age_form_partnership} & ssscp != 1" - -global u2_if_condition "dgn == 0 & dag >= ${age_form_partnership} & l.ssscp != 1" - -* Fertility -global f1_if_condition "dag >= ${age_have_child_min} & dag <= ${age_have_child_max} & dgn == 0" - -* Health -global h1_if_condition "dag >= ${age_becomes_semi_responsible} & flag_dhe_imp == 0" - -global h2_if_condition "dag >= 
${age_becomes_semi_responsible} & ded == 0" - -* Home ownership -global ho1_if_condition "dag >= ${age_own_home}" - -* Retirment -global r1a_if_condition "dcpst == 2 & dag >= ${age_can_retire}" - -global r1b_if_condition "ssscp != 1 & dcpst == 1 & dag >= ${age_can_retire}" - - -* WAGES -global wages_f_no_prev_if_condition "dgn == 0 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 0 & deh_c4>0" - -global wages_m_no_prev_if_condition "dgn == 1 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 0 & deh_c4>0" - -global wages_f_prev_if_condition "dgn == 0 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 1 & deh_c4>0" - -global wages_m_prev_if_condition "dgn == 1 & dag >= ${age_seek_employment} & dag <= ${age_force_retire} & previouslyWorking == 1 & deh_c4>0" - - -* CAPITAL INCOME -global i1a_if_condition "dag >= ${age_becomes_semi_responsible}" - -global i1b_if_condition "dag >= ${age_becomes_semi_responsible} & receives_ypncp == 1" - -* PRIVATE PENSION INCOME -global i2b_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd==1 & receives_ypnoab==1" - -global i3a_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2" - -global i3b_if_condition "dag >= ${age_can_retire} & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 & receives_ypnoab==1" - - -* SOCIAL CARE -global s2a_if_condition "dag > 64 & stm >= 15 & stm <= 22" // Need care - -global s2b_if_condition "dag > 64 & stm >= 16 & stm <= 21" // Receive care - -global s2c_if_condition "dag > 64 & receive_care & stm >= 16 & stm <= 21" // Care mix received - -global s2d_if_condition "dag > 64 & receive_informal_care & stm >= 16 & stm <= 21" // Informal care hours received - -global s2e_if_condition "dag > 64 & receive_formal_care & stm >= 16 & stm <= 21" // Formal care hours received - - -global s3a_if_condition "Single & stm >= 15" // Provide care, Singles - -global s3b_if_condition 
"Partnered & stm >= 15" // Provide care, Partnered - -global s3c_if_condition "provide_informal_care & Single & stm >= 15" // Informal care hours provided, Singles - -global s3d_if_condition "provide_informal_care & Partnered & stm >= 15" // Informal care hours provided, Singles - - -* Finanicial distress and health processes -* TO ADD - - - -/******************************************************************************* -* ESTIMATION FILES -*******************************************************************************/ -/**/ -do "${dir_do}/reg_education.do" - -do "${dir_do}/reg_leave_parental_home.do" - -do "${dir_do}/reg_partnership.do" - -do "${dir_do}/reg_fertility.do" - -do "${dir_do}/reg_health.do" - -do "${dir_do}/reg_home_ownership.do" - -do "${dir_do}/reg_retirement.do" - -do "${dir_do}/reg_wages.do" - -do "${dir_do}/reg_income.do" - -do "${dir_do}/reg_socialcare.do" - -do "${dir_do}/reg_financial_distress.do" - -do "${dir_do}/reg_health_mental.do" - -do "${dir_do}/reg_health_wellbeing.do" - - -******************************************************************************* -* INTERNAL VALIDATION FILES -****************************************************************************** -/* -do "$dir_do_validation/int_val_education.do" - -do "$dir_do_validation/int_val_leave_parental_home.do" - -do "$dir_do_validation/int_val_partnership.do" - -do "$dir_do_validation/int_val_fertility.do" - -do "$dir_do_validation/int_val_health.do" - -do "$dir_do_validation/int_val_home_ownership.do" - -do "$dir_do_validation/int_val_retirement.do" - -do "$dir_do_validation/int_val_wages.do" - -do "$dir_do_validation/int_val_income.do" -*/ - -/************************************************************************************** -* END OF FILE -**************************************************************************************/ diff --git a/input/InitialPopulations/compile/RegressionEstimates/programs.do b/input/InitialPopulations/compile/RegressionEstimates/programs.do 
index f807f1dc9..5800889ed 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/programs.do +++ b/input/InitialPopulations/compile/RegressionEstimates/programs.do @@ -1,80 +1,281 @@ /*============================================================================== - MATA FUNCTIONS - Define all Mata functions + MATA FUNCTIONS - Define all Mata functions ==============================================================================*/ mata: mata clear mata set matastrict off -end -mata: -mata clear +// Format labels: turns the coefficient names flagged in nonzero_b_flag into readable labels and writes them to column A (rows 2 onward) and row 1 (columns C onward) of the given sheet in the reg_DOMAIN.xlsx workbook under the dir_results global +void extract_and_export_labels(string scalar domain, string scalar sheet, real scalar max_n, real scalar is_ologit) { + nonzero_b_flag = st_matrix("nonzero_b_flag")' + + if (is_ologit) { + stripe = st_matrixcolstripe("b") + } + else { + stripe = st_matrixcolstripe("e(b)") + } -void trim_matrices() { - V = st_matrix("V") - b = st_matrix("b") - keep = (b :!= 0) - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - st_matrix("b_trimmed", b_trimmed) - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) - printf("Matrices transferred successfully\n") + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + // Handle lags: L.var -> var_L1 + labels_no_bl = /// + regexm(labels_no_bl, "^L\.") :* /// + (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) + // Handle 1L.var -> var_L1 + labels_no_bl = /// + regexm(labels_no_bl, "^1L\.") :* /// + (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// + (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) + // Handle L2.var -> var_L2 + labels_no_bl = /// + regexm(labels_no_bl, "^L2\.") :* /// + (regexr(labels_no_bl, "^L2\.", "") :+ "_L2") :+ /// + (!regexm(labels_no_bl, "^L2\.") :* labels_no_bl) + + // Truncate labels if max_n is specified (>0) + if (max_n > 0 & rows(labels_no_bl) > 
max_n) { + labels_no_bl = labels_no_bl[1..max_n, .] + } + + // Write labels via xl() — avoids stata() round-trip overhead per cell + // Column A rows 2+ (vertical), row 1 cols C+ (horizontal) + real scalar n_labs, i + n_labs = rows(labels_no_bl) + string scalar path + path = st_global("dir_results") + "/reg_" + domain + ".xlsx" + class xl scalar xbook + xbook = xl() + xbook.load_book(path) + xbook.set_sheet(sheet) + for (i=1; i<=n_labs; i++) xbook.put_string(i+1, 1, labels_no_bl[i]) + for (i=1; i<=n_labs; i++) xbook.put_string(1, i+2, labels_no_bl[i]) + xbook.close_book() } -void write_all_to_excel() { - b_trimmed = st_matrix("b_trimmed") - V_trimmed = st_matrix("V_trimmed") - n = cols(b_trimmed) - for (i=1; i<=n; i++) { - row = i + 1 - coef = b_trimmed[1,i] - stata("quietly putexcel B" + strofreal(row) + " = (" + strofreal(coef) + ")") - } - printf("Writing V-C matrix\n") - for (i=1; i<=n; i++) { - for (j=1; j<=n; j++) { - row = i + 1 - col_num = j + 2 - col_name = "" - temp = col_num - while (temp > 0) { - rem = mod(temp - 1, 26) - col_name = char(65 + rem) + col_name - temp = floor((temp - 1) / 26) - } - val = V_trimmed[i,j] - stata("quietly putexcel " + col_name + strofreal(row) + " = (" + strofreal(val) + ")") - } - if (mod(i, 5) == 0) printf(" Row %g/%g\n", i, n) - } - printf("Done\n") +// Create var diagonal matrix: writes b_trimmed to cell B2 and a diagonal-only copy of V_trimmed to cell C2 via putexcel; the Stata matrix V_trimmed is overwritten with the diagonal version +void write_diagonal_to_excel() { + // Pull the matrices into Mata + V_trimmed = st_matrix("V_trimmed") + b_trimmed = st_matrix("b_trimmed") + + // Write coefficients to Column B + stata("quietly putexcel B2 = matrix(b_trimmed')") + + printf("Creating diagonal matrix...\n") + + // Create a diagonal version of V_trimmed + // diag(diagonal(V)) keeps the 'spine' and fills the rest with 0s + V_diag = diag(diagonal(V_trimmed)) + + // Push this modified matrix BACK into Stata's memory + // This overwrites the old "V_trimmed" with the diagonal version + st_replacematrix("V_trimmed", V_diag) + + // Now tell Stata to write the matrix it now sees as 
diagonal + stata("quietly putexcel C2 = matrix(V_trimmed)") + + printf("Done (Diagonal matrix written)\n") } -void extract_and_export_labels(string scalar sheet) { - nonzero_b_flag = st_matrix("nonzero_b_flag")' - stripe = st_matrixcolstripe("e(b)") - varnames = stripe[.,2] - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - n_labs = rows(labels_no_bl) - for (i=1; i<=n_labs; i++) { - row = i + 1 - stata("quietly putexcel A" + strofreal(row) + " = " + char(34) + labels_no_bl[i] + char(34)) - } - for (j=1; j<=n_labs; j++) { - col_num = j + 2 - col_name = "" - n_temp = col_num - while (n_temp > 0) { - rem = mod(n_temp - 1, 26) - col_name = char(65 + rem) + col_name - n_temp = floor((n_temp - 1) / 26) - } - stata("quietly putexcel " + col_name + "1 = " + char(34) + labels_no_bl[j] + char(34)) - } +// Ensure cuts are at the end of matrix: reorders the Stata matrices b and V so the cut-point columns (identified from the column stripe of e(b)) come last, and renames their stripe entries from cutN to CutN with an empty equation name +void reorder_cuts_to_end() { + b = st_matrix("b") + V = st_matrix("V") + stripe = st_matrixcolstripe("e(b)") + + // Identify cut point columns + is_cut = (stripe[.,1] :== "/") + not_cut = (is_cut :== 0) + is_cut_row = is_cut' + not_cut_row = not_cut' + + // Reorder b and V + b_reordered = select(b, not_cut_row), select(b, is_cut_row) + V_temp = select(V, not_cut_row), select(V, is_cut_row) + V_reordered = select(V_temp', not_cut_row), select(V_temp', is_cut_row) + V_reordered = V_reordered' + + // Reorder stripe and rename cuts before writing back + stripe_reordered = (select(stripe, not_cut) \ select(stripe, is_cut)) + for (i=1; i<=rows(stripe_reordered); i++) { + if (stripe_reordered[i,1] == "/" & regexm(stripe_reordered[i,2], "^cut([0-9]+)")) { + stripe_reordered[i,2] = "Cut" + regexs(1) + stripe_reordered[i,1] = "" + } + } + + // Write back with correct stripe + st_matrix("b", b_reordered) + st_matrix("V", V_reordered) + st_matrixcolstripe("b", stripe_reordered) + st_matrixcolstripe("V", stripe_reordered) + 
st_matrixrowstripe("V", stripe_reordered) + +} + +// Format gologit estimates and var-cov matrices: drops zero coefficients, flags coefficients that equal another coefficient (tolerance 1e-8), keeps the first n / (n_outcomes - 1) coefficients plus any later coefficient that is not a duplicate, and stores the reduced results in Stata as nonzero_b_structure and nonzero_var_structure +void build_gologit_structure(real scalar n_outcomes) { + b = st_matrix("b") + V = st_matrix("V") + + // Remove zero coefficients (baseline categories) + keep = (b :!= 0) + nonzero_b = select(b, keep) + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + st_matrix("nonzero_b", nonzero_b) + st_matrix("nonzero_b_flag", keep) + + // Detect repeated coefficients (proportional odds vars) + n = cols(nonzero_b) + n_per = n / (n_outcomes - 1) + repetition_flag = J(n, 1, 0) + tol = 1e-8 + for (i=1; i<=n; i++) { + found = 0 + for (j=1; j<=n; j++) { + if (found == 0 & i != j & abs(nonzero_b[1,i] - nonzero_b[1,j]) < tol) { + repetition_flag[i] = 1 + found = 1 + } + } + } + unique_flag = 1 :- repetition_flag + st_matrix("repetition_flag", repetition_flag') + st_matrix("unique_flag", unique_flag') + + // Build structure vector + structure_a = J(1, n_per, 1) + structure_b = unique_flag[n_per+1::n]' + structure = structure_a, structure_b + st_matrix("structure", structure) + + // Apply structure to b + b_structure = structure :* nonzero_b + keep2 = (b_structure :!= 0) + nonzero_b_structure = select(b_structure, keep2) + st_matrix("nonzero_b_structure", nonzero_b_structure) + + // Apply structure to V + square_structure_a = J(n, 1, 1) * structure + square_structure_b = square_structure_a' + square_structure = square_structure_a :* square_structure_b + var_structure = square_structure :* V_trimmed + row_keep = (rowsum(abs(var_structure)) :!= 0) + col_keep = (colsum(abs(var_structure)) :!= 0) + nonzero_var_structure = select(select(var_structure, col_keep), row_keep) + st_matrix("nonzero_var_structure", nonzero_var_structure) + + printf("Gologit structure built: %g unique coefficients\n", cols(nonzero_b_structure)) +} + +// Format gologit labels +void export_labels_gologit(string scalar domain, string scalar sheet) { + nonzero_b_flag = st_matrix("nonzero_b_flag")' + 
unique_flag = st_matrix("unique_flag")' + structure = st_matrix("structure")' + stripe = st_matrixcolstripe("e(b)") + + catnames = stripe[.,1] + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + catnames_no_bl = select(catnames, nonzero_b_flag :== 1) + + // Clean variable names + labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + labels_no_bl = (regexm(labels_no_bl, "^L\.") :* + (regexr(labels_no_bl, "^L\.", "") :+ "_L1")) :+ + (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) + + // Add category suffix only for non-prop odds vars (unique_flag == 1) + labels_no_bl = labels_no_bl :+ + (("_" :+ catnames_no_bl) :* (unique_flag[1::rows(labels_no_bl)] :== 1)) + + // Filter by structure + final_labels = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) + + // Write labels via xl() — avoids stata() round-trip overhead per cell + // Column A rows 2+ (vertical), row 1 cols C+ (horizontal) + real scalar n_labs, i + n_labs = rows(final_labels) + string scalar path + path = st_global("dir_results") + "/reg_" + domain + ".xlsx" + class xl scalar xbook + xbook = xl() + xbook.load_book(path) + xbook.set_sheet(sheet) + for (i=1; i<=n_labs; i++) xbook.put_string(i+1, 1, final_labels[i]) + for (i=1; i<=n_labs; i++) xbook.put_string(1, i+2, final_labels[i]) + xbook.close_book() +} + +// Auto-extract labels for one stage of a Heckman model from e(b) column stripe. +// is_outcome=1 - outcome equation (all columns except "select"); +// is_outcome=0 - selection equation (columns where eq=="select"). +// Applies the same formatting as extract_and_export_labels. +// Writes vertically to col A (from row 2) and horizontally to row 1 (from col C). 
+void extract_heckman_stage_labels(string scalar domain, string scalar sheet, real scalar is_outcome) { + string matrix stripe + real matrix b_full + stripe = st_matrixcolstripe("e(b)") + b_full = st_matrix("e(b)") + + real scalar n, i + n = rows(stripe) + + string colvector labels_raw + labels_raw = J(0, 1, "") + + for (i = 1; i <= n; i++) { + string scalar eq_i + eq_i = stripe[i, 1] + + real scalar include + if (is_outcome) include = (eq_i != "select") + else include = (eq_i == "select") + + // Exclude base-category coefficients (zero b) + if (include & b_full[1, i] != 0) { + labels_raw = labels_raw \ stripe[i, 2] + } + } + + // Apply same formatting as extract_and_export_labels + string colvector labels + labels = usubinstr(labels_raw, "1.", "", 1) + labels = regexr(labels, "^_cons", "Constant") + // Handle lags: L.var -> var_L1 + labels = regexm(labels, "^L\.") :* (regexr(labels, "^L\.", "") :+ "_L1") :+ + (!regexm(labels, "^L\.") :* labels) + // Handle 1L.var + labels = regexm(labels, "^1L\.") :* (regexr(labels, "^1L\.", "") :+ "_L1") :+ + (!regexm(labels, "^1L\.") :* labels) + // Handle L2.var + labels = regexm(labels, "^L2\.") :* (regexr(labels, "^L2\.", "") :+ "_L2") :+ + (!regexm(labels, "^L2\.") :* labels) + + // Rename the inverse Mills ratio variable to a readable label + for (i = 1; i <= rows(labels); i++) { + if (labels[i] == "lambda") labels[i] = "InvMillsRatio" + } + + // Write via xl() + real scalar n_labs + n_labs = rows(labels) + string scalar path + path = st_global("dir_results") + "/reg_" + domain + ".xlsx" + class xl scalar xbook + xbook = xl() + xbook.load_book(path) + xbook.set_sheet(sheet) + for (i = 1; i <= n_labs; i++) xbook.put_string(i+1, 1, labels[i]) + for (i = 1; i <= n_labs; i++) xbook.put_string(1, i+2, labels[i]) + xbook.close_book() } end @@ -84,85 +285,166 @@ end HELPER PROGRAMS - Modular functions for common operations ==============================================================================*/ -* Check matrix eigenvalues 
* Load matrices and remove zero coefficients (baseline categories)
* Input : Stata matrices b (1 x k) and V (k x k), set by the caller
* Output: nonzero_b_flag (1 x k, 1 = coefficient kept),
*         b_trimmed (1 x n_keep) and V_trimmed (n_keep x n_keep)
capture program drop trim_matrices
program define trim_matrices

    confirm matrix b
    confirm matrix V

    local k = colsof(b)
    if rowsof(V) != `k' | colsof(V) != `k' {
        display as error "trim_matrices: V must be `k' x `k' to conform with b"
        exit 503
    }

    // Flag non-zero coefficients and collect their column indices in one pass
    matrix nonzero_b_flag = J(1, `k', 0)
    local keep_idx ""
    forvalues j = 1/`k' {
        if b[1, `j'] != 0 {
            matrix nonzero_b_flag[1, `j'] = 1
            local keep_idx "`keep_idx' `j'"
        }
    }
    local n_keep : word count `keep_idx'

    // Nothing left to export: stop with a clear message instead of failing
    // further down when a matrix with zero columns is requested
    if `n_keep' == 0 {
        display as error "trim_matrices: every coefficient in b is zero, nothing to export"
        exit 498
    }

    // Copy the kept coefficients and the matching rows/columns of V
    matrix b_trimmed = J(1, `n_keep', 0)
    matrix V_trimmed = J(`n_keep', `n_keep', 0)
    local r = 0
    foreach i of local keep_idx {
        local ++r
        matrix b_trimmed[1, `r'] = b[1, `i']
        local c = 0
        foreach j of local keep_idx {
            local ++c
            matrix V_trimmed[`r', `c'] = V[`i', `j']
        }
    }

    display "Matrices transferred successfully"

end


* Truncate trimmed matrices to first max_n non-zero estimates
* Requires b_trimmed, V_trimmed and nonzero_b_flag from trim_matrices.
capture program drop truncate_to_n
program define truncate_to_n

    syntax, maxn(integer)

    if `maxn' < 1 {
        display as error "truncate_to_n: maxn() must be a positive integer"
        exit 198
    }

    local n = colsof(b_trimmed)

    if `n' > `maxn' {

        // Keep the leading maxn columns/rows
        matrix b_trimmed = b_trimmed[1, 1..`maxn']
        matrix V_trimmed = V_trimmed[1..`maxn', 1..`maxn']

        // Keep nonzero_b_flag in step: once maxn kept coefficients have been
        // passed, every later entry is switched off
        local kept = 0
        forvalues i = 1/`=colsof(nonzero_b_flag)' {
            if nonzero_b_flag[1, `i'] == 1 {
                local ++kept
            }
            if `kept' > `maxn' {
                matrix nonzero_b_flag[1, `i'] = 0
            }
        }

        display "Truncated to first `maxn' non-zero estimates"
    }
    else {
        display "Fewer than `maxn' non-zero estimates (`n' found), no truncation needed"
    }

end


* Check var-cov matrix eigenvalues for stability (conformability)
* Reads V_trimmed; leaves scalars max_eig and min_ratio behind.
* Exits with 999 if the largest eigenvalue is below 1e-12 (near-singular) and
* with 506 if the smallest/largest eigenvalue ratio is below 1e-12.
capture program drop check_matrix_stability
program define check_matrix_stability

    // matrix symeigen returns the eigenvalues sorted from largest to smallest,
    // so the first entry is the maximum and the last the minimum
    matrix symeigen X lambda = V_trimmed
    scalar max_eig = lambda[1,1]
    scalar min_ratio = lambda[1, colsof(lambda)] / max_eig

    if max_eig < 1.0e-12 {
        // max_eig is a scalar, not a local macro: it has to be displayed as an
        // expression, `max_eig' would expand to nothing
        display as error "CRITICAL ERROR: Maximum eigenvalue is too small (" max_eig ")."
        display as error "The Variance-Covariance matrix is likely singular."
        exit 999
    }

    display "Stability Check Passed: Max Eigenvalue is " max_eig

    if min_ratio < 1.0e-12 {
        display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio
        exit 506
    }

    display "Stability Check Passed. Min/Max ratio: " min_ratio

end


* Paste estimates and var-cov matrices to Excel
* Writes b_trimmed as a column from B2 and V_trimmed from C2 into the sheet
* previously selected with putexcel set.
capture program drop write_all_to_excel
program define write_all_to_excel

    display "Writing Var-Cov matrix"
    quietly putexcel B2 = matrix(b_trimmed')
    quietly putexcel C2 = matrix(V_trimmed)
    display "Done"

end
xbook.set_sheet(st_local("sheet")) + for (i=1; i<=n; i++) xbook.put_string(i+1, 1, st_local("lbl" + strofreal(i))) + for (i=1; i<=n; i++) xbook.put_string(1, i+2, st_local("lbl" + strofreal(i))) + xbook.close_book() } - - * Now write using the column names from Mata - forvalues j = 1/`n_labels' { - quietly putexcel `col_`j''1 = "`lbl`j''" - } - + display "Exported `n_labels' labels to sheet `sheet'" - + end * Export goodness of fit statistics for probit/logit capture program drop export_gof_probit program define export_gof_probit - - syntax, row(integer) label(string) - - putexcel set "$dir_results/reg_socialcare", sheet("Gof") modify - + + syntax, domain(string) row(integer) label(string) + + putexcel set "$dir_results/reg_`domain'", sheet("Gof") modify + local row1 = `row' local row2 = `row' + 1 local row3 = `row' + 2 - + putexcel A`row1' = "`label'", bold putexcel A`row2' = "Pseudo R-squared" putexcel B`row2' = r2_p @@ -172,100 +454,126 @@ program define export_gof_probit putexcel F`row2' = chi2 putexcel E`row3' = "Log likelihood" putexcel F`row3' = ll - + end * Export goodness of fit statistics for OLS capture program drop export_gof_ols program define export_gof_ols - - syntax, row(integer) label(string) - - putexcel set "$dir_results/reg_socialcare", sheet("Gof") modify - + + syntax, domain(string) row(integer) label(string) + + putexcel set "$dir_results/reg_`domain'", sheet("Gof") modify + local row1 = `row' local row2 = `row' + 1 local row3 = `row' + 2 - + putexcel A`row1' = "`label'", bold putexcel A`row2' = "R-squared" putexcel B`row2' = r2 putexcel A`row3' = "N" putexcel B`row3' = N_sample - + end * Save raw results to Excel and Word capture program drop save_raw_results program define save_raw_results - - syntax, process(string) title(string) [ifcond(string)] - - * Save to Excel + + syntax, domain(string) process(string) title(string) [ifcond(string)] + + // Save to Excel matrix results = r(table) matrix results = results[1..6,1...]' - - putexcel set 
"$dir_raw_results/social_care/socialcare", /// + + putexcel set "$dir_raw_results/`domain'/`domain'", /// sheet("Process `process'") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) - - * Save to Word (conditional on outreg2 being installed) + + // Save to Word (conditional on outreg2 being installed) capture which outreg2 if _rc == 0 { if "`ifcond'" != "" { local note `"addnote("Note: Regression if condition = (`ifcond')")"' } - - * Check if probit/logit or OLS - if "`e(cmd)'" == "probit" | "`e(cmd)'" == "logit" { + + // Check if probit/logit/ologit or OLS + if "`e(cmd)'" == "probit" | "`e(cmd)'" == "logit" | "`e(cmd)'" == "ologit" | "`e(cmd)'" == "gologit2" { outreg2 stats(coef se pval) using /// - "$dir_raw_results/social_care/`process'.doc", replace /// + "$dir_raw_results/`domain'/`process'.doc", replace /// title("`title'") ctitle(Model) label side dec(2) noparen /// addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) `note' } else { outreg2 stats(coef se pval) using /// - "$dir_raw_results/social_care/`process'.doc", replace /// + "$dir_raw_results/`domain'/`process'.doc", replace /// title("`title'") ctitle(Model) label side dec(2) noparen /// addstat(R2, e(r2)) `note' } } - + end * Main export routine: combines matrix operations, stability checks, and Excel export capture program drop export_results_to_excel program define export_results_to_excel - - syntax, sheet(string) [probit] - - * Store estimates + + syntax, domain(string) sheet(string) [probit ologit gformula maxestimates(integer 11)] + + // Store estimates matrix b = e(b) matrix V = e(V) - - * Trim zero coefficients - mata: trim_matrices() - - * Check matrix stability - check_matrix_stability - - * Export to Excel - use modify mode (file already created in setup) - putexcel set "$dir_results/reg_socialcare", sheet("`sheet'") modify + + // For ologit, reorder cuts to end before trimming + if "`ologit'" == "ologit" { + mata: reorder_cuts_to_end() + } + 
+ // Trim zero coefficients + trim_matrices + + // For gformula, further truncate to first maxestimates non-zero estimates + if "`gformula'" == "gformula" { + truncate_to_n, maxn(`maxestimates') + } + + // Check matrix stability (skip for gformula) + if "`gformula'" != "gformula" { + check_matrix_stability + } + + // Export to Excel - use modify mode (file already created in setup) + putexcel set "$dir_results/reg_`domain'", sheet("`sheet'") modify putexcel A1 = "REGRESSOR" putexcel B1 = "COEFFICIENT" - - * Write coefficients cell-by-cell - mata: write_all_to_excel() - - * Extract and export labels - mata: extract_and_export_labels("`sheet'") - - * Store model statistics - if "`probit'" == "probit" { + + // Write coefficient and variance-covariance matrices + if "`gformula'" == "gformula" { + putexcel C1 = "VARIANCE" + mata: write_diagonal_to_excel() + } + else { + write_all_to_excel + } + + // Extract and export labels using bulk xl() writes (domain passed for file path) + if "`ologit'" == "ologit" { + mata: extract_and_export_labels("`domain'", "`sheet'", 0, 1) + } + else if "`gformula'" == "gformula" { + mata: extract_and_export_labels("`domain'", "`sheet'", `maxestimates', 0) + } + else { + mata: extract_and_export_labels("`domain'", "`sheet'", 0, 0) + } + + // Store model statistics + if "`probit'" == "probit" | "`ologit'" == "ologit" { scalar r2_p = e(r2_p) scalar chi2 = e(chi2) scalar ll = e(ll) @@ -274,80 +582,401 @@ program define export_results_to_excel scalar r2 = e(r2) } scalar N_sample = e(N) - + +end + + +* Split full Heckman e(b)/e(V) into outcome and selection equation sub-matrices. +* Stores: b_outcome, V_outcome, b_select, V_select. +* Reads from matrices named b and V (set from e(b) and e(V) before calling). +* Identifies equations by the column stripe: all non-"select" columns go to the +* outcome equation (this includes the IMR/lambda coefficient); "select" columns +* go to the selection equation. 
capture program drop split_heckman_matrices
program define split_heckman_matrices

    // Reads the Stata matrices b and V (full Heckman e(b)/e(V), set by the
    // caller) and creates b_outcome/V_outcome and b_select/V_select.
    // Columns whose equation name is "select" form the selection stage; every
    // other column goes to the outcome stage.

    // Sort the column positions of b by equation name (stripe read once)
    local eqs : coleq b
    local outcome_cols ""
    local select_cols ""
    local j = 0
    foreach eq of local eqs {
        local ++j
        if "`eq'" == "select" {
            local select_cols "`select_cols' `j'"
        }
        else {
            local outcome_cols "`outcome_cols' `j'"
        }
    }

    // Build b_<stage> and V_<stage> by copying the selected elements
    foreach stage in outcome select {
        local idx "``stage'_cols'"
        local n_`stage' : word count `idx'

        matrix b_`stage' = J(1, `n_`stage'', 0)
        matrix V_`stage' = J(`n_`stage'', `n_`stage'', 0)

        local r = 0
        foreach i of local idx {
            local ++r
            matrix b_`stage'[1, `r'] = b[1, `i']
            local c = 0
            foreach k of local idx {
                local ++c
                matrix V_`stage'[`r', `c'] = V[`i', `k']
            }
        }
    }

    display "Split Heckman matrices: `n_outcome' outcome eq params, `n_select' selection eq params"

end


* Write a label list to an Excel sheet both vertically (col A from row 2)
* and horizontally (row 1 from col C). Overwrites A1/B1 with standard headers.
capture program drop write_labels_to_excel
program define write_labels_to_excel

    syntax, domain(string) sheet(string) labels(string)

    putexcel set "$dir_results/reg_`domain'", sheet("`sheet'") modify
    putexcel A1 = "REGRESSOR"
    putexcel B1 = "COEFFICIENT"

    local pos = 0
    foreach var of local labels {
        local ++pos

        // Vertical: A2, A3, ...
        local row = `pos' + 1
        putexcel A`row' = "`var'"

        // Horizontal: C1, D1, ... (offset by 2 since cols A and B are used).
        // numtobase26() returns the Excel column name for any column number,
        // so the list is not limited to columns A..ZZ
        local col = `pos' + 2
        mata: st_local("colname", numtobase26(`col'))
        putexcel `colname'1 = "`var'"
    }

end


* Export one stage of a Heckman model to Excel.
* Caller must set matrices b and V to the relevant stage before calling.
* Trims zeros, checks stability, writes coefs + var-cov, then writes labels.
* equation() must be "outcome" or "select"; it decides which part of the
* e(b) column stripe the labels are taken from.
capture program drop export_heckman_stage
program define export_heckman_stage

    syntax, domain(string) sheet(string) equation(string)

    // Reject anything else up front: a typo would otherwise be treated as the
    // selection stage and export the wrong labels without any warning
    if !inlist("`equation'", "outcome", "select") {
        display as error `"export_heckman_stage: equation() must be "outcome" or "select""'
        exit 198
    }

    trim_matrices
    check_matrix_stability

    putexcel set "$dir_results/reg_`domain'", sheet("`sheet'") modify
    putexcel A1 = "REGRESSOR"
    putexcel B1 = "COEFFICIENT"
    write_all_to_excel

    local is_outcome = ("`equation'" == "outcome")
    mata: extract_heckman_stage_labels("`domain'", "`sheet'", `is_outcome')

end


/*==============================================================================
    COMPLETE WORKFLOW PROGRAMS
==============================================================================*/
"$dir_validation_data/`process'_sample", replace - - * Export results to Excel - if "`probit'" == "probit" { - export_results_to_excel, sheet("`sheet'") probit - export_gof_probit, row(`gofrow') label("`goflabel'") + + // Export results to Excel + if "`gformula'" == "gformula" { + export_results_to_excel, domain("`domain'") sheet("`sheet'") /// + gformula maxestimates(`maxestimates') + export_gof_ols, domain("`domain'") row(`gofrow') label("`goflabel'") + } + else if "`probit'" == "probit" { + export_results_to_excel, domain("`domain'") sheet("`sheet'") probit + export_gof_probit, domain("`domain'") row(`gofrow') label("`goflabel'") } else { - export_results_to_excel, sheet("`sheet'") - export_gof_ols, row(`gofrow') label("`goflabel'") + export_results_to_excel, domain("`domain'") sheet("`sheet'") + export_gof_ols, domain("`domain'") row(`gofrow') label("`goflabel'") } - - * Clean up + + // Clean up drop in_sample p scalar drop _all matrix drop _all - + end -* Specialized workflow for multinomial logit models -capture program drop process_mlogit -program define process_mlogit - - syntax, process(string) sheet(string) title(string) gofrow(integer) /// - goflabel(string) outcomes(integer) [ifcond(string)] - - * Save raw results (skip outreg2 for mlogit - it has issues) - matrix results = r(table) - matrix results = results[1..6,1...]' - putexcel set "$dir_raw_results/social_care/socialcare", /// - sheet("Process `process'") replace - putexcel A3 = matrix(results), names nformat(number_d2) - putexcel J4 = matrix(e(V)) - - * Save sample for validation - gen in_sample = e(sample) - - * Generate predictions (number depends on outcomes) - if `outcomes' == 3 { - predict p1 p2 p3 - } - else if `outcomes' == 4 { - predict p1 p2 p3 p4 - } - else if `outcomes' == 5 { - predict p1 p2 p3 p4 p5 - } - - save "$dir_validation_data/`process'_sample", replace - - * Export results to Excel - export_results_to_excel, sheet("`sheet'") probit - export_gof_probit, row(`gofrow') 
label("`goflabel'") - - * Clean up - drop in_sample p* - scalar drop _all - matrix drop _all - + +* Specialized workflow for ordered logit models +capture program drop process_ologit +program define process_ologit + + syntax, domain(string) process(string) sheet(string) title(string) /// + gofrow(integer) goflabel(string) [ifcond(string)] + + // Save raw results + save_raw_results, domain("`domain'") process("`process'") /// + title("`title'") ifcond("`ifcond'") + + // Save sample for validation + gen in_sample = e(sample) + predict p + save "$dir_validation_data/`process'_sample", replace + + // Export results to Excel + export_results_to_excel, domain("`domain'") sheet("`sheet'") ologit + + // Export GoF + export_gof_probit, domain("`domain'") row(`gofrow') label("`goflabel'") + + // Clean up + drop in_sample p + scalar drop _all + matrix drop _all + +end + + +* Specialized workflow for generalized ordered logit models +capture program drop process_gologit +program define process_gologit + + syntax, domain(string) process(string) sheet(string) title(string) /// + gofrow(integer) goflabel(string) outcomes(integer) [ifcond(string)] + // Note: outcomes() = total number of categories INCLUDING the base category + + // Save raw results + matrix results = r(table) + matrix results = results[1..6,1...]' + putexcel set "$dir_raw_results/`domain'/`domain'", /// + sheet("Process `process'") modify + putexcel A3 = matrix(results), names nformat(number_d2) + putexcel J4 = matrix(e(V)) + + // Save to Word + capture which outreg2 + if _rc == 0 { + if "`ifcond'" != "" { + local note `"addnote("Note: Regression if condition = (`ifcond')")"' + } + outreg2 stats(coef se pval) using /// + "$dir_raw_results/`domain'/`process'.doc", replace /// + title("`title'") ctitle(Education level) label side dec(2) noparen /// + addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) `note' + } + + // Save sample and predictions + gen in_sample = e(sample) + local plist "" + forvalues k = 
1/`outcomes' { + local plist "`plist' p`k'" + } + predict `plist' + save "$dir_validation_data/`process'_sample", replace + + // Store model summary statistics + scalar r2_p = e(r2_p) + scalar chi2 = e(chi2) + scalar ll = e(ll) + scalar N_sample = e(N) + + // Store estimates in matrices + matrix b = e(b) + matrix V = e(V) + + // Raw output + putexcel set "$dir_results/reg_`domain'", sheet("`sheet'_raw") modify + putexcel A1 = matrix(b'), names nformat(number_d2) + putexcel A1 = "CATEGORY" + putexcel B1 = "REGRESSOR" + putexcel C1 = "COEFFICIENT" + + // Build gologit structure + mata: build_gologit_structure(`outcomes') + + // Eigenvalue stability check + matrix symeigen X lambda = nonzero_var_structure + scalar max_eig = lambda[1,1] + scalar min_ratio = lambda[1, colsof(lambda)] / max_eig + if max_eig < 1.0e-12 { + display as error "CRITICAL ERROR: Variance-covariance matrix is near singular." + exit 999 + } + if min_ratio < 1.0e-12 { + display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio + exit 506 + } + display "VCV stability check passed. 
Max eigenvalue: " max_eig
+    display "Min/Max ratio: " min_ratio
+
+    // Export matrices to Excel
+    putexcel set "$dir_results/reg_`domain'", sheet("`sheet'") modify
+    putexcel A1 = "REGRESSOR"
+    putexcel B1 = "COEFFICIENT"
+    putexcel B2 = matrix(nonzero_b_structure')
+    putexcel C2 = matrix(nonzero_var_structure)
+
+    // Extract and export labels using bulk xl() writes (domain passed for file path)
+    mata: export_labels_gologit("`domain'", "`sheet'")
+
+    // Goodness of fit
+    putexcel set "$dir_results/reg_`domain'", sheet("Gof") modify
+    local row2 = `gofrow' + 1
+    local row3 = `gofrow' + 2
+    putexcel A`gofrow' = "`goflabel'", bold
+    putexcel A`row2' = "Pseudo R-squared"
+    putexcel B`row2' = r2_p
+    putexcel A`row3' = "N"
+    putexcel B`row3' = N_sample
+    putexcel E`row2' = "Chi^2"
+    putexcel F`row2' = chi2
+    putexcel E`row3' = "Log likelihood"
+    putexcel F`row3' = ll
+
+    // Clean up
+    drop in_sample `plist'
+    scalar drop _all
+    matrix drop _all
+
+end
+
+* Specialized workflow for Heckman wage model
+*
+* Post-estimation routine: it reads e(b), e(V), e(sigma) and e(sample), so the
+* estimation results of the model must still be active when it is called.
+*
+* Options (all required):
+*   process()        tag used for the in_sample_<process> variable, the
+*                    histogram file name and the RMSE row label
+*   ifcond()         condition restricting predictions, graph, summaries
+*                    and the RMSE sample
+*   savefile()       file name under the dir_validation_data folder for the
+*                    saved dataset
+*   graphsubtitle()  subtitle of the diagnostic histogram
+*   wordfile()       outreg2 target file (overwritten)
+*   wordtitle()      outreg2 table title
+*   wordctitle()     outreg2 column title
+*   sheet2()         Excel sheet for the outcome equation (domain "wages")
+*   sheet1()         Excel sheet for the selection equation
+*                    (domain "employment_selection")
+*   rmserow()        row of sheet "UK" in reg_RMSE.xlsx that receives the RMSE
+*
+* Requires existing variables wage_hour, lwage_hour, lwage_hour_hat and
+* pred_hourly_wage (the last two are only updated with -replace-), and the
+* globals dir_raw_results, dir_validation_data, dir_results and weight.
+* Side effects: leaves in_sample_<process> in the data and drops ALL scalars
+* and matrices on exit.
+capture program drop process_heckman
+program define process_heckman
+
+    syntax, process(string) ifcond(string) savefile(string) ///
+        graphsubtitle(string) ///
+        wordfile(string) wordtitle(string) ///
+        wordctitle(string) ///
+        sheet2(string) sheet1(string) ///
+        rmserow(integer)
+
+    // Raw Word output (skipped silently when outreg2 is not installed)
+    capture which outreg2
+    if _rc == 0 {
+        outreg2 stats(coef se pval) using "`wordfile'", replace ///
+            title("`wordtitle'") ctitle("`wordctitle'") ///
+            label side dec(2) noparen
+    }
+
+    // Stability check on full joint var-cov (captures e() before any matrix work)
+    local sigma_val = e(sigma)
+    matrix b = e(b)
+    matrix V = e(V)
+    // NOTE(review): trim_matrices and check_matrix_stability are defined
+    // outside this program; presumably they work on the matrices b and V
+    // set just above - confirm against their definitions.
+    trim_matrices
+    check_matrix_stability
+
+    // Predictions and bias correction (log to level)
+    cap drop pred epsilon
+    predict pred if `ifcond', ycond
+    replace lwage_hour_hat = pred if `ifcond'
+    // Drop a flag left by an earlier call with the same process() so the
+    // program can be re-run on the same data (same idiom as pred/epsilon)
+    cap drop in_sample_`process'
+    gen in_sample_`process' = e(sample)
+    // One standard-normal draw per observation scaled by e(sigma); no seed
+    // is set here, so reproducibility depends on the caller
+    gen epsilon = rnormal() * `sigma_val'
+    replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if `ifcond'
+
+    // Diagnostic histogram
+    twoway ///
+        (hist wage_hour if `ifcond', width(0.5) lcolor(gs12) fcolor(gs12)) ///
+        (hist pred_hourly_wage if `ifcond' & (!missing(wage_hour)), ///
+            width(0.5) fcolor(none) lcolor(red)), ///
+        title("Gross Hourly Wage (Level)") subtitle("`graphsubtitle'") ///
+        xtitle("GBP") legend(lab(1 "UKHLS") lab(2 "Prediction")) ///
+        note("Notes: Sample condition `ifcond'", size(vsmall))
+    graph export "${dir_raw_results}/wages/`process'_hist.png", replace
+    graph drop _all
+
+    // Weighted comparison of observed and predicted wages on the same sample
+    sum wage_hour if `ifcond' [aw=${weight}]
+    sum pred_hourly_wage if `ifcond' & (!missing(wage_hour)) [aw=$weight]
+
+    // Save validation data
+    save "$dir_validation_data/`savefile'", replace
+    cap drop pred epsilon
+
+    // Split and export results to Excel
+    matrix b = e(b)
+    matrix V = e(V)
+    // NOTE(review): split_heckman_matrices is defined elsewhere; it is
+    // expected to create b_outcome, V_outcome, b_select and V_select,
+    // which are read immediately below - confirm.
+    split_heckman_matrices
+
+    // Second stage, reg_wages
+    matrix b = b_outcome
+    matrix V = V_outcome
+    export_heckman_stage, domain("wages") sheet("`sheet2'") equation("outcome")
+
+    // First stage, reg_employment_selection
+    matrix b = b_select
+    matrix V = V_select
+    export_heckman_stage, domain("employment_selection") sheet("`sheet1'") ///
+        equation("select")
+
+    // RMSE (unweighted, in logs, over observations satisfying ifcond);
+    // preserve/restore keeps the full dataset intact for the caller
+    cap drop residuals squared_residuals
+    gen residuals = lwage_hour - lwage_hour_hat
+    gen squared_residuals = residuals^2
+    preserve
+    keep if `ifcond'
+    sum squared_residuals
+    local rmse = sqrt(r(mean))
+    di "RMSE for `process': " `rmse'
+    putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+    putexcel A1 = "REGRESSOR"
+    putexcel B1 = "COEFFICIENT"
+    putexcel A`rmserow' = "`process'"
+    putexcel B`rmserow' = `rmse'
+    restore
+    cap drop residuals squared_residuals
+
+    // NOTE(review): no variable named lambda is created in this program;
+    // presumably this clears a leftover from elsewhere - confirm.
+    cap drop lambda
+    scalar drop _all
+    matrix drop _all
+end
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do b/input/InitialPopulations/compile/RegressionEstimates/reg_education.do
deleted file mode 100644
index 43a17efbb..000000000
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do
+++ /dev/null @@ -1,889 +0,0 @@ -****************************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Education -* OBJECT: Final Probit & Generalised Logit Models - Weighted -* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: -* -***************************************************************************************** - -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_education.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet -putexcel set "$dir_results/reg_education", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters governing projection of education status" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold - -putexcel A5 = "E1a" -putexcel B5 = "Probit regression estimates of exiting education" - -putexcel A6 = "E1b" -putexcel B6 = "Probit regression estimates of returning to education" - -putexcel A7 = "E2" -putexcel B7 = "Generalized ordered logit regression estimates of education attainment - individuals aged 16+ exiting education." - -putexcel A8 = "E2_raw" -putexcel B8 = "Raw generalized ordered logit regression estimates of education attainment - individuals aged 16+ exiting education. Useful for the 'Gologit predictor' file." 
- -putexcel A10 = "Notes:", bold -putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" -putexcel B11 = "Conditions for processes are defined as globals in master.do" -//putexcel B12 = "E1a: Compared to the previous version, where age and age squared were used, age is now centered (at age 23) and its effect is allowed to change after age 18." - -putexcel set "$dir_results/reg_education", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - - -/********************************* PREPARE DATA *******************************/ - -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - - -/********************************** ESTIMATION ********************************/ - -/****************** E1a: PROBABILITY OF REMAINING IN EDUCATION ****************/ -display "${e1a_if_condition}" - -probit Dst i.Dgn Dag Dag_sq /*Dag_c Dag_c_sq Dag_post18_sq*/ li.Ded /// - li.Dehmf_c3_Medium li.Dehmf_c3_Low /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${e1a_if_condition} /// - [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/education/education", sheet("Process E1a") /// - replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using "$dir_raw_results/education/E1a.doc", /// - replace /// -title("Process E1a: Probability Remaining In Education") /// - ctitle(Continuing student) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${e1a_if_condition})"') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample estimate validation -save "$dir_validation_data/E1a_sample", replace 
- -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Save estimates for use in SimPaths - -* Store estimates -matrix b = e(b) -matrix V = e(V) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. 
Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_education", sheet("E1a") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_education", sheet("E1a") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") 
+ "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_education", sheet("E1a") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Export model fit statistics -putexcel set "$dir_results/reg_education", sheet("Gof") modify - -putexcel A3 = "E1a - Leaving education", bold - -putexcel A5 = "Pseudo R-squared" -putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - - -/****************** E1b: PROBABILITY OF RETURNING TO EDUCATION ****************/ -display "${e1b_if_condition}" - -probit der i.Dgn Dag Dag_sq li.Dcpst_Partnered /// -li.Deh_c4_High li.Deh_c4_Low /// -li.Dehmf_c3_Medium li.Dehmf_c3_Low /// -li.Les_c3_NotEmployed li.Les_c3_Employed /// -l.Dnc l.Dnc02 /// -$regions Year_transformed Y2020 Y2021 $ethnicity /// -if ${e1b_if_condition} /// - [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/education/education", sheet("Process E1b") /// - modify -putexcel A3 = matrix(results), 
names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using "$dir_raw_results/education/E1b.doc", /// - replace /// -title("Process E1b: Probability Returning To Education") /// - ctitle(Returning student) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${e1b_if_condition})"') - - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for later use (internal validation) -save "$dir_validation_data/E1b_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Prepare to store results in Excel - -* Eliminate rows and columns containing zeros (baseline cats) -matrix b = e(b) -matrix V = e(V) - - -mata: - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. 
Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_education", sheet("E1b") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_education", sheet("E1b") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - - -* Use Mata to extract nice labels from colstripe of e(b) (replacement for Stata 14) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - 
outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) - - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_education", sheet("E1b") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Export model fit statistics -putexcel set "$dir_results/reg_education", sheet("Gof") modify - -putexcel A8 = "E1b - Returning to education", bold - -putexcel A10 = "Pseudo R-squared" -putexcel B10 = r2_p -putexcel A11 = "N" -putexcel B11 = N_sample -putexcel E10 = "Chi^2" -putexcel F10 = chi2 -putexcel E11 = "Log likelihood" -putexcel F11 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - - -/****************** E2: EDUCATION ATTAINMENT WHEN LEAVE SCHOOL ****************/ -display "${e2_if_condition}" - -gologit2 deh_c3_recoded i.Dgn Dag Dag_sq /// - i.L_Dehmf_c3_Medium i.L_Dehmf_c3_Low /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${e2_if_condition} [pw=dwt], autofit - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/education/education", sheet("Process E2") /// - modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - - -outreg2 
stats(coef se pval) using "$dir_raw_results/education/E2.doc", /// - replace /// -title("Process E2: Educational Attainment When Leave School") /// - ctitle(Education level) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${e2_if_condition})"') - - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p1 p2 p3 - -* Save sample for estimates validation -save "$dir_validation_data/E2_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Raw output -putexcel set "$dir_results/reg_education", sheet("E2_raw") modify -putexcel A1 = matrix(b'), names nformat(number_d2) -putexcel A1 = "CATEGORY" -putexcel B1 = "REGRESSOR" -putexcel C1 = "COEFFICIENT" - -* Estimated coefficients -scalar no_coefs_all = colsof(b) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - nonzero_b = select(b, keep) - - nonzero_b - - // Return to Stata - st_matrix("nonzero_b", nonzero_b) - st_matrix("nonzero_b_flag", keep) -end - -* Inspect -matrix list b -matrix list nonzero_b -matrix list nonzero_b_flag - -* Save dimensions -scalar no_nonzero_b = colsof(nonzero_b) -scalar no_nonzero_b_per = no_nonzero_b / 2 - -* Address repetition of proportional odds covariates - -* Generate repetition/unique observation flag -mata: - // Import matrices into mata - nonzero_b_mata = st_matrix("nonzero_b") - - // Generate binary vector =1 if coefficient repeated - n = cols(nonzero_b_mata) - repetition_flag = J(n, 1, 0) - - // use tolerance based comparison to avoid precision errors - tol = 1e-8 - - for (i = 1; i <= n; i++) { - for (j = 1; j <= n; j++) { - if (i != j && 
abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) { - repetition_flag[i] = 1 - break - } - } - } - repetition_flag - - // Generate binary vector =1 if coefficient not repeated - unique_flag = 1 :- repetition_flag - - // Return to Stata - st_matrix("repetition_flag", repetition_flag') - st_matrix("unique_flag", unique_flag') - -end - -* Generate vector to multiply the coef vector with to eliminate the repetitions -* of coefficients for vars that satify the proportional odds assumptions -matrix structure_a = J(1,no_nonzero_b_per,1) -matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b] -matrix structure = structure_a, structure_b - -* Inspect -matrix list structure_a -matrix list structure_b -matrix list structure -matrix list nonzero_b - -* Eliminate repetitions -mata: - // Call matrices into mata - var = st_matrix("var") - structure = st_matrix("structure") - nonzero_b = st_matrix("nonzero_b") - - // Convert reptitions into zeros - b_structure = structure :* nonzero_b - - b_structure - - // Eliminate zeros - keep = (b_structure :!= 0) - - nonzero_b_structure = select(b_structure, keep) - - // Export to Stata - st_matrix("b_structure", b_structure) - st_matrix("nonzero_b_structure", nonzero_b_structure) - -end - -matrix list nonzero_b_structure - -* Export into Excel -putexcel set "$dir_results/reg_education", sheet("E2") modify -putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) - - -* Variance-covariance matrix -* Eliminate zeros (baseline categories) -mata: - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - V_trimmed - - // Return to Stata - st_matrix("var", V_trimmed) -end - -matrix list var - - -* Address repetition due to proportional odds being satisfied for some covars -matrix square_structure_a = J(no_nonzero_b,1,1) * structure -matrix square_structure_b = square_structure_a' 
- -matrix list square_structure_a -matrix list square_structure_b -mata: - // Call matrices into mata - var = st_matrix("var") - - // Create structure matrix (0 = eliminate) - square_structure_a = st_matrix("square_structure_a") - square_structure_b = st_matrix("square_structure_b") - - // Element-by-element multiplication - square_structure = square_structure_a :* square_structure_b - var_structure = square_structure :* var - - // Eliminate zeros - row_keep = rowsum(abs(var_structure)) :!= 0 - col_keep = colsum(abs(var_structure)) :!= 0 - - nonzero_var_structure = select(select(var_structure, row_keep), col_keep) - - // Return to Stata - st_matrix("nonzero_var_structure", nonzero_var_structure) -end - -matrix list nonzero_var_structure - -* Export to Excel -putexcel set "$dir_results/reg_education", sheet("E2") modify -putexcel C2 = matrix(nonzero_var_structure) - -*======================================================================= -* Eigenvalue stability check for trimmed variance-covariance matrix - -matrix symeigen X lambda = nonzero_var_structure - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Ratio of smallest to largest eigenvalue -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near-singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Variance-covariance matrix is near singular." - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "Matrix is ill-conditioned." - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed." 
-display "Max eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio -*======================================================================= - -* Labels -preserve - -putexcel set "$dir_results/reg_education", sheet("E2") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - - - * Use Mata to extract nice labels from colstripe of e(b) (replacement for Stata 14) -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -* Run Mata block -mata: - // Import matrices from Stata - nonzero_b_flag = st_matrix("nonzero_b_flag")' - unique_flag = st_matrix("unique_flag")' - structure = st_matrix("structure")' - stripe = st_matrixcolstripe("e(b)") - - // Extract variable and category names - catnames = stripe[.,1] - varnames = stripe[.,2] - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - catnames_no_bl = select(catnames, nonzero_b_flag :== 1) - - // Handle lags - labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) - - // Add category name when flag is not unique - labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) - - // Clean labels - labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Filter for structure == 1 - nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) - - // Add header row - nonzero_labels_structure = "v1"\nonzero_labels_structure - - // Write to temporary file - fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w") - for (i=1; i<=rows(nonzero_labels_structure); i++) { - fput(fh, nonzero_labels_structure[i]) - } - fclose(fh) -end - - * Import cleaned labels into Stata as new dataset - import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_education", sheet("E2") 
modify - - * Vertical labels - sum n, meanonly - local N = r(max)+1 - - forvalue i = 2/`N' { - local j = `i' - 1 - putexcel A`i' = v1[`j'] - } - - * Horizontal labels - sum n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - *Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Goodness of fit - -putexcel set "$dir_results/reg_education", sheet("Gof") modify - -putexcel A13 = "E2 - Education attainment", bold - -putexcel A15 = "Pseudo R-squared" -putexcel B15 = r2_p -putexcel A16 = "N" -putexcel B16 = N_sample - - -* Clean up -drop in_sample p1 p2 p3 -scalar drop _all -matrix drop _all - - -capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do deleted file mode 100644 index f26a980c1..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do +++ /dev/null @@ -1,306 +0,0 @@ -********************************************************************************* -* PROJECT: SimPaths UK -* SECTION: Fertility -* OBJECT: Final Probit Models -* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: -* Combined former a and b processes. 
-******************************************************************************** -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_fertility.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_fertility", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters governing projection of fertility" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold -putexcel A5 = "F1" -putexcel B5 = "Probit regression estimates of the probability of having a child for women aged 18-44" - -putexcel A10 = "Notes:", bold -putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" -putexcel B11 = "Conditions for processes are defined as globals in master.do" -putexcel B12 = "Combined former processes F1a and F1b" - -putexcel set "$dir_results/reg_fertility", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - -*-------------------------------------------------- -* Any-children dummy (dchpd collapsing) -*-------------------------------------------------- -replace dchpd = 1 if inlist(dchpd, 2, 3, 4, 5) -fre dchpd - -/********************************** ESTIMATION ********************************/ - -/*********************** F1: PROBABILITY OF HAVING A CHILD ********************/ -display "${f1_if_condition}" - -probit dchpd /// - i.Ded Dag Dag_sq /// - l.Dhe_pcs l.Dhe_mcs /// - 
Dcpst_Single li.Dcpst_Single /// - /*Ded_Dag Ded_Dhe_pcs Ded_Dhe_mcs*/ /// - Ded_Dcpst_Single /*Ded_Dcpst_Single_L1*/ /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// - l.Dnc l.Dnc02 /// - i.Deh_c4_Low i.Deh_c4_High /// - FertilityRate /// - /*li.Les_c3_Student*/ li.Les_c3_NotEmployed /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${f1_if_condition} [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/fertility/fertility", /// - sheet("Process F1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using "$dir_raw_results/fertility/F1.doc", replace /// - title("Process F1: Probability of Having a Child") /// - ctitle(Having a Child) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${f1_if_condition})"') - - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for estimate validation -save "$dir_validation_data/F1_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Store results in Excel - -* Store estimates -matrix b = e(b) -matrix V = e(V) - -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar 
min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_fertility", sheet("F1") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_fertility", sheet("F1") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* 
labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_fertility", sheet("F1") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - - -* Export model fit statistics -putexcel set "$dir_results/reg_fertility", sheet("Gof") modify - -putexcel A3 = "U1- Partnership formation", bold - -putexcel A5 = "Pseudo R-squared" -putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - -capture log close - diff --git 
a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do b/input/InitialPopulations/compile/RegressionEstimates/reg_health.do deleted file mode 100644 index 5195f1269..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do +++ /dev/null @@ -1,614 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Health -* OBJECT: Health status and Disability -* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: Combined former a and b processes. -* -******************************************************************************** -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_health.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_health", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters governing projection self-reported health status" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold - -putexcel A5 = "H1" -putexcel B5 = "Generalized ordered logit regression estimates of self reported health status" -putexcel B6 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table" -putexcel B7 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name." 
* Remaining documentation rows; these go to whichever workbook/sheet the
* most recent -putexcel set- selected.
putexcel A8 = "H1_raw"
putexcel B8 = "Raw generalized ordered logit regression estimates of self reported health status. Useful for the 'Gologit predictor' file."

putexcel A11 = "H2"
putexcel B11 = "Probit regression estimates of the probability of being long-term sick or disabled"

putexcel A15 = "Notes:", bold
putexcel B15 = "Estimation sample: UK_ipop.dta with grossing up weight dwt"
putexcel B16 = "Conditions for processes are defined as globals in master.do"
putexcel B17 = "Combined former processes H1a and H1b"

* Switch to the goodness-of-fit sheet and write its heading
putexcel set "$dir_results/reg_health", sheet("Gof") modify
putexcel A1 = "Goodness of fit", bold


/********************************* PREPARE DATA *******************************/

* Estimation sample, declared as a person-by-wave panel and ordered the same way
use "${estimation_sample}", clear
xtset idperson swv
sort idperson swv

* Variable adjustments are delegated to a separate do-file
do "${dir_do}/variable_update.do"


/********************************** ESTIMATION ********************************/

/********************** H1: SELF-REPORTED HEALTH STATUS ***********************/

* Echo the sample restriction to the log
display "${h1_if_condition}"

* Right-hand side of H1, collected once so the estimation command stays short.
* Ded_Dag, Ded_Dag_sq, Ded_Dgn and L_Les_c4_Student are commented out of the
* specification in the original command and are therefore not listed here.
local h1_covariates ///
    Ded Dgn Dag Dag_sq ///
    L_Dhe_pcs L_Dhe_mcs ///
    i.Deh_c4_Medium i.Deh_c4_Low i.Deh_c4_Na ///
    L_Les_c4_NotEmployed L_Les_c4_Retired ///
    L_Ydses_c5_Q2 L_Ydses_c5_Q3 L_Ydses_c5_Q4 L_Ydses_c5_Q5 ///
    L_Dhhtp_c4_CoupleChildren L_Dhhtp_c4_SingleNoChildren ///
    L_Dhhtp_c4_SingleChildren L_Dlltsd01 ///
    $regions Year_transformed Y2020 Y2021 $ethnicity

* Generalized ordered logit of dhe, weighted by dwt, with the autofit option
gologit2 dhe `h1_covariates' if ${h1_if_condition} [pweight = dwt], autofit

* Reading the output: a gologit2 coefficient is the effect of the covariate on
* the log-odds of being above a given category versus at or below it.
* ------------------------------------------------------------------------------
* H1 post-estimation: export raw results, then build the coefficient vector and
* variance-covariance matrix written to the "H1" sheet.
*   1. drop zero coefficients (baseline categories);
*   2. keep a single copy of every coefficient that is repeated across
*      equations (covariates for which gologit2 imposed parallel lines);
*   3. apply the same selection to the rows/columns of V;
*   4. check the eigenvalues of the trimmed V and only then export.
* ------------------------------------------------------------------------------

* Save raw results (first six rows of r(table), transposed: one row per coefficient)
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/health/health", ///
    sheet("Process H1") replace
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

outreg2 stats(coef se pval) using ///
    "$dir_raw_results/health/H1.doc", replace ///
    title("Process H1: Self-Reported Health Status") ///
    ctitle(Health) label side dec(2) noparen ///
    addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) ///
    addnote(`"Note: Regression if condition = (${h1_if_condition})"')


* Save sample inclusion indicator and predicted probabilities
* (five predicted probabilities, one per outcome category)
gen in_sample = e(sample)
predict p1 p2 p3 p4 p5

* Save sample for estimate validation
save "$dir_validation_data/H1_sample", replace

* Store model summary statistics
scalar r2_p = e(r2_p)
scalar N_sample = e(N)


* Store results in Excel

* Store estimates in matrices
matrix b = e(b)
matrix V = e(V)

* Raw output: the full coefficient vector with equation and variable names
putexcel set "$dir_results/reg_health", sheet("H1_raw") modify
putexcel A1 = matrix(b'), names nformat(number_d2)
putexcel A1 = "CATEGORY"
putexcel B1 = "REGRESSOR"
putexcel C1 = "COEFFICIENT"

* Estimated coefficients
scalar no_coefs_all = colsof(b)

* Eliminate zero coefficients (baseline categories)
mata:
    // Call matrices into mata
    b = st_matrix("b")

    // Find which coefficients are nonzero
    keep = (b :!= 0)

    // Eliminate zeros
    nonzero_b = select(b, keep)

    // Inspect
    nonzero_b

    // Return to Stata; the flag is reused later to select the matching labels
    st_matrix("nonzero_b", nonzero_b)
    st_matrix("nonzero_b_flag", keep)
end

* Inspect
matrix list b
matrix list nonzero_b
matrix list nonzero_b_flag

* Save dimensions
* The divisor 4 is the number of equations (outcome categories - 1); it is
* hard-coded and must change together with the five-category predict above.
scalar no_nonzero_b = colsof(nonzero_b)
scalar no_nonzero_b_per = no_nonzero_b / 4 // number of categories-1

* Address repetition of proportional odds covariates

* Flag coefficients whose value also appears elsewhere in the vector
mata:
    // Import matrices into mata
    nonzero_b_mata = st_matrix("nonzero_b")

    // Generate binary vector =1 if coefficient repeated
    n = cols(nonzero_b_mata)
    repetition_flag = J(n, 1, 0)

    // Use a tolerance-based comparison to avoid precision errors.
    // NOTE(review): two different covariates whose estimates agree to within
    // tol would also be flagged as repeated - confirm this cannot happen.
    tol = 1e-8

    for (i = 1; i <= n; i++) {
        for (j = 1; j <= n; j++) {
            if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) {
                repetition_flag[i] = 1
                break
            }
        }
    }
    repetition_flag

    // Generate binary vector =1 if coefficient not repeated
    unique_flag = 1 :- repetition_flag

    // Return to Stata as row vectors
    st_matrix("repetition_flag", repetition_flag')
    st_matrix("unique_flag", unique_flag')

end

* Selection vector: keep every coefficient of the first equation, and from the
* remaining equations keep only coefficients that are not repetitions
matrix structure_a = J(1,no_nonzero_b_per,1)
matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b]
matrix structure = structure_a, structure_b

* Inspect
matrix list structure_a
matrix list structure_b
matrix list structure
matrix list nonzero_b

* Eliminate repetitions from the coefficient vector
mata:
    // Call matrices into mata
    structure = st_matrix("structure")
    nonzero_b = st_matrix("nonzero_b")

    // Convert repetitions into zeros
    b_structure = structure :* nonzero_b

    b_structure

    // Eliminate zeros
    keep = (b_structure :!= 0)

    nonzero_b_structure = select(b_structure, keep)

    // Export to Stata
    st_matrix("b_structure", b_structure)
    st_matrix("nonzero_b_structure", nonzero_b_structure)

end

matrix list nonzero_b_structure


* Variance-covariance matrix
* Eliminate rows and columns of zero coefficients (baseline categories)
mata:
    V = st_matrix("V")
    b = st_matrix("b")

    // Find which coefficients are nonzero
    keep = (b :!= 0)

    // Eliminate zeros: columns first, then (via transpose) rows
    V_trimmed = select(V, keep)
    V_trimmed = select(V_trimmed', keep)'

    V_trimmed

    // Return to Stata
    st_matrix("var", V_trimmed)
end

matrix list var

* Address repetition due to proportional odds being satisfied for some covars:
* expand the selection vector into a square 0/1 mask (outer product with itself)
matrix square_structure_a = J(no_nonzero_b,1,1) * structure
matrix square_structure_b = square_structure_a'

matrix list square_structure_a
matrix list square_structure_b
mata:
    // Call matrices into mata
    var = st_matrix("var")

    // Create structure matrix (0 = eliminate)
    square_structure_a = st_matrix("square_structure_a")
    square_structure_b = st_matrix("square_structure_b")

    // Element-by-element multiplication
    square_structure = square_structure_a :* square_structure_b
    var_structure = square_structure :* var

    // Eliminate rows and columns that are entirely zero
    row_keep = rowsum(abs(var_structure)) :!= 0
    col_keep = colsum(abs(var_structure)) :!= 0

    nonzero_var_structure = select(select(var_structure, row_keep), col_keep)

    // Return to Stata
    st_matrix("nonzero_var_structure", nonzero_var_structure)
end

matrix list nonzero_var_structure

*=======================================================================
* Eigenvalue stability check for trimmed variance-covariance matrix
* (run before anything is written to the H1 sheet)

matrix symeigen X lambda = nonzero_var_structure

* Largest eigenvalue
scalar max_eig = lambda[1,1]

* Ratio of smallest to largest eigenvalue
scalar min_ratio = lambda[1, colsof(lambda)] / max_eig

* Check 1: near-singularity
if max_eig < 1.0e-12 {
    display as error "CRITICAL ERROR: Variance-covariance matrix is near singular."
    display as error "Max eigenvalue = " max_eig
    exit 999
}

* Check 2: ill-conditioning
if min_ratio < 1.0e-12 {
    display as error "Matrix is ill-conditioned."
    display as error "Min/Max eigenvalue ratio = " min_ratio
    exit 506
}

* Export to Excel only once the matrix has passed both checks:
* coefficients in column B (with placeholder names), VCV from cell C2
putexcel set "$dir_results/reg_health", sheet("H1") modify
putexcel A1 = matrix(nonzero_b_structure'), names nformat(number_d2)
putexcel C2 = matrix(nonzero_var_structure)

display "VCV stability check passed."
display "Max eigenvalue: " max_eig
display "Min/Max ratio: " min_ratio
*=======================================================================

* ------------------------------------------------------------------------------
* Labels for the H1 sheet
* The data in memory are preserved because the label list is imported as a
* dataset further down.
* ------------------------------------------------------------------------------
preserve
putexcel set "$dir_results/reg_health", sheet("H1") modify

* Column headings
putexcel A1 = "REGRESSOR"
putexcel B1 = "COEFFICIENT"

* The results directory is handed to Mata through a local macro; any temp
* file left over from an earlier run is removed first.
local dir_results "$dir_results"
capture erase "$dir_results/temp_labels.txt"

* Build one label per retained coefficient from the column stripe of e(b)
mata:
    // Selection vectors arrive as row vectors; work with columns
    keep_flag  = st_matrix("nonzero_b_flag")'
    uniq_flag  = st_matrix("unique_flag")'
    struct_vec = st_matrix("structure")'
    colstripe  = st_matrixcolstripe("e(b)")

    // Equation (category) and variable names of the nonzero coefficients
    eq_kept  = select(colstripe[., 1], keep_flag :== 1)
    var_kept = select(colstripe[., 2], keep_flag :== 1)
    n_kept   = rows(var_kept)

    // Assemble each label in turn
    lab = J(n_kept, 1, "")
    for (k = 1; k <= n_kept; k++) {
        nm = var_kept[k]

        // Lagged covariates: L_var -> var_L1
        if (regexm(nm, "^L_")) {
            nm = regexr(nm, "^L_", "") + "_L1"
        }

        // An underscore is always appended; the category name follows only
        // when the coefficient is not repeated across equations.
        // NOTE(review): repeated (parallel-lines) covariates therefore end in
        // a bare "_" - confirm that the consumer of this sheet expects that.
        nm = nm + "_"
        if (uniq_flag[k] != 0) {
            nm = nm + eq_kept[k]
        }

        // Drop the first "1." (factor-variable prefix) and rename the constant
        nm = usubinstr(nm, "1.", "", 1)
        lab[k] = regexr(nm, "^_cons", "Constant")
    }

    // Keep the labels selected by the structure vector and put the header
    // "v1" on top so that the import below names the variable v1
    out_rows = "v1" \ select(lab, struct_vec[1::n_kept] :== 1)

    // One label per line in the temporary text file
    fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w")
    for (k = 1; k <= rows(out_rows); k++) {
        fput(fh, out_rows[k])
    }
    fclose(fh)
end

* Read the labels back as a dataset (replaces the data in memory) and number them
import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
gen n = _n

* Point putexcel at the H1 sheet for the label export that follows
putexcel set "$dir_results/reg_health", sheet("H1") modify
* ------------------------------------------------------------------------------
* Write the H1 labels (held in v1, one per observation) next to the
* coefficients and above the variance-covariance matrix.
* ------------------------------------------------------------------------------

* Number of labels
summarize n, meanonly
local n_labels = r(max)

* Vertical labels: label k goes to cell A(k+1), i.e. rows 2, 3, ...
forvalues k = 1/`n_labels' {
    local row = `k' + 1
    putexcel A`row' = v1[`k']
}

* Horizontal labels: label k heads column k+2, i.e. columns C, D, ...
* The loop stops at the last label; running one step further would read past
* the end of v1 and write an empty header cell beyond the last VCV column.
forvalues k = 1/`n_labels' {

    * Convert the 1-based column index into Excel letters (C, ..., Z, AA, ...)
    local idx = `k' + 2
    local col ""
    while `idx' > 0 {
        local rem = mod(`idx' - 1, 26)
        local col = char(65 + `rem') + "`col'"
        local idx = floor((`idx' - 1)/26)
    }

    putexcel `col'1 = v1[`k']
}

* Clean up the temporary label file (best effort)
cap erase "$dir_results/temp_labels.txt"

* Bring the preserved data back into memory
restore


* Export model fit statistics
putexcel set "$dir_results/reg_health", sheet("Gof") modify

putexcel A3 = "H1 - Health status", bold

putexcel A5 = "Pseudo R-squared"
putexcel B5 = r2_p
putexcel A6 = "N"
putexcel B6 = N_sample

* Clean up: remove the H1 prediction variables and all scalars and matrices
* so that the next process starts from a clean state
drop in_sample p1 p2 p3 p4 p5
scalar drop _all
matrix drop _all


/**************** H2: PROBABILITY LONG-TERM SICK OR DISABLED ******************/

* Echo the sample restriction to the log
display "${h2_if_condition}"

* Probit of dlltsd01, weighted by dwt, with robust standard errors
probit dlltsd01 i.Dgn Dag Dag_sq ///
    Deh_c4_Medium Deh_c4_Low Deh_c4_Na ///
    L_Ydses_c5_Q2 L_Ydses_c5_Q3 L_Ydses_c5_Q4 L_Ydses_c5_Q5 ///
    L_Dhe_pcs L_Dhe_mcs ///
    L_Dlltsd01 ///
    L_Dhhtp_c4_CoupleChildren L_Dhhtp_c4_SingleNoChildren L_Dhhtp_c4_SingleChildren ///
    $regions Year_transformed Y2020 Y2021 $ethnicity ///
    if ${h2_if_condition} [pw = dwt], vce(robust)


* Save raw results (first six rows of r(table), transposed) to a new sheet of
* the raw health workbook
matrix results = r(table)
matrix results = results[1..6,1...]'
putexcel set "$dir_raw_results/health/health", sheet("Process H2") modify
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

* NOTE(review): the title below still says "H2b" although the sheet and the
* output file are named H2 - confirm which name is intended.
outreg2 stats(coef se pval) using "$dir_raw_results/health/H2.doc", replace ///
    title("Process H2b: Probit regression estimates for being long-term sick or disabled - people aged 16+ not in continuous education") ///
    ctitle(long-term sick or disabled) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))

* Sample inclusion indicator for the H2 estimation sample
gen in_sample = e(sample)
-predict p - -* Save sample for later use (internal validation) -save "$dir_validation_data/H2_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. 
Min/Max ratio: " min_ratio - - -* Export into Excel -putexcel set "$dir_results/reg_health", sheet("H2") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -putexcel set "$dir_results/reg_health", sheet("H2") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - - -* Mata: extract and clean labels -mata: - // Import matrices - nonzero_b_flag = st_matrix("nonzero_b_flag")' - stripe = st_matrixcolstripe("e(b)") - - // Extract varnames from stripe (2nd column) - varnames = stripe[.,2] - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // Clean label vector - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - labels_no_bl = regexm(labels_no_bl, "^L\\.") :* (regexr(labels_no_bl, "^L\\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\\.") :* labels_no_bl) - labels_no_bl = regexm(labels_no_bl, "^1L\\.") :* (regexr(labels_no_bl, "^1L\\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\\.") :* labels_no_bl) - labels_no_bl = regexr(labels_no_bl, "_Dgn_L1$", "_Dgn") - - // Save as macro for writing labels from Stata - st_local("nice_labels", invtokens(labels_no_bl')) -end - -* Save cleaned labels into your original file -capture file close labelout -file open labelout using "$dir_results/temp_labels.txt", write replace -file write labelout "v1" _n // header for import -foreach lbl in `nice_labels' { - file write labelout "`lbl'" _n -} -file close labelout - -* Import cleaned labels from your file -import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) -gen n = _n - -* Export to Excel (vertical layout in column A) -putexcel set "$dir_results/reg_health", sheet("H2") modify -summarize n, meanonly -local N = r(max) + 1 -forvalue i = 2/`N' { - local j = `i' - 1 - putexcel A`i' = v1[`j'] -} - -* Export to Excel (horizontal layout in row 1, starting at column C) -forvalues j = 1/`N' { - local n = `j' + 2 // shift index: col 
C = 3 - local col "" - local nn = `n' - while `nn' > 0 { - local rem = mod(`nn' - 1, 26) - local col = char(65 + `rem') + "`col'" - local nn = floor((`nn' - 1)/26) - } - putexcel `col'1 = v1[`j'] -} - -* Clean up original file -erase "$dir_results/temp_labels.txt" - - -* Export model fit statistics -putexcel set "$dir_results/reg_health", sheet("Gof") modify - -putexcel A15 = "H2-Long-term sick/disabled or on disability benefits", bold -putexcel A17 = "Pseudo R-squared" -putexcel B17 = r2_p -putexcel A18 = "N" -putexcel B18 = N_sample -putexcel E17 = "Chi^2" -putexcel F17 = chi2 -putexcel E18 = "Log likelihood" -putexcel F18 = ll - -* Clean up -//drop in_sample p -scalar drop _all -matrix drop _all - - -capture log close - diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do deleted file mode 100644 index 3a2dd6308..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do +++ /dev/null @@ -1,333 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Home ownership -* OBJECT: Final Regresion Models - Weighted -* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: Re-estimated process at benefit unit level to be consistent with SimPaths -* -******************************************************************************** -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_home_ownership.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model 
parameters governing projection of home ownership" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold -putexcel A5 = "HO1" -putexcel B5 = "Probit regression estimates of the probability of being a home owner, aged 18+" - -putexcel A10 = "Notes:", bold -putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" -putexcel B11 = "Conditions for processes are defined as globals in master.do" -putexcel B12 = "Re-estimated process at benefit unit level to be consistent with SimPaths" - -putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - -*-------------------------------------------------- -* Create sample at benefti unit head -*-------------------------------------------------- - -* Keep adults (18+) -keep if dag >= 18 - -* Count unique benefit-unit–wave combinations BEFORE head selection -egen tag_bu_wave = tag(idbenefitunit swv) -count if tag_bu_wave -local n_bu_before = r(N) -display "Number of benefit unit–wave combinations BEFORE selecting head: `n_bu_before'" - -* Sort benefit unit members within each wave: -* 1. Highest non-benefit income (ypnbihs_dv) -* 2. Highest age (dag) -* 3. 
Lowest idperson (idperson) -gsort idbenefitunit swv -ypnbihs_dv -dag idperson - -* Tag the first person (the "head") per benefit unit and wave -bysort idbenefitunit swv: gen benunit_head = (_n == 1) - -* Keep only benefit unit heads -keep if benunit_head == 1 - -* Count unique benefit-unit–wave combinations AFTER head selection -drop tag_bu_wave -egen tag_bu_wave = tag(idbenefitunit swv) -count if tag_bu_wave -local n_bu_after = r(N) -display "Number of benefit unit–wave combinations AFTER selecting head: `n_bu_after'" - -* Ensure benefit unit–wave counts match before and after head selection -assert `n_bu_before' == `n_bu_after' - -* Verify only one head per benefit unit per wave -by idbenefitunit swv, sort: gen n=_N -assert n==1 - -sort idperson swv -/********************************** ESTIMATION ********************************/ - -/********************** HO1: PROBABILITY OF OWNING HOME ***********************/ -display "${ho1_if_condition}" - -probit dhh_owned i.Dgn Dag Dag_sq /// - il.Dhhtp_c8_2 il.Dhhtp_c8_3 il.Dhhtp_c8_4 il.Dhhtp_c8_5 il.Dhhtp_c8_6 il.Dhhtp_c8_7 il.Dhhtp_c8_8 /// - il.Les_c4_Student il.Les_c4_NotEmployed il.Les_c4_Retired /// - i.Deh_c4_Medium i.Deh_c4_Low i.Deh_c4_Na /// - l.Dhe_mcs l.Dhe_pcs /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// - l.Yptciihs_dv /// - l.Dhh_owned /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${ho1_if_condition} [pw=dwt], vce(cluster idperson) - - * Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/home_ownership/home_ownership", /// - sheet("Process HO1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/home_ownership/HO1.doc", replace /// -title("Process H01: Probability Own Home") /// - ctitle(Own home) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - 
addnote(`"Note: Regression if condition = (${ho1_if_condition}). Only estimated on benefit unit heads."') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for stimate validation -save "$dir_validation_data/HO1_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Store results in Excel - -* Store estimates -matrix b = e(b) -matrix V = e(V) - -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. 
Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_home_ownership", sheet("HO1") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_home_ownership", sheet("HO1") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = 
st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_home_ownership", sheet("HO1") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Export model fit statistics -putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify - -putexcel A3 = "HO1 - Home ownership", bold - -putexcel A5 = "Pseudo R-squared" -putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - -drop in_sample p -scalar drop r2_p N_sample chi2 ll - -capture log close - diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do b/input/InitialPopulations/compile/RegressionEstimates/reg_income.do deleted file mode 100644 index 788c10b05..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do +++ /dev/null @@ -1,1387 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Non-employment/non-benefit income -* OBJECT: Final Regresion Models -* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven -* LAST UPDATE: 21 Jan 2026 DP -* 
COUNTRY: 			UK
*
* NOTES: 			Models for split income variable
*						- Capital returns (I1a selection, I1b amount)
*						- Private pension income (I2b, I3a, I3b)
*
* 					The income do file must be run after
* 					reg_wages.do because it uses predicted wages.
*
* 					Each process follows the same pipeline:
* 					  1. estimate the model
* 					  2. dump raw results (putexcel + outreg2)
* 					  3. save the validation sample
* 					  4. export trimmed coefficients / var-cov matrix and
* 					     labels to "$dir_results/reg_income"
* 					  5. (amount models only) export the RMSE
* 					  6. export goodness-of-fit statistics
* 					Steps 4 and 5 are shared and live in the helper programs
* 					defined below.
********************************************************************************
clear all
set more off
set mem 200m
set type double
//set maxvar 120000
set maxvar 30000

*******************************************************************
cap log close
log using "${dir_log}/reg_income.log", replace
*******************************************************************


/****************************** HELPER ROUTINES *******************************/

* NOTE: helpers are defined after -clear all-, which would otherwise drop them.

mata:
// Reads e(b) and e(V) of the most recent estimation command, removes every
// coefficient that is exactly zero (base / omitted categories) and returns:
//   - Stata matrix b_trimmed : k x 1 vector of retained coefficients
//   - Stata matrix V_trimmed : k x k var-cov matrix of retained coefficients
//   - caller's local `labels': space-separated regressor labels, in the same
//                              order as b_trimmed
void income_trim_estimates()
{
    real rowvector   b, keep
    real matrix      V
    string matrix    stripe
    string colvector labels
    real scalar      i

    b    = st_matrix("e(b)")
    V    = st_matrix("e(V)")
    keep = (b :!= 0)

    // select() with a row-vector criterion picks columns, hence the
    // transpose to drop the matching rows of V as well
    st_matrix("b_trimmed", select(b, keep)')
    st_matrix("V_trimmed", select(select(V, keep)', keep)')

    // Second column of the column stripe holds the coefficient names
    stripe = st_matrixcolstripe("e(b)")
    labels = select(stripe[., 2], keep')

    for (i = 1; i <= rows(labels); i++) {
        // Strip the first factor-level prefix "1." and rename the constant
        labels[i] = usubinstr(labels[i], "1.", "", 1)
        labels[i] = regexr(labels[i], "^_cons", "Constant")

        // Time-series operators become suffixes:
        //   L.var / 1L.var -> var_L1,  L2.var -> var_L2
        if (regexm(labels[i], "^L\.")) {
            labels[i] = regexr(labels[i], "^L\.", "") + "_L1"
        }
        if (regexm(labels[i], "^1L\.")) {
            labels[i] = regexr(labels[i], "^1L\.", "") + "_L1"
        }
        if (regexm(labels[i], "^L2\.")) {
            labels[i] = regexr(labels[i], "^L2\.", "") + "_L2"
        }
    }

    st_local("labels", invtokens(labels'))
}
end


* Exports the current estimation results to sheet `sheet' of
* "$dir_results/reg_income":
*   A1 = "REGRESSOR", B1 = "COEFFICIENT", A2.. = labels, B2.. = coefficients,
*   C1.. = labels (horizontal), C2.. = var-cov matrix.
* Aborts (exit 999 / 506) before writing anything if the trimmed var-cov
* matrix fails the eigenvalue checks.
* Leaves e() untouched so that -predict- can still be used afterwards.
capture program drop income_export_estimates
program define income_export_estimates

    syntax , SHeet(string)

    * Eliminate rows and columns containing zeros (baseline cats);
    * also fills local `labels'
    mata: income_trim_estimates()

    * Inspection
    matrix list b_trimmed
    matrix list V_trimmed

    * Eigenvalue tests for var-cov invertibility in SimPaths
    * (-matrix symeigen- returns eigenvalues sorted largest to smallest)
    tempname eigvec eigval max_eig min_ratio
    matrix symeigen `eigvec' `eigval' = V_trimmed
    scalar `max_eig'   = `eigval'[1, 1]
    scalar `min_ratio' = `eigval'[1, colsof(`eigval')] / `max_eig'

    * Outcome of max eigenvalue test
    if `max_eig' < 1.0e-12 {
        display as error "CRITICAL ERROR: Maximum eigenvalue is too small (" `max_eig' ")."
        display as error "The Variance-Covariance matrix is likely singular."
        exit 999
    }
    display "Stability Check Passed: Max Eigenvalue is " `max_eig'

    * Outcome of eigenvalue ratio test
    if `min_ratio' < 1.0e-12 {
        display as error "Matrix is ill-conditioned. Min/Max ratio: " `min_ratio'
        exit 506
    }
    display "Stability Check Passed. Min/Max ratio: " `min_ratio'

    * Export into Excel
    putexcel set "$dir_results/reg_income", sheet("`sheet'") modify
    putexcel A1 = "REGRESSOR"
    putexcel B1 = "COEFFICIENT"
    putexcel B2 = matrix(b_trimmed)
    putexcel C2 = matrix(V_trimmed)

    * Labels: vertical in column A (from row 2), horizontal in row 1
    * (from column C)
    local k : word count `labels'
    forvalues i = 1/`k' {

        local lab : word `i' of `labels'

        * Vertical label
        local row = `i' + 1
        putexcel A`row' = "`lab'"

        * Horizontal label: convert column index (shifted by 2 to start
        * from column C) into Excel column letters (A..Z, AA..)
        local n = `i' + 2
        local col ""
        while `n' > 0 {
            local rem = mod(`n' - 1, 26)
            local col = char(65 + `rem') + "`col'"
            local n   = floor((`n' - 1)/26)
        }
        putexcel `col'1 = "`lab'"
    }

end


* Computes the weighted RMSE of the current regression among observations
* with `received' == 1 and writes it to row `row' of sheet "UK" in
* "$dir_results/reg_RMSE.xlsx" (column A = process name, column B = RMSE).
* Creates/overwrites the variables residuals and squared_residuals.
capture program drop income_export_rmse
program define income_export_rmse

    syntax , RECeived(varname) ROW(integer) PROCess(string) MESsage(string)

    cap drop residuals squared_residuals
    predict residuals , residuals
    gen squared_residuals = residuals^2

    sum squared_residuals [w = dwt] if `received' == 1

    * Store the result before -putexcel- so it does not depend on r()
    * surviving the -putexcel set- call
    tempname rmse
    scalar `rmse' = sqrt(r(mean))
    di "`message'" `rmse'

    putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
    putexcel A`row' = ("`process'") B`row' = (`rmse')

end


/********************************* INFO SHEET *********************************/

* Set Excel file
* Info sheet
putexcel set "$dir_results/reg_income", sheet("Info") replace
putexcel A1 = "Description:"
putexcel B1 = "This file contains regression estiamtes used by processes I1 (capital income), I2 (private pension, retired last year), I3 (private pension income, not retired last year) "
putexcel A2 = "Authors:	Patryk Bronka, Justin Van de Ven, Daria Popova, Aleksandra Kolndrekaj"
putexcel A3 = "Last edit: 18 Feb 2026 AK"

putexcel A4 = "Process:", bold
putexcel B4 = "Description:", bold

putexcel A6 = "Process I1a"
putexcel B6 = "Logit regression estimates of the probability of receiving capital income "

putexcel A8 = "Process I1b"
putexcel B8 = "OLS regression estimates (ihs) capital income amount - who receive capital income"

putexcel A10 = "Process I2b"
putexcel B10 = "OLS regression estimates (ihs) private pension income amount - aged 50+ and were retired last yeare"

putexcel A12 = "Process I3a"
putexcel B12 = "Logit regression estimates of the probability of receiving private pension income - aged 50+ and not a student or retired last year"

putexcel A14 = "Process I3b"
putexcel B14 = "OLS regression estimates (ihs) private pension income - aged 50+ and not a student or retired last year"

putexcel A17 = "Notes:", bold
putexcel B17 = "Estimation sample: UK_ipop2.dta with grossing up weight dwt"
putexcel B18 = "Conditions for processes are defined as globals in master.do"
putexcel B19 = "Combined former capital income processes I3a and I3b and renamed as I1a and I1b"
putexcel B20 = "Income variables are IHS transformed."


/**************************************************************/
* prepare data on real growth of wages
/**************************************************************/

* Import real growth index and rebase it so that 2015 = 1
import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_gdp") firstrow clear
rename Year stm
rename Value growth
gen base_val = growth if stm == 2015
sum base_val
replace base_val = r(mean)
replace growth = growth/base_val
drop base_val
replace stm = stm - 2000
* Forward slash: same path the merge below reads from, and portable
save "$dir_external_data/growth_rates", replace


/********************************* PREPARE DATA *******************************/

* Load data
use "${estimation_sample2}", clear	//panel with predicted wages

* Merge in growth rates
merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(growth)

* Set data
xtset idperson swv
sort idperson swv

* adjust capital income: set values at or above the 99th percentile to missing
sum ypncp, det
scalar p99 = r(p99)
replace ypncp = . if ypncp >= p99

* adjust pension income: set values at or above the 99th percentile to missing
sum ypnoab, det
scalar p99 = r(p99)
replace ypnoab = . if ypnoab >= p99

* rename predicted wage (only when reg_wages.do has produced it)
capture confirm variable pred_hourly_wage
if _rc == 0 {
	gen Hourly_wage = pred_hourly_wage
}


/********************************** ESTIMATION ********************************/

/*************** I1a: PROBABILITY OF RECEIVEING CAPITAL INCOME ****************/

display "${i1a_if_condition}"

logit receives_ypncp ///
	i.Ded i.Dgn c.Dag c.Dag_sq ///
	l.Dhe_pcs l.Dhe_mcs ///
	lc.Ypncp lc.Yplgrs_dv ///
	l2c.Yplgrs_dv l2c.Ypncp ///
	Ded_Dgn /*Ded_Dag Ded_Dag_sq*/ ///
	l.Ded_Dhe_pcs l.Ded_Dhe_mcs ///
	l.Ded_Ypncp l.Ded_Yplgrs_dv l2.Ded_Yplgrs_dv l2.Ded_Ypncp ///
	i.Deh_c4_Low i.Deh_c4_Medium i.Deh_c4_High ///
	li.Les_c4_Student li.Les_c4_NotEmployed li.Les_c4_Retired ///
	li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren ///
	$regions Year_transformed Y2020 Y2021 $ethnicity if ///
	${i1a_if_condition} [pweight = dwt], ///
	vce(cluster idperson) base

* Save raw results
* (first write to the raw-results workbook, hence -replace-)
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/income/income", ///
	sheet("Process I1") replace
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

outreg2 stats(coef se pval) using ///
	"$dir_raw_results/income/Selection_I1a.doc", replace ///
	title("Process I1a: Probability Receiving Capital Income") ///
	ctitle(Receives capital income) label side dec(2) noparen ///
	addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) ///
	addnote(`"Note: Regression if condition = (${i1a_if_condition})"')

* Save sample inclusion indicator and predicted probabilities
cap drop in_sample
cap drop p
gen in_sample = e(sample)
predict p

* Save sample for estimates validation
save "$dir_validation_data/I1_selection_sample", replace

* Store model summary statistics
scalar r2_p     = e(r2_p)
scalar N_sample = e(N)
scalar chi2     = e(chi2)
scalar ll       = e(ll)

* Store results in Excel
income_export_estimates, sheet("I1a")

* Export model fit statistics
putexcel set "$dir_results/reg_income", sheet("Gof") modify

putexcel A3 = ///
	"I1a - Receiving capital income ", ///
	bold

putexcel A5 = "Pseudo R-squared"
putexcel B5 = r2_p
putexcel A6 = "N"
putexcel B6 = N_sample
putexcel E5 = "Chi^2"
putexcel F5 = chi2
putexcel E6 = "Log likelihood"
putexcel F6 = ll

* Clean up
drop in_sample p
scalar drop _all
matrix drop _all


/********************** I1b: AMOUNT OF CAPITAL INCOME *************************/

* DV: ypncp = Inverse hyperbolic sine (IHS) of gross capital income
display "${i1b_if_condition}"

reg ypncp i.Dgn c.Dag c.Dag_sq ///
	i.Deh_c4_Low i.Deh_c4_Medium i.Deh_c4_High ///
	li.Les_c4_Student li.Les_c4_NotEmployed li.Les_c4_Retired ///
	li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren ///
	l.Dhe_pcs l.Dhe_mcs ///
	lc.Ypncp l2c.Ypncp lc.Yplgrs_dv l2c.Yplgrs_dv ///
	Ded_Dgn /*Ded_Dag Ded_Dag_sq*/ ///
	l.Ded_Ypncp l.Ded_Yplgrs_dv l2.Ded_Yplgrs_dv l2.Ded_Ypncp ///
	$regions Year_transformed Y2020 Y2021 $ethnicity ///
	if ${i1b_if_condition} [pw=dwt], vce(cluster idperson)

* Save raw results
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/income/income", sheet("Process I1b") modify
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

outreg2 stats(coef se pval) using ///
	"$dir_raw_results/income/Amount_I1b.doc", replace ///
	title("Process I1b: Capital Income Amount") ///
	ctitle(Capital amount) label side dec(2) noparen ///
	addstat("R2", e(r2)) ///
	addnote(`"Note: Regression if condition = (${i1b_if_condition})"')

* Save sample inclusion indicator, predictions and residual std. dev.
cap drop in_sample
cap drop p
gen in_sample = e(sample)
predict p
cap drop sigma
gen sigma = e(rmse)

* Save sample for estimate validation
save "$dir_validation_data/I1_level_sample", replace

* Store model summary statistics
scalar r2       = e(r2)
scalar N_sample = e(N)

* Store results in Excel
income_export_estimates, sheet("I1b")

* Calculate RMSE among capital income recipients
income_export_rmse, received(receives_ypncp) row(6) process("I1b") ///
	message("RMSE for Amount of capital income")

* Export model fit statistics
putexcel set "$dir_results/reg_income", sheet("Gof") modify

putexcel A9 = "I1b - Capital income amount", ///
	bold

putexcel A11 = "R-squared"
putexcel B11 = r2
putexcel A12 = "N"
putexcel B12 = N_sample

* Clean up
drop in_sample p
scalar drop _all
matrix drop _all


/******************** I2b: AMOUNT OF PRIVATE PENSION INCOME *******************/

*Sample: Retired individuals who were retired in the previous year.
*ypnoab = Inverse hyperbolic sine transformation of Gross personal private pension income

display "${i2b_if_condition}"

reg ypnoab i.Dgn c.Dag ///
	i.Deh_c4_High i.Deh_c4_Medium i.Deh_c4_Na ///
	li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren ///
	l.Dhe_pcs l.Dhe_mcs ///
	lc.Ypnoab l2c.Ypnoab ///
	$regions Year_transformed Y2020 Y2021 $ethnicity ///
	if ${i2b_if_condition} [pw=dwt], vce(cluster idperson)

* Save raw results
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/income/income", sheet("Process I2b") modify
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

* Title names the pension model (was mislabelled as capital income)
outreg2 stats(coef se pval) using ///
	"$dir_raw_results/income/Amount_I2b.doc", replace ///
	title("Process I2b: Private Pension Income Amount") ///
	ctitle(Private Pension Income amount) label side dec(2) noparen ///
	addstat("R2", e(r2)) ///
	addnote(`"Note: Regression if condition = (${i2b_if_condition})"')

* Save sample inclusion indicator, predictions and residual std. dev.
cap drop in_sample
cap drop p
gen in_sample = e(sample)
predict p
cap drop sigma
gen sigma = e(rmse)

* Save sample for estimate validation
save "$dir_validation_data/I2_level_sample", replace

* Store model summary statistics
scalar r2       = e(r2)
scalar N_sample = e(N)

* Store results in Excel
income_export_estimates, sheet("I2b")

* Calculate RMSE among private pension income recipients
* (receives_ypnoab, not the capital income indicator receives_ypncp)
income_export_rmse, received(receives_ypnoab) row(7) process("I2b") ///
	message("RMSE for Amount of private pension income")

* Export model fit statistics
putexcel set "$dir_results/reg_income", sheet("Gof") modify

putexcel A15 = ///
	"I2b - Private Pension income amount", ///
	bold

putexcel A17 = "R-squared"
putexcel B17 = r2
putexcel A18 = "N"
putexcel B18 = N_sample

* Clean up
drop in_sample p
scalar drop _all
matrix drop _all


/*********** I3a: PROBABILITY OF RECEIVEING PRIVATE PENSION INCOME ************/

*Sample: Retired individuals who were not retired in the previous year.

display "${i3a_if_condition}"

logit receives_ypnoab ///
	i.Dgn i.Reached_Retirement_Age ///
	i.Deh_c4_High i.Deh_c4_Medium i.Deh_c4_Na ///
	li.Les_c4_NotEmployed ///
	li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren ///
	l.Dhe_pcs l.Dhe_mcs ///
	l.Hourly_wage ///
	$regions Year_transformed Y2020 Y2021 $ethnicity if ///
	${i3a_if_condition} [pweight = dwt], vce(cluster idperson) base

* Save raw results
* -modify-, not -replace-: -replace- would overwrite the workbook and discard
* the Process I1 / I1b / I2b sheets written above
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/income/income", ///
	sheet("Pension Income selection") modify
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

outreg2 stats(coef se pval) using ///
	"$dir_raw_results/income/Selection_I3a.doc", replace ///
	title("Process I3a: Probability Receiving Private Pension Income") ///
	ctitle(Receives private pesnion income) label side dec(2) noparen ///
	addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) ///
	addnote(`"Note: Regression if condition = (${i3a_if_condition})"')

* Save sample inclusion indicator and predicted probabilities
cap drop in_sample
cap drop p
gen in_sample = e(sample)
predict p

* Save sample for estimates validation
save "$dir_validation_data/I3_selection_sample", replace

* Store model summary statistics
scalar r2_p     = e(r2_p)
scalar N_sample = e(N)
scalar chi2     = e(chi2)
scalar ll       = e(ll)

* Store results in Excel
income_export_estimates, sheet("I3a")

* Export model fit statistics
putexcel set "$dir_results/reg_income", sheet("Gof") modify

putexcel A21 = ///
	"I3a - Receiving private pension income", ///
	bold
putexcel A23 = "Pseudo R-squared"
putexcel B23 = r2_p
putexcel A24 = "N"
putexcel B24 = N_sample
putexcel E23 = "Chi^2"
putexcel F23 = chi2
putexcel E24 = "Log likelihood"
putexcel F24 = ll

* Clean up
drop in_sample p
scalar drop _all
matrix drop _all


/******************** I3b: AMOUNT OF PRIVATE PENSION INCOME *******************/

*Sample: Retired individuals who were not retired in the previous year.
*ypnoab = Inverse hyperbolic sine transformation of Gross personal private pension income

display "${i3b_if_condition}"

reg ypnoab i.Dgn c.Dag ///
	i.Deh_c4_High i.Deh_c4_Medium i.Deh_c4_Na ///
	li.Les_c4_NotEmployed ///
	li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren ///
	l.Dhe_pcs l.Dhe_mcs ///
	l.Hourly_wage ///
	$regions Year_transformed Y2020 Y2021 $ethnicity ///
	if ${i3b_if_condition} [pw=dwt], vce(cluster idperson)

* Save raw results
matrix results = r(table)
matrix results = results[1..6,1...]'

putexcel set "$dir_raw_results/income/income", sheet("Process I3b") modify
putexcel A3 = matrix(results), names nformat(number_d2)
putexcel J4 = matrix(e(V))

outreg2 stats(coef se pval) using ///
	"$dir_raw_results/income/Amount_I3b.doc", replace ///
	title("Process I3b: Private Pension Income Amount") ///
	ctitle(Private Pension Income amount) label side dec(2) noparen ///
	addstat("R2", e(r2)) ///
	addnote(`"Note: Regression if condition = (${i3b_if_condition})"')

* Save sample inclusion indicator, predictions and residual std. dev.
cap drop in_sample
cap drop p
gen in_sample = e(sample)
predict p
cap drop sigma
gen sigma = e(rmse)

* Save sample for estimate validation
save "$dir_validation_data/I3_level_sample", replace

* Store model summary statistics
scalar r2       = e(r2)
scalar N_sample = e(N)

* Store results in Excel
income_export_estimates, sheet("I3b")

* Calculate RMSE among private pension income recipients
* (receives_ypnoab, not the capital income indicator receives_ypncp)
income_export_rmse, received(receives_ypnoab) row(8) process("I3b") ///
	message("RMSE for Amount of private pension income")

* Export model fit statistics
putexcel set "$dir_results/reg_income", sheet("Gof") modify

putexcel A27 = ///
	"I3b - Private Pension income amount", ///
	bold

putexcel A28 = "R-squared"
putexcel B28 = r2
putexcel A29 = "N"
putexcel B29 = N_sample

* Clean up
drop in_sample p
scalar drop _all
matrix drop _all


//end

capture log close
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_leave_parental_home.do
b/input/InitialPopulations/compile/RegressionEstimates/reg_leave_parental_home.do deleted file mode 100644 index 87a28dea3..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_leave_parental_home.do +++ /dev/null @@ -1,295 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Leaving Parental Home -* OBJECT: Final Probit Regression Model -* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: -********************************************************************************** - -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - - -******************************************************************* -cap log close -log using "${dir_log}/reg_leave_parental_home.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_leave_parental_home", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters governing leaving parental home" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 19 Jan 2026 DP" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold -putexcel A5 = "P1a" -putexcel B5 = "Probit regression estimates for leaving the parental home, transitioning out of adult child status" - -putexcel A10 = "Notes:", bold -putexcel B10 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" -putexcel B11 = "Conditions for processes are defined as globals in master.do" - -putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort 
idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - - -/********************************** ESTIMATION ********************************/ - -/**************** P1: PROBABILITY OF LEAVING THE PARENTAL HOME ****************/ -display "${p1_if_condition}" - -probit dlftphm i.Dgn Dag Dag_sq li.Deh_c4_Na li.Deh_c4_Medium li.Deh_c4_Low /// - li.Les_c3_Student li.Les_c3_NotEmployed /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${p1_if_condition} [pw=dwt], vce(robust) - - * Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/leave_parental_home/leave_parental_home", /// - sheet("Process P1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/leave_parental_home/P1.doc", replace /// -title("Process P1: Probability Leave the Parental Home") /// - ctitle(Leave home) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${p1_if_condition})"') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for estiamte validation -save "$dir_validation_data/P1_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to 
Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. Min/Max ratio: " min_ratio - - -* Export into Excel -putexcel set "$dir_results/reg_leave_parental_home", sheet("P1") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_leave_parental_home", sheet("P1") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = 
usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_leave_parental_home", sheet("P1") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - - -* Export model fit statistics -putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify - -putexcel A3 = "P1 - Leaving the parental home ", bold - -putexcel A5 = "Pseudo R-squared" 
-putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - -capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do deleted file mode 100644 index 1517b9ac3..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do +++ /dev/null @@ -1,544 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Unions -* OBJECT: Final Probit Models -* AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 4 Feb 2026 DP -* COUNTRY: UK -* -*NOTES: -* Combined former a and b processes. -******************************************************************************** - -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_partnership.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_partnership", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters for relationship status projection" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold -putexcel A5 = "U1" -putexcel B5 = "Probit regression estimates probability of entering a partnership - single respondents aged 18+" -putexcel A6 = "U2" -putexcel B6 = "Probit regression estimates of probability of exiting a partnership - cohabiting women aged 18+" - -putexcel A10 = "Notes:", bold -putexcel B10 = "Estimation sample: 
UK_ipop.dta with grossing up weight dwt" -putexcel B11 = "Conditions for processes are defined as globals in master.do" -putexcel B12 = "Combined former processes U1a and U1b" - -putexcel set "$dir_results/reg_partnership", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - - -/********************************** ESTIMATION ********************************/ - -/******************** U1: PROBABILITY FORMING PARTNERSHIP *********************/ -display "${u1_if_condition}" - -probit dcpen i.Ded Dgn Dag Dag_sq lc.Dnc lc.Dnc02 /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// - /*Ded_Dag Ded_Dag_sq*/ Ded_Dgn Ded_Dnc_L1 Ded_Dnc02_L1 /// - Ded_Ydses_c5_Q2_L1 Ded_Ydses_c5_Q3_L1 Ded_Ydses_c5_Q4_L1 Ded_Ydses_c5_Q5_L1 /// - i.Deh_c4_Na i.Deh_c4_High i.Deh_c4_Medium i.Deh_c4_Low /// - li.Les_c4_Student li.Les_c4_NotEmployed li.Les_c4_Retired /// - li.Les_c4_Student_Dgn li.Les_c4_NotEmployed_Dgn /// - li.Les_c4_Retired_Dgn /// - l.Dhe_pcs l.Dhe_mcs /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${u1_if_condition} [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/partnership/partnership", /// - sheet("Process U1") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/partnership/U1.doc", replace /// -title("Process U1: Probability Form partnership") /// - ctitle(Form partnership) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${u1_if_condition})"') - -* Save sample inclusion indicator and predicted 
probabilities -gen in_sample = e(sample) -predict p - -* Save sample for later use (internal validation) -save "$dir_validation_data/U1_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. 
Min/Max ratio: " min_ratio - - -* Export into Excel -putexcel set "$dir_results/reg_partnership", sheet("U1") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_partnership", sheet("U1") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = 
st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_partnership", sheet("U1") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - - -* Export model fit statistics -putexcel set "$dir_results/reg_partnership", sheet("Gof") modify - -putexcel A3 = "U1- Partnership formation", bold - -putexcel A5 = "Pseudo R-squared" -putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - -/******************* U2: PROBABILITY TERMINATE PARTNERSHIP ********************/ -display "${u2_if_condition}" - - -* Estimation -probit dcpex i.Ded Dag Dag_sq /*Ded_Dag Ded_Dag_sq*/ /// - li.Deh_c4_Na li.Deh_c4_Low li.Deh_c4_Medium li.Deh_c4_High /// - li.Dehsp_c3_Medium li.Dehsp_c3_Low /// - li.Dhe_Fair li.Dhe_Good li.Dhe_VeryGood li.Dhe_Excellent /// - l.Dhe_pcs l.Dhe_mcs /// - l.Dhe_pcssp l.Dhe_mcssp /// - l.Dcpyy l.New_rel l.Dcpagdf l.Dnc l.Dnc02 /// - li.Lesdf_c4_EmpSpouseNotEmp li.Lesdf_c4_NotEmpSpouseEmp li.Lesdf_c4_BothNotEmployed /// - l.Ypnbihs_dv l.Ynbcpdf_dv /// - $regions 
Year_transformed Y2020 Y2021 $ethnicity /// - if ${u2_if_condition} [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/partnership/partnership", sheet("Process U2") /// - modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/partnership/U2.doc", replace /// -title("Process U2: Probability Terminating Partnership") /// - ctitle(End partnership) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${u2_if_condition})"') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for later use (internal validation) -save "$dir_validation_data/U2_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." 
- display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. Min/Max ratio: " min_ratio - - -* Export into Excel -putexcel set "$dir_results/reg_partnership", sheet("U2") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - - -* Labels -preserve -putexcel set "$dir_results/reg_partnership", sheet("U2") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* 
labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_partnership", sheet("U2") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - - -* Export model fit statistics -putexcel set "$dir_results/reg_partnership", sheet("Gof") modify - -putexcel A8 = "U2 - Partnership termination", bold - -putexcel A10 = "Pseudo R-squared" -putexcel B10 = r2_p -putexcel A11 = "N" -putexcel B11 = N_sample -putexcel E10 = "Chi^2" -putexcel F10 = chi2 -putexcel E11 = "Log likelihood" -putexcel F11 = ll - - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - -capture log close - diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do b/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do deleted file mode 100644 index 805836ffa..000000000 --- 
a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do +++ /dev/null @@ -1,534 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: Retirement -* OBJECT: Probit Regresion Models -* AUTHORS: Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -* COUNTRY: UK -* -* NOTES: -* -******************************************************************************** -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_retirement.log", replace -******************************************************************* - -* Set Excel file - -* Info sheet - -putexcel set "$dir_results/reg_retirement", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "Model parameters governing projection of retirement" -putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 26 jan 2026 DP" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold - -putexcel A5 = "R1a" -putexcel B5 = "Probit regression estimates of the probability of retiring, single individuals aged 50+ not yet retired" - -putexcel A6 = "R1b" -putexcel B6 = "Probit regression estimates of the probability of retiring, cohabiting individuals aged 50+ not yet retired" - -putexcel A10 = "Notes:", bold -//putexcel B10 = "" - -putexcel set "$dir_results/reg_retirement", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - - -/********************************** ESTIMATION ********************************/ - 
-/****************** R1a: PROBABILITY OF RETIREMENT, SINLGE ********************/ -display "${r1a_if_condition}" - -probit drtren i.Dgn Dag Dag_sq /// - li.Deh_c4_Medium li.Deh_c4_Low li.Deh_c4_Na /// - l.Dhe_pcs l.Dhe_mcs /// - i.Reached_Retirement_Age /// - li.Les_c3_NotEmployed /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 li.Dlltsd01 /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${r1a_if_condition} [pw=dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/retirement/retirement", /// - sheet("Process R1a") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/retirement/R1a.doc", replace /// -title("Process R1a: Probability of Retirement, Single") /// - ctitle(Retire) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${r1a_if_condition})"') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -* Save sample for estimte validation -save "$dir_validation_data/R1a_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - -* Store results in Excel - -* Store estimates -matrix b = e(b) -matrix V = e(V) - -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda 
= V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_retirement", sheet("R1a") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - -* Labels -preserve -putexcel set "$dir_results/reg_retirement", sheet("R1a") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ 
"_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_retirement", sheet("R1a") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Export model fit statistics -putexcel set "$dir_results/reg_retirement", sheet("Gof") modify - -putexcel A3 = "R1a - Retirement single", bold - -putexcel A5 = "Pseudo R-squared" -putexcel B5 = r2_p -putexcel A6 = "N" -putexcel B6 = N_sample -putexcel E5 = "Chi^2" -putexcel F5 = chi2 -putexcel E6 = "Log likelihood" -putexcel F6 = ll - - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all - - - -/***************** R1b: 
PROBABILITY OF RETIREMENT, PARTNERED ******************/ -display "${r1b_if_condition}" - -probit drtren i.Dgn Dag Dag_sq /// - li.Deh_c4_Medium li.Deh_c4_Low li.Deh_c4_Na /// - l.Dhe_pcs l.Dhe_mcs /// - i.Reached_Retirement_Age i.Reached_Retirement_Age_Les /// - li.Les_c3_NotEmployed li.Lessp_c3_NotEmployed /// - i.Reached_Retirement_Age_Sp /// - li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 li.Dlltsd01 /// - $regions Year_transformed Y2020 Y2021 $ethnicity /// - if ${r1b_if_condition} [pweight = dwt], vce(robust) - - -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/retirement/retirement", /// - sheet("Process R1b") modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using /// - "$dir_raw_results/retirement/R1b.doc", replace /// -title("Process R1b: Probability of Retirement, Partnered") /// - ctitle(Retire) label side dec(2) noparen /// - addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) /// - addnote(`"Note: Regression if condition = (${r1b_if_condition})"') - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p - -graph bar (mean) drtren p if in_sample, over(dag, label(labsize(vsmall))) /// - legend(label(1 "observed") label(2 "predicted")) - -graph drop _all - -* Save sample for estiamte validation -save "$dir_validation_data/R1b_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Store results in Excel - -* Store estimates -matrix b = e(b) -matrix V = e(V) - -mata: - // Call matrices into mata - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - b_trimmed = select(b, keep) - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - // Inspection - b_trimmed - 
V_trimmed - - // Return to Stata - st_matrix("b_trimmed", b_trimmed') - st_matrix("V_trimmed", V_trimmed) - st_matrix("nonzero_b_flag", keep) -end - -* Eigenvalue tests for var-cov invertablility in SimPaths -matrix symeigen X lambda = V_trimmed - -scalar max_eig = lambda[1,1] - -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Outcome of max eigenvalue test -if max_eig < 1.0e-12 { - - display as error "CRITICAL ERROR: Maximum eigenvalue is too small (`max_eig')." - display as error "The Variance-Covariance matrix is likely singular." - exit 999 - -} - -display "Stability Check Passed: Max Eigenvalue is " max_eig - -* Outcome of eigenvalue ratio test -if min_ratio < 1.0e-12 { - - display as error "Matrix is ill-conditioned. Min/Max ratio: " min_ratio - exit 506 - -} - -display "Stability Check Passed. Min/Max ratio: " min_ratio - -* Export into Excel -putexcel set "$dir_results/reg_retirement", sheet("R1b") modify -putexcel B2 = matrix(b_trimmed) -putexcel C2 = matrix(V_trimmed) - -* Labels -preserve -putexcel set "$dir_results/reg_retirement", sheet("R1b") modify -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -* Use Mata to extract nice labels from colstripe of e(b) - -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -mata: - // -------------------------------------------------- - // Import objects from Stata - // -------------------------------------------------- - nonzero_b_flag = st_matrix("nonzero_b_flag") - stripe = st_matrixcolstripe("e(b)") - - // Ensure column vector - nonzero_b_flag = nonzero_b_flag' - - // -------------------------------------------------- - // Extract variable names - // -------------------------------------------------- - varnames = stripe[.,2] - - // Keep non-baseline coefficients - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - - // -------------------------------------------------- - // Clean labels - // -------------------------------------------------- - labels_no_bl = 
usubinstr(varnames_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Handle lags: L.var -> var_L1 - labels_no_bl = /// - regexm(labels_no_bl, "^L\.") :* /// - (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) - - // Handle 1L.var - labels_no_bl = /// - regexm(labels_no_bl, "^1L\.") :* /// - (regexr(labels_no_bl, "^1L\.", "") :+ "_L1") :+ /// - (!regexm(labels_no_bl, "^1L\.") :* labels_no_bl) - - // -------------------------------------------------- - // Add header - // -------------------------------------------------- - labels_out = "v1" \ labels_no_bl - - // -------------------------------------------------- - // Write to temp file - // -------------------------------------------------- - outfile = st_local("dir_results") + "/temp_labels.txt" - fh = fopen(outfile, "w") - for (i=1; i<=rows(labels_out); i++) { - fput(fh, labels_out[i]) - } - fclose(fh) -end - - - * Import cleaned labels into Stata - import delimited "$dir_results/temp_labels.txt", clear varnames(1) /// - encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_retirement", sheet("R1b") modify - - * Vertical labels - summarize n, meanonly - local N = r(max)+1 - forvalue i = 2/`N' { - - local j = `i' - 1 - putexcel A`i' = v1[`j'] - - } - - * Horizontal labels - summarize n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - * Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - -* Export model fit statistics -putexcel set "$dir_results/reg_retirement", sheet("Gof") modify - -putexcel A9 = "R1b - Retirement partnered", bold - -putexcel A11 = "Pseudo R-squared" -putexcel B11 = r2_p 
-putexcel A12 = "N" -putexcel B12 = N_sample -putexcel E11 = "Chi^2" -putexcel F11 = chi2 -putexcel E12 = "Log likelihood" -putexcel F12 = ll - -* Clean up -drop in_sample p -scalar drop _all -matrix drop _all -graph drop _all - -capture log close - diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_socialcare.do b/input/InitialPopulations/compile/RegressionEstimates/reg_socialcare.do deleted file mode 100644 index f58734e0a..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_socialcare.do +++ /dev/null @@ -1,706 +0,0 @@ -******************************************************************************** -* PROJECT: SimPaths UK -* SECTION: SOCIAL CARE RECEIPT -* AUTHORS: Justin van de Ven, Matteo Richiardi, Daria Popova -* LAST UPDATE: 19 Feb 2026 DP -* COUNTRY: UK -* -* NOTES: -* PROGRAM TO EVALUATE SOCIAL CARE RECEIPT FROM UKHLS DATA -* ANALYSIS BASED ON THE SOCIAL CARE MODULE OF UKHLS -* First version: Justin van de Ven, 28 Aug 2023 -* Refactored version: Matteo Richiardi, 16 Feb 2026 -* Integration into the pipeline: Daria Popova 18 Feb 2026 DP -* -*******************************************************************************/ - -/* ANALYTICAL STRATEGY -We analyse/simulate the following variables: -- NeedCare -- ReceiveCare -- CareMarket: Formal, Informal, Mixed -- HrsReceivedFormalIHS -- HrsReceivedInformalIHS -- ProvideCare -- HrsProvidedInformalIHS -(IHS stands for Inverse Hyperbolic Sine transformation) - -The most complicated case is for Partnered, as an issue of consistency arises (care between partners is most common): - -=========================================== - Partner B: Receiving informal care? -___________________________________________ -Partner A: | No Yes -providing | No | (1) (2) -informal care | Yes | (3) (4) -=========================================== - -In the analysis we do not distinguish whom care is received from, and to whom care is provided. 
-However, the cases above imply: -(1) No hrs received, no hrs provided -(2) All hrs received are from non-partner -(3) All hrs provided are to non-partner_socare_hrs -(4) At least some of the hrs received/provided are from/to partner - -========================================================================================== -RMK: We first analyse care receipt, and then care provision. This order must be preserved. -========================================================================================== -*/ - -* CRITICAL: Clear all FIRST -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_socialcare.log", replace -******************************************************************* - - -/********************************* SET EXCEL FILE *****************************/ - -putexcel set "$dir_results/reg_socialcare", sheet("Info") replace -putexcel A1 = "Description:", bold -putexcel B1 = "Model parameters for social care module" -putexcel A2 = "Authors:", bold -putexcel B2 = "Justin van de Ven, Ashley Burdett, Matteo Richiardi, Daria Popova" -putexcel A3 = "Last edit:", bold -putexcel B3 = "16 Feb 2026 MR (Refactored)" -putexcel B3 = "18 Feb 2026 DP (Integrated into the pipeline)" - -putexcel A5 = "Process:", bold -putexcel B5 = "Description:", bold - -putexcel A6 = "S2a" B6 = "Prob. need care" -putexcel A7 = "S2b" B7 = "Prob. receive care" -putexcel A8 = "S2c" B8 = "Prob. receive Formal/informal care" -putexcel A9 = "S2d" B9 = "Informal care hours received" -putexcel A10 = "S2e" B10 = "Hours of formal care received" - -putexcel A11 = "S3a" B11 = "Prob. provide care, Singles" -putexcel A12 = "S3b" B12 = "Prob. 
provide care, Partnered" -putexcel A13 = "S3c" B13 = "Hours of informal care provided, Singles" -putexcel A14 = "S3d" B14 = "Hours of informal care provided, Partnered" - -putexcel A20 = "Notes:", bold -putexcel B20 = "Estimation sample: UK_ipop.dta with grossing up weight dwt" -putexcel B21 = "Conditions for processes are defined as globals in master.do" - -putexcel set "$dir_results/reg_socialcare", sheet("Gof") modify -putexcel A1 = "Goodness of fit", bold - - - -/*============================================================================== - MAIN ANALYSIS -==============================================================================*/ - -use ${estimation_sample}, clear - -* Time series structure -gsort idperson stm -xtset idperson stm - -* Adjust variables -do "${dir_do}/variable_update.do" - -* Run Stata programs to produce Excel file -do "${dir_do}/programs.do" - -/*============================================================================== - REGRESSIONS -==============================================================================*/ - -* Stats for if conditions -/* -table stm, stat (count NeedCare) stat (mean NeedCare) // [2015, 2022] -table stm, stat (count ReceiveCare) stat (mean ReceiveCare) // [2016, 2021] but with significant decrease in 2020 and 2021 -table stm, stat (count receive_formal_care) stat (mean receive_formal_care) // [2016, 2021] but with significant decrease in 2020 and 2021 -table stm, stat (count receive_informal_care) stat (mean receive_informal_care) // [2016, 2021] but with significant decrease in 2020 and 2021 -table stm, stat (count provide_informal_care) stat (mean provide_informal_care) // [2015, 2024] also 2014, but fewer hours -*/ -table stm, c(count NeedCare mean NeedCare) -table stm, c(count ReceiveCare mean ReceiveCare) -table stm, c(count receive_formal_care mean receive_formal_care) -table stm, c(count receive_informal_care mean receive_informal_care) -table stm, c(count provide_informal_care mean 
provide_informal_care) - - -/* Age variables (for experimenting -> copy and paste in the specification) - Dag Dagsq /// - Age67to68 Age69to70 Age71to72 Age73to74 Age75to76 /// - Age77to78 Age79to80 Age81to82 Age83to84 Age85plus /// -*/ - -/************************ Probit need care (S2a) ******************************/ - -probit NeedCare NeedCare_L1 Dgn /// - Age67to68 Age69to70 Age71to72 Age73to74 Age75to76 /// - Age77to78 Age79to80 Age81to82 Age83to84 Age85plus /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Partnered /// - Deh_c4_Medium Deh_c4_Low /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s2a_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S2a") sheet("S2a") /// - title("Process S2a: Prob. need care") /// - gofrow(3) goflabel("S2a - Need care") /// - ifcond("${s2a_if_condition}") probit - -/************************ Probit receive care (S2b) ***************************/ - -probit ReceiveCare ReceiveCare_L1 Dgn /// - Age67to68 Age69to70 Age71to72 Age73to74 Age75to76 /// - Age77to78 Age79to80 Age81to82 Age83to84 Age85plus /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Partnered /// - Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s2b_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S2b") sheet("S2b") /// - title("Process S2b: Prob. 
receive care") /// - gofrow(7) goflabel("S2b - Receive care") /// - ifcond("${s2b_if_condition}") probit - - -/************************ Mlogit formal/informal (S2c) ************************/ - -/* - Informal is base outcome - Mixed is 1st outcome - Formal is 2nd outcomes -*/ - -mlogit CareMarket CareMarketFormal_L1 CareMarketInformal_L1 CareMarketMixed_L1 Dgn /// - Age67to68 Age69to70 Age71to72 Age73to74 Age75to76 /// - Age77to78 Age79to80 Age81to82 Age83to84 Age85plus /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Partnered /// - Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s2c_if_condition} [pweight=${weight}], vce(r) base(2) -/* -process_mlogit, process("S2c") sheet("S2c") /// - title("Process S2c: Formal/informal care") /// - gofrow(11) goflabel("S2c - Formal/informal") /// - outcomes(3) ifcond("${s2c_if_condition}") -*/ - -/* DP: Use this routine as program for MLogit does not display labels corectly in Excel ==> to replace by program later on ? 
*/ -* Save raw results -matrix results = r(table) -matrix results = results[1..6,1...]' - -putexcel set "$dir_raw_results/social_care/socialcare", sheet("Process S2c") /// - modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) - - -* Save sample inclusion indicator and predicted probabilities -gen in_sample = e(sample) -predict p1 p2 p3 - -* Save sample for estimates validation -save "$dir_validation_data/S2c_sample", replace - -* Store model summary statistics -scalar r2_p = e(r2_p) -scalar N_sample = e(N) -scalar chi2 = e(chi2) -scalar ll = e(ll) - - -* Store results in Excel - -* Store estimates in matrices -matrix b = e(b) -matrix V = e(V) - -* Raw output -putexcel set "$dir_results/reg_socialcare", sheet("S2c_raw") modify -putexcel A1 = matrix(b'), names nformat(number_d2) -putexcel A1 = "CATEGORY" -putexcel B1 = "REGRESSOR" -putexcel C1 = "COEFFICIENT" - -* Estimated coefficients -scalar no_coefs_all = colsof(b) - -* Eliminate rows and columns containing zeros (baseline cats) -mata: - // Call matrices into mata - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - nonzero_b = select(b, keep) - - nonzero_b - - // Return to Stata - st_matrix("nonzero_b", nonzero_b) - st_matrix("nonzero_b_flag", keep) -end - -* Inspect -matrix list b -matrix list nonzero_b -matrix list nonzero_b_flag - -* Save dimensions -scalar no_nonzero_b = colsof(nonzero_b) -scalar no_nonzero_b_per = no_nonzero_b / 2 - -* Address repetition of proportional odds covariates - -* Generate repetition/unique observation flag -mata: - // Import matrices into mata - nonzero_b_mata = st_matrix("nonzero_b") - - // Generate binary vector =1 if coefficient repeated - n = cols(nonzero_b_mata) - repetition_flag = J(n, 1, 0) - - // use tolerance based comparison to avoid precision errors - tol = 1e-8 - - for (i = 1; i <= n; i++) { - for (j = 1; j <= n; j++) { - if (i != j && abs(nonzero_b_mata[i] - 
nonzero_b_mata[j]) < tol) { - repetition_flag[i] = 1 - break - } - } - } - repetition_flag - - // Generate binary vector =1 if coefficient not repeated - unique_flag = 1 :- repetition_flag - - // Return to Stata - st_matrix("repetition_flag", repetition_flag') - st_matrix("unique_flag", unique_flag') - -end - -* Generate vector to multiply the coef vector with to eliminate the repetitions -* of coefficients for vars that satify the proportional odds assumptions -matrix structure_a = J(1,no_nonzero_b_per,1) -matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b] -matrix structure = structure_a, structure_b - -* Inspect -matrix list structure_a -matrix list structure_b -matrix list structure -matrix list nonzero_b - -* Eliminate repetitions -mata: - // Call matrices into mata - var = st_matrix("var") - structure = st_matrix("structure") - nonzero_b = st_matrix("nonzero_b") - - // Convert reptitions into zeros - b_structure = structure :* nonzero_b - - b_structure - - // Eliminate zeros - keep = (b_structure :!= 0) - - nonzero_b_structure = select(b_structure, keep) - - // Export to Stata - st_matrix("b_structure", b_structure) - st_matrix("nonzero_b_structure", nonzero_b_structure) - -end - -matrix list nonzero_b_structure - -* Export into Excel -putexcel set "$dir_results/reg_socialcare", sheet("S2c") modify -putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) - - -* Variance-covariance matrix -* Eliminate zeros (baseline categories) -mata: - V = st_matrix("V") - b = st_matrix("b") - - // Find which coefficients are nonzero - keep = (b :!= 0) - - // Eliminate zeros - V_trimmed = select(V, keep) - V_trimmed = select(V_trimmed', keep)' - - V_trimmed - - // Return to Stata - st_matrix("var", V_trimmed) -end - -matrix list var - - -* Address repetition due to proportional odds being satisfied for some covars -matrix square_structure_a = J(no_nonzero_b,1,1) * structure -matrix square_structure_b = square_structure_a' - -matrix list 
square_structure_a -matrix list square_structure_b -mata: - // Call matrices into mata - var = st_matrix("var") - - // Create structure matrix (0 = eliminate) - square_structure_a = st_matrix("square_structure_a") - square_structure_b = st_matrix("square_structure_b") - - // Element-by-element multiplication - square_structure = square_structure_a :* square_structure_b - var_structure = square_structure :* var - - // Eliminate zeros - row_keep = rowsum(abs(var_structure)) :!= 0 - col_keep = colsum(abs(var_structure)) :!= 0 - - nonzero_var_structure = select(select(var_structure, row_keep), col_keep) - - // Return to Stata - st_matrix("nonzero_var_structure", nonzero_var_structure) -end - -matrix list nonzero_var_structure - -* Export to Excel -putexcel set "$dir_results/reg_socialcare", sheet("S2c") modify -putexcel C2 = matrix(nonzero_var_structure) - -*======================================================================= -* Eigenvalue stability check for trimmed variance-covariance matrix - -matrix symeigen X lambda = nonzero_var_structure - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Ratio of smallest to largest eigenvalue -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near-singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Variance-covariance matrix is near singular." - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "Matrix is ill-conditioned." - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed." 
-display "Max eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio -*======================================================================= - -* Labels -preserve - -putexcel set "$dir_results/reg_socialcare", sheet("S2c") modify - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - - - * Use Mata to extract nice labels from colstripe of e(b) (replacement for Stata 14) -local dir_results "$dir_results" -cap erase "$dir_results/temp_labels.txt" - -* Run Mata block -mata: - // Import matrices from Stata - nonzero_b_flag = st_matrix("nonzero_b_flag")' - unique_flag = st_matrix("unique_flag")' - structure = st_matrix("structure")' - stripe = st_matrixcolstripe("e(b)") - - // Extract variable and category names - catnames = stripe[.,1] - varnames = stripe[.,2] - varnames_no_bl = select(varnames, nonzero_b_flag :== 1) - catnames_no_bl = select(catnames, nonzero_b_flag :== 1) - - // Handle lags - labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) - - // Add category name when flag is not unique - labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) - - // Clean labels - labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) - labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") - - // Filter for structure == 1 - nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) - - // Add header row - nonzero_labels_structure = "v1"\nonzero_labels_structure - - // Write to temporary file - fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w") - for (i=1; i<=rows(nonzero_labels_structure); i++) { - fput(fh, nonzero_labels_structure[i]) - } - fclose(fh) -end - - * Import cleaned labels into Stata as new dataset - import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) - gen n = _n - - * Export labels to Excel - putexcel set "$dir_results/reg_socialcare", 
sheet("S2c") modify - - * Vertical labels - sum n, meanonly - local N = r(max)+1 - - forvalue i = 2/`N' { - local j = `i' - 1 - putexcel A`i' = v1[`j'] - } - - * Horizontal labels - sum n, meanonly - local N = r(max) + 1 // Adjusted since we're working across columns - - forvalues j = 1/`N' { - local n = `j'+2 // Shift by 2 to start from column C - local col "" - - while `n' > 0 { - local rem = mod(`n' - 1, 26) - local col = char(65 + `rem') + "`col'" - local n = floor((`n' - 1)/26) - } - - putexcel `col'1 = v1[`j'] - } - - *Clean up - cap erase "$dir_results/temp_labels.txt" - -restore - - -* Goodness of fit - -export_gof_probit, row(11) label("Process S2c: Formal/informal care") - -* Clean up -drop in_sample p1 p2 p3 -scalar drop _all -matrix drop _all - - -/******************** OLS informal care hours received (S2d) ******************/ - -reg HrsReceivedInformalIHS HrsReceivedInformalIHS_L1 CareMarketMixed Dgn /// - Age AgeSquared /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Partnered /// - Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} /*${ethnicity} Ethn_White*/ /// - if ${s2d_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S2d") sheet("S2d") /// - title("Process S2d: Informal care hours received") /// - gofrow(15) goflabel("S2d - Hours of informal care received") /// - ifcond("${s2d_if_condition}") - - -* Calculate RMSE -cap drop residuals squared_residuals -predict residuals, residuals -gen squared_residuals = residuals^2 - -preserve -keep if ${s2d_if_condition} - -sum squared_residuals [w=${weight}], meanonly -scalar rmse = sqrt(r(mean)) -di "RMSE for Informal care hours received: " rmse - -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A9 = ("S2d") B9 = (rmse) - -restore - -/********************* OLS formal care hours received (S2e) *******************/ - -reg HrsReceivedFormalIHS HrsReceivedFormalIHS_L1 CareMarketMixed Dgn /// - Dhe_Fair 
Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Partnered /// - Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s2e_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S2e") sheet("S2e") /// - title("Process S2e: Formal care hours received") /// - gofrow(19) goflabel("S2e - Hours of formal care received") /// - ifcond("${s2e_if_condition}") - -* Calculate RMSE -cap drop residuals squared_residuals -predict residuals, residuals -gen squared_residuals = residuals^2 - -preserve -keep if ${s2e_if_condition} - -sum squared_residuals [w=${weight}], meanonly -scalar rmse = sqrt(r(mean)) -di "RMSE for Formal care hours received: " rmse - -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A10 = ("S2e") B10 = (rmse) - -restore - - -/***************** Probit provide care, Singles (S3a) *************************/ - -probit ProvideCare ProvideCare_L1 NeedCare ReceiveCare Dgn /// - Age30to34 Age35to39 Age40to44 Age45to49 Age50to54 /// - Age55to59 Age60to64 Age65to69 Age70to74 Age75to79 Age80to84 Age85plus /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Deh_c4_High Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s3a_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S3a") sheet("S3a") /// - title("Process S3a: Prob. 
provide care, Singles") /// - gofrow(23) goflabel("S3a - Provide care, Singles") /// - ifcond("${s3a_if_condition}") probit - - -/***************** Probit provide care, Partnered (S3b) ***********************/ -/* -tab CareMarket ProvideCare if ${s3b_if_condition} -tab deh_c4 ProvideCare if ${s3b_if_condition} -deh_c4 =0 is excluded because there's just 1 obs providing care and probit would not converge -*/ - -capture drop in_sample p -probit ProvideCare ProvideCare_L1 NeedCare ReceiveCare Dgn /// - ReceiveCarePartner CareMarketFormalPartner CareMarketInformalPartner CareMarketMixedPartner /// - Dhe_Poor Dhe_Fair Dhe_Good Dhe_VeryGood /// - Dhesp_Fair Dhesp_Good Dhesp_VeryGood Dhesp_Excellent /// - Deh_c4_High Deh_c4_Medium /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s3b_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S3b") sheet("S3b") /// - title("Process S3b: Prob. provide care, Partnered") /// - gofrow(27) goflabel("S3b - Provide care, Partnered") /// - ifcond("${s3b_if_condition}") probit - - - -/******************* OLS care hours provided, Singles (S3c) ******************/ - -reg HrsProvidedInformalIHS HrsProvidedInformalIHS_L1 Dgn /// - Age20to24 Age25to29 Age30to34 Age35to39 Age40to44 Age45to49 Age50to54 /// - Age55to59 Age60to64 Age65to69 Age70to74 Age75to79 Age80to84 Age85plus /// - Dhe_Fair Dhe_Good Dhe_VeryGood Dhe_Excellent /// - Deh_c4_High Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s3c_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S3c") sheet("S3c") /// - title("Process S3c: Informal care hours provided, Singles") /// - gofrow(31) goflabel("S3c - Hours of informal care provided, Singles") /// - ifcond("${s3c_if_condition}") - - * Calculate RMSE -cap drop residuals squared_residuals -predict residuals, residuals -gen squared_residuals = residuals^2 - 
-preserve -keep if ${s3c_if_condition} - -sum squared_residuals [w=${weight}], meanonly -scalar rmse = sqrt(r(mean)) -di "RMSE for Informal care hours provided, Singles: " rmse - -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A11 = ("S3c") B11 = (rmse) - -restore - - - -/****************** OLS care hours provided, Partnered (S3d) *****************/ - -reg HrsProvidedInformalIHS HrsProvidedInformalIHS_L1 Dgn /// - Age20to24 Age25to29 Age30to34 Age35to39 Age40to44 Age45to49 Age50to54 /// - Age55to59 Age60to64 Age65to69 Age70to74 Age75to79 Age80to84 Age85plus /// - ReceiveCarePartner CareMarketFormalPartner CareMarketInformalPartner CareMarketMixedPartner /// - Dhe_Poor Dhe_Fair Dhe_Good Dhe_VeryGood /// - Dhesp_Fair Dhesp_Good Dhesp_VeryGood Dhesp_Excellent /// - Deh_c4_High Deh_c4_Medium Deh_c4_Low /// - HHincomeQ2 HHincomeQ3 HHincomeQ4 HHincomeQ5 /// - Y2020 Y2021 ${regions} ${ethnicity} /// - if ${s3d_if_condition} [pweight=${weight}], vce(r) - -process_regression, process("S3d") sheet("S3d") /// - title("Process S3d: Informal care hours provided, Partnered") /// - gofrow(35) goflabel("S3d - Hours of informal care provided, Partnered") /// - ifcond("${s3d_if_condition}") - - * Calculate RMSE -cap drop residuals squared_residuals -predict residuals, residuals -gen squared_residuals = residuals^2 - -preserve -keep if ${s3d_if_condition} - -sum squared_residuals [w=${weight}], meanonly -scalar rmse = sqrt(r(mean)) -di "RMSE for Informal care hours provided, Partnered: " rmse - -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A12 = ("S3d") B12 = (rmse) - -restore - - - -display "Analysis complete!" 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do b/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do deleted file mode 100644 index 6d6928863..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do +++ /dev/null @@ -1,143 +0,0 @@ -******************************************************************************** -* PROJECT: ESPON -* SECTION: Unemployment -* OBJECT: Final Probit Models -* AUTHORS: Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) -******************************************************************************** -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - - -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - -******************************************************************* -cap log close -log using "${dir_log}/reg_unemployment.log", replace -******************************************************************* - - -/******************************************************************************* -* START ANALYSIS -*******************************************************************************/ - - -/******************************************************************************* -* IMPORT UNEMPLOYMENT RATES -*******************************************************************************/ -import delimited 
"${dir_data}/unemp_rates.csv", clear -save "${dir_data}/unemp_rates", replace - - -/******************************************************************************* -* LOAD WORKING DATA -*******************************************************************************/ -use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -keep if (dag>15 & dag<75) - -// append unemployment rates to data -merge m:1 dgn deh_c3 dag stm using "${dir_data}/unemp_rates", keep(3) nogen -label variable dukue "UK unemployment rate by age, year, gender, and graduate status" - -gen unemp = (jbstat==3) -label variable unemp "labour status unemployed" -gen nemp = (jbstat!=1 & jbstat!=2 & jbstat!=10 & jbstat!=11) -replace nemp = . if (jbstat==4 | jbstat==5 | jbstat==7 | jbstat==8 | jbstat==9 | jbstat==12 | jbstat==13 | jbstat==14) -label variable nemp "labour status not employed" -label variable dgn "Gender" -gen ageGroup = floor(dag/5) -label variable ageGroup "five year age band" - -recode careWho dcpst drgn1 (-9=.) 
-recode formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs (-9=0) -gen carer = (careWho>0) -gen recare = (formal_socare_hrs + partner_socare_hrs + daughter_socare_hrs + son_socare_hrs + other_socare_hrs > 0) - -gen ageUnder20 = (dag<20) -gen age20to24 = (dag>19) * (dag<25) - -gen dnc1 = (dnc==1) -gen dnc2 = (dnc==2) -gen dnc3 = (dnc>2) -gen dnc2p = (dnc>1) -gen dc02 = (dnc02>0) - - -/******************************************************************************* -* CALCULATE REGRESSION -*******************************************************************************/ -xtset idperson swv -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==1 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1a male grads") replace -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace /// -title("Process U1a: Probability of unemployment. Sample: Men aged 18-64 with graduate education.") /// - ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==1 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1b male ngrads") modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1b.doc", replace /// -title("Process U1b: Probability of unemployment. 
Sample: Men aged 18-64 with non-graduate education.") /// - ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==0 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1c female grads") modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1c.doc", replace /// -title("Process U1c: Probability of unemployment. Sample: Women aged 18-64 with graduate education.") /// - ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==0 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1d female ngrads") modify -putexcel A3 = matrix(results), names nformat(number_d2) -putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1d.doc", replace /// -title("Process U1d: Probability of unemployment. 
Sample: Women aged 18-64 with non-graduate education.") /// - ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - - -// exploratory regressions -probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==0 & deh_c3==1 & stm>2017) -probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==0 & deh_c3>1 & stm>2017) -probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==1 & deh_c3==1 & stm>2017) -probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==1 & deh_c3>1 & stm>2017) - - - -capture log close - - - diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do b/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do deleted file mode 100644 index 41cf4ffb7..000000000 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do +++ /dev/null @@ -1,1362 +0,0 @@ -******************************************************************************************* -* PROJECT: SimPaths UK -* SECTION: Wage regression -* OBJECT: Heckman regressions -* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven, Aleksandra Kolndrekaj -* LAST UPDATE: 18 Feb 2026 AK -****************************************************************************************** -****************************************************************************************** -* NOTES: Strategy: -* 1) Heckman estimated on the sub-sample of individuals -* who are not observed working in previous period. -* => Wage equation does not controls for lagged wage -* 2) Heckman estimated on the sub-sample of individuals who -* are observed working in previous period. -* => Wage equation controls for lagged wage -* Specification of selection equation is the same in the -* two samples -* -* Import labour cost index to create a measure of wage growth. 
-* Make sure loaded into the external_data subfolder. -* -* Update the winsorization process if alter data -* -*******************************************************************************/ -clear all -set more off -set mem 200m -set type double -//set maxvar 120000 -set maxvar 30000 - -******************************************************************* -cap log close -log using "${dir_log}/reg_wages.log", replace -******************************************************************* - -***************************************************************************************************************************** -* Set Excel file -* Info sheet - first stage -putexcel set "$dir_results/reg_employment_selection", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "This file contains regression estimates from the first stage of the Heckman selection model used to estimates wages." -putexcel A2 = "Authors: Patryk Bronka, Justin Van de Ven, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A5 = "Process:", bold -putexcel B5 = "Description:", bold -putexcel A6 = "W1fa-sel" -putexcel B6 = "First stage Heckman selection estimates for women that do not have an observed wage in the previous year" -putexcel A7 = "W1ma-sel" -putexcel B7 = "First stage Heckman selection estimates for women that do not have an observed wage in the previous year" -putexcel A8 = "W1fb-sel" -putexcel B8 = "First stage Heckman selection estimates for women that have an observed wage in the previous year" -putexcel A9 = "W1mb-sel" -putexcel B9 = "First stage Heckman selection estimates for men that have an observed wage in the previous year" - -putexcel A11 = "Notes:", bold -putexcel B11 = "Estimated on panel data unlike the labour supply estimates" -putexcel B12 = "Predicted wages used as input into union parameters and income process estimates" -putexcel B13 = "Two-step Heckman command is used which does not permit weights" - -* Info sheet - 
second stage -putexcel set "$dir_results/reg_wages", sheet("Info") replace -putexcel A1 = "Description:" -putexcel B1 = "This file contains regression estimates used to calculate potential wages for males and females in the simulation." -putexcel A2 = "Authors: Patryk Bronka, Daria Popova, Aleksandra Kolndrekaj" -putexcel A3 = "Last edit: 18 Feb 2026 AK" - -putexcel A4 = "Process:", bold -putexcel B4 = "Description:", bold -putexcel A5 = "Process:", bold -putexcel B5 = "Description:", bold -putexcel A6 = "W1fa" -putexcel B6 = "Second stage Heckman selection estimates using women that do not have an observed wage in the previous year" -putexcel A7 = "W1ma" -putexcel B7 = "Second stage Heckman selection estimates using men that do not have an observed wage in the previous year" -putexcel A8 = "W1fb" -putexcel B8 = "Second stage Heckman selection estimates using women that have an observed wage in the previous year" -putexcel A9 = "W1mb" -putexcel B9 = "Second stage Heckman selection estimates using men that have an observed wage in the previous year" - -putexcel A11 = "Notes:", bold -putexcel B11 = "Estimation sample: UK_ipop.dta. 
Two-step Heckman command is used which does not permit weights" -putexcel B12 = "Conditions for processes are defined as globals in master.do" -putexcel B13 = "Predicted wages sre saved in dataset UK_ipop2.dta and used as input into union parameters and income process estimates" - -/**************************************************************/ -* prepare data on real growth of wages -/**************************************************************/ - -import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real wage growth rates -rename Year stm -rename Value real_wage_growth -replace stm = stm - 2000 -sum real_wage_growth if stm == 15 -gen base = r(mean) -replace real_wage_growth = real_wage_growth / base // Note: switching from 100 base to 1 base as that's what happens in the simulation when rebasing indices -drop base -save "$dir_external_data/growth_rates", replace - -/********************************* PREPARE DATA *******************************/ - -* Load data -use "${estimation_sample}", clear - -* Set data -xtset idperson swv -sort idperson swv - -* Adjust variables -do "${dir_do}/variable_update.do" - -* merge in real growth index -merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(real_wage_growth) - -* Hours work per week -gen hours = 0 -replace hours = lhw if ((lhw > 0) & (lhw < .)) -label var hours "Hours worked per week" - -* Hourly wage -gen wage_hour = obs_earnings_hourly - -* Winsorize -sum wage_hour, det -replace wage_hour = . if wage_hour <= 0 -replace wage_hour = . 
if wage_hour >= r(p99) - -gen lwage_hour = ln(wage_hour) -label var lwage_hour "Log gross hourly wage" - -gen lwage_hour_2 = lwage_hour^2 -label var lwage_hour_2 "Squared log gross hourly wage" - - -* relationship status (1=cohabitating) -gen mar = (dcpst==1) - -* children -gen any02 = dnc02 > 0 - -gen dnc4p = dnc -replace dnc4p = 1 if (dnc>4) - -gen dnc2p = dnc -replace dnc2p = 2 if (dnc>2) - -cap drop child -gen child = (dnc>0) - -*employment status in previous wave -sort idperson swv -gen L1les_c3 = L1.les_c3 - -*part time work -gen pt = (hours > 0) * (hours <= 25) - -* Flag to identify observations to be included in the estimation sample -* Need to have been observed at least once in the past and activity information -* is not missing in the previous observation -bys idperson (swv): gen obs_count_ttl = _N -bys idperson (swv): gen obs_count = _n - -gen in_sample = (obs_count_ttl > 1 & obs_count > 1) -replace in_sample = 0 if swv != swv[_n-1] +1 & idperson == idperson[_n-1] -replace in_sample = 0 if les_c3 == . | obs_earning == . -fre in_sample - - -* Flag to distinguish the two samples (prev work and not) -capture drop previouslyWorking -gen previouslyWorking = (L1.lwage_hour != .) -replace previouslyWorking = . if in_sample == 0 -fre previouslyWorking - - -* Prep storage -capture drop lwage_hour_hat wage_hour_hat esample -gen lwage_hour_hat = . -gen wage_hour_hat = . -gen esample = . -gen pred_hourly_wage = . 
- -/********************************** ESTIMATION ********************************/ - -/******************** WAGES: WOMEN, NO PREV WAGE OBSERVED *********************/ - -* Estimate a predicted wage using a Heckman selection model -* Sample: Working age (16-75) women who did not receive a wage in t-1 -* DV: Log gross hourly wage - -global wage_eqn "lwage_hour dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //ded -global seln_eqn "i.L1les_c3 dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 mar child dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //ded - -local filter = "${wages_f_no_prev_if_condition}" -display "`filter'" - -heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) - -outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWW.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of women who were not in employment last year") /// - ctitle(Not working women) label side dec(2) noparen - -/***************************************************************************/ -* Eigenvalue stability check - -* Extract variance-covariance matrix -matrix V = e(V) - -* Preserve data state -preserve - -* Export V to dataset -clear -svmat double V - -* Drop zero rows and columns -forvalues r = 1/2 { - egen rowsum = rowtotal(*) - drop if rowsum == 0 - drop rowsum - xpose, clear -} - -* Recreate trimmed VCV matrix -mkmat *, matrix(V_trimmed) - -restore - -* Eigen decomposition -matrix symeigen X lambda = V_trimmed - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Smallest-to-largest eigenvalue ratio -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Heckman VCV near singular" - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "ERROR: Heckman 
VCV ill-conditioned" - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed" -display "Max eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio - -/***************************************************************************/ - -* Obtain predicted values (log wage) with selection correction - -predict pred if `filter', ycond // ycond -> include IMR in prediction to account for selection into employment -replace lwage_hour_hat = pred if `filter' - -gen in_sample_fnpw = e(sample) - -* Correct bias when transforming from log to levels -cap drop epsilon -gen epsilon = rnormal()*e(sigma) -replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if `filter' - - -twoway (hist wage_hour if `filter', width(0.5) /// - lcolor(gs12) fcolor(gs12)) /// - (hist pred_hourly_wage if `filter' & (!missing(wage_hour)), width(0.5) /// - fcolor(none) lcolor(red)), /// - title("Gross Hourly Wage (Level)") /// - subtitle("Females, No previously observed wage") /// - xtitle("GBP") /// - legend(lab(1 "UKHLS") lab(2 "Prediction")) /// - note("Notes: Sample condition ${wages_f_no_prev_if_condition}", size(vsmall)) - -graph export "${dir_raw_results}/wages/W1fa_hist.png", replace - -graph drop _all - -sum wage_hour if `filter' [aw=dwt] -sum pred_hourly_wage if `filter' & (!missing(wage_hour)) [aw=dwt] - -* Save sample validation -save "$dir_validation_data/Female_NPW_sample", replace - -cap drop pred epsilon - - -* Formatted results -* Clean up matrix of estimates -* Note: Zeros values are eliminated -matrix b = e(b) -matrix V = e(V) - -* Store variance-covariance matrix -preserve - -putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) - -* Second 
stage -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") replace -putexcel C2 = matrix(var) - -restore - -* Store estimated coefficients -* Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -* Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -* Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -* Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") modify -putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) - -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow /// - clear -ds -//define which cells are to be dropped -drop if C == 0 // UPDATE -drop A -drop AG-BL // UPDATE - - -mkmat *, matrix(Females_NLW) -putexcel set "$dir_results/reg_wages", sheet("W1fa") modify -putexcel B2 = matrix(Females_NLW) - -restore - - -* Labelling -putexcel set "$dir_results/reg_wages", /// - sheet("W1fa") modify - -local var_list Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 
26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - - -* First stage -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow /// - clear -ds - -drop if AG == 0 // UPDATE -drop A -drop C-AF // UPDATE -drop BM // UPDATE - - -mkmat *, matrix(Females_NLW) -putexcel set "$dir_results/reg_employment_selection", /// - sheet("W1fa-sel") modify -putexcel B2 = matrix(Females_NLW) - -restore - -* Labelling -putexcel set "$dir_results/reg_employment_selection", sheet("W1fa-sel") modify - -local var_list Les_c3_Student_L1 Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - -cap drop lambda - - -* Calculate RMSE -cap drop residuals squared_residuals -gen residuals = lwage_hour - lwage_hour_hat -gen squared_residuals = residuals^2 - -preserve -keep if `filter' -sum squared_residuals -di "RMSE for Not employed women: " sqrt(r(mean)) -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// -A2=("W1fa") B2=(sqrt(r(mean))) -restore - - -/******************** WAGES: MEN, NO PREV WAGE OBSERVED 
*********************/ - -* Estimate a predicted wage using a Heckman selection model -* Sample: Working age (16-75) men who did not receive a wage in t-1 -* DV: Log gross hourly wage - -global wage_eqn "lwage_hour dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //ded -global seln_eqn "i.L1les_c3 dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 mar child dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //ded - -local filter = "${wages_m_no_prev_if_condition}" -display "`filter'" - -heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) - -outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWM.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of men who were not in employment last year") /// - ctitle(Not working men) label side dec(2) noparen - -/***************************************************************************/ -* Eigenvalue stability check - -* Extract variance-covariance matrix -matrix V = e(V) - -* Preserve data state -preserve - -* Export V to dataset -clear -svmat double V - -* Drop zero rows and columns -forvalues r = 1/2 { - egen rowsum = rowtotal(*) - drop if rowsum == 0 - drop rowsum - xpose, clear -} - -* Recreate trimmed VCV matrix -mkmat *, matrix(V_trimmed) - -restore - -* Eigen decomposition -matrix symeigen X lambda = V_trimmed - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Smallest-to-largest eigenvalue ratio -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Heckman VCV near singular" - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "ERROR: Heckman VCV ill-conditioned" - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed" -display "Max 
eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio - -/***************************************************************************/ - - -* Obtain predicted values (log wage) with selection correction -predict pred if `filter', ycond // ycond -> include IMR in prediction to account for selection into employment -replace lwage_hour_hat = pred if `filter' - -gen in_sample_mnpw = e(sample) - -* Correct bias transforming from log to levels -gen epsilon = rnormal()*e(sigma) - -replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if `filter' - -twoway (hist wage_hour if `filter', width(0.5) /// - lcolor(gs12) fcolor(gs12)) /// - (hist pred_hourly_wage if `filter' & (!missing(wage_hour)), width(0.5) /// - fcolor(none) lcolor(red)), /// - title("Gross Hourly Wage (Level)") /// - subtitle("Males, No previously observed wage") /// - xtitle("GBP") /// - legend(lab(1 "UKHLS") lab(2 "Prediction")) /// - note("Notes: Sample condition ${wages_m_no_prev_if_condition}", size(vsmall)) - -graph export "${dir_raw_results}/wages/W1ma_hist.png", replace - -graph drop _all - -sum wage_hour if `filter' [aw=dwt] -sum pred_hourly_wage if `filter' & (!missing(wage_hour)) [aw=dwt] - - -* Save sample for validation -save "$dir_validation_data/Male_NPW_sample", replace -cap drop pred epsilon - - -* Formatted results -* Clean up matrix of estimates -* Note: Zeros values are eliminated -matrix b = e(b) -matrix V = e(V) - -* Store variance-covariance matrix -preserve - -putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) - -* Second stage -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") replace -putexcel C2 = matrix(var) - -restore - -* Store estimated coefficients -* Initialize a 
counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -* Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -* Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -* Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") modify -putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) - -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow /// - clear -ds - -drop if C == 0 // UPDATE -drop A -drop AG-BL // UPDATE - - - -mkmat *, matrix(Males_NLW) -putexcel set "$dir_results/reg_wages", /// - sheet("W1ma") modify -putexcel B2 = matrix(Males_NLW) - -restore - -* Labelling -putexcel set "$dir_results/reg_wages", /// - sheet("W1ma") modify - -local var_list Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - - -* First stage -preserve - -import excel 
"$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow /// - clear -ds - -drop if AG == 0 // UPDATE -drop A -drop C-AF // UPDATE -drop BM // UPDATE - - -mkmat *, matrix(Males_NLW) -putexcel set "$dir_results/reg_employment_selection", /// - sheet("W1ma-sel") modify -putexcel B2 = matrix(Males_NLW) - -restore - -* Labelling -putexcel set "$dir_results/reg_employment_selection", /// - sheet("W1ma-sel") modify - -local var_list Les_c3_Student_L1 Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - -cap drop lambda - -* Calculate RMSE -cap drop residuals squared_residuals -gen residuals = lwage_hour - lwage_hour_hat -gen squared_residuals = residuals^2 - -preserve -keep if `filter' -sum squared_residuals -di "RMSE for Not employed men: " sqrt(r(mean)) -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// -A3=("W1ma") B3=(sqrt(r(mean))) -restore - - -/******************** WAGES: WOMEN, PREV WAGE OBSERVED *********************/ - -* Estimate a predicted wage using a Heckman selection model -* Sample: Working age (16-75) women who received a wage in t-1 -* DV: Log gross hourly wage - -global wage_eqn "lwage_hour L1.lwage_hour dag dagsq 
ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //ded -global seln_eqn "dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 mar child dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //ded - -local filter = "${wages_f_prev_if_condition}" -display "`filter'" - -heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) - -outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WW.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of women who were in employment last year") /// - ctitle(Working women) label side dec(2) noparen - - /***************************************************************************/ -* Eigenvalue stability check - -* Extract variance-covariance matrix -matrix V = e(V) - -* Preserve data state -preserve - -* Export V to dataset -clear -svmat double V - -* Drop zero rows and columns -forvalues r = 1/2 { - egen rowsum = rowtotal(*) - drop if rowsum == 0 - drop rowsum - xpose, clear -} - -* Recreate trimmed VCV matrix -mkmat *, matrix(V_trimmed) - -restore - -* Eigen decomposition -matrix symeigen X lambda = V_trimmed - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Smallest-to-largest eigenvalue ratio -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Heckman VCV near singular" - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "ERROR: Heckman VCV ill-conditioned" - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed" -display "Max eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio - -/***************************************************************************/ - - * Obtain predicted values (log wage) with selection correction -predict pred if `filter', ycond // ycond 
-> include IMR in prediction -replace lwage_hour_hat = pred if `filter' - -gen in_sample_fpw = 1 if e(sample) == 1 - -* Correct bias transforming from log to levels -gen epsilon = rnormal()* e(sigma) -replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if `filter' - -twoway (hist wage_hour if `filter', width(0.5) /// - lcolor(gs12) fcolor(gs12)) /// - (hist pred_hourly_wage if `filter' & (!missing(wage_hour)), width(0.5) /// - fcolor(none) lcolor(red)), /// - title("Gross Hourly Wage (Level)") /// - subtitle("Females, Previously observed wage") /// - xtitle("GBP") /// - legend(lab(1 "UKHLS") lab(2 "Prediction")) /// - note("Notes: Sample condition ${wages_f_prev_if_condition}", /// - size(vsmall)) - -graph export "${dir_raw_results}/wages/W1fb_hist.png", replace - -graph drop _all - -sum wage_hour if `filter' [aw=dwt] -sum pred_hourly_wage if `filter' & (!missing(wage_hour)) [aw=dwt] - - -* Save sample for validation -save "$dir_validation_data/Female_PW_sample", replace - -cap drop pred epsilon - -* Formatted results -* Clean up matrix of estimates -* Note: Zeros values are eliminated -matrix b = e(b) -matrix V = e(V) - -* Store variance-covariance matrix -preserve - -putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) - -* Second stage -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") replace -putexcel C2 = matrix(var) - -restore - -* Store estimated coefficients -* Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -* Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -* Create a new 
row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -* Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") modify -putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) - -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Females_LW") firstrow /// - clear -ds -drop if C == 0 // UPDATE -drop A -drop AH-BK // UPDATE - -mkmat *, matrix(Females_LW) -putexcel set "$dir_results/reg_wages", sheet("W1fb") modify -putexcel B2 = matrix(Females_LW) - -restore - -* Labelling -putexcel set "$dir_results/reg_wages", /// - sheet("W1fb") modify - -local var_list L1_log_hourly_wage Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - - -* First stage -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Females_LW") firstrow /// - clear -ds -drop if AH == 0 // UPDATE -drop A -drop C-AG // UPDATE -drop BL // UPDATE - -mkmat *, matrix(Females_LW) -putexcel set "$dir_results/reg_employment_selection", sheet("W1fb-sel") modify 
-putexcel B2 = matrix(Females_LW) - -restore - -* Labelling -putexcel set "$dir_results/reg_employment_selection", sheet("W1fb-sel") modify - -local var_list Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - -cap drop lambda - - -* Calculate RMSE -cap drop residuals squared_residuals -gen residuals = lwage_hour - lwage_hour_hat -gen squared_residuals = residuals^2 - -preserve -keep if `filter' -sum squared_residuals -di "RMSE for Employed women: " sqrt(r(mean)) -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// -A4=("W1fb") B4=(sqrt(r(mean))) -restore - - -/******************** WAGES: MEN, PREV WAGE OBSERVED *********************/ - -* Estimate a predicted wage using a Heckman selection model -* Sample: Working age (16-75) men who received a wage in t-1 -* DV: Log gross hourly wage - -global wage_eqn "lwage_hour L1.lwage_hour dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //ded -global seln_eqn "dag dagsq ib1.deh_c4 ib1.deh_c4#c.dag i.dehmf_c3 mar child dlltsd01 l.dhe_pcs l.dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //ded - -local filter = "${wages_m_prev_if_condition}" -display 
"`filter'" - -heckman $wage_eqn if `filter', select($seln_eqn) twostep mills(lambda) - -outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WM.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of men who were in employment last year") /// - ctitle(Working women) label side dec(2) noparen - -/***************************************************************************/ -* Eigenvalue stability check - -* Extract variance-covariance matrix -matrix V = e(V) - -* Preserve data state -preserve - -* Export V to dataset -clear -svmat double V - -* Drop zero rows and columns -forvalues r = 1/2 { - egen rowsum = rowtotal(*) - drop if rowsum == 0 - drop rowsum - xpose, clear -} - -* Recreate trimmed VCV matrix -mkmat *, matrix(V_trimmed) - -restore - -* Eigen decomposition -matrix symeigen X lambda = V_trimmed - -* Largest eigenvalue -scalar max_eig = lambda[1,1] - -* Smallest-to-largest eigenvalue ratio -scalar min_ratio = lambda[1, colsof(lambda)] / max_eig - -* Check 1: near singularity -if max_eig < 1.0e-12 { - display as error "CRITICAL ERROR: Heckman VCV near singular" - display as error "Max eigenvalue = " max_eig - exit 999 -} - -* Check 2: ill-conditioning -if min_ratio < 1.0e-12 { - display as error "ERROR: Heckman VCV ill-conditioned" - display as error "Min/Max eigenvalue ratio = " min_ratio - exit 506 -} - -display "VCV stability check passed" -display "Max eigenvalue: " max_eig -display "Min/Max ratio: " min_ratio - -/***************************************************************************/ - * Obtain predicted values (log wage) with selection correction -predict pred if `filter', ycond // ycond -> include IMR in prediction - -replace lwage_hour_hat = pred if `filter' - -gen in_sample_mpw = e(sample) - -* Correct bias transforming from log to levels -gen epsilon = rnormal()*e(sigma) -replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if `filter' - -twoway (hist wage_hour if `filter', width(0.5) /// - 
lcolor(gs12) fcolor(gs12)) /// - (hist pred_hourly_wage if `filter' & (!missing(wage_hour)), width(0.5) /// - fcolor(none) lcolor(red)), /// - title("Gross Hourly Wage (Level)") /// - subtitle("Male, Previously observed wage") /// - xtitle("GBP") /// - legend(lab(1 "UKHLS") lab(2 "Prediction")) /// - note("Notes: Sample condition ${wages_m_prev_if_condition}", /// - size(vsmall)) - -graph export "${dir_raw_results}/wages/W1mb_hist.png", replace - -graph drop _all - -sum wage_hour if `filter' [aw=dwt] -sum pred_hourly_wage if `filter' & (!missing(wage_hour)) [aw=dwt] - -* Save sample for validation -save "$dir_validation_data/Male_PW_sample", replace - -cap drop pred epsilon - - - * Formatted results -* Clean up matrix of estimates -* Note: Zeros values are eliminated -matrix b = e(b) -matrix V = e(V) - -* Store variance-covariance matrix -preserve - -putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' - -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} - -mkmat v*, matrix(var) - -* Second stage -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") replace -putexcel C2 = matrix(var) - -restore - -* Store estimated coefficients -* Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -* Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -* Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -* Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") modify -putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) - -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow /// - clear -ds -drop if C == 0 // UPDATE -drop A -drop AH-BK // UPDATE - - -mkmat *, matrix(Males_LW) -putexcel set "$dir_results/reg_wages", sheet("W1mb") modify -putexcel B2 = matrix(Males_LW) - -restore - -* Labelling -putexcel set "$dir_results/reg_wages", /// - sheet("W1mb") modify - -local var_list L1_log_hourly_wage Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 Dhe_pcs_L1 Dhe_mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - - -* First stage -preserve - -import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow /// - clear -ds -drop if AH == 0 // UPDATE -drop A -drop C-AG // UPDATE -drop BL // UPDATE - -mkmat *, matrix(Males_LW) -putexcel set "$dir_results/reg_employment_selection", sheet("W1mb-sel") modify -putexcel B2 = matrix(Males_LW) - -restore - -* Labelling -putexcel set 
"$dir_results/reg_employment_selection", sheet("W1mb-sel") modify - -local var_list Dag Dag_sq Deh_c4_Medium Deh_c4_Low Deh_c4_Medium_Dag /// - Deh_c4_Low_Dag Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs_L1 Dhe_Mcs_L1 /// - UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// - Ethn_Asian Ethn_Black Ethn_Other Constant - - -putexcel A1 = "REGRESSOR" -putexcel B1 = "COEFFICIENT" - -local i = 1 -foreach var in `var_list' { - local ++i - - putexcel A`i' = "`var'" - -} - -local i = 2 -foreach var in `var_list' { - local ++i - - if `i' <= 26 { - local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z - putexcel `letter'1 = "`var'" - } - else { - local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z - local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z - putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ - } -} - -cap drop lambda - - -* Calculate RMSE -cap drop residuals squared_residuals -gen residuals = lwage_hour - lwage_hour_hat -gen squared_residuals = residuals^2 - -preserve -keep if `filter' -sum squared_residuals -di "RMSE for Employed men: " sqrt(r(mean)) -putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify -putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// -A5=("W1mb") B5=(sqrt(r(mean))) -restore - -* Save for use in the do-file "reg_income" estimating non-employment incomes -// use predicted wage for all -// use the observed wage for those that are working today and not in any -// estimation sample above (first observation for an individual) -replace pred_hourly_wage = exp(lwage_hour) if missing(pred_hourly_wage) - -save "${estimation_sample2}", replace - -capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do index 24551baaa..c0c32fe92 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do +++ 
b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do @@ -17,6 +17,10 @@ foreach var in idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc qui recode `var' (-9/-1=.) } +* Set data +xtset idperson swv /* declare the panel: idperson is the panel variable, swv the time variable */ +sort idperson swv + *================================================== * Student flag *================================================== @@ -275,7 +279,7 @@ gen Dhe = dhe gen Ydses_c5 = ydses_c5 gen Dcpyy = dcpyy gen Dcpagdf = dcpagdf -gen FertilityRate = dukfr +gen fertilityRate = dukfr /* same source variable (dukfr) as the removed FertilityRate; only the case of the name changes */ gen Dhh_owned = dhh_owned gen Elig_pen = dagpns @@ -388,7 +392,8 @@ replace dage10prime = 2 if (dag>44 & dag<55) replace dage10prime = 3 if (dag>54 & dag<65) replace dage10prime = 4 if (dag>64) //table dage10prime, stat(min dag) stat(max dag) -table dage10prime, c(min dag max dag) +//table dage10prime, c(min dag max dag) +tabstat dag, by(dage10prime) stat(min max) /* check: min and max of dag within each dage10prime band */ * - Categorical: 65-66, 67-68, 69-70, 71-72..., 85+ gen dage2old = 0 @@ -397,7 +402,8 @@ forval ii = 1/10 { } replace dage2old = 11 if (dag >= 85) //table dage2old, stat(min dag) stat(max dag) -table dage2old, c(min dag max dag) +//table dage2old, c(min dag max dag) +tabstat dag, by(dage2old) stat(min max) /* check: min and max of dag within each dage2old band */ * Poor health flag gen poor_health = (dhe == 1) @@ -433,7 +439,8 @@ cap rename Age_16 Age85plus tab dage10prime, gen(Age_) //table dage10prime, stat(min dag) stat(max dag) // RMK: AgeXX categories start at 1, hence shifted by 1 -table dage10prime, c(min dag max dag) +//table dage10prime, c(min dag max dag) +tabstat dag, by(dage10prime) stat(min max) /* check: min and max of dag within each dage10prime band */ drop Age_1 rename Age_2 Age35to44 rename Age_3 Age45to54 @@ -442,7 +449,8 @@ rename Age_5 Age65plus tab dage2old, gen(Age_) //table dage2old, stat(min dag) stat(max dag) // RMK: AgeXX categories start at 1, hence shifted by 1 -table dage2old, c(min dag max dag) +//table dage2old, c(min dag max dag) +tabstat dag, by(dage2old) stat(min max) /* check: min and max of dag within each dage2old band */ drop Age_1 rename Age_2 Age65to66 rename Age_3 Age67to68 @@ -633,3 +641,359 @@ label define lhwsp 0
"Zero" 10 "Ten" 20 "Twenty" 30 "Thirty" 40 "Forty" label value lhw_c5 lhwsp la var lhw_c5 "Hours worked per week (category)" + +*================================================== +* Refactoring variable names +*================================================== +xtset idperson stm /* re-declare the panel with stm as the time variable (the xtset near the top of this file uses swv); the l./l2./l3. lags below are taken along stm */ + +gen demMaleFlag = dgn +gen demAge = dag +gen demAgeSq = dagsq +gen eduSampleFlag = ded + + +gen eduSampleFlagL1 = l.ded + +gen eduHighestParentC3 = dehmf_c3 +tab eduHighestParentC3, gen(eduHighestParentC3_) /* tab, gen() numbers the dummies by the sorted rank of the observed values; the renames below assume the three levels sort as High, Medium, Low - TODO confirm coding */ + +rename eduHighestParentC3_1 eduHighestParentC3High +rename eduHighestParentC3_2 eduHighestParentC3Medium +rename eduHighestParentC3_3 eduHighestParentC3Low + +gen eduHighestParentC3L1 = l.dehmf_c3 +tab eduHighestParentC3L1, gen(eduHighestParentC3L1_) + +rename eduHighestParentC3L1_1 eduHighestParentC3HighL1 +rename eduHighestParentC3L1_2 eduHighestParentC3MediumL1 +rename eduHighestParentC3L1_3 eduHighestParentC3LowL1 + +gen yHhQuintilesMonthC5 = ydses_c5 +tab yHhQuintilesMonthC5 , gen(yHhQuintilesMonthC5Q) + +gen yHhQuintilesMonthC5L1 = l.ydses_c5 +tab yHhQuintilesMonthC5L1 , gen(yHhQuintilesMonthC5L1_Q) + +rename yHhQuintilesMonthC5L1_Q1 yHhQuintilesMonthC5Q1L1 +rename yHhQuintilesMonthC5L1_Q2 yHhQuintilesMonthC5Q2L1 +rename yHhQuintilesMonthC5L1_Q3 yHhQuintilesMonthC5Q3L1 +rename yHhQuintilesMonthC5L1_Q4 yHhQuintilesMonthC5Q4L1 +rename yHhQuintilesMonthC5L1_Q5 yHhQuintilesMonthC5Q5L1 + +gen demRgn = drgn1 + +tab drgn1, gen(demRgn_) /* the twelve renames below assume drgn1 has exactly 12 observed values whose sorted order is UKC..UKN - TODO confirm */ + +rename demRgn_1 demRgnUKC +rename demRgn_2 demRgnUKD +rename demRgn_3 demRgnUKE +rename demRgn_4 demRgnUKF +rename demRgn_5 demRgnUKG +rename demRgn_6 demRgnUKH +rename demRgn_7 demRgnUKI +rename demRgn_8 demRgnUKJ +rename demRgn_9 demRgnUKK +rename demRgn_10 demRgnUKL +rename demRgn_11 demRgnUKM +rename demRgn_12 demRgnUKN + +gen demYear = stm /* NOTE(review): the loop below compares demYear with 11..25, so the demYear20xx dummies are non-zero only if stm is stored as a two-digit year - confirm coding */ + +foreach y of numlist 11/25 { + gen demYear20`y' = (demYear == `y') + } + +gen demEthnC4 = dot +tab demEthnC4, gen(demEthnC4_) + +rename demEthnC4_1 demEthnC4White +rename demEthnC4_2 demEthnC4Asian
+rename demEthnC4_3 demEthnC4Black +rename demEthnC4_4 demEthnC4Other + +gen demPartnerStatus = dcpst +gen demPartnerStatusPartnered = (dcpst==1) +gen demPartnerStatusSingle = (dcpst==2) + +gen demPartnerStatusL1 = l.dcpst /* NOTE: the ==k dummies built from lagged copies below evaluate to 0, not missing, when the lag is missing */ +gen demPartnerStatusPartneredL1 = (demPartnerStatusL1==1) +gen demPartnerStatusSingleL1 = (demPartnerStatusL1==2) + +gen eduHighestC4 = deh_c4 +gen eduHighestC4L1 = l.deh_c4 + +gen eduHighestC4Na = (eduHighestC4==0) +gen eduHighestC4High = (eduHighestC4==1) +gen eduHighestC4Medium = (eduHighestC4==2) +gen eduHighestC4Low = (eduHighestC4==3) + +gen eduHighestC4NaL1 = (eduHighestC4L1==0) +gen eduHighestC4HighL1 = (eduHighestC4L1==1) +gen eduHighestC4MediumL1 = (eduHighestC4L1==2) +gen eduHighestC4LowL1 = (eduHighestC4L1==3) + +gen eduHighestPartnerC3 = dehsp_c3 +gen eduHighestPartnerC3L1 = l.dehsp_c3 + +gen eduHighestPartnerC3HighL1 = (eduHighestPartnerC3L1==1) +gen eduHighestPartnerC3MediumL1 = (eduHighestPartnerC3L1==2) +gen eduHighestPartnerC3LowL1 = (eduHighestPartnerC3L1==3) + +gen labStatusC3 = les_c3 +gen labStatusC3L1 = l.les_c3 + +gen labStatusC3EmployedL1 = (labStatusC3L1==1) +gen labStatusC3StudentL1 = (labStatusC3L1==2) +gen labStatusC3NotEmployedL1 = (labStatusC3L1==3) + + +gen labStatusPartnerC3 = lessp_c3 +gen labStatusPartnerC3L1 = l.lessp_c3 /* lag of the partner's status (lessp_c3), matching labStatusPartnerC3 above; previously lagged the respondent's own les_c3 */ + +gen labStatusPartnerC3EmployedL1 = (labStatusPartnerC3L1==1) +gen labStatusPartnerC3StudentL1 = (labStatusPartnerC3L1==2) +gen labStatusPartnerC3NotEmplL1 = (labStatusPartnerC3L1==3) + +gen labStatusC4 = les_c4 +gen labStatusC4L1 = l.les_c4 + +gen labStatusC4EmployedL1 = (labStatusC4L1==1) +gen labStatusC4StudentL1 = (labStatusC4L1==2) +gen labStatusC4NotEmployedL1 = (labStatusC4L1==3) +gen labStatusC4RetiredL1 = (labStatusC4L1==4) + +gen demNChild = dnc +gen demNChild0to2 = dnc02 + +gen demNChildL1 = l.demNChild +gen demNChild0to2L1 = l.demNChild0to2 + +gen eduSampleFlag_demMaleFlag = eduSampleFlag * demMaleFlag +gen eduSampleFlag_demNChildL1 = eduSampleFlag * demNChildL1 +gen
eduSampleFlag_demNChild0to2L1 = eduSampleFlag * demNChild0to2L1 +gen eduSampleFlag_Single = eduSampleFlag * demPartnerStatusSingle + +gen eduSampleFlag_Q2L1 = eduSampleFlag * yHhQuintilesMonthC5Q2L1 /* only the Q2-Q5 lagged-quintile interactions are created here; no Q1 interaction */ +gen eduSampleFlag_Q3L1 = eduSampleFlag * yHhQuintilesMonthC5Q3L1 +gen eduSampleFlag_Q4L1 = eduSampleFlag * yHhQuintilesMonthC5Q4L1 +gen eduSampleFlag_Q5L1 = eduSampleFlag * yHhQuintilesMonthC5Q5L1 + + +gen labStatusC4EmployedL1_Male = labStatusC4EmployedL1 * demMaleFlag /* lagged labour status x male flag; the NotEmployed interaction is deliberately commented out below */ +gen labStatusC4StudentL1_Male = labStatusC4StudentL1 * demMaleFlag +//gen labStatusC4NotEmployedL1_Male = labStatusC4NotEmployedL1 * demMaleFlag +gen labStatusC4RetiredL1_Male = labStatusC4RetiredL1 * demMaleFlag + +gen healthPhysicalPcs = dhe_pcs +gen healthMentalMcs = dhe_mcs + +gen healthPhysicalPcsL1 = l.healthPhysicalPcs +gen healthMentalMcsL1 = l.healthMentalMcs + +gen healthPhysicalPartnerPcs = dhe_pcssp +gen healthMentalPartnerMcs = dhe_mcssp + +gen healthPhysicalPartnerPcsL1 = l.dhe_pcssp +gen healthMentalPartnerMcsL1 = l.dhe_mcssp + +gen demPartnerNYear = dcpyy +gen demPartnerNYearL1 = l.demPartnerNYear + +gen demEnterPartnerFlag = new_rel +gen demEnterPartnerFlagL1 = l.new_rel + +gen demAgePartnerDiff = dcpagdf +gen demAgePartnerDiffL1 = l.dcpagdf + +gen labStatusPartnerAndOwnC4 = lesdf_c4 +gen labStatusPartnerAndOwnC4L1 = l.lesdf_c4 + +gen labStatusPartnerAndOwnC41L1 = (labStatusPartnerAndOwnC4L1==1) /* one dummy per code 1-4 of the lagged variable; the digit before L1 is the category code */ +gen labStatusPartnerAndOwnC42L1 = (labStatusPartnerAndOwnC4L1==2) +gen labStatusPartnerAndOwnC43L1 = (labStatusPartnerAndOwnC4L1==3) +gen labStatusPartnerAndOwnC44L1 = (labStatusPartnerAndOwnC4L1==4) + +gen yNonBenPersGrossMonth = ypnbihs_dv +gen yNonBenPersGrossMonthL1 = l.ypnbihs_dv + +gen yPersAndPartnerGrossDiffMonth = ynbcpdf_dv +gen yPersAndPartnerGrossDiffMonthL1 = l.ynbcpdf_dv + +gen demCompHhC4 = dhhtp_c4 +gen demCompHhC4L1 = l.dhhtp_c4 + +gen demCompHhC4CoupleNoChL1 = (l.dhhtp_c4==1) /* dummies compare the lag directly, so a missing lag gives 0 rather than missing */ +gen demCompHhC4CoupleChL1 = (l.dhhtp_c4==2) +gen demCompHhC4SingleNoChL1 = (l.dhhtp_c4==3) +gen
demCompHhC4L1SingleChL1 = (l.dhhtp_c4==4) /* NOTE(review): name carries an extra "L1" infix, unlike the sibling dummies demCompHhC4CoupleNoChL1, demCompHhC4CoupleChL1 and demCompHhC4SingleNoChL1; left unchanged so existing references keep working */ + +gen healthDsblLongtermFlag = dlltsd01 +gen healthDsblLongtermFlagL1 = l.dlltsd01 + +gen demCompHhC8= dhhtp_c8 +tab demCompHhC8, gen(demCompHhC8) + +gen demCompHhC8L1 = l.dhhtp_c8 + +gen demCompHhC81L1 = (l.dhhtp_c8==1) +gen demCompHhC82L1 = (l.dhhtp_c8==2) +gen demCompHhC83L1 = (l.dhhtp_c8==3) +gen demCompHhC84L1 = (l.dhhtp_c8==4) +gen demCompHhC85L1 = (l.dhhtp_c8==5) +gen demCompHhC86L1 = (l.dhhtp_c8==6) +gen demCompHhC87L1 = (l.dhhtp_c8==7) +gen demCompHhC88L1 = (l.dhhtp_c8==8) + +gen yMiscPersGrossMonth = yptciihs_dv +gen yMiscPersGrossMonthL1 = l.yptciihs_dv + +gen wealthPrptyFlag = dhh_owned +gen wealthPrptyFlagL1 = l.dhh_owned + +gen demPensAgeFlag = dagpns +gen demPensPartnerAgeFlag = dagpns_sp + +gen demPensAgeFlag_NotEmployedL1 = demPensAgeFlag * labStatusC3NotEmployedL1 + + +* adjust capital income +sum ypncp, det +scalar p99 = r(p99) +replace ypncp = . if ypncp >= scalar(p99) /* trims values at or above the 99th percentile in place; scalar() forces the scalar to be used even if a variable name matches p99 */ + +gen yCapitalPersMonth = ypncp +gen yCapitalPersMonthL1 = l.ypncp +gen yCapitalPersMonthL2 = l2.ypncp + +gen yEmpPersGrossMonth = yplgrs_dv +gen yEmpPersGrossMonthL1 = l.yplgrs_dv +gen yEmpPersGrossMonthL2 = l2.yplgrs_dv +gen yEmpPersGrossMonthL3 = l3.yplgrs_dv + +gen eduSampleFlag_Male = eduSampleFlag * demMaleFlag + +gen eduSampleFlag_Pcs = eduSampleFlag * healthPhysicalPcs +gen eduSampleFlag_Mcs = eduSampleFlag * healthMentalMcs + +gen eduSampleFlag_PcsL1 = l.eduSampleFlag_Pcs /* lag of the product, i.e. lagged flag times lagged score */ +gen eduSampleFlag_McsL1 = l.eduSampleFlag_Mcs + +gen eduSampleFlag_yCapitalPers = eduSampleFlag * yCapitalPersMonth +gen eduSampleFlag_yCapitalPersL1 = l.eduSampleFlag_yCapitalPers +gen eduSampleFlag_yCapitalPersL2 = l2.eduSampleFlag_yCapitalPers + +gen eduSampleFlag_yEmpPersGross = eduSampleFlag * yEmpPersGrossMonth +gen eduSampleFlag_yEmpPersGrossL1 = l.eduSampleFlag_yEmpPersGross +gen eduSampleFlag_yEmpPersGrossL2 = l2.eduSampleFlag_yEmpPersGross + +gen eduHighestC4Na_demAge = eduHighestC4Na * dag +gen eduHighestC4Low_demAge = eduHighestC4Low * dag +gen
eduHighestC4Medium_demAge = eduHighestC4Medium * dag +gen eduHighestC4High_demAge = eduHighestC4High * dag + +gen eduHighestC4NaL1_demAge = eduHighestC4NaL1 * demAge +gen eduHighestC4LowL1_demAge = eduHighestC4LowL1 * demAge +gen eduHighestC4MediumL1_demAge = eduHighestC4MediumL1 * demAge +gen eduHighestC4HighL1_demAge = eduHighestC4HighL1 * demAge + +gen labPt = (lhw > 0 & lhw <=25) /* 1 when lhw is in (0, 25]; a missing lhw fails the <=25 test and so yields 0 */ + +* adjust pension income +sum ypnoab, det +scalar p99 = r(p99) +replace ypnoab = . if ypnoab >= scalar(p99) /* trims values at or above the 99th percentile in place; scalar() forces the scalar to be used even if a variable name matches p99 */ + +gen yPensPersGrossMonth = ypnoab +gen yPensPersGrossMonthL1 = l.ypnoab +gen yPensPersGrossMonthL2 = l2.ypnoab + +gen healthSelfRated = dhe + +tab healthSelfRated, gen(healthSelfRated_) /* dummies are numbered by sorted rank of the observed values; the renames assume five levels ordered Poor..Excellent - TODO confirm coding */ +rename healthSelfRated_1 healthSelfRatedPoor +rename healthSelfRated_2 healthSelfRatedFair +rename healthSelfRated_3 healthSelfRatedGood +rename healthSelfRated_4 healthSelfRatedVeryGood +rename healthSelfRated_5 healthSelfRatedExcellent + + +gen healthPartnerSelfRated = dhesp + +tab healthPartnerSelfRated, gen(healthPartnerSelfRated_) +rename healthPartnerSelfRated_1 healthPartnerSelfRatedPoor +rename healthPartnerSelfRated_2 healthPartnerSelfRatedFair +rename healthPartnerSelfRated_3 healthPartnerSelfRatedGood +rename healthPartnerSelfRated_4 healthPartnerSelfRatedVeryGood +rename healthPartnerSelfRated_5 healthPartnerSelfRatedExcellent + + + +rename Age20to24 demAge20to24 +rename Age25to29 demAge25to29 +rename Age30to34 demAge30to34 +rename Age35to39 demAge35to39 +rename Age40to44 demAge40to44 +rename Age45to49 demAge45to49 +rename Age50to54 demAge50to54 +rename Age55to59 demAge55to59 +rename Age60to64 demAge60to64 +rename Age65to69 demAge65to69 +rename Age70to74 demAge70to74 +rename Age75to79 demAge75to79 +rename Age80to84 demAge80to84 +rename Age85plus demAge85plus + +rename Age65to66 demAge65to66 +rename Age67to68 demAge67to68 +rename Age69to70 demAge69to70 +rename Age71to72 demAge71to72 +rename Age73to74 demAge73to74 +rename Age75to76 demAge75to76 +rename Age77to78 demAge77to78 +rename
Age79to80 demAge79to80 +rename Age81to82 demAge81to82 +rename Age83to84 demAge83to84 + +gen careMarket = CareMarket +gen careMarketL1 = l.careMarket + + +gen careMarketInformal = (careMarket == 2) /* codes 2, 3 and 4 of careMarket are mapped to the Informal, Mixed and Formal dummies; no dummy is created here for any other code */ +gen careMarketMixed = (careMarket == 3) +gen careMarketFormal = (careMarket == 4) + +gen careMarketInformalL1 = (l.careMarket== 2) +gen careMarketMixedL1 = (l.careMarket == 3) +gen careMarketFormalL1 = (l.careMarket == 4) + +gen careHrsInformalIhs = HrsReceivedInformalIHS +gen careHrsInformalIhsL1 = l.HrsReceivedInformalIHS + +gen careHrsFormalIhs = HrsReceivedFormalIHS +gen careHrsFormalIhsL1 = l.HrsReceivedFormalIHS + + +gen careNeedFlag = NeedCare +gen careNeedFlagL1 = l.NeedCare + +gen careReceivedFlag = ReceiveCare +gen careReceivedFlagL1 = l.ReceiveCare + +gen careProvidedFlag = ProvideCare +gen careProvidedFlagL1 = l.ProvideCare + +gen careNeedPartnerFlag = NeedCarePartner +gen careReceivedPartnerFlag = ReceiveCarePartner + +gen careMarketInformalPartner = CareMarketInformalPartner +gen careMarketMixedPsrtner = CareMarketMixedPartner /* NOTE(review): "Psrtner" looks like a typo for "Partner" (cf. careMarketInformalPartner and careMarketFormalPartner); name left unchanged here - confirm before renaming */ +gen careMarketFormalPartner = CareMarketFormalPartner + +gen careHrsProvidedWeekIhs = HrsProvidedInformalIHS +gen careHrsProvidedWeekIhsL1 = l.HrsProvidedInformalIHS + +*================================================== +* End +*================================================== + diff --git a/input/reg_RMSE.xlsx b/input/reg_RMSE.xlsx index b929e9701..a3370af73 100644 Binary files a/input/reg_RMSE.xlsx and b/input/reg_RMSE.xlsx differ diff --git a/input/reg_education.xlsx b/input/reg_education.xlsx index 1d75ac6a4..1444f4ead 100644 Binary files a/input/reg_education.xlsx and b/input/reg_education.xlsx differ diff --git a/input/reg_employment_selection.xlsx b/input/reg_employment_selection.xlsx index 1e5cebf06..9b09898c4 100644 Binary files a/input/reg_employment_selection.xlsx and b/input/reg_employment_selection.xlsx differ diff --git a/input/reg_fertility.xlsx b/input/reg_fertility.xlsx index 1e68ecdbc..7f9c987fb 100644 Binary
files a/input/reg_fertility.xlsx and b/input/reg_fertility.xlsx differ diff --git a/input/reg_financial_distress.xlsx b/input/reg_financial_distress.xlsx index 11b64fefe..d945ffd9b 100644 Binary files a/input/reg_financial_distress.xlsx and b/input/reg_financial_distress.xlsx differ diff --git a/input/reg_health.xlsx b/input/reg_health.xlsx index a4c5e7cd7..26e0791b8 100644 Binary files a/input/reg_health.xlsx and b/input/reg_health.xlsx differ diff --git a/input/reg_health_mental.xlsx b/input/reg_health_mental.xlsx index e349e102c..beea64a8f 100644 Binary files a/input/reg_health_mental.xlsx and b/input/reg_health_mental.xlsx differ diff --git a/input/reg_health_wellbeing.xlsx b/input/reg_health_wellbeing.xlsx index 7eaf3a948..0d0f97abf 100644 Binary files a/input/reg_health_wellbeing.xlsx and b/input/reg_health_wellbeing.xlsx differ diff --git a/input/reg_home_ownership.xlsx b/input/reg_home_ownership.xlsx index d82fa013e..5d5fd1bca 100644 Binary files a/input/reg_home_ownership.xlsx and b/input/reg_home_ownership.xlsx differ diff --git a/input/reg_income.xlsx b/input/reg_income.xlsx index 72b68c526..b8b7039ff 100644 Binary files a/input/reg_income.xlsx and b/input/reg_income.xlsx differ diff --git a/input/reg_leave_parental_home.xlsx b/input/reg_leave_parental_home.xlsx index 11e6dd5c5..88afbaf9d 100644 Binary files a/input/reg_leave_parental_home.xlsx and b/input/reg_leave_parental_home.xlsx differ diff --git a/input/reg_partnership.xlsx b/input/reg_partnership.xlsx index ba1cdf736..c6161df48 100644 Binary files a/input/reg_partnership.xlsx and b/input/reg_partnership.xlsx differ diff --git a/input/reg_retirement.xlsx b/input/reg_retirement.xlsx index 7ac0e8f30..1fc41b9ff 100644 Binary files a/input/reg_retirement.xlsx and b/input/reg_retirement.xlsx differ diff --git a/input/reg_socialcare.xlsx b/input/reg_socialcare.xlsx index c00403ecb..b8af56164 100644 Binary files a/input/reg_socialcare.xlsx and b/input/reg_socialcare.xlsx differ diff --git 
a/input/reg_wages.xlsx b/input/reg_wages.xlsx index fb1c5954e..a9a94b438 100644 Binary files a/input/reg_wages.xlsx and b/input/reg_wages.xlsx differ