From 041e4c2bb6d9a0eb8808af72fa255c11c8a8c853 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 20 May 2026 11:05:51 +0000 Subject: [PATCH 1/5] code fix --- .../national/national_1900_1970.py | 185 +++++++++--------- 1 file changed, 95 insertions(+), 90 deletions(-) diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index c0fa8d619f..710b1f7f2d 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -39,93 +39,94 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: final_df = pd.DataFrame() final_df2 = pd.DataFrame() for file in ip_files: - - filename = file - if ".csv" in filename: - # Extract year from the url - year = filename[-8:-4] - - # comparing the year value as schema is chaning from 1959 - if int(year) < 1960: - - # reading the csv format input file - # and converting it to a dataframe - df = pd.read_csv(file) - #Saving file to local - df.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1900_1959.csv", - index=False) - - # providing proper column names - if len(df.columns) != 10: - logging.error( - f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}." - ) - raise ValueError( - f"Expected 10 columns, got {len(df.columns)}") - df.columns = [ - "Age", "All race total", "Count_Person_Male", - "Count_Person_Female", "White Total", - "Count_Person_Male_WhiteAlone", - "Count_Person_Female_WhiteAlone", "Nonwhite Total", - "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite" - ] - - # dropping the unwanted columns - df.drop(columns=[ - "Age", "All race total", "White Total", "Nonwhite Total" - ], - inplace=True) - - # inserting year column to the dataframe - df.insert(loc=0, column='Year', value=year) - df = df.iloc[5:6, :] - - # writing all the output to a dataframe - final_df = pd.concat([final_df, df], ignore_index=True) - final_df = final_df.sort_values('Year') - - # for the years after 1960 as schema is changing - else: - # reading the csv format input file - # and converting it to a dataframe - df2 = pd.read_csv(file) - df2.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1960_1979.csv", - index=False) - # providing proper column names - if len(df2.columns) != 13: - logging.error( - f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}." - ) - raise ValueError( - f"Expected 13 columns, got {len(df2.columns)}") - df2.columns = [ - "Age", "All race total", "Count_Person_Male", - "Count_Person_Female", "White Total", - "Count_Person_Male_WhiteAlone", - "Count_Person_Female_WhiteAlone", "Black Total", - "Count_Person_Male_BlackOrAfricanAmericanAlone", - "Count_Person_Female_BlackOrAfricanAmericanAlone", - "Other Races Total", "Count_Person_Male_OtherRaces", - "Count_Person_Female_OtherRaces" - ] - - # dropping the unwanted columns - df2.drop(columns=[ - "Age", "All race total", "White Total", "Black Total", - "Other Races Total", "Count_Person_Male_OtherRaces", - "Count_Person_Female_OtherRaces" - ], - inplace=True) - - # inserting year column - df2.insert(loc=0, column='Year', value=year) - df2 = df2.iloc[4:5, :] - - # writing all the output to a dataframe - final_df2 = pd.concat([df2, final_df2], ignore_index=True) - final_df2 = final_df2.sort_values('Year') + try: + filename = file + if ".csv" in filename: + # Extract year from the url + year = filename[-8:-4] + + # comparing the year value as schema is chaning from 1959 + if int(year) < 1960: + + # reading the csv format input file + # and converting it to a dataframe + df = pd.read_csv(file) + #Saving file to local + df.to_csv(_CODEDIR + "/../input_files/" + + "nationals_result_1900_1959.csv", + index=False) + + # providing proper column names + if len(df.columns) != 10: + logging.error( + f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}." + ) + continue + df.columns = [ + "Age", "All race total", "Count_Person_Male", + "Count_Person_Female", "White Total", + "Count_Person_Male_WhiteAlone", + "Count_Person_Female_WhiteAlone", "Nonwhite Total", + "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite" + ] + + # dropping the unwanted columns + df.drop(columns=[ + "Age", "All race total", "White Total", "Nonwhite Total" + ], + inplace=True) + + # inserting year column to the dataframe + df.insert(loc=0, column='Year', value=year) + df = df.iloc[5:6, :] + + # writing all the output to a dataframe + final_df = pd.concat([final_df, df], ignore_index=True) + final_df = final_df.sort_values('Year') + + # for the years after 1960 as schema is changing + else: + # reading the csv format input file + # and converting it to a dataframe + df2 = pd.read_csv(file) + df2.to_csv(_CODEDIR + "/../input_files/" + + "nationals_result_1960_1979.csv", + index=False) + # providing proper column names + if len(df2.columns) != 13: + logging.error( + f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}." + ) + continue + df2.columns = [ + "Age", "All race total", "Count_Person_Male", + "Count_Person_Female", "White Total", + "Count_Person_Male_WhiteAlone", + "Count_Person_Female_WhiteAlone", "Black Total", + "Count_Person_Male_BlackOrAfricanAmericanAlone", + "Count_Person_Female_BlackOrAfricanAmericanAlone", + "Other Races Total", "Count_Person_Male_OtherRaces", + "Count_Person_Female_OtherRaces" + ] + + # dropping the unwanted columns + df2.drop(columns=[ + "Age", "All race total", "White Total", "Black Total", + "Other Races Total", "Count_Person_Male_OtherRaces", + "Count_Person_Female_OtherRaces" + ], + inplace=True) + + # inserting year column + df2.insert(loc=0, column='Year', value=year) + df2 = df2.iloc[4:5, :] + + # writing all the output to a dataframe + final_df2 = pd.concat([df2, final_df2], ignore_index=True) + final_df2 = final_df2.sort_values('Year') + except Exception as e: + logging.error(f"Error processing {file}: {e}") + continue if final_df.shape[1] > 0: # inserting geoId to the final dataframe @@ -135,11 +136,14 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: # removing numerics thousand seperator from the row values for col in final_df.columns: - final_df[col] = final_df[col].str.replace(",", "") + final_df[col] = final_df[col].astype(str).str.replace(",", "") for col in final_df2.columns: - final_df2[col] = final_df2[col].str.replace(",", "") + final_df2[col] = final_df2[col].astype(str).str.replace(",", "") if col not in ["Year", "geo_ID"]: - final_df2[col] = final_df2[col].astype("int") + try: + final_df2[col] = final_df2[col].astype("int") + except: + pass final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" + "nationals_result_1900_1959.csv", @@ -148,4 +152,5 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: "nationals_result_1960_1979.csv", index=False) + return final_df.columns, final_df2.columns From 1f2b83a5b6d12e34a13ee98a2634161c0658280a Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 20 May 2026 11:53:46 +0000 Subject: [PATCH 2/5] code fix --- .../us_census/pep/us_pep_sexrace/national/national_1900_1970.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index 710b1f7f2d..6490e9b1c4 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -142,7 +142,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: if col not in ["Year", "geo_ID"]: try: final_df2[col] = final_df2[col].astype("int") - except: + except (ValueError, TypeError): pass final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" + From 93fb52c5fce91eb184eb648080f5b9f72d3f71e9 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 20 May 2026 12:21:01 +0000 Subject: [PATCH 3/5] Refactor national_1900_1970.py based on code assist recommendations --- .../national/national_1900_1970.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index 6490e9b1c4..95bfa47f2e 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -53,7 +53,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: df = pd.read_csv(file) #Saving file to local df.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1900_1959.csv", + f"nationals_result_{year}.csv", index=False) # providing proper column names @@ -82,7 +82,6 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: # writing all the output to a dataframe final_df = pd.concat([final_df, df], ignore_index=True) - final_df = final_df.sort_values('Year') # for the years after 1960 as schema is changing else: @@ -90,7 +89,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: # and converting it to a dataframe df2 = pd.read_csv(file) df2.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1960_1979.csv", + f"nationals_result_{year}.csv", index=False) # providing proper column names if len(df2.columns) != 13: @@ -122,21 +121,28 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: df2 = df2.iloc[4:5, :] # writing all the output to a dataframe - final_df2 = pd.concat([df2, final_df2], ignore_index=True) - final_df2 = final_df2.sort_values('Year') - except Exception as e: + final_df2 = pd.concat([final_df2, df2], ignore_index=True) + except (pd.errors.ParserError, pd.errors.EmptyDataError, IOError, + ValueError) as e: logging.error(f"Error processing {file}: {e}") continue - if final_df.shape[1] > 0: + if not final_df.empty: + final_df = final_df.sort_values('Year') # inserting geoId to the final dataframe final_df.insert(1, 'geo_ID', 'country/USA', True) - if final_df2.shape[1] > 0: + if not final_df2.empty: + final_df2 = final_df2.sort_values('Year') final_df2.insert(1, 'geo_ID', 'country/USA', True) # removing numerics thousand seperator from the row values for col in final_df.columns: final_df[col] = final_df[col].astype(str).str.replace(",", "") + if col not in ["Year", "geo_ID"]: + try: + final_df[col] = final_df[col].astype("int") + except (ValueError, TypeError): + pass for col in final_df2.columns: final_df2[col] = final_df2[col].astype(str).str.replace(",", "") if col not in ["Year", "geo_ID"]: @@ -152,5 +158,4 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: "nationals_result_1960_1979.csv", index=False) - return final_df.columns, final_df2.columns From ad222d7878145c26bc7196d89e580df7198a4245 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 20 May 2026 12:36:00 +0000 Subject: [PATCH 4/5] Use pd.to_numeric for more robust numeric cleaning in national_1900_1970.py --- .../national/national_1900_1970.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index 95bfa47f2e..1c280badc0 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -135,21 +135,17 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: final_df2 = final_df2.sort_values('Year') final_df2.insert(1, 'geo_ID', 'country/USA', True) - # removing numerics thousand seperator from the row values + # removing numerics thousand seperator from the row values and converting to numeric for col in final_df.columns: - final_df[col] = final_df[col].astype(str).str.replace(",", "") if col not in ["Year", "geo_ID"]: - try: - final_df[col] = final_df[col].astype("int") - except (ValueError, TypeError): - pass + final_df[col] = pd.to_numeric(final_df[col].astype(str).str.replace( + ",", ""), + errors='coerce') for col in final_df2.columns: - final_df2[col] = final_df2[col].astype(str).str.replace(",", "") if col not in ["Year", "geo_ID"]: - try: - final_df2[col] = final_df2[col].astype("int") - except (ValueError, TypeError): - pass + final_df2[col] = pd.to_numeric(final_df2[col].astype(str).str.replace( + ",", ""), + errors='coerce') final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" + "nationals_result_1900_1959.csv", From f6652dc2d894c72b422ae8756a8d60bee5ef5878 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 20 May 2026 12:48:42 +0000 Subject: [PATCH 5/5] code fix --- .../pep/us_pep_sexrace/national/national_1900_1970.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index 1c280badc0..57bc6c83a5 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -67,7 +67,8 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: "Count_Person_Female", "White Total", "Count_Person_Male_WhiteAlone", "Count_Person_Female_WhiteAlone", "Nonwhite Total", - "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite" + "Count_Person_Male_NonWhite", + "Count_Person_Female_NonWhite" ] # dropping the unwanted columns @@ -143,9 +144,9 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: errors='coerce') for col in final_df2.columns: if col not in ["Year", "geo_ID"]: - final_df2[col] = pd.to_numeric(final_df2[col].astype(str).str.replace( - ",", ""), - errors='coerce') + final_df2[col] = pd.to_numeric( + final_df2[col].astype(str).str.replace(",", ""), + errors='coerce') final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" + "nationals_result_1900_1959.csv",