From 041e4c2bb6d9a0eb8808af72fa255c11c8a8c853 Mon Sep 17 00:00:00 2001
From: Nivedita Singh <niveditasing@google.com>
Date: Wed, 20 May 2026 11:05:51 +0000
Subject: [PATCH 1/5] Log and skip bad national input files instead of raising

---
 .../national/national_1900_1970.py            | 185 +++++++++---------
 1 file changed, 95 insertions(+), 90 deletions(-)

diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
index c0fa8d619f..710b1f7f2d 100644
--- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
+++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
@@ -39,93 +39,94 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
     final_df = pd.DataFrame()
     final_df2 = pd.DataFrame()
     for file in ip_files:
-
-        filename = file
-        if ".csv" in filename:
-            # Extract year from the url
-            year = filename[-8:-4]
-
-            # comparing the year value as schema is chaning from 1959
-            if int(year) < 1960:
-
-                # reading the csv format input file
-                # and converting it to a dataframe
-                df = pd.read_csv(file)
-                #Saving file to local
-                df.to_csv(_CODEDIR + "/../input_files/" +
-                          "nationals_result_1900_1959.csv",
-                          index=False)
-
-                # providing proper column names
-                if len(df.columns) != 10:
-                    logging.error(
-                        f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}."
-                    )
-                    raise ValueError(
-                        f"Expected 10 columns, got {len(df.columns)}")
-                df.columns = [
-                    "Age", "All race total", "Count_Person_Male",
-                    "Count_Person_Female", "White Total",
-                    "Count_Person_Male_WhiteAlone",
-                    "Count_Person_Female_WhiteAlone", "Nonwhite Total",
-                    "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite"
-                ]
-
-                # dropping the unwanted columns
-                df.drop(columns=[
-                    "Age", "All race total", "White Total", "Nonwhite Total"
-                ],
-                        inplace=True)
-
-                # inserting year column to the dataframe
-                df.insert(loc=0, column='Year', value=year)
-                df = df.iloc[5:6, :]
-
-                # writing all the output to a dataframe
-                final_df = pd.concat([final_df, df], ignore_index=True)
-                final_df = final_df.sort_values('Year')
-
-            # for the years after 1960 as schema is changing
-            else:
-                # reading the csv format input file
-                # and converting it to a dataframe
-                df2 = pd.read_csv(file)
-                df2.to_csv(_CODEDIR + "/../input_files/" +
-                           "nationals_result_1960_1979.csv",
-                           index=False)
-                # providing proper column names
-                if len(df2.columns) != 13:
-                    logging.error(
-                        f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}."
-                    )
-                    raise ValueError(
-                        f"Expected 13 columns, got {len(df2.columns)}")
-                df2.columns = [
-                    "Age", "All race total", "Count_Person_Male",
-                    "Count_Person_Female", "White Total",
-                    "Count_Person_Male_WhiteAlone",
-                    "Count_Person_Female_WhiteAlone", "Black Total",
-                    "Count_Person_Male_BlackOrAfricanAmericanAlone",
-                    "Count_Person_Female_BlackOrAfricanAmericanAlone",
-                    "Other Races Total", "Count_Person_Male_OtherRaces",
-                    "Count_Person_Female_OtherRaces"
-                ]
-
-                # dropping the unwanted columns
-                df2.drop(columns=[
-                    "Age", "All race total", "White Total", "Black Total",
-                    "Other Races Total", "Count_Person_Male_OtherRaces",
-                    "Count_Person_Female_OtherRaces"
-                ],
-                         inplace=True)
-
-                # inserting year column
-                df2.insert(loc=0, column='Year', value=year)
-                df2 = df2.iloc[4:5, :]
-
-                # writing all the output to a dataframe
-                final_df2 = pd.concat([df2, final_df2], ignore_index=True)
-                final_df2 = final_df2.sort_values('Year')
+        try:
+            filename = file
+            if ".csv" in filename:
+                # Extract year from the url
+                year = filename[-8:-4]
+
+                # comparing the year value as schema is changing from 1959
+                if int(year) < 1960:
+
+                    # reading the csv format input file
+                    # and converting it to a dataframe
+                    df = pd.read_csv(file)
+                    #Saving file to local
+                    df.to_csv(_CODEDIR + "/../input_files/" +
+                              "nationals_result_1900_1959.csv",
+                              index=False)
+
+                    # providing proper column names
+                    if len(df.columns) != 10:
+                        logging.error(
+                            f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}."
+                        )
+                        continue
+                    df.columns = [
+                        "Age", "All race total", "Count_Person_Male",
+                        "Count_Person_Female", "White Total",
+                        "Count_Person_Male_WhiteAlone",
+                        "Count_Person_Female_WhiteAlone", "Nonwhite Total",
+                        "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite"
+                    ]
+
+                    # dropping the unwanted columns
+                    df.drop(columns=[
+                        "Age", "All race total", "White Total", "Nonwhite Total"
+                    ],
+                            inplace=True)
+
+                    # inserting year column to the dataframe
+                    df.insert(loc=0, column='Year', value=year)
+                    df = df.iloc[5:6, :]
+
+                    # writing all the output to a dataframe
+                    final_df = pd.concat([final_df, df], ignore_index=True)
+                    final_df = final_df.sort_values('Year')
+
+                # for the years after 1960 as schema is changing
+                else:
+                    # reading the csv format input file
+                    # and converting it to a dataframe
+                    df2 = pd.read_csv(file)
+                    df2.to_csv(_CODEDIR + "/../input_files/" +
+                               "nationals_result_1960_1979.csv",
+                               index=False)
+                    # providing proper column names
+                    if len(df2.columns) != 13:
+                        logging.error(
+                            f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}."
+                        )
+                        continue
+                    df2.columns = [
+                        "Age", "All race total", "Count_Person_Male",
+                        "Count_Person_Female", "White Total",
+                        "Count_Person_Male_WhiteAlone",
+                        "Count_Person_Female_WhiteAlone", "Black Total",
+                        "Count_Person_Male_BlackOrAfricanAmericanAlone",
+                        "Count_Person_Female_BlackOrAfricanAmericanAlone",
+                        "Other Races Total", "Count_Person_Male_OtherRaces",
+                        "Count_Person_Female_OtherRaces"
+                    ]
+
+                    # dropping the unwanted columns
+                    df2.drop(columns=[
+                        "Age", "All race total", "White Total", "Black Total",
+                        "Other Races Total", "Count_Person_Male_OtherRaces",
+                        "Count_Person_Female_OtherRaces"
+                    ],
+                             inplace=True)
+
+                    # inserting year column
+                    df2.insert(loc=0, column='Year', value=year)
+                    df2 = df2.iloc[4:5, :]
+
+                    # writing all the output to a dataframe
+                    final_df2 = pd.concat([df2, final_df2], ignore_index=True)
+                    final_df2 = final_df2.sort_values('Year')
+        except Exception as e:
+            logging.error(f"Error processing {file}: {e}")
+            continue
 
     if final_df.shape[1] > 0:
         # inserting geoId to the final dataframe
@@ -135,11 +136,14 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
 
     # removing numerics thousand seperator from the row values
     for col in final_df.columns:
-        final_df[col] = final_df[col].str.replace(",", "")
+        final_df[col] = final_df[col].astype(str).str.replace(",", "")
     for col in final_df2.columns:
-        final_df2[col] = final_df2[col].str.replace(",", "")
+        final_df2[col] = final_df2[col].astype(str).str.replace(",", "")
         if col not in ["Year", "geo_ID"]:
-            final_df2[col] = final_df2[col].astype("int")
+            try:
+                final_df2[col] = final_df2[col].astype("int")
+            except:
+                pass
 
     final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" +
                     "nationals_result_1900_1959.csv",
@@ -148,4 +152,5 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                      "nationals_result_1960_1979.csv",
                      index=False)
 
+
     return final_df.columns, final_df2.columns

From 1f2b83a5b6d12e34a13ee98a2634161c0658280a Mon Sep 17 00:00:00 2001
From: Nivedita Singh <niveditasing@google.com>
Date: Wed, 20 May 2026 11:53:46 +0000
Subject: [PATCH 2/5] Narrow bare except to ValueError/TypeError in int cast

---
 .../us_census/pep/us_pep_sexrace/national/national_1900_1970.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
index 710b1f7f2d..6490e9b1c4 100644
--- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
+++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
@@ -142,7 +142,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
         if col not in ["Year", "geo_ID"]:
             try:
                 final_df2[col] = final_df2[col].astype("int")
-            except:
+            except (ValueError, TypeError):
                 pass
 
     final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" +

From 93fb52c5fce91eb184eb648080f5b9f72d3f71e9 Mon Sep 17 00:00:00 2001
From: Nivedita Singh <niveditasing@google.com>
Date: Wed, 20 May 2026 12:21:01 +0000
Subject: [PATCH 3/5] Refactor national_1900_1970.py based on code assist
 recommendations

---
 .../national/national_1900_1970.py            | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
index 6490e9b1c4..95bfa47f2e 100644
--- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
+++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
@@ -53,7 +53,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                     df = pd.read_csv(file)
                     #Saving file to local
                     df.to_csv(_CODEDIR + "/../input_files/" +
-                              "nationals_result_1900_1959.csv",
+                              f"nationals_result_{year}.csv",
                               index=False)
 
                     # providing proper column names
@@ -82,7 +82,6 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
 
                     # writing all the output to a dataframe
                     final_df = pd.concat([final_df, df], ignore_index=True)
-                    final_df = final_df.sort_values('Year')
 
                 # for the years after 1960 as schema is changing
                 else:
@@ -90,7 +89,7 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                     # and converting it to a dataframe
                     df2 = pd.read_csv(file)
                     df2.to_csv(_CODEDIR + "/../input_files/" +
-                               "nationals_result_1960_1979.csv",
+                               f"nationals_result_{year}.csv",
                                index=False)
                     # providing proper column names
                     if len(df2.columns) != 13:
@@ -122,21 +121,28 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                     df2 = df2.iloc[4:5, :]
 
                     # writing all the output to a dataframe
-                    final_df2 = pd.concat([df2, final_df2], ignore_index=True)
-                    final_df2 = final_df2.sort_values('Year')
-        except Exception as e:
+                    final_df2 = pd.concat([final_df2, df2], ignore_index=True)
+        except (pd.errors.ParserError, pd.errors.EmptyDataError, IOError,
+                ValueError) as e:
             logging.error(f"Error processing {file}: {e}")
             continue
 
-    if final_df.shape[1] > 0:
+    if not final_df.empty:
+        final_df = final_df.sort_values('Year')
         # inserting geoId to the final dataframe
         final_df.insert(1, 'geo_ID', 'country/USA', True)
-    if final_df2.shape[1] > 0:
+    if not final_df2.empty:
+        final_df2 = final_df2.sort_values('Year')
         final_df2.insert(1, 'geo_ID', 'country/USA', True)
 
     # removing numerics thousand seperator from the row values
     for col in final_df.columns:
         final_df[col] = final_df[col].astype(str).str.replace(",", "")
+        if col not in ["Year", "geo_ID"]:
+            try:
+                final_df[col] = final_df[col].astype("int")
+            except (ValueError, TypeError):
+                pass
     for col in final_df2.columns:
         final_df2[col] = final_df2[col].astype(str).str.replace(",", "")
         if col not in ["Year", "geo_ID"]:
@@ -152,5 +158,4 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                      "nationals_result_1960_1979.csv",
                      index=False)
 
-
     return final_df.columns, final_df2.columns

From ad222d7878145c26bc7196d89e580df7198a4245 Mon Sep 17 00:00:00 2001
From: Nivedita Singh <niveditasing@google.com>
Date: Wed, 20 May 2026 12:36:00 +0000
Subject: [PATCH 4/5] Use pd.to_numeric for more robust numeric cleaning in
 national_1900_1970.py

---
 .../national/national_1900_1970.py             | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
index 95bfa47f2e..1c280badc0 100644
--- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
+++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
@@ -135,21 +135,17 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
         final_df2 = final_df2.sort_values('Year')
         final_df2.insert(1, 'geo_ID', 'country/USA', True)
 
-    # removing numerics thousand seperator from the row values
+    # removing numeric thousands separator from the row values and converting to numeric
     for col in final_df.columns:
-        final_df[col] = final_df[col].astype(str).str.replace(",", "")
         if col not in ["Year", "geo_ID"]:
-            try:
-                final_df[col] = final_df[col].astype("int")
-            except (ValueError, TypeError):
-                pass
+            final_df[col] = pd.to_numeric(final_df[col].astype(str).str.replace(
+                ",", ""),
+                                          errors='coerce')
     for col in final_df2.columns:
-        final_df2[col] = final_df2[col].astype(str).str.replace(",", "")
         if col not in ["Year", "geo_ID"]:
-            try:
-                final_df2[col] = final_df2[col].astype("int")
-            except (ValueError, TypeError):
-                pass
+            final_df2[col] = pd.to_numeric(final_df2[col].astype(str).str.replace(
+                ",", ""),
+                                           errors='coerce')
 
     final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" +
                     "nationals_result_1900_1959.csv",

From f6652dc2d894c72b422ae8756a8d60bee5ef5878 Mon Sep 17 00:00:00 2001
From: Nivedita Singh <niveditasing@google.com>
Date: Wed, 20 May 2026 12:48:42 +0000
Subject: [PATCH 5/5] Reformat long lines in national_1900_1970.py

---
 .../pep/us_pep_sexrace/national/national_1900_1970.py    | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
index 1c280badc0..57bc6c83a5 100644
--- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
+++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
@@ -67,7 +67,8 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                         "Count_Person_Female", "White Total",
                         "Count_Person_Male_WhiteAlone",
                         "Count_Person_Female_WhiteAlone", "Nonwhite Total",
-                        "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite"
+                        "Count_Person_Male_NonWhite",
+                        "Count_Person_Female_NonWhite"
                     ]
 
                     # dropping the unwanted columns
@@ -143,9 +144,9 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
                                           errors='coerce')
     for col in final_df2.columns:
         if col not in ["Year", "geo_ID"]:
-            final_df2[col] = pd.to_numeric(final_df2[col].astype(str).str.replace(
-                ",", ""),
-                                           errors='coerce')
+            final_df2[col] = pd.to_numeric(
+                final_df2[col].astype(str).str.replace(",", ""),
+                errors='coerce')
 
     final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" +
                     "nationals_result_1900_1959.csv",