diff --git a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py index c0fa8d619f..57bc6c83a5 100644 --- a/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py +++ b/scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py @@ -39,107 +39,114 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame: final_df = pd.DataFrame() final_df2 = pd.DataFrame() for file in ip_files: - - filename = file - if ".csv" in filename: - # Extract year from the url - year = filename[-8:-4] - - # comparing the year value as schema is chaning from 1959 - if int(year) < 1960: - - # reading the csv format input file - # and converting it to a dataframe - df = pd.read_csv(file) - #Saving file to local - df.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1900_1959.csv", - index=False) - - # providing proper column names - if len(df.columns) != 10: - logging.error( - f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}." - ) - raise ValueError( - f"Expected 10 columns, got {len(df.columns)}") - df.columns = [ - "Age", "All race total", "Count_Person_Male", - "Count_Person_Female", "White Total", - "Count_Person_Male_WhiteAlone", - "Count_Person_Female_WhiteAlone", "Nonwhite Total", - "Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite" - ] - - # dropping the unwanted columns - df.drop(columns=[ - "Age", "All race total", "White Total", "Nonwhite Total" - ], - inplace=True) - - # inserting year column to the dataframe - df.insert(loc=0, column='Year', value=year) - df = df.iloc[5:6, :] - - # writing all the output to a dataframe - final_df = pd.concat([final_df, df], ignore_index=True) - final_df = final_df.sort_values('Year') - - # for the years after 1960 as schema is changing - else: - # reading the csv format input file - # and converting it to a dataframe - df2 = pd.read_csv(file) - df2.to_csv(_CODEDIR + "/../input_files/" + - "nationals_result_1960_1979.csv", - index=False) - # providing proper column names - if len(df2.columns) != 13: - logging.error( - f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}." - ) - raise ValueError( - f"Expected 13 columns, got {len(df2.columns)}") - df2.columns = [ - "Age", "All race total", "Count_Person_Male", - "Count_Person_Female", "White Total", - "Count_Person_Male_WhiteAlone", - "Count_Person_Female_WhiteAlone", "Black Total", - "Count_Person_Male_BlackOrAfricanAmericanAlone", - "Count_Person_Female_BlackOrAfricanAmericanAlone", - "Other Races Total", "Count_Person_Male_OtherRaces", - "Count_Person_Female_OtherRaces" - ] - - # dropping the unwanted columns - df2.drop(columns=[ - "Age", "All race total", "White Total", "Black Total", - "Other Races Total", "Count_Person_Male_OtherRaces", - "Count_Person_Female_OtherRaces" - ], - inplace=True) - - # inserting year column - df2.insert(loc=0, column='Year', value=year) - df2 = df2.iloc[4:5, :] - - # writing all the output to a dataframe - final_df2 = pd.concat([df2, final_df2], ignore_index=True) - final_df2 = final_df2.sort_values('Year') - - if final_df.shape[1] > 0: + try: + filename = file + if ".csv" in filename: + # Extract year from the url + year = filename[-8:-4] + + # comparing the year value as schema is chaning from 1959 + if int(year) < 1960: + + # reading the csv format input file + # and converting it to a dataframe + df = pd.read_csv(file) + #Saving file to local + df.to_csv(_CODEDIR + "/../input_files/" + + f"nationals_result_{year}.csv", + index=False) + + # providing proper column names + if len(df.columns) != 10: + logging.error( + f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}." + ) + continue + df.columns = [ + "Age", "All race total", "Count_Person_Male", + "Count_Person_Female", "White Total", + "Count_Person_Male_WhiteAlone", + "Count_Person_Female_WhiteAlone", "Nonwhite Total", + "Count_Person_Male_NonWhite", + "Count_Person_Female_NonWhite" + ] + + # dropping the unwanted columns + df.drop(columns=[ + "Age", "All race total", "White Total", "Nonwhite Total" + ], + inplace=True) + + # inserting year column to the dataframe + df.insert(loc=0, column='Year', value=year) + df = df.iloc[5:6, :] + + # writing all the output to a dataframe + final_df = pd.concat([final_df, df], ignore_index=True) + + # for the years after 1960 as schema is changing + else: + # reading the csv format input file + # and converting it to a dataframe + df2 = pd.read_csv(file) + df2.to_csv(_CODEDIR + "/../input_files/" + + f"nationals_result_{year}.csv", + index=False) + # providing proper column names + if len(df2.columns) != 13: + logging.error( + f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}." + ) + continue + df2.columns = [ + "Age", "All race total", "Count_Person_Male", + "Count_Person_Female", "White Total", + "Count_Person_Male_WhiteAlone", + "Count_Person_Female_WhiteAlone", "Black Total", + "Count_Person_Male_BlackOrAfricanAmericanAlone", + "Count_Person_Female_BlackOrAfricanAmericanAlone", + "Other Races Total", "Count_Person_Male_OtherRaces", + "Count_Person_Female_OtherRaces" + ] + + # dropping the unwanted columns + df2.drop(columns=[ + "Age", "All race total", "White Total", "Black Total", + "Other Races Total", "Count_Person_Male_OtherRaces", + "Count_Person_Female_OtherRaces" + ], + inplace=True) + + # inserting year column + df2.insert(loc=0, column='Year', value=year) + df2 = df2.iloc[4:5, :] + + # writing all the output to a dataframe + final_df2 = pd.concat([final_df2, df2], ignore_index=True) + except (pd.errors.ParserError, pd.errors.EmptyDataError, IOError, + ValueError) as e: + logging.error(f"Error processing {file}: {e}") + continue + + if not final_df.empty: + final_df = final_df.sort_values('Year') # inserting geoId to the final dataframe final_df.insert(1, 'geo_ID', 'country/USA', True) - if final_df2.shape[1] > 0: + if not final_df2.empty: + final_df2 = final_df2.sort_values('Year') final_df2.insert(1, 'geo_ID', 'country/USA', True) - # removing numerics thousand seperator from the row values + # removing numerics thousand seperator from the row values and converting to numeric for col in final_df.columns: - final_df[col] = final_df[col].str.replace(",", "") + if col not in ["Year", "geo_ID"]: + final_df[col] = pd.to_numeric(final_df[col].astype(str).str.replace( + ",", ""), + errors='coerce') for col in final_df2.columns: - final_df2[col] = final_df2[col].str.replace(",", "") if col not in ["Year", "geo_ID"]: - final_df2[col] = final_df2[col].astype("int") + final_df2[col] = pd.to_numeric( + final_df2[col].astype(str).str.replace(",", ""), + errors='coerce') final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" + "nationals_result_1900_1959.csv",