Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 101 additions & 94 deletions scripts/us_census/pep/us_pep_sexrace/national/national_1900_1970.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,107 +39,114 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
final_df = pd.DataFrame()
final_df2 = pd.DataFrame()
for file in ip_files:

filename = file
if ".csv" in filename:
# Extract year from the url
year = filename[-8:-4]

# comparing the year value, as the schema changes starting in 1960
if int(year) < 1960:

# reading the csv format input file
# and converting it to a dataframe
df = pd.read_csv(file)
#Saving file to local
df.to_csv(_CODEDIR + "/../input_files/" +
"nationals_result_1900_1959.csv",
index=False)

# providing proper column names
if len(df.columns) != 10:
logging.error(
f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}."
)
raise ValueError(
f"Expected 10 columns, got {len(df.columns)}")
df.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
"Count_Person_Male_WhiteAlone",
"Count_Person_Female_WhiteAlone", "Nonwhite Total",
"Count_Person_Male_NonWhite", "Count_Person_Female_NonWhite"
]

# dropping the unwanted columns
df.drop(columns=[
"Age", "All race total", "White Total", "Nonwhite Total"
],
inplace=True)

# inserting year column to the dataframe
df.insert(loc=0, column='Year', value=year)
df = df.iloc[5:6, :]

# writing all the output to a dataframe
final_df = pd.concat([final_df, df], ignore_index=True)
final_df = final_df.sort_values('Year')

# for the years from 1960 onwards, as the schema changes
else:
# reading the csv format input file
# and converting it to a dataframe
df2 = pd.read_csv(file)
df2.to_csv(_CODEDIR + "/../input_files/" +
"nationals_result_1960_1979.csv",
index=False)
# providing proper column names
if len(df2.columns) != 13:
logging.error(
f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}."
)
raise ValueError(
f"Expected 13 columns, got {len(df2.columns)}")
df2.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
"Count_Person_Male_WhiteAlone",
"Count_Person_Female_WhiteAlone", "Black Total",
"Count_Person_Male_BlackOrAfricanAmericanAlone",
"Count_Person_Female_BlackOrAfricanAmericanAlone",
"Other Races Total", "Count_Person_Male_OtherRaces",
"Count_Person_Female_OtherRaces"
]

# dropping the unwanted columns
df2.drop(columns=[
"Age", "All race total", "White Total", "Black Total",
"Other Races Total", "Count_Person_Male_OtherRaces",
"Count_Person_Female_OtherRaces"
],
inplace=True)

# inserting year column
df2.insert(loc=0, column='Year', value=year)
df2 = df2.iloc[4:5, :]

# writing all the output to a dataframe
final_df2 = pd.concat([df2, final_df2], ignore_index=True)
final_df2 = final_df2.sort_values('Year')

if final_df.shape[1] > 0:
try:
Comment thread
niveditasing marked this conversation as resolved.
filename = file
if ".csv" in filename:
# Extract year from the url
year = filename[-8:-4]

# comparing the year value, as the schema changes starting in 1960
if int(year) < 1960:

# reading the csv format input file
# and converting it to a dataframe
df = pd.read_csv(file)
Comment thread
niveditasing marked this conversation as resolved.
#Saving file to local
df.to_csv(_CODEDIR + "/../input_files/" +
f"nationals_result_{year}.csv",
index=False)
Comment thread
niveditasing marked this conversation as resolved.
Comment thread
niveditasing marked this conversation as resolved.

# providing proper column names
if len(df.columns) != 10:
logging.error(
f"Schema mismatch for {file}: expected 10 columns, but got {len(df.columns)}."
)
continue
Comment thread
niveditasing marked this conversation as resolved.
df.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
"Count_Person_Male_WhiteAlone",
"Count_Person_Female_WhiteAlone", "Nonwhite Total",
"Count_Person_Male_NonWhite",
"Count_Person_Female_NonWhite"
]

# dropping the unwanted columns
df.drop(columns=[
"Age", "All race total", "White Total", "Nonwhite Total"
],
inplace=True)

# inserting year column to the dataframe
df.insert(loc=0, column='Year', value=year)
df = df.iloc[5:6, :]

# writing all the output to a dataframe
final_df = pd.concat([final_df, df], ignore_index=True)

# for the years from 1960 onwards, as the schema changes
else:
# reading the csv format input file
# and converting it to a dataframe
df2 = pd.read_csv(file)
Comment thread
niveditasing marked this conversation as resolved.
df2.to_csv(_CODEDIR + "/../input_files/" +
f"nationals_result_{year}.csv",
index=False)
Comment thread
niveditasing marked this conversation as resolved.
# providing proper column names
if len(df2.columns) != 13:
logging.error(
f"Schema mismatch for {file}: expected 13 columns, but got {len(df2.columns)}."
)
continue
df2.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
"Count_Person_Male_WhiteAlone",
"Count_Person_Female_WhiteAlone", "Black Total",
"Count_Person_Male_BlackOrAfricanAmericanAlone",
"Count_Person_Female_BlackOrAfricanAmericanAlone",
"Other Races Total", "Count_Person_Male_OtherRaces",
"Count_Person_Female_OtherRaces"
]

# dropping the unwanted columns
df2.drop(columns=[
"Age", "All race total", "White Total", "Black Total",
"Other Races Total", "Count_Person_Male_OtherRaces",
"Count_Person_Female_OtherRaces"
],
inplace=True)

# inserting year column
df2.insert(loc=0, column='Year', value=year)
df2 = df2.iloc[4:5, :]

# writing all the output to a dataframe
final_df2 = pd.concat([final_df2, df2], ignore_index=True)
except (pd.errors.ParserError, pd.errors.EmptyDataError, IOError,
ValueError) as e:
logging.error(f"Error processing {file}: {e}")
continue

if not final_df.empty:
final_df = final_df.sort_values('Year')
# inserting geoId to the final dataframe
final_df.insert(1, 'geo_ID', 'country/USA', True)
if final_df2.shape[1] > 0:
if not final_df2.empty:
final_df2 = final_df2.sort_values('Year')
final_df2.insert(1, 'geo_ID', 'country/USA', True)

# removing the numeric thousands separator from the row values
# removing the numeric thousands separator from the row values and converting to numeric
for col in final_df.columns:
final_df[col] = final_df[col].str.replace(",", "")
if col not in ["Year", "geo_ID"]:
final_df[col] = pd.to_numeric(final_df[col].astype(str).str.replace(
",", ""),
errors='coerce')
for col in final_df2.columns:
final_df2[col] = final_df2[col].str.replace(",", "")
if col not in ["Year", "geo_ID"]:
final_df2[col] = final_df2[col].astype("int")
final_df2[col] = pd.to_numeric(
final_df2[col].astype(str).str.replace(",", ""),
errors='coerce')

final_df.to_csv(_CODEDIR + "/../output_files/intermediate/" +
"nationals_result_1900_1959.csv",
Expand Down
Loading