33Review and adapt before production use.
44"""
55
6- from sklearn .preprocessing import OneHotEncoder , OrdinalEncoder
7- from sklearn .preprocessing import RobustScaler
8- import numpy as np
96import pandas as pd
107
118
@@ -14,51 +11,51 @@ def apply_fixes(df):
1411 df = df .copy ()
1512
1613 # Column 'Cabin' has 77% missing values
17- df = df .drop (columns = [' Cabin' ])
14+ df = df .drop (columns = [" Cabin" ])
1815
1916 # Frequency encode high-cardinality column 'Name'
20- freq_Name = df [' Name' ].value_counts (normalize = True )
21- df [' Name_encoded' ] = df [' Name' ].map (freq_Name )
17+ freq_Name = df [" Name" ].value_counts (normalize = True )
18+ df [" Name_encoded" ] = df [" Name" ].map (freq_Name )
2219
2320 # Frequency encode high-cardinality column 'Ticket'
24- freq_Ticket = df [' Ticket' ].value_counts (normalize = True )
25- df [' Ticket_encoded' ] = df [' Ticket' ].map (freq_Ticket )
21+ freq_Ticket = df [" Ticket" ].value_counts (normalize = True )
22+ df [" Ticket_encoded" ] = df [" Ticket" ].map (freq_Ticket )
2623
2724 # Clip outliers in 'Fare' using IQR method
28- q1_Fare , q3_Fare = df [' Fare' ].quantile ([0.25 , 0.75 ])
25+ q1_Fare , q3_Fare = df [" Fare" ].quantile ([0.25 , 0.75 ])
2926 iqr_Fare = q3_Fare - q1_Fare
3027 lower_Fare , upper_Fare = q1_Fare - 1.5 * iqr_Fare , q3_Fare + 1.5 * iqr_Fare
31- df [' Fare' ] = df [' Fare' ].clip (lower = lower_Fare , upper = upper_Fare )
28+ df [" Fare" ] = df [" Fare" ].clip (lower = lower_Fare , upper = upper_Fare )
3229
3330 # Clip outliers in 'Parch' using IQR method
34- q1_Parch , q3_Parch = df [' Parch' ].quantile ([0.25 , 0.75 ])
31+ q1_Parch , q3_Parch = df [" Parch" ].quantile ([0.25 , 0.75 ])
3532 iqr_Parch = q3_Parch - q1_Parch
3633 lower_Parch , upper_Parch = q1_Parch - 1.5 * iqr_Parch , q3_Parch + 1.5 * iqr_Parch
37- df [' Parch' ] = df [' Parch' ].clip (lower = lower_Parch , upper = upper_Parch )
34+ df [" Parch" ] = df [" Parch" ].clip (lower = lower_Parch , upper = upper_Parch )
3835
3936 # Clip outliers in 'SibSp' using IQR method
40- q1_SibSp , q3_SibSp = df [' SibSp' ].quantile ([0.25 , 0.75 ])
37+ q1_SibSp , q3_SibSp = df [" SibSp" ].quantile ([0.25 , 0.75 ])
4138 iqr_SibSp = q3_SibSp - q1_SibSp
4239 lower_SibSp , upper_SibSp = q1_SibSp - 1.5 * iqr_SibSp , q3_SibSp + 1.5 * iqr_SibSp
43- df [' SibSp' ] = df [' SibSp' ].clip (lower = lower_SibSp , upper = upper_SibSp )
40+ df [" SibSp" ] = df [" SibSp" ].clip (lower = lower_SibSp , upper = upper_SibSp )
4441
4542 # Drop highly correlated column 'Survived,Sex'
46- df = df .drop (columns = [' Survived,Sex' ])
43+ df = df .drop (columns = [" Survived,Sex" ])
4744
4845 return df
4946
5047
51- if __name__ == ' __main__' :
48+ if __name__ == " __main__" :
5249 import sys
5350
5451 if len (sys .argv ) < 2 :
55- print (' Usage: python fixes.py <input.csv> [output.csv]' )
52+ print (" Usage: python fixes.py <input.csv> [output.csv]" )
5653 sys .exit (1 )
5754
5855 input_file = sys .argv [1 ]
59- output_file = sys .argv [2 ] if len (sys .argv ) > 2 else ' cleaned_data.csv'
56+ output_file = sys .argv [2 ] if len (sys .argv ) > 2 else " cleaned_data.csv"
6057
6158 df = pd .read_csv (input_file )
6259 df_clean = apply_fixes (df )
6360 df_clean .to_csv (output_file , index = False )
64- print (f' Cleaned data saved to { output_file } ' )
61+ print (f" Cleaned data saved to { output_file } " )
0 commit comments