Skip to content

Commit 47b45d5

Browse files
authored
feat: add edge case tests, GitHub Actions CI, and ruff linting (#64)
* feat: add edge case tests, GitHub Actions CI, and ruff linting - Add 37 edge case tests covering empty DataFrames, constant columns, infinite values, correlation edge cases, leakage, drift, and more - Add GitHub Actions CI workflow (pytest on Python 3.10-3.12, ruff lint) - Add ruff configuration with sensible defaults for the project - Add ruff as dev dependency - Fix all ruff lint issues (unused imports, re-exports, unused vars) - Apply ruff formatting across the entire codebase * fix: resolve ruff import sorting in markdown.py
1 parent 9f63435 commit 47b45d5

54 files changed

Lines changed: 1119 additions & 826 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [main]
6+
pull_request:
7+
branches: [main]
8+
9+
jobs:
10+
test:
11+
runs-on: ubuntu-latest
12+
strategy:
13+
matrix:
14+
python-version: ["3.10", "3.11", "3.12"]
15+
16+
steps:
17+
- uses: actions/checkout@v4
18+
19+
- name: Install uv
20+
uses: astral-sh/setup-uv@v4
21+
22+
- name: Set up Python ${{ matrix.python-version }}
23+
run: uv python install ${{ matrix.python-version }}
24+
25+
- name: Install dependencies
26+
run: uv sync --dev
27+
28+
- name: Run tests
29+
run: uv run pytest tests/ -v --tb=short
30+
31+
lint:
32+
runs-on: ubuntu-latest
33+
steps:
34+
- uses: actions/checkout@v4
35+
36+
- name: Install uv
37+
uses: astral-sh/setup-uv@v4
38+
39+
- name: Set up Python
40+
run: uv python install 3.12
41+
42+
- name: Install dependencies
43+
run: uv sync --dev
44+
45+
- name: Run ruff check
46+
run: uv run ruff check .
47+
48+
- name: Run ruff format check
49+
run: uv run ruff format --check .

examples/reports/train_hashprep_report_fixes.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@
33
Review and adapt before production use.
44
"""
55

6-
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
7-
from sklearn.preprocessing import RobustScaler
8-
import numpy as np
96
import pandas as pd
107

118

@@ -14,51 +11,51 @@ def apply_fixes(df):
1411
df = df.copy()
1512

1613
# Column 'Cabin' has 77% missing values
17-
df = df.drop(columns=['Cabin'])
14+
df = df.drop(columns=["Cabin"])
1815

1916
# Frequency encode high-cardinality column 'Name'
20-
freq_Name = df['Name'].value_counts(normalize=True)
21-
df['Name_encoded'] = df['Name'].map(freq_Name)
17+
freq_Name = df["Name"].value_counts(normalize=True)
18+
df["Name_encoded"] = df["Name"].map(freq_Name)
2219

2320
# Frequency encode high-cardinality column 'Ticket'
24-
freq_Ticket = df['Ticket'].value_counts(normalize=True)
25-
df['Ticket_encoded'] = df['Ticket'].map(freq_Ticket)
21+
freq_Ticket = df["Ticket"].value_counts(normalize=True)
22+
df["Ticket_encoded"] = df["Ticket"].map(freq_Ticket)
2623

2724
# Clip outliers in 'Fare' using IQR method
28-
q1_Fare, q3_Fare = df['Fare'].quantile([0.25, 0.75])
25+
q1_Fare, q3_Fare = df["Fare"].quantile([0.25, 0.75])
2926
iqr_Fare = q3_Fare - q1_Fare
3027
lower_Fare, upper_Fare = q1_Fare - 1.5 * iqr_Fare, q3_Fare + 1.5 * iqr_Fare
31-
df['Fare'] = df['Fare'].clip(lower=lower_Fare, upper=upper_Fare)
28+
df["Fare"] = df["Fare"].clip(lower=lower_Fare, upper=upper_Fare)
3229

3330
# Clip outliers in 'Parch' using IQR method
34-
q1_Parch, q3_Parch = df['Parch'].quantile([0.25, 0.75])
31+
q1_Parch, q3_Parch = df["Parch"].quantile([0.25, 0.75])
3532
iqr_Parch = q3_Parch - q1_Parch
3633
lower_Parch, upper_Parch = q1_Parch - 1.5 * iqr_Parch, q3_Parch + 1.5 * iqr_Parch
37-
df['Parch'] = df['Parch'].clip(lower=lower_Parch, upper=upper_Parch)
34+
df["Parch"] = df["Parch"].clip(lower=lower_Parch, upper=upper_Parch)
3835

3936
# Clip outliers in 'SibSp' using IQR method
40-
q1_SibSp, q3_SibSp = df['SibSp'].quantile([0.25, 0.75])
37+
q1_SibSp, q3_SibSp = df["SibSp"].quantile([0.25, 0.75])
4138
iqr_SibSp = q3_SibSp - q1_SibSp
4239
lower_SibSp, upper_SibSp = q1_SibSp - 1.5 * iqr_SibSp, q3_SibSp + 1.5 * iqr_SibSp
43-
df['SibSp'] = df['SibSp'].clip(lower=lower_SibSp, upper=upper_SibSp)
40+
df["SibSp"] = df["SibSp"].clip(lower=lower_SibSp, upper=upper_SibSp)
4441

4542
# Drop highly correlated column 'Survived,Sex'
46-
df = df.drop(columns=['Survived,Sex'])
43+
df = df.drop(columns=["Survived,Sex"])
4744

4845
return df
4946

5047

51-
if __name__ == '__main__':
48+
if __name__ == "__main__":
5249
import sys
5350

5451
if len(sys.argv) < 2:
55-
print('Usage: python fixes.py <input.csv> [output.csv]')
52+
print("Usage: python fixes.py <input.csv> [output.csv]")
5653
sys.exit(1)
5754

5855
input_file = sys.argv[1]
59-
output_file = sys.argv[2] if len(sys.argv) > 2 else 'cleaned_data.csv'
56+
output_file = sys.argv[2] if len(sys.argv) > 2 else "cleaned_data.csv"
6057

6158
df = pd.read_csv(input_file)
6259
df_clean = apply_fixes(df)
6360
df_clean.to_csv(output_file, index=False)
64-
print(f'Cleaned data saved to {output_file}')
61+
print(f"Cleaned data saved to {output_file}")

examples/reports/train_hashprep_report_pipeline.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,27 @@
55

66
from sklearn.compose import ColumnTransformer
77
from sklearn.pipeline import Pipeline
8-
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
9-
import numpy as np
108

119

1210
def build_preprocessing_pipeline():
1311
"""Build sklearn preprocessing pipeline."""
1412

1513
transformers = [
16-
('drop_column_Cabin', 'drop', ['Cabin']),
17-
('drop_column_Survived,S', 'drop', ['Survived,Sex']),
14+
("drop_column_Cabin", "drop", ["Cabin"]),
15+
("drop_column_Survived,S", "drop", ["Survived,Sex"]),
1816
]
1917

2018
preprocessor = ColumnTransformer(
2119
transformers=transformers,
22-
remainder='passthrough',
20+
remainder="passthrough",
2321
verbose_feature_names_out=False,
2422
)
2523

26-
pipeline = Pipeline([
27-
('preprocessor', preprocessor),
28-
])
24+
pipeline = Pipeline(
25+
[
26+
("preprocessor", preprocessor),
27+
]
28+
)
2929

3030
return pipeline
3131

@@ -37,20 +37,18 @@ def get_pre_pipeline_steps():
3737
"""
3838
steps = []
3939
# Outlier clipping for ['Fare']
40-
steps.append(('clip_outliers_Fare', None)) # Implement manually
40+
steps.append(("clip_outliers_Fare", None)) # Implement manually
4141
# Outlier clipping for ['Parch']
42-
steps.append(('clip_outliers_Parch', None)) # Implement manually
42+
steps.append(("clip_outliers_Parch", None)) # Implement manually
4343
# Outlier clipping for ['SibSp']
44-
steps.append(('clip_outliers_SibSp', None)) # Implement manually
44+
steps.append(("clip_outliers_SibSp", None)) # Implement manually
4545
return steps
4646

4747

48-
if __name__ == '__main__':
49-
import joblib
50-
48+
if __name__ == "__main__":
5149
pipeline = build_preprocessing_pipeline()
5250
if pipeline:
53-
print('Pipeline created successfully')
51+
print("Pipeline created successfully")
5452
print(pipeline)
5553
# Example: Save pipeline
56-
# joblib.dump(pipeline, 'preprocessing_pipeline.joblib')
54+
# joblib.dump(pipeline, 'preprocessing_pipeline.joblib')

hashprep/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .core.analyzer import DatasetAnalyzer
1+
from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer
22

3-
__version__ = "0.1.0b1"
3+
__version__ = "0.1.0b1"

hashprep/checks/__init__.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,31 @@
1-
from typing import List, Optional
2-
3-
from .core import Issue
1+
from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns
2+
from .core import Issue as Issue
3+
from .correlations import calculate_correlations
4+
from .distribution import _check_uniform_distribution, _check_unique_values
45
from .drift import check_drift
6+
from .imbalance import _check_class_imbalance
57
from .leakage import _check_data_leakage, _check_target_leakage_patterns
6-
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, \
7-
_check_missing_patterns
8-
from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
8+
from .missing_values import (
9+
_check_dataset_missingness,
10+
_check_empty_columns,
11+
_check_high_missing_values,
12+
_check_missing_patterns,
13+
)
914
from .outliers import (
10-
_check_outliers,
11-
_check_high_zero_counts,
12-
_check_extreme_text_lengths,
13-
_check_datetime_skew,
14-
_check_skewness,
15-
_check_infinite_values,
1615
_check_constant_length,
16+
_check_datetime_skew,
1717
_check_empty_dataset,
18+
_check_extreme_text_lengths,
19+
_check_high_zero_counts,
20+
_check_infinite_values,
21+
_check_outliers,
22+
_check_skewness,
1823
)
19-
from .correlations import calculate_correlations
20-
from .imbalance import _check_class_imbalance
21-
from .distribution import _check_uniform_distribution, _check_unique_values
2224

2325

2426
def _check_dataset_drift(analyzer):
2527
"""Wrapper for drift detection that uses analyzer's comparison_df."""
26-
if hasattr(analyzer, 'comparison_df') and analyzer.comparison_df is not None:
28+
if hasattr(analyzer, "comparison_df") and analyzer.comparison_df is not None:
2729
return check_drift(analyzer.df, analyzer.comparison_df)
2830
return []
2931

@@ -56,7 +58,7 @@ def _check_dataset_drift(analyzer):
5658
CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}
5759

5860

59-
def run_checks(analyzer, checks_to_run: List[str]):
61+
def run_checks(analyzer, checks_to_run: list[str]):
6062
issues = []
6163
correlation_requested = False
6264

@@ -70,4 +72,4 @@ def run_checks(analyzer, checks_to_run: List[str]):
7072
if correlation_requested:
7173
issues.extend(calculate_correlations(analyzer))
7274

73-
return issues
75+
return issues

hashprep/checks/columns.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from .core import Issue
21
from ..config import DEFAULT_CONFIG
2+
from .core import Issue
33

44
_COL_THRESHOLDS = DEFAULT_CONFIG.columns
55

6+
67
def _check_single_value_columns(analyzer):
78
issues = []
89
for col in analyzer.df.columns:
@@ -26,7 +27,12 @@ def _check_single_value_columns(analyzer):
2627
)
2728
return issues
2829

29-
def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical):
30+
31+
def _check_high_cardinality(
32+
analyzer,
33+
threshold: int = _COL_THRESHOLDS.high_cardinality_count,
34+
critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical,
35+
):
3036
issues = []
3137
categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
3238
for col in categorical_cols:
@@ -52,6 +58,7 @@ def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_card
5258
)
5359
return issues
5460

61+
5562
def _check_duplicates(analyzer):
5663
issues = []
5764
duplicate_rows = int(analyzer.df.duplicated().sum())
@@ -76,6 +83,7 @@ def _check_duplicates(analyzer):
7683
)
7784
return issues
7885

86+
7987
def _check_mixed_data_types(analyzer):
8088
issues = []
8189
for col in analyzer.df.columns:
@@ -91,4 +99,4 @@ def _check_mixed_data_types(analyzer):
9199
quick_fix="Options: \n- Cast to single type: Ensure consistency (Pros: Simplifies processing; Cons: May lose nuance).\n- Split column: Separate types into new features (Pros: Preserves info; Cons: Adds complexity).\n- Investigate source: Check data collection errors (Pros: Improves quality; Cons: Time-consuming).",
92100
)
93101
)
94-
return issues
102+
return issues

hashprep/checks/core.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from dataclasses import dataclass
22

3-
@dataclass
43

4+
@dataclass
55
class Issue:
6-
76
category: str
87

98
severity: str # critical or warning

0 commit comments

Comments (0)