-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_preprocessing.py
More file actions
68 lines (54 loc) · 2.25 KB
/
test_preprocessing.py
File metadata and controls
68 lines (54 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import annotations
import pandas as pd
from reprolab.preprocessing import DataPreprocessor, PreprocessingConfig
def test_preprocessing_handles_duplicates_missing_and_formatting() -> None:
    """Exercise the default preprocessor on a small, deliberately messy frame.

    The fixture contains one exact duplicate row (the two ``P1`` rows),
    lower-case diagnosis codes, two different date spellings, a glucose
    reading with a unit suffix, and one missing glucose reading.  The test
    expects two rows back, upper-case codes, dates beginning with
    ``YYYY-MM-DD``, no missing glucose values, and a non-empty log.
    """
    raw_columns = {
        "patient_id": ["P1", "P1", "P2"],
        "diagnosis_code": ["e11", "e11", "i10"],
        "hba1c_pct": [8.0, 8.0, 5.6],
        "event_date": ["2026/01/01", "2026/01/01", "01-03-2026"],
        "glucose_mg_dl": ["180 mg/dL", "180 mg/dL", None],
        "adverse_event": ["yes", "yes", "NO"],
    }
    frame = pd.DataFrame(raw_columns)

    preprocessor = DataPreprocessor()
    cleaned, logs = preprocessor.process(frame)

    # The duplicated P1 row should have been collapsed.
    assert len(cleaned) == 2

    codes = cleaned["diagnosis_code"]
    assert codes.str.isupper().all()

    # NOTE: ``str.match`` anchors only at the start of each value, so this
    # checks for a leading ISO-style date rather than an exact full match.
    dates = cleaned["event_date"]
    assert dates.str.match(r"\d{4}-\d{2}-\d{2}").all()

    glucose = cleaned["glucose_mg_dl"]
    assert glucose.notna().all()

    assert len(logs) > 0
def test_preprocessing_supports_knn_numeric_imputation() -> None:
    """Check that the ``"knn"`` numeric strategy fills a missing HbA1c value.

    The row labelled ``2`` has no ``hba1c_pct``; its glucose and weight sit
    between those of the first two rows, whose HbA1c values are 8.0 and 8.2.
    With ``knn_neighbors=2`` the test expects the filled value to land in
    that closed interval, the column to have no missing values left, and at
    least one log record for ``hba1c_pct`` whose rationale mentions
    "KNN imputation".
    """
    raw_columns = {
        "patient_id": ["P1", "P2", "P3", "P4"],
        "diagnosis_code": ["e11", "e11", "e11", "i10"],
        "hba1c_pct": [8.0, 8.2, None, 5.5],
        "glucose_mg_dl": [180.0, 182.0, 181.0, 95.0],
        "weight_kg": [82.0, 84.0, 83.0, 60.0],
        "adverse_event": ["yes", "no", "yes", "no"],
    }
    frame = pd.DataFrame(raw_columns)

    knn_config = PreprocessingConfig(numeric_imputation_strategy="knn", knn_neighbors=2)
    cleaned, logs = DataPreprocessor(knn_config).process(frame)

    hba1c = cleaned["hba1c_pct"]

    # Label-based lookup: relies on the original index label 2 surviving
    # preprocessing.
    imputed_value = float(cleaned.loc[2, "hba1c_pct"])
    assert 8.0 <= imputed_value <= 8.2
    assert hba1c.notna().all()

    # Kept lazy so ``any`` stops at the first matching rationale.
    hba1c_rationales = (
        record.rationale for record in logs if record.column == "hba1c_pct"
    )
    assert any("KNN imputation" in rationale for rationale in hba1c_rationales)
def test_preprocessing_benchmark_includes_knn_strategy() -> None:
    """Check that the benchmark output lists both reprolab strategies.

    Runs ``run_preprocessing_benchmark`` on a three-row frame with one
    missing ``hba1c_pct`` value and expects the ``"strategy"`` column of the
    result to contain ``"reprolab_median"`` and ``"reprolab_knn"``.
    """
    # Imported inside the test, as in the original, so the benchmark module
    # is only loaded when this test runs.
    from reprolab.simulation.benchmark import run_preprocessing_benchmark

    raw_columns = {
        "patient_id": ["P1", "P2", "P3"],
        "diagnosis_code": ["e11", "e11", "i10"],
        "hba1c_pct": [8.1, None, 5.4],
        "glucose_mg_dl": [175.0, 176.0, 92.0],
        "adverse_event": ["yes", "no", "no"],
    }
    frame = pd.DataFrame(raw_columns)

    benchmark = run_preprocessing_benchmark(frame)

    reported_strategies = set(benchmark["strategy"])
    for expected_strategy in ("reprolab_median", "reprolab_knn"):
        assert expected_strategy in reported_strategies