Skip to content

Commit 7106313

Browse files
authored
Merge pull request #217 from datakind/fix/pdp-course-handling-duplicates
fix: fix duplicate-handling step in validation
2 parents faebf9e + 652df89 commit 7106313

File tree

2 files changed

+29
-16
lines changed

2 files changed

+29
-16
lines changed

src/webapp/validation.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import logging
2020
import tempfile
2121
from contextlib import contextmanager
22-
from functools import lru_cache, partial
22+
from functools import lru_cache
2323
from typing import (
2424
Any,
2525
BinaryIO,
@@ -46,6 +46,19 @@
4646
# Type for PDP converter functions (DataFrame -> DataFrame); used for cohort/course.
4747
PDPConverterFunc = Optional[Callable[[pd.DataFrame], pd.DataFrame]]
4848

49+
50+
def _default_pdp_course_duplicate_converter(df: pd.DataFrame) -> pd.DataFrame:
51+
"""
52+
PDP course duplicate cleanup for read_raw_pdp_course_data.
53+
54+
Passes the schema selector as the second *positional* argument so this works
55+
with current edvise (``schema_type``) and older builds that used the same slot
56+
for ``school_type``. Do not pass bare ``handling_duplicates`` as a converter:
57+
read_raw_pdp_course_data calls ``converter_func(df)`` with a single argument.
58+
"""
59+
return handling_duplicates(df, "pdp")
60+
61+
4962
# --------------------------------------------------------------------------- #
5063
# Logging
5164
# --------------------------------------------------------------------------- #
@@ -867,9 +880,8 @@ def _read_pdp_course_edvise(
867880
868881
Tries each datetime format with each converter. If a custom
869882
course_converter_func is provided (e.g. from a school), it is tried first;
870-
then the default handling_duplicates(..., school_type="pdp"), then
871-
handling_duplicates for older edvise. Raises HardValidationError if all
872-
attempts fail.
883+
then :func:`_default_pdp_course_duplicate_converter` (``handling_duplicates``
884+
with PDP settings). Raises HardValidationError if all attempts fail.
873885
874886
Args:
875887
path: Path to course CSV.
@@ -882,10 +894,7 @@ def _read_pdp_course_edvise(
882894
Raises:
883895
HardValidationError: If no (converter, format) pair succeeded.
884896
"""
885-
default_converters = (
886-
partial(handling_duplicates, school_type="pdp"),
887-
handling_duplicates,
888-
)
897+
default_converters = (_default_pdp_course_duplicate_converter,)
889898
converters = (
890899
(course_converter_func,) if course_converter_func is not None else ()
891900
) + default_converters
@@ -903,7 +912,7 @@ def _read_pdp_course_edvise(
903912
except ValueError as e:
904913
last_error = e
905914
except TypeError as e:
906-
if "school_type" in str(e):
915+
if "school_type" in str(e) or "schema_type" in str(e):
907916
last_error = None
908917
break
909918
raise

src/webapp/validation_pdp_read_path_test.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -366,20 +366,24 @@ def test_read_pdp_course_edvise_all_attempts_fail_raises_hard_validation_error()
366366
)
367367

368368

369-
def test_read_pdp_course_edvise_typeerror_school_type_tries_next_converter() -> None:
370-
"""When first converter raises TypeError with school_type, second converter is tried."""
369+
def test_read_pdp_course_edvise_falls_back_after_custom_converter_fails() -> None:
370+
"""When custom converter fails all datetime formats, default PDP converter is used."""
371371
expected = pd.DataFrame({"course_id": ["c1"]})
372372
with patch(
373373
"src.webapp.validation.read_raw_pdp_course_data",
374374
side_effect=[
375-
TypeError(
376-
"handling_duplicates() got an unexpected keyword argument 'school_type'"
377-
),
375+
ValueError("bad datetime"),
376+
ValueError("bad datetime"),
377+
ValueError("bad datetime"),
378378
expected,
379379
],
380-
):
381-
result = _read_pdp_course_edvise("/path.csv")
380+
) as mock_read:
381+
result = _read_pdp_course_edvise(
382+
"/path.csv",
383+
course_converter_func=lambda df: df, # noqa: ARG005
384+
)
382385
pd.testing.assert_frame_equal(result, expected)
386+
assert mock_read.call_count == 4
383387

384388

385389
def test_read_pdp_course_edvise_custom_converter_tried_first() -> None:

0 commit comments

Comments
 (0)