Skip to content

Commit b4ba00b

Browse files
author
Jean-Francois Zinque
authored
Reuse coerce in engines.utils (#645)
* reuse coerce logic in engines.utils
* add test_coerce_error
* rename coerce to try_coerce and _coerce to coerce
1 parent 70d55ee commit b4ba00b

File tree

8 files changed

+69
-20
lines changed

8 files changed

+69
-20
lines changed

pandera/dtypes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@ def coerce(self, data_container: Any):
3131
"""Coerce data container to the data type."""
3232
raise NotImplementedError()
3333

34+
def try_coerce(self, data_container: Any):
35+
"""Coerce data container to the data type,
36+
raising a `~pandera.errors.ParserError` if the coercion fails.
37+
38+
:raises: :class:`~pandera.errors.ParserError`: if coercion fails
39+
"""
40+
raise NotImplementedError()
41+
3442
def __call__(self, data_container: Any):
3543
"""Coerce data container to the data type."""
3644
return self.coerce(data_container)

pandera/engines/numpy_engine.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,16 @@ def __post_init__(self):
4343
self, "type", np.dtype(self.type)
4444
) # pragma: no cover
4545

46-
def coerce(
46+
def coerce(self, data_container: PandasObject) -> PandasObject:
47+
"""Pure coerce without catching exceptions."""
48+
return data_container.astype(self.type)
49+
50+
def try_coerce(
4751
self, data_container: Union[PandasObject, np.ndarray]
4852
) -> Union[PandasObject, np.ndarray]:
4953
try:
50-
return data_container.astype(self.type)
51-
except (ValueError, TypeError) as exc:
54+
return self.coerce(data_container)
55+
except Exception as exc: # pylint:disable=broad-except
5256
raise errors.ParserError(
5357
f"Could not coerce {type(data_container)} data_container "
5458
f"into type {self.type}",

pandera/engines/pandas_engine.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,18 @@ def __post_init__(self):
7272
) # pragma: no cover
7373

7474
def coerce(self, data_container: PandasObject) -> PandasObject:
75+
"""Pure coerce without catching exceptions."""
76+
return data_container.astype(self.type)
77+
78+
def try_coerce(self, data_container: PandasObject) -> PandasObject:
7579
try:
76-
return data_container.astype(self.type)
77-
except (ValueError, TypeError) as exc:
80+
return self.coerce(data_container)
81+
except Exception as exc: # pylint:disable=broad-except
7882
raise errors.ParserError(
7983
f"Could not coerce {type(data_container)} data_container "
8084
f"into type {self.type}",
8185
failure_cases=utils.numpy_pandas_coerce_failure_cases(
82-
data_container, self.type
86+
data_container, self
8387
),
8488
) from exc
8589

@@ -376,9 +380,7 @@ class Category(DataType, dtypes.Category):
376380
type: pd.CategoricalDtype = dataclasses.field(default=None, init=False)
377381

378382
def __init__( # pylint:disable=super-init-not-called
379-
self,
380-
categories: Optional[Iterable[Any]] = None,
381-
ordered: bool = False,
383+
self, categories: Optional[Iterable[Any]] = None, ordered: bool = False
382384
) -> None:
383385
dtypes.Category.__init__(self, categories, ordered)
384386
object.__setattr__(

pandera/engines/utils.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ def numpy_pandas_coercible(series: pd.Series, type_: Any) -> pd.Series:
1515
1616
Bisects the series until all the failure cases are found.
1717
"""
18+
# pylint: disable=import-outside-toplevel,cyclic-import
19+
from pandera.engines import pandas_engine
20+
21+
data_type = pandas_engine.Engine.dtype(type_)
1822

1923
def _bisect(series):
2024
assert (
@@ -25,9 +29,9 @@ def _bisect(series):
2529

2630
def _coercible(series):
2731
try:
28-
series.astype(type_)
32+
data_type.coerce(series)
2933
return True
30-
except (ValueError, TypeError):
34+
except Exception: # pylint:disable=broad-except
3135
return False
3236

3337
search_list = [series] if series.size == 1 else _bisect(series)
@@ -60,6 +64,9 @@ def numpy_pandas_coerce_failure_cases(
6064
"""
6165
# pylint: disable=import-outside-toplevel,cyclic-import
6266
from pandera import error_formatters
67+
from pandera.engines import pandas_engine
68+
69+
data_type = pandas_engine.Engine.dtype(type_)
6370

6471
if isinstance(data_container, np.ndarray):
6572
if len(data_container.shape) == 1:
@@ -76,16 +83,15 @@ def numpy_pandas_coerce_failure_cases(
7683

7784
if isinstance(data_container, pd.DataFrame):
7885
check_output = data_container.apply(
79-
numpy_pandas_coercible,
80-
args=(type_,),
86+
numpy_pandas_coercible, args=(data_type,)
8187
)
8288
_, failure_cases = check_utils.prepare_dataframe_check_output(
8389
data_container,
8490
check_output,
8591
ignore_na=False,
8692
)
8793
elif isinstance(data_container, pd.Series):
88-
check_output = numpy_pandas_coercible(data_container, type_)
94+
check_output = numpy_pandas_coercible(data_container, data_type)
8995
_, failure_cases = check_utils.prepare_series_check_output(
9096
data_container,
9197
check_output,

pandera/schema_components.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def validate(
376376
check_obj.index = self.coerce_dtype(check_obj.index)
377377
# handles case where pandas native string type is not supported
378378
# by index.
379-
obj_to_validate = self.dtype.coerce(
379+
obj_to_validate = self.dtype.try_coerce(
380380
pd.Series(check_obj.index, name=check_obj.index.name)
381381
)
382382
else:

pandera/schemas.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def _coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
326326
)
327327

328328
try:
329-
return self.dtype.coerce(obj)
329+
return self.dtype.try_coerce(obj)
330330
except errors.ParserError as exc:
331331
raise errors.SchemaError(
332332
self,
@@ -1689,7 +1689,7 @@ def coerce_dtype(self, obj: Union[pd.Series, pd.Index]) -> pd.Series:
16891689
return obj
16901690

16911691
try:
1692-
return self.dtype.coerce(obj)
1692+
return self.dtype.try_coerce(obj)
16931693
except errors.ParserError as exc:
16941694
msg = (
16951695
f"Error while coercing '{self.name}' to type "

tests/core/test_dtypes.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,36 @@ def test_coerce_cast(dtypes, examples, data):
363363
assert expected_dtype.check(pandas_engine.Engine.dtype(coerced_dtype))
364364

365365

366+
@pytest.mark.parametrize(
367+
"examples, type_, failure_indices",
368+
[
369+
(["a", 0, "b"], int, [0, 2]),
370+
(
371+
[
372+
"2021-09-01",
373+
datetime.datetime(2021, 1, 7),
374+
pd.NaT,
375+
"not_a_date",
376+
],
377+
datetime.datetime,
378+
[3],
379+
),
380+
],
381+
)
382+
def test_try_coerce(examples, type_, failure_indices):
383+
"""Test that try_coerce raises a ParserError."""
384+
data_type = pandas_engine.Engine.dtype(type_)
385+
data = pd.Series(examples)
386+
387+
with pytest.raises(pa.errors.ParserError):
388+
data_type.try_coerce(data)
389+
390+
try:
391+
data_type.try_coerce(data)
392+
except pa.errors.ParserError as exc:
393+
assert exc.failure_cases["index"].to_list() == failure_indices
394+
395+
366396
def test_coerce_string():
367397
"""Test that strings can be coerced."""
368398
data = pd.Series([1, None], dtype="Int32")
@@ -487,8 +517,7 @@ def test_is_float(float_dtype: Any, expected: bool):
487517

488518
@pytest.mark.parametrize(
489519
"complex_dtype, expected",
490-
[(dtype, True) for dtype in complex_dtypes]
491-
+ [("string", False)], # type: ignore
520+
[(dtype, True) for dtype in complex_dtypes] + [("string", False)], # type: ignore
492521
)
493522
def test_is_complex(complex_dtype: Any, expected: bool):
494523
"""Test is_complex."""

tests/core/test_pandas_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,6 @@ def test_pandas_data_type_coerce(data_type):
3939
# don't test data types that require parameters e.g. Category
4040
return
4141
try:
42-
data_type().coerce(pd.Series(["1", "2", "a"]))
42+
data_type().try_coerce(pd.Series(["1", "2", "a"]))
4343
except ParserError as exc:
4444
assert exc.failure_cases.shape[0] > 0

0 commit comments

Comments
 (0)