Skip to content

Commit a5e594b

Browse files
authored
Implement regex option and add additional checks (#2061)
* Run `pre-commit` on all files to fix linter issues

  Signed-off-by: Deepyaman Datta <[email protected]>

* Implement `regex` option and add additional checks

  Signed-off-by: Deepyaman Datta <[email protected]>

* Support validating set of columns matching a regex

  Signed-off-by: Deepyaman Datta <[email protected]>

---------

Signed-off-by: Deepyaman Datta <[email protected]>
1 parent a869d5b commit a5e594b

File tree

8 files changed

+292
-51
lines changed

8 files changed

+292
-51
lines changed

pandera/api/ibis/components.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Core Ibis schema component specifications."""
22

3+
import logging
34
from typing import Any, Optional, Type
45

56
import ibis
@@ -11,6 +12,8 @@
1112
from pandera.engines import ibis_engine
1213
from pandera.utils import is_regex
1314

15+
logger = logging.getLogger(__name__)
16+
1417

1518
class Column(ComponentSchema[ibis.Table]):
1619
"""Validate types and properties of table columns."""
@@ -97,7 +100,7 @@ def __init__(
97100
self.regex = regex
98101
self.name = name
99102

100-
# self.set_regex() # TODO(deepyaman): Implement method.
103+
self.set_regex()
101104

102105
# pylint: disable=unused-argument
103106
@staticmethod
@@ -118,10 +121,22 @@ def selector(self):
118121
return f"^{self.name}$"
119122
return self.name
120123

124+
def set_regex(self):
125+
if self.name is None:
126+
return
127+
128+
if is_regex(self.name) and not self.regex:
129+
logger.info(
130+
f"Column schema '{self.name}' is a regex expression. "
131+
"Setting regex=True."
132+
)
133+
self.regex = True
134+
121135
def set_name(self, name: str):
122136
"""Set or modify the name of a column object.
123137
124138
:param str name: the name of the column object
125139
"""
126140
self.name = name
141+
self.set_regex()
127142
return self

pandera/backends/ibis/components.py

Lines changed: 67 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22

33
from __future__ import annotations
44

5-
from typing import TYPE_CHECKING, List, Optional, cast
5+
from typing import TYPE_CHECKING, Iterable, List, Optional, cast
66

77
import ibis
8+
import ibis.selectors as s
89

910
from pandera.api.base.error_handler import ErrorHandler
1011
from pandera.backends.base import CoreCheckResult
@@ -36,45 +37,68 @@ def validate(
3637
"""Validation backend implementation for Ibis table columns."""
3738
error_handler = ErrorHandler(lazy)
3839

39-
# TODO(deepyaman): subsample the check object if head, tail, or sample are specified
40-
sample = check_obj[schema.name]
41-
42-
# run the checks
43-
core_checks = [
44-
self.check_dtype,
45-
self.run_checks,
46-
]
47-
48-
args = (sample, schema)
49-
for check in core_checks:
50-
results = check(*args)
51-
if isinstance(results, CoreCheckResult):
52-
results = [results]
53-
54-
for result in results:
55-
if result.passed:
56-
continue
57-
# Why cast `results` only in components.py, not in container.py?
58-
results = cast(List[CoreCheckResult], results)
59-
if result.schema_error is not None:
60-
error = result.schema_error
61-
else:
62-
error = SchemaError(
63-
schema=schema,
64-
data=check_obj,
65-
message=result.message,
66-
failure_cases=result.failure_cases,
67-
check=result.check,
68-
check_index=result.check_index,
69-
check_output=result.check_output,
70-
reason_code=result.reason_code,
71-
)
72-
error_handler.collect_error( # Why indent (unlike in container.py)?
73-
validation_type(result.reason_code),
74-
result.reason_code,
75-
error,
76-
original_exc=result.original_exc,
77-
)
40+
def validate_column(check_obj, column_name):
41+
# make sure the schema component mutations are reverted after
42+
# validation
43+
_orig_name = schema.name
44+
_orig_regex = schema.regex
45+
46+
# set the column name and regex flag for a single column
47+
schema.name = column_name
48+
schema.regex = False
49+
50+
# TODO(deepyaman): subsample the check object if head, tail, or sample are specified
51+
sample = check_obj[column_name]
52+
53+
# run the checks
54+
core_checks = [
55+
self.check_dtype,
56+
self.run_checks,
57+
]
58+
59+
args = (sample, schema)
60+
for check in core_checks:
61+
results = check(*args)
62+
if isinstance(results, CoreCheckResult):
63+
results = [results]
64+
65+
for result in results:
66+
if result.passed:
67+
continue
68+
# Why cast `results` only in components.py, not in container.py?
69+
results = cast(List[CoreCheckResult], results)
70+
if result.schema_error is not None:
71+
error = result.schema_error
72+
else:
73+
error = SchemaError(
74+
schema=schema,
75+
data=check_obj,
76+
message=result.message,
77+
failure_cases=result.failure_cases,
78+
check=result.check,
79+
check_index=result.check_index,
80+
check_output=result.check_output,
81+
reason_code=result.reason_code,
82+
)
83+
error_handler.collect_error( # Why indent (unlike in container.py)?
84+
validation_type(result.reason_code),
85+
result.reason_code,
86+
error,
87+
original_exc=result.original_exc,
88+
)
89+
90+
# revert the schema component mutations
91+
schema.name = _orig_name
92+
schema.regex = _orig_regex
93+
94+
column_keys_to_check = (
95+
self.get_regex_columns(schema, check_obj)
96+
if schema.regex
97+
else [schema.name]
98+
)
99+
100+
for column_name in column_keys_to_check:
101+
validate_column(check_obj, column_name)
78102

79103
if lazy and error_handler.collected_errors:
80104
raise SchemaErrors(
@@ -85,6 +109,9 @@ def validate(
85109

86110
return check_obj
87111

112+
def get_regex_columns(self, schema, check_obj) -> Iterable:
113+
return check_obj.select(s.matches(schema.selector)).columns
114+
88115
@validate_scope(scope=ValidationScope.SCHEMA)
89116
def check_dtype(
90117
self, check_obj: ibis.Column, schema: Column

pandera/backends/ibis/container.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from typing import TYPE_CHECKING, Any, Iterable, List, Optional
88

99
import ibis
10+
import ibis.selectors as s
11+
from ibis.common.exceptions import IbisError
1012

1113
from pandera.api.base.error_handler import ErrorHandler
1214
from pandera.config import ValidationScope
@@ -203,7 +205,7 @@ def collect_column_info(
203205
):
204206
absent_column_names.append(col_name)
205207

206-
if col_schema.regex: # TODO(deepyaman): Implement functionality.
208+
if col_schema.regex:
207209
try:
208210
column_names.extend(
209211
col_schema.get_backend(check_obj).get_regex_columns(
@@ -291,7 +293,14 @@ def check_column_presence(
291293
if column_info.absent_column_names and not schema.add_missing_columns:
292294
for colname in column_info.absent_column_names:
293295
if is_regex(colname):
294-
continue # TODO(deepyaman): Support regex colnames.
296+
try:
297+
# don't raise an error if the column schema name is a
298+
# regex pattern
299+
check_obj.select(s.matches(colname))
300+
continue
301+
except IbisError:
302+
# regex pattern didn't match any columns
303+
pass
295304
results.append(
296305
CoreCheckResult(
297306
passed=False,

pandera/engines/ibis_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class DataType(dtypes.DataType):
2525
type: Any = dataclasses.field(repr=False, init=False)
2626
"""Native Ibis dtype boxed by the data type."""
2727

28-
def __init__(self, dtype: Any):
28+
def __init__(self, dtype: Optional[Any] = None):
2929
super().__init__()
3030
object.__setattr__(self, "type", ibis.dtype(dtype))
3131
dtype_cls = dtype if inspect.isclass(dtype) else dtype.__class__
@@ -220,6 +220,8 @@ class UInt64(DataType, dtypes.UInt64):
220220
class Float32(DataType, dtypes.Float32):
221221
"""Semantic representation of a :class:`dt.Float32`."""
222222

223+
type = dt.float32
224+
223225

224226
@Engine.register_dtype(
225227
equivalents=[

tests/ibis/test_ibis_builtin_checks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1367,7 +1367,7 @@ def pytest_generate_tests(self, metafunc):
13671367
)
13681368

13691369
def get_data_param(self):
1370-
"""Generate the params which will be used to test this function. All the accpetable
1370+
"""Generate the params which will be used to test this function. All the acceptable
13711371
data types would be tested"""
13721372
return {
13731373
"test_unique_values_eq_check": [

0 commit comments

Comments
 (0)