def set_regex(self):
    """Infer the ``regex`` flag from the column name.

    If ``name`` is set and looks like a regex pattern (per
    :func:`pandera.utils.is_regex`) while ``regex`` was not explicitly
    enabled, flip ``regex`` to ``True`` so the column schema is matched
    against table columns as a pattern instead of a literal name.
    No-op when ``name`` is ``None``.
    """
    if self.name is None:
        return

    if is_regex(self.name) and not self.regex:
        # Use lazy %-style args so the message is only formatted when
        # INFO logging is actually enabled.
        logger.info(
            "Column schema '%s' is a regex expression. Setting regex=True.",
            self.name,
        )
        self.regex = True
:param str name: the name of the column object """ self.name = name + self.set_regex() return self diff --git a/pandera/backends/ibis/components.py b/pandera/backends/ibis/components.py index 53aef4e36..3cc2da346 100644 --- a/pandera/backends/ibis/components.py +++ b/pandera/backends/ibis/components.py @@ -2,9 +2,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING, List, Optional, cast +from typing import TYPE_CHECKING, Iterable, List, Optional, cast import ibis +import ibis.selectors as s from pandera.api.base.error_handler import ErrorHandler from pandera.backends.base import CoreCheckResult @@ -36,45 +37,68 @@ def validate( """Validation backend implementation for Ibis table columns.""" error_handler = ErrorHandler(lazy) - # TODO(deepyaman): subsample the check object if head, tail, or sample are specified - sample = check_obj[schema.name] - - # run the checks - core_checks = [ - self.check_dtype, - self.run_checks, - ] - - args = (sample, schema) - for check in core_checks: - results = check(*args) - if isinstance(results, CoreCheckResult): - results = [results] - - for result in results: - if result.passed: - continue - # Why cast `results` only in components.py, not in container.py? - results = cast(List[CoreCheckResult], results) - if result.schema_error is not None: - error = result.schema_error - else: - error = SchemaError( - schema=schema, - data=check_obj, - message=result.message, - failure_cases=result.failure_cases, - check=result.check, - check_index=result.check_index, - check_output=result.check_output, - reason_code=result.reason_code, - ) - error_handler.collect_error( # Why indent (unlike in container.py)? 
def validate_column(check_obj, column_name):
    """Validate a single column of ``check_obj`` against ``schema``.

    The shared schema component is temporarily re-pointed at
    ``column_name`` (needed when the component's name is a regex that
    matches several columns) and its ``regex`` flag cleared; the
    original name/regex are always restored afterwards.
    """
    # make sure the schema component mutations are reverted after
    # validation
    _orig_name = schema.name
    _orig_regex = schema.regex

    # set the column name and regex flag for a single column
    schema.name = column_name
    schema.regex = False

    try:
        # TODO(deepyaman): subsample the check object if head, tail, or sample are specified
        sample = check_obj[column_name]

        # run the checks
        core_checks = [
            self.check_dtype,
            self.run_checks,
        ]

        args = (sample, schema)
        for check in core_checks:
            results = check(*args)
            if isinstance(results, CoreCheckResult):
                results = [results]

            for result in results:
                if result.passed:
                    continue
                # Why cast `results` only in components.py, not in container.py?
                results = cast(List[CoreCheckResult], results)
                if result.schema_error is not None:
                    error = result.schema_error
                else:
                    error = SchemaError(
                        schema=schema,
                        data=check_obj,
                        message=result.message,
                        failure_cases=result.failure_cases,
                        check=result.check,
                        check_index=result.check_index,
                        check_output=result.check_output,
                        reason_code=result.reason_code,
                    )
                error_handler.collect_error(  # Why indent (unlike in container.py)?
                    validation_type(result.reason_code),
                    result.reason_code,
                    error,
                    original_exc=result.original_exc,
                )
    finally:
        # BUG FIX: in eager mode (lazy=False) `collect_error` raises the
        # SchemaError immediately, which previously skipped this revert
        # and left the shared schema component permanently mutated.
        # `finally` guarantees the revert runs on success AND failure.
        schema.name = _orig_name
        schema.regex = _orig_regex
+ try: + # don't raise an error if the column schema name is a + # regex pattern + check_obj.select(s.matches(colname)) + continue + except IbisError: + # regex pattern didn't match any columns + pass results.append( CoreCheckResult( passed=False, diff --git a/pandera/engines/ibis_engine.py b/pandera/engines/ibis_engine.py index d4632f875..a1baec494 100644 --- a/pandera/engines/ibis_engine.py +++ b/pandera/engines/ibis_engine.py @@ -25,7 +25,7 @@ class DataType(dtypes.DataType): type: Any = dataclasses.field(repr=False, init=False) """Native Ibis dtype boxed by the data type.""" - def __init__(self, dtype: Any): + def __init__(self, dtype: Optional[Any] = None): super().__init__() object.__setattr__(self, "type", ibis.dtype(dtype)) dtype_cls = dtype if inspect.isclass(dtype) else dtype.__class__ @@ -220,6 +220,8 @@ class UInt64(DataType, dtypes.UInt64): class Float32(DataType, dtypes.Float32): """Semantic representation of a :class:`dt.Float32`.""" + type = dt.float32 + @Engine.register_dtype( equivalents=[ diff --git a/tests/ibis/test_ibis_builtin_checks.py b/tests/ibis/test_ibis_builtin_checks.py index d83c06b09..01cbaa089 100644 --- a/tests/ibis/test_ibis_builtin_checks.py +++ b/tests/ibis/test_ibis_builtin_checks.py @@ -1367,7 +1367,7 @@ def pytest_generate_tests(self, metafunc): ) def get_data_param(self): - """Generate the params which will be used to test this function. All the accpetable + """Generate the params which will be used to test this function. 
@pytest.fixture
def t_schema_with_regex_name():
    """Ibis table schema whose keys are anchored regex patterns.

    The ``^...$`` anchors make pandera auto-detect the names as regex
    column matchers (no explicit ``regex=True`` needed).
    """
    return DataFrameSchema(
        {
            r"^string_col_\d+$": Column(
                dt.String, pa.Check.isin([*"012"]), required=False
            ),
            r"^int_col_\d+$": Column(dt.Int64, pa.Check.ge(0), required=False),
        }
    )
@pytest.mark.xfail(
    reason="`coerce_dtype` parser not yet implemented for Ibis backend"
)
def test_coerce_column_dtype(t_basic, t_schema_basic):
    """Test coerce dtype via column-level dtype specification."""
    t_schema_basic._coerce = True
    modified_data = t_basic.cast({"int_col": dt.String})
    query = modified_data.pipe(t_schema_basic.validate)
    coerced_df = query.execute()
    # BUG FIX: Ibis expressions materialize via `.execute()`;
    # `.collect()` is the Polars LazyFrame API (copy-paste from the
    # polars test suite) and does not exist on an Ibis table. Masked
    # until now because this test is marked xfail.
    assert coerced_df.equals(t_basic.execute())
@pytest.mark.parametrize(
    "transform_fn,exception_msg",
    [
        # null values: columns are not nullable by default, so no
        # specific message is matched (exception_msg=None).
        [
            lambda t, col: t.mutate(
                **{col: ibis.literal(None, type=t[col].type())}
            ),
            None,
        ],
        # right dtype, failing value -> element-wise check error
        [
            lambda t, col: t.mutate(**{col: _failure_value(col)}),
            "Column '.+' failed element-wise validator number",
        ],
        # wrong dtype entirely -> dtype check error
        [
            lambda t, col: t.mutate(**{col: _failure_type(col)}),
            "expected column '.+' to have type",
        ],
    ],
)
def test_regex_selector(
    transform_fn,
    exception_msg,
    t_for_regex_match: ibis.Table,
    t_schema_with_regex_name: DataFrameSchema,
    t_schema_with_regex_option: DataFrameSchema,
):
    """Regex column matching: both the auto-detected (anchored name) and
    the explicit ``regex=True`` schema variants must validate the clean
    table and reject each per-column corruption introduced by
    ``transform_fn``.
    """
    for schema in (
        t_schema_with_regex_name,
        t_schema_with_regex_option,
    ):
        result = t_for_regex_match.pipe(schema.validate).execute()

        assert result.equals(t_for_regex_match.execute())

        for column in t_for_regex_match.columns:
            # this should raise an error since columns are not nullable by default
            modified_data = transform_fn(t_for_regex_match, column)
            with pytest.raises(pa.errors.SchemaError, match=exception_msg):
                modified_data.pipe(schema.validate)
pl.Int64 with pytest.raises(pa.errors.SchemaError): diff --git a/tests/polars/test_polars_typing.py b/tests/polars/test_polars_typing.py index b2b85e3cd..8c3be5822 100644 --- a/tests/polars/test_polars_typing.py +++ b/tests/polars/test_polars_typing.py @@ -543,7 +543,7 @@ def test_to_format_parquet_direct(self): buffer = io.BytesIO() # Just check that the method exists and doesn't raise errors assert hasattr(df, "write_parquet") - except (IOError, ValueError, AssertionError) as e: + except (OSError, ValueError, AssertionError) as e: pytest.fail(f"Parquet buffer creation failed: {e}") def test_to_format_feather_direct(self): @@ -571,7 +571,7 @@ def test_to_format_feather_direct(self): buffer = io.BytesIO() # Just check that the method exists and doesn't raise errors assert hasattr(df, "write_ipc") - except (IOError, ValueError, AssertionError) as e: + except (OSError, ValueError, AssertionError) as e: pytest.fail(f"Feather buffer creation failed: {e}") def test_to_format_unsupported(self): @@ -685,7 +685,7 @@ def write_to_string_io(buffer): string_io.seek(0) result = string_io.getvalue() assert result == "string data" - except (IOError, ValueError) as e: + except (OSError, ValueError) as e: pytest.fail(f"StringIO buffer test failed: {e}") # Test BytesIO handling (covers parquet and feather formats) @@ -701,7 +701,7 @@ def write_to_bytes_io(buffer): bytes_io.seek(0) result = bytes_io.read() assert result == b"bytes data" - except (IOError, ValueError) as e: + except (OSError, ValueError) as e: pytest.fail(f"BytesIO buffer test failed: {e}") def test_direct_write_to_buffer(self): @@ -720,7 +720,7 @@ def write_to_buffer(buffer_type, write_func, error_prefix): return "string_result" else: return buffer - except (IOError, ValueError, RuntimeError) as exc: + except (OSError, ValueError, RuntimeError) as exc: raise ValueError(f"{error_prefix}: {exc}") from exc # Test StringIO success path @@ -761,7 +761,7 @@ def test_buffer(buffer): return "string result" else: return 
buffer - except (IOError, ValueError, RuntimeError) as exc: + except (OSError, ValueError, RuntimeError) as exc: raise ValueError("Buffer operation failed") from exc # Test successful case with StringIO @@ -777,7 +777,7 @@ def test_buffer(buffer): def test_error(): try: raise RuntimeError("Test error") - except (RuntimeError, ValueError, IOError) as exc: + except (RuntimeError, ValueError, OSError) as exc: raise ValueError("Error prefix: Test error") from exc with pytest.raises(ValueError, match="Error prefix: Test error"):