Skip to content

Commit 02063c8

Browse files
Brian Phillips authored and cosmicBboy committed
Add Basic Dask Support (#665)
* first pass of basic Dask support
* cleanup docstrings
* cleanup after rebase
* improve coverage
* update CI for new extra
* cover branches for dask not installed
* more coverage improvements
* further coverage improvements
1 parent b7f6516 commit 02063c8

File tree

13 files changed

+372
-13
lines changed

13 files changed

+372
-13
lines changed

.github/workflows/ci-tests.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,13 @@ jobs:
124124
--non-interactive
125125
--session "tests-${{ matrix.python-version }}(extra='core', pandas='${{ matrix.pandas-version }}')"
126126
127+
- name: Unit Tests - Dask
128+
run: >
129+
nox
130+
-db virtualenv -r -v
131+
--non-interactive
132+
--session "tests-${{ matrix.python-version }}(extra='dask', pandas='${{ matrix.pandas-version }}')"
133+
127134
- name: Unit Tests - Hypotheses
128135
run: >
129136
nox

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ dependencies:
2727
# modin extra
2828
- modin
2929
- ray
30+
31+
# dask extra
3032
- dask
3133
- distributed
3234

pandera/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,11 @@
5959
if platform.system() != "Windows":
6060
# pylint: disable=ungrouped-imports
6161
from pandera.dtypes import Complex256, Float128
62+
63+
64+
try:
65+
import dask.dataframe
66+
67+
from . import dask_accessor
68+
except ImportError:
69+
pass

pandera/check_utils.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
SupportedTypes = NamedTuple(
99
"SupportedTypes",
1010
(
11-
("table_types", Tuple[type]),
12-
("field_types", Tuple[type]),
13-
("index_types", Tuple[type]),
14-
("multiindex_types", Tuple[type]),
11+
("table_types", Tuple[type, ...]),
12+
("field_types", Tuple[type, ...]),
13+
("index_types", Tuple[type, ...]),
14+
("multiindex_types", Tuple[type, ...]),
1515
),
1616
)
1717

@@ -42,6 +42,14 @@ def _supported_types():
4242
multiindex_types.append(mpd.MultiIndex)
4343
except ImportError:
4444
pass
45+
try:
46+
import dask.dataframe as dd
47+
48+
table_types.append(dd.DataFrame)
49+
field_types.append(dd.Series)
50+
index_types.append(dd.Index)
51+
except ImportError:
52+
pass
4553

4654
return SupportedTypes(
4755
tuple(table_types),

pandera/dask_accessor.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""Register dask accessor for pandera schema metadata."""
2+
3+
from dask.dataframe.extensions import (
4+
register_dataframe_accessor,
5+
register_series_accessor,
6+
)
7+
8+
from pandera.pandas_accessor import (
9+
PanderaDataFrameAccessor,
10+
PanderaSeriesAccessor,
11+
)
12+
13+
register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
14+
register_series_accessor("pandera")(PanderaSeriesAccessor)

pandera/schemas.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,6 @@ def validate(
404404
lazy: bool = False,
405405
inplace: bool = False,
406406
) -> pd.DataFrame:
407-
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
408407
"""Check if all columns in a dataframe have a column in the Schema.
409408
410409
:param pd.DataFrame check_obj: the dataframe to be validated.
@@ -460,6 +459,51 @@ def validate(
460459
5 0.76 dog
461460
"""
462461

462+
if not check_utils.is_table(check_obj):
463+
raise TypeError(f"expected pd.DataFrame, got {type(check_obj)}")
464+
465+
if hasattr(check_obj, "dask"):
466+
# special case for dask dataframes
467+
if inplace:
468+
check_obj = check_obj.pandera.add_schema(self)
469+
else:
470+
check_obj = check_obj.copy()
471+
472+
check_obj = check_obj.map_partitions(
473+
self._validate,
474+
head=head,
475+
tail=tail,
476+
sample=sample,
477+
random_state=random_state,
478+
lazy=lazy,
479+
inplace=inplace,
480+
meta=check_obj,
481+
)
482+
483+
return check_obj.pandera.add_schema(self)
484+
485+
return self._validate(
486+
check_obj=check_obj,
487+
head=head,
488+
tail=tail,
489+
sample=sample,
490+
random_state=random_state,
491+
lazy=lazy,
492+
inplace=inplace,
493+
)
494+
495+
def _validate(
496+
self,
497+
check_obj: pd.DataFrame,
498+
head: Optional[int] = None,
499+
tail: Optional[int] = None,
500+
sample: Optional[int] = None,
501+
random_state: Optional[int] = None,
502+
lazy: bool = False,
503+
inplace: bool = False,
504+
) -> pd.DataFrame:
505+
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
506+
463507
if self._is_inferred:
464508
warnings.warn(
465509
f"This {type(self)} is an inferred schema that hasn't been "
@@ -2074,7 +2118,6 @@ def validate(
20742118
lazy: bool = False,
20752119
inplace: bool = False,
20762120
) -> pd.Series:
2077-
# pylint: disable=too-many-branches
20782121
"""Validate a Series object.
20792122
20802123
:param check_obj: One-dimensional ndarray with axis labels
@@ -2118,8 +2161,48 @@ def validate(
21182161
21192162
"""
21202163
if not check_utils.is_field(check_obj):
2121-
raise TypeError(f"expected {pd.Series}, got {type(check_obj)}")
2164+
raise TypeError(f"expected pd.Series, got {type(check_obj)}")
2165+
2166+
if hasattr(check_obj, "dask"):
2167+
# special case for dask series
2168+
if inplace:
2169+
check_obj = check_obj.pandera.add_schema(self)
2170+
else:
2171+
check_obj = check_obj.copy()
2172+
2173+
check_obj = check_obj.map_partitions(
2174+
self._validate,
2175+
head=head,
2176+
tail=tail,
2177+
sample=sample,
2178+
random_state=random_state,
2179+
lazy=lazy,
2180+
inplace=inplace,
2181+
meta=check_obj,
2182+
)
21222183

2184+
return check_obj.pandera.add_schema(self)
2185+
2186+
return self._validate(
2187+
check_obj=check_obj,
2188+
head=head,
2189+
tail=tail,
2190+
sample=sample,
2191+
random_state=random_state,
2192+
lazy=lazy,
2193+
inplace=inplace,
2194+
)
2195+
2196+
def _validate(
2197+
self,
2198+
check_obj: pd.Series,
2199+
head: Optional[int] = None,
2200+
tail: Optional[int] = None,
2201+
sample: Optional[int] = None,
2202+
random_state: Optional[int] = None,
2203+
lazy: bool = False,
2204+
inplace: bool = False,
2205+
) -> pd.Series:
21232206
if not inplace:
21242207
check_obj = check_obj.copy()
21252208

pandera/typing.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
except ImportError:
2121
ModelField = Any # type: ignore
2222

23+
try:
24+
import dask.dataframe as dd
25+
26+
_DASK_INSTALLED = True
27+
except ImportError:
28+
_DASK_INSTALLED = False
29+
2330
Bool = dtypes.Bool #: ``"bool"`` numpy dtype
2431
DateTime = dtypes.DateTime #: ``"datetime64[ns]"`` numpy dtype
2532
Timedelta = dtypes.Timedelta #: ``"timedelta64[ns]"`` numpy dtype
@@ -178,6 +185,15 @@ def _pydantic_validate(
178185
raise ValueError(str(exc)) from exc
179186

180187

188+
if _DASK_INSTALLED:
189+
# pylint:disable=too-few-public-methods
190+
class DaskDataFrame(dd.DataFrame, Generic[T]):
191+
"""
192+
Representation of dask.dataframe.DataFrame, only used for type
193+
annotation.
194+
"""
195+
196+
181197
class AnnotationInfo: # pylint:disable=too-few-public-methods
182198
"""Captures extra information about an annotation.
183199
@@ -195,11 +211,16 @@ def __init__(self, raw_annotation: Type) -> None:
195211

196212
@property
197213
def is_generic_df(self) -> bool:
198-
"""True if the annotation is a pandera.typing.DataFrame."""
214+
"""True if the annotation is a pandera.typing.DataFrame or
215+
pandera.typing.DaskDataFrame.
216+
"""
199217
try:
200-
return self.origin is not None and issubclass(
201-
self.origin, DataFrame
202-
)
218+
if self.origin is None:
219+
return False
220+
if _DASK_INSTALLED:
221+
return issubclass(self.origin, (DataFrame, DaskDataFrame))
222+
else:
223+
return issubclass(self.origin, DataFrame)
203224
except TypeError:
204225
return False
205226

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"modin": ["modin", "ray", "dask"],
1616
"modin-ray": ["modin", "ray"],
1717
"modin-dask": ["modin", "dask"],
18+
"dask": ["dask"],
1819
}
1920
extras_require = {
2021
**_extras_require,

tests/core/test_pandas_accessor.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Unit tests for pandas_accessor module."""
22
from typing import Union
3+
from unittest.mock import patch
34

45
import pandas as pd
56
import pytest
@@ -49,8 +50,20 @@ def test_dataframe_series_add_schema(
4950
assert validated_data_1.pandera.schema == schema1
5051
assert validated_data_2.pandera.schema == schema2
5152

52-
with pytest.raises(TypeError):
53+
with pytest.raises(TypeError, match=f"expected pd.{type(data).__name__}"):
5354
schema1(invalid_data)
5455

55-
with pytest.raises(TypeError):
56+
with pytest.raises(TypeError, match=f"expected pd.{type(data).__name__}"):
5657
schema2(invalid_data)
58+
59+
with patch.object(pa.schemas.check_utils, "is_table", return_value=True):
60+
with patch.object(
61+
pa.schemas.check_utils,
62+
"is_field",
63+
return_value=True,
64+
):
65+
with pytest.raises(TypeError, match="schema arg"):
66+
schema1(invalid_data)
67+
68+
with pytest.raises(TypeError, match="schema arg"):
69+
schema2(invalid_data)

tests/dask/__init__.py

Whitespace-only changes.

0 commit comments

Comments (0)