
Commit f0ddcbf

Authored by: cosmicBboy, Jean-Francois Zinque, tfwillems, abyz0123, fkroll8
bugfix release 0.7.1 (#615)
* Unique keyword arg (#580)
* add copy button to docs (#448)
* Add missing inplace arg to SchemaModel's validate (#450)
* link documentation to github (#449)

  Co-authored-by: Niels Bantilan <[email protected]>

* intermediate commit for review by @cosmicBboy
* link documentation to github (#449)

  Co-authored-by: Niels Bantilan <[email protected]>

* intermediate commit for review by @cosmicBboy
* WIP
* fix test errors, re-factor allow_duplicates handling
* fix io tests
* fix docs, remove _allow_duplicates private var
* update unique type signature in strategies
* completing tests for setters and lazy evaluation of unique kw
* small fix for the linting errors
* support dataframe-level uniqueness in strategies
* add docs, fix error formatting, add multiindex support

  Co-authored-by: Jean-Francois Zinque <[email protected]>
  Co-authored-by: tfwillems <[email protected]>
  Co-authored-by: fkroll8 <[email protected]>
  Co-authored-by: fkroll8 <[email protected]>

* Add support for timezone-aware datetime strategies (#595)
* add support for Any annotation in schema model (#594)

  The motivation behind this feature is to support column annotations that
  can have any type, for use cases like the one described in #592, where
  custom checks can be applied to any column except for ones that are
  explicitly defined in the schema model class attributes.

* update pylint, fix lint
* Docs/scaling - Bring Pandera to Spark and Dask (#588)
* scaling.rst
* edited conf
* finished first pass
* removing FugueWorkflow
* Update index.rst
* Update docs/source/scaling.rst

  Co-authored-by: Niels Bantilan <[email protected]>

* add support for timezone-aware datetime strategies
* fix le/ge strategies with datetime
* fix mypy errors

  Co-authored-by: Niels Bantilan <[email protected]>
  Co-authored-by: Kevin Kho <[email protected]>

* schemas with multi-index columns correctly report errors (#600): fixes #589
* strategies module supports undefined checks in regex columns (#599)
* Add support for empty data type annotation in SchemaModel (#602)
* remove artifacts of py3.6 support
* add support for empty data type annotation in SchemaModel
* fix frictionless version in dev dependencies
* fix setuptools version instead of frictionless
* fix setuptools pinning
* remove frictionless from core pandera deps (#609)
* support frictionless primary keys with multiple fields (#608)
* fix validation of check raising error without message (#613)
* docs/requirements.txt pin setuptools (#611)
* bump version 0.7.1

Co-authored-by: Jean-Francois Zinque <[email protected]>
Co-authored-by: tfwillems <[email protected]>
Co-authored-by: fkroll8 <[email protected]>
Co-authored-by: fkroll8 <[email protected]>
Co-authored-by: Kevin Kho <[email protected]>
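The headline change in this release is the dataframe-level `unique` keyword, which flags rows that repeat a combination of values across the listed columns. Conceptually this is a `duplicated(subset=...)` check; a minimal pandas sketch of the idea (the helper name is illustrative, not pandera's implementation):

```python
import pandas as pd

def joint_unique_violations(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return the rows that violate joint uniqueness over `columns`."""
    # keep=False marks every member of a duplicated group, mirroring how
    # pandera reports all offending rows as failure cases
    return df[df.duplicated(subset=columns, keep=False)]

df = pd.DataFrame({"a": [1, 1, 2], "b": [2, 2, 2], "c": [3, 3, 3]})
# rows 0 and 1 share the combination (a, c) == (1, 3)
violations = joint_unique_violations(df, ["a", "c"])
```

The new keyword wires this kind of check into `DataFrameSchema`, `SchemaModel`, the strategies module, and YAML/script serialization, as the diffs below show.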
1 parent 84ea3c2 commit f0ddcbf

26 files changed: +676 −197 lines

.readthedocs.yml

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ formats: all
 python:
   version: 3.7
 install:
+  - requirements: docs/requirements.txt
   - requirements: requirements-dev.txt
   - method: pip
     path: .

docs/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# pin this due to issue described here: https://github.com/pandera-dev/pandera/pull/602#issuecomment-915622823
+setuptools < 58.0.0

docs/source/dataframe_schemas.rst

Lines changed: 32 additions & 0 deletions
@@ -467,6 +467,38 @@ To validate the order of the Dataframe columns, specify ``ordered=True``:
 
 .. _index:
 
+Validating the joint uniqueness of columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In some cases you might want to ensure that a group of columns are unique:
+
+.. testcode:: joint_column_uniqueness
+
+    import pandas as pd
+    import pandera as pa
+
+    schema = pa.DataFrameSchema(
+        columns={col: pa.Column(int) for col in ["a", "b", "c"]},
+        unique=["a", "c"],
+    )
+    df = pd.DataFrame.from_records([
+        {"a": 1, "b": 2, "c": 3},
+        {"a": 1, "b": 2, "c": 3},
+    ])
+    schema.validate(df)
+
+.. testoutput:: joint_column_uniqueness
+
+    Traceback (most recent call last):
+    ...
+    SchemaError: columns '('a', 'c')' not unique:
+      column  index  failure_case
+    0      a      0             1
+    1      a      1             1
+    2      c      0             3
+    3      c      1             3
+
 Index Validation
 ----------------

docs/source/schema_inference.rst

Lines changed: 7 additions & 6 deletions
@@ -107,7 +107,7 @@ You can also write your schema to a python script with :func:`~pandera.io.to_script`:
             Check.less_than_or_equal_to(max_value=20.0),
         ],
         nullable=False,
-        allow_duplicates=True,
+        unique=False,
         coerce=False,
         required=True,
         regex=False,
@@ -116,7 +116,7 @@ You can also write your schema to a python script with :func:`~pandera.io.to_script`:
         dtype=pandera.engines.numpy_engine.Object,
         checks=None,
         nullable=False,
-        allow_duplicates=True,
+        unique=False,
         coerce=False,
         required=True,
         regex=False,
@@ -132,7 +132,7 @@ You can also write your schema to a python script with :func:`~pandera.io.to_script`:
             ),
         ],
         nullable=False,
-        allow_duplicates=True,
+        unique=False,
         coerce=False,
         required=True,
         regex=False,
@@ -185,15 +185,15 @@ is a convenience method for this functionality.
       checks:
         greater_than_or_equal_to: 5.0
         less_than_or_equal_to: 20.0
-      allow_duplicates: true
+      unique: false
       coerce: false
       required: true
       regex: false
     column2:
       dtype: object
       nullable: false
       checks: null
-      allow_duplicates: true
+      unique: false
       coerce: false
       required: true
       regex: false
@@ -203,7 +203,7 @@ is a convenience method for this functionality.
       checks:
         greater_than_or_equal_to: '2010-01-01 00:00:00'
         less_than_or_equal_to: '2012-01-01 00:00:00'
-      allow_duplicates: true
+      unique: false
       coerce: false
       required: true
       regex: false
@@ -218,6 +218,7 @@ is a convenience method for this functionality.
       coerce: false
     coerce: true
     strict: false
+    unique: null
 
 You can edit this yaml file by specifying column names under the ``column``
 key. The respective values map onto key-word arguments in the

environment.yml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ dependencies:
   - pytest-xdist
   - pytest-asyncio
   - xdoctest
-  - setuptools >= 52.0.0
+  - setuptools < 58.0.0
   - nox = 2020.12.31  # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122
   - importlib_metadata  # required if python < 3.8
3838

pandera/engines/pandas_engine.py

Lines changed: 7 additions & 1 deletion
@@ -157,7 +157,13 @@ def numpy_dtype(cls, pandera_dtype: dtypes.DataType) -> np.dtype:
             alias = "bool"
         elif alias.startswith("string"):
             alias = "str"
-        return np.dtype(alias)
+
+        try:
+            return np.dtype(alias)
+        except TypeError as err:
+            raise TypeError(
+                f"Data type '{pandera_dtype}' cannot be cast to a numpy dtype."
+            ) from err
 
 
 ###############################################################################
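The `try/except` added here exists because `np.dtype` raises a terse `TypeError` for aliases it cannot interpret, and the wrapper re-raises with the offending pandera dtype named. A quick sketch of the failure mode being wrapped (the helper name is illustrative):

```python
import numpy as np

def numpy_dtype_or_error(alias: str) -> str:
    """Resolve a dtype alias, reporting the alias when it cannot be cast."""
    try:
        # valid aliases like "int64" resolve to a numpy dtype
        return np.dtype(alias).name
    except TypeError:
        # unknown aliases raise a bare TypeError; name the culprit instead
        return f"cannot cast '{alias}'"

print(numpy_dtype_or_error("int64"))
print(numpy_dtype_or_error("decimal128"))
```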

pandera/errors.py

Lines changed: 3 additions & 1 deletion
@@ -171,7 +171,9 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
                 schema_context=err.schema.__class__.__name__,
                 check=check_identifier,
                 check_number=err.check_index,
-                column=column,
+                # explicitly wrap `column` in a list if the column key is
+                # a tuple in the case of MultiIndex column names.
+                column=[column] if isinstance(column, tuple) else column,
             )
             check_failure_cases.append(failure_cases[column_order])

pandera/io.py

Lines changed: 31 additions & 16 deletions
@@ -108,7 +108,7 @@ def _serialize_component_stats(component_stats):
         key: component_stats.get(key)
         for key in [
             "name",
-            "allow_duplicates",
+            "unique",
             "coerce",
             "required",
             "regex",
@@ -148,6 +148,7 @@ def _serialize_schema(dataframe_schema):
         "index": index,
         "coerce": dataframe_schema.coerce,
         "strict": dataframe_schema.strict,
+        "unique": dataframe_schema.unique,
     }
@@ -195,6 +196,9 @@ def _deserialize_component_stats(serialized_component_stats):
         for key in [
             "name",
             "nullable",
+            "unique",
+            # deserialize allow_duplicates property for backwards
+            # compatibility. Remove this for 0.8.0 release
             "allow_duplicates",
             "coerce",
             "required",
@@ -255,6 +259,7 @@ def _deserialize_schema(serialized_schema):
         index=index,
         coerce=serialized_schema.get("coerce", False),
         strict=serialized_schema.get("strict", False),
+        unique=serialized_schema.get("unique", None),
     )
@@ -310,7 +315,7 @@ def _write_yaml(obj, stream):
         dtype={dtype},
         checks={checks},
         nullable={nullable},
-        allow_duplicates={allow_duplicates},
+        unique={unique},
         coerce={coerce},
         required={required},
         regex={regex},
@@ -397,7 +402,7 @@ def to_script(dataframe_schema, path_or_buf=None):
         ),
         checks=_format_checks(properties["checks"]),
         nullable=properties["nullable"],
-        allow_duplicates=properties["allow_duplicates"],
+        unique=properties["unique"],
         coerce=properties["coerce"],
         required=properties["required"],
         regex=properties["regex"],
@@ -418,6 +423,7 @@ def to_script(dataframe_schema, path_or_buf=None):
         coerce=dataframe_schema.coerce,
         strict=dataframe_schema.strict,
         name=dataframe_schema.name.__repr__(),
+        unique=dataframe_schema.unique,
     ).strip()
 
     # add pandas imports to handle datetime and timedelta.
@@ -445,15 +451,15 @@ class FrictionlessFieldParser:
     formats, titles, descriptions).
 
     :param field: a field object from a frictionless schema.
-    :param primary_keys: the primary keys from a frictionless schema. These are used
-        to ensure primary key fields are treated properly - no duplicates,
-        no missing values etc.
+    :param primary_keys: the primary keys from a frictionless schema. These
+        are used to ensure primary key fields are treated properly - no
+        duplicates, no missing values etc.
     """
 
     def __init__(self, field, primary_keys) -> None:
         self.constraints = field.constraints or {}
+        self.primary_keys = primary_keys
         self.name = field.name
-        self.is_a_primary_key = self.name in primary_keys
         self.type = field.get("type", "string")
 
     @property
@@ -544,18 +550,22 @@ def nullable(self) -> bool:
         """Determine whether this field can contain missing values.
 
         If a field is a primary key, this will return ``False``."""
-        if self.is_a_primary_key:
+        if self.name in self.primary_keys:
             return False
         return not self.constraints.get("required", False)
 
     @property
-    def allow_duplicates(self) -> bool:
+    def unique(self) -> bool:
         """Determine whether this field can contain duplicate values.
 
-        If a field is a primary key, this will return ``False``."""
-        if self.is_a_primary_key:
-            return False
-        return not self.constraints.get("unique", False)
+        If a field is a primary key, this will return ``True``.
+        """
+        # only set column-level uniqueness property if `primary_keys`
+        # contains more than one field name.
+        if len(self.primary_keys) == 1 and self.name in self.primary_keys:
+            return True
+        return self.constraints.get("unique", False)
 
     @property
     def coerce(self) -> bool:
@@ -587,10 +597,10 @@ def regex(self) -> bool:
     def to_pandera_column(self) -> Dict:
         """Export this field to a column spec dictionary."""
         return {
-            "allow_duplicates": self.allow_duplicates,
             "checks": self.checks,
             "coerce": self.coerce,
             "nullable": self.nullable,
+            "unique": self.unique,
             "dtype": self.dtype,
             "required": self.required,
             "name": self.name,
@@ -645,8 +655,8 @@ def from_frictionless_schema(
         [<Check in_range: in_range(10, 99)>]
         >>> schema.columns["column_1"].required
         True
-        >>> schema.columns["column_1"].allow_duplicates
-        False
+        >>> schema.columns["column_1"].unique
+        True
         >>> schema.columns["column_2"].checks
         [<Check str_length: str_length(None, 10)>, <Check str_matches: str_matches(re.compile('^\\\\S+$'))>]
         """
@@ -664,5 +674,10 @@ def from_frictionless_schema(
         "checks": None,
         "coerce": True,
         "strict": True,
+        # only set dataframe-level uniqueness if the frictionless primary
+        # key property specifies more than one field
+        "unique": (
+            None if len(schema.primary_key) == 1 else list(schema.primary_key)
+        ),
     }
     return _deserialize_schema(assembled_schema)
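The frictionless changes encode a small rule: a single-field primary key maps to column-level uniqueness, while a multi-field primary key becomes a dataframe-level `unique` list. A plain-Python sketch of that mapping (function names are illustrative, not pandera's internals):

```python
from typing import List, Optional

def dataframe_level_unique(primary_key: List[str]) -> Optional[List[str]]:
    # a multi-field primary key becomes the schema-level `unique` list;
    # a single-field key is handled at the column level instead
    return None if len(primary_key) == 1 else list(primary_key)

def column_level_unique(name: str, primary_key: List[str]) -> bool:
    # a field gets unique=True only when it is the sole primary key field
    return len(primary_key) == 1 and name in primary_key
```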

pandera/model.py

Lines changed: 14 additions & 16 deletions
@@ -34,21 +34,9 @@
     FieldInfo,
 )
 from .schemas import DataFrameSchema
-from .typing import LEGACY_TYPING, AnnotationInfo, DataFrame, Index, Series
+from .typing import AnnotationInfo, DataFrame, Index, Series
 
-if LEGACY_TYPING:
-
-    def get_type_hints(
-        obj: Callable[..., Any],
-        globalns: Optional[Dict[str, Any]] = None,
-        localns: Optional[Dict[str, Any]] = None,
-        include_extras: bool = False,
-    ) -> Dict[str, Any]:
-        # pylint:disable=function-redefined, missing-function-docstring, unused-argument
-        return typing.get_type_hints(obj, globalns, localns)
-
-
-elif sys.version_info[:2] < (3, 9):
+if sys.version_info[:2] < (3, 9):
     from typing_extensions import get_type_hints
 else:
     from typing import get_type_hints
@@ -82,6 +70,9 @@ class BaseConfig:  # pylint:disable=R0903
     name: Optional[str] = None  #: name of schema
     coerce: bool = False  #: coerce types of all schema components
 
+    #: make sure certain column combinations are unique
+    unique: Optional[Union[str, List[str]]] = None
+
     #: make sure all specified columns are in the validated dataframe -
     #: if ``"filter"``, removes columns not specified in the schema
     strict: Union[bool, str] = False
@@ -218,6 +209,7 @@ def to_schema(cls) -> DataFrameSchema:
             strict=cls.__config__.strict,
             name=cls.__config__.name,
             ordered=cls.__config__.ordered,
+            unique=cls.__config__.unique,
         )
         if cls not in MODEL_CACHE:
             MODEL_CACHE[cls] = cls.__schema__  # type: ignore
@@ -300,7 +292,10 @@ def _build_columns_index(  # pylint:disable=too-many-locals
 
         dtype = None if dtype is Any else dtype
 
-        if annotation.origin is Series:
+        if (
+            annotation.origin is Series
+            or annotation.raw_annotation is Series
+        ):
            col_constructor = (
                field.to_column if field else schema_components.Column
            )
@@ -316,7 +311,10 @@ def _build_columns_index(  # pylint:disable=too-many-locals
                checks=field_checks,
                name=field_name,
            )
-        elif annotation.origin is Index:
+        elif (
+            annotation.origin is Index
+            or annotation.raw_annotation is Index
+        ):
            if annotation.optional:
                raise SchemaInitError(
                    f"Index '{field_name}' cannot be Optional."

0 commit comments
