 from typing import Any, Dict, List, Optional

 from pyspark.sql import DataFrame
-from pyspark.sql.functions import col
+from pyspark.sql.functions import col, count

 from pandera.api.pyspark.error_handler import ErrorCategory, ErrorHandler
 from pandera.api.pyspark.types import is_table
 from pandera.backends.pyspark.error_formatters import scalar_failure_case
 from pandera.config import CONFIG
 from pandera.errors import (
-    ParserError,
     SchemaDefinitionError,
     SchemaError,
     SchemaErrorReason,
@@ -31,14 +30,14 @@ def preprocess(self, check_obj: DataFrame, inplace: bool = False):
         return check_obj

     @validate_scope(scope=ValidationScope.SCHEMA)
-    def _column_checks(
+    def _schema_checks(
         self,
         check_obj: DataFrame,
         schema,
         column_info: ColumnInfo,
         error_handler: ErrorHandler,
     ):
-        """run the checks related to columns presence, uniqueness and filter column if neccesary"""
+        """Run the checks related to column presence and strictness, and filter columns if necessary."""

         # check the container metadata, e.g. field names
         try:
@@ -71,6 +70,7 @@ def _column_checks(
                 reason_code=exc.reason_code,
                 schema_error=exc,
             )
+
         # try to coerce datatypes
         check_obj = self.coerce_dtype(
             check_obj,
@@ -80,6 +80,28 @@ def _column_checks(

         return check_obj

+    @validate_scope(scope=ValidationScope.DATA)
+    def _data_checks(
+        self,
+        check_obj: DataFrame,
+        schema,
+        column_info: ColumnInfo,  # pylint: disable=unused-argument
+        error_handler: ErrorHandler,
+    ):
+        """Run the checks related to data validation and uniqueness."""
+
+        # uniqueness of values
+        try:
+            check_obj = self.unique(
+                check_obj, schema=schema, error_handler=error_handler
+            )
+        except SchemaError as err:
+            error_handler.collect_error(
+                ErrorCategory.DATA, err.reason_code, err
+            )
+
+        return check_obj
+
     def validate(
         self,
         check_obj: DataFrame,
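The rename makes the two validation scopes explicit: `_schema_checks` is gated by `ValidationScope.SCHEMA`, and the new `_data_checks` by `ValidationScope.DATA`. As a hedged sketch of how these scopes are typically toggled — the environment variable and its values below follow pandera's documented pyspark settings, not anything in this diff:

```python
import os

# Assumed configuration knob from pandera's pyspark docs (not part of this
# diff); typically set before pandera is imported.
# SCHEMA_ONLY      -> run only _schema_checks (column presence, dtypes, ...)
# DATA_ONLY        -> run only _data_checks (e.g. the new `unique` check)
# SCHEMA_AND_DATA  -> run both (the default)
os.environ["PANDERA_VALIDATION_DEPTH"] = "SCHEMA_ONLY"
```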
@@ -115,8 +137,13 @@ def validate(
             check_obj = check_obj.pandera.add_schema(schema)
         column_info = self.collect_column_info(check_obj, schema, lazy)

-        # validate the columns of the dataframe
-        check_obj = self._column_checks(
+        # validate the columns (schema) of the dataframe
+        check_obj = self._schema_checks(
+            check_obj, schema, column_info, error_handler
+        )
+
+        # validate the rows (data) of the dataframe
+        check_obj = self._data_checks(
             check_obj, schema, column_info, error_handler
         )

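For context, a hypothetical end-to-end use of the new path. The class, column names, and data are made up for illustration, and the model `Config` is assumed to forward `unique` to the schema the way pandera's pandas API does; with pyspark, validation errors are collected on the returned dataframe rather than raised:

```python
import pandera.pyspark as pa
import pyspark.sql.types as T
from pyspark.sql import SparkSession

class OrderSchema(pa.DataFrameModel):
    order_id: T.IntegerType()
    item: T.StringType()

    class Config:
        # assumed to map to the schema-level `unique` option added here
        unique = ["order_id"]

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (1, "b")], ["order_id", "item"])

validated = OrderSchema.validate(df)
# the duplicated order_id should surface as a DATA-category error
print(validated.pandera.errors)
```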
@@ -191,7 +218,7 @@ def run_checks(self, check_obj: DataFrame, schema, error_handler):
         check_results = []
         for check_index, check in enumerate(
             schema.checks
-        ):  # schama.checks is null
+        ):  # schema.checks is null
             try:
                 check_results.append(
                     self.run_check(check_obj, schema, check, check_index)
@@ -386,8 +413,7 @@ def coerce_dtype(
         except SchemaErrors as err:
             for schema_error_dict in err.schema_errors:
                 if not error_handler.lazy:
-                    # raise the first error immediately if not doing lazy
-                    # validation
+                    # raise the first error immediately if not doing lazy validation
                     raise schema_error_dict["error"]
                 error_handler.collect_error(
                     ErrorCategory.DTYPE_COERCION,
@@ -417,27 +443,6 @@ def _coerce_dtype(
         # NOTE: clean up the error handling!
         error_handler = ErrorHandler(lazy=True)

-        def _coerce_df_dtype(obj: DataFrame) -> DataFrame:
-            if schema.dtype is None:
-                raise ValueError(
-                    "dtype argument is None. Must specify this argument "
-                    "to coerce dtype"
-                )
-
-            try:
-                return schema.dtype.try_coerce(obj)
-            except ParserError as exc:
-                raise SchemaError(
-                    schema=schema,
-                    data=obj,
-                    message=(
-                        f"Error while coercing '{schema.name}' to type "
-                        f"{schema.dtype}: {exc}\n{exc.failure_cases}"
-                    ),
-                    failure_cases=exc.failure_cases,
-                    check=f"coerce_dtype('{schema.dtype}')",
-                ) from exc
-
         def _try_coercion(obj, colname, col_schema):
             try:
                 schema = obj.pandera.schema
@@ -490,6 +495,74 @@ def _try_coercion(obj, colname, col_schema):

         return obj

+    @validate_scope(scope=ValidationScope.DATA)
+    def unique(
+        self,
+        check_obj: DataFrame,
+        *,
+        schema=None,
+        error_handler: ErrorHandler = None,
+    ):
+        """Check uniqueness in the check object."""
+        assert schema is not None, "The `schema` argument must be provided."
+        assert (
+            error_handler is not None
+        ), "The `error_handler` argument must be provided."
+
+        if not schema.unique:
+            return check_obj
+
+        # Determine unique columns based on the schema's config
+        unique_columns = (
+            [schema.unique]
+            if isinstance(schema.unique, str)
+            else schema.unique
+        )
+
+        # Check that the specified columns exist in the dataframe
+        missing_unique_columns = set(unique_columns) - set(check_obj.columns)
+        if missing_unique_columns:
+            raise SchemaDefinitionError(
+                "Specified `unique` columns are missing in the dataframe: "
+                f"{list(missing_unique_columns)}"
+            )
+
+        duplicates_count = (
+            check_obj.select(*unique_columns)  # ignore other columns
+            .groupby(*unique_columns)
+            .agg(count("*").alias("pandera_duplicate_counts"))
+            .filter(
+                col("pandera_duplicate_counts") > 1
+            )  # long alias to avoid collisions
+            .count()
+        )
+
+        if duplicates_count > 0:
+            raise SchemaError(
+                schema=schema,
+                data=check_obj,
+                message=(
+                    f"Duplicated rows [{duplicates_count}] were found "
+                    f"for columns {unique_columns}"
+                ),
+                check="unique",
+                reason_code=SchemaErrorReason.DUPLICATES,
+            )
+
+        return check_obj
+
+    def _check_uniqueness(
+        self,
+        obj: DataFrame,
+        schema,
+    ) -> DataFrame:
+        """Ensure uniqueness in dataframe columns.
+
+        :param obj: dataframe to check.
+        :param schema: schema object.
+        :returns: dataframe checked.
+        """
+
     ##########
     # Checks #
     ##########
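The aggregation in `unique` is the standard PySpark idiom for detecting duplicate keys. A self-contained sketch of the same pattern outside pandera, with illustrative names and data:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "x"), (1, "y"), (2, "z")], ["id", "val"])

duplicated_keys = (
    df.select("id")                    # restrict to the unique-key columns
    .groupby("id")                     # one group per candidate key
    .agg(count("*").alias("n_rows"))   # rows per key
    .filter(col("n_rows") > 1)         # keep only keys that repeat
    .count()                           # how many keys are duplicated
)
print(duplicated_keys)  # 1 -> id=1 appears twice
```

Note that this counts duplicated key combinations rather than individual rows, so the `Duplicated rows [{duplicates_count}]` message reports the number of offending keys, not the total number of rows involved.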
@@ -516,8 +589,7 @@ def check_column_names_are_unique(self, check_obj: DataFrame, schema):
                 schema=schema,
                 data=check_obj,
                 message=(
-                    "dataframe contains multiple columns with label(s): "
-                    f"{failed}"
+                    f"dataframe contains multiple columns with label(s): {failed}"
                 ),
                 failure_cases=scalar_failure_case(failed),
                 check="dataframe_column_labels_unique",