Skip to content

Commit 8ca11fe

Browse files
xinrong-meng authored and HyukjinKwon committed
[SPARK-36192][PYTHON] Better error messages for DataTypeOps against lists
### What changes were proposed in this pull request?

Better error messages for DataTypeOps against lists.

### Why are the changes needed?

Currently, DataTypeOps against lists throw a Py4JJavaError; we should raise a TypeError with a proper message instead.

### Does this PR introduce _any_ user-facing change?

Yes. A TypeError will be shown rather than a Py4JJavaError.

From:
```py
>>> import pyspark.pandas as ps
>>> ps.Series([1, 2, 3]) > [3, 2, 1]
Traceback (most recent call last):
...
py4j.protocol.Py4JJavaError: An error occurred while calling o107.gt.
: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [3, 2, 1]
...
```

To:
```py
>>> import pyspark.pandas as ps
>>> ps.Series([1, 2, 3]) > [3, 2, 1]
Traceback (most recent call last):
...
TypeError: The operation can not be applied to list.
```

### How was this patch tested?

Unit tests.

Closes #33581 from xinrong-databricks/data_type_ops_list.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent 0b0f4dd commit 8ca11fe

File tree

11 files changed

+123
-7
lines changed

11 files changed

+123
-7
lines changed

python/pyspark/pandas/data_type_ops/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@ def _as_other_type(
188188
return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
189189

190190

191+
def _sanitize_list_like(operand: Any) -> None:
192+
"""Raise TypeError if operand is list-like."""
193+
if isinstance(operand, (list, tuple, dict, set)):
194+
raise TypeError("The operation can not be applied to %s." % type(operand).__name__)
195+
196+
191197
class DataTypeOps(object, metaclass=ABCMeta):
192198
"""The base class for binary operations of pandas-on-Spark objects (of different data types)."""
193199

@@ -314,9 +320,11 @@ def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
314320
raise TypeError("Bitwise or can not be applied to %s." % self.pretty_name)
315321

316322
def rand(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
323+
_sanitize_list_like(right)
317324
return left.__and__(right)
318325

319326
def ror(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
327+
_sanitize_list_like(right)
320328
return left.__or__(right)
321329

322330
def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
@@ -340,11 +348,15 @@ def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
340348
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
341349
from pyspark.pandas.base import column_op
342350

351+
_sanitize_list_like(right)
352+
343353
return column_op(Column.__eq__)(left, right)
344354

345355
def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
346356
from pyspark.pandas.base import column_op
347357

358+
_sanitize_list_like(right)
359+
348360
return column_op(Column.__ne__)(left, right)
349361

350362
def invert(self, operand: IndexOpsLike) -> IndexOpsLike:

python/pyspark/pandas/data_type_ops/binary_ops.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_as_categorical_type,
2828
_as_other_type,
2929
_as_string_type,
30+
_sanitize_list_like,
3031
)
3132
from pyspark.pandas.spark import functions as SF
3233
from pyspark.pandas.typedef import pandas_on_spark_type
@@ -44,6 +45,8 @@ def pretty_name(self) -> str:
4445
return "binaries"
4546

4647
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
48+
_sanitize_list_like(right)
49+
4750
if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BinaryType):
4851
return column_op(F.concat)(left, right)
4952
elif isinstance(right, bytes):
@@ -54,6 +57,8 @@ def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
5457
)
5558

5659
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
60+
_sanitize_list_like(right)
61+
5762
if isinstance(right, bytes):
5863
return cast(
5964
SeriesOrIndex, left._with_new_scol(F.concat(SF.lit(right), left.spark.column))
@@ -66,21 +71,27 @@ def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
6671
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
6772
from pyspark.pandas.base import column_op
6873

74+
_sanitize_list_like(right)
75+
6976
return column_op(Column.__lt__)(left, right)
7077

7178
def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
7279
from pyspark.pandas.base import column_op
7380

81+
_sanitize_list_like(right)
82+
7483
return column_op(Column.__le__)(left, right)
7584

7685
def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
7786
from pyspark.pandas.base import column_op
7887

88+
_sanitize_list_like(right)
7989
return column_op(Column.__ge__)(left, right)
8090

8191
def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
8292
from pyspark.pandas.base import column_op
8393

94+
_sanitize_list_like(right)
8495
return column_op(Column.__gt__)(left, right)
8596

8697
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:

python/pyspark/pandas/data_type_ops/boolean_ops.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
_as_bool_type,
3131
_as_categorical_type,
3232
_as_other_type,
33+
_sanitize_list_like,
3334
)
3435
from pyspark.pandas.spark import functions as SF
3536
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
@@ -48,6 +49,7 @@ def pretty_name(self) -> str:
4849
return "bools"
4950

5051
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
52+
_sanitize_list_like(right)
5153
if not is_valid_operand_for_numeric_arithmetic(right):
5254
raise TypeError(
5355
"Addition can not be applied to %s and the given type." % self.pretty_name
@@ -67,6 +69,7 @@ def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
6769
return left + right
6870

6971
def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
72+
_sanitize_list_like(right)
7073
if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
7174
raise TypeError(
7275
"Subtraction can not be applied to %s and the given type." % self.pretty_name
@@ -80,6 +83,7 @@ def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
8083
return left - right
8184

8285
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
86+
_sanitize_list_like(right)
8387
if not is_valid_operand_for_numeric_arithmetic(right):
8488
raise TypeError(
8589
"Multiplication can not be applied to %s and the given type." % self.pretty_name
@@ -98,6 +102,7 @@ def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
98102
return left * right
99103

100104
def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
105+
_sanitize_list_like(right)
101106
if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
102107
raise TypeError(
103108
"True division can not be applied to %s and the given type." % self.pretty_name
@@ -111,6 +116,7 @@ def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
111116
return left / right
112117

113118
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
119+
_sanitize_list_like(right)
114120
if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
115121
raise TypeError(
116122
"Floor division can not be applied to %s and the given type." % self.pretty_name
@@ -124,6 +130,7 @@ def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
124130
return left // right
125131

126132
def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
133+
_sanitize_list_like(right)
127134
if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
128135
raise TypeError(
129136
"Modulo can not be applied to %s and the given type." % self.pretty_name
@@ -137,6 +144,7 @@ def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
137144
return left % right
138145

139146
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
147+
_sanitize_list_like(right)
140148
if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
141149
raise TypeError(
142150
"Exponentiation can not be applied to %s and the given type." % self.pretty_name
@@ -150,6 +158,7 @@ def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
150158
return left ** right
151159

152160
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
161+
_sanitize_list_like(right)
153162
if isinstance(right, bool):
154163
return left.__or__(right)
155164
elif isinstance(right, numbers.Number):
@@ -161,6 +170,7 @@ def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
161170
)
162171

163172
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
173+
_sanitize_list_like(right)
164174
if isinstance(right, numbers.Number) and not isinstance(right, bool):
165175
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
166176
return right - left
@@ -170,6 +180,7 @@ def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
170180
)
171181

172182
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
183+
_sanitize_list_like(right)
173184
if isinstance(right, bool):
174185
return left.__and__(right)
175186
elif isinstance(right, numbers.Number):
@@ -181,6 +192,7 @@ def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
181192
)
182193

183194
def rtruediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
195+
_sanitize_list_like(right)
184196
if isinstance(right, numbers.Number) and not isinstance(right, bool):
185197
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
186198
return right / left
@@ -190,6 +202,7 @@ def rtruediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
190202
)
191203

192204
def rfloordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
205+
_sanitize_list_like(right)
193206
if isinstance(right, numbers.Number) and not isinstance(right, bool):
194207
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
195208
return right // left
@@ -199,6 +212,7 @@ def rfloordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
199212
)
200213

201214
def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
215+
_sanitize_list_like(right)
202216
if isinstance(right, numbers.Number) and not isinstance(right, bool):
203217
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
204218
return right ** left
@@ -208,6 +222,7 @@ def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
208222
)
209223

210224
def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
225+
_sanitize_list_like(right)
211226
if isinstance(right, numbers.Number) and not isinstance(right, bool):
212227
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
213228
return right % left
@@ -217,6 +232,7 @@ def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
217232
)
218233

219234
def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
235+
_sanitize_list_like(right)
220236
if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
221237
return right.__and__(left)
222238
else:
@@ -233,6 +249,7 @@ def and_func(left: Column, right: Any) -> Column:
233249
return column_op(and_func)(left, right)
234250

235251
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
252+
_sanitize_list_like(right)
236253
if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
237254
return right.__or__(left)
238255
else:
@@ -281,15 +298,19 @@ def abs(self, operand: IndexOpsLike) -> IndexOpsLike:
281298
return operand
282299

283300
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
301+
_sanitize_list_like(right)
284302
return column_op(Column.__lt__)(left, right)
285303

286304
def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
305+
_sanitize_list_like(right)
287306
return column_op(Column.__le__)(left, right)
288307

289308
def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
309+
_sanitize_list_like(right)
290310
return column_op(Column.__ge__)(left, right)
291311

292312
def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
313+
_sanitize_list_like(right)
293314
return column_op(Column.__gt__)(left, right)
294315

295316
def invert(self, operand: IndexOpsLike) -> IndexOpsLike:
@@ -307,6 +328,8 @@ def pretty_name(self) -> str:
307328
return "booleans"
308329

309330
def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
331+
_sanitize_list_like(right)
332+
310333
def and_func(left: Column, right: Any) -> Column:
311334
if not isinstance(right, Column):
312335
if pd.isna(right):
@@ -318,6 +341,8 @@ def and_func(left: Column, right: Any) -> Column:
318341
return column_op(and_func)(left, right)
319342

320343
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
344+
_sanitize_list_like(right)
345+
321346
def or_func(left: Column, right: Any) -> Column:
322347
if not isinstance(right, Column):
323348
if pd.isna(right):

python/pyspark/pandas/data_type_ops/categorical_ops.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
2626
from pyspark.pandas.base import column_op, IndexOpsMixin
27-
from pyspark.pandas.data_type_ops.base import DataTypeOps
27+
from pyspark.pandas.data_type_ops.base import _sanitize_list_like, DataTypeOps
2828
from pyspark.pandas.spark import functions as SF
2929
from pyspark.pandas.typedef import pandas_on_spark_type
3030
from pyspark.sql import functions as F
@@ -63,21 +63,27 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
6363
return _to_cat(index_ops).astype(dtype)
6464

6565
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
66+
_sanitize_list_like(right)
6667
return _compare(left, right, Column.__eq__, is_equality_comparison=True)
6768

6869
def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
70+
_sanitize_list_like(right)
6971
return _compare(left, right, Column.__ne__, is_equality_comparison=True)
7072

7173
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
74+
_sanitize_list_like(right)
7275
return _compare(left, right, Column.__lt__)
7376

7477
def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
78+
_sanitize_list_like(right)
7579
return _compare(left, right, Column.__le__)
7680

7781
def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
82+
_sanitize_list_like(right)
7883
return _compare(left, right, Column.__gt__)
7984

8085
def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
86+
_sanitize_list_like(right)
8187
return _compare(left, right, Column.__ge__)
8288

8389

python/pyspark/pandas/data_type_ops/complex_ops.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_as_categorical_type,
2828
_as_other_type,
2929
_as_string_type,
30+
_sanitize_list_like,
3031
)
3132
from pyspark.pandas.typedef import pandas_on_spark_type
3233
from pyspark.sql import functions as F, Column
@@ -43,6 +44,7 @@ def pretty_name(self) -> str:
4344
return "arrays"
4445

4546
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
47+
_sanitize_list_like(right)
4648
if not isinstance(right, IndexOpsMixin) or (
4749
isinstance(right, IndexOpsMixin) and not isinstance(right.spark.data_type, ArrayType)
4850
):
@@ -65,21 +67,25 @@ def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
6567
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
6668
from pyspark.pandas.base import column_op
6769

70+
_sanitize_list_like(right)
6871
return column_op(Column.__lt__)(left, right)
6972

7073
def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
7174
from pyspark.pandas.base import column_op
7275

76+
_sanitize_list_like(right)
7377
return column_op(Column.__le__)(left, right)
7478

7579
def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
7680
from pyspark.pandas.base import column_op
7781

82+
_sanitize_list_like(right)
7883
return column_op(Column.__ge__)(left, right)
7984

8085
def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
8186
from pyspark.pandas.base import column_op
8287

88+
_sanitize_list_like(right)
8389
return column_op(Column.__gt__)(left, right)
8490

8591
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
@@ -117,19 +123,23 @@ def pretty_name(self) -> str:
117123
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
118124
from pyspark.pandas.base import column_op
119125

126+
_sanitize_list_like(right)
120127
return column_op(Column.__lt__)(left, right)
121128

122129
def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
123130
from pyspark.pandas.base import column_op
124131

132+
_sanitize_list_like(right)
125133
return column_op(Column.__le__)(left, right)
126134

127135
def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
128136
from pyspark.pandas.base import column_op
129137

138+
_sanitize_list_like(right)
130139
return column_op(Column.__ge__)(left, right)
131140

132141
def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
133142
from pyspark.pandas.base import column_op
134143

144+
_sanitize_list_like(right)
135145
return column_op(Column.__gt__)(left, right)

0 commit comments

Comments
 (0)