From 4ad5c31d8efbf585acd6753f43aca4092cd2bea9 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Tue, 4 Mar 2025 19:33:51 +0000
Subject: [PATCH] fix: window operations over JSON columns

---
 bigframes/core/compile/compiled.py   |  6 +++++-
 tests/data/json.jsonl                |  4 ++--
 tests/system/small/test_dataframe.py | 25 +++++++++++++++++++++----
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index 3a7e77a632..c3d4c10267 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -30,6 +30,7 @@
 import bigframes.core.compile.googlesql
 import bigframes.core.compile.ibis_types
 import bigframes.core.compile.scalar_op_compiler as op_compilers
+import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler
 import bigframes.core.expression as ex
 import bigframes.core.guid
 from bigframes.core.ordering import OrderingExpression
@@ -676,4 +677,7 @@ def _as_groupable(value: ibis_types.Value):
     # Some types need to be converted to string to enable groupby
     if value.type().is_float64() or value.type().is_geospatial():
         return value.cast(ibis_dtypes.str)
-    return value
+    elif value.type().is_json():
+        return scalar_op_compiler.to_json_string(value)
+    else:
+        return value
diff --git a/tests/data/json.jsonl b/tests/data/json.jsonl
index fbf0593612..1abdcc9d56 100644
--- a/tests/data/json.jsonl
+++ b/tests/data/json.jsonl
@@ -6,10 +6,10 @@
 {"rowindex": 5, "json_col": []}
 {"rowindex": 6, "json_col": [1, 2, 3]}
 {"rowindex": 7, "json_col": [{"a": 1}, {"a": 2}, {"a": null}, {}]}
-{"rowindex": 8, "json_col": {"bool_value": true}}
+{"rowindex": 8, "json_col": "100"}
 {"rowindex": 9, "json_col": {"folat_num": 3.14159}}
 {"rowindex": 10, "json_col": {"date": "2024-07-16"}}
-{"rowindex": 11, "json_col": {"null_filed": null}}
+{"rowindex": 11, "json_col": 100}
 {"rowindex": 12, "json_col": {"int_value": 2, "null_filed": null}}
 {"rowindex": 13, "json_col": {"list_data": [10, 20, 30]}}
 {"rowindex": 14, "json_col": {"person": {"name": "Alice", "age": 35}}}
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 7f43583ef6..0a6f09a857 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -4534,11 +4534,28 @@ def test_loc_bf_index_integer_index_renamed_col(
 )
 def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset):
     columns = ["bool_col", "int64_too", "int64_col"]
-    bf_series = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas()
-    pd_series = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep)
+    bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas()
+    pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep)
     pd.testing.assert_frame_equal(
-        pd_series,
-        bf_series,
+        pd_df,
+        bf_df,
+    )
+
+
+@pytest.mark.parametrize(
+    ("keep",),
+    [
+        ("first",),
+        ("last",),
+        (False,),
+    ],
+)
+def test_df_drop_duplicates_w_json(json_df, keep):
+    bf_df = json_df.drop_duplicates(keep=keep).to_pandas()
+    pd_df = json_df.to_pandas().drop_duplicates(keep=keep)
+    pd.testing.assert_frame_equal(
+        pd_df,
+        bf_df,
     )
 
 