From 4ad5c31d8efbf585acd6753f43aca4092cd2bea9 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 4 Mar 2025 19:33:51 +0000
Subject: [PATCH] fix: window operations over JSON columns

---
 bigframes/core/compile/compiled.py   |  6 +++++-
 tests/data/json.jsonl                |  4 ++--
 tests/system/small/test_dataframe.py | 25 +++++++++++++++++++++----
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index 3a7e77a632..c3d4c10267 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -30,6 +30,7 @@
 import bigframes.core.compile.googlesql
 import bigframes.core.compile.ibis_types
 import bigframes.core.compile.scalar_op_compiler as op_compilers
+import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler
 import bigframes.core.expression as ex
 import bigframes.core.guid
 from bigframes.core.ordering import OrderingExpression
@@ -676,4 +677,7 @@ def _as_groupable(value: ibis_types.Value):
     # Some types need to be converted to string to enable groupby
     if value.type().is_float64() or value.type().is_geospatial():
         return value.cast(ibis_dtypes.str)
-    return value
+    elif value.type().is_json():
+        return scalar_op_compiler.to_json_string(value)
+    else:
+        return value
diff --git a/tests/data/json.jsonl b/tests/data/json.jsonl
index fbf0593612..1abdcc9d56 100644
--- a/tests/data/json.jsonl
+++ b/tests/data/json.jsonl
@@ -6,10 +6,10 @@
 {"rowindex": 5, "json_col": []}
 {"rowindex": 6, "json_col": [1, 2, 3]}
 {"rowindex": 7, "json_col": [{"a": 1}, {"a": 2}, {"a": null}, {}]}
-{"rowindex": 8, "json_col": {"bool_value": true}}
+{"rowindex": 8, "json_col": "100"}
 {"rowindex": 9, "json_col": {"folat_num": 3.14159}}
 {"rowindex": 10, "json_col": {"date": "2024-07-16"}}
-{"rowindex": 11, "json_col": {"null_filed": null}}
+{"rowindex": 11, "json_col": 100}
 {"rowindex": 12, "json_col": {"int_value": 2, "null_filed": null}}
 {"rowindex": 13, "json_col": {"list_data": [10, 20, 30]}}
 {"rowindex": 14, "json_col": {"person": {"name": "Alice", "age": 35}}}
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 7f43583ef6..0a6f09a857 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -4534,11 +4534,28 @@ def test_loc_bf_index_integer_index_renamed_col(
 )
 def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset):
     columns = ["bool_col", "int64_too", "int64_col"]
-    bf_series = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas()
-    pd_series = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep)
+    bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas()
+    pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep)
     pd.testing.assert_frame_equal(
-        pd_series,
-        bf_series,
+        pd_df,
+        bf_df,
+    )
+
+
+@pytest.mark.parametrize(
+    ("keep",),
+    [
+        ("first",),  # keep the first occurrence of each duplicate
+        ("last",),  # keep the last occurrence of each duplicate
+        (False,),  # drop every row that has a duplicate
+    ],
+)
+def test_df_drop_duplicates_w_json(json_df, keep):  # compare both dedupe orderings
+    bf_df = json_df.drop_duplicates(keep=keep).to_pandas()  # dedupe, then convert
+    pd_df = json_df.to_pandas().drop_duplicates(keep=keep)  # convert, then dedupe
+    pd.testing.assert_frame_equal(  # both orderings must produce equal frames
+        pd_df,
+        bf_df,
     )