chore: add more types for read_gbq_table in conftest (#1843)

chelsea-lin · web-flow · commit be9a89f18319 · 2025-06-24T13:32:06.000-05:00
* chore: add more types for read_gbq_table in conftest

* add bug id

* undo dtype conversions for repeated_pandas_df

* undo unrelated changes

* fix json
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -182,7 +182,7 @@ def from_union(
 
             selections = [
                 sge.Alias(
-                    this=expr.alias_or_name,
+                    this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted),
                     alias=sge.to_identifier(output_id, quoted=cls.quoted),
                 )
                 for expr, output_id in zip(select_expr.expressions, output_ids)
diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py
@@ -29,24 +29,48 @@
 DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data"
 
 
-@pytest.fixture(scope="session")
-def compiler_session(scalars_types_table_schema):
+def _create_compiler_session(table_name, table_schema):
+    """Helper function to create a compiler session."""
     from bigframes.testing import compiler_session
 
-    # TODO: Check if ordering mode is needed for the tests.
-    table_name = "scalar_types"
     anonymous_dataset = bigquery.DatasetReference.from_string(
         "bigframes-dev.sqlglot_test"
     )
     session = mocks.create_bigquery_session(
         table_name=table_name,
-        table_schema=scalars_types_table_schema,
+        table_schema=table_schema,
         anonymous_dataset=anonymous_dataset,
     )
     session._executor = compiler_session.SQLCompilerExecutor()
     return session
 
 
+@pytest.fixture(scope="session")
+def compiler_session(scalars_types_table_schema):
+    """Compiler session for scalar types."""
+    return _create_compiler_session("scalar_types", scalars_types_table_schema)
+
+
+@pytest.fixture(scope="session")
+def compiler_session_w_repeated_types(repeated_types_table_schema):
+    """Compiler session for repeated data types."""
+    return _create_compiler_session("repeated_types", repeated_types_table_schema)
+
+
+@pytest.fixture(scope="session")
+def compiler_session_w_nested_structs_types(nested_structs_types_table_schema):
+    """Compiler session for nested STRUCT data types."""
+    return _create_compiler_session(
+        "nested_structs_types", nested_structs_types_table_schema
+    )
+
+
+@pytest.fixture(scope="session")
+def compiler_session_w_json_types(json_types_table_schema):
+    """Compiler session for JSON data types."""
+    return _create_compiler_session("json_types", json_types_table_schema)
+
+
 @pytest.fixture(scope="session")
 def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
     return [
@@ -91,6 +115,40 @@ def scalars_types_pandas_df() -> pd.DataFrame:
     return df
 
 
+@pytest.fixture(scope="session")
+def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
+    return [
+        bigquery.SchemaField("id", "INTEGER"),
+        bigquery.SchemaField(
+            "people",
+            "RECORD",
+            fields=[
+                bigquery.SchemaField("name", "STRING"),
+                bigquery.SchemaField("age", "INTEGER"),
+                bigquery.SchemaField(
+                    "address",
+                    "RECORD",
+                    fields=[
+                        bigquery.SchemaField("city", "STRING"),
+                        bigquery.SchemaField("country", "STRING"),
+                    ],
+                ),
+            ],
+        ),
+    ]
+
+
+@pytest.fixture(scope="session")
+def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame:
+    """Returns a BigFrames DataFrame containing nested STRUCT types and using the `id`
+    column as the index."""
+    bf_df = compiler_session_w_nested_structs_types.read_gbq_table(
+        "bigframes-dev.sqlglot_test.nested_structs_types"
+    )
+    bf_df = bf_df.set_index("id", drop=False)
+    return bf_df
+
+
 @pytest.fixture(scope="session")
 def nested_structs_pandas_df() -> pd.DataFrame:
     """Returns a pandas DataFrame containing STRUCT types and using the `id`
@@ -117,18 +175,64 @@ def nested_structs_pandas_df() -> pd.DataFrame:
 
 
 @pytest.fixture(scope="session")
-def repeated_pandas_df() -> pd.DataFrame:
+def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
+    return [
+        bigquery.SchemaField("rowindex", "INTEGER"),
+        bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"),
+        bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"),
+        bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"),
+        bigquery.SchemaField("date_list_col", "DATE", "REPEATED"),
+        bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"),
+        bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"),
+        bigquery.SchemaField("string_list_col", "STRING", "REPEATED"),
+    ]
+
+
+@pytest.fixture(scope="session")
+def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame:
+    """Returns a BigFrames DataFrame containing LIST types and using the `rowindex`
+    column as the index."""
+    bf_df = compiler_session_w_repeated_types.read_gbq_table(
+        "bigframes-dev.sqlglot_test.repeated_types"
+    )
+    bf_df = bf_df.set_index("rowindex", drop=False)
+    return bf_df
+
+
+@pytest.fixture(scope="session")
+def repeated_types_pandas_df() -> pd.DataFrame:
     """Returns a pandas DataFrame containing LIST types and using the `rowindex`
     column as the index."""
 
     df = pd.read_json(
         DATA_DIR / "repeated.jsonl",
         lines=True,
     )
+    # TODO: add dtype conversion here if needed.
     df = df.set_index("rowindex")
     return df
 
 
+@pytest.fixture(scope="session")
+def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
+    return [
+        bigquery.SchemaField("rowindex", "INTEGER"),
+        bigquery.SchemaField("json_col", "JSON"),
+    ]
+
+
+@pytest.fixture(scope="session")
+def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame:
+    """Returns a BigFrames DataFrame containing JSON types and using the `rowindex`
+    column as the index."""
+    bf_df = compiler_session_w_json_types.read_gbq_table(
+        "bigframes-dev.sqlglot_test.json_types"
+    )
+    # TODO(b/427305807): Investigate why `drop=False` produces two "rowindex" columns.
+    bf_df = bf_df.set_index("rowindex", drop=True)
+    return bf_df
+
+
 @pytest.fixture(scope="session")
 def json_pandas_df() -> pd.DataFrame:
     """Returns a pandas DataFrame containing JSON types and using the `rowindex`
@@ -149,8 +253,10 @@ def json_pandas_df() -> pd.DataFrame:
     ]
     df = pd.DataFrame(
         {
+            "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
             "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE),
         },
-        index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
     )
+    # TODO(b/427305807): Investigate why `drop=False` produces two "rowindex" columns.
+    df = df.set_index("rowindex", drop=True)
     return df
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql
@@ -49,21 +49,21 @@ WITH `bfcte_1` AS (
     *
   FROM (
     SELECT
-      bfcol_17 AS `bfcol_46`,
-      bfcol_18 AS `bfcol_47`,
-      bfcol_19 AS `bfcol_48`,
-      bfcol_20 AS `bfcol_49`,
-      bfcol_21 AS `bfcol_50`,
-      bfcol_22 AS `bfcol_51`
+      `bfcol_17` AS `bfcol_46`,
+      `bfcol_18` AS `bfcol_47`,
+      `bfcol_19` AS `bfcol_48`,
+      `bfcol_20` AS `bfcol_49`,
+      `bfcol_21` AS `bfcol_50`,
+      `bfcol_22` AS `bfcol_51`
     FROM `bfcte_6`
     UNION ALL
     SELECT
-      bfcol_40 AS `bfcol_46`,
-      bfcol_41 AS `bfcol_47`,
-      bfcol_42 AS `bfcol_48`,
-      bfcol_43 AS `bfcol_49`,
-      bfcol_44 AS `bfcol_50`,
-      bfcol_45 AS `bfcol_51`
+      `bfcol_40` AS `bfcol_46`,
+      `bfcol_41` AS `bfcol_47`,
+      `bfcol_42` AS `bfcol_48`,
+      `bfcol_43` AS `bfcol_49`,
+      `bfcol_44` AS `bfcol_50`,
+      `bfcol_45` AS `bfcol_51`
     FROM `bfcte_7`
   )
 )
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql
@@ -1,10 +1,11 @@
 WITH `bfcte_0` AS (
   SELECT
     *
-  FROM UNNEST(ARRAY<STRUCT<`bfcol_0` JSON, `bfcol_1` INT64>>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
+  FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` JSON, `bfcol_2` INT64>>[STRUCT(0, PARSE_JSON('null'), 0), STRUCT(1, PARSE_JSON('true'), 1), STRUCT(2, PARSE_JSON('100'), 2), STRUCT(3, PARSE_JSON('0.98'), 3), STRUCT(4, PARSE_JSON('"a string"'), 4), STRUCT(5, PARSE_JSON('[]'), 5), STRUCT(6, PARSE_JSON('[1,2,3]'), 6), STRUCT(7, PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(8, PARSE_JSON('"100"'), 8), STRUCT(9, PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(10, PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(11, PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
 )
 SELECT
-  `bfcol_0` AS `json_col`
+  `bfcol_0` AS `rowindex`,
+  `bfcol_1` AS `json_col`
 FROM `bfcte_0`
 ORDER BY
-  `bfcol_1` ASC NULLS LAST
+  `bfcol_2` ASC NULLS LAST
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql
@@ -0,0 +1,10 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `rowindex` AS `bfcol_0`,
+    `json_col` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`json_types`
+)
+SELECT
+  `bfcol_0` AS `rowindex`,
+  `bfcol_1` AS `json_col`
+FROM `bfcte_0`
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql
@@ -0,0 +1,11 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `id` AS `bfcol_0`,
+    `people` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types`
+)
+SELECT
+  `bfcol_0` AS `id`,
+  `bfcol_0` AS `id_1`,
+  `bfcol_1` AS `people`
+FROM `bfcte_0`
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql
@@ -0,0 +1,23 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `rowindex` AS `bfcol_0`,
+    `int_list_col` AS `bfcol_1`,
+    `bool_list_col` AS `bfcol_2`,
+    `float_list_col` AS `bfcol_3`,
+    `date_list_col` AS `bfcol_4`,
+    `date_time_list_col` AS `bfcol_5`,
+    `numeric_list_col` AS `bfcol_6`,
+    `string_list_col` AS `bfcol_7`
+  FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
+)
+SELECT
+  `bfcol_0` AS `rowindex`,
+  `bfcol_0` AS `rowindex_1`,
+  `bfcol_1` AS `int_list_col`,
+  `bfcol_2` AS `bool_list_col`,
+  `bfcol_3` AS `float_list_col`,
+  `bfcol_4` AS `date_list_col`,
+  `bfcol_5` AS `date_time_list_col`,
+  `bfcol_6` AS `numeric_list_col`,
+  `bfcol_7` AS `string_list_col`
+FROM `bfcte_0`
diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py
@@ -30,26 +30,31 @@ def test_compile_readlocal(
 
 def test_compile_readlocal_w_structs_df(
     nested_structs_pandas_df: pd.DataFrame,
-    compiler_session: bigframes.Session,
+    compiler_session_w_nested_structs_types: bigframes.Session,
     snapshot,
 ):
-    bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session)
+    # TODO(b/427306734): Check why the output is different from the expected output.
+    bf_df = bpd.DataFrame(
+        nested_structs_pandas_df, session=compiler_session_w_nested_structs_types
+    )
     snapshot.assert_match(bf_df.sql, "out.sql")
 
 
 def test_compile_readlocal_w_lists_df(
-    repeated_pandas_df: pd.DataFrame,
-    compiler_session: bigframes.Session,
+    repeated_types_pandas_df: pd.DataFrame,
+    compiler_session_w_repeated_types: bigframes.Session,
     snapshot,
 ):
-    bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session)
+    bf_df = bpd.DataFrame(
+        repeated_types_pandas_df, session=compiler_session_w_repeated_types
+    )
     snapshot.assert_match(bf_df.sql, "out.sql")
 
 
 def test_compile_readlocal_w_json_df(
     json_pandas_df: pd.DataFrame,
-    compiler_session: bigframes.Session,
+    compiler_session_w_json_types: bigframes.Session,
     snapshot,
 ):
-    bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session)
+    bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session_w_json_types)
     snapshot.assert_match(bf_df.sql, "out.sql")
diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py
@@ -23,6 +23,20 @@ def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(scalars_types_df.sql, "out.sql")
 
 
+def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot):
+    snapshot.assert_match(repeated_types_df.sql, "out.sql")
+
+
+def test_compile_readtable_w_nested_structs_types(
+    nested_structs_types_df: bpd.DataFrame, snapshot
+):
+    snapshot.assert_match(nested_structs_types_df.sql, "out.sql")
+
+
+def test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot):
+    snapshot.assert_match(json_types_df.sql, "out.sql")
+
+
 def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot):
     bf_df = scalars_types_df[["int64_col"]]
     bf_df = bf_df.sort_values("int64_col")
diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py
@@ -17,6 +17,7 @@
 
 import warnings
 
+import db_dtypes
 import google.cloud.bigquery.schema as schema
 import pyarrow
 
@@ -61,6 +62,7 @@ def pyarrow_timestamp():
     "TIME": pyarrow_time,
     "TIMESTAMP": pyarrow_timestamp,
     "BIGNUMERIC": pyarrow_bignumeric,
+    "JSON": db_dtypes.JSONArrowType,
 }
 ARROW_SCALAR_IDS_TO_BQ = {
     # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes

Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,7 @@ def from_union(`
`182`	`182`
`183`	`183`	`selections = [`
`184`	`184`	`sge.Alias(`
`185`		`- this=expr.alias_or_name,`
	`185`	`+ this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted),`
`186`	`186`	`alias=sge.to_identifier(output_id, quoted=cls.quoted),`
`187`	`187`	`)`
`188`	`188`	`for expr, output_id in zip(select_expr.expressions, output_ids)`