Skip to content

chore: add more types for read_gbq_table in conftest #1843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bigframes/core/compile/sqlglot/sqlglot_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def from_union(

selections = [
sge.Alias(
this=expr.alias_or_name,
this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted),
alias=sge.to_identifier(output_id, quoted=cls.quoted),
)
for expr, output_id in zip(select_expr.expressions, output_ids)
Expand Down
120 changes: 113 additions & 7 deletions tests/unit/core/compile/sqlglot/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,48 @@
DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data"


@pytest.fixture(scope="session")
def compiler_session(scalars_types_table_schema):
def _create_compiler_session(table_name, table_schema):
    """Create a mock BigQuery session whose executor compiles to SQL only.

    Args:
        table_name: Name of the table to register in the mock session.
        table_schema: Sequence of ``bigquery.SchemaField`` describing the table.

    Returns:
        A session whose ``_executor`` is a ``SQLCompilerExecutor``, so tests can
        inspect generated SQL without contacting BigQuery.
    """
    from bigframes.testing import compiler_session

    # TODO: Check if ordering mode is needed for the tests.
    # NOTE(review): removed the hard-coded ``table_name = "scalar_types"`` that
    # shadowed the parameter — it made every caller's session read the scalar
    # types table regardless of the requested ``table_name``.
    anonymous_dataset = bigquery.DatasetReference.from_string(
        "bigframes-dev.sqlglot_test"
    )
    session = mocks.create_bigquery_session(
        table_name=table_name,
        table_schema=table_schema,
        anonymous_dataset=anonymous_dataset,
    )
    session._executor = compiler_session.SQLCompilerExecutor()
    return session


@pytest.fixture(scope="session")
def compiler_session(scalars_types_table_schema):
    """Session-scoped compiler session backed by the scalar types table."""
    session = _create_compiler_session("scalar_types", scalars_types_table_schema)
    return session


@pytest.fixture(scope="session")
def compiler_session_w_repeated_types(repeated_types_table_schema):
    """Session-scoped compiler session backed by the repeated (list) types table."""
    schema = repeated_types_table_schema
    return _create_compiler_session("repeated_types", schema)


@pytest.fixture(scope="session")
def compiler_session_w_nested_structs_types(nested_structs_types_table_schema):
    """Session-scoped compiler session backed by the nested STRUCT types table."""
    schema = nested_structs_types_table_schema
    return _create_compiler_session("nested_structs_types", schema)


@pytest.fixture(scope="session")
def compiler_session_w_json_types(json_types_table_schema):
    """Session-scoped compiler session backed by the JSON types table."""
    session = _create_compiler_session("json_types", json_types_table_schema)
    return session


@pytest.fixture(scope="session")
def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
return [
Expand Down Expand Up @@ -91,6 +115,40 @@ def scalars_types_pandas_df() -> pd.DataFrame:
return df


@pytest.fixture(scope="session")
def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
    """Schema for the ``nested_structs_types`` table: an id plus a doubly nested STRUCT."""
    # Innermost STRUCT: people.address.
    address = bigquery.SchemaField(
        "address",
        "RECORD",
        fields=[
            bigquery.SchemaField("city", "STRING"),
            bigquery.SchemaField("country", "STRING"),
        ],
    )
    # Outer STRUCT: people, containing the address STRUCT.
    people = bigquery.SchemaField(
        "people",
        "RECORD",
        fields=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("age", "INTEGER"),
            address,
        ],
    )
    return [bigquery.SchemaField("id", "INTEGER"), people]


@pytest.fixture(scope="session")
def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame:
    """Returns a BigFrames DataFrame containing nested STRUCT types and using the
    `id` column as the index (kept as a regular column too via `drop=False`)."""
    bf_df = compiler_session_w_nested_structs_types.read_gbq_table(
        "bigframes-dev.sqlglot_test.nested_structs_types"
    )
    bf_df = bf_df.set_index("id", drop=False)
    return bf_df


@pytest.fixture(scope="session")
def nested_structs_pandas_df() -> pd.DataFrame:
"""Returns a pandas DataFrame containing STRUCT types and using the `id`
Expand All @@ -117,18 +175,64 @@ def nested_structs_pandas_df() -> pd.DataFrame:


@pytest.fixture(scope="session")
def repeated_pandas_df() -> pd.DataFrame:
def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
    """Schema for the ``repeated_types`` table: a rowindex plus REPEATED columns."""
    repeated_columns = [
        ("int_list_col", "INTEGER"),
        ("bool_list_col", "BOOLEAN"),
        ("float_list_col", "FLOAT"),
        ("date_list_col", "DATE"),
        ("date_time_list_col", "DATETIME"),
        ("numeric_list_col", "NUMERIC"),
        ("string_list_col", "STRING"),
    ]
    schema = [bigquery.SchemaField("rowindex", "INTEGER")]
    schema.extend(
        bigquery.SchemaField(name, field_type, "REPEATED")
        for name, field_type in repeated_columns
    )
    return schema


@pytest.fixture(scope="session")
def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame:
    """Returns a BigFrames DataFrame containing REPEATED (list) types and using the
    `rowindex` column as the index (kept as a regular column too via `drop=False`)."""
    bf_df = compiler_session_w_repeated_types.read_gbq_table(
        "bigframes-dev.sqlglot_test.repeated_types"
    )
    bf_df = bf_df.set_index("rowindex", drop=False)
    return bf_df


@pytest.fixture(scope="session")
def repeated_types_pandas_df() -> pd.DataFrame:
    """Pandas DataFrame of LIST (repeated) columns, indexed by `rowindex`."""
    # TODO: add dtype conversion here if needed.
    frame = pd.read_json(DATA_DIR / "repeated.jsonl", lines=True)
    return frame.set_index("rowindex")


@pytest.fixture(scope="session")
def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
return [
bigquery.SchemaField("rowindex", "INTEGER"),
bigquery.SchemaField("json_col", "JSON"),
]


@pytest.fixture(scope="session")
def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame:
    """BigFrames DataFrame of JSON columns, indexed by `rowindex`."""
    table_id = "bigframes-dev.sqlglot_test.json_types"
    frame = compiler_session_w_json_types.read_gbq_table(table_id)
    # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns?
    return frame.set_index("rowindex", drop=True)


@pytest.fixture(scope="session")
def json_pandas_df() -> pd.DataFrame:
"""Returns a pandas DataFrame containing JSON types and using the `rowindex`
Expand All @@ -149,8 +253,10 @@ def json_pandas_df() -> pd.DataFrame:
]
df = pd.DataFrame(
{
"rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
"json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE),
},
index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
)
# TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns?
df = df.set_index("rowindex", drop=True)
return df
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,21 @@ WITH `bfcte_1` AS (
*
FROM (
SELECT
bfcol_17 AS `bfcol_46`,
bfcol_18 AS `bfcol_47`,
bfcol_19 AS `bfcol_48`,
bfcol_20 AS `bfcol_49`,
bfcol_21 AS `bfcol_50`,
bfcol_22 AS `bfcol_51`
`bfcol_17` AS `bfcol_46`,
`bfcol_18` AS `bfcol_47`,
`bfcol_19` AS `bfcol_48`,
`bfcol_20` AS `bfcol_49`,
`bfcol_21` AS `bfcol_50`,
`bfcol_22` AS `bfcol_51`
FROM `bfcte_6`
UNION ALL
SELECT
bfcol_40 AS `bfcol_46`,
bfcol_41 AS `bfcol_47`,
bfcol_42 AS `bfcol_48`,
bfcol_43 AS `bfcol_49`,
bfcol_44 AS `bfcol_50`,
bfcol_45 AS `bfcol_51`
`bfcol_40` AS `bfcol_46`,
`bfcol_41` AS `bfcol_47`,
`bfcol_42` AS `bfcol_48`,
`bfcol_43` AS `bfcol_49`,
`bfcol_44` AS `bfcol_50`,
`bfcol_45` AS `bfcol_51`
FROM `bfcte_7`
)
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
WITH `bfcte_0` AS (
SELECT
*
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` JSON, `bfcol_1` INT64>>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` JSON, `bfcol_2` INT64>>[STRUCT(0, PARSE_JSON('null'), 0), STRUCT(1, PARSE_JSON('true'), 1), STRUCT(2, PARSE_JSON('100'), 2), STRUCT(3, PARSE_JSON('0.98'), 3), STRUCT(4, PARSE_JSON('"a string"'), 4), STRUCT(5, PARSE_JSON('[]'), 5), STRUCT(6, PARSE_JSON('[1,2,3]'), 6), STRUCT(7, PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(8, PARSE_JSON('"100"'), 8), STRUCT(9, PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(10, PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(11, PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
)
SELECT
`bfcol_0` AS `json_col`
`bfcol_0` AS `rowindex`,
`bfcol_1` AS `json_col`
FROM `bfcte_0`
ORDER BY
`bfcol_1` ASC NULLS LAST
`bfcol_2` ASC NULLS LAST
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
WITH `bfcte_0` AS (
SELECT
`rowindex` AS `bfcol_0`,
`json_col` AS `bfcol_1`
FROM `bigframes-dev`.`sqlglot_test`.`json_types`
)
SELECT
`bfcol_0` AS `rowindex`,
`bfcol_1` AS `json_col`
FROM `bfcte_0`
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
WITH `bfcte_0` AS (
SELECT
`id` AS `bfcol_0`,
`people` AS `bfcol_1`
FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types`
)
SELECT
`bfcol_0` AS `id`,
`bfcol_0` AS `id_1`,
`bfcol_1` AS `people`
FROM `bfcte_0`
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
WITH `bfcte_0` AS (
SELECT
`rowindex` AS `bfcol_0`,
`int_list_col` AS `bfcol_1`,
`bool_list_col` AS `bfcol_2`,
`float_list_col` AS `bfcol_3`,
`date_list_col` AS `bfcol_4`,
`date_time_list_col` AS `bfcol_5`,
`numeric_list_col` AS `bfcol_6`,
`string_list_col` AS `bfcol_7`
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
)
SELECT
`bfcol_0` AS `rowindex`,
`bfcol_0` AS `rowindex_1`,
`bfcol_1` AS `int_list_col`,
`bfcol_2` AS `bool_list_col`,
`bfcol_3` AS `float_list_col`,
`bfcol_4` AS `date_list_col`,
`bfcol_5` AS `date_time_list_col`,
`bfcol_6` AS `numeric_list_col`,
`bfcol_7` AS `string_list_col`
FROM `bfcte_0`
19 changes: 12 additions & 7 deletions tests/unit/core/compile/sqlglot/test_compile_readlocal.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,31 @@ def test_compile_readlocal(

def test_compile_readlocal_w_structs_df(
    nested_structs_pandas_df: pd.DataFrame,
    compiler_session_w_nested_structs_types: bigframes.Session,
    snapshot,
):
    """Snapshot-test SQL compiled from a local DataFrame with nested STRUCT columns."""
    # TODO(b/427306734): Check why the output is different from the expected output.
    session = compiler_session_w_nested_structs_types
    frame = bpd.DataFrame(nested_structs_pandas_df, session=session)
    snapshot.assert_match(frame.sql, "out.sql")


def test_compile_readlocal_w_lists_df(
    repeated_types_pandas_df: pd.DataFrame,
    compiler_session_w_repeated_types: bigframes.Session,
    snapshot,
):
    """Snapshot-test SQL compiled from a local DataFrame with REPEATED (list) columns."""
    session = compiler_session_w_repeated_types
    frame = bpd.DataFrame(repeated_types_pandas_df, session=session)
    snapshot.assert_match(frame.sql, "out.sql")


def test_compile_readlocal_w_json_df(
    json_pandas_df: pd.DataFrame,
    compiler_session_w_json_types: bigframes.Session,
    snapshot,
):
    """Snapshot-test SQL compiled from a local DataFrame with a JSON column."""
    session = compiler_session_w_json_types
    frame = bpd.DataFrame(json_pandas_df, session=session)
    snapshot.assert_match(frame.sql, "out.sql")
14 changes: 14 additions & 0 deletions tests/unit/core/compile/sqlglot/test_compile_readtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot):
snapshot.assert_match(scalars_types_df.sql, "out.sql")


def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot):
    """Snapshot-test SQL compiled from reading the repeated-types table."""
    sql = repeated_types_df.sql
    snapshot.assert_match(sql, "out.sql")


def test_compile_readtable_w_nested_structs_types(
    nested_structs_types_df: bpd.DataFrame, snapshot
):
    """Snapshot-test SQL compiled from reading the nested-STRUCT-types table."""
    sql = nested_structs_types_df.sql
    snapshot.assert_match(sql, "out.sql")


def test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot):
    """Snapshot-test SQL compiled from reading the JSON-types table."""
    sql = json_types_df.sql
    snapshot.assert_match(sql, "out.sql")


def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot):
bf_df = scalars_types_df[["int64_col"]]
bf_df = bf_df.sort_values("int64_col")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import warnings

import db_dtypes
import google.cloud.bigquery.schema as schema
import pyarrow

Expand Down Expand Up @@ -61,6 +62,7 @@ def pyarrow_timestamp():
"TIME": pyarrow_time,
"TIMESTAMP": pyarrow_timestamp,
"BIGNUMERIC": pyarrow_bignumeric,
"JSON": db_dtypes.JSONArrowType,
}
ARROW_SCALAR_IDS_TO_BQ = {
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
Expand Down