Skip to content

Commit be9a89f

Browse files
authored
chore: add more types for read_gbq_table in conftest (#1843)
* chore: add more types for read_gbq_table in conftest
* add bug id
* undo dtype conversions for repeated_pandas_df
* undo unrelated changes
* fix json
1 parent 1863538 commit be9a89f

File tree

10 files changed

+202
-30
lines changed

10 files changed

+202
-30
lines changed

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def from_union(
182182

183183
selections = [
184184
sge.Alias(
185-
this=expr.alias_or_name,
185+
this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted),
186186
alias=sge.to_identifier(output_id, quoted=cls.quoted),
187187
)
188188
for expr, output_id in zip(select_expr.expressions, output_ids)

tests/unit/core/compile/sqlglot/conftest.py

Lines changed: 113 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,48 @@
2929
DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data"
3030

3131

32-
@pytest.fixture(scope="session")
33-
def compiler_session(scalars_types_table_schema):
32+
def _create_compiler_session(table_name, table_schema):
33+
"""Helper function to create a compiler session."""
3434
from bigframes.testing import compiler_session
3535

36-
# TODO: Check if ordering mode is needed for the tests.
37-
table_name = "scalar_types"
3836
anonymous_dataset = bigquery.DatasetReference.from_string(
3937
"bigframes-dev.sqlglot_test"
4038
)
4139
session = mocks.create_bigquery_session(
4240
table_name=table_name,
43-
table_schema=scalars_types_table_schema,
41+
table_schema=table_schema,
4442
anonymous_dataset=anonymous_dataset,
4543
)
4644
session._executor = compiler_session.SQLCompilerExecutor()
4745
return session
4846

4947

48+
@pytest.fixture(scope="session")
49+
def compiler_session(scalars_types_table_schema):
50+
"""Compiler session for scalar types."""
51+
return _create_compiler_session("scalar_types", scalars_types_table_schema)
52+
53+
54+
@pytest.fixture(scope="session")
55+
def compiler_session_w_repeated_types(repeated_types_table_schema):
56+
"""Compiler session for repeated data types."""
57+
return _create_compiler_session("repeated_types", repeated_types_table_schema)
58+
59+
60+
@pytest.fixture(scope="session")
61+
def compiler_session_w_nested_structs_types(nested_structs_types_table_schema):
62+
"""Compiler session for nested STRUCT data types."""
63+
return _create_compiler_session(
64+
"nested_structs_types", nested_structs_types_table_schema
65+
)
66+
67+
68+
@pytest.fixture(scope="session")
69+
def compiler_session_w_json_types(json_types_table_schema):
70+
"""Compiler session for JSON data types."""
71+
return _create_compiler_session("json_types", json_types_table_schema)
72+
73+
5074
@pytest.fixture(scope="session")
5175
def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
5276
return [
@@ -91,6 +115,40 @@ def scalars_types_pandas_df() -> pd.DataFrame:
91115
return df
92116

93117

118+
@pytest.fixture(scope="session")
119+
def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
120+
return [
121+
bigquery.SchemaField("id", "INTEGER"),
122+
bigquery.SchemaField(
123+
"people",
124+
"RECORD",
125+
fields=[
126+
bigquery.SchemaField("name", "STRING"),
127+
bigquery.SchemaField("age", "INTEGER"),
128+
bigquery.SchemaField(
129+
"address",
130+
"RECORD",
131+
fields=[
132+
bigquery.SchemaField("city", "STRING"),
133+
bigquery.SchemaField("country", "STRING"),
134+
],
135+
),
136+
],
137+
),
138+
]
139+
140+
141+
@pytest.fixture(scope="session")
142+
def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame:
143+
"""Returns a BigFrames DataFrame containing nested STRUCT types and using the `id`
144+
column as the index."""
145+
bf_df = compiler_session_w_nested_structs_types.read_gbq_table(
146+
"bigframes-dev.sqlglot_test.nested_structs_types"
147+
)
148+
bf_df = bf_df.set_index("id", drop=False)
149+
return bf_df
150+
151+
94152
@pytest.fixture(scope="session")
95153
def nested_structs_pandas_df() -> pd.DataFrame:
96154
"""Returns a pandas DataFrame containing STRUCT types and using the `id`
@@ -117,18 +175,64 @@ def nested_structs_pandas_df() -> pd.DataFrame:
117175

118176

119177
@pytest.fixture(scope="session")
120-
def repeated_pandas_df() -> pd.DataFrame:
178+
def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
179+
return [
180+
bigquery.SchemaField("rowindex", "INTEGER"),
181+
bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"),
182+
bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"),
183+
bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"),
184+
bigquery.SchemaField("date_list_col", "DATE", "REPEATED"),
185+
bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"),
186+
bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"),
187+
bigquery.SchemaField("string_list_col", "STRING", "REPEATED"),
188+
]
189+
190+
191+
@pytest.fixture(scope="session")
192+
def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame:
193+
"""Returns a BigFrames DataFrame containing repeated (ARRAY) types and using the `rowindex`
194+
column as the index."""
195+
bf_df = compiler_session_w_repeated_types.read_gbq_table(
196+
"bigframes-dev.sqlglot_test.repeated_types"
197+
)
198+
bf_df = bf_df.set_index("rowindex", drop=False)
199+
return bf_df
200+
201+
202+
@pytest.fixture(scope="session")
203+
def repeated_types_pandas_df() -> pd.DataFrame:
121204
"""Returns a pandas DataFrame containing LIST types and using the `rowindex`
122205
column as the index."""
123206

124207
df = pd.read_json(
125208
DATA_DIR / "repeated.jsonl",
126209
lines=True,
127210
)
211+
# TODO: add dtype conversion here if needed.
128212
df = df.set_index("rowindex")
129213
return df
130214

131215

216+
@pytest.fixture(scope="session")
217+
def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
218+
return [
219+
bigquery.SchemaField("rowindex", "INTEGER"),
220+
bigquery.SchemaField("json_col", "JSON"),
221+
]
222+
223+
224+
@pytest.fixture(scope="session")
225+
def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame:
226+
"""Returns a BigFrames DataFrame containing JSON types and using the `rowindex`
227+
column as the index."""
228+
bf_df = compiler_session_w_json_types.read_gbq_table(
229+
"bigframes-dev.sqlglot_test.json_types"
230+
)
231+
# TODO(b/427305807): Why does `drop=False` produce two "rowindex" columns?
232+
bf_df = bf_df.set_index("rowindex", drop=True)
233+
return bf_df
234+
235+
132236
@pytest.fixture(scope="session")
133237
def json_pandas_df() -> pd.DataFrame:
134238
"""Returns a pandas DataFrame containing JSON types and using the `rowindex`
@@ -149,8 +253,10 @@ def json_pandas_df() -> pd.DataFrame:
149253
]
150254
df = pd.DataFrame(
151255
{
256+
"rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
152257
"json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE),
153258
},
154-
index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
155259
)
260+
# TODO(b/427305807): Why does `drop=False` produce two "rowindex" columns?
261+
df = df.set_index("rowindex", drop=True)
156262
return df

tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,21 @@ WITH `bfcte_1` AS (
4949
*
5050
FROM (
5151
SELECT
52-
bfcol_17 AS `bfcol_46`,
53-
bfcol_18 AS `bfcol_47`,
54-
bfcol_19 AS `bfcol_48`,
55-
bfcol_20 AS `bfcol_49`,
56-
bfcol_21 AS `bfcol_50`,
57-
bfcol_22 AS `bfcol_51`
52+
`bfcol_17` AS `bfcol_46`,
53+
`bfcol_18` AS `bfcol_47`,
54+
`bfcol_19` AS `bfcol_48`,
55+
`bfcol_20` AS `bfcol_49`,
56+
`bfcol_21` AS `bfcol_50`,
57+
`bfcol_22` AS `bfcol_51`
5858
FROM `bfcte_6`
5959
UNION ALL
6060
SELECT
61-
bfcol_40 AS `bfcol_46`,
62-
bfcol_41 AS `bfcol_47`,
63-
bfcol_42 AS `bfcol_48`,
64-
bfcol_43 AS `bfcol_49`,
65-
bfcol_44 AS `bfcol_50`,
66-
bfcol_45 AS `bfcol_51`
61+
`bfcol_40` AS `bfcol_46`,
62+
`bfcol_41` AS `bfcol_47`,
63+
`bfcol_42` AS `bfcol_48`,
64+
`bfcol_43` AS `bfcol_49`,
65+
`bfcol_44` AS `bfcol_50`,
66+
`bfcol_45` AS `bfcol_51`
6767
FROM `bfcte_7`
6868
)
6969
)
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
WITH `bfcte_0` AS (
22
SELECT
33
*
4-
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` JSON, `bfcol_1` INT64>>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
4+
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` JSON, `bfcol_2` INT64>>[STRUCT(0, PARSE_JSON('null'), 0), STRUCT(1, PARSE_JSON('true'), 1), STRUCT(2, PARSE_JSON('100'), 2), STRUCT(3, PARSE_JSON('0.98'), 3), STRUCT(4, PARSE_JSON('"a string"'), 4), STRUCT(5, PARSE_JSON('[]'), 5), STRUCT(6, PARSE_JSON('[1,2,3]'), 6), STRUCT(7, PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(8, PARSE_JSON('"100"'), 8), STRUCT(9, PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(10, PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(11, PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
55
)
66
SELECT
7-
`bfcol_0` AS `json_col`
7+
`bfcol_0` AS `rowindex`,
8+
`bfcol_1` AS `json_col`
89
FROM `bfcte_0`
910
ORDER BY
10-
`bfcol_1` ASC NULLS LAST
11+
`bfcol_2` ASC NULLS LAST
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`json_col` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`json_types`
6+
)
7+
SELECT
8+
`bfcol_0` AS `rowindex`,
9+
`bfcol_1` AS `json_col`
10+
FROM `bfcte_0`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`id` AS `bfcol_0`,
4+
`people` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types`
6+
)
7+
SELECT
8+
`bfcol_0` AS `id`,
9+
`bfcol_0` AS `id_1`,
10+
`bfcol_1` AS `people`
11+
FROM `bfcte_0`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`int_list_col` AS `bfcol_1`,
5+
`bool_list_col` AS `bfcol_2`,
6+
`float_list_col` AS `bfcol_3`,
7+
`date_list_col` AS `bfcol_4`,
8+
`date_time_list_col` AS `bfcol_5`,
9+
`numeric_list_col` AS `bfcol_6`,
10+
`string_list_col` AS `bfcol_7`
11+
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
12+
)
13+
SELECT
14+
`bfcol_0` AS `rowindex`,
15+
`bfcol_0` AS `rowindex_1`,
16+
`bfcol_1` AS `int_list_col`,
17+
`bfcol_2` AS `bool_list_col`,
18+
`bfcol_3` AS `float_list_col`,
19+
`bfcol_4` AS `date_list_col`,
20+
`bfcol_5` AS `date_time_list_col`,
21+
`bfcol_6` AS `numeric_list_col`,
22+
`bfcol_7` AS `string_list_col`
23+
FROM `bfcte_0`

tests/unit/core/compile/sqlglot/test_compile_readlocal.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,26 +30,31 @@ def test_compile_readlocal(
3030

3131
def test_compile_readlocal_w_structs_df(
3232
nested_structs_pandas_df: pd.DataFrame,
33-
compiler_session: bigframes.Session,
33+
compiler_session_w_nested_structs_types: bigframes.Session,
3434
snapshot,
3535
):
36-
bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session)
36+
# TODO(b/427306734): Check why the output is different from the expected output.
37+
bf_df = bpd.DataFrame(
38+
nested_structs_pandas_df, session=compiler_session_w_nested_structs_types
39+
)
3740
snapshot.assert_match(bf_df.sql, "out.sql")
3841

3942

4043
def test_compile_readlocal_w_lists_df(
41-
repeated_pandas_df: pd.DataFrame,
42-
compiler_session: bigframes.Session,
44+
repeated_types_pandas_df: pd.DataFrame,
45+
compiler_session_w_repeated_types: bigframes.Session,
4346
snapshot,
4447
):
45-
bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session)
48+
bf_df = bpd.DataFrame(
49+
repeated_types_pandas_df, session=compiler_session_w_repeated_types
50+
)
4651
snapshot.assert_match(bf_df.sql, "out.sql")
4752

4853

4954
def test_compile_readlocal_w_json_df(
5055
json_pandas_df: pd.DataFrame,
51-
compiler_session: bigframes.Session,
56+
compiler_session_w_json_types: bigframes.Session,
5257
snapshot,
5358
):
54-
bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session)
59+
bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session_w_json_types)
5560
snapshot.assert_match(bf_df.sql, "out.sql")

tests/unit/core/compile/sqlglot/test_compile_readtable.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot):
2323
snapshot.assert_match(scalars_types_df.sql, "out.sql")
2424

2525

26+
def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot):
27+
snapshot.assert_match(repeated_types_df.sql, "out.sql")
28+
29+
30+
def test_compile_readtable_w_nested_structs_types(
31+
nested_structs_types_df: bpd.DataFrame, snapshot
32+
):
33+
snapshot.assert_match(nested_structs_types_df.sql, "out.sql")
34+
35+
36+
def test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot):
37+
snapshot.assert_match(json_types_df.sql, "out.sql")
38+
39+
2640
def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot):
2741
bf_df = scalars_types_df[["int64_col"]]
2842
bf_df = bf_df.sort_values("int64_col")

third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import warnings
1919

20+
import db_dtypes
2021
import google.cloud.bigquery.schema as schema
2122
import pyarrow
2223

@@ -61,6 +62,7 @@ def pyarrow_timestamp():
6162
"TIME": pyarrow_time,
6263
"TIMESTAMP": pyarrow_timestamp,
6364
"BIGNUMERIC": pyarrow_bignumeric,
65+
"JSON": db_dtypes.JSONArrowType,
6466
}
6567
ARROW_SCALAR_IDS_TO_BQ = {
6668
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes

0 commit comments

Comments
 (0)