Skip to content

Commit fab3c38

Browse files
authored
fix: generate GoogleSQL instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame (#1867)
* fix: generate GoogleSQL instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame
* finish adding system tests
* add unit tests
* map legacy SQL types to GoogleSQL
* fix more unit tests
* fix mypy
* fix Python 3.9 tests shapely
1 parent e3c06b4 commit fab3c38

File tree

7 files changed

+713
-47
lines changed

7 files changed

+713
-47
lines changed

bigframes/core/tools/bigquery_schema.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818

1919
import google.cloud.bigquery
2020

21+
_LEGACY_TO_GOOGLESQL_TYPES = {
22+
"BOOLEAN": "BOOL",
23+
"INTEGER": "INT64",
24+
"FLOAT": "FLOAT64",
25+
}
26+
2127

2228
def _type_to_sql(field: google.cloud.bigquery.SchemaField):
2329
"""Turn the type information of the field into SQL.
@@ -26,7 +32,12 @@ def _type_to_sql(field: google.cloud.bigquery.SchemaField):
2632
"""
2733
if field.field_type.casefold() in ("record", "struct"):
2834
return _to_struct(field.fields)
29-
return field.field_type
35+
36+
# Map from legacy SQL names (the ones used in the BigQuery schema API) to
37+
# the GoogleSQL types. Importantly, FLOAT is from legacy SQL, but not valid
38+
# in GoogleSQL. See internal issue b/428190014.
39+
type_ = _LEGACY_TO_GOOGLESQL_TYPES.get(field.field_type.upper(), field.field_type)
40+
return type_
3041

3142

3243
def _field_to_sql(field: google.cloud.bigquery.SchemaField):

bigframes/pandas/io/api.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,27 @@ def read_gbq(
218218
read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq)
219219

220220

221+
def _run_read_gbq_colab_sessionless_dry_run(
222+
query: str,
223+
*,
224+
pyformat_args: Dict[str, Any],
225+
) -> pandas.Series:
226+
"""Run a dry_run without a session."""
227+
228+
query_formatted = bigframes.core.pyformat.pyformat(
229+
query,
230+
pyformat_args=pyformat_args,
231+
dry_run=True,
232+
)
233+
bqclient = _get_bqclient()
234+
job = _dry_run(query_formatted, bqclient)
235+
return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ())
236+
237+
221238
def _try_read_gbq_colab_sessionless_dry_run(
222-
create_query: Callable[[], str],
239+
query: str,
240+
*,
241+
pyformat_args: Dict[str, Any],
223242
) -> Optional[pandas.Series]:
224243
"""Run a dry_run without a session, only if the session hasn't yet started."""
225244

@@ -230,10 +249,9 @@ def _try_read_gbq_colab_sessionless_dry_run(
230249
# to local data and not any BigQuery tables.
231250
with _default_location_lock:
232251
if not config.options.bigquery._session_started:
233-
bqclient = _get_bqclient()
234-
query = create_query()
235-
job = _dry_run(query, bqclient)
236-
return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ())
252+
return _run_read_gbq_colab_sessionless_dry_run(
253+
query, pyformat_args=pyformat_args
254+
)
237255

238256
# Explicitly return None to indicate that we didn't run the dry run query.
239257
return None
@@ -286,21 +304,13 @@ def _read_gbq_colab(
286304
if pyformat_args is None:
287305
pyformat_args = {}
288306

289-
# Delay formatting the query with the special "session-less" logic. This
290-
# avoids doing unnecessary work if the session already has a location or has
291-
# already started.
292-
create_query = functools.partial(
293-
bigframes.core.pyformat.pyformat,
294-
query_or_table,
295-
pyformat_args=pyformat_args,
296-
dry_run=True,
297-
)
298-
299307
# Only try to set the global location if it's not a dry run. We don't want
300308
# to bind to a location too early. This is especially important if the query
301309
# only refers to local data and not any BigQuery tables.
302310
if dry_run:
303-
result = _try_read_gbq_colab_sessionless_dry_run(create_query)
311+
result = _try_read_gbq_colab_sessionless_dry_run(
312+
query_or_table, pyformat_args=pyformat_args
313+
)
304314

305315
if result is not None:
306316
return result
@@ -309,6 +319,15 @@ def _read_gbq_colab(
309319
# started. That means we can safely call the "real" _read_gbq_colab,
310320
# which generates slightly nicer SQL.
311321
else:
322+
# Delay formatting the query with the special "session-less" logic. This
323+
# avoids doing unnecessary work if the session already has a location or has
324+
# already started.
325+
create_query = functools.partial(
326+
bigframes.core.pyformat.pyformat,
327+
query_or_table,
328+
pyformat_args=pyformat_args,
329+
dry_run=True,
330+
)
312331
_set_default_session_location_if_possible_deferred_query(create_query)
313332

314333
return global_session.with_default_session(
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

0 commit comments

Comments
 (0)