-
Notifications
You must be signed in to change notification settings - Fork 52
feat: Support dry_run in to_pandas()
#1436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 27 commits
dac34d7
b88ba73
330a647
5f8a76a
75f4ce1
0b4c48c
40c557b
fe82c6d
1adc96a
3c0efc2
9c3d849
725050b
7550f6a
3b9ea0e
70e1986
c2c3fca
cde29a0
e291c70
86bf46b
09fb874
4af0ac4
416ad49
301e993
67e40e9
6eeb69e
5a85ad5
e11ccdb
c610e57
b4db897
1401076
fb9f8bf
30b9f3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
from __future__ import annotations | ||
|
||
import ast | ||
import copy | ||
import dataclasses | ||
import datetime | ||
import functools | ||
|
@@ -30,11 +31,13 @@ | |
import textwrap | ||
import typing | ||
from typing import ( | ||
Any, | ||
Iterable, | ||
List, | ||
Literal, | ||
Mapping, | ||
Optional, | ||
overload, | ||
Sequence, | ||
Tuple, | ||
TYPE_CHECKING, | ||
|
@@ -501,15 +504,42 @@ def to_arrow( | |
pa_table = pa_table.rename_columns(list(self.column_labels) + pa_index_labels) | ||
return pa_table, execute_result.query_job | ||
|
||
@overload | ||
def to_pandas( | ||
self, | ||
max_download_size: Optional[int] = ..., | ||
sampling_method: Optional[str] = ..., | ||
random_state: Optional[int] = ..., | ||
*, | ||
ordered: bool = ..., | ||
dry_run: Literal[False] = ..., | ||
allow_large_results: Optional[bool] = ..., | ||
) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: | ||
... | ||
|
||
@overload | ||
def to_pandas( | ||
self, | ||
max_download_size: Optional[int] = ..., | ||
sampling_method: Optional[str] = ..., | ||
random_state: Optional[int] = ..., | ||
*, | ||
ordered: bool = ..., | ||
dry_run: Literal[True] = ..., | ||
allow_large_results: Optional[bool] = ..., | ||
) -> Tuple[pd.Series, Optional[bigquery.QueryJob]]: | ||
... | ||
|
||
def to_pandas( | ||
self, | ||
max_download_size: Optional[int] = None, | ||
sampling_method: Optional[str] = None, | ||
random_state: Optional[int] = None, | ||
*, | ||
ordered: bool = True, | ||
dry_run: bool = False, | ||
allow_large_results: Optional[bool] = None, | ||
) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: | ||
) -> Tuple[pd.DataFrame | pd.Series, Optional[bigquery.QueryJob]]: | ||
"""Run query and download results as a pandas DataFrame. | ||
|
||
Args: | ||
|
@@ -531,9 +561,12 @@ def to_pandas( | |
ordered (bool, default True): | ||
Determines whether the resulting pandas dataframe will be ordered. | ||
Whether the row ordering is deterministics depends on whether session ordering is strict. | ||
dry_run (bool, default False): | ||
Whether to perfrom a dry run. If true, the method will return a pandas Series containing | ||
dry run statistics. | ||
|
||
Returns: | ||
pandas.DataFrame, QueryJob | ||
pandas.DataFrame | pandas.Series, QueryJob | ||
""" | ||
if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): | ||
raise NotImplementedError( | ||
|
@@ -549,6 +582,11 @@ def to_pandas( | |
else: | ||
sampling = sampling.with_disabled() | ||
|
||
if dry_run: | ||
if sampling.enable_downsampling: | ||
raise NotImplementedError("Dry run with sampling is not supproted") | ||
return self._compute_dry_run(ordered=ordered) | ||
|
||
df, query_job = self._materialize_local( | ||
materialize_options=MaterializationOptions( | ||
downsampling=sampling, | ||
|
@@ -798,11 +836,61 @@ def split( | |
return [sliced_block.drop_columns(drop_cols) for sliced_block in sliced_blocks] | ||
|
||
def _compute_dry_run( | ||
self, value_keys: Optional[Iterable[str]] = None | ||
) -> bigquery.QueryJob: | ||
self, value_keys: Optional[Iterable[str]] = None, ordered: bool = True | ||
) -> typing.Tuple[pd.Series, bigquery.QueryJob]: | ||
index: List[Any] = [] | ||
values: List[Any] = [] | ||
|
||
index.append("columnCount") | ||
values.append(len(self.value_columns)) | ||
index.append("columnDtypes") | ||
values.append( | ||
{ | ||
col: self.expr.get_column_type(self.resolve_label_exact_or_error(col)) | ||
for col in self.column_labels | ||
} | ||
) | ||
|
||
index.append("indexLevel") | ||
values.append(self.index.nlevels) | ||
index.append("indexDtypes") | ||
values.append(self.index.dtypes) | ||
|
||
expr = self._apply_value_keys_to_expr(value_keys=value_keys) | ||
query_job = self.session._executor.dry_run(expr) | ||
return query_job | ||
query_job = self.session._executor.dry_run(expr, ordered) | ||
job_api_repr = copy.deepcopy(query_job._properties) | ||
|
||
job_ref = job_api_repr["jobReference"] | ||
for key, val in job_ref.items(): | ||
index.append(key) | ||
values.append(val) | ||
|
||
index.append("jobType") | ||
values.append(job_api_repr["configuration"]["jobType"]) | ||
|
||
query_config = job_api_repr["configuration"]["query"] | ||
for key in ("destinationTable", "useLegacySql"): | ||
index.append(key) | ||
values.append(query_config.get(key)) | ||
|
||
query_stats = job_api_repr["statistics"]["query"] | ||
for key in ( | ||
"referencedTables", | ||
"totalBytesProcessed", | ||
"cacheHit", | ||
"statementType", | ||
): | ||
index.append(key) | ||
values.append(query_stats.get(key)) | ||
|
||
index.append("creationTime") | ||
values.append( | ||
pd.Timestamp( | ||
job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC" | ||
) | ||
) | ||
|
||
return pd.Series(values, index=index), query_job | ||
|
||
def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): | ||
expr = self._expr | ||
|
@@ -2691,20 +2779,49 @@ def column_ids(self) -> Sequence[str]: | |
def is_null(self) -> bool: | ||
return len(self._block._index_columns) == 0 | ||
|
||
@overload | ||
def to_pandas( | ||
self, | ||
*, | ||
ordered: Optional[bool] = ..., | ||
dry_run: Literal[False] = ..., | ||
allow_large_results: Optional[bool] = ..., | ||
) -> Tuple[pd.Index, Optional[bigquery.QueryJob]]: | ||
... | ||
|
||
@overload | ||
def to_pandas( | ||
self, | ||
*, | ||
ordered: Optional[bool] = ..., | ||
dry_run: Literal[True] = ..., | ||
allow_large_results: Optional[bool] = ..., | ||
) -> Tuple[pd.Series, Optional[bigquery.QueryJob]]: | ||
... | ||
|
||
def to_pandas( | ||
self, | ||
*, | ||
ordered: Optional[bool] = None, | ||
dry_run: bool = False, | ||
allow_large_results: Optional[bool] = None, | ||
) -> Tuple[pd.Index, Optional[bigquery.QueryJob]]: | ||
) -> Tuple[pd.Index | pd.Series, Optional[bigquery.QueryJob]]: | ||
"""Executes deferred operations and downloads the results.""" | ||
if len(self.column_ids) == 0: | ||
raise bigframes.exceptions.NullIndexError( | ||
"Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." | ||
) | ||
ordered = ordered if ordered is not None else True | ||
if dry_run: | ||
series, query_job = self._block.select_columns([]).to_pandas( | ||
ordered=ordered, | ||
allow_large_results=allow_large_results, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think |
||
dry_run=dry_run, | ||
) | ||
return series, query_job | ||
|
||
df, query_job = self._block.select_columns([]).to_pandas( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, we can get rid of this |
||
ordered=ordered, allow_large_results=allow_large_results | ||
ordered=ordered, allow_large_results=allow_large_results, dry_run=dry_run | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why include the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why include the |
||
) | ||
return df.index, query_job | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ | |
from __future__ import annotations | ||
|
||
import typing | ||
from typing import Hashable, Literal, Optional, Sequence, Union | ||
from typing import Hashable, Literal, Optional, overload, Sequence, Union | ||
|
||
import bigframes_vendored.constants as constants | ||
import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index | ||
|
@@ -228,15 +228,16 @@ def T(self) -> Index: | |
return self.transpose() | ||
|
||
@property | ||
def query_job(self) -> Optional[bigquery.QueryJob]: | ||
def query_job(self) -> bigquery.QueryJob: | ||
"""BigQuery job metadata for the most recent query. | ||
|
||
Returns: | ||
The most recent `QueryJob | ||
<https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob>`_. | ||
""" | ||
if self._query_job is None: | ||
self._query_job = self._block._compute_dry_run() | ||
_, query_job = self._block._compute_dry_run() | ||
self._query_job = query_job | ||
return self._query_job | ||
|
||
def __repr__(self) -> str: | ||
|
@@ -252,7 +253,8 @@ def __repr__(self) -> str: | |
opts = bigframes.options.display | ||
max_results = opts.max_rows | ||
if opts.repr_mode == "deferred": | ||
return formatter.repr_query_job(self._block._compute_dry_run()) | ||
_, dry_run_query_job = self._block._compute_dry_run() | ||
return formatter.repr_query_job(dry_run_query_job) | ||
|
||
pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) | ||
self._query_job = query_job | ||
|
@@ -490,20 +492,51 @@ def __getitem__(self, key: int) -> typing.Any: | |
else: | ||
raise NotImplementedError(f"Index key not supported {key}") | ||
|
||
def to_pandas(self, *, allow_large_results: Optional[bool] = None) -> pandas.Index: | ||
@overload | ||
def to_pandas( | ||
self, | ||
*, | ||
allow_large_results: Optional[bool] = ..., | ||
dry_run: Literal[False] = ..., | ||
) -> pandas.Index: | ||
... | ||
|
||
@overload | ||
def to_pandas( | ||
self, *, allow_large_results: Optional[bool] = ..., dry_run: Literal[True] = ... | ||
) -> pandas.Series: | ||
... | ||
|
||
def to_pandas( | ||
self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False | ||
) -> pandas.Index | pandas.Series: | ||
"""Gets the Index as a pandas Index. | ||
|
||
Args: | ||
allow_large_results (bool, default None): | ||
If not None, overrides the global setting to allow or disallow large query results | ||
over the default size limit of 10 GB. | ||
dry_run (bool, default False): | ||
If this argument is true, this method will not process the data. Instead, it returns | ||
a Pandas series containing dtype and the amount of bytes to be processed. | ||
|
||
Returns: | ||
pandas.Index: | ||
A pandas Index with all of the labels from this Index. | ||
pandas.Index | pandas.Series: | ||
A pandas Index with all of the labels from this Index. If dry run is set to True, | ||
returns a Series containing dry run statistics. | ||
""" | ||
if dry_run: | ||
sycai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
series, query_job = self._block.index.to_pandas( | ||
ordered=True, allow_large_results=allow_large_results, dry_run=dry_run | ||
) | ||
if query_job: | ||
self._query_job = query_job | ||
return series | ||
|
||
# Repeat the to_pandas() call to make mypy deduce type correctly, because mypy cannot resolve | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't you just use |
||
# Literal[True/False] to bool | ||
df, query_job = self._block.index.to_pandas( | ||
ordered=True, allow_large_results=allow_large_results | ||
ordered=True, allow_large_results=allow_large_results, dry_run=dry_run | ||
) | ||
if query_job: | ||
self._query_job = query_job | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.