Skip to content

Commit 15e1277

Browse files
authored
docs: add data visualization samples for public doc (#1847)
1 parent be9a89f commit 15e1277

File tree

1 file changed

+149
-0
lines changed

1 file changed

+149
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (t
4+
# you may not use this file except in compliance wi
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in
10+
# distributed under the License is distributed on a
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit
12+
# See the License for the specific language governi
13+
# limitations under the License.
14+
15+
16+
def test_data_visualization() -> None:
    """Smoke-test the BigQuery DataFrames plotting samples embedded in public docs.

    Each section is wrapped in [START]/[END] region tags so it can be pulled
    verbatim into the documentation. The samples read BigQuery public datasets
    with ``bigframes.pandas`` and render matplotlib charts; nothing is asserted
    beyond the calls completing without error.
    """
    # [START bigquery_dataframes_data_visualization_penguin_histogram]
    import bigframes.pandas as bpd

    penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    penguins["culmen_depth_mm"].plot.hist(bins=40)
    # [END bigquery_dataframes_data_visualization_penguin_histogram]

    # [START bigquery_dataframes_data_visualization_noaa_line_chart]
    import bigframes.pandas as bpd

    noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021")

    # Calculate median temperature for each day
    noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median()

    noaa_surface_median_temps.plot.line()
    # [END bigquery_dataframes_data_visualization_noaa_line_chart]

    # [START bigquery_dataframes_data_visualization_usa_names_area_chart]
    import bigframes.pandas as bpd

    usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

    # Count the occurrences of the target names each year. The result is a dataframe with a multi-index.
    # NOTE: the grouping keys must be a list — a tuple would be interpreted as a
    # single (MultiIndex) column label by the pandas API and raise KeyError.
    name_counts = (
        usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))]
        .groupby(["year", "name"])["number"]
        .sum()
    )

    # Flatten the index of the dataframe so that the counts for each name have their own columns.
    name_counts = name_counts.unstack(level=1).fillna(0)

    name_counts.plot.area(stacked=False, alpha=0.5)
    # [END bigquery_dataframes_data_visualization_usa_names_area_chart]

    # [START bigquery_dataframes_data_visualization_penguin_bar_chart]
    import bigframes.pandas as bpd

    penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    penguin_count_by_sex = (
        penguins[penguins["sex"].isin(("MALE", "FEMALE"))]
        .groupby("sex")["species"]
        .count()
    )
    penguin_count_by_sex.plot.bar()
    # [END bigquery_dataframes_data_visualization_penguin_bar_chart]

    # [START bigquery_dataframes_data_visualization_taxi_scatter_plot]
    import bigframes.pandas as bpd

    taxi_trips = bpd.read_gbq(
        "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021"
    ).dropna()

    # Data Cleaning
    taxi_trips = taxi_trips[
        taxi_trips["trip_distance"].between(0, 10, inclusive="right")
    ]
    taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")]

    # If you are using partial ordering mode, you will also need to assign an order to your dataset.
    # Otherwise, the next line can be skipped.
    taxi_trips = taxi_trips.sort_values("pickup_datetime")

    taxi_trips.plot.scatter(x="trip_distance", y="fare_amount", alpha=0.5)
    # [END bigquery_dataframes_data_visualization_taxi_scatter_plot]

    # [START bigquery_dataframes_data_visualization_noaa_sampling_n]
    import bigframes.pandas as bpd

    noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021")

    # Calculate median temperature for each day
    noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median()

    noaa_surface_median_temps.plot.line(sampling_n=40)
    # [END bigquery_dataframes_data_visualization_noaa_sampling_n]

    # [START bigquery_dataframes_data_visualization_usa_names_subplots]
    import bigframes.pandas as bpd

    usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

    # Count the occurrences of the target names each year. The result is a dataframe with a multi-index.
    name_counts = (
        usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))]
        .groupby(["year", "name"])["number"]
        .sum()
    )

    # Flatten the index of the dataframe so that the counts for each name have their own columns.
    name_counts = name_counts.unstack(level=1).fillna(0)

    name_counts.plot.area(subplots=True, alpha=0.5)
    # [END bigquery_dataframes_data_visualization_usa_names_subplots]

    # [START bigquery_dataframes_data_visualization_taxi_scatter_multidimension]
    import bigframes.pandas as bpd

    taxi_trips = bpd.read_gbq(
        "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021"
    ).dropna()

    # Data Cleaning
    taxi_trips = taxi_trips[
        taxi_trips["trip_distance"].between(0, 10, inclusive="right")
    ]
    taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")]

    # If you are using partial ordering mode, you also need to assign an order to your dataset.
    # Otherwise, the next line can be skipped.
    taxi_trips = taxi_trips.sort_values("pickup_datetime")

    # Scale passenger_count so marker sizes are visually distinguishable.
    taxi_trips["passenger_count_scaled"] = taxi_trips["passenger_count"] * 30

    taxi_trips.plot.scatter(
        x="trip_distance",
        xlabel="trip distance (miles)",
        y="fare_amount",
        ylabel="fare amount (usd)",
        alpha=0.5,
        s="passenger_count_scaled",
        label="passenger_count",
        c="tip_amount",
        cmap="jet",
        colorbar=True,
        legend=True,
        figsize=(15, 7),
        sampling_n=1000,
    )
    # [END bigquery_dataframes_data_visualization_taxi_scatter_multidimension]

0 commit comments

Comments
 (0)