|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import geopandas as gpd |
1 | 4 | import pandas as pd
|
2 | 5 | import pyarrow as pa
|
3 | 6 | import pyarrow.feather as feather
|
4 | 7 | import shapely
|
| 8 | +from lonboard.colormap import apply_continuous_cmap |
| 9 | +from lonboard.geoarrow.geopandas_interop import geopandas_to_geoarrow |
| 10 | +from palettable.colorbrewer.diverging import BrBG_10 |
5 | 11 |
|
# Remote location of the source parquet file. main() does not download it; the
# URL is only quoted in the error raised when the local copy is missing.
url = "https://ookla-open-data.s3.us-west-2.amazonaws.com/parquet/performance/type=mobile/year=2019/quarter=1/2019-01-01_performance_mobile_tiles.parquet"

# Local copy of that parquet file. The path is relative, so it is resolved
# against the current working directory when main() runs.
path = Path("2019-01-01_performance_mobile_tiles.parquet")
27 | 15 |
|
28 | 16 |
|
def main():
    """Convert the local tiles parquet file into a GeoArrow Feather file.

    Reads the parquet file at the module-level ``path``, reduces each WKT
    geometry in the ``tile`` column to its centroid, downcasts the three
    metric columns, appends a 3-component ``colors`` column derived from
    ``avg_d_kbps``, and writes the table uncompressed to
    ``2019-01-01_performance_mobile_tiles.feather``.

    Raises:
        ValueError: If the parquet file does not exist at ``path``.
    """
    if not path.exists():
        msg = f"Please download file to this directory from {url=}."
        raise ValueError(msg)

    tiles = pd.read_parquet(path)
    tile_geometries = shapely.from_wkt(tiles["tile"])
    centroids = shapely.centroid(tile_geometries)

    # Save space by using a smaller data type
    metric_columns = ["avg_d_kbps", "avg_u_kbps", "avg_lat_ms"]
    for name in metric_columns:
        tiles[name] = pd.to_numeric(tiles[name], downcast="unsigned")

    gdf = gpd.GeoDataFrame(tiles[metric_columns], geometry=centroids)
    table = geopandas_to_geoarrow(gdf, preserve_index=False)

    # Rescale download speed linearly so that `lower` maps to 0 and `upper`
    # maps to 1 before handing it to the colormap.
    lower, upper = 5000, 50000
    normalized = (gdf["avg_d_kbps"] - lower) / (upper - lower)

    # Flatten the per-row color array row-major so every consecutive run of
    # three values becomes one fixed-size list entry.
    rgb = apply_continuous_cmap(normalized, BrBG_10)
    color_column = pa.FixedSizeListArray.from_arrays(rgb.flatten("C"), 3)
    table = table.append_column("colors", color_column)

    feather.write_feather(
        table,
        "2019-01-01_performance_mobile_tiles.feather",
        compression="uncompressed",
    )
|
|
0 commit comments