Skip to content

Commit 761c364

Browse files
authored
docs: update snippet for Create a k-means model tutorial (#1664)
* docs: update snippet for Create a k-means model tutorial
* re-run bq_dataframes_llm_kmeans.ipynb
* use type casting to fix mypy error
* fix snippet
* Update imports for create_kmeans_model_test.py
* revert changes to llm k-means notebook
* use shapely object for scalability and remove links to docs
1 parent dd08857 commit 761c364

File tree

1 file changed

+23
-16
lines changed

1 file changed

+23
-16
lines changed

samples/snippets/create_kmeans_model_test.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,14 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
1818
your_model_id = random_model_id_eu
1919
# [START bigquery_dataframes_bqml_kmeans]
2020
import datetime
21+
import typing
2122

2223
import pandas as pd
24+
from shapely.geometry import Point
2325

2426
import bigframes
27+
import bigframes.bigquery as bbq
28+
import bigframes.geopandas
2529
import bigframes.pandas as bpd
2630

2731
bigframes.options.bigquery.project = your_gcp_project_id
@@ -41,21 +45,21 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
4145
}
4246
)
4347

44-
s = bpd.read_gbq(
45-
# Use ST_GEOGPOINT and ST_DISTANCE to analyze geographical
46-
# data. These functions determine spatial relationships between
47-
# geographical features.
48-
"""
49-
SELECT
50-
id,
51-
ST_DISTANCE(
52-
ST_GEOGPOINT(s.longitude, s.latitude),
53-
ST_GEOGPOINT(-0.1, 51.5)
54-
) / 1000 AS distance_from_city_center
55-
FROM
56-
`bigquery-public-data.london_bicycles.cycle_stations` s
57-
"""
48+
# Use GeoSeries.from_xy and BigQuery.st_distance to analyze geographical
49+
# data. These functions determine spatial relationships between
50+
# geographical features.
51+
52+
cycle_stations = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_stations")
53+
s = bpd.DataFrame(
54+
{
55+
"id": cycle_stations["id"],
56+
"xy": bigframes.geopandas.GeoSeries.from_xy(
57+
cycle_stations["longitude"], cycle_stations["latitude"]
58+
),
59+
}
5860
)
61+
s_distance = bbq.st_distance(s["xy"], Point(-0.1, 51.5), use_spheroid=False) / 1000
62+
s = bpd.DataFrame({"id": s["id"], "distance_from_city_center": s_distance})
5963

6064
# Define Python datetime objects in the UTC timezone for range comparison,
6165
# because BigQuery stores timestamp data in the UTC timezone.
@@ -91,8 +95,11 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
9195

9296
# Engineer features to cluster the stations. For each station, find the
9397
# average trip duration, number of trips, and distance from city center.
94-
stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
95-
{"duration": ["mean", "count"], "distance_from_city_center": "max"}
98+
stationstats = typing.cast(
99+
bpd.DataFrame,
100+
merged_df.groupby(["station_name", "isweekday"]).agg(
101+
{"duration": ["mean", "count"], "distance_from_city_center": "max"}
102+
),
96103
)
97104
stationstats.columns = pd.Index(
98105
["duration", "num_trips", "distance_from_city_center"]

0 commit comments

Comments
 (0)