@@ -18,10 +18,14 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
18
18
your_model_id = random_model_id_eu
19
19
# [START bigquery_dataframes_bqml_kmeans]
20
20
import datetime
21
+ import typing
21
22
22
23
import pandas as pd
24
+ from shapely .geometry import Point
23
25
24
26
import bigframes
27
+ import bigframes .bigquery as bbq
28
+ import bigframes .geopandas
25
29
import bigframes .pandas as bpd
26
30
27
31
bigframes .options .bigquery .project = your_gcp_project_id
@@ -41,21 +45,21 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
41
45
}
42
46
)
43
47
44
- s = bpd .read_gbq (
45
- # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical
46
- # data. These functions determine spatial relationships between
47
- # geographical features.
48
- """
49
- SELECT
50
- id,
51
- ST_DISTANCE(
52
- ST_GEOGPOINT(s.longitude, s.latitude),
53
- ST_GEOGPOINT(-0.1, 51.5)
54
- ) / 1000 AS distance_from_city_center
55
- FROM
56
- `bigquery-public-data.london_bicycles.cycle_stations` s
57
- """
48
+ # Use GeoSeries.from_xy and bigframes.bigquery.st_distance to analyze geographical
49
+ # data. These functions determine spatial relationships between
50
+ # geographical features.
51
+
52
+ cycle_stations = bpd .read_gbq ("bigquery-public-data.london_bicycles.cycle_stations" )
53
+ s = bpd .DataFrame (
54
+ {
55
+ "id" : cycle_stations ["id" ],
56
+ "xy" : bigframes .geopandas .GeoSeries .from_xy (
57
+ cycle_stations ["longitude" ], cycle_stations ["latitude" ]
58
+ ),
59
+ }
58
60
)
61
+ s_distance = bbq .st_distance (s ["xy" ], Point (- 0.1 , 51.5 ), use_spheroid = False ) / 1000
62
+ s = bpd .DataFrame ({"id" : s ["id" ], "distance_from_city_center" : s_distance })
59
63
60
64
# Define Python datetime objects in the UTC timezone for range comparison,
61
65
# because BigQuery stores timestamp data in the UTC timezone.
@@ -91,8 +95,11 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
91
95
92
96
# Engineer features to cluster the stations. For each station, find the
93
97
# average trip duration, number of trips, and distance from city center.
94
- stationstats = merged_df .groupby (["station_name" , "isweekday" ]).agg (
95
- {"duration" : ["mean" , "count" ], "distance_from_city_center" : "max" }
98
+ stationstats = typing .cast (
99
+ bpd .DataFrame ,
100
+ merged_df .groupby (["station_name" , "isweekday" ]).agg (
101
+ {"duration" : ["mean" , "count" ], "distance_from_city_center" : "max" }
102
+ ),
96
103
)
97
104
stationstats .columns = pd .Index (
98
105
["duration" , "num_trips" , "distance_from_city_center" ]
0 commit comments