Skip to content

Commit 4291d65

Browse files
SalemJordenSalem Boylandgcf-owl-bot[bot]tswast
authored
docs: add a code sample for creating a kmeans model (#267)
* k-means code sample * formatting * added test * docs: add code sampke for creating kmeans model * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * license header + region tags added * Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast <[email protected]> * code corrections resolved * code corrections commit 1 * descriptions of geospatial analysis functions * explantions revised for clarity * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast <[email protected]> * code corrections * code revision * code changes * revisions * expected output previews * revisions * tests passing, expected output characters >80 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * column wrapping * reset session before running code smaples * Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast <[email protected]> * predict function added to tutorial * replaced project_id with model_id * reformatting * reformat --------- Co-authored-by: Salem Boyland <[email protected]> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Tim Swast <[email protected]>
1 parent 8f73d9e commit 4291d65

File tree

2 files changed

+189
-0
lines changed

2 files changed

+189
-0
lines changed

samples/snippets/conftest.py

+37
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import pytest
1919
import test_utils.prefixer
2020

21+
import bigframes.pandas as bpd
22+
2123
prefixer = test_utils.prefixer.Prefixer(
2224
"python-bigquery-dataframes", "samples/snippets"
2325
)
@@ -43,6 +45,16 @@ def project_id(bigquery_client: bigquery.Client) -> str:
4345
return bigquery_client.project
4446

4547

48+
@pytest.fixture(autouse=True)
def reset_session():
    """Start every sample from a clean BigQuery DataFrames session.

    Runs automatically before each test so that samples querying data in
    different locations do not inherit state from one another.
    """
    # Discard any session left over from a previous sample, then clear the
    # location override so the next sample is free to choose its own.
    bpd.reset_session()
    bpd.options.bigquery.location = None
56+
57+
4658
@pytest.fixture(scope="session")
4759
def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
4860
dataset_id = prefixer.create_prefix()
@@ -53,6 +65,17 @@ def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[st
5365
bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
5466

5567

68+
@pytest.fixture(scope="session")
def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
    """Session-scoped temporary BigQuery dataset in the EU multi-region.

    Yields the dataset ID (without the project prefix); the dataset and all
    of its contents are deleted when the test session ends.
    """
    new_dataset_id = prefixer.create_prefix()
    eu_dataset = bigquery.Dataset(f"{project_id}.{new_dataset_id}")
    eu_dataset.location = "EU"
    bigquery_client.create_dataset(eu_dataset)
    yield new_dataset_id
    bigquery_client.delete_dataset(eu_dataset, delete_contents=True, not_found_ok=True)
77+
78+
5679
@pytest.fixture
5780
def random_model_id(
5881
bigquery_client: bigquery.Client, project_id: str, dataset_id: str
@@ -64,3 +87,17 @@ def random_model_id(
6487
full_model_id = f"{project_id}.{dataset_id}.{random_model_id}"
6588
yield full_model_id
6689
bigquery_client.delete_model(full_model_id, not_found_ok=True)
90+
91+
92+
@pytest.fixture
def random_model_id_eu(
    bigquery_client: bigquery.Client, project_id: str, dataset_id_eu: str
) -> Iterator[str]:
    """
    Create a new model ID each time, so random_model_id_eu can be used
    as a target for creating a model in the EU dataset. The model (if the
    sample created one) is deleted after the test finishes.
    """
    random_model_id_eu = prefixer.create_prefix()
    full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}"
    yield full_model_id
    bigquery_client.delete_model(full_model_id, not_found_ok=True)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def test_kmeans_sample(project_id: str, random_model_id_eu: str):
    """Docs sample: train, save, and predict with a BQML k-means model.

    The body between the region tags is published as documentation, so it is
    written tutorial-style. It clusters London bicycle-hire stations by trip
    statistics and distance from the city center.
    """
    your_gcp_project_id = project_id
    your_model_id = random_model_id_eu
    # [START bigquery_dataframes_bqml_kmeans]
    import datetime

    import bigframes
    import bigframes.pandas as bpd

    bigframes.options.bigquery.project = your_gcp_project_id
    # Compute in the EU multi-region to query the London bicycles dataset.
    bigframes.options.bigquery.location = "EU"

    # Extract the information you'll need to train the k-means model in this
    # tutorial. Use the read_gbq function to represent cycle hires
    # data as a DataFrame.
    h = bpd.read_gbq(
        "bigquery-public-data.london_bicycles.cycle_hire",
        col_order=["start_station_name", "start_station_id", "start_date", "duration"],
    ).rename(
        columns={
            "start_station_name": "station_name",
            "start_station_id": "station_id",
        }
    )

    s = bpd.read_gbq(
        # Use ST_GEOGPOINT and ST_DISTANCE to analyze geographical
        # data. These functions determine spatial relationships between
        # geographical features.
        """
        SELECT
        id,
        ST_DISTANCE(
        ST_GEOGPOINT(s.longitude, s.latitude),
        ST_GEOGPOINT(-0.1, 51.5)
        ) / 1000 AS distance_from_city_center
        FROM
        `bigquery-public-data.london_bicycles.cycle_stations` s
        """
    )

    # Define Python datetime objects in the UTC timezone for range comparison,
    # because BigQuery stores timestamp data in the UTC timezone.
    sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)

    h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]

    # Replace each day-of-the-week number with the corresponding "weekday" or
    # "weekend" label by using the Series.map method.
    h = h.assign(
        isweekday=h.start_date.dt.dayofweek.map(
            {
                0: "weekday",
                1: "weekday",
                2: "weekday",
                3: "weekday",
                4: "weekday",
                5: "weekend",
                6: "weekend",
            }
        )
    )

    # Supplement each trip in "h" with the station distance information from
    # "s" by merging the two DataFrames by station ID.
    merged_df = h.merge(
        right=s,
        how="inner",
        left_on="station_id",
        right_on="id",
    )

    # Engineer features to cluster the stations. For each station, find the
    # average trip duration, number of trips, and distance from city center.
    stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
        {"duration": ["mean", "count"], "distance_from_city_center": "max"}
    )
    stationstats.columns = ["duration", "num_trips", "distance_from_city_center"]
    stationstats = stationstats.sort_values(
        by="distance_from_city_center", ascending=True
    ).reset_index()

    # Expected output results: >>> stationstats.head(3)
    # station_name isweekday duration num_trips distance_from_city_center
    # Borough Road... weekday 1110 5749 0.12624
    # Borough Road... weekend 2125 1774 0.12624
    # Webber Street... weekday 795 6517 0.164021
    # 3 rows × 5 columns

    # [END bigquery_dataframes_bqml_kmeans]

    # [START bigquery_dataframes_bqml_kmeans_fit]

    from bigframes.ml.cluster import KMeans

    # To determine an optimal number of clusters, construct and fit several
    # K-Means objects with different values of num_clusters, find the error
    # measure, and pick the point at which the error measure is at its minimum
    # value.
    cluster_model = KMeans(n_clusters=4)
    cluster_model.fit(stationstats)
    cluster_model.to_gbq(
        your_model_id,  # For example: "bqml_tutorial.london_station_clusters"
        replace=True,
    )
    # [END bigquery_dataframes_bqml_kmeans_fit]

    # [START bigquery_dataframes_bqml_kmeans_predict]

    # Select model you'll use for predictions. `read_gbq_model` loads model
    # data from BigQuery, but you could also use the `cluster_model` object
    # from previous steps.
    cluster_model = bpd.read_gbq_model(
        your_model_id,
        # For example: "bqml_tutorial.london_station_clusters",
    )

    # Use 'contains' function to filter by stations containing the string
    # "Kennington".
    stationstats = stationstats.loc[
        stationstats["station_name"].str.contains("Kennington")
    ]

    result = cluster_model.predict(stationstats)

    # Expected output results: >>> result.peek(3)
    # CENTROID... NEAREST... station_name isweekday duration num_trips dist...
    # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13
    # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13
    # 1 [{'CENTROID_ID'... Webber... weekday 795 6517 0.16
    # 3 rows × 7 columns

    # [END bigquery_dataframes_bqml_kmeans_predict]

    assert result is not None

0 commit comments

Comments
 (0)