
Commit c840728

morgandu and sasha-gitg authored
feat: enable feature store batch serve to BigQuery and GCS for csv and tfrecord (#919)
* feat: add batch_serve_to_bq for BigQuery tables and batch_serve_to_gcs for csv and tfrecord files in the Featurestore class
* fix: change entity_type_ids and entity_type_destination_fields to serving_feature_ids and feature_destination_fields
* fix: remove white space
* Update google/cloud/aiplatform/featurestore/featurestore.py (applied 5 times; Co-authored-by: sasha-gitg <[email protected]>)
* fix: Featurestore create method example usage
* fix: get_timestamp_proto for millisecond precision cap
* fix: unit tests for get_timestamp_proto

Co-authored-by: sasha-gitg <[email protected]>
1 parent 0f6b670 commit c840728
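Since the main implementation diff below is collapsed, here is a rough usage sketch of the two new methods. The parameter names mirror the system tests in this commit; the project, featurestore, entity type ids, feature ids, and URIs are placeholders, and the Featurestore construction details may differ slightly from the actual SDK surface.

```python
from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")

# Placeholder featurestore; batch_serve_to_bq / batch_serve_to_gcs are added to this class.
featurestore = aiplatform.Featurestore(featurestore_name="movie_prediction")

# Serve selected features into a BigQuery table.
featurestore.batch_serve_to_bq(
    serving_feature_ids={"users": ["age", "gender"], "movies": ["average_rating"]},
    read_instances="gs://my-bucket/read_instances.csv",
    bq_destination_output_uri="bq://my-project.my_dataset.served_features",
)

# Or export to GCS as csv or tfrecord files.
featurestore.batch_serve_to_gcs(
    serving_feature_ids={"users": ["age", "gender"], "movies": ["average_rating"]},
    read_instances="gs://my-bucket/read_instances.csv",
    gcs_destination_output_uri_prefix="gs://my-bucket/served_features",
    gcs_destination_type="tfrecord",
)
```

The optional feature_destination_fields argument, keyed by feature resource name as in the tests below, remaps the output column names.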

File tree

7 files changed: +959 -40 lines changed

google/cloud/aiplatform/featurestore/featurestore.py (+467 -10)

Large diffs are not rendered by default.

google/cloud/aiplatform/utils/__init__.py (+7 -5)

@@ -628,9 +628,11 @@ def get_timestamp_proto(
     """
     if not time:
         time = datetime.datetime.now()
-    t = time.timestamp()
-    seconds = int(t)
-    # must not have higher than millisecond precision.
-    nanos = int((t % 1 * 1e6) * 1e3)

-    return timestamp_pb2.Timestamp(seconds=seconds, nanos=nanos)
+    time_str = time.isoformat(sep=" ", timespec="milliseconds")
+    time = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S.%f")
+
+    timestamp_proto = timestamp_pb2.Timestamp()
+    timestamp_proto.FromDatetime(time)
+
+    return timestamp_proto
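The new path round-trips the datetime through an ISO string capped at milliseconds before building the proto. A minimal standalone sketch of the same logic, using only the standard library and protobuf (the example datetime is arbitrary):

```python
import datetime

from google.protobuf import timestamp_pb2

time = datetime.datetime(2022, 1, 1, 12, 0, 0, 123456)  # microsecond precision

# Cap at millisecond precision by formatting and re-parsing, as the patched
# get_timestamp_proto does.
time_str = time.isoformat(sep=" ", timespec="milliseconds")  # '2022-01-01 12:00:00.123'
time = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S.%f")

timestamp_proto = timestamp_pb2.Timestamp()
timestamp_proto.FromDatetime(time)

print(timestamp_proto.nanos)  # 123000000 -> the sub-millisecond digits (456) are dropped
```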

google/cloud/aiplatform/utils/featurestore_utils.py (+1 -0)

@@ -29,6 +29,7 @@

 RESOURCE_ID_PATTERN_REGEX = r"[a-z_][a-z0-9_]{0,59}"
 GCS_SOURCE_TYPE = {"csv", "avro"}
+GCS_DESTINATION_TYPE = {"csv", "tfrecord"}

 _FEATURE_VALUE_TYPE_UNSPECIFIED = "VALUE_TYPE_UNSPECIFIED"
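GCS_DESTINATION_TYPE mirrors the existing GCS_SOURCE_TYPE set. A hypothetical sketch of the kind of guard batch_serve_to_gcs can build on it; the helper name below is illustrative, not necessarily the SDK's internal code:

```python
from google.cloud.aiplatform.utils import featurestore_utils


def _validate_gcs_destination_type(gcs_destination_type: str) -> str:
    # Illustrative check: only the export formats listed in GCS_DESTINATION_TYPE are accepted.
    if gcs_destination_type not in featurestore_utils.GCS_DESTINATION_TYPE:
        raise ValueError(
            f"Only {featurestore_utils.GCS_DESTINATION_TYPE} are supported "
            f"gcs_destination_type values, got {gcs_destination_type!r}."
        )
    return gcs_destination_type
```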

tests/system/aiplatform/e2e_base.py (+33 -0)

@@ -24,6 +24,7 @@

 from google.api_core import exceptions
 from google.cloud import aiplatform
+from google.cloud import bigquery
 from google.cloud import storage
 from google.cloud.aiplatform import initializer

@@ -90,6 +91,38 @@ def delete_staging_bucket(self, shared_state: Dict[str, Any]):
         bucket = shared_state["bucket"]
         bucket.delete(force=True)

+    @pytest.fixture(scope="class")
+    def prepare_bigquery_dataset(
+        self, shared_state: Dict[str, Any]
+    ) -> Generator[bigquery.dataset.Dataset, None, None]:
+        """Create a bigquery dataset and store bigquery resource object in shared state."""
+
+        bigquery_client = bigquery.Client(project=_PROJECT)
+        shared_state["bigquery_client"] = bigquery_client
+
+        dataset_name = f"{self._temp_prefix.lower()}_{uuid.uuid4()}".replace("-", "_")
+        dataset_id = f"{_PROJECT}.{dataset_name}"
+        shared_state["bigquery_dataset_id"] = dataset_id
+
+        dataset = bigquery.Dataset(dataset_id)
+        dataset.location = _LOCATION
+        shared_state["bigquery_dataset"] = bigquery_client.create_dataset(dataset)
+
+        yield
+
+    @pytest.fixture(scope="class")
+    def delete_bigquery_dataset(self, shared_state: Dict[str, Any]):
+        """Delete the bigquery dataset"""
+
+        yield
+
+        # Get the bigquery dataset id used for testing and wipe it
+        bigquery_dataset = shared_state["bigquery_dataset"]
+        bigquery_client = shared_state["bigquery_client"]
+        bigquery_client.delete_dataset(
+            bigquery_dataset.dataset_id, delete_contents=True, not_found_ok=True
+        )  # Make an API request.
+
     @pytest.fixture(scope="class", autouse=True)
     def teardown(self, shared_state: Dict[str, Any]):
         """Delete every Vertex AI resource created during test"""

tests/system/aiplatform/test_featurestore.py (+115 -1)

@@ -16,6 +16,7 @@
 #

 import logging
+import pytest

 from google.cloud import aiplatform
 from tests.system.aiplatform import e2e_base

@@ -29,6 +30,8 @@
     "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movies.avro"
 )

+_TEST_READ_INSTANCE_SRC = "gs://cloud-samples-data-us-central1/vertex-ai/feature-store/datasets/movie_prediction.csv"
+
 _TEST_FEATURESTORE_ID = "movie_prediction"
 _TEST_USER_ENTITY_TYPE_ID = "users"
 _TEST_MOVIE_ENTITY_TYPE_ID = "movies"

@@ -42,6 +45,12 @@
 _TEST_MOVIE_AVERAGE_RATING_FEATURE_ID = "average_rating"


+@pytest.mark.usefixtures(
+    "prepare_staging_bucket",
+    "delete_staging_bucket",
+    "prepare_bigquery_dataset",
+    "delete_bigquery_dataset",
+)
 class TestFeaturestore(e2e_base.TestEndToEnd):

     _temp_prefix = "temp_vertex_sdk_e2e_featurestore_test"

@@ -131,7 +140,7 @@ def test_create_get_list_features(self, shared_state):
         user_age_feature = user_entity_type.create_feature(
             feature_id=_TEST_USER_AGE_FEATURE_ID, value_type="INT64"
         )
-
+        shared_state["user_age_feature_resource_name"] = user_age_feature.resource_name
         get_user_age_feature = user_entity_type.get_feature(
             feature_id=_TEST_USER_AGE_FEATURE_ID
         )

@@ -142,6 +151,9 @@ def test_create_get_list_features(self, shared_state):
             value_type="STRING",
             entity_type_name=user_entity_type_name,
         )
+        shared_state[
+            "user_gender_feature_resource_name"
+        ] = user_gender_feature.resource_name

         get_user_gender_feature = aiplatform.Feature(
             feature_name=user_gender_feature.resource_name

@@ -153,6 +165,9 @@ def test_create_get_list_features(self, shared_state):
         user_liked_genres_feature = user_entity_type.create_feature(
             feature_id=_TEST_USER_LIKED_GENRES_FEATURE_ID, value_type="STRING_ARRAY",
         )
+        shared_state[
+            "user_liked_genres_feature_resource_name"
+        ] = user_liked_genres_feature.resource_name

         get_user_liked_genres_feature = aiplatform.Feature(
             feature_name=user_liked_genres_feature.resource_name

@@ -250,6 +265,105 @@ def test_search_features(self, shared_state):
             len(list_searched_features) - shared_state["base_list_searched_features"]
         ) == 6

+    def test_batch_serve_to_gcs(self, shared_state, caplog):
+
+        assert shared_state["featurestore"]
+        assert shared_state["bucket"]
+        assert shared_state["user_age_feature_resource_name"]
+        assert shared_state["user_gender_feature_resource_name"]
+        assert shared_state["user_liked_genres_feature_resource_name"]
+
+        featurestore = shared_state["featurestore"]
+        bucket_name = shared_state["staging_bucket_name"]
+        user_age_feature_resource_name = shared_state["user_age_feature_resource_name"]
+        user_gender_feature_resource_name = shared_state[
+            "user_gender_feature_resource_name"
+        ]
+        user_liked_genres_feature_resource_name = shared_state[
+            "user_liked_genres_feature_resource_name"
+        ]
+
+        aiplatform.init(
+            project=e2e_base._PROJECT, location=e2e_base._LOCATION,
+        )
+
+        caplog.set_level(logging.INFO)
+
+        featurestore.batch_serve_to_gcs(
+            serving_feature_ids={
+                _TEST_USER_ENTITY_TYPE_ID: [
+                    _TEST_USER_AGE_FEATURE_ID,
+                    _TEST_USER_GENDER_FEATURE_ID,
+                    _TEST_USER_LIKED_GENRES_FEATURE_ID,
+                ],
+                _TEST_MOVIE_ENTITY_TYPE_ID: [
+                    _TEST_MOVIE_TITLE_FEATURE_ID,
+                    _TEST_MOVIE_GENRES_FEATURE_ID,
+                    _TEST_MOVIE_AVERAGE_RATING_FEATURE_ID,
+                ],
+            },
+            feature_destination_fields={
+                user_age_feature_resource_name: "user_age_dest",
+                user_gender_feature_resource_name: "user_gender_dest",
+                user_liked_genres_feature_resource_name: "user_liked_genres_dest",
+            },
+            read_instances=_TEST_READ_INSTANCE_SRC,
+            gcs_destination_output_uri_prefix=f"gs://{bucket_name}/featurestore_test/tfrecord",
+            gcs_destination_type="tfrecord",
+        )
+        assert "Featurestore feature values served." in caplog.text
+
+        caplog.clear()
+
+    def test_batch_serve_to_bq(self, shared_state, caplog):
+
+        assert shared_state["featurestore"]
+        assert shared_state["bigquery_dataset"]
+        assert shared_state["user_age_feature_resource_name"]
+        assert shared_state["user_gender_feature_resource_name"]
+        assert shared_state["user_liked_genres_feature_resource_name"]
+
+        featurestore = shared_state["featurestore"]
+        bigquery_dataset_id = shared_state["bigquery_dataset_id"]
+        user_age_feature_resource_name = shared_state["user_age_feature_resource_name"]
+        user_gender_feature_resource_name = shared_state[
+            "user_gender_feature_resource_name"
+        ]
+        user_liked_genres_feature_resource_name = shared_state[
+            "user_liked_genres_feature_resource_name"
+        ]
+
+        aiplatform.init(
+            project=e2e_base._PROJECT, location=e2e_base._LOCATION,
+        )
+
+        caplog.set_level(logging.INFO)
+
+        featurestore.batch_serve_to_bq(
+            serving_feature_ids={
+                _TEST_USER_ENTITY_TYPE_ID: [
+                    _TEST_USER_AGE_FEATURE_ID,
+                    _TEST_USER_GENDER_FEATURE_ID,
+                    _TEST_USER_LIKED_GENRES_FEATURE_ID,
+                ],
+                _TEST_MOVIE_ENTITY_TYPE_ID: [
+                    _TEST_MOVIE_TITLE_FEATURE_ID,
+                    _TEST_MOVIE_GENRES_FEATURE_ID,
+                    _TEST_MOVIE_AVERAGE_RATING_FEATURE_ID,
+                ],
+            },
+            feature_destination_fields={
+                user_age_feature_resource_name: "user_age_dest",
+                user_gender_feature_resource_name: "user_gender_dest",
+                user_liked_genres_feature_resource_name: "user_liked_genres_dest",
+            },
+            read_instances=_TEST_READ_INSTANCE_SRC,
+            bq_destination_output_uri=f"bq://{bigquery_dataset_id}.test_table",
+        )
+
+        assert "Featurestore feature values served." in caplog.text
+        caplog.clear()
+
     def test_online_reads(self, shared_state):
         assert shared_state["user_entity_type"]
         assert shared_state["movie_entity_type"]