diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 91d742b5b9..f30cb3775a 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:d3de8a02819f65001effcbd3ea76ce97e9bcff035c7a89457f40f892c87c5b32 -# created: 2024-07-03T17:43:00.77142528Z + digest: sha256:52210e0e0559f5ea8c52be148b33504022e1faef4e95fbe4b32d68022af2fa7e +# created: 2024-07-08T19:25:35.862283192Z diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile index a26ce61930..5205308b33 100644 --- a/.kokoro/docker/docs/Dockerfile +++ b/.kokoro/docker/docs/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ubuntu:22.04 +from ubuntu:24.04 ENV DEBIAN_FRONTEND noninteractive @@ -40,7 +40,6 @@ RUN apt-get update \ libssl-dev \ libsqlite3-dev \ portaudio19-dev \ - python3-distutils \ redis-server \ software-properties-common \ ssh \ @@ -60,18 +59,22 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && rm -f /var/cache/apt/archives/*.deb -###################### Install python 3.9.13 -# Download python 3.9.13 -RUN wget https://www.python.org/ftp/python/3.9.13/Python-3.9.13.tgz +###################### Install python 3.10.14 for docs/docfx session + +# Download python 3.10.14 +RUN wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz # Extract files -RUN tar -xvf Python-3.9.13.tgz +RUN tar -xvf Python-3.10.14.tgz -# Install python 3.9.13 -RUN ./Python-3.9.13/configure --enable-optimizations +# Install python 3.10.14 +RUN ./Python-3.10.14/configure --enable-optimizations RUN make altinstall +RUN python3.10 -m venv /venv +ENV PATH /venv/bin:$PATH + ###################### Install pip RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \ && python3 /tmp/get-pip.py \ @@ -84,4 +87,4 @@ RUN python3 -m pip COPY requirements.txt /requirements.txt RUN python3 -m pip install --require-hashes -r requirements.txt -CMD ["python3.8"] +CMD ["python3.10"] diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt index 0e5d70f20f..7129c77155 100644 --- a/.kokoro/docker/docs/requirements.txt +++ b/.kokoro/docker/docs/requirements.txt @@ -4,9 +4,9 @@ # # pip-compile --allow-unsafe --generate-hashes requirements.in # -argcomplete==3.2.3 \ - --hash=sha256:bf7900329262e481be5a15f56f19736b376df6f82ed27576fa893652c5de6c23 \ - --hash=sha256:c12355e0494c76a2a7b73e3a59b09024ca0ba1e279fb9ed6c1b82d5b74b6a70c +argcomplete==3.4.0 \ + --hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 \ + --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f # via nox colorlog==6.8.2 \ --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ @@ -16,23 +16,27 @@ distlib==0.3.8 \ --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 # via virtualenv -filelock==3.13.1 \ - --hash=sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e \ - --hash=sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c +filelock==3.15.4 \ + --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ + 
--hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 # via virtualenv -nox==2024.3.2 \ - --hash=sha256:e53514173ac0b98dd47585096a55572fe504fecede58ced708979184d05440be \ - --hash=sha256:f521ae08a15adbf5e11f16cb34e8d0e6ea521e0b92868f684e91677deb974553 +nox==2024.4.15 \ + --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ + --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f # via -r requirements.in -packaging==24.0 \ - --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \ - --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9 +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 # via nox -platformdirs==4.2.0 \ - --hash=sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068 \ - --hash=sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768 +platformdirs==4.2.2 \ + --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ + --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 # via virtualenv -virtualenv==20.25.1 \ - --hash=sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a \ - --hash=sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197 +tomli==2.0.1 \ + --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ + --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f + # via nox +virtualenv==20.26.3 \ + --hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ + --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 # via nox diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 35ece0e4d2..9622baf0ba 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -20,9 +20,9 @@ cachetools==5.3.3 \ --hash=sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945 \ --hash=sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105 # via google-auth -certifi==2024.6.2 \ - --hash=sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516 \ - --hash=sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56 +certifi==2024.7.4 \ + --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ + --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 # via requests cffi==1.16.0 \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -371,23 +371,23 @@ more-itertools==10.3.0 \ # via # jaraco-classes # jaraco-functools -nh3==0.2.17 \ - --hash=sha256:0316c25b76289cf23be6b66c77d3608a4fdf537b35426280032f432f14291b9a \ - --hash=sha256:1a814dd7bba1cb0aba5bcb9bebcc88fd801b63e21e2450ae6c52d3b3336bc911 \ - --hash=sha256:1aa52a7def528297f256de0844e8dd680ee279e79583c76d6fa73a978186ddfb \ - --hash=sha256:22c26e20acbb253a5bdd33d432a326d18508a910e4dcf9a3316179860d53345a \ - --hash=sha256:40015514022af31975c0b3bca4014634fa13cb5dc4dbcbc00570acc781316dcc \ - --hash=sha256:40d0741a19c3d645e54efba71cb0d8c475b59135c1e3c580f879ad5514cbf028 \ - --hash=sha256:551672fd71d06cd828e282abdb810d1be24e1abb7ae2543a8fa36a71c1006fe9 \ - --hash=sha256:66f17d78826096291bd264f260213d2b3905e3c7fae6dfc5337d49429f1dc9f3 \ - --hash=sha256:85cdbcca8ef10733bd31f931956f7fbb85145a4d11ab9e6742bbf44d88b7e351 \ - 
--hash=sha256:a3f55fabe29164ba6026b5ad5c3151c314d136fd67415a17660b4aaddacf1b10 \ - --hash=sha256:b4427ef0d2dfdec10b641ed0bdaf17957eb625b2ec0ea9329b3d28806c153d71 \ - --hash=sha256:ba73a2f8d3a1b966e9cdba7b211779ad8a2561d2dba9674b8a19ed817923f65f \ - --hash=sha256:c21bac1a7245cbd88c0b0e4a420221b7bfa838a2814ee5bb924e9c2f10a1120b \ - --hash=sha256:c551eb2a3876e8ff2ac63dff1585236ed5dfec5ffd82216a7a174f7c5082a78a \ - --hash=sha256:c790769152308421283679a142dbdb3d1c46c79c823008ecea8e8141db1a2062 \ - --hash=sha256:d7a25fd8c86657f5d9d576268e3b3767c5cd4f42867c9383618be8517f0f022a +nh3==0.2.18 \ + --hash=sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164 \ + --hash=sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86 \ + --hash=sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b \ + --hash=sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad \ + --hash=sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204 \ + --hash=sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a \ + --hash=sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200 \ + --hash=sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189 \ + --hash=sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f \ + --hash=sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811 \ + --hash=sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844 \ + --hash=sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4 \ + --hash=sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be \ + --hash=sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50 \ + --hash=sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307 \ + --hash=sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe # via readme-renderer nox==2024.4.15 \ --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ @@ -460,9 +460,9 @@ python-dateutil==2.9.0.post0 \ --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 # via gcp-releasetool -readme-renderer==43.0 \ - --hash=sha256:1818dd28140813509eeed8d62687f7cd4f7bad90d4db586001c5dc09d4fde311 \ - --hash=sha256:19db308d86ecd60e5affa3b2a98f017af384678c63c88e5d4556a380e674f3f9 +readme-renderer==44.0 \ + --hash=sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151 \ + --hash=sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1 # via twine requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 8249515719..354c356c7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,33 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.11.1...v1.12.0) (2024-07-31) + + +### Features + +* Add bigframes-mode label to query jobs ([#832](https://github.com/googleapis/python-bigquery-dataframes/issues/832)) ([c9eaff0](https://github.com/googleapis/python-bigquery-dataframes/commit/c9eaff0a1a0731b28f4c67bca5606db12a47c8c0)) +* Add config option to set partial ordering mode 
([#855](https://github.com/googleapis/python-bigquery-dataframes/issues/855)) ([823c0ce](https://github.com/googleapis/python-bigquery-dataframes/commit/823c0ce57611c0918a9e9999638d7393337fe9af)) +* Add stratify param support to ml.model_selection.train_test_split method ([#815](https://github.com/googleapis/python-bigquery-dataframes/issues/815)) ([27f8631](https://github.com/googleapis/python-bigquery-dataframes/commit/27f8631be81a3e136cfeb8904558bb4f3f5caa05)) +* Add streaming.StreamingDataFrame class ([#864](https://github.com/googleapis/python-bigquery-dataframes/issues/864)) ([a7d7197](https://github.com/googleapis/python-bigquery-dataframes/commit/a7d7197a32c55b989ae4ea8f6cf6e1c0f7184cd4)) +* Allow DataFrame.join for self-join on Null index ([#860](https://github.com/googleapis/python-bigquery-dataframes/issues/860)) ([e950533](https://github.com/googleapis/python-bigquery-dataframes/commit/e95053372c36ea5a91a2d7295c1a3a3671181670)) +* Support remote function cleanup with `session.close` ([#818](https://github.com/googleapis/python-bigquery-dataframes/issues/818)) ([ed06436](https://github.com/googleapis/python-bigquery-dataframes/commit/ed06436612c0d46f190f79721416d473bde7e2f4)) +* Support to_csv/parquet/json to local files/objects ([#858](https://github.com/googleapis/python-bigquery-dataframes/issues/858)) ([d0ab9cc](https://github.com/googleapis/python-bigquery-dataframes/commit/d0ab9cc47298bdde638299baecac9dffd7841ede)) + + +### Bug Fixes + +* Fewer relation joins from df self-operations ([#823](https://github.com/googleapis/python-bigquery-dataframes/issues/823)) ([0d24f73](https://github.com/googleapis/python-bigquery-dataframes/commit/0d24f737041c7dd70253ebb4baa8d8ef67bd4f1d)) +* Fix 'sql' property for null index ([#844](https://github.com/googleapis/python-bigquery-dataframes/issues/844)) ([1b6a556](https://github.com/googleapis/python-bigquery-dataframes/commit/1b6a556206a7a66283339d827ab12db2753521e2)) +* Fix unordered mode using ordered path to print frame ([#839](https://github.com/googleapis/python-bigquery-dataframes/issues/839)) ([93785cb](https://github.com/googleapis/python-bigquery-dataframes/commit/93785cb48be4a2eb8770129148bd0b897fed4ee7)) +* Reduce redundant `remote_function` deployments ([#856](https://github.com/googleapis/python-bigquery-dataframes/issues/856)) ([cbf2d42](https://github.com/googleapis/python-bigquery-dataframes/commit/cbf2d42e4d961a7537381a9c3b28a8b463ad8f74)) + + +### Documentation + +* Add partner attribution steps to integrations sample notebook ([#835](https://github.com/googleapis/python-bigquery-dataframes/issues/835)) ([d7b333f](https://github.com/googleapis/python-bigquery-dataframes/commit/d7b333fa26acddaeb5ccca4f81b1d624dff03ba2)) +* Make `get_global_session`/`close_session`/`reset_session` appears in the docs ([#847](https://github.com/googleapis/python-bigquery-dataframes/issues/847)) 
([01d6bbb](https://github.com/googleapis/python-bigquery-dataframes/commit/01d6bbb7479da706dc62bb5e7d51dc28a4042812)) + ## [1.11.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.11.0...v1.11.1) (2024-07-08) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index ad79543cb8..0506f1841e 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import Optional +from enum import Enum +from typing import Literal, Optional import warnings import google.api_core.exceptions @@ -26,6 +27,12 @@ import bigframes.constants import bigframes.exceptions + +class OrderingMode(Enum): + STRICT = "strict" + PARTIAL = "partial" + + SESSION_STARTED_MESSAGE = ( "Cannot change '{attribute}' once a session has started. " "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." @@ -57,6 +64,14 @@ def _validate_location(value: Optional[str]): ) +def _validate_ordering_mode(value: str) -> OrderingMode: + if value.casefold() == OrderingMode.STRICT.value.casefold(): + return OrderingMode.STRICT + if value.casefold() == OrderingMode.PARTIAL.value.casefold(): + return OrderingMode.PARTIAL + raise ValueError("Ordering mode must be one of 'strict' or 'partial'.") + + class BigQueryOptions: """Encapsulates configuration for working with a session.""" @@ -71,7 +86,7 @@ def __init__( kms_key_name: Optional[str] = None, skip_bq_connection_check: bool = False, *, - _strictly_ordered: bool = True, + ordering_mode: Literal["strict", "partial"] = "strict", ): self._credentials = credentials self._project = project @@ -82,8 +97,8 @@ def __init__( self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check self._session_started = False - # Determines the ordering strictness for the session. For internal use only. - self._strictly_ordered_internal = _strictly_ordered + # Determines the ordering strictness for the session. + self._ordering_mode = _validate_ordering_mode(ordering_mode) @property def application_name(self) -> Optional[str]: @@ -241,6 +256,10 @@ def kms_key_name(self, value: str): self._kms_key_name = value @property - def _strictly_ordered(self) -> bool: - """Internal use only. Controls whether total row order is always maintained for DataFrame/Series.""" - return self._strictly_ordered_internal + def ordering_mode(self) -> Literal["strict", "partial"]: + """Controls whether total row order is always maintained for DataFrame/Series.""" + return self._ordering_mode.value + + @ordering_mode.setter + def ordering_mode(self, ordering_mode: Literal["strict", "partial"]) -> None: + self._ordering_mode = _validate_ordering_mode(ordering_mode) diff --git a/bigframes/constants.py b/bigframes/constants.py index 9591297956..3c18fd20bd 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -99,3 +99,5 @@ # BigQuery default is 10000, leave 100 for overhead MAX_COLUMNS = 9900 + +SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows." 
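The bigquery_options.py hunk above replaces the internal _strictly_ordered flag with a public ordering_mode option validated against the new OrderingMode enum. As a minimal usage sketch (not part of this diff), assuming the usual bigframes.pandas options entry point and an illustrative public table, the option would be set before the first query starts a session:

import bigframes.pandas as bpd

# Must be set before a session starts; afterwards the SESSION_STARTED_MESSAGE
# guard in BigQueryOptions rejects changes. Valid values are "strict" (the
# default) and "partial", per _validate_ordering_mode.
bpd.options.bigquery.ordering_mode = "partial"

# Illustrative table name; any readable BigQuery table would do.
df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

# In partial ordering mode, .peek(n) previews n arbitrary rows without
# requiring a total row order, matching the SUGGEST_PEEK_PREVIEW hint above.
print(df.peek(5))
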
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9b858046bc..aa66129572 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -147,7 +147,7 @@ def _compiled_schema(self) -> schemata.ArraySchema: def as_cached( self: ArrayValue, cache_table: google.cloud.bigquery.Table, - ordering: Optional[orderings.TotalOrdering], + ordering: Optional[orderings.RowOrdering], ) -> ArrayValue: """ Replace the node with an equivalent one that references a tabel where the value has been materialized to. @@ -194,7 +194,7 @@ def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ - if not self.session._strictly_ordered: + if self.node.order_ambiguous and not self.session._strictly_ordered: raise ValueError("Generating offsets not supported in unordered mode") return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) @@ -346,7 +346,7 @@ def project_window_op( """ # TODO: Support non-deterministic windowing if window_spec.row_bounded or not op.order_independent: - if not self.session._strictly_ordered: + if self.node.order_ambiguous and not self.session._strictly_ordered: raise ValueError( "Order-dependent windowed ops not supported in unordered mode" ) @@ -460,9 +460,9 @@ def _cross_join_w_labels( conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" ) if join_side == "left": - joined_array = self.join(labels_array, join_def=join) + joined_array = self.relational_join(labels_array, join_def=join) else: - joined_array = labels_array.join(self, join_def=join) + joined_array = labels_array.relational_join(self, join_def=join) return joined_array def _create_unpivot_labels_array( @@ -485,30 +485,27 @@ def _create_unpivot_labels_array( return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) - def join( + def relational_join( self, other: ArrayValue, join_def: join_def.JoinDefinition, - allow_row_identity_join: bool = False, - ): + ) -> ArrayValue: join_node = nodes.JoinNode( left_child=self.node, right_child=other.node, join=join_def, - allow_row_identity_join=allow_row_identity_join, ) - if allow_row_identity_join: - return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) def try_align_as_projection( self, other: ArrayValue, join_type: join_def.JoinType, + join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], mappings: typing.Tuple[join_def.JoinColumnMapping, ...], ) -> typing.Optional[ArrayValue]: result = bigframes.core.rewrite.join_as_projection( - self.node, other.node, mappings, join_type + self.node, other.node, join_keys, mappings, join_type ) if result is not None: return ArrayValue(result) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fef91f88dc..1b7b231403 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -327,8 +327,21 @@ def reset_index(self, drop: bool = True) -> Block: A new Block because dropping index columns can break references from Index classes that point to this block. 
""" - new_index_col_id = guid.generate_guid() - expr = self._expr.promote_offsets(new_index_col_id) + expr = self._expr + if ( + self.session._default_index_type + == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + ): + new_index_col_id = guid.generate_guid() + expr = expr.promote_offsets(new_index_col_id) + new_index_cols = [new_index_col_id] + elif self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL: + new_index_cols = [] + else: + raise ValueError( + f"Unrecognized default index kind: {self.session._default_index_type}" + ) + if drop: # Even though the index might be part of the ordering, keep that # ordering expression as reset_index shouldn't change the row @@ -336,9 +349,8 @@ def reset_index(self, drop: bool = True) -> Block: expr = expr.drop_columns(self.index_columns) return Block( expr, - index_columns=[new_index_col_id], + index_columns=new_index_cols, column_labels=self.column_labels, - index_labels=[None], ) else: # Add index names to column index @@ -362,9 +374,8 @@ def reset_index(self, drop: bool = True) -> Block: return Block( expr, - index_columns=[new_index_col_id], + index_columns=new_index_cols, column_labels=column_labels_modified, - index_labels=[None], ) def set_index( @@ -500,7 +511,31 @@ def to_pandas( *, ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: - """Run query and download results as a pandas DataFrame.""" + """Run query and download results as a pandas DataFrame. + + Args: + max_download_size (int, default None): + Download size threshold in MB. If max_download_size is exceeded when downloading data + (e.g., to_pandas()), the data will be downsampled if + bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be + raised. If set to a value other than None, this will supersede the global config. + sampling_method (str, default None): + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm + returns a portion of the data from the beginning. It is fast and requires minimal + computations to perform the downsampling; "uniform": This algorithm returns uniform + random samples of the data. If set to a value other than None, this will supersede + the global config. + random_state (int, default None): + The seed for the uniform downsampling algorithm. If provided, the uniform method may + take longer to execute and require more computation. If set to a value other than + None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas dataframe will be ordered. + Whether the row ordering is deterministics depends on whether session ordering is strict. 
+ + Returns: + pandas.DataFrame, QueryJob + """ if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): raise NotImplementedError( f"The downsampling method {sampling_method} is not implemented, " @@ -544,7 +579,7 @@ def to_pandas_batches( dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) _, query_job = self.session._query_to_destination( - self.session._to_sql(self.expr, ordered=self.session._strictly_ordered), + self.session._to_sql(self.expr, ordered=True), list(self.index_columns), api_name="cached", do_clustering=False, @@ -2008,7 +2043,7 @@ def merge( mappings=(*left_mappings, *right_mappings), type=how, ) - joined_expr = self.expr.join(other.expr, join_def=join_def) + joined_expr = self.expr.relational_join(other.expr, join_def=join_def) result_columns = [] matching_join_labels = [] @@ -2072,13 +2107,17 @@ def merge( # # This keeps us from generating an index if the user joins a large # BigQuery table against small local data, for example. - if len(self._index_columns) > 0 and len(other._index_columns) > 0: + if ( + self.index.is_null + or other.index.is_null + or self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL + ): + expr = joined_expr + index_columns = [] + else: offset_index_id = guid.generate_guid() expr = joined_expr.promote_offsets(offset_index_id) index_columns = [offset_index_id] - else: - expr = joined_expr - index_columns = [] return Block(expr, index_columns=index_columns, column_labels=labels) @@ -2267,25 +2306,33 @@ def join( raise NotImplementedError( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) - # Special case for null index, + # Handle null index, which only supports row join + # This is the canonical way of aligning on null index, so always allow (ignore block_identity_join) + if self.index.nlevels == other.index.nlevels == 0: + result = try_row_join(self, other, how=how) + if result is not None: + return result + raise bigframes.exceptions.NullIndexError( + "Cannot implicitly align objects. Set an explicit index using set_index." + ) + + # Oddly, pandas row-wise join ignores right index names if ( - (self.index.nlevels == other.index.nlevels == 0) - and not sort - and not block_identity_join + not block_identity_join + and (self.index.nlevels == other.index.nlevels) + and (self.index.dtypes == other.index.dtypes) ): - return join_indexless(self, other, how=how) + result = try_row_join(self, other, how=how) + if result is not None: + return result self._throw_if_null_index("join") other._throw_if_null_index("join") if self.index.nlevels == other.index.nlevels == 1: - return join_mono_indexed( - self, other, how=how, sort=sort, block_identity_join=block_identity_join - ) - else: + return join_mono_indexed(self, other, how=how, sort=sort) + else: # Handles cases where one or both sides are multi-indexed # Always sort mult-index join - return join_multi_indexed( - self, other, how=how, sort=sort, block_identity_join=block_identity_join - ) + return join_multi_indexed(self, other, how=how, sort=sort) def _force_reproject(self) -> Block: """Forces a reprojection of the underlying tables expression. 
Used to force predicate/order application before subsequent operations.""" @@ -2307,7 +2354,7 @@ def is_monotonic_decreasing( return self._is_monotonic(column_id, increasing=False) def to_sql_query( - self, include_index: bool + self, include_index: bool, enable_cache: bool = True ) -> typing.Tuple[str, list[str], list[Label]]: """ Compiles this DataFrame's expression tree to SQL, optionally @@ -2341,7 +2388,9 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id - sql = self.session._to_sql(array_value, col_id_overrides=substitutions) + sql = self.session._to_sql( + array_value, col_id_overrides=substitutions, enable_cache=enable_cache + ) return ( sql, new_ids[: len(idx_labels)], @@ -2572,6 +2621,10 @@ def column_ids(self) -> Sequence[str]: """Column(s) to use as row labels.""" return self._block._index_columns + @property + def is_null(self) -> bool: + return len(self._block._index_columns) == 0 + def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: @@ -2582,10 +2635,7 @@ def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: index_columns = list(self._block.index_columns) expr = self._expr.select_columns(index_columns) results, _ = self.session._execute( - expr, - ordered=ordered - if (ordered is not None) - else self.session._strictly_ordered, + expr, ordered=ordered if ordered is not None else True ) df = expr.session._rows_to_dataframe(results) df = df.set_index(index_columns) @@ -2623,22 +2673,31 @@ def is_uniquely_named(self: BlockIndexProperties): return len(set(self.names)) == len(self.names) -def join_indexless( +def try_row_join( left: Block, right: Block, *, how="left", -) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: - """Joins two blocks""" +) -> Optional[Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]]: + """Joins two blocks that have a common root expression by merging the projections.""" left_expr = left.expr right_expr = right.expr + # Create a new array value, mapping from both, then left, and then right + join_keys = tuple( + join_defs.CoalescedColumnMapping( + left_source_id=left_id, + right_source_id=right_id, + destination_id=guid.generate_guid(), + ) + for left_id, right_id in zip(left.index_columns, right.index_columns) + ) left_mappings = [ join_defs.JoinColumnMapping( source_table=join_defs.JoinSide.LEFT, source_id=id, destination_id=guid.generate_guid(), ) - for id in left_expr.column_ids + for id in left.value_columns ] right_mappings = [ join_defs.JoinColumnMapping( @@ -2646,23 +2705,23 @@ def join_indexless( source_id=id, destination_id=guid.generate_guid(), ) - for id in right_expr.column_ids + for id in right.value_columns ] combined_expr = left_expr.try_align_as_projection( right_expr, join_type=how, + join_keys=join_keys, mappings=(*left_mappings, *right_mappings), ) if combined_expr is None: - raise bigframes.exceptions.NullIndexError( - "Cannot implicitly align objects. Set an explicit index using set_index." 
- ) + return None get_column_left = {m.source_id: m.destination_id for m in left_mappings} get_column_right = {m.source_id: m.destination_id for m in right_mappings} block = Block( combined_expr, column_labels=[*left.column_labels, *right.column_labels], - index_columns=(), + index_columns=(key.destination_id for key in join_keys), + index_labels=left.index.names, ) return ( block, @@ -2704,7 +2763,7 @@ def join_with_single_row( mappings=(*left_mappings, *right_mappings), type="cross", ) - combined_expr = left_expr.join( + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, ) @@ -2731,7 +2790,6 @@ def join_mono_indexed( *, how="left", sort=False, - block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left.expr right_expr = right.expr @@ -2759,14 +2817,14 @@ def join_mono_indexed( mappings=(*left_mappings, *right_mappings), type=how, ) - combined_expr = left_expr.join( + + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, - allow_row_identity_join=(not block_identity_join), ) + get_column_left = join_def.get_left_mapping() get_column_right = join_def.get_right_mapping() - # Drop original indices from each side. and used the coalesced combination generated by the join. left_index = get_column_left[left.index_columns[0]] right_index = get_column_right[right.index_columns[0]] # Drop original indices from each side. and used the coalesced combination generated by the join. @@ -2800,7 +2858,6 @@ def join_multi_indexed( *, how="left", sort=False, - block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: if not (left.index.is_uniquely_named() and right.index.is_uniquely_named()): raise ValueError("Joins not supported on indices with non-unique level names") @@ -2819,8 +2876,6 @@ def join_multi_indexed( left_join_ids = [left.index.resolve_level_exact(name) for name in common_names] right_join_ids = [right.index.resolve_level_exact(name) for name in common_names] - names_fully_match = len(left_only_names) == 0 and len(right_only_names) == 0 - left_expr = left.expr right_expr = right.expr @@ -2850,13 +2905,11 @@ def join_multi_indexed( type=how, ) - combined_expr = left_expr.join( + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, - # If we're only joining on a subset of the index columns, we need to - # perform a true join. 
- allow_row_identity_join=(names_fully_match and not block_identity_join), ) + get_column_left = join_def.get_left_mapping() get_column_right = join_def.get_right_mapping() left_ids_post_join = [get_column_left[id] for id in left_join_ids] diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 4c105ed03b..964113bd7b 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -14,19 +14,13 @@ from __future__ import annotations from bigframes.core.compile.api import ( - compile_ordered, - compile_peek, - compile_raw, - compile_unordered, + SQLCompiler, test_only_ibis_inferred_schema, test_only_try_evaluate, ) __all__ = [ - "compile_peek", - "compile_unordered", - "compile_ordered", - "compile_raw", + "SQLCompiler", "test_only_try_evaluate", "test_only_ibis_inferred_schema", ] diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 1f7d0a4507..468c5522d9 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -22,41 +22,54 @@ import bigframes.core.ordering import bigframes.core.schema - -def compile_peek(node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: - """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" - return compiler.compile_unordered_ir(node).peek_sql(n_rows) - - -def compile_unordered( - node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} -) -> str: - """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" - return compiler.compile_unordered_ir(node).to_sql(col_id_overrides=col_id_overrides) - - -def compile_ordered( - node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} -) -> str: - """Compile node into sql where rows are sorted with ORDER BY.""" - return compiler.compile_ordered_ir(node).to_sql( - col_id_overrides=col_id_overrides, ordered=True - ) - - -def compile_raw( - node: bigframes.core.nodes.BigFrameNode, -) -> Tuple[str, bigframes.core.ordering.TotalOrdering]: - """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - ir = compiler.compile_ordered_ir(node) - sql = ir.raw_sql() - ordering_info = ir._ordering - return sql, ordering_info +_STRICT_COMPILER = compiler.Compiler(strict=True) + + +class SQLCompiler: + def __init__(self, strict: bool = True): + self._compiler = compiler.Compiler(strict=strict) + + def compile_peek(self, node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: + """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" + return self._compiler.compile_unordered_ir(node).peek_sql(n_rows) + + def compile_unordered( + self, + node: bigframes.core.nodes.BigFrameNode, + *, + col_id_overrides: Mapping[str, str] = {}, + ) -> str: + """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" + return self._compiler.compile_unordered_ir(node).to_sql( + col_id_overrides=col_id_overrides + ) + + def compile_ordered( + self, + node: bigframes.core.nodes.BigFrameNode, + *, + col_id_overrides: Mapping[str, str] = {}, + ) -> str: + """Compile node into sql where rows are sorted with ORDER BY.""" + return self._compiler.compile_ordered_ir(node).to_sql( + col_id_overrides=col_id_overrides, ordered=True + ) + + def compile_raw( + self, + node: bigframes.core.nodes.BigFrameNode, + ) -> Tuple[str, bigframes.core.ordering.RowOrdering]: + """Compile node into sql that exposes all columns, 
including hidden ordering-only columns.""" + ir = self._compiler.compile_ordered_ir(node) + sql = ir.raw_sql() + return sql, ir._ordering def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" - ibis = compiler.compile_ordered_ir(node)._to_ibis_expr(ordering_mode="unordered") + ibis = _STRICT_COMPILER.compile_ordered_ir(node)._to_ibis_expr( + ordering_mode="unordered" + ) return ibis.pandas.connect({}).execute(ibis) @@ -64,7 +77,7 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" import bigframes.core.schema - compiled = compiler.compile_unordered_ir(node) + compiled = _STRICT_COMPILER.compile_unordered_ir(node) items = tuple( bigframes.core.schema.SchemaItem(id, compiled.get_column_type(id)) for id in compiled.column_ids diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cc601744c1..c822dd331c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -37,7 +37,9 @@ ascending_over, encode_order_string, IntegerEncoding, + join_orderings, OrderingExpression, + RowOrdering, TotalOrdering, ) import bigframes.core.schema as schemata @@ -519,7 +521,7 @@ def __init__( table: ibis_types.Table, columns: Sequence[ibis_types.Value], hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: TotalOrdering = TotalOrdering(), + ordering: RowOrdering = RowOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): super().__init__(table, columns, predicates) @@ -566,6 +568,10 @@ def __init__( def is_ordered_ir(self) -> bool: return True + @property + def has_total_order(self) -> bool: + return isinstance(self._ordering, TotalOrdering) + @classmethod def from_pandas( cls, @@ -757,16 +763,13 @@ def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: ], table_w_unnest[unnest_offset_id], ] - ordering = TotalOrdering( - ordering_value_columns=tuple( - [ - *self._ordering.ordering_value_columns, - ascending_over(unnest_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*self._ordering.total_ordering_columns, unnest_offset_id] - ), + l_mappings = {id: id for id in self._ordering.referenced_columns} + r_mappings = {unnest_offset_id: unnest_offset_id} + ordering = join_orderings( + self._ordering, + TotalOrdering.from_offset_col(unnest_offset_id), + l_mappings, + r_mappings, ) return OrderedIR( @@ -1150,12 +1153,19 @@ def _bake_ordering(self) -> OrderedIR: self._ibis_bindings[expr.scalar_expression.id] ) - new_ordering = TotalOrdering( - tuple(new_exprs), - self._ordering.integer_encoding, - self._ordering.string_encoding, - self._ordering.total_ordering_columns, - ) + if isinstance(self._ordering, TotalOrdering): + new_ordering: RowOrdering = TotalOrdering( + tuple(new_exprs), + self._ordering.integer_encoding, + self._ordering.string_encoding, + self._ordering.total_ordering_columns, + ) + else: + new_ordering = RowOrdering( + tuple(new_exprs), + self._ordering.integer_encoding, + self._ordering.string_encoding, + ) return OrderedIR( self._table, columns=self.columns, @@ -1297,7 +1307,7 @@ class Builder: def __init__( self, table: ibis_types.Table, - ordering: TotalOrdering, + ordering: RowOrdering, columns: Collection[ibis_types.Value] = (), hidden_ordering_columns: Collection[ibis_types.Value] = (), predicates: 
Optional[Collection[ibis_types.BooleanValue]] = None, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index ca9c479fff..c7f8c5ab59 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import dataclasses import functools import io import typing @@ -37,97 +38,192 @@ import bigframes.session -def compile_ordered_ir(node: nodes.BigFrameNode) -> compiled.OrderedIR: - return typing.cast(compiled.OrderedIR, compile_node(node, True)) - - -def compile_unordered_ir(node: nodes.BigFrameNode) -> compiled.UnorderedIR: - return typing.cast(compiled.UnorderedIR, compile_node(node, False)) - - -def compile_peak_sql(node: nodes.BigFrameNode, n_rows: int) -> typing.Optional[str]: - return compile_unordered_ir(node).peek_sql(n_rows) - - -# TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) -@functools.lru_cache(maxsize=5000) -def compile_node( - node: nodes.BigFrameNode, ordered: bool = True -) -> compiled.UnorderedIR | compiled.OrderedIR: - """Compile node into CompileArrayValue. Caches result.""" - return _compile_node(node, ordered) - - -@functools.singledispatch -def _compile_node( - node: nodes.BigFrameNode, ordered: bool = True -) -> compiled.UnorderedIR: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - +@dataclasses.dataclass(frozen=True) +class Compiler: + # In strict mode, ordering will always be deterministic + # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. + strict: bool = True + + def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: + ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True)) + if self.strict: + assert ir.has_total_order + return ir + + def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: + return typing.cast(compiled.UnorderedIR, self.compile_node(node, False)) + + def compile_peak_sql( + self, node: nodes.BigFrameNode, n_rows: int + ) -> typing.Optional[str]: + return self.compile_unordered_ir(node).peek_sql(n_rows) + + # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) + @functools.lru_cache(maxsize=5000) + def compile_node( + self, node: nodes.BigFrameNode, ordered: bool = True + ) -> compiled.UnorderedIR | compiled.OrderedIR: + """Compile node into CompileArrayValue. 
Caches result.""" + return self._compile_node(node, ordered) + + @functools.singledispatchmethod + def _compile_node( + self, node: nodes.BigFrameNode, ordered: bool = True + ) -> compiled.UnorderedIR: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @_compile_node.register + def compile_join(self, node: nodes.JoinNode, ordered: bool = True): + if ordered: + left_ordered = self.compile_ordered_ir(node.left_child) + right_ordered = self.compile_ordered_ir(node.right_child) + return bigframes.core.compile.single_column.join_by_column_ordered( + left=left_ordered, + right=right_ordered, + join=node.join, + ) + else: + left_unordered = self.compile_unordered_ir(node.left_child) + right_unordered = self.compile_unordered_ir(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left=left_unordered, + right=right_unordered, + join=node.join, + ) -@_compile_node.register -def compile_join(node: nodes.JoinNode, ordered: bool = True): - if ordered: - left_ordered = compile_ordered_ir(node.left_child) - right_ordered = compile_ordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_ordered( - left=left_ordered, - right=right_ordered, - join=node.join, + @_compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True): + array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.schema) + if ordered: + return ordered_ir + else: + return ordered_ir.to_unordered() + + @_compile_node.register + def compile_cached_table(self, node: nodes.CachedTableNode, ordered: bool = True): + full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" + used_columns = ( + *node.schema.names, + *node.hidden_columns, ) - else: - left_unordered = compile_unordered_ir(node.left_child) - right_unordered = compile_unordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_unordered( - left=left_unordered, - right=right_unordered, - join=node.join, + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( + list(i for i in node.physical_schema if i.name in used_columns) ) + ibis_table = ibis.table(physical_schema, full_table_name) + if ordered: + if node.ordering is None: + # If this happens, session malfunctioned while applying cached results. + raise ValueError( + "Cannot use unordered cached value. Result requires ordering information." + ) + if self.strict and not isinstance(node.ordering, bf_ordering.TotalOrdering): + raise ValueError( + "Cannot use partially ordered cached value. Result requires total ordering information." 
+ ) + return compiled.OrderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ordering=node.ordering, + hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], + ) + else: + return compiled.UnorderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ) -@_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): - array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.schema) - if ordered: - return ordered_ir - else: - return ordered_ir.to_unordered() - - -@_compile_node.register -def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True): - full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" - used_columns = ( - *node.schema.names, - *node.hidden_columns, - ) - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.physical_schema if i.name in used_columns) - ) - ibis_table = ibis.table(physical_schema, full_table_name) - if ordered: - if node.ordering is None: - # If this happens, session malfunctioned while applying cached results. - raise ValueError( - "Cannot use unordered cached value. Result requires ordering information." + @_compile_node.register + def compile_readtable(self, node: nodes.ReadTableNode, ordered: bool = True): + if ordered: + return self.compile_read_table_ordered(node) + else: + return self.compile_read_table_unordered(node) + + def read_table_as_unordered_ibis( + self, node: nodes.ReadTableNode + ) -> ibis.expr.types.Table: + full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" + used_columns = ( + *node.schema.names, + *[i for i in node.total_order_cols if i not in node.schema.names], + ) + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( + list(i for i in node.physical_schema if i.name in used_columns) + ) + if node.at_time is not None or node.sql_predicate is not None: + import bigframes.session._io.bigquery + + sql = bigframes.session._io.bigquery.to_query( + full_table_name, + columns=used_columns, + sql_predicate=node.sql_predicate, + time_travel_timestamp=node.at_time, ) - return compiled.OrderedIR( + return ibis.backends.bigquery.Backend().sql( + schema=physical_schema, query=sql + ) + else: + return ibis.table(physical_schema, full_table_name) + + def compile_read_table_unordered(self, node: nodes.ReadTableNode): + ibis_table = self.read_table_as_unordered_ibis(node) + return compiled.UnorderedIR( ibis_table, - columns=tuple( + tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( ibis_table[col] ) for col in node.schema.names ), - ordering=node.ordering, - hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], ) - else: - return compiled.UnorderedIR( + def compile_read_table_ordered(self, node: nodes.ReadTableNode): + ibis_table = self.read_table_as_unordered_ibis(node) + if node.total_order_cols: + ordering_value_columns = tuple( + bf_ordering.ascending_over(col) for col in node.total_order_cols + ) + if node.order_col_is_sequential: + integer_encoding = bf_ordering.IntegerEncoding( + is_encoded=True, is_sequential=True 
+ ) + else: + integer_encoding = bf_ordering.IntegerEncoding() + ordering: bf_ordering.RowOrdering = bf_ordering.TotalOrdering( + ordering_value_columns, + integer_encoding=integer_encoding, + total_ordering_columns=frozenset(node.total_order_cols), + ) + hidden_columns = () + elif self.strict: + ibis_table, ordering = default_ordering.gen_default_ordering( + ibis_table, use_double_hash=True + ) + hidden_columns = tuple( + ibis_table[col] + for col in ibis_table.columns + if col not in node.schema.names + ) + else: + # In unstrict mode, don't generate total ordering from hashing as this is + # expensive (prevent removing any columns from table scan) + ordering, hidden_columns = bf_ordering.RowOrdering(), () + return compiled.OrderedIR( ibis_table, columns=tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( @@ -135,182 +231,91 @@ def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True): ) for col in node.schema.names ), + ordering=ordering, + hidden_ordering_columns=hidden_columns, ) + @_compile_node.register + def compile_promote_offsets( + self, node: nodes.PromoteOffsetsNode, ordered: bool = True + ): + result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id) + return result if ordered else result.to_unordered() -@_compile_node.register -def compile_readtable(node: nodes.ReadTableNode, ordered: bool = True): - if ordered: - return compile_read_table_ordered(node) - else: - return compile_read_table_unordered(node) - - -def read_table_as_unordered_ibis(node: nodes.ReadTableNode) -> ibis.expr.types.Table: - full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" - used_columns = ( - *node.schema.names, - *[i for i in node.total_order_cols if i not in node.schema.names], - ) - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.physical_schema if i.name in used_columns) - ) - if node.at_time is not None or node.sql_predicate is not None: - import bigframes.session._io.bigquery - - sql = bigframes.session._io.bigquery.to_query( - full_table_name, - columns=used_columns, - sql_predicate=node.sql_predicate, - time_travel_timestamp=node.at_time, - ) - return ibis.backends.bigquery.Backend().sql(schema=physical_schema, query=sql) - else: - return ibis.table(physical_schema, full_table_name) - + @_compile_node.register + def compile_filter(self, node: nodes.FilterNode, ordered: bool = True): + return self.compile_node(node.child, ordered).filter(node.predicate) -def compile_read_table_unordered(node: nodes.ReadTableNode): - ibis_table = read_table_as_unordered_ibis(node) - return compiled.UnorderedIR( - ibis_table, - tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] - ) - for col in node.schema.names - ), - ) + @_compile_node.register + def compile_orderby(self, node: nodes.OrderByNode, ordered: bool = True): + if ordered: + return self.compile_ordered_ir(node.child).order_by(node.by) + else: + return self.compile_unordered_ir(node.child) + @_compile_node.register + def compile_reversed(self, node: nodes.ReversedNode, ordered: bool = True): + if ordered: + return self.compile_ordered_ir(node.child).reversed() + else: + return self.compile_unordered_ir(node.child) + + @_compile_node.register + def compile_projection(self, node: nodes.ProjectionNode, ordered: bool = True): + result = self.compile_node(node.child, ordered) + return result.projection(node.assignments) + + 
@_compile_node.register + def compile_concat(self, node: nodes.ConcatNode, ordered: bool = True): + if ordered: + compiled_ordered = [self.compile_ordered_ir(node) for node in node.children] + return concat_impl.concat_ordered(compiled_ordered) + else: + compiled_unordered = [ + self.compile_unordered_ir(node) for node in node.children + ] + return concat_impl.concat_unordered(compiled_unordered) + + @_compile_node.register + def compile_rowcount(self, node: nodes.RowCountNode, ordered: bool = True): + result = self.compile_unordered_ir(node.child).row_count() + return result if ordered else result.to_unordered() -def compile_read_table_ordered(node: nodes.ReadTableNode): - ibis_table = read_table_as_unordered_ibis(node) - if node.total_order_cols: - ordering_value_columns = tuple( - bf_ordering.ascending_over(col) for col in node.total_order_cols + @_compile_node.register + def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True): + has_ordered_aggregation_ops = any( + aggregate.op.can_order_by for aggregate, _ in node.aggregations ) - if node.order_col_is_sequential: - integer_encoding = bf_ordering.IntegerEncoding( - is_encoded=True, is_sequential=True + if ordered and has_ordered_aggregation_ops: + return self.compile_ordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna ) else: - integer_encoding = bf_ordering.IntegerEncoding() - ordering = bf_ordering.TotalOrdering( - ordering_value_columns, - integer_encoding=integer_encoding, - total_ordering_columns=frozenset(node.total_order_cols), - ) - hidden_columns = () - else: - ibis_table, ordering = default_ordering.gen_default_ordering( - ibis_table, use_double_hash=True - ) - hidden_columns = tuple( - ibis_table[col] - for col in ibis_table.columns - if col not in node.schema.names - ) - return compiled.OrderedIR( - ibis_table, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] + result = self.compile_unordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna ) - for col in node.schema.names - ), - ordering=ordering, - hidden_ordering_columns=hidden_columns, - ) - - -@_compile_node.register -def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): - result = compile_ordered_ir(node.child).promote_offsets(node.col_id) - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_filter(node: nodes.FilterNode, ordered: bool = True): - return compile_node(node.child, ordered).filter(node.predicate) - - -@_compile_node.register -def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): - if ordered: - return compile_ordered_ir(node.child).order_by(node.by) - else: - return compile_unordered_ir(node.child) - - -@_compile_node.register -def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): - if ordered: - return compile_ordered_ir(node.child).reversed() - else: - return compile_unordered_ir(node.child) - - -@_compile_node.register -def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): - result = compile_node(node.child, ordered) - return result.projection(node.assignments) - - -@_compile_node.register -def compile_concat(node: nodes.ConcatNode, ordered: bool = True): - if ordered: - compiled_ordered = [compile_ordered_ir(node) for node in node.children] - return concat_impl.concat_ordered(compiled_ordered) - else: - compiled_unordered = [compile_unordered_ir(node) for node in node.children] - return 
concat_impl.concat_unordered(compiled_unordered) - - -@_compile_node.register -def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): - result = compile_unordered_ir(node.child).row_count() - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - has_ordered_aggregation_ops = any( - aggregate.op.can_order_by for aggregate, _ in node.aggregations - ) - if ordered and has_ordered_aggregation_ops: - return compile_ordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna - ) - else: - result = compile_unordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna + return result if ordered else result.to_unordered() + + @_compile_node.register + def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): + result = self.compile_ordered_ir(node.child).project_window_op( + node.column_name, + node.op, + node.window_spec, + node.output_name, + never_skip_nulls=node.never_skip_nulls, + skip_reproject_unsafe=node.skip_reproject_unsafe, ) return result if ordered else result.to_unordered() + @_compile_node.register + def compile_reproject(self, node: nodes.ReprojectOpNode, ordered: bool = True): + return self.compile_node(node.child, ordered)._reproject_to_table() -@_compile_node.register -def compile_window(node: nodes.WindowOpNode, ordered: bool = True): - result = compile_ordered_ir(node.child).project_window_op( - node.column_name, - node.op, - node.window_spec, - node.output_name, - never_skip_nulls=node.never_skip_nulls, - skip_reproject_unsafe=node.skip_reproject_unsafe, - ) - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): - return compile_node(node.child, ordered)._reproject_to_table() - - -@_compile_node.register -def compile_explode(node: nodes.ExplodeNode, ordered: bool = True): - return compile_node(node.child, ordered).explode(node.column_ids) - + @_compile_node.register + def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True): + return self.compile_node(node.child, ordered).explode(node.column_ids) -@_compile_node.register -def compile_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): - return compile_node(node.child, ordered)._uniform_sampling(node.fraction) + @_compile_node.register + def compile_random_sample(self, node: nodes.RandomSampleNode, ordered: bool = True): + return self.compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 11a5d43ba0..02bf201ca0 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -109,6 +109,7 @@ def __getitem__( dropna=self._dropna, ) + @validations.requires_strict_ordering() def head(self, n: int = 5) -> df.DataFrame: block = self._block if self._dropna: @@ -531,6 +532,7 @@ def __init__( def _session(self) -> core.Session: return self._block.session + @validations.requires_strict_ordering() def head(self, n: int = 5) -> series.Series: block = self._block if self._dropna: diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 696742180b..8b039707c2 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -492,9 +492,7 @@ def to_pandas(self) -> pandas.Index: pandas.Index: A pandas Index with all of the labels from this Index. 
""" - return self._block.index.to_pandas( - ordered=self._block.session._strictly_ordered - ) + return self._block.index.to_pandas(ordered=True) def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: return self.to_pandas().to_numpy(dtype, **kwargs) diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 632a1864da..4079abc8fa 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -43,6 +43,15 @@ class JoinColumnMapping: destination_id: str +@dataclasses.dataclass(frozen=True) +class CoalescedColumnMapping: + """Special column mapping used only by implicit joiner only""" + + left_source_id: str + right_source_id: str + destination_id: str + + @dataclasses.dataclass(frozen=True) class JoinDefinition: conditions: Tuple[JoinCondition, ...] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index a703cf1969..a979e07972 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -127,6 +127,14 @@ def joins(self) -> bool: """ return False + @property + @abc.abstractmethod + def order_ambiguous(self) -> bool: + """ + Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. + """ + ... + @functools.cached_property def total_variables(self) -> int: return self.variables_introduced + sum( @@ -177,13 +185,16 @@ def transform_children( ) -> BigFrameNode: return replace(self, child=t(self.child)) + @property + def order_ambiguous(self) -> bool: + return self.child.order_ambiguous + @dataclass(frozen=True) class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode join: JoinDefinition - allow_row_identity_join: bool = False @property def row_preserving(self) -> bool: @@ -197,6 +208,10 @@ def non_local(self) -> bool: def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.left_child, self.right_child) + @property + def order_ambiguous(self) -> bool: + return True + def __hash__(self): return self._node_hash @@ -248,6 +263,10 @@ def __post_init__(self): def child_nodes(self) -> typing.Sequence[BigFrameNode]: return self.children + @property + def order_ambiguous(self) -> bool: + return any(child.order_ambiguous for child in self.children) + def __hash__(self): return self._node_hash @@ -294,6 +313,10 @@ def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" return len(self.schema.items) + 1 + @property + def order_ambiguous(self) -> bool: + return False + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: @@ -351,6 +374,10 @@ def relation_ops_created(self) -> int: # Assume worst case, where readgbq actually has baked in analytic operation to generate index return 3 + @property + def order_ambiguous(self) -> bool: + return len(self.total_order_cols) == 0 + @functools.cached_property def variables_introduced(self) -> int: return len(self.schema.items) + 1 @@ -373,7 +400,7 @@ class CachedTableNode(BigFrameNode): table_id: str = field() physical_schema: Tuple[bq.SchemaField, ...] 
= field() - ordering: typing.Optional[orderings.TotalOrdering] = field() + ordering: typing.Optional[orderings.RowOrdering] = field() def __post_init__(self): # enforce invariants @@ -418,6 +445,10 @@ def hidden_columns(self) -> typing.Tuple[str, ...]: if col not in self.schema.names ) + @property + def order_ambiguous(self) -> bool: + return not isinstance(self.ordering, orderings.TotalOrdering) + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: @@ -601,6 +632,10 @@ def schema(self) -> schemata.ArraySchema: def variables_introduced(self) -> int: return len(self.aggregations) + len(self.by_column_ids) + @property + def order_ambiguous(self) -> bool: + return False + @dataclass(frozen=True) class WindowOpNode(UnaryNode): diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 406ca52731..bff7e2ce44 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -98,6 +98,8 @@ class RowOrdering: """Immutable object that holds information about the ordering of rows in a ArrayValue object. May not be unambiguous.""" ordering_value_columns: typing.Tuple[OrderingExpression, ...] = () + integer_encoding: IntegerEncoding = IntegerEncoding(False) + string_encoding: StringEncoding = StringEncoding(False) @property def all_ordering_columns(self) -> Sequence[OrderingExpression]: @@ -111,6 +113,20 @@ def referenced_columns(self) -> Set[str]: for col in part.scalar_expression.unbound_variables ) + @property + def is_string_encoded(self) -> bool: + """True if ordering is fully defined by a fixed length string column.""" + return self.string_encoding.is_encoded + + @property + def is_sequential(self) -> bool: + return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential + + @property + def total_order_col(self) -> Optional[OrderingExpression]: + """Returns column id of columns that defines total ordering, if such as column exists""" + return None + def with_reverse(self) -> RowOrdering: """Reverses the ordering.""" return RowOrdering( @@ -121,17 +137,66 @@ def with_column_remap(self, mapping: typing.Mapping[str, str]) -> RowOrdering: new_value_columns = [ col.remap_names(mapping) for col in self.all_ordering_columns ] - return TotalOrdering( + return RowOrdering( tuple(new_value_columns), ) + def with_non_sequential(self): + """Create a copy that is marked as non-sequential. + + This is useful when filtering, but not sorting, an expression. + """ + if self.integer_encoding.is_sequential: + return RowOrdering( + self.ordering_value_columns, + integer_encoding=IntegerEncoding( + self.integer_encoding.is_encoded, is_sequential=False + ), + ) + + return self + + def with_ordering_columns( + self, + ordering_value_columns: Sequence[OrderingExpression] = (), + ) -> RowOrdering: + """Creates a new ordering that reorders by the given columns. + + Args: + ordering_value_columns: + In decreasing precedence order, the values used to sort the ordering + + Returns: + Modified ExpressionOrdering + """ + + # Truncate to remove any unneded col references after all total order cols included + new_ordering = self._truncate_ordering( + (*ordering_value_columns, *self.ordering_value_columns) + ) + return RowOrdering( + new_ordering, + ) + + def _truncate_ordering( + self, order_refs: tuple[OrderingExpression, ...] 
+ ) -> tuple[OrderingExpression, ...]: + # Truncate once we refer to a full key in bijective operations + columns_seen: Set[str] = set() + truncated_refs = [] + for order_part in order_refs: + expr = order_part.scalar_expression + if not set(expr.unbound_variables).issubset(columns_seen): + if expr.is_bijective: + columns_seen.update(expr.unbound_variables) + truncated_refs.append(order_part) + return tuple(truncated_refs) + @dataclass(frozen=True) class TotalOrdering(RowOrdering): """Immutable object that holds information about the ordering of rows in a ArrayValue object. Guaranteed to be unambiguous.""" - integer_encoding: IntegerEncoding = IntegerEncoding(False) - string_encoding: StringEncoding = StringEncoding(False) # A table has a total ordering defined by the identities of a set of 1 or more columns. # These columns must always be part of the ordering, in order to guarantee that the ordering is total. # Therefore, any modifications(or drops) done to these columns must result in hidden copies being made. @@ -234,15 +299,6 @@ def total_order_col(self) -> Optional[OrderingExpression]: return None return order_ref - @property - def is_string_encoded(self) -> bool: - """True if ordering is fully defined by a fixed length string column.""" - return self.string_encoding.is_encoded - - @property - def is_sequential(self) -> bool: - return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential - def encode_order_string( order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 101d5cc882..60ed4069a9 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -106,21 +106,25 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): ) def can_merge( - self, right: SquashedSelect, join_def: join_defs.JoinDefinition + self, + right: SquashedSelect, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], ) -> bool: """Determines whether the two selections can be merged into a single selection.""" - if join_def.type == "cross": - # Cannot convert cross join to projection - return False - r_exprs_by_id = {id: expr for expr, id in right.columns} l_exprs_by_id = {id: expr for expr, id in self.columns} - l_join_exprs = [l_exprs_by_id[cond.left_id] for cond in join_def.conditions] - r_join_exprs = [r_exprs_by_id[cond.right_id] for cond in join_def.conditions] + l_join_exprs = [ + l_exprs_by_id[join_key.left_source_id] for join_key in join_keys + ] + r_join_exprs = [ + r_exprs_by_id[join_key.right_source_id] for join_key in join_keys + ] - if (self.root != right.root) or any( - l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs) - ): + if self.root != right.root: + return False + if len(l_join_exprs) != len(r_join_exprs): + return False + if any(l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs)): return False return True @@ -128,6 +132,7 @@ def merge( self, right: SquashedSelect, join_type: join_defs.JoinType, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: Tuple[join_defs.JoinColumnMapping, ...], ) -> SquashedSelect: if self.root != right.root: @@ -147,11 +152,9 @@ def merge( l_relative, r_relative = relative_predicates(self.predicate, right.predicate) lmask = l_relative if join_type in {"right", "outer"} else None rmask = r_relative if join_type in {"left", "outer"} else None - if lmask is not None: - lselection = tuple((apply_mask(expr, lmask), id) for expr, id in lselection) - if rmask is not None: - 
rselection = tuple((apply_mask(expr, rmask), id) for expr, id in rselection) - new_columns = remap_names(mappings, lselection, rselection) + new_columns = merge_expressions( + join_keys, mappings, lselection, rselection, lmask, rmask + ) # Reconstruct ordering reverse_root = self.reverse_root @@ -204,26 +207,10 @@ def expand(self) -> nodes.BigFrameNode: return nodes.ProjectionNode(child=root, assignments=self.columns) -def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: - rewrite_common_node = common_selection_root( - join_node.left_child, join_node.right_child - ) - if rewrite_common_node is None: - return join_node - left_side = SquashedSelect.from_node_span(join_node.left_child, rewrite_common_node) - right_side = SquashedSelect.from_node_span( - join_node.right_child, rewrite_common_node - ) - if left_side.can_merge(right_side, join_node.join): - return left_side.merge( - right_side, join_node.join.type, join_node.join.mappings - ).expand() - return join_node - - def join_as_projection( l_node: nodes.BigFrameNode, r_node: nodes.BigFrameNode, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: Tuple[join_defs.JoinColumnMapping, ...], how: join_defs.JoinType, ) -> Optional[nodes.BigFrameNode]: @@ -231,7 +218,10 @@ def join_as_projection( if rewrite_common_node is not None: left_side = SquashedSelect.from_node_span(l_node, rewrite_common_node) right_side = SquashedSelect.from_node_span(r_node, rewrite_common_node) - merged = left_side.merge(right_side, how, mappings) + if not left_side.can_merge(right_side, join_keys): + # Most likely because join keys didn't match + return None + merged = left_side.merge(right_side, how, join_keys, mappings) assert ( merged is not None ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." 
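
The rewrite hunks above express a join of two selections that share a common root as a single projection: join-key expressions that are provably equal on both sides are emitted once (the CoalescedColumnMapping entries), and each side's remaining columns are masked by that side's relative predicate (lmask/rmask). The standalone toy below is only a sketch of that row-level idea under simplified assumptions; plain dicts and predicate callables stand in for bigframes expression trees, and every name in it is hypothetical rather than part of the library's API.

# Toy illustration (hypothetical names, not the bigframes implementation):
# when both join inputs are filters/projections over the same base rows, an
# outer join on identical key expressions can be computed row-by-row as a
# projection, coalescing the key columns and masking each side's value
# columns with that side's filter predicate.

from typing import Any, Callable, Dict, List, Optional

Row = Dict[str, Any]
Predicate = Callable[[Row], bool]


def join_as_projection_toy(
    base_rows: List[Row],
    key_cols: List[str],
    left_cols: List[str],
    right_cols: List[str],
    left_pred: Optional[Predicate] = None,
    right_pred: Optional[Predicate] = None,
) -> List[Row]:
    """Outer-join two selections over the same base rows without a real join."""
    out: List[Row] = []
    for row in base_rows:
        keep_left = left_pred(row) if left_pred else True
        keep_right = right_pred(row) if right_pred else True
        if not (keep_left or keep_right):
            continue  # row survives on neither side of the outer join
        merged: Row = {k: row[k] for k in key_cols}  # coalesced join keys, emitted once
        for col in left_cols:
            merged[f"{col}_x"] = row[col] if keep_left else None  # apply lmask
        for col in right_cols:
            merged[f"{col}_y"] = row[col] if keep_right else None  # apply rmask
        out.append(merged)
    return out


if __name__ == "__main__":
    rows = [{"id": 1, "a": 10, "b": "x"}, {"id": 2, "a": 20, "b": "y"}]
    print(
        join_as_projection_toy(
            rows, ["id"], ["a"], ["b"],
            left_pred=lambda r: r["a"] > 10,
            right_pred=lambda r: r["b"] == "x",
        )
    )

Running the toy shows an outer self-join on an identical key reducing to a single pass over the base rows, which is the property that lets the compiler replace the join with a projection instead of emitting a real join.
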
@@ -240,21 +230,33 @@ def join_as_projection( return None -def remap_names( +def merge_expressions( + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: Tuple[join_defs.JoinColumnMapping, ...], lselection: Selection, rselection: Selection, + lmask: Optional[scalar_exprs.Expression], + rmask: Optional[scalar_exprs.Expression], ) -> Selection: new_selection: Selection = tuple() l_exprs_by_id = {id: expr for expr, id in lselection} r_exprs_by_id = {id: expr for expr, id in rselection} + for key in join_keys: + # Join keys expressions are equivalent on both sides, so can choose either left or right key + assert l_exprs_by_id[key.left_source_id] == r_exprs_by_id[key.right_source_id] + expr = l_exprs_by_id[key.left_source_id] + id = key.destination_id + new_selection = (*new_selection, (expr, id)) for mapping in mappings: if mapping.source_table == join_defs.JoinSide.LEFT: expr = l_exprs_by_id[mapping.source_id] + if lmask is not None: + expr = apply_mask(expr, lmask) else: # Right expr = r_exprs_by_id[mapping.source_id] - id = mapping.destination_id - new_selection = (*new_selection, (expr, id)) + if rmask is not None: + expr = apply_mask(expr, rmask) + new_selection = (*new_selection, (expr, mapping.destination_id)) return new_selection diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 97c5ef03e5..43c05c6c83 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -23,6 +23,10 @@ UNNAMED_INDEX_ID = "bigframes_unnamed_index" +def is_gcs_path(value) -> typing_extensions.TypeGuard[str]: + return isinstance(value, str) and value.startswith("gs://") + + def get_axis_number(axis: typing.Union[str, int]) -> typing.Literal[0, 1]: if axis in {0, "index", "rows"}: return 0 diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index dc22047e3b..c5761f4e09 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -17,7 +17,7 @@ from __future__ import annotations import functools -from typing import Protocol, TYPE_CHECKING +from typing import Optional, Protocol, TYPE_CHECKING import bigframes.constants import bigframes.exceptions @@ -32,11 +32,11 @@ def _session(self) -> Session: ... -def requires_strict_ordering(): +def requires_strict_ordering(suggestion: Optional[str] = None): def decorator(meth): @functools.wraps(meth) def guarded_meth(object: HasSession, *args, **kwargs): - enforce_ordered(object, meth.__name__) + enforce_ordered(object, meth.__name__, suggestion) return meth(object, *args, **kwargs) return guarded_meth @@ -44,8 +44,11 @@ def guarded_meth(object: HasSession, *args, **kwargs): return decorator -def enforce_ordered(object: HasSession, opname: str) -> None: +def enforce_ordered( + object: HasSession, opname: str, suggestion: Optional[str] = None +) -> None: if not object._session._strictly_ordered: + suggestion_substr = suggestion + " " if suggestion else "" raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} not supported when strict ordering is disabled. {bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} not supported when strict ordering is disabled. 
{suggestion_substr}{bigframes.constants.FEEDBACK_LINK}" ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4dcc4414ed..9789c7cf9f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -49,6 +49,7 @@ import bigframes import bigframes._config.display_options as display_options +import bigframes.constants import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter @@ -104,6 +105,8 @@ def guarded_meth(df: DataFrame, *args, **kwargs): @log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): __doc__ = vendored_pandas_frame.DataFrame.__doc__ + # internal flag to disable cache at all + _disable_cache_override: bool = False def __init__( self, @@ -366,7 +369,7 @@ def astype( return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) def _to_sql_query( - self, include_index: bool + self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally including index columns. @@ -380,12 +383,14 @@ def _to_sql_query( If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ - return self._block.to_sql_query(include_index) + return self._block.to_sql_query(include_index, enable_cache=enable_cache) @property def sql(self) -> str: """Compiles this DataFrame's expression tree to SQL.""" - include_index = self.index.name is not None or len(self.index.names) > 1 + include_index = self._has_index and ( + self.index.name is not None or len(self.index.names) > 1 + ) sql, _, _ = self._to_sql_query(include_index=include_index) return sql @@ -1192,15 +1197,14 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame: def to_arrow( self, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pyarrow.Table: """Write DataFrame to an Arrow table / record batch. Args: - ordered (bool, default None): - Determines whether the resulting Arrow table will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. + ordered (bool, default True): + Determines whether the resulting Arrow table will be ordered. + In some cases, unordered may result in a faster-executing query. Returns: pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame. @@ -1211,9 +1215,7 @@ def to_arrow( ) self._optimize_query_complexity() - pa_table, query_job = self._block.to_arrow( - ordered=ordered if ordered is not None else self._session._strictly_ordered, - ) + pa_table, query_job = self._block.to_arrow(ordered=ordered) self._set_internal_query_job(query_job) return pa_table @@ -1223,7 +1225,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -1243,10 +1245,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default None): - Determines whether the resulting pandas dataframe will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. 
+ ordered (bool, default True): + Determines whether the resulting pandas dataframe will be ordered. + In some cases, unordered may result in a faster-executing query. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -1259,7 +1260,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered if ordered is not None else self._session._strictly_ordered, + ordered=ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) @@ -1295,6 +1296,7 @@ def _compute_dry_run(self) -> bigquery.QueryJob: def copy(self) -> DataFrame: return DataFrame(self._block) + @validations.requires_strict_ordering(bigframes.constants.SUGGEST_PEEK_PREVIEW) def head(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[:n]) @@ -2952,15 +2954,21 @@ def from_records( ) def to_csv( - self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True - ) -> None: + self, + path_or_buf=None, + sep=",", + *, + header: bool = True, + index: bool = True, + ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. # TODO(swast): Some warning that wildcard is recommended for large # query results? See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size - if not path_or_buf.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) + if not utils.is_gcs_path(path_or_buf): + pd_df = self.to_pandas() + return pd_df.to_csv(path_or_buf, sep=sep, header=header, index=index) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -2977,22 +2985,28 @@ def to_csv( export_data_statement, api_name="dataframe-to_csv" ) self._set_internal_query_job(query_job) + return None def to_json( self, - path_or_buf: str, - orient: Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf=None, + orient: Optional[ + Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, lines: bool = False, index: bool = True, - ) -> None: + ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? - # TODO(chelsealin): Support local file paths. - if not path_or_buf.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) - + if not utils.is_gcs_path(path_or_buf): + pd_df = self.to_pandas() + return pd_df.to_json( + path_or_buf, + orient=orient, + lines=lines, + index=index, + default_handler=str, + ) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3021,6 +3035,7 @@ def to_json( export_data_statement, api_name="dataframe-to_json" ) self._set_internal_query_job(query_job) + return None def to_gbq( self, @@ -3119,19 +3134,19 @@ def __array__(self, dtype=None) -> numpy.ndarray: def to_parquet( self, - path: str, + path=None, *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, - ) -> None: + ) -> Optional[bytes]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. # TODO(swast): Some warning that wildcard is recommended for large # query results? 
See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size - if not path.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) - + if not utils.is_gcs_path(path): + pd_df = self.to_pandas() + return pd_df.to_parquet(path, compression=compression, index=index) if "*" not in path: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3155,6 +3170,7 @@ def to_parquet( export_data_statement, api_name="dataframe-to_parquet" ) self._set_internal_query_job(query_job) + return None def to_dict( self, @@ -3614,6 +3630,8 @@ def _cached(self, *, force: bool = False) -> DataFrame: No-op if the dataframe represents a trivial transformation of an existing materialization. Force=True is used for BQML integration where need to copy data rather than use snapshot. """ + if self._disable_cache_override: + return self self._block.cached(force=force) return self diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index c1878b6c31..d84fbcdbab 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -24,14 +24,17 @@ import string import sys import tempfile +import threading from typing import ( Any, cast, + Dict, List, Mapping, NamedTuple, Optional, Sequence, + Set, Tuple, TYPE_CHECKING, Union, @@ -67,11 +70,80 @@ logger = logging.getLogger(__name__) +# Naming convention for the remote function artifacts +_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" +_BQ_FUNCTION_NAME_SEPERATOR = "_" +_GCF_FUNCTION_NAME_SEPERATOR = "-" + # Protocol version 4 is available in python version 3.4 and above # https://docs.python.org/3/library/pickle.html#data-stream-format _pickle_protocol_version = 4 +def _clean_up_by_session_id( + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + dataset: bigquery.DatasetReference, + session_id: str, +): + """Delete remote function artifacts for a session id, where the session id + was not necessarily created in the current runtime. This is useful if the + user worked with a BigQuery DataFrames session previously and remembered the + session id, and now wants to clean up its temporary resources at a later + point in time. 
+ """ + + # First clean up the BQ remote functions and then the underlying + # cloud functions, so that at no point we are left with a remote function + # that is pointing to a cloud function that does not exist + + endpoints_to_be_deleted: Set[str] = set() + match_prefix = "".join( + [ + _BIGFRAMES_REMOTE_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + session_id, + _BQ_FUNCTION_NAME_SEPERATOR, + ] + ) + for routine in bqclient.list_routines(dataset): + routine = cast(bigquery.Routine, routine) + + # skip past the routines not belonging to the given session id, or + # non-remote-function routines + if ( + routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION + or not cast(str, routine.routine_id).startswith(match_prefix) + or not routine.remote_function_options + or not routine.remote_function_options.endpoint + ): + continue + + # Let's forgive the edge case possibility that the BQ remote function + # may have been deleted at the same time directly by the user + bqclient.delete_routine(routine, not_found_ok=True) + endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) + + # Now clean up the cloud functions + bq_location = bqclient.get_dataset(dataset).location + bq_location, gcf_location = get_remote_function_locations(bq_location) + parent_path = gcfclient.common_location_path( + project=dataset.project, location=gcf_location + ) + for gcf in gcfclient.list_functions(parent=parent_path): + # skip past the cloud functions not attached to any BQ remote function + # belonging to the given session id + if gcf.service_config.uri not in endpoints_to_be_deleted: + continue + + # Let's forgive the edge case possibility that the cloud function + # may have been deleted at the same time directly by the user + try: + gcfclient.delete_function(name=gcf.name) + except google.api_core.exceptions.NotFound: + pass + + def get_remote_function_locations(bq_location): """Get BQ location and cloud functions region given a BQ client.""" # TODO(shobs, b/274647164): Find the best way to determine default location. @@ -95,14 +167,32 @@ def get_remote_function_locations(bq_location): def _get_hash(def_, package_requirements=None): "Get hash (32 digits alphanumeric) of a function." - def_repr = cloudpickle.dumps(def_, protocol=_pickle_protocol_version) + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. 
+ def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) if package_requirements: for p in sorted(package_requirements): def_repr += p.encode() return hashlib.md5(def_repr).hexdigest() -def _get_updated_package_requirements(package_requirements, is_row_processor): +def _get_updated_package_requirements( + package_requirements=None, is_row_processor=False +): requirements = [f"cloudpickle=={cloudpickle.__version__}"] if is_row_processor: # bigframes remote function will send an entire row of data as json, @@ -130,31 +220,23 @@ class IbisSignature(NamedTuple): output_type: IbisDataType -def get_cloud_function_name( - def_, uniq_suffix=None, package_requirements=None, is_row_processor=False -): +def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): "Get a name for the cloud function for the given user defined function." - - # Augment user package requirements with any internal package - # requirements - package_requirements = _get_updated_package_requirements( - package_requirements, is_row_processor - ) - - cf_name = _get_hash(def_, package_requirements) - cf_name = f"bigframes-{cf_name}" # for identification + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_hash) if uniq_suffix: - cf_name = f"{cf_name}-{uniq_suffix}" - return cf_name, package_requirements + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) -def get_remote_function_name(def_, uniq_suffix=None, package_requirements=None): +def get_remote_function_name(function_hash, session_id, uniq_suffix=None): "Get a name for the BQ remote function for the given user defined function." - bq_rf_name = _get_hash(def_, package_requirements) - bq_rf_name = f"bigframes_{bq_rf_name}" # for identification + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] if uniq_suffix: - bq_rf_name = f"{bq_rf_name}_{uniq_suffix}" - return bq_rf_name + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) class RemoteFunctionClient: @@ -272,6 +354,10 @@ def get_cloud_function_fully_qualified_name(self, name): self._gcp_project_id, self._cloud_function_region, name ) + def get_remote_function_fully_qualilfied_name(self, name): + "Get the fully qualilfied name for a BQ remote function." + return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" + def get_cloud_function_endpoint(self, name): """Get the http endpoint of a cloud function if it exists.""" fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) @@ -478,20 +564,34 @@ def provision_bq_remote_function( cloud_function_memory_mib, ): """Provision a BigQuery remote function.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = _get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = _get_hash(def_, package_requirements) + # If reuse of any existing function with the same name (indicated by the # same hash of its source code) is not intended, then attach a unique # suffix to the intended function name to make it unique. 
uniq_suffix = None if not reuse: + # use 4 digits as a unique suffix which should suffice for + # uniqueness per session uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=8) + random.choices(string.ascii_lowercase + string.digits, k=4) ) # Derive the name of the cloud function underlying the intended BQ - # remote function, also collect updated package requirements as - # determined in the name resolution - cloud_function_name, package_requirements = get_cloud_function_name( - def_, uniq_suffix, package_requirements, is_row_processor + # remote function. Use the session id to identify the GCF for unnamed + # functions. The named remote functions are treated as a persistant + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + session_id = None if name else self._session.session_id + cloud_function_name = get_cloud_function_name( + function_hash, session_id, uniq_suffix ) cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) @@ -516,7 +616,7 @@ def provision_bq_remote_function( remote_function_name = name if not remote_function_name: remote_function_name = get_remote_function_name( - def_, uniq_suffix, package_requirements + function_hash, self._session.session_id, uniq_suffix ) rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) @@ -524,6 +624,7 @@ def provision_bq_remote_function( # 1. It does not exist # 2. It exists but the existing remote function has different # configuration than intended + created_new = False if not rf_endpoint or ( rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id ): @@ -540,10 +641,12 @@ def provision_bq_remote_function( remote_function_name, max_batching_rows, ) + + created_new = True else: logger.info(f"Remote function {remote_function_name} already exists.") - return remote_function_name, cloud_function_name + return remote_function_name, cloud_function_name, created_new def get_remote_function_specs(self, remote_function_name): """Check whether a remote function already exists for the udf.""" @@ -554,13 +657,12 @@ def get_remote_function_specs(self, remote_function_name): ) try: for routine in routines: + routine = cast(bigquery.Routine, routine) if routine.reference.routine_id == remote_function_name: - # TODO(shobs): Use first class properties when they are available - # https://github.com/googleapis/python-bigquery/issues/1552 - rf_options = routine._properties.get("remoteFunctionOptions") + rf_options = routine.remote_function_options if rf_options: - http_endpoint = rf_options.get("endpoint") - bq_connection = rf_options.get("connection") + http_endpoint = rf_options.endpoint + bq_connection = rf_options.connection if bq_connection: bq_connection = os.path.basename(bq_connection) break @@ -645,426 +747,505 @@ def get_routine_reference( return dataset_ref.routine(routine_ref_str) -# Inspired by @udf decorator implemented in ibis-bigquery package -# https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py -# which has moved as @js to the ibis package -# https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py -def remote_function( - input_types: Union[None, type, Sequence[type]] = None, - output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, - bigquery_connection_client: Optional[ - 
bigquery_connection_v1.ConnectionServiceClient - ] = None, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, - dataset: Optional[str] = None, - bigquery_connection: Optional[str] = None, - reuse: bool = True, - name: Optional[str] = None, - packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, - cloud_function_kms_key_name: Optional[str] = None, - cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, - cloud_function_timeout: Optional[int] = 600, - cloud_function_max_instances: Optional[int] = None, - cloud_function_vpc_connector: Optional[str] = None, - cloud_function_memory_mib: Optional[int] = 1024, -): - """Decorator to turn a user defined function into a BigQuery remote function. - - .. deprecated:: 0.0.1 - This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - - .. note:: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - - * BigQuery Connection API - * Cloud Functions API - * Cloud Run API - * Cloud Build API - * Artifact Registry API - * Cloud Resource Manager API - - This can be done from the cloud console (change `PROJECT_ID` to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - - Or from the gcloud CLI: - - `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` - - 2. Have following IAM roles enabled for you: - - * BigQuery Data Editor (roles/bigquery.dataEditor) - * BigQuery Connection Admin (roles/bigquery.connectionAdmin) - * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` - * Storage Object Viewer (roles/storage.objectViewer) - * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - - 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - - Alternatively, the IAM could also be setup via the gcloud CLI: - - `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. - - Args: - input_types (None, type, or sequence(type)): - For scalar user defined function it should be the input type or - sequence of input types. For row processing user defined function, - type `Series` should be specified. - output_type (Optional[type]): - Data type of the output in the user defined function. 
- session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not provided - then bigquery client from the session would be used. - bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for BigQuery connection operations. If this param is - not provided then bigquery connection client from the session would - be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then the functions client from the session would be used. - resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): - Client to use for cloud resource management operations, e.g. for - getting and setting IAM roles on cloud resources. If this param is - not provided then resource manager client from the session would be - used. - dataset (str, Optional.): - Dataset in which to create a BigQuery remote function. It should be in - `.` or `` format. If this - parameter is not provided then session dataset id is used. - bigquery_connection (str, Optional): - Name of the BigQuery connection in the form of `CONNECTION_ID` or - `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. - If this param is not provided then the bigquery connection from the session - would be used. If it is pre created in the same location as the - `bigquery_client.location` then it would be used, otherwise it is created - dynamically using the `bigquery_connection_client` assuming the user has necessary - priviliges. The PROJECT_ID should be the same as the BigQuery connection project. - reuse (bool, Optional): - Reuse the remote function if is already exists. - `True` by default, which results in reusing an existing remote - function and corresponding cloud function (if any) that was - previously created for the same udf. - Setting it to `False` forces the creation of a unique remote function. - If the required remote function does not exist then it would be - created irrespective of this param. - name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. - packages (str[], Optional): - Explicit name of the external package dependencies. Each dependency - is added to the `requirements.txt` as is, and can be of the form - supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. If not provided then - the default service account would be used. See - https://cloud.google.com/functions/docs/securing/function-identity - for more details. Please make sure the service account has the - necessary IAM permissions configured as described in - https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. - cloud_function_kms_key_name (str, Optional): - Customer managed encryption key to protect cloud functions and - related data at rest. This is of the format - projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. 
- Read https://cloud.google.com/functions/docs/securing/cmek for - more details including granting necessary service accounts - access to the key. - cloud_function_docker_repository (str, Optional): - Docker repository created with the same encryption key as - `cloud_function_kms_key_name` to store encrypted artifacts - created to support the cloud function. This is of the format - projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. - For more details see - https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. - max_batching_rows (int, Optional): - The maximum number of rows to be batched for processing in the - BQ remote function. Default value is 1000. A lower number can be - passed to avoid timeouts in case the user code is too complex to - process large number of rows fast enough. A higher number can be - used to increase throughput in case the user code is fast enough. - `None` can be passed to let BQ remote functions service apply - default batching. See for more details - https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. - cloud_function_timeout (int, Optional): - The maximum amount of time (in seconds) BigQuery should wait for - the cloud function to return a response. See for more details - https://cloud.google.com/functions/docs/configuring/timeout. - Please note that even though the cloud function (2nd gen) itself - allows seeting up to 60 minutes of timeout, BigQuery remote - function can wait only up to 20 minutes, see for more details - https://cloud.google.com/bigquery/quotas#remote_function_limits. - By default BigQuery DataFrames uses a 10 minute timeout. `None` - can be passed to let the cloud functions default timeout take effect. - cloud_function_max_instances (int, Optional): - The maximumm instance count for the cloud function created. This - can be used to control how many cloud function instances can be - active at max at any given point of time. Lower setting can help - control the spike in the billing. Higher setting can help - support processing larger scale data. When not specified, cloud - function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances. - cloud_function_vpc_connector (str, Optional): - The VPC connector you would like to configure for your cloud - function. This is useful if your code needs access to data or - service(s) that are on a VPC network. See for more details - https://cloud.google.com/functions/docs/networking/connecting-vpc. - cloud_function_memory_mib (int, Optional): - The amounts of memory (in mebibytes) to allocate for the cloud - function (2nd gen) created. This also dictates a corresponding - amount of allocated CPU for the function. By default a memory of - 1024 MiB is set for the cloud functions created to support - BigQuery DataFrames remote function. If you want to let the - default memory of cloud functions be allocated, pass `None`. See - for more details - https://cloud.google.com/functions/docs/configuring/memory. 
- """ - # Some defaults may be used from the session if not provided otherwise - import bigframes.exceptions as bf_exceptions - import bigframes.pandas as bpd - import bigframes.series as bf_series - import bigframes.session - - session = cast(bigframes.session.Session, session or bpd.get_global_session()) - - # A BigQuery client is required to perform BQ operations - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) +class _RemoteFunctionSession: + """Session to manage remote functions.""" - # A BigQuery connection client is required to perform BQ connection operations - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise ValueError( - "A bigquery connection client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + def __init__(self): + # Session level mapping of remote function artifacts + self._temp_artifacts: Dict[str, str] = dict() - # A cloud functions client is required to perform cloud functions operations - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise ValueError( - "A cloud functions client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + # Lock to synchronize the update of the session artifacts + self._artifacts_lock = threading.Lock() - # A resource manager client is required to get/set IAM operations - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise ValueError( - "A resource manager client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): + """Update remote function artifacts in the current session.""" + with self._artifacts_lock: + self._temp_artifacts[bqrf_routine] = gcf_path - # BQ remote function must be persisted, for which we need a dataset - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. 
- if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project - ) - else: - dataset_ref = session._anonymous_dataset + def clean_up( + self, + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + session_id: str, + ): + """Delete remote function artifacts in the current session.""" + with self._artifacts_lock: + for bqrf_routine, gcf_path in self._temp_artifacts.items(): + # Let's accept the possibility that the remote function may have + # been deleted directly by the user + bqclient.delete_routine(bqrf_routine, not_found_ok=True) + + # Let's accept the possibility that the cloud function may have + # been deleted directly by the user + try: + gcfclient.delete_function(name=gcf_path) + except google.api_core.exceptions.NotFound: + pass + + self._temp_artifacts.clear() + + # Inspired by @udf decorator implemented in ibis-bigquery package + # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py + # which has moved as @js to the ibis package + # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py + def remote_function( + self, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + bigquery_connection_client: Optional[ + bigquery_connection_v1.ConnectionServiceClient + ] = None, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + reuse: bool = True, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, + cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, + cloud_function_memory_mib: Optional[int] = 1024, + ): + """Decorator to turn a user defined function into a BigQuery remote function. - bq_location, cloud_function_region = get_remote_function_locations( - bigquery_client.location - ) + .. deprecated:: 0.0.1 + This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - # A connection is required for BQ remote function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function - if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore + .. note:: + Please make sure following is setup before using this API: - bigquery_connection = clients.resolve_full_bq_connection_name( - bigquery_connection, - default_project=dataset_ref.project, - default_location=bq_location, - ) - # Guaranteed to be the form of .. - ( - gcp_project_id, - bq_connection_location, - bq_connection_id, - ) = bigquery_connection.split(".") - if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( - "The project_id does not match BigQuery connection gcp_project_id: " - f"{dataset_ref.project}." 
- ) - if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( - "The location does not match BigQuery connection location: " - f"{bq_location}." - ) + 1. Have the below APIs enabled for your project: - # If any CMEK is intended then check that a docker repository is also specified - if ( - cloud_function_kms_key_name is not None - and cloud_function_docker_repository is None - ): - raise ValueError( - "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." - " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" - ) + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API + + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID + + Or from the gcloud CLI: + + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` + + 2. Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - bq_connection_manager = None if session is None else session.bqconnectionmanager + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - def wrapper(func): - nonlocal input_types, output_type + Alternatively, the IAM could also be setup via the gcloud CLI: - if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. - if sys.version_info >= (3, 10): - # Add `eval_str = True` so that deferred annotations are turned into their - # corresponding type objects. Need Python 3.10 for eval_str parameter. - # https://docs.python.org/3/library/inspect.html#inspect.signature - signature_kwargs: Mapping[str, Any] = {"eval_str": True} + Args: + input_types (None, type, or sequence(type)): + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. + output_type (Optional[type]): + Data type of the output in the user defined function. 
+ session (bigframes.Session, Optional): + BigQuery DataFrames session to use for getting default project, + dataset and BigQuery connection. + bigquery_client (google.cloud.bigquery.Client, Optional): + Client to use for BigQuery operations. If this param is not provided + then bigquery client from the session would be used. + bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): + Client to use for BigQuery connection operations. If this param is + not provided then bigquery connection client from the session would + be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then the functions client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. + dataset (str, Optional.): + Dataset in which to create a BigQuery remote function. It should be in + `.` or `` format. If this + parameter is not provided then session dataset id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection in the form of `CONNECTION_ID` or + `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. + If this param is not provided then the bigquery connection from the session + would be used. If it is pre created in the same location as the + `bigquery_client.location` then it would be used, otherwise it is created + dynamically using the `bigquery_connection_client` assuming the user has necessary + priviliges. The PROJECT_ID should be the same as the BigQuery connection project. + reuse (bool, Optional): + Reuse the remote function if already exists. + `True` by default, which will result in reusing an existing remote + function and corresponding cloud function (if any) that was + previously created for the same udf. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. + Setting it to `False` would force creating a unique remote function. + If the required remote function does not exist then it would be + created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. When an explicit name is provided, any session + specific clean up (``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each dependency + is added to the `requirements.txt` as is, and can be of the form + supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. 
+ cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided then + the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. + Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process a large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let the BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. + cloud_function_timeout (int, Optional): + The maximum amount of time (in seconds) BigQuery should wait for + the cloud function to return a response. See for more details + https://cloud.google.com/functions/docs/configuring/timeout. + Please note that even though the cloud function (2nd gen) itself + allows setting up to 60 minutes of timeout, a BigQuery remote + function can wait only up to 20 minutes, see for more details + https://cloud.google.com/bigquery/quotas#remote_function_limits. + By default BigQuery DataFrames uses a 10 minute timeout. `None` + can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximum instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at any given point in time. A lower setting can help + control billing spikes. A higher setting can help + support processing larger scale data. When not specified, the cloud + function's default setting applies. For more details see + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc.
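# A sketch of the Cloud Functions tuning knobs documented above; the values and
# the VPC connector name are illustrative assumptions, not recommendations.
import bigframes.pandas as bpd

@bpd.remote_function(
    max_batching_rows=100,            # smaller batches when per-row work is slow
    cloud_function_timeout=600,       # seconds; BigQuery itself waits at most 20 minutes
    cloud_function_max_instances=10,  # cap concurrently active instances
    cloud_function_vpc_connector="my-vpc-connector",
)
def slow_score(x: float) -> float:
    return x * 0.5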
+ cloud_function_memory_mib (int, Optional): + The amounts of memory (in mebibytes) to allocate for the cloud + function (2nd gen) created. This also dictates a corresponding + amount of allocated CPU for the function. By default a memory of + 1024 MiB is set for the cloud functions created to support + BigQuery DataFrames remote function. If you want to let the + default memory of cloud functions be allocated, pass `None`. See + for more details + https://cloud.google.com/functions/docs/configuring/memory. + """ + # Some defaults may be used from the session if not provided otherwise + import bigframes.exceptions as bf_exceptions + import bigframes.pandas as bpd + import bigframes.series as bf_series + import bigframes.session + + session = cast(bigframes.session.Session, session or bpd.get_global_session()) + + # A BigQuery client is required to perform BQ operations + if not bigquery_client: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A BigQuery connection client is required to perform BQ connection operations + if not bigquery_connection_client: + bigquery_connection_client = session.bqconnectionclient + if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A cloud functions client is required to perform cloud functions operations + if not cloud_functions_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: + raise ValueError( + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # BQ remote function must be persisted, for which we need a dataset + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. + if dataset: + dataset_ref = bigquery.DatasetReference.from_string( + dataset, default_project=bigquery_client.project + ) else: - signature_kwargs = {} + dataset_ref = session._anonymous_dataset - signature = inspect.signature( - func, - **signature_kwargs, + bq_location, cloud_function_region = get_remote_function_locations( + bigquery_client.location ) - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: + # A connection is required for BQ remote function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function + if not bigquery_connection: + bigquery_connection = session._bq_connection # type: ignore + + bigquery_connection = clients.resolve_full_bq_connection_name( + bigquery_connection, + default_project=dataset_ref.project, + default_location=bq_location, + ) + # Guaranteed to be the form of .. 
+ ( + gcp_project_id, + bq_connection_location, + bq_connection_id, + ) = bigquery_connection.split(".") + if gcp_project_id.casefold() != dataset_ref.project.casefold(): + raise ValueError( + "The project_id does not match BigQuery connection gcp_project_id: " + f"{dataset_ref.project}." + ) + if bq_connection_location.casefold() != bq_location.casefold(): + raise ValueError( + "The location does not match BigQuery connection location: " + f"{bq_location}." + ) + + # If any CMEK is intended then check that a docker repository is also specified + if ( + cloud_function_kms_key_name is not None + and cloud_function_docker_repository is None + ): + raise ValueError( + "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + ) + + bq_connection_manager = session.bqconnectionmanager + + def wrapper(func): + nonlocal input_types, output_type + + if not callable(func): + raise TypeError("f must be callable, got {}".format(func)) + + if sys.version_info >= (3, 10): + # Add `eval_str = True` so that deferred annotations are turned into their + # corresponding type objects. Need Python 3.10 for eval_str parameter. + # https://docs.python.org/3/library/inspect.html#inspect.signature + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + else: + signature_kwargs = {} + + signature = inspect.signature( + func, + **signature_kwargs, + ) + + # Try to get input types via type annotations. + if input_types is None: + input_types = [] + for parameter in signature.parameters.values(): + if (param_type := parameter.annotation) is inspect.Signature.empty: + raise ValueError( + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function." + ) + input_types.append(param_type) + elif not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + + if output_type is None: + if ( + output_type := signature.return_annotation + ) is inspect.Signature.empty: raise ValueError( - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function." + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function." ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if output_type is None: - if (output_type := signature.return_annotation) is inspect.Signature.empty: - raise ValueError( - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function." + # The function will actually be receiving a pandas Series, but allow both + # BigQuery DataFrames and pandas object types for compatibility. + is_row_processor = False + if len(input_types) == 1 and ( + (input_type := input_types[0]) == bf_series.Series + or input_type == pandas.Series + ): + warnings.warn( + "input_types=Series is in preview.", + stacklevel=1, + category=bf_exceptions.PreviewWarning, ) - # The function will actually be receiving a pandas Series, but allow both - # BigQuery DataFrames and pandas object types for compatibility. 
- is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - warnings.warn( - "input_types=Series is in preview.", - stacklevel=1, - category=bf_exceptions.PreviewWarning, + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): + input_types = [input_types] + + # TODO(b/340898611): fix type error + ibis_signature = ibis_signature_from_python_signature( + signature, input_types, output_type # type: ignore ) - # we will model the row as a json serialized string containing the data - # and the metadata representing the row - input_types = [str] - is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] + remote_function_client = RemoteFunctionClient( + dataset_ref.project, + cloud_function_region, + cloud_functions_client, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + session=session, # type: ignore + ) - # TODO(b/340898611): fix type error - ibis_signature = ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) + # In the unlikely case where the user is trying to re-deploy the same + # function, cleanup the attributes we add below, first. This prevents + # the pickle from having dependencies that might not otherwise be + # present such as ibis or pandas. + def try_delattr(attr): + try: + delattr(func, attr) + except AttributeError: + pass + + try_delattr("bigframes_cloud_function") + try_delattr("bigframes_remote_function") + try_delattr("output_dtype") + try_delattr("ibis_node") + + ( + rf_name, + cf_name, + created_new, + ) = remote_function_client.provision_bq_remote_function( + func, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + reuse=reuse, + name=name, + package_requirements=packages, + max_batching_rows=max_batching_rows, + cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instance_count=cloud_function_max_instances, + is_row_processor=is_row_processor, + cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_memory_mib=cloud_function_memory_mib, + ) - remote_function_client = RemoteFunctionClient( - dataset_ref.project, - cloud_function_region, - cloud_functions_client, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - session=session, # type: ignore - ) + # TODO: Move ibis logic to compiler step + node = ibis.udf.scalar.builtin( + func, + name=rf_name, + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) + func.bigframes_cloud_function = ( + remote_function_client.get_cloud_function_fully_qualified_name(cf_name) + ) + func.bigframes_remote_function = ( + remote_function_client.get_remote_function_fully_qualilfied_name( + rf_name + ) + ) - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add below, first. 
This prevents - # the pickle from having dependencies that might not otherwise be - # present such as ibis or pandas. - def try_delattr(attr): - try: - delattr(func, attr) - except AttributeError: - pass - - try_delattr("bigframes_cloud_function") - try_delattr("bigframes_remote_function") - try_delattr("output_dtype") - try_delattr("ibis_node") - - rf_name, cf_name = remote_function_client.provision_bq_remote_function( - func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), - reuse=reuse, - name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, - cloud_function_timeout=cloud_function_timeout, - cloud_function_max_instance_count=cloud_function_max_instances, - is_row_processor=is_row_processor, - cloud_function_vpc_connector=cloud_function_vpc_connector, - cloud_function_memory_mib=cloud_function_memory_mib, - ) + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) + ) + func.ibis_node = node + + # If a new remote function was created, update the cloud artifacts + # created in the session. This would be used to clean up any + # resources in the session. Note that we need to do this only for + # the case where an explicit name was not provided by the user and + # we used an internal name. For the cases where the user provided an + # explicit name, we are assuming that the user wants to persist them + # with that name and would directly manage their lifecycle. + if created_new and (not name): + self._update_temp_artifacts( + func.bigframes_remote_function, func.bigframes_cloud_function + ) + return func - # TODO: Move ibis logic to compiler step - node = ibis.udf.scalar.builtin( - func, - name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) - func.bigframes_cloud_function = ( - remote_function_client.get_cloud_function_fully_qualified_name(cf_name) - ) - func.bigframes_remote_function = str(dataset_ref.routine(rf_name)) # type: ignore + return wrapper + + +def remote_function(*args, **kwargs): + remote_function_session = _RemoteFunctionSession() + return remote_function_session.remote_function(*args, **kwargs) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.ibis_node = node - return func - return wrapper +remote_function.__doc__ = _RemoteFunctionSession.remote_function.__doc__ def read_gbq_function( diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py index 68fe1b917d..c666f41daa 100644 --- a/bigframes/functions/remote_function_template.py +++ b/bigframes/functions/remote_function_template.py @@ -215,9 +215,9 @@ def udf_http_row_processor(request): def generate_udf_code(def_, directory): - """Generate serialized bytecode using cloudpickle given a udf.""" + """Generate serialized code using cloudpickle given a udf.""" udf_code_file_name = "udf.py" - udf_bytecode_file_name = "udf.cloudpickle" + udf_pickle_file_name = "udf.cloudpickle" # original code, only for debugging purpose udf_code = textwrap.dedent(inspect.getsource(def_)) @@ -225,13 +225,13 @@ def generate_udf_code(def_, directory): with open(udf_code_file_path, "w") as f: f.write(udf_code) - # serialized bytecode - 
udf_bytecode_file_path = os.path.join(directory, udf_bytecode_file_name) + # serialized udf + udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name) # TODO(b/345433300): try io.BytesIO to avoid writing to the file system - with open(udf_bytecode_file_path, "wb") as f: + with open(udf_pickle_file_path, "wb") as f: cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) - return udf_code_file_name, udf_bytecode_file_name + return udf_code_file_name, udf_pickle_file_name def generate_cloud_function_main_code( @@ -252,15 +252,15 @@ def generate_cloud_function_main_code( """ # Pickle the udf with all its dependencies - udf_code_file, udf_bytecode_file = generate_udf_code(def_, directory) + udf_code_file, udf_pickle_file = generate_udf_code(def_, directory) code_blocks = [ f"""\ import cloudpickle # original udf code is in {udf_code_file} -# serialized udf code is in {udf_bytecode_file} -with open("{udf_bytecode_file}", "rb") as f: +# serialized udf code is in {udf_pickle_file} +with open("{udf_pickle_file}", "rb") as f: udf = cloudpickle.load(f) input_types = {repr(input_types)} diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 70854a36e9..6ae06c9d9f 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -188,6 +188,24 @@ def __init__(self): def _keys(self): pass + def _extract_output_names(self): + """Extract transform output column names. Save the results to self._output_names.""" + assert self._bqml_model is not None + + output_names = [] + for transform_col in self._bqml_model._model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) + # pass the columns that are not transformed + if "transformSql" not in transform_col_dict: + continue + transform_sql: str = transform_col_dict["transformSql"] + if not transform_sql.startswith("ML."): + continue + + output_names.append(transform_col_dict["name"]) + + self._output_names = output_names + def __eq__(self, other) -> bool: return type(self) is type(other) and self._keys() == other._keys() diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 7f1bfe8d55..4ea63d2e81 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -201,25 +201,20 @@ def _merge( def _compile_to_sql( self, - columns: List[str], X: bpd.DataFrame, - ) -> List[Tuple[str, str]]: + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns (List[str]): - a list of column names to transform - X (bpd.DataFrame): - The Dataframe with training data. + X: DataFrame to transform. 
- Returns: - a list of tuples of (sql_expression, output_name)""" + Returns: a list of sql_expr.""" result = [] for _, transformer, target_columns in self.transformers: if isinstance(target_columns, str): target_columns = [target_columns] - result += transformer._compile_to_sql(target_columns, X=X) + result += transformer._compile_to_sql(X, target_columns) return result def fit( @@ -229,17 +224,14 @@ def fit( ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ee4d8a8c27..f1b36651f4 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -83,7 +83,7 @@ def distance( """ assert len(x.columns) == 1 and len(y.columns) == 1 - input_data = x.cache().join(y.cache(), how="outer") + input_data = x.join(y, how="outer").cache() x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] return self._apply_sql( @@ -326,7 +326,7 @@ def create_model( if y_train is None: input_data = X_train.cache() else: - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -366,7 +366,7 @@ def create_llm_remote_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -399,7 +399,7 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index ae71637aa5..4955eb5de5 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Iterable, List, Literal, Optional, Tuple, Union +from typing import Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.impute._base @@ -49,25 +49,22 @@ def _keys(self): def _compile_to_sql( self, - columns: Iterable[str], - X=None, - ) -> List[Tuple[str, str]]: + X: bpd.DataFrame, + columns: Optional[Iterable[str]] = None, + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - A list of column names to transform. - X: - The Dataframe with training data. + X: DataFrame to transform. 
+ columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_imputer( - column, self.strategy, f"imputer_{column}" - ), - f"imputer_{column}", + self._base_sql_generator.ml_imputer( + column, self.strategy, f"imputer_{column}" ) for column in columns ] @@ -92,17 +89,14 @@ def fit( ) -> SimpleImputer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 48eb5a93a7..6220e899ae 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -18,7 +18,7 @@ import typing -from typing import List, Union +from typing import cast, List, Union from bigframes.ml import utils import bigframes.pandas as bpd @@ -29,6 +29,7 @@ def train_test_split( test_size: Union[float, None] = None, train_size: Union[float, None] = None, random_state: Union[int, None] = None, + stratify: Union[bpd.Series, None] = None, ) -> List[Union[bpd.DataFrame, bpd.Series]]: """Splits dataframes or series into random train and test subsets. @@ -46,6 +47,10 @@ def train_test_split( random_state (default None): A seed to use for randomly choosing the rows of the split. If not set, a random split will be generated each time. + stratify: (bigframes.series.Series or None, default None): + If not None, data is split in a stratified fashion, using this as the class labels. Each split has the same distribution of the class labels with the original dataset. + Default to None. + Note: By setting the stratify parameter, the memory consumption and generated SQL will be linear to the unique values in the Series. May return errors if the unique values size is too large. Returns: List[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]]: A list of BigQuery DataFrames or Series. 
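# A sketch of the new `stratify` option (sklearn-style usage); the table and
# column names come from the public penguins dataset referenced elsewhere in
# these docs.
import bigframes.pandas as bpd
from bigframes.ml.model_selection import train_test_split

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()
X = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
y = df["species"]

# Each split keeps roughly the same distribution of `species` as the input.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=df["species"]
)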
@@ -76,7 +81,38 @@ def train_test_split( dfs = list(utils.convert_to_dataframe(*arrays)) - split_dfs = dfs[0]._split(fracs=(train_size, test_size), random_state=random_state) + def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]: + """Split a single DF accoding to the stratify Series.""" + stratify = stratify.rename("bigframes_stratify_col") # avoid name conflicts + merged_df = df.join(stratify.to_frame(), how="outer") + + train_dfs, test_dfs = [], [] + uniq = stratify.unique() + for value in uniq: + cur = merged_df[merged_df["bigframes_stratify_col"] == value] + train, test = train_test_split( + cur, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + train_dfs.append(train) + test_dfs.append(test) + + train_df = cast( + bpd.DataFrame, bpd.concat(train_dfs).drop(columns="bigframes_stratify_col") + ) + test_df = cast( + bpd.DataFrame, bpd.concat(test_dfs).drop(columns="bigframes_stratify_col") + ) + return [train_df, test_df] + + if stratify is None: + split_dfs = dfs[0]._split( + fracs=(train_size, test_size), random_state=random_state + ) + else: + split_dfs = _stratify_split(dfs[0], stratify) train_index = split_dfs[0].index test_index = split_dfs[1].index diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 04b8d73cf5..4cd60c5836 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -106,9 +106,7 @@ def fit( ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._transform._compile_to_sql(X.columns.tolist(), X=X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._transform._compile_to_sql(X) if y is not None: # If labels columns are present, they should pass through un-transformed (y,) = utils.convert_to_dataframe(y) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 07fdc171cf..13d2041ef3 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import cast, Iterable, List, Literal, Optional, Tuple, Union +from typing import cast, Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.preprocessing._data import bigframes_vendored.sklearn.preprocessing._discretization @@ -46,23 +46,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. 
- Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_standard_scaler( - column, f"standard_scaled_{column}" - ), - f"standard_scaled_{column}", + self._base_sql_generator.ml_standard_scaler( + column, f"standard_scaled_{column}" ) for column in columns ] @@ -86,17 +85,14 @@ def fit( ) -> StandardScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -127,23 +123,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_max_abs_scaler( - column, f"max_abs_scaled_{column}" - ), - f"max_abs_scaled_{column}", + self._base_sql_generator.ml_max_abs_scaler( + column, f"max_abs_scaled_{column}" ) for column in columns ] @@ -167,17 +162,14 @@ def fit( ) -> MaxAbsScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -208,23 +200,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. 
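# The transformers refactored in this file keep their public fit/transform
# surface; a brief sketch against the public penguins dataset (no new API is
# assumed here).
import bigframes.pandas as bpd
from bigframes.ml.preprocessing import StandardScaler

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()
scaler = StandardScaler()
scaler.fit(df[["culmen_length_mm", "flipper_length_mm"]])

# Output columns follow the standard_scaled_<column> naming generated above.
scaled = scaler.transform(df[["culmen_length_mm", "flipper_length_mm"]])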
- Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_min_max_scaler( - column, f"min_max_scaled_{column}" - ), - f"min_max_scaled_{column}", + self._base_sql_generator.ml_min_max_scaler( + column, f"min_max_scaled_{column}" ) for column in columns ] @@ -248,17 +239,14 @@ def fit( ) -> MinMaxScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -302,20 +290,18 @@ def _keys(self): return (self._bqml_model, self.n_bins, self.strategy) def _compile_to_sql( - self, - columns: Iterable[str], - X: bpd.DataFrame, - ) -> List[Tuple[str, str]]: + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform - X: - The Dataframe with training data. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns array_split_points = {} if self.strategy == "uniform": for column in columns: @@ -327,11 +313,8 @@ def _compile_to_sql( ] return [ - ( - self._base_sql_generator.ml_bucketize( - column, array_split_points[column], f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + self._base_sql_generator.ml_bucketize( + column, array_split_points[column], f"kbinsdiscretizer_{column}" ) for column in columns ] @@ -339,11 +322,8 @@ def _compile_to_sql( elif self.strategy == "quantile": return [ - ( - self._base_sql_generator.ml_quantile_bucketize( - column, self.n_bins, f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + self._base_sql_generator.ml_quantile_bucketize( + column, self.n_bins, f"kbinsdiscretizer_{column}" ) for column in columns ] @@ -381,17 +361,14 @@ def fit( ) -> KBinsDiscretizer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -440,18 +417,19 @@ def __init__( def _keys(self): return (self._bqml_model, self.drop, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: 
Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. - - Returns: a list of tuples of (sql_expression, output_name)""" + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns drop = self.drop if self.drop is not None else "none" # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. top_k = ( @@ -465,11 +443,8 @@ def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str else OneHotEncoder.FREQUENCY_THRESHOLD_DEFAULT ) return [ - ( - self._base_sql_generator.ml_one_hot_encoder( - column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" - ), - f"onehotencoded_{column}", + self._base_sql_generator.ml_one_hot_encoder( + column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" ) for column in columns ] @@ -502,17 +477,14 @@ def fit( ) -> OneHotEncoder: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -559,17 +531,19 @@ def __init__( def _keys(self): return (self._bqml_model, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. 
top_k = ( @@ -583,11 +557,8 @@ def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str else LabelEncoder.FREQUENCY_THRESHOLD_DEFAULT ) return [ - ( - self._base_sql_generator.ml_label_encoder( - column, top_k, frequency_threshold, f"labelencoded_{column}" - ), - f"labelencoded_{column}", + self._base_sql_generator.ml_label_encoder( + column, top_k, frequency_threshold, f"labelencoded_{column}" ) for column in columns ] @@ -614,17 +585,14 @@ def fit( ) -> LabelEncoder: (y,) = utils.convert_to_dataframe(y) - compiled_transforms = self._compile_to_sql(y.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(y) self._bqml_model = self._bqml_model_factory.create_model( y, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -660,24 +628,23 @@ def __init__(self, degree: int = 2): def _keys(self): return (self._bqml_model, self.degree) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns output_name = "poly_feat" return [ - ( - self._base_sql_generator.ml_polynomial_expand( - columns, self.degree, output_name - ), - output_name, + self._base_sql_generator.ml_polynomial_expand( + columns, self.degree, output_name ) ] @@ -702,29 +669,14 @@ def fit( ) -> PolynomialFeatures: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # TODO(garrettwu): generalize the approach to other transformers - output_names = [] - for transform_col in self._bqml_model._model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) - # pass the columns that are not transformed - if "transformSql" not in transform_col_dict: - continue - transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - continue - - output_names.append(transform_col_dict["name"]) - - self._output_names = output_names + self._extract_output_names() return self diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index faba0f3aa3..21f75eb82c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -63,6 +63,7 @@ import bigframes.core.tools import bigframes.dataframe import bigframes.enums +import bigframes.functions.remote_function as bigframes_rf import bigframes.operations as ops import bigframes.series import bigframes.session @@ -768,8 +769,11 @@ def 
clean_up_by_session_id( location: Optional[str] = None, project: Optional[str] = None, ) -> None: - """Searches through table names in BigQuery and deletes tables - found matching the expected format. + """Searches through BigQuery tables and routines and deletes the ones + created during the session with the given session id. The match is + determined by having the session id present in the resource name or + metadata. The cloud functions serving the cleaned up routines are also + cleaned up. This could be useful if the session object has been lost. Calling `session.close()` or `bigframes.pandas.close_session()` @@ -794,7 +798,6 @@ def clean_up_by_session_id( None """ session = get_global_session() - client = session.bqclient if (location is None) != (project is None): raise ValueError( @@ -804,14 +807,18 @@ def clean_up_by_session_id( dataset = session._anonymous_dataset else: dataset = bigframes.session._io.bigquery.create_bq_dataset_reference( - client, + session.bqclient, location=location, project=project, api_name="clean_up_by_session_id", ) bigframes.session._io.bigquery.delete_tables_matching_session_id( - client, dataset, session_id + session.bqclient, dataset, session_id + ) + + bigframes_rf._clean_up_by_session_id( + session.bqclient, session.cloudfunctionsclient, dataset, session_id ) @@ -840,10 +847,28 @@ def clean_up_by_session_id( option_context = config.option_context """Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs -get_global_session = global_session.get_global_session -close_session = global_session.close_session -reset_session = global_session.close_session +def get_global_session(): + return global_session.get_global_session() + + +get_global_session.__doc__ = global_session.get_global_session.__doc__ + + +def close_session(): + return global_session.close_session() + + +close_session.__doc__ = global_session.close_session.__doc__ + + +def reset_session(): + return global_session.close_session() + + +reset_session.__doc__ = global_session.close_session.__doc__ + # SQL Compilation uses recursive algorithms on deep trees # 10M tree depth should be sufficient to generate any sql that is under bigquery limit diff --git a/bigframes/series.py b/bigframes/series.py index c325783e96..1a5661529c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -329,7 +329,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pandas.Series: """Writes Series to pandas Series. @@ -349,10 +349,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default None): - Determines whether the resulting pandas series will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. + ordered (bool, default True): + Determines whether the resulting pandas series will be ordered. + In some cases, unordered may result in a faster-executing query. 
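# A short sketch of the simplified `ordered` flag described above; the table is
# the public penguins dataset.
import bigframes.pandas as bpd

species = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")["species"]
pd_species = species.to_pandas(ordered=False)  # row order not guaranteed, may run faster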
Returns: @@ -364,7 +363,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered if ordered is not None else self._session._strictly_ordered, + ordered=ordered, ) self._set_internal_query_job(query_job) series = df.squeeze(axis=1) @@ -634,7 +633,7 @@ def dropna( result = result.reset_index() return Series(result) - @validations.requires_strict_ordering() + @validations.requires_strict_ordering(bigframes.constants.SUGGEST_PEEK_PREVIEW) def head(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[0:n]) @@ -1653,9 +1652,22 @@ def to_frame(self, name: blocks.Label = None) -> bigframes.dataframe.DataFrame: return bigframes.dataframe.DataFrame(block) def to_csv( - self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True - ) -> None: - return self.to_frame().to_csv(path_or_buf, sep=sep, header=header, index=index) + self, + path_or_buf=None, + sep=",", + *, + header: bool = True, + index: bool = True, + ) -> Optional[str]: + if utils.is_gcs_path(path_or_buf): + return self.to_frame().to_csv( + path_or_buf, sep=sep, header=header, index=index + ) + else: + pd_series = self.to_pandas() + return pd_series.to_csv( + path_or_buf=path_or_buf, sep=sep, header=header, index=index + ) def to_dict(self, into: type[dict] = dict) -> typing.Mapping: return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore @@ -1665,17 +1677,23 @@ def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: def to_json( self, - path_or_buf: str, - orient: typing.Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf=None, + orient: Optional[ + typing.Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, lines: bool = False, index: bool = True, - ) -> None: - return self.to_frame().to_json( - path_or_buf=path_or_buf, orient=orient, lines=lines, index=index - ) + ) -> Optional[str]: + if utils.is_gcs_path(path_or_buf): + return self.to_frame().to_json( + path_or_buf=path_or_buf, orient=orient, lines=lines, index=index + ) + else: + pd_series = self.to_pandas() + return pd_series.to_json( + path_or_buf=path_or_buf, orient=orient, lines=lines, index=index # type: ignore + ) def to_latex( self, buf=None, columns=None, header=True, index=True, **kwargs diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0f7953d3d4..98cba867f2 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -95,8 +95,7 @@ import bigframes.dtypes import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers -from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf -from bigframes.functions.remote_function import remote_function as bigframes_rf +import bigframes.functions.remote_function as bigframes_rf import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients @@ -108,6 +107,7 @@ import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series + import bigframes.streaming.dataframe as streaming_dataframe _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" @@ -298,13 +298,24 @@ def __init__( self._execution_count = 0 # Whether this session treats objects as totally ordered. 
# Will expose as feature later, only False for internal testing - self._strictly_ordered: bool = context._strictly_ordered + self._strictly_ordered: bool = context.ordering_mode != "partial" + if not self._strictly_ordered: + warnings.warn( + "Partial ordering mode is a preview feature and is subject to change.", + bigframes.exceptions.PreviewWarning, + ) + # Sequential index needs total ordering to generate, so use null index with unstrict ordering. self._default_index_type: bigframes.enums.DefaultIndexKind = ( bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 - if context._strictly_ordered + if self._strictly_ordered else bigframes.enums.DefaultIndexKind.NULL ) + self._compiler = bigframes.core.compile.SQLCompiler( + strict=self._strictly_ordered + ) + + self._remote_function_session = bigframes_rf._RemoteFunctionSession() @property def bqclient(self): @@ -383,7 +394,7 @@ def __hash__(self): # Stable hash needed to use in expression tree return hash(str(self._anonymous_dataset)) - def close(self): + def _clean_up_tables(self): """Delete tables that were created with this session's session_id.""" client = self.bqclient project_id = self._anonymous_dataset.project @@ -393,6 +404,15 @@ def close(self): full_id = ".".join([project_id, dataset_id, table_id]) client.delete_table(full_id, not_found_ok=True) + def close(self): + """Delete resources that were created with this session's session_id. + This includes BigQuery tables, remote functions and cloud functions + serving the remote functions.""" + self._clean_up_tables() + self._remote_function_session.clean_up( + self.bqclient, self.cloudfunctionsclient, self.session_id + ) + def read_gbq( self, query_or_table: str, @@ -637,15 +657,17 @@ def _read_gbq_query( index_cols = _to_index_cols(index_col) - filters = list(filters) - if len(filters) != 0 or max_results is not None: + filters_copy1, filters_copy2 = itertools.tee(filters) + has_filters = len(list(filters_copy1)) != 0 + filters = typing.cast(third_party_pandas_gbq.FiltersType, filters_copy2) + if has_filters or max_results is not None: # TODO(b/338111344): If we are running a query anyway, we might as # well generate ROW_NUMBER() at the same time. all_columns = itertools.chain(index_cols, columns) if columns else () query = bf_io_bigquery.to_query( query, all_columns, - bf_io_bigquery.compile_filters(filters) if filters else None, + bf_io_bigquery.compile_filters(filters) if has_filters else None, max_results=max_results, # We're executing the query, so we don't need time travel for # determinism. @@ -728,6 +750,38 @@ def read_gbq_table( filters=filters, ) + def read_gbq_table_streaming( + self, table: str + ) -> streaming_dataframe.StreamingDataFrame: + """Turn a BigQuery table into a StreamingDataFrame. + + Note: The bigframes.streaming module is a preview feature, and subject to change. 
+ + **Examples:** + + >>> import bigframes.streaming as bst + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") + """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + import bigframes.streaming.dataframe as streaming_dataframe + + df = self._read_gbq_table( + table, + api_name="read_gbq_table_steaming", + enable_snapshot=False, + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + return streaming_dataframe.StreamingDataFrame._from_table_df(df) + def _read_gbq_table( self, query: str, @@ -738,6 +792,7 @@ def _read_gbq_table( api_name: str, use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), + enable_snapshot: bool = True, ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe @@ -755,7 +810,7 @@ def _read_gbq_table( ) columns = list(columns) - filters = list(filters) + filters = typing.cast(list, list(filters)) # --------------------------------- # Fetch table metadata and validate @@ -856,7 +911,7 @@ def _read_gbq_table( else (*columns, *[col for col in index_cols if col not in columns]) ) - supports_snapshot = bf_read_gbq_table.validate_table( + enable_snapshot = enable_snapshot and bf_read_gbq_table.validate_table( self.bqclient, table_ref, all_columns, time_travel_timestamp, filter_str ) @@ -874,6 +929,8 @@ def _read_gbq_table( table=table, index_cols=index_cols, api_name=api_name, + # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique + metadata_only=not self._strictly_ordered, ) schema = schemata.ArraySchema.from_bq_table(table) if columns: @@ -882,7 +939,7 @@ def _read_gbq_table( table, schema=schema, predicate=filter_str, - at_time=time_travel_timestamp if supports_snapshot else None, + at_time=time_travel_timestamp if enable_snapshot else None, primary_key=index_cols if is_index_unique else (), session=self, ) @@ -1606,14 +1663,26 @@ def remote_function( `True` by default, which will result in reusing an existing remote function and corresponding cloud function (if any) that was previously created for the same udf. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. Setting it to `False` would force creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. + Explicit name of the persisted BigQuery remote function. Use it + with caution, because more than one users working in the same + project and dataset could overwrite each other's remote + functions if they use the same persistent name. 
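# Sketch of a named, persistent remote function as described above; the dataset
# and function name are placeholder assumptions. Because an explicit `name` is
# given, session clean-up leaves the function (and its cloud function) in place.
import bigframes.pandas as bpd

@bpd.remote_function(dataset="my_dataset", name="persistent_str_len", reuse=True)
def persistent_str_len(s: str) -> int:
    return len(s)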
When an explicit + name is provided, any session specific clean up ( + ``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. packages (str[], Optional): Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be of the form @@ -1689,7 +1758,7 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ - return bigframes_rf( + return self._remote_function_session.remote_function( input_types, output_type, session=self, @@ -1769,7 +1838,7 @@ def read_gbq_function( not including the `bigframes_cloud_function` property. """ - return bigframes_rgf( + return bigframes_rf.read_gbq_function( function_name=function_name, session=self, ) @@ -1833,6 +1902,8 @@ def _start_query( Starts BigQuery query job and waits for results. """ job_config = self._prepare_query_job_config(job_config) + if not self._strictly_ordered: + job_config.labels = {"bigframes-mode": "unordered"} try: return bigframes.session._io.bigquery.start_query_with_client( self, @@ -1875,10 +1946,8 @@ def _cache_with_cluster_cols( """Executes the query and uses the resulting table to rewrite future executions.""" # TODO: Use this for all executions? Problem is that caching materializes extra # ordering columns - # TODO: May want to support some partial ordering info even for non-strict ordering mode - keep_order_info = self._strictly_ordered - sql, ordering_info = bigframes.core.compile.compile_raw( + sql, ordering_info = self._compiler.compile_raw( self._with_cached_executions(array_value.node) ) tmp_table = self._sql_to_temp_table( @@ -1886,7 +1955,7 @@ def _cache_with_cluster_cols( ) cached_replacement = array_value.as_cached( cache_table=self.bqclient.get_table(tmp_table), - ordering=ordering_info if keep_order_info else None, + ordering=ordering_info, ).node self._cached_executions[array_value.node] = cached_replacement @@ -1899,7 +1968,7 @@ def _cache_with_offsets(self, array_value: core.ArrayValue): "Caching with offsets only supported in strictly ordered mode." 
) offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") - sql = bigframes.core.compile.compile_unordered( + sql = self._compiler.compile_unordered( self._with_cached_executions( array_value.promote_offsets(offset_column).node ) @@ -2005,7 +2074,7 @@ def _peek( """A 'peek' efficiently accesses a small number of rows in the dataframe.""" if not tree_properties.peekable(self._with_cached_executions(array_value.node)): warnings.warn("Peeking this value cannot be done efficiently.") - sql = bigframes.core.compile.compile_peek( + sql = self._compiler.compile_peek( self._with_cached_executions(array_value.node), n_rows ) @@ -2021,17 +2090,20 @@ def _to_sql( offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, ordered: bool = False, + enable_cache: bool = True, ) -> str: if offset_column: array_value = array_value.promote_offsets(offset_column) - node_w_cached = self._with_cached_executions(array_value.node) + node = ( + self._with_cached_executions(array_value.node) + if enable_cache + else array_value.node + ) if ordered: - return bigframes.core.compile.compile_ordered( - node_w_cached, col_id_overrides=col_id_overrides + return self._compiler.compile_ordered( + node, col_id_overrides=col_id_overrides ) - return bigframes.core.compile.compile_unordered( - node_w_cached, col_id_overrides=col_id_overrides - ) + return self._compiler.compile_unordered(node, col_id_overrides=col_id_overrides) def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 879a8ba44c..03b26f9460 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -152,6 +152,7 @@ def are_index_cols_unique( table: bigquery.table.Table, index_cols: List[str], api_name: str, + metadata_only: bool = False, ) -> bool: if len(index_cols) == 0: return False @@ -161,6 +162,9 @@ def are_index_cols_unique( if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols): return True + if metadata_only: + # Sometimes not worth scanning data to check uniqueness + return False # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring # table_expression only selects just index_cols. is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) diff --git a/bigframes/streaming/__init__.py b/bigframes/streaming/__init__.py index 0b6fd18561..66f345f0ab 100644 --- a/bigframes/streaming/__init__.py +++ b/bigframes/streaming/__init__.py @@ -12,253 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
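The rewritten bigframes.streaming module below replaces the module-level to_bigtable/to_pubsub helpers with a thin read_gbq_table wrapper around Session.read_gbq_table_streaming. A minimal usage sketch, assuming an already-configured project (the project id is a placeholder):

import bigframes.pandas as bpd
import bigframes.streaming as bst

bpd.options.bigquery.project = "my-project"  # placeholder project id

# Emits a PreviewWarning: the streaming module is still a preview feature.
sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins")

# Simple projections and filters keep returning StreamingDataFrames.
sdf = sdf[["species", "island", "body_mass_g"]]
sdf = sdf[sdf["body_mass_g"] < 4000]

# The SQL is compiled with enable_cache=False, so no cached temp tables
# leak into the continuous-query text.
print(sdf.sql)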
-"""Module for bigquery continuous queries""" +import inspect -import json -from typing import Optional -import warnings +import bigframes.core.global_session as global_session +import bigframes.pandas as bpd +import bigframes.session +import bigframes.streaming.dataframe as streaming_dataframe -from google.cloud import bigquery -import bigframes - - -def to_bigtable( - query: str, - *, - instance: str, - table: str, - service_account_email: Optional[str] = None, - session: Optional[bigframes.Session] = None, - app_profile: Optional[str] = None, - truncate: bool = False, - overwrite: bool = False, - auto_create_column_families: bool = False, - bigtable_options: Optional[dict] = None, - job_id: Optional[str] = None, - job_id_prefix: Optional[str] = None, -) -> bigquery.QueryJob: - """Launches a BigQuery continuous query and returns a - QueryJob object for some management functionality. - - This method requires an existing bigtable preconfigured to - accept the continuous query export statement. For instructions - on export to bigtable, see - https://cloud.google.com/bigquery/docs/export-to-bigtable. - - Args: - query (str): - The sql statement to execute as a continuous function. - For example: "SELECT * FROM dataset.table" - This will be wrapped in an EXPORT DATA statement to - launch a continuous query writing to bigtable. - instance (str): - The name of the bigtable instance to export to. - table (str): - The name of the bigtable table to export to. - service_account_email (str): - Full name of the service account to run the continuous query. - Example: accountname@projectname.gserviceaccounts.com - If not provided, the user account will be used, but this - limits the lifetime of the continuous query. - session (bigframes.Session, default None): - The session object to use for the query. This determines - the project id and location of the query. If None, will - default to the bigframes global session. - app_profile (str, default None): - The bigtable app profile to export to. If None, no app - profile will be used. - truncate (bool, default False): - The export truncate option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - overwrite (bool, default False): - The export overwrite option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - auto_create_column_families (bool, default False): - The auto_create_column_families option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - bigtable_options (dict, default None): - The bigtable options dict, which will be converted to JSON - using json.dumps, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - If None, no bigtable_options parameter will be passed. 
- job_id (str, default None): - If specified, replace the default job id for the query, - see job_id parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - job_id_prefix (str, default None): - If specified, a job id prefix for the query, see - job_id_prefix parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - - Returns: - google.cloud.bigquery.QueryJob: - See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob - The ongoing query job can be managed using this object. - For example, the job can be cancelled or its error status - can be examined. - """ - warnings.warn( - "The bigframes.streaming module is a preview feature, and subject to change.", - stacklevel=1, - category=bigframes.exceptions.PreviewWarning, - ) - - # get default client if not passed - if session is None: - session = bigframes.get_global_session() - bq_client = session.bqclient - - # build export string from parameters - project = bq_client.project - - app_profile_url_string = "" - if app_profile is not None: - app_profile_url_string = f"appProfiles/{app_profile}/" - - bigtable_options_parameter_string = "" - if bigtable_options is not None: - bigtable_options_parameter_string = ( - 'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n' - ) - - sql = ( - "EXPORT DATA\n" - "OPTIONS (\n" - "format = 'CLOUD_BIGTABLE',\n" - f"{bigtable_options_parameter_string}" - f"truncate = {str(truncate)},\n" - f"overwrite = {str(overwrite)},\n" - f"auto_create_column_families = {str(auto_create_column_families)},\n" - f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n' - ")\n" - "AS (\n" - f"{query});" - ) - - # override continuous http parameter - job_config = bigquery.job.QueryJobConfig() - - job_config_dict: dict = {"query": {"continuous": True}} - if service_account_email is not None: - job_config_dict["query"]["connectionProperties"] = { - "key": "service_account", - "value": service_account_email, - } - job_config_filled = job_config.from_api_repr(job_config_dict) - job_config_filled.labels = {"bigframes-api": "streaming_to_bigtable"} - - # begin the query job - query_job = bq_client.query( - sql, - job_config=job_config_filled, # type:ignore - # typing error above is in bq client library - # (should accept abstract job_config, only takes concrete) - job_id=job_id, - job_id_prefix=job_id_prefix, - ) - - # return the query job to the user for lifetime management - return query_job - - -def to_pubsub( - query: str, - *, - topic: str, - service_account_email: str, - session: Optional[bigframes.Session] = None, - job_id: Optional[str] = None, - job_id_prefix: Optional[str] = None, -) -> bigquery.QueryJob: - """Launches a BigQuery continuous query and returns a - QueryJob object for some management functionality. - - This method requires an existing pubsub topic. For instructions - on creating a pubsub topic, see - https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en - - Note that a service account is a requirement for continuous queries - exporting to pubsub. - - Args: - query (str): - The sql statement to execute as a continuous function. 
- For example: "SELECT * FROM dataset.table" - This will be wrapped in an EXPORT DATA statement to - launch a continuous query writing to pubsub. - topic (str): - The name of the pubsub topic to export to. - For example: "taxi-rides" - service_account_email (str): - Full name of the service account to run the continuous query. - Example: accountname@projectname.gserviceaccounts.com - session (bigframes.Session, default None): - The session object to use for the query. This determines - the project id and location of the query. If None, will - default to the bigframes global session. - job_id (str, default None): - If specified, replace the default job id for the query, - see job_id parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - job_id_prefix (str, default None): - If specified, a job id prefix for the query, see - job_id_prefix parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - - Returns: - google.cloud.bigquery.QueryJob: - See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob - The ongoing query job can be managed using this object. - For example, the job can be cancelled or its error status - can be examined. - """ - warnings.warn( - "The bigframes.streaming module is a preview feature, and subject to change.", - stacklevel=1, - category=bigframes.exceptions.PreviewWarning, +def read_gbq_table(table: str) -> streaming_dataframe.StreamingDataFrame: + bpd._set_default_session_location_if_possible(table) + return global_session.with_default_session( + bigframes.session.Session.read_gbq_table_streaming, table ) - # get default client if not passed - if session is None: - session = bigframes.get_global_session() - bq_client = session.bqclient - - # build export string from parameters - sql = ( - "EXPORT DATA\n" - "OPTIONS (\n" - "format = 'CLOUD_PUBSUB',\n" - f'uri = "https://pubsub.googleapis.com/projects/{bq_client.project}/topics/{topic}"\n' - ")\n" - "AS (\n" - f"{query});" - ) - # override continuous http parameter - job_config = bigquery.job.QueryJobConfig() - job_config_filled = job_config.from_api_repr( - { - "query": { - "continuous": True, - "connectionProperties": { - "key": "service_account", - "value": service_account_email, - }, - } - } - ) - job_config_filled.labels = {"bigframes-api": "streaming_to_pubsub"} - - # begin the query job - query_job = bq_client.query( - sql, - job_config=job_config_filled, # type:ignore - # typing error above is in bq client library - # (should accept abstract job_config, only takes concrete) - job_id=job_id, - job_id_prefix=job_id_prefix, - ) +read_gbq_table.__doc__ = inspect.getdoc( + bigframes.session.Session.read_gbq_table_streaming +) - # return the query job to the user for lifetime management - return query_job +StreamingDataFrame = streaming_dataframe.StreamingDataFrame diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py new file mode 100644 index 0000000000..64a4898c57 --- /dev/null +++ b/bigframes/streaming/dataframe.py @@ -0,0 +1,504 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module for bigquery continuous queries""" +from __future__ import annotations + +import functools +import inspect +import json +from typing import Optional +import warnings + +from google.cloud import bigquery + +import bigframes +from bigframes import dataframe +from bigframes.core import log_adapter + + +def _return_type_wrapper(method, cls): + @functools.wraps(method) + def wrapper(*args, **kwargs): + return_value = method(*args, **kwargs) + if isinstance(return_value, dataframe.DataFrame): + return cls._from_table_df(return_value) + return return_value + + return wrapper + + +def _curate_df_doc(doc: Optional[str]): + if not doc: + return doc + + # Remove examples, some are not applicable to StreamingDataFrame + doc = doc[: doc.find("**Examples:**")] + doc[doc.find("Args:") :] + + doc = doc.replace("dataframe.DataFrame", "streaming.StreamingDataFrame") + doc = doc.replace(" DataFrame", " StreamingDataFrame") + + return doc + + +class StreamingBase: + sql: str + _session: bigframes.Session + + def to_bigtable( + self, + *, + instance: str, + table: str, + service_account_email: Optional[str] = None, + app_profile: Optional[str] = None, + truncate: bool = False, + overwrite: bool = False, + auto_create_column_families: bool = False, + bigtable_options: Optional[dict] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, + ) -> bigquery.QueryJob: + """ + Export the StreamingDataFrame as a continue job and returns a + QueryJob object for some management functionality. + + This method requires an existing bigtable preconfigured to + accept the continuous query export statement. For instructions + on export to bigtable, see + https://cloud.google.com/bigquery/docs/export-to-bigtable. + + Args: + instance (str): + The name of the bigtable instance to export to. + table (str): + The name of the bigtable table to export to. + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + If not provided, the user account will be used, but this + limits the lifetime of the continuous query. + app_profile (str, default None): + The bigtable app profile to export to. If None, no app + profile will be used. 
+ truncate (bool, default False): + The export truncate option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + overwrite (bool, default False): + The export overwrite option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + auto_create_column_families (bool, default False): + The auto_create_column_families option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + bigtable_options (dict, default None): + The bigtable options dict, which will be converted to JSON + using json.dumps, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + If None, no bigtable_options parameter will be passed. + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + return _to_bigtable( + self.sql, + instance=instance, + table=table, + service_account_email=service_account_email, + session=self._session, + app_profile=app_profile, + truncate=truncate, + overwrite=overwrite, + auto_create_column_families=auto_create_column_families, + bigtable_options=bigtable_options, + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + def to_pubsub( + self, + *, + topic: str, + service_account_email: str, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, + ) -> bigquery.QueryJob: + """ + Export the StreamingDataFrame as a continue job and returns a + QueryJob object for some management functionality. + + This method requires an existing pubsub topic. For instructions + on creating a pubsub topic, see + https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en + + Note that a service account is a requirement for continuous queries + exporting to pubsub. + + Args: + topic (str): + The name of the pubsub topic to export to. + For example: "taxi-rides" + service_account_email (str): + Full name of the service account to run the continuous query. 
+ Example: accountname@projectname.gserviceaccounts.com + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + return _to_pubsub( + self.sql, + topic=topic, + service_account_email=service_account_email, + session=self._session, + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + +@log_adapter.class_logger +class StreamingDataFrame(StreamingBase): + __doc__ = _curate_df_doc(dataframe.DataFrame.__doc__) + + # Private constructor + _create_key = object() + + def __init__(self, df: dataframe.DataFrame, *, create_key=0): + if create_key is not StreamingDataFrame._create_key: + raise ValueError( + "StreamingDataFrame class shouldn't be created through constructor. Call bigframes.Session.read_gbq_table_streaming method to create." + ) + self._df = df + self._df._disable_cache_override = True + + @classmethod + def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: + return cls(df, create_key=cls._create_key) + + def __getitem__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( + *args, **kwargs + ) + + __getitem__.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame.__getitem__) + ) + + def __setitem__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__setitem__, StreamingDataFrame)( + *args, **kwargs + ) + + __setitem__.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame.__setitem__) + ) + + def rename(self, *args, **kwargs): + return _return_type_wrapper(self._df.rename, StreamingDataFrame)( + *args, **kwargs + ) + + rename.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.rename)) + + def __repr__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__repr__, StreamingDataFrame)( + *args, **kwargs + ) + + __repr__.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.__repr__)) + + def _repr_html_(self, *args, **kwargs): + return _return_type_wrapper(self._df._repr_html_, StreamingDataFrame)( + *args, **kwargs + ) + + _repr_html_.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame._repr_html_) + ) + + @property + def sql(self): + sql_str, _, _ = self._df._to_sql_query(include_index=False, enable_cache=False) + return sql_str + + sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + + @property + def _session(self): + return self._df._session + + _session.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame._session)) + + +def _to_bigtable( + query: str, + *, + instance: str, + table: str, + service_account_email: Optional[str] = None, + session: Optional[bigframes.Session] = None, + app_profile: Optional[str] = None, + truncate: bool = False, + overwrite: bool = False, + auto_create_column_families: bool = 
False, + bigtable_options: Optional[dict] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, +) -> bigquery.QueryJob: + """Launches a BigQuery continuous query and returns a + QueryJob object for some management functionality. + + This method requires an existing bigtable preconfigured to + accept the continuous query export statement. For instructions + on export to bigtable, see + https://cloud.google.com/bigquery/docs/export-to-bigtable. + + Args: + query (str): + The sql statement to execute as a continuous function. + For example: "SELECT * FROM dataset.table" + This will be wrapped in an EXPORT DATA statement to + launch a continuous query writing to bigtable. + instance (str): + The name of the bigtable instance to export to. + table (str): + The name of the bigtable table to export to. + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + If not provided, the user account will be used, but this + limits the lifetime of the continuous query. + session (bigframes.Session, default None): + The session object to use for the query. This determines + the project id and location of the query. If None, will + default to the bigframes global session. + app_profile (str, default None): + The bigtable app profile to export to. If None, no app + profile will be used. + truncate (bool, default False): + The export truncate option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + overwrite (bool, default False): + The export overwrite option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + auto_create_column_families (bool, default False): + The auto_create_column_families option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + bigtable_options (dict, default None): + The bigtable options dict, which will be converted to JSON + using json.dumps, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + If None, no bigtable_options parameter will be passed. + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. 
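Given a StreamingDataFrame sdf (for example from bst.read_gbq_table), a hedged sketch of launching and managing the continuous-query exports described above; the Bigtable instance, table, Pub/Sub topic, and service account are placeholders:

from google.cloud import bigquery  # type of the returned job handle

bigtable_job: bigquery.QueryJob = sdf.to_bigtable(
    instance="my-bigtable-instance",   # placeholder instance
    table="penguins",                  # placeholder Bigtable table
    service_account_email="streaming-sa@my-project.iam.gserviceaccount.com",  # placeholder
    app_profile=None,
    truncate=True,
    overwrite=True,
    auto_create_column_families=True,
    bigtable_options={},
    job_id=None,
    job_id_prefix="test_streaming_",
)

pubsub_job = sdf.to_pubsub(
    topic="penguin-updates",           # placeholder Pub/Sub topic
    service_account_email="streaming-sa@my-project.iam.gserviceaccount.com",  # placeholder
    job_id_prefix="test_streaming_",
)

# Continuous queries run until cancelled; manage them through the QueryJob handle.
print(bigtable_job.running(), pubsub_job.running())
bigtable_job.cancel()
pubsub_job.cancel()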
+ """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # get default client if not passed + if session is None: + session = bigframes.get_global_session() + bq_client = session.bqclient + + # build export string from parameters + project = bq_client.project + + app_profile_url_string = "" + if app_profile is not None: + app_profile_url_string = f"appProfiles/{app_profile}/" + + bigtable_options_parameter_string = "" + if bigtable_options is not None: + bigtable_options_parameter_string = ( + 'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n' + ) + + sql = ( + "EXPORT DATA\n" + "OPTIONS (\n" + "format = 'CLOUD_BIGTABLE',\n" + f"{bigtable_options_parameter_string}" + f"truncate = {str(truncate)},\n" + f"overwrite = {str(overwrite)},\n" + f"auto_create_column_families = {str(auto_create_column_families)},\n" + f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n' + ")\n" + "AS (\n" + f"{query});" + ) + + # override continuous http parameter + job_config = bigquery.job.QueryJobConfig() + + job_config_dict: dict = {"query": {"continuous": True}} + if service_account_email is not None: + job_config_dict["query"]["connectionProperties"] = { + "key": "service_account", + "value": service_account_email, + } + job_config_filled = job_config.from_api_repr(job_config_dict) + job_config_filled.labels = {"bigframes-api": "streaming_to_bigtable"} + + # begin the query job + query_job = bq_client.query( + sql, + job_config=job_config_filled, # type:ignore + # typing error above is in bq client library + # (should accept abstract job_config, only takes concrete) + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + # return the query job to the user for lifetime management + return query_job + + +def _to_pubsub( + query: str, + *, + topic: str, + service_account_email: str, + session: Optional[bigframes.Session] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, +) -> bigquery.QueryJob: + """Launches a BigQuery continuous query and returns a + QueryJob object for some management functionality. + + This method requires an existing pubsub topic. For instructions + on creating a pubsub topic, see + https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en + + Note that a service account is a requirement for continuous queries + exporting to pubsub. + + Args: + query (str): + The sql statement to execute as a continuous function. + For example: "SELECT * FROM dataset.table" + This will be wrapped in an EXPORT DATA statement to + launch a continuous query writing to pubsub. + topic (str): + The name of the pubsub topic to export to. + For example: "taxi-rides" + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + session (bigframes.Session, default None): + The session object to use for the query. This determines + the project id and location of the query. If None, will + default to the bigframes global session. 
+ job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # get default client if not passed + if session is None: + session = bigframes.get_global_session() + bq_client = session.bqclient + + # build export string from parameters + sql = ( + "EXPORT DATA\n" + "OPTIONS (\n" + "format = 'CLOUD_PUBSUB',\n" + f'uri = "https://pubsub.googleapis.com/projects/{bq_client.project}/topics/{topic}"\n' + ")\n" + "AS (\n" + f"{query});" + ) + + # override continuous http parameter + job_config = bigquery.job.QueryJobConfig() + job_config_filled = job_config.from_api_repr( + { + "query": { + "continuous": True, + "connectionProperties": { + "key": "service_account", + "value": service_account_email, + }, + } + } + ) + job_config_filled.labels = {"bigframes-api": "streaming_to_pubsub"} + + # begin the query job + query_job = bq_client.query( + sql, + job_config=job_config_filled, # type:ignore + # typing error above is in bq client library + # (should accept abstract job_config, only takes concrete) + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + # return the query job to the user for lifetime management + return query_job diff --git a/bigframes/version.py b/bigframes/version.py index 1186811c97..29cf036f42 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.11.1" +__version__ = "1.12.0" diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb index 735e18d94e..9edb174f18 100644 --- a/notebooks/dataframes/integrations.ipynb +++ b/notebooks/dataframes/integrations.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,11 +30,47 @@ "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attributing requests initiated by BigQuery DataFrames\n", + "\n", + "Partners are required to attribute API calls to BigQuery and other Google APIs. 
Where possible, this should be done via the User-Agent string, but can also be done via job labels if your integration doesn't initialize the BigQuery DataFrames session.\n", + "\n", + "### Setting the User-Agent\n", + "\n", + "Set [`bpd.options.bigquery.application_name`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes._config.bigquery_options.BigQueryOptions#bigframes__config_bigquery_options_BigQueryOptions_application_name) to a compliant string. Reach out to your Google Partner Engineering team contact for further instructions." + ] + }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "# Set this to the string informed by your Google Partner Engineering team contact.\n", + "# Note: This can only be set once per session, so is most appropriate for partners\n", + "# who provide a Python + BigQuery DataFrames environment to their customers.\n", + "bpd.options.bigquery.application_name = \"notebook-samples/1.0.0 (GPN:notebook-samples)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/swast/src/bigframes-2/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd\n", "\n", @@ -47,6 +83,40 @@ "}).set_index(\"index\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting the job label\n", + "\n", + "If your application works with customer-created BigQuery DataFrames objects, you might not be able to set the user-agent header because the session has already started (watch https://github.com/googleapis/python-bigquery-dataframes/issues/833 for updates on this limitation). Instead, attach a label to the jobs your application initiates, such as if you are performing `to_gbq()`on an existing DataFrame, as described below.\n", + "\n", + "Use `bpd.option_context()` so that the labels are only set during the operations your application performs." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job eb7f3bbe-dda9-4d2f-b195-21de862d7055 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with bpd.option_context(\"compute.extra_query_labels\", {\"application-name\": \"notebook-samples\"}):\n", + " table_id = df.to_gbq()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -58,13 +128,13 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job" + "Query job 4ad50c3c-91d0-4fef-91f6-0a2c5a30c38f is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -76,10 +146,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_9a045ff143db4f8ab2018994287020f3'" ] }, - "execution_count": 37, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -102,13 +172,13 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. Open Job" + "Query job 9e7d4b1a-d7fc-4599-bab4-40062c83288e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -122,11 +192,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 2 3 0.2500 c\n", - "1 4 5 0.0625 e\n", + "0 3 4 -0.1250 d\n", + "1 1 2 -0.5000 b\n", "2 0 1 1.0000 a\n", - "3 1 2 -0.5000 b\n", - "4 3 4 -0.1250 d\n" + "3 4 5 0.0625 e\n", + "4 2 3 0.2500 c\n" ] } ], @@ -168,13 +238,13 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. Open Job" + "Query job 62db313e-7632-4dbb-8eff-5035d0e6c27e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -188,11 +258,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 4 5 0.0625 e\n", - "1 0 1 1.0000 a\n", - "2 2 3 0.2500 c\n", - "3 3 4 -0.1250 d\n", - "4 1 2 -0.5000 b\n" + "0 1 2 -0.5000 b\n", + "1 3 4 -0.1250 d\n", + "2 0 1 1.0000 a\n", + "3 4 5 0.0625 e\n", + "4 2 3 0.2500 c\n" ] } ], @@ -265,13 +335,13 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. Open Job" + "Query job 1cbd8898-97c7-419e-87af-b72a9432afb6 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -283,10 +353,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_58b9b6fc0c3349bf8d3dd6fb29ab5322'" ] }, - "execution_count": 40, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -308,13 +378,13 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job" + "Query job 40e54aa9-fad7-47c3-9bec-144f6c7106d8 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -326,10 +396,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_cdb4f54063b0417a8309c462b70239fa'" ] }, - "execution_count": 41, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -357,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -366,7 +436,7 @@ "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" ] }, - "execution_count": 42, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -381,13 +451,13 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job" + "Query job 73cf9e04-d5fa-4765-827c-665f0e6b9e00 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -399,7 +469,7 @@ { "data": { "text/html": [ - "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job" + "Query job b177eb37-197f-4732-8978-c74cccb36e01 is DONE. 270 Bytes processed. Open Job" ], "text/plain": [ "" @@ -523,7 +593,7 @@ "[10 rows x 3 columns]" ] }, - "execution_count": 43, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +697,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb index 2873527449..95aa314bb0 100644 --- a/notebooks/regression/sklearn_linear_regression.ipynb +++ b/notebooks/regression/sklearn_linear_regression.ipynb @@ -857,7 +857,7 @@ "from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder\n", "\n", "preprocessing = ColumnTransformer([\n", - " (\"onehot\", OneHotEncoder(), [\"island\", \"species\", \"sex\"]),\n", + " (\"onehot\", OneHotEncoder(), [\"island\", \"sex\"]),\n", " (\"scaler\", StandardScaler(), [\"culmen_depth_mm\", \"culmen_length_mm\", \"flipper_length_mm\"]),\n", "])\n", "\n", diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index 3d7ae3e8c7..9317e4b8fe 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 28, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "metadata": { "id": "Y6QAttCqqMM0" }, @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -55,14 +55,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shobs/code/bigframes/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3550: UserWarning: Reading cached table from 2024-06-28 02:49:31.716256+00:00 to avoid incompatibilies with previous reads of this table. 
To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3550: UserWarning: Reading cached table from 2024-07-24 08:01:12.491984+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "data": { "text/html": [ - "Query job f72cda67-2a96-4cd2-a624-591c0d540fc9 is DONE. 582.8 kB processed. Open Job" + "Query job 9d155f10-e37a-4d20-b2ff-02868ecb58f4 is DONE. 582.8 kB processed. Open Job" ], "text/plain": [ "" @@ -74,7 +74,7 @@ { "data": { "text/html": [ - "Query job 65cf6ca3-73f0-49e6-84a8-1ff79af6ec75 is DONE. 82.0 kB processed. Open Job" + "Query job 5a524e70-12dc-4116-b416-04570bbf754e is DONE. 82.0 kB processed. Open Job" ], "text/plain": [ "" @@ -111,49 +111,49 @@ " \n", " \n", " \n", - " 50\n", - " Rays\n", - " Rangers\n", - " 181\n", + " 36\n", + " Reds\n", + " Cubs\n", + " 159\n", " \n", " \n", - " 72\n", - " Phillies\n", - " Pirates\n", - " 192\n", + " 358\n", + " Dodgers\n", + " Diamondbacks\n", + " 223\n", " \n", " \n", - " 89\n", - " Mariners\n", - " Blue Jays\n", - " 183\n", + " 416\n", + " Yankees\n", + " White Sox\n", + " 216\n", " \n", " \n", - " 351\n", - " Astros\n", - " Angels\n", - " 212\n", + " 523\n", + " Rays\n", + " Athletics\n", + " 187\n", " \n", " \n", - " 382\n", - " Royals\n", - " Yankees\n", - " 259\n", + " 594\n", + " Pirates\n", + " Brewers\n", + " 169\n", " \n", " \n", "\n", "" ], "text/plain": [ - " homeTeamName awayTeamName duration_minutes\n", - "50 Rays Rangers 181\n", - "72 Phillies Pirates 192\n", - "89 Mariners Blue Jays 183\n", - "351 Astros Angels 212\n", - "382 Royals Yankees 259" + " homeTeamName awayTeamName duration_minutes\n", + "36 Reds Cubs 159\n", + "358 Dodgers Diamondbacks 223\n", + "416 Yankees White Sox 216\n", + "523 Rays Athletics 187\n", + "594 Pirates Brewers 169" ] }, - "execution_count": 30, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -215,7 +215,7 @@ { "data": { "text/html": [ - "Query job f039d478-8dc4-4b60-8eda-179955e06586 is DONE. 0 Bytes processed. Open Job" + "Query job ec8d958d-93ef-45ae-8150-6ccfa8feb89a is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -228,7 +228,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-862150459da5240a6df1ce01c59b32d8-em4ibov0' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_862150459da5240a6df1ce01c59b32d8_em4ibov0'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-e22dbecc9ec0374bda36bc23df3775b0-g8zp' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_e22dbecc9ec0374bda36bc23df3775b0_g8zp'.\n" ] } ], @@ -247,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -260,7 +260,7 @@ { "data": { "text/html": [ - "Query job 23e95831-d913-4d2b-97f6-588fc7967455 is DONE. 58.3 kB processed. Open Job" + "Query job 4b116e3e-d4d3-4eb6-9764-0a29a7c5d036 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -272,7 +272,7 @@ { "data": { "text/html": [ - "Query job bb8b3d13-a521-4d45-b4c8-5686c944a9f2 is DONE. 157.2 kB processed. Open Job" + "Query job d62ac4f0-47c9-47ae-8611-c9ecf78f20c9 is DONE. 157.2 kB processed. Open Job" ], "text/plain": [ "" @@ -284,7 +284,7 @@ { "data": { "text/html": [ - "Query job 2a4653f5-cc6b-4279-a45e-40f0f97090a7 is DONE. 98.8 kB processed. Open Job" + "Query job 5f876ebb-2d95-4c68-9d84-947e02b37bad is DONE. 98.8 kB processed. Open Job" ], "text/plain": [ "" @@ -369,7 +369,7 @@ "654 Astros Angels 143 medium" ] }, - "execution_count": 32, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -396,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 25, "metadata": { "id": "2UEmTbu4znyS" }, @@ -409,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -422,7 +422,7 @@ { "data": { "text/html": [ - "Query job 5d914fde-81ec-46eb-9219-9822f77dd9a2 is DONE. 0 Bytes processed. Open Job" + "Query job 1909a652-5735-401b-8a77-674d8539ded0 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -435,7 +435,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-f3231b74ec807496f4894218d5d40ed5-688mx7hi' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_f3231b74ec807496f4894218d5d40ed5_688mx7hi'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.\n" ] } ], @@ -454,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -467,7 +467,7 @@ { "data": { "text/html": [ - "Query job b0b39944-1e69-4185-97ba-985178ee241f is DONE. 58.3 kB processed. Open Job" + "Query job a942bdc5-6a6d-4db8-b2aa-a556197377b3 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -479,7 +479,7 @@ { "data": { "text/html": [ - "Query job 90d99515-eb5e-4bcd-bce5-292eea09770e is DONE. 147.7 kB processed. 
Open Job" + "Query job 175ae9d3-604f-495b-a167-8b06c0283bd2 is DONE. 147.7 kB processed. Open Job" ], "text/plain": [ "" @@ -491,7 +491,7 @@ { "data": { "text/html": [ - "Query job eb31d033-c871-49c5-a75e-4427e376516f is DONE. 89.3 kB processed. Open Job" + "Query job d331a785-e574-45c9-86c8-d29ddd79a4d1 is DONE. 89.3 kB processed. Open Job" ], "text/plain": [ "" @@ -576,7 +576,7 @@ "654 Astros Angels 143 M" ] }, - "execution_count": 35, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -607,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 28, "metadata": { "id": "zlQfhcW41uzM" }, @@ -618,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -631,7 +631,7 @@ { "data": { "text/html": [ - "Query job 2895676f-d15c-40fd-8cf2-3a0436291e6b is DONE. 0 Bytes processed. Open Job" + "Query job bbc0b78f-bc04-4bd5-b711-399786a51519 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -644,7 +644,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-9b20b0257558a42da610d8998022c25e-7k62x9l6' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_9b20b0257558a42da610d8998022c25e_7k62x9l6'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.\n" ] } ], @@ -659,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -672,7 +672,7 @@ { "data": { "text/html": [ - "Query job 4efda755-2f54-4477-b48a-4a424c888559 is DONE. 58.3 kB processed. Open Job" + "Query job 991b54ed-9eaa-450f-9208-3e73404bb112 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -684,7 +684,7 @@ { "data": { "text/html": [ - "Query job a8992776-c2e8-4c3e-ab75-dfc01c5de89f is DONE. 150.1 kB processed. Open Job" + "Query job 4e464a58-ac5b-42fd-91e3-92c115bdd273 is DONE. 150.1 kB processed. Open Job" ], "text/plain": [ "" @@ -696,7 +696,7 @@ { "data": { "text/html": [ - "Query job 3ea299b0-27ad-432b-8dbf-81da3aae884f is DONE. 91.7 kB processed. Open Job" + "Query job d340f55d-1511-431a-970d-a70ed4356935 is DONE. 91.7 kB processed. Open Job" ], "text/plain": [ "" @@ -781,7 +781,7 @@ "654 Astros Angels 143 3h" ] }, - "execution_count": 38, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -812,7 +812,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 31, "metadata": { "id": "0G91fWiF3pKg" }, @@ -829,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -842,7 +842,7 @@ { "data": { "text/html": [ - "Query job 411853db-bf83-4df8-af78-55b1ceb39cb1 is DONE. 0 Bytes processed. Open Job" + "Query job 10d1afa3-349b-49a8-adbd-79a8309ce77c is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -855,7 +855,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-b54aa0aa752af6a3bd6d9d529dac373b-h4lgpy4y' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_b54aa0aa752af6a3bd6d9d529dac373b_h4lgpy4y'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.\n" ] } ], @@ -870,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -883,7 +883,7 @@ { "data": { "text/html": [ - "Query job d04abfa5-e2f2-4936-a708-ed97ef429df3 is DONE. 58.3 kB processed. Open Job" + "Query job 33aff336-48d6-4caa-8cae-f459d21b180e is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -895,7 +895,7 @@ { "data": { "text/html": [ - "Query job 2fc4edf0-7a86-4532-b8fb-bd3f5d153dcb is DONE. 157.4 kB processed. Open Job" + "Query job 561e0aa7-3962-4ef3-b308-a117a0ac3a7d is DONE. 157.4 kB processed. Open Job" ], "text/plain": [ "" @@ -907,7 +907,7 @@ { "data": { "text/html": [ - "Query job f7e6e18c-70d7-4b4e-926a-03b3a1abd1fe is DONE. 99.0 kB processed. Open Job" + "Query job 759dccf8-3d88-40e1-a38a-2a2064e1d269 is DONE. 99.0 kB processed. Open Job" ], "text/plain": [ "" @@ -992,7 +992,7 @@ "654 Astros Angels 143 3 hrs" ] }, - "execution_count": 41, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1018,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1031,7 +1031,7 @@ { "data": { "text/html": [ - "Query job c674e7b7-2349-4317-8f08-8bfd9aa99785 is DONE. 0 Bytes processed. Open Job" + "Query job e2a44878-2564-44a5-8dec-b7ea2f42afd4 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1057,7 +1057,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1070,7 +1070,7 @@ { "data": { "text/html": [ - "Query job eb9384c9-de7d-4232-bdca-94b61b50ff89 is DONE. 60.5 kB processed. Open Job" + "Query job bcfab000-ca19-4633-bf0e-45e7d053f3eb is DONE. 60.5 kB processed. Open Job" ], "text/plain": [ "" @@ -1082,7 +1082,7 @@ { "data": { "text/html": [ - "Query job 11a736a5-96d1-4e62-90e2-576156131a94 is DONE. 388.3 kB processed. Open Job" + "Query job 139a6449-c07e-41ff-9aed-c6fdd633740a is DONE. 388.3 kB processed. Open Job" ], "text/plain": [ "" @@ -1094,7 +1094,7 @@ { "data": { "text/html": [ - "Query job c66a9ad1-60f7-4af1-ad7c-65e4eecbb035 is DONE. 330.0 kB processed. Open Job" + "Query job 035fa2fb-0a55-4358-bb50-3ef915f5bf54 is DONE. 330.0 kB processed. 
Open Job" ], "text/plain": [ "" @@ -1132,61 +1132,61 @@ " \n", " \n", " \n", - " 719\n", - " Astros\n", - " Angels\n", - " 180\n", - " gAAAAABmflbKCFygsmoTzFkUCObFSBJG29Ksk8HEtk82ib...\n", + " 641\n", + " American League\n", + " National League\n", + " 185\n", + " gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-...\n", " \n", " \n", - " 2295\n", - " Astros\n", + " 349\n", " Angels\n", - " 204\n", - " gAAAAABmflbKv-XzIxcNS92RO4fXYIAwA0kGWsAy-tI5fm...\n", + " Astros\n", + " 187\n", + " gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM...\n", " \n", " \n", - " 1126\n", - " Astros\n", + " 2349\n", " Angels\n", - " 176\n", - " gAAAAABmflbJdjgpqnfvmklU7Zg3NJUqlTMYMs44dLEkwg...\n", + " Astros\n", + " 160\n", + " gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF...\n", " \n", " \n", - " 294\n", - " Astros\n", + " 557\n", " Angels\n", - " 189\n", - " gAAAAABmflbKmfBh4P3FnwyiIpVFek9TzF4GzwP_5rQmkv...\n", + " Astros\n", + " 166\n", + " gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg...\n", " \n", " \n", - " 351\n", - " Astros\n", + " 220\n", " Angels\n", - " 212\n", - " gAAAAABmflbJ_mzqao9i7BtoYlMpb6y3bV3x7-cYuWGxsT...\n", + " Astros\n", + " 162\n", + " gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " homeTeamName awayTeamName duration_minutes \\\n", - "719 Astros Angels 180 \n", - "2295 Astros Angels 204 \n", - "1126 Astros Angels 176 \n", - "294 Astros Angels 189 \n", - "351 Astros Angels 212 \n", + " homeTeamName awayTeamName duration_minutes \\\n", + "641 American League National League 185 \n", + "349 Angels Astros 187 \n", + "2349 Angels Astros 160 \n", + "557 Angels Astros 166 \n", + "220 Angels Astros 162 \n", "\n", " homeTeamNameRedacted \n", - "719 gAAAAABmflbKCFygsmoTzFkUCObFSBJG29Ksk8HEtk82ib... \n", - "2295 gAAAAABmflbKv-XzIxcNS92RO4fXYIAwA0kGWsAy-tI5fm... \n", - "1126 gAAAAABmflbJdjgpqnfvmklU7Zg3NJUqlTMYMs44dLEkwg... \n", - "294 gAAAAABmflbKmfBh4P3FnwyiIpVFek9TzF4GzwP_5rQmkv... \n", - "351 gAAAAABmflbJ_mzqao9i7BtoYlMpb6y3bV3x7-cYuWGxsT... " + "641 gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-... \n", + "349 gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM... \n", + "2349 gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF... \n", + "557 gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg... \n", + "220 gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z... " ] }, - "execution_count": 43, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1211,7 +1211,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1221,13 +1221,13 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 21b054a9-8fb2-418f-a17b-effdf5aba9b5 is DONE. 0 Bytes processed. Open Job" + "Query job af73ab2d-8d88-4cbe-863f-d35e48af84e1 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -1240,7 +1240,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-0879f72acd9b8ede460b69c5a8cc0dcb-edxlst27' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_0879f72acd9b8ede460b69c5a8cc0dcb_edxlst27'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.\n" ] } ], @@ -1255,13 +1255,13 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job d67b7cb9-9813-4863-99d1-01cf45ab4949 is DONE. 58.3 kB processed. Open Job" + "Query job 0a9ac329-619d-4303-8dbd-176a576d4ce8 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -1273,7 +1273,7 @@ { "data": { "text/html": [ - "Query job 579ba853-a7b8-49df-9539-bf22f08d2370 is DONE. 162.2 kB processed. Open Job" + "Query job 456bb9b4-0576-4c04-b707-4a04496aa538 is DONE. 162.2 kB processed. Open Job" ], "text/plain": [ "" @@ -1285,7 +1285,7 @@ { "data": { "text/html": [ - "Query job 72f9eb5d-1c1a-4ce8-8f2f-1f5a8f7cec99 is DONE. 103.9 kB processed. Open Job" + "Query job 37f59939-5d2c-4fb1-839b-282ae3702d3d is DONE. 103.9 kB processed. Open Job" ], "text/plain": [ "" @@ -1370,7 +1370,7 @@ "654 Astros Angels 143 2 hours" ] }, - "execution_count": 46, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/streaming/streaming_dataframe.ipynb b/notebooks/streaming/streaming_dataframe.ipynb new file mode 100644 index 0000000000..a2da30720d --- /dev/null +++ b/notebooks/streaming/streaming_dataframe.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BigFrames StreamingDataFrame\n", + "bigframes.streaming.StreamingDataFrame is a special DataFrame type that allows simple operations and can create steaming jobs to BigTable and PubSub.\n", + "\n", + "In this notebook, we will:\n", + "* Create a StreamingDataFrame from a BigQuery table\n", + "* Do some opeartions like select, filter and preview the content\n", + "* Create and manage streaming jobs to both BigTable and Pubsub" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.streaming as bst" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "bigframes.options._bigquery_options.project = \"bigframes-load-testing\"\n", + "job_id_prefix = \"test_streaming_\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create, select, filter and preview\n", + "Create the StreamingDataFrame from a BigQuery table, select certain columns, filter rows and preview the output" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. 
Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "sdf = bst.read_gbq_table(\"birds.penguins_bigtable_streaming\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "Query job d57200dd-e6f1-42c7-876b-7f4a54994ae6 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 1decce4a-eb32-49f4-8e47-7bda0220037a is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[HTML table preview: a 25-row sample of the filtered StreamingDataFrame with columns species, rowkey and body_mass_g (25 rows × 3 columns); the same rows appear in the text/plain output below]
[165 rows x 3 columns in total]" + ], + "text/plain": [ + " species rowkey body_mass_g\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 2975\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 2700\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3900\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3775\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3350\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3200\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3700\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3800\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3950\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3350\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3100\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3750\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3400\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + "...\n", + "\n", + "[165 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf = sdf[[\"species\", \"island\", \"body_mass_g\"]]\n", + "sdf = sdf[sdf[\"body_mass_g\"] < 4000]\n", + "# BigTable needs a rowkey column\n", + "sdf = sdf.rename(columns={\"island\": \"rowkey\"})\n", + "print(type(sdf))\n", + "sdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BigTable\n", + "Create BigTable streaming job" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:338: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "job = sdf.to_bigtable(instance=\"streaming-testing-instance\",\n", + " table=\"garrettwu-no-col-family\",\n", + " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " app_profile=None,\n", + " truncate=True,\n", + " overwrite=True,\n", + " auto_create_column_families=True,\n", + " bigtable_options={},\n", + " job_id=None,\n", + " job_id_prefix=job_id_prefix,)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "None\n" + ] + } + ], + "source": [ + "print(job.running())\n", + "print(job.error_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.cancel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PubSub\n", + "Create Pubsub streaming job" + ] + }, + { + "cell_type": "code", + 
"execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "sdf = sdf[[\"rowkey\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:453: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "job = sdf.to_pubsub(\n", + " topic=\"penguins\",\n", + " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " job_id=None,\n", + " job_id_prefix=job_id_prefix,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "None\n" + ] + } + ], + "source": [ + "print(job.running())\n", + "print(job.error_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.cancel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/noxfile.py b/noxfile.py index 177e0e2ab8..d69c16e69c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -429,7 +429,8 @@ def cover(session): "--show-missing", "--include=tests/unit/*", "--include=tests/system/small/*", - "--fail-under=100", + # TODO(b/353775058) resume coverage to 100 when the issue is fixed. 
+ "--fail-under=99", ) session.run("coverage", "erase") @@ -552,8 +553,6 @@ def prerelease(session: nox.sessions.Session, tests_path): already_installed.add("pyarrow") session.install( - "--extra-index-url", - "https://pypi.anaconda.org/scipy-wheels-nightly/simple", "--prefer-binary", "--pre", "--upgrade", diff --git a/samples/polars/requirements-test.txt b/samples/polars/requirements-test.txt index beca2e44d9..cbac5e3f12 100644 --- a/samples/polars/requirements-test.txt +++ b/samples/polars/requirements-test.txt @@ -1,3 +1,3 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.4.0 -pytest==8.2.0 +pytest==8.3.2 diff --git a/samples/polars/requirements.txt b/samples/polars/requirements.txt index e3f886e7e3..a1d8fbcdac 100644 --- a/samples/polars/requirements.txt +++ b/samples/polars/requirements.txt @@ -1,3 +1,3 @@ -bigframes==1.6.0 -polars==0.20.31 +bigframes==1.11.1 +polars==1.3.0 pyarrow==15.0.0 diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index beca2e44d9..cbac5e3f12 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.4.0 -pytest==8.2.0 +pytest==8.3.2 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 8fcd19bb2c..9b5da5182e 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -bigframes==1.6.0 +bigframes==1.11.1 diff --git a/scripts/create_bigtable.py b/scripts/create_bigtable.py index f81bb8a013..da40e9063d 100644 --- a/scripts/create_bigtable.py +++ b/scripts/create_bigtable.py @@ -18,6 +18,7 @@ import os import sys +from google.cloud.bigtable import column_family import google.cloud.bigtable as bigtable PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") @@ -57,8 +58,11 @@ def create_table(instance): table_id, instance, ) + max_versions_rule = column_family.MaxVersionsGCRule(1) + column_family_id = "body_mass_g" + column_families = {column_family_id: max_versions_rule} if not table.exists(): - table.create() + table.create(column_families=column_families) print(f"Created table {table_id}") diff --git a/tests/system/conftest.py b/tests/system/conftest.py index df4ff9aff0..6bd7bf9348 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -43,11 +43,15 @@ # Use this to control the number of cloud functions being deleted in a single # test session. This should help soften the spike of the number of mutations per -# minute tracked against a quota limit (default 60, increased to 120 for -# bigframes-dev project) by the Cloud Functions API -# We are running pytest with "-n 20". Let's say each session lasts about a -# minute, so we are setting a limit of 120/20 = 6 deletions per session. -MAX_NUM_FUNCTIONS_TO_DELETE_PER_SESSION = 6 +# minute tracked against the quota limit: +# Cloud Functions API -> Per project mutation requests per minute per region +# (default 60, increased to 1000 for the test projects) +# We are running pytest with "-n 20". For a rough estimation, let's say all +# parallel sessions run in parallel. So that allows 1000/20 = 50 mutations per +# minute. One session takes about 1 minute to create a remote function. This +# would allow 50-1 = 49 deletions per session. As a heuristic let's use half of +# that potential for the clean up. 
+MAX_NUM_FUNCTIONS_TO_DELETE_PER_SESSION = 25 CURRENT_DIR = pathlib.Path(__file__).parent DATA_DIR = CURRENT_DIR.parent / "data" @@ -139,9 +143,25 @@ def session() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time +@pytest.fixture(scope="session") +def session_load() -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions(location="US", project="bigframes-load-testing") + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time + + +@pytest.fixture(scope="session", params=["ordered", "unordered"]) +def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup type + + @pytest.fixture(scope="session") def unordered_session() -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", _strictly_ordered=False) + context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type @@ -467,6 +487,17 @@ def scalars_dfs( return scalars_df_index, scalars_pandas_df_index +@pytest.fixture(scope="session") +def scalars_dfs_maybe_ordered( + maybe_ordered_session, + scalars_pandas_df_index, +): + return ( + maybe_ordered_session.read_pandas(scalars_pandas_df_index), + scalars_pandas_df_index, + ) + + @pytest.fixture(scope="session") def hockey_df( hockey_table_id: str, session: bigframes.Session diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 0cc9fc5353..2f4c07fa28 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -111,6 +111,50 @@ def test_linear_regression_customized_params_fit_score( assert reloaded_model.learning_rate == 0.2 +def test_unordered_mode_regression_configure_fit_score( + unordered_session, penguins_table_id, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() + + df = unordered_session.read_gbq(penguins_table_id).dropna() + X_train = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) + + # Check score to ensure the model was fitted + result = model.score(X_train, y_train).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 + + # 
TODO(garrettwu): add tests for param warm_start. Requires a trained model. diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index ef8b9811df..303c74f1fd 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -22,13 +22,14 @@ import textwrap import google.api_core.exceptions -from google.cloud import bigquery, storage +from google.cloud import bigquery, functions_v2, storage import pandas import pytest import test_utils.prefixer import bigframes -from bigframes.functions.remote_function import get_cloud_function_name +import bigframes.functions.remote_function as bigframes_rf +import bigframes.pandas as bpd import bigframes.series from tests.system.utils import ( assert_pandas_df_equal, @@ -590,7 +591,11 @@ def add_one(x): add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) # Expected cloud function name for the unique udf - add_one_uniq_cf_name, _ = get_cloud_function_name(add_one_uniq) + package_requirements = bigframes_rf._get_updated_package_requirements() + add_one_uniq_hash = bigframes_rf._get_hash(add_one_uniq, package_requirements) + add_one_uniq_cf_name = bigframes_rf.get_cloud_function_name( + add_one_uniq_hash, session.session_id + ) # There should be no cloud function yet for the unique udf cloud_functions = list( @@ -1860,3 +1865,142 @@ def test_remote_function_gcf_memory_unsupported(session, memory_mib): @session.remote_function(reuse=False, cloud_function_memory_mib=memory_mib) def square(x: int) -> int: return x * x + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_unnamed_removed_w_session_cleanup(): + # create a clean session + session = bigframes.connect() + + # create an unnamed remote function in the session + @session.remote_function(reuse=False) + def foo(x: int) -> int: + return x + 1 + + # ensure that remote function artifacts are created + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # explicitly close the session + session.close() + + # ensure that the bq remote function is deleted + with pytest.raises(google.cloud.exceptions.NotFound): + session.bqclient.get_routine(foo.bigframes_remote_function) + + # the deletion of cloud function happens in a non-blocking way, ensure that + # it either exists in a being-deleted state, or is already deleted + try: + gcf = session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.DELETING + except google.cloud.exceptions.NotFound: + pass + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_named_perists_w_session_cleanup(): + try: + # create a clean session + session = bigframes.connect() + + # create a name for the remote function + name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() + + # create an unnamed remote function in the session + @session.remote_function(name=name) + def foo(x: int) -> int: + return x + 1 + + # ensure that remote function artifacts are created + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # explicitly close the 
session + session.close() + + # ensure that the bq remote function still exists + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + + # the deletion of cloud function happens in a non-blocking way, ensure + # that it was not deleted and still exists in active state + gcf = session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.ACTIVE + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, foo + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_clean_up_by_session_id(): + # Use a brand new session to avoid conflict with other tests + session = bigframes.Session() + session_id = session.session_id + try: + # we will create remote functions, one with explicit name and another + # without it, and later confirm that the former is deleted when the session + # is cleaned up by session id, but the latter remains + ## unnamed + @session.remote_function(reuse=False) + def foo_unnamed(x: int) -> int: + return x + 1 + + ## named + rf_name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() + + @session.remote_function(reuse=False, name=rf_name) + def foo_named(x: int) -> int: + return x + 2 + + # check that BQ remote functiosn were created with corresponding cloud + # functions + for foo in [foo_unnamed, foo_named]: + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # clean up using explicit session id + bpd.clean_up_by_session_id( + session_id, location=session._location, project=session._project + ) + + # ensure that the unnamed bq remote function is deleted along with its + # corresponding cloud function + with pytest.raises(google.cloud.exceptions.NotFound): + session.bqclient.get_routine(foo_unnamed.bigframes_remote_function) + try: + gcf = session.cloudfunctionsclient.get_function( + name=foo_unnamed.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.DELETING + except google.cloud.exceptions.NotFound: + pass + + # ensure that the named bq remote function still exists along with its + # corresponding cloud function + session.bqclient.get_routine(foo_named.bigframes_remote_function) is not None + gcf = session.cloudfunctionsclient.get_function( + name=foo_named.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.ACTIVE + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, foo_named + ) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index c7a19dc26e..2b82d0133b 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -19,6 +19,7 @@ import bigframes import bigframes.pandas as bpd +import bigframes.session._io.bigquery @pytest.mark.parametrize( @@ -93,8 +94,7 @@ def test_clean_up_by_session_id(): session_id = session.session_id # we will create two tables and confirm that they are deleted - # when the session is closed by id - + # when the session is cleaned up by id bqclient = session.bqclient dataset = session._anonymous_dataset expiration = ( @@ -110,9 +110,7 @@ def test_clean_up_by_session_id(): 
max_results=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, ) - assert any( - [(session.session_id in table.full_table_id) for table in list(tables_before)] - ) + assert any([(session.session_id in table.full_table_id) for table in tables_before]) bpd.clean_up_by_session_id( session_id, location=session._location, project=session._project @@ -125,5 +123,5 @@ def test_clean_up_by_session_id(): page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, ) assert not any( - [(session.session_id in table.full_table_id) for table in list(tables_after)] + [(session.session_id in table.full_table_id) for table in tables_after] ) diff --git a/tests/system/large/test_streaming.py b/tests/system/large/test_streaming.py index c125fde15a..391aec8533 100644 --- a/tests/system/large/test_streaming.py +++ b/tests/system/large/test_streaming.py @@ -14,17 +14,20 @@ import time +import bigframes import bigframes.streaming -def test_streaming_to_bigtable(): +def test_streaming_df_to_bigtable(session_load: bigframes.Session): # launch a continuous query job_id_prefix = "test_streaming_" - sql = """SELECT - body_mass_g, island as rowkey - FROM birds.penguins_bigtable_streaming""" - query_job = bigframes.streaming.to_bigtable( - sql, + sdf = session_load.read_gbq_table_streaming("birds.penguins_bigtable_streaming") + + sdf = sdf[["species", "island", "body_mass_g"]] + sdf = sdf[sdf["body_mass_g"] < 4000] + sdf = sdf.rename(columns={"island": "rowkey"}) + + query_job = sdf.to_bigtable( instance="streaming-testing-instance", table="table-testing", service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com", @@ -41,22 +44,22 @@ def test_streaming_to_bigtable(): # wait 100 seconds in order to ensure the query doesn't stop # (i.e. it is continuous) time.sleep(100) - assert query_job.error_result is None - assert query_job.errors is None assert query_job.running() + assert query_job.error_result is None assert str(query_job.job_id).startswith(job_id_prefix) finally: query_job.cancel() -def test_streaming_to_pubsub(): +def test_streaming_df_to_pubsub(session_load: bigframes.Session): # launch a continuous query job_id_prefix = "test_streaming_pubsub_" - sql = """SELECT - island - FROM birds.penguins_pubsub_streaming""" - query_job = bigframes.streaming.to_pubsub( - sql, + sdf = session_load.read_gbq_table_streaming("birds.penguins_bigtable_streaming") + + sdf = sdf[sdf["body_mass_g"] < 4000] + sdf = sdf[["island"]] + + query_job = sdf.to_pubsub( topic="penguins", service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com", job_id=None, @@ -67,9 +70,8 @@ def test_streaming_to_pubsub(): # wait 100 seconds in order to ensure the query doesn't stop # (i.e. 
it is continuous) time.sleep(100) - assert query_job.error_result is None - assert query_job.errors is None assert query_job.running() + assert query_job.error_result is None assert str(query_job.job_id).startswith(job_id_prefix) finally: query_job.cancel() diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index fd047b3ba6..6d22963a97 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -15,7 +15,8 @@ import pandas as pd import pytest -import bigframes.ml.llm +from bigframes.ml import llm +from tests.system import utils @pytest.fixture(scope="session") @@ -39,9 +40,7 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): @pytest.mark.flaky(retries=2) def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): - model = bigframes.ml.llm.PaLM2TextGenerator( - model_name="text-bison", max_iterations=1 - ) + model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) X_train = llm_fine_tune_df_default_index[["prompt"]] y_train = llm_fine_tune_df_default_index[["label"]] @@ -50,62 +49,22 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ assert model is not None df = model.predict(llm_remote_text_df["prompt"]).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() == 1) - - # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score(llm_fine_tune_df_default_index): - model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison") - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index[["prompt"]], - y=llm_fine_tune_df_default_index[["label"]], - ).to_pandas() - score_result_col = score_result.columns.to_list() - expected_col = [ - "bleu4_score", - "rouge-l_precision", - "rouge-l_recall", - "rouge-l_f1_score", - "evaluation_status", - ] - assert all(col in score_result_col for col in expected_col) - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score_params(llm_fine_tune_df_default_index): - model = bigframes.ml.llm.PaLM2TextGenerator( - model_name="text-bison", max_iterations=1 + utils.check_pandas_df_schema_and_index( + df, + columns=[ + "ml_generate_text_llm_result", + "ml_generate_text_rai_result", + "ml_generate_text_status", + "prompt", + ], + index=3, ) - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index["prompt"], - y=llm_fine_tune_df_default_index["label"], - task_type="classification", - ).to_pandas() - score_result_col = score_result.columns.to_list() - expected_col = [ - "precision", - "recall", - "f1_score", - "label", - "evaluation_status", - ] - assert all(col in score_result_col for col in expected_col) + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept @pytest.mark.flaky(retries=2) def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): - model = bigframes.ml.llm.GeminiTextGenerator( - model_name="gemini-pro", max_iterations=1 - ) + model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1) X_train = llm_fine_tune_df_default_index[["prompt"]] y_train = llm_fine_tune_df_default_index[["label"]] @@ -120,9 +79,14 @@ def 
test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_tex top_k=20, top_p=0.5, ).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() == 1) - + utils.check_pandas_df_schema_and_index( + df, + columns=[ + "ml_generate_text_llm_result", + "ml_generate_text_rai_result", + "ml_generate_text_status", + "prompt", + ], + index=3, + ) # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index ff759b8fda..9e0c06e0bd 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -85,18 +85,18 @@ def test_json_set_w_more_pairs(): ) -@pytest.mark.parametrize( - ("series", "json_path_value_pairs"), - [ - pytest.param( - _get_series_from_json([{"a": 10}]), - [("$.a", 1, 100)], - id="invalid_json_path_value_pairs", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( +def test_json_set_w_invalid_json_path_value_pairs(): + with pytest.raises(ValueError): + bbq.json_set( + _get_series_from_json([{"a": 10}]), json_path_value_pairs=[("$.a", 1, 100)] # type: ignore + ) + + +def test_json_set_w_invalid_value_type(): + with pytest.raises(TypeError): + bbq.json_set( _get_series_from_json([{"a": 10}]), - [ + json_path_value_pairs=[ ( "$.a", bpd.read_pandas( @@ -104,16 +104,9 @@ def test_json_set_w_more_pairs(): ), ) ], - id="invalid_json_value_type", - marks=pytest.mark.xfail(raises=TypeError), - ), - pytest.param( - bpd.Series([1, 2]), - [("$.a", 1)], - id="invalid_series_type", - marks=pytest.mark.xfail(raises=TypeError), - ), - ], -) -def test_json_set_w_invalid(series, json_path_value_pairs): - bbq.json_set(series, json_path_value_pairs=json_path_value_pairs) + ) + + +def test_json_set_w_invalid_series_type(): + with pytest.raises(TypeError): + bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index c505057d7b..95719ea0db 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -312,6 +312,7 @@ def test_model_detect_anomalies( ) +@pytest.mark.skip("b/353775058 BQML internal error") def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 36d01e126f..b926004fd8 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -328,12 +328,7 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", - ( - "gemini-pro", - "gemini-1.5-pro-preview-0514", - # TODO(garrrettwu): enable when cl/637028077 is in prod. 
- # "gemini-1.5-flash-preview-0514" - ), + ("gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514"), ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( @@ -369,6 +364,51 @@ def test_gemini_text_generator_predict_with_params_success( assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2) +def test_llm_palm_score(llm_fine_tune_df_default_index): + model = llm.PaLM2TextGenerator(model_name="text-bison") + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index[["prompt"]], + y=llm_fine_tune_df_default_index[["label"]], + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "bleu4_score", + "rouge-l_precision", + "rouge-l_recall", + "rouge-l_f1_score", + "evaluation_status", + ], + index=1, + ) + + +@pytest.mark.flaky(retries=2) +def test_llm_palm_score_params(llm_fine_tune_df_default_index): + model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index["prompt"], + y=llm_fine_tune_df_default_index["label"], + task_type="classification", + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "precision", + "recall", + "f1_score", + "label", + "evaluation_status", + ], + index=6, + ) + + @pytest.mark.flaky(retries=2) def test_llm_gemini_pro_score(llm_fine_tune_df_default_index): model = llm.GeminiTextGenerator(model_name="gemini-pro") diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index 63d0840d29..ea9220feb4 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -234,3 +234,65 @@ def test_train_test_split_value_error(penguins_df_default_index, train_size, tes model_selection.train_test_split( X, y, train_size=train_size, test_size=test_size ) + + +def test_train_test_split_stratify(penguins_df_default_index): + X = penguins_df_default_index[ + [ + "species", + "island", + "culmen_length_mm", + ] + ] + y = penguins_df_default_index[["species"]] + X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, stratify=penguins_df_default_index["species"] + ) + + # Original distribution is [152, 124, 68]. 
All the categories follow 75/25 split + train_counts = pd.Series( + [114, 93, 51], + index=pd.Index( + [ + "Adelie Penguin (Pygoscelis adeliae)", + "Gentoo penguin (Pygoscelis papua)", + "Chinstrap penguin (Pygoscelis antarctica)", + ], + name="species", + ), + dtype="Int64", + name="count", + ) + test_counts = pd.Series( + [38, 31, 17], + index=pd.Index( + [ + "Adelie Penguin (Pygoscelis adeliae)", + "Gentoo penguin (Pygoscelis papua)", + "Chinstrap penguin (Pygoscelis antarctica)", + ], + name="species", + ), + dtype="Int64", + name="count", + ) + pd.testing.assert_series_equal( + X_train["species"].value_counts().to_pandas(), + train_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + X_test["species"].value_counts().to_pandas(), + test_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + y_train["species"].value_counts().to_pandas(), + train_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + y_test["species"].value_counts().to_pandas(), + test_counts, + check_index_type=False, + ) diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index 5036cdadfc..c52c452244 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -13,10 +13,12 @@ # limitations under the License. import pandas as pd +import pytest from bigframes.ml import remote +@pytest.mark.skip("b/353775058 BQML internal error") def test_remote_linear_vertex_model_predict( linear_remote_vertex_model: remote.VertexAIModel, new_penguins_df ): diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 625b920763..3a7eff621f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -33,8 +33,10 @@ import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( + assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, + assert_series_equivalent, skip_legacy_pandas, ) @@ -75,7 +77,7 @@ def test_df_construct_large_strings(): pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) -def test_df_construct_pandas_load_job(scalars_dfs): +def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered): # This should trigger the inlined codepath columns = [ "int64_too", @@ -91,10 +93,10 @@ def test_df_construct_pandas_load_job(scalars_dfs): "timestamp_col", "geography_col", ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() + _, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns) pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_construct_pandas_set_dtype(scalars_dfs): @@ -112,17 +114,17 @@ def test_df_construct_pandas_set_dtype(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_df_construct_from_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_construct_from_series(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered bf_result = dataframe.DataFrame( {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, dtype="string[pyarrow]", - ).to_pandas() + ) pd_result = pd.DataFrame( {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, dtype="string[pyarrow]", ) - 
pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_construct_from_dict(): @@ -140,8 +142,6 @@ def test_df_construct_from_dict(): def test_df_construct_inline_respects_location(): - import bigframes.pandas as bpd - # Note: This starts a thread-local session. with bpd.option_context("bigquery.location", "europe-west1"): df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) @@ -507,8 +507,8 @@ def test_rename(scalars_dfs): ) -def test_df_peek(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_peek(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered peek_result = scalars_df.peek(n=3, force=False) pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -1711,14 +1711,14 @@ def test_sort_index(scalars_dfs, ascending, na_position): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_df_abs(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_abs(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered columns = ["int64_col", "int64_too", "float64_col"] - bf_result = scalars_df[columns].abs().to_pandas() + bf_result = scalars_df[columns].abs() pd_result = scalars_pandas_df[columns].abs() - assert_pandas_df_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_pos(scalars_dfs): @@ -2270,8 +2270,10 @@ def test_series_binop_add_different_table( @all_joins -def test_join_same_table(scalars_dfs, how): - bf_df, pd_df = scalars_dfs +def test_join_same_table(scalars_dfs_maybe_ordered, how): + bf_df, pd_df = scalars_dfs_maybe_ordered + if not bf_df._session._strictly_ordered and how == "cross": + pytest.skip("Cross join not supported in unordered mode.") bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] bf_df_a = bf_df_a.sort_index() @@ -2505,7 +2507,7 @@ def test_dataframe_agg_int_single_string(scalars_dfs, agg): ) -def test_dataframe_agg_multi_string(scalars_dfs): +def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered): numeric_cols = ["int64_col", "int64_too", "float64_col"] aggregations = [ "sum", @@ -2518,8 +2520,8 @@ def test_dataframe_agg_multi_string(scalars_dfs): "nunique", "count", ] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = scalars_df[numeric_cols].agg(aggregations) pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) # Pandas may produce narrower numeric types, but bigframes always produces Float64 @@ -2530,7 +2532,7 @@ def test_dataframe_agg_multi_string(scalars_dfs): bf_result = bf_result.drop(labels=["median"]) pd_result = pd_result.drop(labels=["median"]) - pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) + assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) # Double-check that median is at least plausible. 
assert ( @@ -3207,13 +3209,6 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) @pytest.mark.parametrize( ("op", "bf_dtype"), [ @@ -3228,12 +3223,11 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col ], ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], ) -def test_dataframe_aggregates( - scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered -): +def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) - bf_result = bf_series.to_pandas(ordered=ordered) + bf_result = bf_series pd_result = op(scalars_pandas_df_index[col_names]) # Check dtype separately @@ -3242,12 +3236,11 @@ def test_dataframe_aggregates( # Pandas may produce narrower numeric types, but bigframes always produces Float64 # Pandas has object index type pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equal( + assert_series_equivalent( pd_result, bf_result, check_dtype=False, check_index_type=False, - ignore_order=not ordered, ) @@ -3599,16 +3592,17 @@ def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): ) -def test_df_reindex_rows_list(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]).to_pandas() +def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( - bf_result, + assert_dfs_equivalent( pd_result, + bf_result, ) @@ -3863,7 +3857,8 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): ) -def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): +def test_loc_list_multiindex(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( ["string_col", "int64_col"] @@ -3873,9 +3868,9 @@ def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_multiindex.loc[index_list] pd_result = scalars_pandas_df_multiindex.loc[index_list] - pd.testing.assert_frame_equal( - bf_result.to_pandas(), + assert_dfs_equivalent( pd_result, + bf_result, ) @@ -4130,6 +4125,72 @@ def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_json() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_json(default_handler=str) + + assert bf_result == pd_result + + +@skip_legacy_pandas +def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_json(bf_result_file, orient="table") + # default_handler for arrow types that have no default conversion + scalars_pandas_df_index.to_json( + pd_result_file, orient="table", default_handler=str + ) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_csv() + + assert bf_result == pd_result + + +def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_csv(bf_result_file) + scalars_pandas_df_index.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + + bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. 
+ unsupported = ["geography_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + def test_df_to_records(scalars_df_index, scalars_pandas_df_index): unsupported = ["numeric_col"] bf_result = scalars_df_index.drop(columns=unsupported).to_records() @@ -4171,7 +4232,7 @@ def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): scalars_df_index.to_pickle(bf_result_file) scalars_pandas_df_index.to_pickle(pd_result_file) bf_result = bf_result_file.read() - pd_result = bf_result_file.read() + pd_result = pd_result_file.read() assert bf_result == pd_result @@ -4336,6 +4397,25 @@ def test_df_cached(scalars_df_index): pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, -3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + def test_df_cache_with_implicit_join(scalars_df_index): """expectation is that cache will be used, but no explicit join will be performed""" df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 @@ -4510,7 +4590,7 @@ def test_query_complexity_repeated_subtrees( bf_df = scalars_df_index for _ in range(5): pd_df = pd.concat(10 * [pd_df]).head(5) - bf_df = bigframes.pandas.concat(10 * [bf_df]).head(5) + bf_df = bpd.concat(10 * [bf_df]).head(5) bf_result = bf_df.to_pandas() pd_result = pd_df assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 27a3d8dffe..a1e360f73d 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index ) +def test_null_index_series_self_join( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index[["int64_col"]].join( + scalars_df_null_index[["int64_too"]] + ) + pd_result = scalars_pandas_df_default_index[["int64_col"]].join( + scalars_pandas_df_default_index[["int64_too"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_series_self_aligns( scalars_df_null_index, scalars_pandas_df_default_index ): diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d84d520988..c07a0afb44 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -77,15 +77,27 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection( - bq_cf_connection, dataset_id_permanent -) -> bigframes.Session: +def session_with_bq_connection(bq_cf_connection) -> 
bigframes.Session: session = bigframes.Session( bigframes.BigQueryOptions(bq_connection=bq_cf_connection, location="US") ) return session +def get_rf_name(func, package_requirements=None, is_row_processor=False): + """Get a remote function name for testing given a udf.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = rf._get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = rf._get_hash(func, package_requirements) + + return f"bigframes_{function_hash}" + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -96,8 +108,11 @@ def test_remote_function_direct_no_session_param( dataset_id_permanent, bq_cf_connection, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -107,9 +122,8 @@ def test_remote_function_direct_no_session_param( bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -153,8 +167,11 @@ def test_remote_function_direct_no_session_param_location_specified( dataset_id_permanent, bq_cf_connection_location, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -164,9 +181,8 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -204,13 +220,17 @@ def test_remote_function_direct_no_session_param_location_mismatched( dataset_id_permanent, bq_cf_connection_location_mismatched, ): + def square(x): + # Not expected to reach this code, as the location of the + # connection doesn't match the location of the dataset. + return x * x # pragma: NO COVER + with pytest.raises( ValueError, match=re.escape("The location does not match BigQuery connection location:"), ): - - @rf.remote_function( - [int], + rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -220,11 +240,8 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_connection=bq_cf_connection_location_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - # Not expected to reach this code, as the location of the - # connection doesn't match the location of the dataset. 
- return x * x # pragma: NO COVER + name=get_rf_name(square), + )(square) @pytest.mark.flaky(retries=2, delay=120) @@ -237,8 +254,11 @@ def test_remote_function_direct_no_session_param_location_project_specified( dataset_id_permanent, bq_cf_connection_location_project, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -248,9 +268,8 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -288,15 +307,19 @@ def test_remote_function_direct_no_session_param_project_mismatched( dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): + def square(x): + # Not expected to reach this code, as the project of the + # connection doesn't match the project of the dataset. + return x * x # pragma: NO COVER + with pytest.raises( ValueError, match=re.escape( "The project_id does not match BigQuery connection gcp_project_id:" ), ): - - @rf.remote_function( - [int], + rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -306,23 +329,25 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_connection=bq_cf_connection_location_project_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - # Not expected to reach this code, as the project of the - # connection doesn't match the project of the dataset. - return x * x # pragma: NO COVER + name=get_rf_name(square), + )(square) @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): - @rf.remote_function( - [int], - int, - session=session_with_bq_connection, - ) +def test_remote_function_direct_session_param( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def square(x): return x * x + square = rf.remote_function( + int, + int, + session=session_with_bq_connection, + dataset=dataset_id_permanent, + name=get_rf_name(square), + )(square) + # Function should still work normally. assert square(2) == 4 @@ -351,7 +376,12 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): +def test_remote_function_via_session_default( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): + def square(x): + return x * x + # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -359,9 +389,9 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. 
- @session_with_bq_connection.remote_function([int], int) - def square(x): - return x * x + square = session_with_bq_connection.remote_function( + int, int, dataset_id_permanent, name=get_rf_name(square) + )(square) # Function should still work normally. assert square(2) == 4 @@ -394,16 +424,18 @@ def square(x): def test_remote_function_via_session_with_overrides( session, scalars_dfs, dataset_id_permanent, bq_cf_connection ): - @session.remote_function( - [int], + def square(x): + return x * x + + square = session.remote_function( + int, int, dataset_id_permanent, bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -433,11 +465,15 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection.remote_function( + [int], int, dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -460,11 +496,15 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap_na_ignore( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection.remote_function( + [int], int, dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -485,7 +525,9 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map_bytes(session_with_bq_connection, scalars_dfs): +def test_series_map_bytes( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): """Check that bytes is support as input and output.""" scalars_df, scalars_pandas_df = scalars_dfs @@ -502,8 +544,11 @@ def bytes_to_hex(mybytes: bytes) -> bytes: pd.ArrowDtype(pyarrow.binary()) ) + packages = ["pandas"] remote_bytes_to_hex = session_with_bq_connection.remote_function( - packages=["pandas"] + dataset=dataset_id_permanent, + name=get_rf_name(bytes_to_hex, package_requirements=packages), + packages=packages, )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() @@ -541,11 +586,14 @@ def test_skip_bq_connection_check(dataset_id_permanent): match=f"Not found: Connection {connection_name}", ): - @session.remote_function([int], int, dataset=dataset_id_permanent) def add_one(x): # Not expected to reach this code, as the connection doesn't exist. 
return x + 1 # pragma: NO COVER + session.remote_function( + [int], int, dataset=dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) + @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): @@ -570,7 +618,10 @@ def test_read_gbq_function_like_original( dataset_id_permanent, bq_cf_connection, ): - @rf.remote_function( + def square1(x): + return x * x + + square1 = rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -580,29 +631,28 @@ def test_read_gbq_function_like_original( resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, - ) - def square1(x): - return x * x + name=get_rf_name(square1), + )(square1) # Function should still work normally. assert square1(2) == 4 square2 = rf.read_gbq_function( - function_name=square1.bigframes_remote_function, + function_name=square1.bigframes_remote_function, # type: ignore session=session, ) # The newly-created function (square1) should have a remote function AND a # cloud function associated with it, while the read-back version (square2) # should only have a remote function. - assert square1.bigframes_remote_function - assert square1.bigframes_cloud_function + assert square1.bigframes_remote_function # type: ignore + assert square1.bigframes_cloud_function # type: ignore assert square2.bigframes_remote_function assert not hasattr(square2, "bigframes_cloud_function") # They should point to the same function. - assert square1.bigframes_remote_function == square2.bigframes_remote_function + assert square1.bigframes_remote_function == square2.bigframes_remote_function # type: ignore # The result of applying them should be the same. int64_col = scalars_df_index["int64_col"] @@ -743,7 +793,7 @@ def test_read_gbq_function_enforces_explicit_types( @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1(session, scalars_dfs): +def test_df_apply_axis_1(session, scalars_dfs, dataset_id_permanent): columns = [ "bool_col", "int64_col", @@ -764,6 +814,8 @@ def add_ints(row): add_ints_remote = session.remote_function( bigframes.series.Series, int, + dataset_id_permanent, + name=get_rf_name(add_ints, is_row_processor=True), )(add_ints) with pytest.warns( @@ -785,7 +837,7 @@ def add_ints(row): @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_ordering(session, scalars_dfs): +def test_df_apply_axis_1_ordering(session, scalars_dfs, dataset_id_permanent): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] ordering_columns = ["bool_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs @@ -793,7 +845,12 @@ def test_df_apply_axis_1_ordering(session, scalars_dfs): def add_ints(row): return row["int64_col"] + row["int64_too"] - add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints) + add_ints_remote = session.remote_function( + bigframes.series.Series, + int, + dataset_id_permanent, + name=get_rf_name(add_ints, is_row_processor=True), + )(add_ints) bf_result = ( scalars_df[columns] @@ -817,7 +874,7 @@ def add_ints(row): @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_multiindex(session): +def test_df_apply_axis_1_multiindex(session, dataset_id_permanent): pd_df = pd.DataFrame( {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]}, index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]), @@ -827,9 +884,12 @@ def test_df_apply_axis_1_multiindex(session): def add_numbers(row): return row["x"] + row["y"] - 
add_numbers_remote = session.remote_function(bigframes.series.Series, float)( - add_numbers - ) + add_numbers_remote = session.remote_function( + bigframes.series.Series, + float, + dataset_id_permanent, + name=get_rf_name(add_numbers, is_row_processor=True), + )(add_numbers) bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() pd_result = pd_df.apply(add_numbers, axis=1) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 10fcec63ce..fe6e001797 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2753,6 +2753,44 @@ def test_to_latex(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.to_json() + pd_result = scalars_pandas_df_index.int64_col.to_json() + + assert bf_result == pd_result + + +@skip_legacy_pandas +def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.int64_col.to_json(bf_result_file) + scalars_pandas_df_index.int64_col.to_json(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.int64_col.to_csv() + + assert bf_result == pd_result + + +def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.int64_col.to_csv(bf_result_file) + scalars_pandas_df_index.int64_col.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + def test_to_dict(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index["int64_too"].to_dict() diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 36bf2a2585..7d7097ceb3 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -20,6 +20,24 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_unordered_mode_sql_no_hash(unordered_session): + bf_df = unordered_session.read_gbq( + "bigquery-public-data.ethereum_blockchain.blocks" + ) + sql = bf_df.sql + assert "ORDER BY".casefold() not in sql.casefold() + assert "farm_fingerprint".casefold() not in sql.casefold() + + +def test_unordered_mode_job_label(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session) + df.to_pandas() + job_labels = df.query_job.labels # type:ignore + assert "bigframes-mode" in job_labels + assert job_labels["bigframes-mode"] == "unordered" + + def test_unordered_mode_cache_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session) @@ -31,6 +49,19 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_single_aggregate(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, 
dtype=pd.Int64Dtype()) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + assert bf_df.a.mean() == pd_df.a.mean() + + +def test_unordered_mode_print(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session).cache() + print(df) + + @skip_legacy_pandas def test_unordered_mode_read_gbq(unordered_session): df = unordered_session.read_gbq( @@ -85,6 +116,28 @@ def test_unordered_drop_duplicates(unordered_session, keep): assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) +def test_unordered_reset_index(unordered_session): + pd_df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 4, 6]}, dtype=pd.Int64Dtype()) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + bf_result = bf_df.set_index("b").reset_index(drop=False) + pd_result = pd_df.set_index("b").reset_index(drop=False) + + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_unordered_merge(unordered_session): + pd_df = pd.DataFrame( + {"a": [1, 1, 3], "b": [4, 4, 6], "c": [1, 2, 3]}, dtype=pd.Int64Dtype() + ) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + bf_result = bf_df.merge(bf_df, left_on="a", right_on="c") + pd_result = pd_df.merge(pd_df, left_on="a", right_on="c") + + assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) + + @pytest.mark.parametrize( ("function"), [ @@ -100,6 +153,10 @@ def test_unordered_drop_duplicates(unordered_session, keep): lambda x: x.a.iloc[1::2], id="series_iloc", ), + pytest.param( + lambda x: x.head(3), + id="head", + ), ], ) def test_unordered_mode_blocks_windowing(unordered_session, function): @@ -110,3 +167,17 @@ def test_unordered_mode_blocks_windowing(unordered_session, function): match=r"Op.*not supported when strict ordering is disabled", ): function(df) + + +def test_unordered_mode_cache_preserves_order(unordered_session): + pd_df = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [4, 5, 9, 3, 1, 6]}, dtype=pd.Int64Dtype() + ) + pd_df.index = pd_df.index.astype(pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session) + sorted_df = df.sort_values("b").cache() + bf_result = sorted_df.to_pandas() + pd_result = pd_df.sort_values("b") + + # B is unique so unstrict order mode result here should be equivalent to strictly ordered + assert_pandas_df_equal(bf_result, pd_result, ignore_order=False) diff --git a/tests/system/utils.py b/tests/system/utils.py index ab4c2c119f..9fbf191a3a 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -27,6 +27,7 @@ import pytest from bigframes.functions import remote_function +import bigframes.pandas ML_REGRESSION_METRICS = [ "mean_absolute_error", @@ -56,6 +57,23 @@ def wrapper(*args, **kwds): return wrapper +# Prefer this function for tests that run in both ordered and unordered mode +def assert_dfs_equivalent( + pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs +): + bf_df_local = bf_df.to_pandas() + ignore_order = not bf_df._session._strictly_ordered + assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) + + +def assert_series_equivalent( + pd_series: pd.Series, bf_series: bigframes.pandas.Series, **kwargs +): + bf_df_local = bf_series.to_pandas() + ignore_order = not bf_series._session._strictly_ordered + assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) + + def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): if ignore_order: # Sort by a column to get consistent 
results. diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 48fb7011ea..aa7e919b24 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session): ["index_column_label"], ) mock_X.join(mock_y).sql = "input_X_y_sql" + mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( "input_X_y_sql", ["index_column_id"], diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 408590d4bb..1ee52c08a1 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -35,6 +35,8 @@ def all_session_methods(): if not attribute.startswith("_") ) session_attributes.remove("close") + # streaming isn't in pandas + session_attributes.remove("read_gbq_table_streaming") for attribute in sorted(session_attributes): session_method = getattr(bigframes.session.Session, attribute) diff --git a/third_party/bigframes_vendored/ibis/README.md b/third_party/bigframes_vendored/ibis/README.md index 8a00750e92..fa8224214f 100644 --- a/third_party/bigframes_vendored/ibis/README.md +++ b/third_party/bigframes_vendored/ibis/README.md @@ -1,7 +1,6 @@ # Ibis [![Documentation Status](https://img.shields.io/badge/docs-docs.ibis--project.org-blue.svg)](http://ibis-project.org) -[![Anaconda-Server Badge](https://anaconda.org/conda-forge/ibis-framework/badges/version.svg)](https://anaconda.org/conda-forge/ibis-framework) [![PyPI](https://img.shields.io/pypi/v/ibis-framework.svg)](https://pypi.org/project/ibis-framework) [![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml?query=branch%3Amaster) [![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml?query=branch%3Amaster) @@ -83,28 +82,14 @@ Install Ibis from PyPI with: pip install 'ibis-framework[duckdb]' ``` -Or from conda-forge with: - -```bash -conda install ibis-framework -c conda-forge -``` - (It’s a common mistake to `pip install ibis`. If you try to use Ibis and get errors early on try uninstalling `ibis` and installing `ibis-framework`) -To discover ibis, we suggest starting with the DuckDB backend (which is included by default in the conda-forge package). The DuckDB backend is performant and fully featured. 
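The `assert_dfs_equivalent`/`assert_series_equivalent` helpers added to `tests/system/utils.py` earlier in this patch choose `ignore_order` from the session's `_strictly_ordered` flag. A minimal sketch of the sort-before-compare idea behind `ignore_order=True` (the helper name below is hypothetical):

```python
# Minimal sketch: when results come from an unordered session, row order is
# not guaranteed, so sort both frames on a shared column set before comparing.
import pandas as pd
from pandas.testing import assert_frame_equal


def assert_frames_equivalent(left: pd.DataFrame, right: pd.DataFrame, ignore_order: bool = True):
    if ignore_order:
        cols = list(left.columns)
        left = left.sort_values(cols).reset_index(drop=True)
        right = right.sort_values(cols).reset_index(drop=True)
    assert_frame_equal(left, right)


assert_frames_equivalent(
    pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),
    pd.DataFrame({"a": [3, 1, 2], "b": [6, 4, 5]}),
)
```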
- To use ibis with other backends, include the backend name in brackets for PyPI: ```bash pip install 'ibis-framework[postgres]' ``` -Or use `ibis-$BACKEND` where `$BACKEND` is the specific backend you want to use when installing from conda-forge: - -```bash -conda install ibis-postgres -c conda-forge -``` - ## Getting Started with Ibis We provide a number of tutorial and example notebooks in the diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index 9f2bc800e8..1aa5068d5e 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -6,7 +6,6 @@ # pandas: powerful Python data analysis toolkit [![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) -[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) @@ -86,15 +85,10 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). +Package Index (PyPI)](https://pypi.org/project/pandas). ```sh -# conda -conda install -c conda-forge pandas -``` - -```sh -# or PyPI +# PyPI pip install pandas ``` diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f8088f8060..7048d9c6dd 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -476,11 +476,11 @@ def to_gbq( def to_parquet( self, - path: str, + path: Optional[str], *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, - ) -> None: + ) -> Optional[bytes]: """Write a DataFrame to the binary Parquet format. This function writes the dataframe as a `parquet file @@ -496,9 +496,13 @@ def to_parquet( >>> df.to_parquet(path=gcs_bucket) Args: - path (str): + path (str, path object, file-like object, or None, default None): + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string or path, it will be used as Root Directory + path when writing a partitioned dataset. Destination URI(s) of Cloud Storage files(s) to store the extracted dataframe - in format of ``gs:///``. + should be formatted ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. @@ -511,7 +515,7 @@ def to_parquet( If ``False``, they will not be written to the file. Returns: - None. 
+ bytes if no path argument is provided else None """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 95302e51b2..6734fb6aa9 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -210,14 +210,14 @@ def empty(self) -> bool: def to_json( self, - path_or_buf: str, - orient: Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf, + orient: Optional[ + Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, index: bool = True, lines: bool = False, - ) -> None: + ) -> Optional[str]: """Convert the object to a JSON string, written to Cloud Storage. Note NaN's and None will be converted to null and datetime objects @@ -227,16 +227,18 @@ def to_json( Only ``orient='records'`` and ``lines=True`` is supported so far. Args: - path_or_buf (str): - A destination URI of Cloud Storage files(s) to store the extracted + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. + + Can be a destination URI of Cloud Storage files(s) to store the extracted dataframe in format of ``gs:///``. Must contain a wildcard `*` character. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. - - None, file-like objects or local file paths not yet supported. orient ({`split`, `records`, `index`, `columns`, `values`, `table`}, default 'columns): Indication of expected JSON string format. @@ -271,17 +273,25 @@ def to_json( list-like. Returns: - None: String output not yet supported. + None or str: If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_csv(self, path_or_buf: str, *, index: bool = True) -> None: + def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: """Write object to a comma-separated values (csv) file on Cloud Storage. Args: - path_or_buf (str): - A destination URI of Cloud Storage files(s) to store the extracted dataframe - in format of ``gs:///``. + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` might need to contain a `'b'`. + + Alternatively, a destination URI of Cloud Storage files(s) to store the + extracted dataframe in format of + ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files @@ -293,7 +303,8 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> None: If True, write row names (index). Returns: - None: String output not yet supported. + None or str: If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. 
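The docstring updates above align the vendored `to_json`/`to_csv` (and `to_parquet`) signatures with pandas: when no destination is given, the serialized result is returned instead of being written to Cloud Storage, which is what the new small tests in `tests/system/small/test_series.py` exercise. A plain-pandas illustration of that behaviour (local pandas objects only; the parquet call assumes an engine such as pyarrow is installed):

```python
# With no path/path_or_buf, the exporters return the serialized result
# rather than writing a file or GCS object.
import pandas as pd

s = pd.Series([1, 2, 3], name="int64_col")
csv_text = s.to_csv()    # str, index column included by default
json_text = s.to_json()  # str

df = pd.DataFrame({"a": [1, 2, 3]})
parquet_bytes = df.to_parquet(path=None)  # bytes

print(csv_text.splitlines()[0], json_text, len(parquet_bytes))
```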
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a430c3375f..a30ed9cd92 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3521,7 +3521,7 @@ def mask(self, cond, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def clip(self): + def clip(self, lower, upper): """Trim values at input threshold(s). Assigns values outside boundary to boundary values. Thresholds can be