diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index a3da1b0d4c..a9bdb1b7ac 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:3e3800bb100af5d7f9e810d48212b37812c1856d20ffeafb99ebe66461b61fc7 -# created: 2023-08-02T10:53:29.114535628Z + digest: sha256:fac304457974bb530cc5396abd4ab25d26a469cd3bc97cbfb18c8d4324c584eb +# created: 2023-10-02T21:31:03.517640371Z diff --git a/.gitignore b/.gitignore index b4243ced74..d083ea1ddc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ docs.metadata # Virtual environment env/ +venv/ # Test logs coverage.xml diff --git a/.kokoro/build.sh b/.kokoro/build.sh index f91c541c6c..58eaa7fedf 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -15,11 +15,7 @@ set -eo pipefail -if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then - PROJECT_SCM="github/python-bigquery-dataframes" -else - PROJECT_SCM="git/bigframes" -fi +PROJECT_SCM="github/python-bigquery-dataframes" if [[ -z "${PROJECT_ROOT:-}" ]]; then PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}" @@ -30,6 +26,9 @@ cd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 +# Workaround https://github.com/pytest-dev/pytest/issues/9567 +export PY_IGNORE_IMPORTMISMATCH=1 + # Debug: show build environment env | grep KOKORO diff --git a/.kokoro/continuous/common.cfg b/.kokoro/continuous/common.cfg index 5d40578ac7..97e0651aa9 100644 --- a/.kokoro/continuous/common.cfg +++ b/.kokoro/continuous/common.cfg @@ -7,4 +7,4 @@ action { } } -build_file: "bigframes/.kokoro/build.sh" +build_file: "python-bigquery-dataframes/.kokoro/build.sh" diff --git a/.kokoro/continuous/nightly.cfg b/.kokoro/continuous/nightly.cfg index 63c3f51d05..2b7111664f 100644 --- a/.kokoro/continuous/nightly.cfg +++ b/.kokoro/continuous/nightly.cfg @@ -1,3 +1,3 @@ # Format: //devtools/kokoro/config/proto/build.proto -build_file: "bigframes/.kokoro/release-nightly.sh" +build_file: "python-bigquery-dataframes/.kokoro/release-nightly.sh" diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 029bd342de..96d593c8c8 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -113,30 +113,30 @@ commonmark==0.9.1 \ --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 # via rich -cryptography==41.0.3 \ - --hash=sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306 \ - --hash=sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84 \ - --hash=sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47 \ - --hash=sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d \ - --hash=sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116 \ - --hash=sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207 \ - --hash=sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81 \ - --hash=sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087 \ - --hash=sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd \ - --hash=sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507 \ - --hash=sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858 \ - 
--hash=sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae \ - --hash=sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34 \ - --hash=sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906 \ - --hash=sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd \ - --hash=sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922 \ - --hash=sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7 \ - --hash=sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4 \ - --hash=sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574 \ - --hash=sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1 \ - --hash=sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c \ - --hash=sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e \ - --hash=sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de +cryptography==41.0.4 \ + --hash=sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67 \ + --hash=sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311 \ + --hash=sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8 \ + --hash=sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13 \ + --hash=sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143 \ + --hash=sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f \ + --hash=sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829 \ + --hash=sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd \ + --hash=sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397 \ + --hash=sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac \ + --hash=sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d \ + --hash=sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a \ + --hash=sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839 \ + --hash=sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e \ + --hash=sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6 \ + --hash=sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9 \ + --hash=sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860 \ + --hash=sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca \ + --hash=sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91 \ + --hash=sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d \ + --hash=sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714 \ + --hash=sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb \ + --hash=sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f # via # gcp-releasetool # secretstorage @@ -382,6 +382,7 @@ protobuf==3.20.3 \ # gcp-docuploader # gcp-releasetool # google-api-core + # googleapis-common-protos pyasn1==0.4.8 \ --hash=sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d \ --hash=sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba diff --git a/CHANGELOG.md b/CHANGELOG.md index e4b2bff3c7..880f791625 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,25 @@ [1]: https://pypi.org/project/bigframes/#history +## 
[0.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.5.0...v0.6.0) (2023-10-04) + + +### Features + +* Add df.unstack ([#63](https://github.com/googleapis/python-bigquery-dataframes/issues/63)) ([4a84714](https://github.com/googleapis/python-bigquery-dataframes/commit/4a84714e2fb07f70c70c79f8b8da9fcb41096e33)) +* Add idxmin, idxmax to series, dataframe ([#74](https://github.com/googleapis/python-bigquery-dataframes/issues/74)) ([781307e](https://github.com/googleapis/python-bigquery-dataframes/commit/781307ec22d31a7657f8ee5c6eedc0e419450ccd)) +* Add ml.preprocessing.KBinsDiscretizer ([#81](https://github.com/googleapis/python-bigquery-dataframes/issues/81)) ([24c6256](https://github.com/googleapis/python-bigquery-dataframes/commit/24c625638984f6a84191c7a4c8ac9fb6c3cf1dca)) +* Add multi-column dataframe merge ([#73](https://github.com/googleapis/python-bigquery-dataframes/issues/73)) ([c9fa85c](https://github.com/googleapis/python-bigquery-dataframes/commit/c9fa85cc338be5e9a8dde59b255690aedbbc1127)) +* Add update and align methods to dataframe ([#57](https://github.com/googleapis/python-bigquery-dataframes/issues/57)) ([bf050cf](https://github.com/googleapis/python-bigquery-dataframes/commit/bf050cf475ad8a9e3e0ca3f896ddaf96dbe13ae3)) +* Support STRUCT data type with `Series.struct.field` to extract child fields ([#71](https://github.com/googleapis/python-bigquery-dataframes/issues/71)) ([17afac9](https://github.com/googleapis/python-bigquery-dataframes/commit/17afac9ff70a2b93ed70dc7bcce7beb9a53c2ece)) + + +### Bug Fixes + +* Avoid `403 response too large to return` error with `read_gbq` and large query results ([#77](https://github.com/googleapis/python-bigquery-dataframes/issues/77)) ([8f3b5b2](https://github.com/googleapis/python-bigquery-dataframes/commit/8f3b5b240f0f28fef92465abc53504e875d7335a)) +* Change return type of `Series.loc[scalar]` ([#40](https://github.com/googleapis/python-bigquery-dataframes/issues/40)) ([fff3d45](https://github.com/googleapis/python-bigquery-dataframes/commit/fff3d45f03ffbc7bb23143a1572e3dd157463ca9)) +* Fix df/series.iloc by list with multiindex ([#79](https://github.com/googleapis/python-bigquery-dataframes/issues/79)) ([971d091](https://github.com/googleapis/python-bigquery-dataframes/commit/971d091cac9ad662145a3d43d8f9a785eb0ccc23)) + ## [0.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.4.0...v0.5.0) (2023-09-28) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index c529f83351..8008c1189a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -963,10 +963,11 @@ def unpivot( ], *, passthrough_columns: typing.Sequence[str] = (), - index_col_id: str = "index", + index_col_ids: typing.Sequence[str] = ["index"], dtype: typing.Union[ bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] ] = pandas.Float64Dtype(), + how="left", ) -> ArrayValue: """ Unpivot ArrayValue columns. 
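# The signature hunk above replaces the single ``index_col_id`` with a
# sequence of ``index_col_ids`` and adds a ``how`` flag. A minimal plain-pandas
# sketch (an analogy, not bigframes internals) of what a multi-id unpivot
# models: stacking two column levels yields two new index columns at once.
import pandas as pd

wide = pd.DataFrame(
    {("a", "x"): [1, 2], ("a", "y"): [3, 4]},
    index=pd.Index([10, 20], name="row"),
)
stacked = wide.stack([0, 1])  # two stacked levels -> two new index levels
print(stacked)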
@@ -981,8 +982,11 @@ def unpivot( Returns: ArrayValue: The unpivoted ArrayValue """ - table = self._to_ibis_expr(ordering_mode="offset_col") + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() if not all( len(source_columns) == row_n for _, source_columns in unpivot_columns ): @@ -992,33 +996,44 @@ def unpivot( unpivot_table = table.cross_join( ibis.memtable({unpivot_offset_id: range(row_n)}) ) - unpivot_offsets_value = ( - ( - (unpivot_table[ORDER_ID_COLUMN] * row_n) - + unpivot_table[unpivot_offset_id] - ) - .cast(ibis_dtypes.int64) - .name(ORDER_ID_COLUMN), - ) - # Use ibis memtable to infer type of rowlabels (if possible) # TODO: Allow caller to specify dtype - labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type() - labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type) - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - row_labels[i], force_dtype=labels_dtype # type:ignore - ), - ) - for i in range(len(row_labels)) + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(index_col_id) - ) + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) unpivot_values = [] for j in range(len(unpivot_columns)): @@ -1042,23 +1057,53 @@ def unpivot( unpivot_values.append(unpivot_value.name(result_col)) unpivot_table = unpivot_table.select( - passthrough_columns, labels_value, *unpivot_values, unpivot_offsets_value + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, ) + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=[ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ], + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=[ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ], + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) value_columns = [ unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns ] 
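# The ``how`` flag handled above chooses which key dominates the resulting
# ordering: "left" keeps the original rows as the primary sort key (row-major),
# while "right" makes the new unpivot labels primary (label-major). A rough
# plain-pandas analogy of the two orderings:
import pandas as pd

wide = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["r0", "r1"])
row_major = wide.stack()                             # r0/a, r0/b, r1/a, r1/b
label_major = wide.stack().swaplevel().sort_index()  # a/r0, a/r1, b/r0, b/r1
print(row_major)
print(label_major)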
passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] return ArrayValue( session=self._session, table=unpivot_table, - columns=[unpivot_table[index_col_id], *value_columns, *passthrough_values], - hidden_ordering_columns=[unpivot_table[ORDER_ID_COLUMN]], - ordering=ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - ), + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, ) def assign(self, source_id: str, destination_id: str) -> ArrayValue: @@ -1153,8 +1198,8 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: destination = self._session._ibis_to_session_table( ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) - table_expression = self._session.ibis_client.sql( - f"SELECT * FROM `_SESSION`.`{destination.table_id}`" + table_expression = self._session.ibis_client.table( + f"{destination.project}.{destination.dataset_id}.{destination.table_id}" ) new_columns = [table_expression[column] for column in self.column_names] new_hidden_columns = [ diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index d22112417c..30c7902981 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -17,6 +17,7 @@ import pandas as pd +import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering @@ -504,3 +505,125 @@ def _kurt_from_moments_and_count( kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None) ) return block, kurt_id + + +def align( + left_block: blocks.Block, + right_block: blocks.Block, + join: str = "outer", + axis: typing.Union[str, int, None] = None, +) -> typing.Tuple[blocks.Block, blocks.Block]: + axis_n = core.utils.get_axis_number(axis) if axis is not None else None + # Must align columns first as other way will likely create extra joins + if (axis_n is None) or axis_n == 1: + left_block, right_block = align_columns(left_block, right_block, join=join) + if (axis_n is None) or axis_n == 0: + left_block, right_block = align_rows(left_block, right_block, join=join) + return left_block, right_block + + +def align_rows( + left_block: blocks.Block, + right_block: blocks.Block, + join: str = "outer", +): + joined_index, (get_column_left, get_column_right) = left_block.index.join( + right_block.index, how=join + ) + left_columns = [get_column_left(col) for col in left_block.value_columns] + right_columns = [get_column_right(col) for col in right_block.value_columns] + + left_block = joined_index._block.select_columns(left_columns) + right_block = joined_index._block.select_columns(right_columns) + return left_block, right_block + + +def align_columns( + left_block: blocks.Block, + right_block: blocks.Block, + join: str = "outer", +): + columns, lcol_indexer, rcol_indexer = left_block.column_labels.join( + right_block.column_labels, how=join, return_indexers=True + ) + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), + ) + left_column_ids = [] + 
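# ``align_rows`` above and ``align_columns`` here mirror pandas'
# ``DataFrame.align``: outer-join the row index and the column labels, and
# back-fill columns missing on one side with null constants (columns are
# aligned before rows to avoid extra joins, per the comment in ``align``).
# The pandas behaviour being reproduced:
import pandas as pd

left = pd.DataFrame({"a": [1]}, index=[0])
right = pd.DataFrame({"b": [2]}, index=[1])
l_aligned, r_aligned = left.align(right, join="outer")
print(l_aligned)   # columns a and b, rows 0 and 1, missing cells are NA
print(r_aligned)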
right_column_ids = [] + + original_left_block = left_block + original_right_block = right_block + + for left_index, right_index in column_indices: + if left_index >= 0: + left_col_id = original_left_block.value_columns[left_index] + else: + dtype = right_block.dtypes[right_index] + left_block, left_col_id = left_block.create_constant( + None, dtype=dtype, label=original_right_block.column_labels[right_index] + ) + left_column_ids.append(left_col_id) + + if right_index >= 0: + right_col_id = original_right_block.value_columns[right_index] + else: + dtype = original_left_block.dtypes[left_index] + right_block, right_col_id = right_block.create_constant( + None, dtype=dtype, label=left_block.column_labels[left_index] + ) + right_column_ids.append(right_col_id) + left_final = left_block.select_columns(left_column_ids) + right_final = right_block.select_columns(right_column_ids) + return left_final, right_final + + +def idxmin(block: blocks.Block) -> blocks.Block: + return _idx_extrema(block, "min") + + +def idxmax(block: blocks.Block) -> blocks.Block: + return _idx_extrema(block, "max") + + +def _idx_extrema( + block: blocks.Block, min_or_max: typing.Literal["min", "max"] +) -> blocks.Block: + if len(block.index_columns) != 1: + # TODO: Need support for tuple dtype + raise NotImplementedError( + f"idxmin not support for multi-index. {constants.FEEDBACK_LINK}" + ) + + original_block = block + result_cols = [] + for value_col in original_block.value_columns: + direction = ( + ordering.OrderingDirection.ASC + if min_or_max == "min" + else ordering.OrderingDirection.DESC + ) + # Have to find the min for each + order_refs = [ + ordering.OrderingColumnReference(value_col, direction), + *[ + ordering.OrderingColumnReference(idx_col) + for idx_col in original_block.index_columns + ], + ] + window_spec = core.WindowSpec(ordering=order_refs) + idx_col = original_block.index_columns[0] + block, result_col = block.apply_window_op( + idx_col, agg_ops.first_op, window_spec + ) + result_cols.append(result_col) + + block = block.select_columns(result_cols).with_column_labels( + original_block.column_labels + ) + # Stack the entire column axis to produce single-column result + # Assumption: uniform dtype for stackability + return block.aggregate_all_and_stack( + agg_ops.AnyValueOp(), dtype=block.dtypes[0] + ).with_column_labels([original_block.index.name]) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b53c2212c1..0161d17361 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -38,6 +38,7 @@ import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes +import bigframes.core.joins as joins import bigframes.core.ordering as ordering import bigframes.core.utils import bigframes.core.utils as utils @@ -838,7 +839,7 @@ def aggregate_all_and_stack( ] result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), - index_col_id="index", + index_col_ids=["index"], unpivot_columns=[(value_col_id, self.value_columns)], dtype=dtype, ) @@ -849,7 +850,7 @@ def aggregate_all_and_stack( expr_with_offsets, offset_col = self.expr.promote_offsets() stacked_expr = expr_with_offsets.unpivot( row_labels=self.column_labels.to_list(), - index_col_id=guid.generate_guid(), + index_col_ids=[guid.generate_guid()], unpivot_columns=[(value_col_id, self.value_columns)], passthrough_columns=[*self.index_columns, offset_col], dtype=dtype, @@ -1041,7 +1042,7 @@ def summarize( expr = 
self.expr.aggregate(aggregations).unpivot( labels, unpivot_columns=columns, - index_col_id=label_col_id, + index_col_ids=[label_col_id], ) labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) @@ -1225,116 +1226,83 @@ def pivot( return result_block.with_column_labels(column_index) - def stack(self): + def stack(self, how="left", dropna=True, sort=True, levels: int = 1): """Unpivot last column axis level into row axis""" - if isinstance(self.column_labels, pd.MultiIndex): - return self._stack_multi() - else: - return self._stack_mono() - - def _stack_mono(self): - if isinstance(self.column_labels, pd.MultiIndex): - raise ValueError("Expected single level index") - # These are the values that will be turned into rows - stack_values = self.column_labels.drop_duplicates().sort_values() - # Get matching columns - unpivot_columns: List[Tuple[str, List[str]]] = [] - dtypes: List[bigframes.dtypes.Dtype] = [] - col_id = guid.generate_guid("unpivot_") - dtype = None - input_columns: Sequence[Optional[str]] = [] - for uvalue in stack_values: - matching_ids = self.label_to_col_id.get(uvalue, []) - input_id = matching_ids[0] if len(matching_ids) > 0 else None - if input_id: - if dtype and dtype != self._column_type(input_id): - raise NotImplementedError( - "Cannot stack columns with non-matching dtypes." - ) - else: - dtype = self._column_type(input_id) - input_columns.append(input_id) - unpivot_columns.append((col_id, input_columns)) - if dtype: - dtypes.append(dtype or pd.Float64Dtype()) + col_labels, row_labels = utils.split_index(self.column_labels, levels=levels) + if dropna: + row_labels = row_labels.drop_duplicates() + if sort: + row_labels = row_labels.sort_values() - added_index_column = col_id = guid.generate_guid() - unpivot_expr = self._expr.unpivot( - row_labels=stack_values, - passthrough_columns=self.index_columns, - unpivot_columns=unpivot_columns, - index_col_id=added_index_column, - dtype=dtypes, - ) - block = Block( - unpivot_expr, - index_columns=[*self.index_columns, added_index_column], - column_labels=[None], - index_labels=[*self._index_labels, self.column_labels.names[-1]], - ) - return block + row_label_tuples = utils.index_as_tuples(row_labels) - def _stack_multi(self): - if not isinstance(self.column_labels, pd.MultiIndex): - raise ValueError("Expected multi-index") - - # These are the values that will be turned into rows - stack_values = ( - self.column_labels.get_level_values(-1).drop_duplicates().sort_values() - ) - - result_col_labels = ( - self.column_labels.droplevel(-1) - .drop_duplicates() - .sort_values() - .dropna(how="all") - ) + if col_labels is not None: + result_index = col_labels.drop_duplicates().sort_values().dropna(how="all") + result_col_labels = utils.index_as_tuples(result_index) + else: + result_index = pd.Index([None]) + result_col_labels = list([()]) # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] dtypes = [] for val in result_col_labels: col_id = guid.generate_guid("unpivot_") - dtype = None - input_columns: Sequence[Optional[str]] = [] - for uvalue in stack_values: - # Need to unpack if still a multi-index after dropping 1 level - label_to_match = ( - (val, uvalue) if result_col_labels.nlevels == 1 else (*val, uvalue) - ) - matching_ids = self.label_to_col_id.get(label_to_match, []) - input_id = matching_ids[0] if len(matching_ids) > 0 else None - if input_id: - if dtype and dtype != self._column_type(input_id): - raise NotImplementedError( - "Cannot stack 
columns with non-matching dtypes." - ) - else: - dtype = self._column_type(input_id) - input_columns.append(input_id) - # Input column i is the first one that + input_columns, dtype = self._create_stack_column(val, row_label_tuples) unpivot_columns.append((col_id, input_columns)) if dtype: dtypes.append(dtype or pd.Float64Dtype()) - added_index_column = col_id = guid.generate_guid() + added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)] unpivot_expr = self._expr.unpivot( - row_labels=stack_values, + row_labels=row_label_tuples, passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, - index_col_id=added_index_column, + index_col_ids=added_index_columns, dtype=dtypes, + how=how, ) + new_index_level_names = self.column_labels.names[-levels:] + if how == "left": + index_columns = [*self.index_columns, *added_index_columns] + index_labels = [*self._index_labels, *new_index_level_names] + else: + index_columns = [*added_index_columns, *self.index_columns] + index_labels = [*new_index_level_names, *self._index_labels] + block = Block( unpivot_expr, - index_columns=[*self.index_columns, added_index_column], - column_labels=result_col_labels, - index_labels=[*self._index_labels, self.column_labels.names[-1]], + index_columns=index_columns, + column_labels=result_index, + index_labels=index_labels, ) return block + def _create_stack_column( + self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] + ): + dtype = None + input_columns: list[Optional[str]] = [] + for uvalue in stack_labels: + label_to_match = (*col_label, *uvalue) + label_to_match = ( + label_to_match[0] if len(label_to_match) == 1 else label_to_match + ) + matching_ids = self.label_to_col_id.get(label_to_match, []) + input_id = matching_ids[0] if len(matching_ids) > 0 else None + if input_id: + if dtype and dtype != self._column_type(input_id): + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." 
+ ) + else: + dtype = self._column_type(input_id) + input_columns.append(input_id) + # Input column i is the first one that + return input_columns, dtype or pd.Float64Dtype() + def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) dtype = self.dtypes[col_offset] @@ -1436,6 +1404,78 @@ def concat( result_block = result_block.reset_index() return result_block + def merge( + self, + other: Block, + how: typing.Literal[ + "inner", + "left", + "outer", + "right", + ], + left_col_ids: typing.Sequence[str], + right_col_ids: typing.Sequence[str], + sort: bool, + suffixes: tuple[str, str] = ("_x", "_y"), + ) -> Block: + ( + joined_expr, + coalesced_join_cols, + (get_column_left, get_column_right), + ) = joins.join_by_column( + self.expr, + left_col_ids, + other.expr, + right_col_ids, + how=how, + sort=sort, + ) + + # which join key parts should be coalesced + merge_join_key_mask = [ + str(self.col_id_to_label[left_id]) == str(other.col_id_to_label[right_id]) + for left_id, right_id in zip(left_col_ids, right_col_ids) + ] + labels_to_coalesce = [ + self.col_id_to_label[col_id] + for i, col_id in enumerate(left_col_ids) + if merge_join_key_mask[i] + ] + + def left_col_mapping(col_id: str) -> str: + if col_id in left_col_ids: + join_key_part = left_col_ids.index(col_id) + if merge_join_key_mask[join_key_part]: + return coalesced_join_cols[join_key_part] + return get_column_left(col_id) + + def right_col_mapping(col_id: str) -> typing.Optional[str]: + if col_id in right_col_ids: + join_key_part = right_col_ids.index(col_id) + if merge_join_key_mask[join_key_part]: + return None + return get_column_right(col_id) + + left_columns = [left_col_mapping(col_id) for col_id in self.value_columns] + + right_columns = [ + typing.cast(str, right_col_mapping(col_id)) + for col_id in other.value_columns + if right_col_mapping(col_id) + ] + + expr = joined_expr.select_columns([*left_columns, *right_columns]) + labels = utils.merge_column_labels( + self.column_labels, + other.column_labels, + coalesce_labels=labels_to_coalesce, + suffixes=suffixes, + ) + + # Constructs default index + expr, offset_index_id = expr.promote_offsets() + return Block(expr, index_columns=[offset_index_id], column_labels=labels) + def _force_reproject(self) -> Block: """Forces a reprojection of the underlying tables expression. 
Used to force predicate/order application before subsequent operations.""" return Block( diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index a538c80711..1a88b2abd6 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -15,7 +15,7 @@ from __future__ import annotations import typing -from typing import Tuple +from typing import Tuple, Union import ibis import pandas as pd @@ -29,20 +29,19 @@ import bigframes.series if typing.TYPE_CHECKING: - LocSingleKey = typing.Union[bigframes.series.Series, indexes.Index, slice] + LocSingleKey = Union[ + bigframes.series.Series, indexes.Index, slice, bigframes.core.scalar.Scalar + ] class LocSeriesIndexer: def __init__(self, series: bigframes.series.Series): self._series = series - def __getitem__(self, key) -> bigframes.series.Series: - """ - Only indexing by a boolean bigframes.series.Series or list of index entries is currently supported - """ - return typing.cast( - bigframes.series.Series, _loc_getitem_series_or_dataframe(self._series, key) - ) + def __getitem__( + self, key + ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: + return _loc_getitem_series_or_dataframe(self._series, key) def __setitem__(self, key, value) -> None: # TODO(swast): support MultiIndex @@ -84,7 +83,7 @@ def __init__(self, series: bigframes.series.Series): def __getitem__( self, key - ) -> bigframes.core.scalar.Scalar | bigframes.series.Series: + ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: """ Index series using integer offsets. Currently supports index by key type: @@ -103,13 +102,17 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._dataframe = dataframe @typing.overload - def __getitem__(self, key: LocSingleKey) -> bigframes.dataframe.DataFrame: + def __getitem__( + self, key: LocSingleKey + ) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... # Technically this is wrong since we can have duplicate column labels, but # this is expected to be rare. @typing.overload - def __getitem__(self, key: Tuple[LocSingleKey, str]) -> bigframes.series.Series: + def __getitem__( + self, key: Tuple[LocSingleKey, str] + ) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]: ... def __getitem__(self, key): @@ -173,7 +176,7 @@ class ILocDataFrameIndexer: def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._dataframe = dataframe - def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series: + def __getitem__(self, key) -> Union[bigframes.dataframe.DataFrame, pd.Series]: """ Index dataframe using integer offsets. Currently supports index by key type: @@ -188,21 +191,26 @@ def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series: @typing.overload def _loc_getitem_series_or_dataframe( series_or_dataframe: bigframes.series.Series, key -) -> bigframes.series.Series: +) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: ... @typing.overload def _loc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame, key -) -> bigframes.dataframe.DataFrame: +) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... 
def _loc_getitem_series_or_dataframe( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], key: LocSingleKey, -) -> bigframes.dataframe.DataFrame | bigframes.series.Series: +) -> Union[ + bigframes.dataframe.DataFrame, + bigframes.series.Series, + pd.Series, + bigframes.core.scalar.Scalar, +]: if isinstance(key, bigframes.series.Series) and key.dtype == "boolean": return series_or_dataframe[key] elif isinstance(key, bigframes.series.Series): @@ -222,7 +230,7 @@ def _loc_getitem_series_or_dataframe( # TODO(henryjsolberg): support MultiIndex if len(key) == 0: # type: ignore return typing.cast( - typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) @@ -258,11 +266,22 @@ def _loc_getitem_series_or_dataframe( ) keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None - return _perform_loc_list_join(series_or_dataframe, keys_df) + result = _perform_loc_list_join(series_or_dataframe, keys_df) + pandas_result = result.to_pandas() + # although loc[scalar_key] returns multiple results when scalar_key + # is not unique, we download the results here and return the computed + # individual result (as a scalar or pandas series) when the key is unique, + # since we expect unique index keys to be more common. loc[[scalar_key]] + # can be used to retrieve one-item DataFrames or Series. + if len(pandas_result) == 1: + return pandas_result.iloc[0] + # when the key is not unique, we return a bigframes data type + # as usual for methods that return dataframes/series + return result else: raise TypeError( - "Invalid argument type. loc currently only supports indexing with a " - "boolean bigframes Series, a list of index entries or a single index entry. " + "Invalid argument type. Expected bigframes.Series, bigframes.Index, " + "list, : (empty slice), or scalar. " f"{constants.FEEDBACK_LINK}" ) @@ -284,9 +303,9 @@ def _perform_loc_list_join( def _perform_loc_list_join( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], keys_df: bigframes.dataframe.DataFrame, -) -> bigframes.series.Series | bigframes.dataframe.DataFrame: +) -> Union[bigframes.series.Series, bigframes.dataframe.DataFrame]: # right join based on the old index so that the matching rows from the user's # original dataframe will be duplicated and reordered appropriately original_index_names = series_or_dataframe.index.names @@ -309,20 +328,26 @@ def _perform_loc_list_join( @typing.overload def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.series.Series, key -) -> bigframes.series.Series | bigframes.core.scalar.Scalar: +) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]: ... @typing.overload def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame, key -) -> bigframes.dataframe.DataFrame | pd.Series: +) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... 
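# The ``loc`` path above downloads the result for a scalar key and mirrors
# pandas: a unique key returns a scalar (or a pandas Series for a DataFrame),
# a duplicated key keeps the bigframes container, and a list key always keeps
# the container. Plain-pandas illustration of the behaviour being matched:
import pandas as pd

s = pd.Series([1, 2, 3], index=["a", "b", "b"])
print(s.loc["a"])    # scalar: key is unique
print(s.loc["b"])    # two-row Series: key is duplicated
print(s.loc[["a"]])  # one-row Series: list keys always return a Series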
def _iloc_getitem_series_or_dataframe( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, key -) -> bigframes.dataframe.DataFrame | bigframes.series.Series | bigframes.core.scalar.Scalar | pd.Series: + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + key, +) -> Union[ + bigframes.dataframe.DataFrame, + bigframes.series.Series, + bigframes.core.scalar.Scalar, + pd.Series, +]: if isinstance(key, int): internal_slice_result = series_or_dataframe._slice(key, key + 1, 1) result_pd_df = internal_slice_result.to_pandas() @@ -332,11 +357,9 @@ def _iloc_getitem_series_or_dataframe( elif isinstance(key, slice): return series_or_dataframe._slice(key.start, key.stop, key.step) elif pd.api.types.is_list_like(key): - # TODO(henryjsolberg): support MultiIndex - if len(key) == 0: return typing.cast( - typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) df = series_or_dataframe @@ -346,15 +369,18 @@ def _iloc_getitem_series_or_dataframe( original_series_name if original_series_name is not None else "0" ) df = series_or_dataframe.to_frame() - original_index_name = df.index.name - temporary_index_name = guid.generate_guid(prefix="temp_iloc_index_") - df = df.rename_axis(temporary_index_name) + original_index_names = df.index.names + temporary_index_names = [ + guid.generate_guid(prefix="temp_iloc_index_") + for _ in range(len(df.index.names)) + ] + df = df.rename_axis(temporary_index_names) # set to offset index and use regular loc, then restore index df = df.reset_index(drop=False) result = df.loc[key] - result = result.set_index(temporary_index_name) - result = result.rename_axis(original_index_name) + result = result.set_index(temporary_index_names) + result = result.rename_axis(original_index_names) if isinstance(series_or_dataframe, bigframes.series.Series): result = result[series_name] diff --git a/bigframes/core/io.py b/bigframes/core/io.py index 3c2e5a25f5..d47efbdddc 100644 --- a/bigframes/core/io.py +++ b/bigframes/core/io.py @@ -16,7 +16,8 @@ import datetime import textwrap -from typing import Dict, Union +import types +from typing import Dict, Iterable, Union import google.cloud.bigquery as bigquery @@ -89,6 +90,48 @@ def create_snapshot_sql( ) +# BigQuery REST API returns types in Legacy SQL format +# https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL +# names +# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types +BQ_STANDARD_TYPES = types.MappingProxyType( + { + "BOOLEAN": "BOOL", + "INTEGER": "INT64", + "FLOAT": "FLOAT64", + } +) + + +def bq_field_to_type_sql(field: bigquery.SchemaField): + if field.mode == "REPEATED": + nested_type = bq_field_to_type_sql( + bigquery.SchemaField( + field.name, field.field_type, mode="NULLABLE", fields=field.fields + ) + ) + return f"ARRAY<{nested_type}>" + + if field.field_type == "RECORD": + nested_fields_sql = ", ".join( + bq_field_to_sql(child_field) for child_field in field.fields + ) + return f"STRUCT<{nested_fields_sql}>" + + type_ = field.field_type + return BQ_STANDARD_TYPES.get(type_, type_) + + +def bq_field_to_sql(field: bigquery.SchemaField): + name = field.name + type_ = bq_field_to_type_sql(field) + return f"`{name}` {type_}" + + +def bq_schema_to_sql(schema: Iterable[bigquery.SchemaField]): + return ", ".join(bq_field_to_sql(field) for field in schema) + + def 
format_option(key: str, value: Union[bool, str]) -> str: if isinstance(value, bool): return f"{key}=true" if value else f"{key}=false" diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 8a9825cf0b..2d616fc3f0 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -44,7 +44,6 @@ def join_by_column( "right", ], sort: bool = False, - coalesce_join_keys: bool = True, allow_row_identity_join: bool = True, ) -> Tuple[ core.ArrayValue, @@ -59,8 +58,6 @@ def join_by_column( right: Expression for right table to join. right_column_ids: Column IDs (not label) to join by. how: The type of join to perform. - coalesce_join_keys: if set to False, returned column ids will contain - both left and right join key columns. allow_row_identity_join (bool): If True, allow matching by row identity. Set to False to always perform a true JOIN in generated SQL. @@ -71,8 +68,6 @@ def join_by_column( * Sequence[str]: Column IDs of the coalesced join columns. Sometimes either the left/right table will have missing rows. This column pulls the non-NULL value from either left/right. - If coalesce_join_keys is False, will return uncombined left and - right key columns. * Tuple[Callable, Callable]: For a given column ID from left or right, respectively, return the new column id from the combined expression. """ @@ -100,9 +95,7 @@ def join_by_column( right_join_keys = [ combined_expr.get_column(get_column_right(col)) for col in right_column_ids ] - join_key_cols = get_join_cols( - left_join_keys, right_join_keys, how, coalesce_join_keys - ) + join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how) join_key_ids = [col.get_name() for col in join_key_cols] combined_expr = combined_expr.projection( [*join_key_cols, *combined_expr.columns] @@ -182,9 +175,7 @@ def get_column_right(col_id): right_join_keys = [ combined_table[get_column_right(col)] for col in right_column_ids ] - join_key_cols = get_join_cols( - left_join_keys, right_join_keys, how, coalesce_join_keys - ) + join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how) # We could filter out the original join columns, but predicates/ordering # might still reference them in implicit joins. columns = ( @@ -226,46 +217,35 @@ def get_column_right(col_id): ) -def get_join_cols( +def get_coalesced_join_cols( left_join_cols: typing.Iterable[ibis_types.Value], right_join_cols: typing.Iterable[ibis_types.Value], how: str, - coalesce_join_keys: bool = True, ) -> typing.List[ibis_types.Value]: join_key_cols: list[ibis_types.Value] = [] for left_col, right_col in zip(left_join_cols, right_join_cols): - if not coalesce_join_keys: + if how == "left" or how == "inner": join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_"))) + elif how == "right": join_key_cols.append(right_col.name(guid.generate_guid(prefix="index_"))) - else: - if how == "left" or how == "inner": + elif how == "outer": + # The left index and the right index might contain null values, for + # example due to an outer join with different numbers of rows. Coalesce + # these to take the index value from either column. + # Use a random name in case the left index and the right index have the + # same name. In such a case, _x and _y suffixes will already be used. + # Don't need to coalesce if they are exactly the same column. 
+ if left_col.name("index").equals(right_col.name("index")): join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_"))) - elif how == "right": - join_key_cols.append( - right_col.name(guid.generate_guid(prefix="index_")) - ) - elif how == "outer": - # The left index and the right index might contain null values, for - # example due to an outer join with different numbers of rows. Coalesce - # these to take the index value from either column. - # Use a random name in case the left index and the right index have the - # same name. In such a case, _x and _y suffixes will already be used. - # Don't need to coalesce if they are exactly the same column. - if left_col.name("index").equals(right_col.name("index")): - join_key_cols.append( - left_col.name(guid.generate_guid(prefix="index_")) - ) - else: - join_key_cols.append( - ibis.coalesce( - left_col, - right_col, - ).name(guid.generate_guid(prefix="index_")) - ) else: - raise ValueError( - f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}" + join_key_cols.append( + ibis.coalesce( + left_col, + right_col, + ).name(guid.generate_guid(prefix="index_")) ) + else: + raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}") return join_key_cols diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 75175690ce..dc7c709011 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -49,6 +49,26 @@ def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: return multi_index +def index_as_tuples(index: pd.Index) -> typing.Sequence[typing.Tuple]: + if isinstance(index, pd.MultiIndex): + return [label for label in index] + else: + return [(label,) for label in index] + + +def split_index( + index: pd.Index, levels: int = 1 +) -> typing.Tuple[typing.Optional[pd.Index], pd.Index]: + nlevels = index.nlevels + remaining = nlevels - levels + if remaining > 0: + return index.droplevel(list(range(remaining, nlevels))), index.droplevel( + list(range(0, remaining)) + ) + else: + return (None, index) + + def get_standardized_ids( col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () ) -> tuple[list[str], list[str]]: @@ -84,3 +104,36 @@ def get_standardized_ids( idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] return col_ids, idx_ids + + +def merge_column_labels( + left_labels: pd.Index, + right_labels: pd.Index, + coalesce_labels: typing.Sequence, + suffixes: tuple[str, str] = ("_x", "_y"), +) -> pd.Index: + result_labels = [] + + for col_label in left_labels: + if col_label in right_labels: + if col_label in coalesce_labels: + # Merging on the same column only returns 1 key column from coalesce both. + # Take the left key column. + result_labels.append(col_label) + else: + result_labels.append(str(col_label) + suffixes[0]) + else: + result_labels.append(col_label) + + for col_label in right_labels: + if col_label in left_labels: + if col_label in coalesce_labels: + # Merging on the same column only returns 1 key column from coalesce both. + # Pass the right key column. 
+ pass + else: + result_labels.append(str(col_label) + suffixes[1]) + else: + result_labels.append(col_label) + + return pd.Index(result_labels) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 113355589b..eea8beb130 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -46,7 +46,6 @@ import bigframes.core.indexers as indexers import bigframes.core.indexes as indexes import bigframes.core.io -import bigframes.core.joins as joins import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window @@ -161,7 +160,15 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if pd_dataframe.size < MAX_INLINE_DF_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pandas.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) @@ -745,6 +752,55 @@ def rpow( __rpow__ = rpow + def align( + self, + other: typing.Union[DataFrame, bigframes.series.Series], + join: str = "outer", + axis: typing.Union[str, int, None] = None, + ) -> typing.Tuple[ + typing.Union[DataFrame, bigframes.series.Series], + typing.Union[DataFrame, bigframes.series.Series], + ]: + axis_n = utils.get_axis_number(axis) if axis else None + if axis_n == 1 and isinstance(other, bigframes.series.Series): + raise NotImplementedError( + f"align with series and axis=1 not supported. {constants.FEEDBACK_LINK}" + ) + left_block, right_block = block_ops.align( + self._block, other._block, join=join, axis=axis + ) + return DataFrame(left_block), other.__class__(right_block) + + def update(self, other, join: str = "left", overwrite=True, filter_func=None): + other = other if isinstance(other, DataFrame) else DataFrame(other) + if join != "left": + raise ValueError("Only 'left' join supported for update") + + if filter_func is not None: # Will always take other if possible + + def update_func( + left: bigframes.series.Series, right: bigframes.series.Series + ) -> bigframes.series.Series: + return left.mask(right.notna() & filter_func(left), right) + + elif overwrite: + + def update_func( + left: bigframes.series.Series, right: bigframes.series.Series + ) -> bigframes.series.Series: + return left.mask(right.notna(), right) + + else: + + def update_func( + left: bigframes.series.Series, right: bigframes.series.Series + ) -> bigframes.series.Series: + return left.mask(left.isna(), right) + + result = self.combine(other, update_func, how=join) + + self._set_block(result._block) + def combine( self, other: DataFrame, @@ -753,56 +809,31 @@ def combine( ], fill_value=None, overwrite: bool = True, + *, + how: str = "outer", ) -> DataFrame: - # Join rows - joined_index, (get_column_left, get_column_right) = self._block.index.join( - other._block.index, how="outer" - ) - columns, lcol_indexer, rcol_indexer = self.columns.join( - other.columns, how="outer", return_indexers=True - ) + l_aligned, r_aligned = block_ops.align(self._block, other._block, join=how) - column_indices = zip( - lcol_indexer if (lcol_indexer is not None) else range(len(columns)), - rcol_indexer if (lcol_indexer is not None) else range(len(columns)), + other_missing_labels = self._block.column_labels.difference( + other._block.column_labels ) - block = joined_index._block + l_frame = DataFrame(l_aligned) + r_frame = DataFrame(r_aligned) results = [] - for left_index, 
right_index in column_indices: - if left_index >= 0 and right_index >= 0: # -1 indices indicate missing - left_col_id = get_column_left(self._block.value_columns[left_index]) - right_col_id = get_column_right(other._block.value_columns[right_index]) - left_series = bigframes.series.Series(block.select_column(left_col_id)) - right_series = bigframes.series.Series( - block.select_column(right_col_id) - ) + for (label, lseries), (_, rseries) in zip(l_frame.items(), r_frame.items()): + if not ((label in other_missing_labels) and not overwrite): if fill_value is not None: - left_series = left_series.fillna(fill_value) - right_series = right_series.fillna(fill_value) - results.append(func(left_series, right_series)) - elif left_index >= 0: - # Does not exist in other - if overwrite: - dtype = self.dtypes[left_index] - block, null_col_id = block.create_constant(None, dtype=dtype) - result = bigframes.series.Series(block.select_column(null_col_id)) - results.append(result) + result = func( + lseries.fillna(fill_value), rseries.fillna(fill_value) + ) else: - left_col_id = get_column_left(self._block.value_columns[left_index]) - result = bigframes.series.Series(block.select_column(left_col_id)) - if fill_value is not None: - result = result.fillna(fill_value) - results.append(result) - elif right_index >= 0: - right_col_id = get_column_right(other._block.value_columns[right_index]) - result = bigframes.series.Series(block.select_column(right_col_id)) - if fill_value is not None: - result = result.fillna(fill_value) - results.append(result) + result = func(lseries, rseries) else: - # Should not be possible - raise ValueError("No right or left index.") + result = ( + lseries.fillna(fill_value) if fill_value is not None else lseries + ) + results.append(result) if all([isinstance(val, bigframes.series.Series) for val in results]): import bigframes.core.reshape as rs @@ -1611,6 +1642,12 @@ def agg( aggregate = agg + def idxmin(self) -> bigframes.series.Series: + return bigframes.series.Series(block_ops.idxmin(self._block)) + + def idxmax(self) -> bigframes.series.Series: + return bigframes.series.Series(block_ops.idxmax(self._block)) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: @@ -1682,6 +1719,27 @@ def stack(self): return bigframes.series.Series(result_block) return DataFrame(result_block) + def unstack(self): + block = self._block + # Special case, unstack with mono-index transpose into a series + if self.index.nlevels == 1: + block = block.stack( + how="right", dropna=False, sort=False, levels=self.columns.nlevels + ) + return bigframes.series.Series(block) + + # Pivot by last level of index + index_ids = block.index_columns + block = block.reset_index(drop=False) + block = block.set_index(index_ids[:-1]) + + pivot_block = block.pivot( + columns=[index_ids[-1]], + values=self._block.value_columns, + values_in_index=True, + ) + return DataFrame(pivot_block) + def _drop_non_numeric(self, keep_bool=True) -> DataFrame: types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) if not keep_bool: @@ -1734,12 +1792,10 @@ def merge( ] = "inner", # TODO(garrettwu): Currently can take inner, outer, left and right. To support # cross joins - # TODO(garrettwu): Support "on" list of columns and None. 
Currently a single - # column must be provided - on: Optional[str] = None, + on: Union[blocks.Label, Sequence[blocks.Label], None] = None, *, - left_on: Optional[str] = None, - right_on: Optional[str] = None, + left_on: Union[blocks.Label, Sequence[blocks.Label], None] = None, + right_on: Union[blocks.Label, Sequence[blocks.Label], None] = None, sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> DataFrame: @@ -1753,97 +1809,41 @@ def merge( ) left_on, right_on = on, on - left = self - left_on_sql = self._sql_names(left_on) - # 0 elements already throws an exception - if len(left_on_sql) > 1: - raise ValueError(f"The column label {left_on} is not unique.") - left_on_sql = left_on_sql[0] - - right_on_sql = right._sql_names(right_on) - if len(right_on_sql) > 1: - raise ValueError(f"The column label {right_on} is not unique.") - right_on_sql = right_on_sql[0] - - ( - joined_expr, - join_key_ids, - (get_column_left, get_column_right), - ) = joins.join_by_column( - left._block.expr, - [left_on_sql], - right._block.expr, - [right_on_sql], - how=how, - sort=sort, - # In merging on the same column, it only returns 1 key column from coalesced both. - # While if 2 different columns, both will be presented in the result. - coalesce_join_keys=(left_on == right_on), - ) - # TODO(swast): Add suffixes to the column labels instead of reusing the - # column IDs as the new labels. - # Drop the index column(s) to be consistent with pandas. - left_columns = [ - join_key_ids[0] if (col_id == left_on_sql) else get_column_left(col_id) - for col_id in left._block.value_columns - ] - - right_columns = [] - for col_id in right._block.value_columns: - if col_id == right_on_sql: - # When left_on == right_on - if len(join_key_ids) > 1: - right_columns.append(join_key_ids[1]) - else: - right_columns.append(get_column_right(col_id)) - - expr = joined_expr.select_columns([*left_columns, *right_columns]) - labels = self._get_merged_col_labels( - right, left_on=left_on, right_on=right_on, suffixes=suffixes - ) + if utils.is_list_like(left_on): + left_on = list(left_on) # type: ignore + else: + left_on = [left_on] - # Constructs default index - expr, offset_index_id = expr.promote_offsets() - block = blocks.Block( - expr, index_columns=[offset_index_id], column_labels=labels + if utils.is_list_like(right_on): + right_on = list(right_on) # type: ignore + else: + right_on = [right_on] + + left_join_ids = [] + for label in left_on: # type: ignore + left_col_id = self._resolve_label_exact(label) + # 0 elements already throws an exception + if not left_col_id: + raise ValueError(f"No column {label} found in self.") + left_join_ids.append(left_col_id) + + right_join_ids = [] + for label in right_on: # type: ignore + right_col_id = right._resolve_label_exact(label) + if not right_col_id: + raise ValueError(f"No column {label} found in other.") + right_join_ids.append(right_col_id) + + block = self._block.merge( + right._block, + how, + left_join_ids, + right_join_ids, + sort=sort, + suffixes=suffixes, ) return DataFrame(block) - def _get_merged_col_labels( - self, - right: DataFrame, - left_on: str, - right_on: str, - suffixes: tuple[str, str] = ("_x", "_y"), - ) -> List[blocks.Label]: - on_col_equal = left_on == right_on - - left_col_labels: list[blocks.Label] = [] - for col_label in self._block.column_labels: - if col_label in right._block.column_labels: - if on_col_equal and col_label == left_on: - # Merging on the same column only returns 1 key column from coalesce both. - # Take the left key column. 
- left_col_labels.append(col_label) - else: - left_col_labels.append(str(col_label) + suffixes[0]) - else: - left_col_labels.append(col_label) - - right_col_labels: list[blocks.Label] = [] - for col_label in right._block.column_labels: - if col_label in self._block.column_labels: - if on_col_equal and col_label == left_on: - # Merging on the same column only returns 1 key column from coalesce both. - # Pass the right key column. - pass - else: - right_col_labels.append(str(col_label) + suffixes[1]) - else: - right_col_labels.append(col_label) - - return left_col_labels + right_col_labels - def join( self, other: DataFrame, *, on: Optional[str] = None, how: str = "left" ) -> DataFrame: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 59d3007fab..46a7a1cb50 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -84,10 +84,10 @@ BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), + (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.float64, pd.Float64Dtype()), (ibis_dtypes.int64, pd.Int64Dtype()), (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), ( @@ -100,6 +100,19 @@ pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } +IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { + ibis_dtypes.boolean: pa.bool_(), + ibis_dtypes.date: pa.date32(), + ibis_dtypes.float64: pa.float64(), + ibis_dtypes.int64: pa.int64(), + ibis_dtypes.string: pa.string(), + ibis_dtypes.time: pa.time64("us"), + ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), + ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), +} + +ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} + IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -148,11 +161,12 @@ def ibis_dtype_to_bigframes_dtype( # Special cases: Ibis supports variations on these types, but currently # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) - if isinstance(ibis_dtype, ibis_dtypes.Array) or isinstance( - ibis_dtype, ibis_dtypes.Struct - ): + if isinstance(ibis_dtype, ibis_dtypes.Array): return np.dtype("O") + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -164,6 +178,26 @@ def ibis_dtype_to_bigframes_dtype( ) +def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pa.struct( + [ + (name, ibis_dtype_to_arrow_dtype(dtype)) + for name, dtype in ibis_dtype.fields.items() + ] + ) + + if ibis_dtype in IBIS_TO_ARROW: + return IBIS_TO_ARROW[ibis_dtype] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" + ) + + def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: """Converts an Ibis expression to canonical type. 
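# With the dtype hunks above, STRUCT values surface as ``pd.ArrowDtype``
# columns instead of plain objects, and (per the changelog entry earlier in
# this diff) child fields can be pulled out with ``Series.struct.field``. A
# small sketch of the dtype in question, pure pandas/pyarrow, no BigQuery
# involved:
import pandas as pd
import pyarrow as pa

struct_type = pa.struct([("name", pa.string()), ("age", pa.int64())])
s = pd.Series(
    [{"name": "ada", "age": 36}, {"name": "bob", "age": 41}],
    dtype=pd.ArrowDtype(struct_type),
)
print(s.dtype)  # struct<name: string, age: int64>[pyarrow]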
@@ -187,6 +221,24 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: return table.select(*casted_columns) +def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if pa.types.is_struct(arrow_dtype): + struct_dtype = typing.cast(pa.StructType, arrow_dtype) + return ibis_dtypes.Struct.from_tuples( + [ + (field.name, arrow_dtype_to_ibis_dtype(field.type)) + for field in struct_dtype + ] + ) + + if arrow_dtype in ARROW_TO_IBIS: + return ARROW_TO_IBIS[arrow_dtype] + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" + ) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: @@ -202,6 +254,9 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ + if isinstance(bigframes_dtype, pd.ArrowDtype): + return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + type_string = str(bigframes_dtype) if type_string in BIGFRAMES_STRING_TO_BIGFRAMES: bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[ diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 9effbf1968..bf046ff691 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -31,6 +31,7 @@ preprocessing.StandardScaler, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ] @@ -91,18 +92,24 @@ def transformers_( return result - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, + columns: List[str], + X: bpd.DataFrame, + ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: columns (List[str]): a list of column names to transform + X (bpd.DataFrame): + The Dataframe with training data. Returns: a list of tuples of (sql_expression, output_name)""" return [ - transformer._compile_to_sql([column])[0] + transformer._compile_to_sql([column], X=X)[0] for column in columns for _, transformer, target_column in self.transformers_ if column == target_column @@ -115,7 +122,7 @@ def fit( ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] self._bqml_model = self._bqml_model_factory.create_model( diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 110cbcf493..443b9e7be6 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -17,6 +17,7 @@ https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection.""" +import typing from typing import List, Union from bigframes.ml import utils @@ -79,9 +80,10 @@ def train_test_split( train_index = split_dfs[0].index test_index = split_dfs[1].index - split_dfs += [ - df.loc[index] for df in dfs[1:] for index in (train_index, test_index) - ] + split_dfs += typing.cast( + List[bpd.DataFrame], + [df.loc[index] for df in dfs[1:] for index in (train_index, test_index)], + ) # convert back to Series. 
results: List[Union[bpd.DataFrame, bpd.Series]] = [] diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index ac02c39112..ad0b3fae11 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -52,6 +52,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ), ): @@ -93,7 +94,7 @@ def fit( ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._transform._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._transform._compile_to_sql(X.columns.tolist(), X=X) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] if y is not None: @@ -151,6 +152,7 @@ def _extract_as_column_transformer( preprocessing.StandardScaler, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ], Union[str, List[str]], @@ -190,6 +192,13 @@ def _extract_as_column_transformer( *preprocessing.MinMaxScaler._parse_from_sql(transform_sql), ) ) + elif transform_sql.startswith("ML.BUCKETIZE"): + transformers.append( + ( + "k_bins_discretizer", + *preprocessing.KBinsDiscretizer._parse_from_sql(transform_sql), + ) + ) elif transform_sql.startswith("ML.LABEL_ENCODER"): transformers.append( ( @@ -213,6 +222,7 @@ def _merge_column_transformer( preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ]: """Try to merge the column transformer to a simple transformer.""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index caf4657a63..5f44d40218 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -23,6 +23,7 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data +import third_party.bigframes_vendored.sklearn.preprocessing._discretization import third_party.bigframes_vendored.sklearn.preprocessing._encoder import third_party.bigframes_vendored.sklearn.preprocessing._label @@ -44,12 +45,15 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is StandardScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -124,12 +128,15 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is MaxAbsScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform. + X (default None): + Ignored. 
Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -204,12 +211,15 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -267,6 +277,124 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +class KBinsDiscretizer( + base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer, +): + __doc__ = ( + third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer.__doc__ + ) + + def __init__( + self, + n_bins: int = 5, + strategy: Literal["uniform", "quantile"] = "quantile", + ): + if strategy != "uniform": + raise NotImplementedError( + f"Only strategy = 'uniform' is supported now, input is {strategy}." + ) + if n_bins < 2: + raise ValueError( + f"n_bins has to be larger than or equal to 2, input is {n_bins}." + ) + self.n_bins = n_bins + self.strategy = strategy + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is KBinsDiscretizer + and self.n_bins == other.n_bins + and self._bqml_model == other._bqml_model + ) + + def _compile_to_sql( + self, + columns: List[str], + X: bpd.DataFrame, + ) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: + a list of column names to transform + X: + The Dataframe with training data. + + Returns: a list of tuples of (sql_expression, output_name)""" + array_split_points = {} + if self.strategy == "uniform": + for column in columns: + min_value = X[column].min() + max_value = X[column].max() + bin_size = (max_value - min_value) / self.n_bins + array_split_points[column] = [ + min_value + i * bin_size for i in range(self.n_bins - 1) + ] + + return [ + ( + self._base_sql_generator.ml_bucketize( + column, array_split_points[column], f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: + """Parse SQL to tuple(KBinsDiscretizer, column_label). 
+ + Args: + sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()" + + Returns: + tuple(KBinsDiscretizer, column_label)""" + s = sql[sql.find("(") + 1 : sql.find(")")] + array_split_points = s[s.find("[") + 1 : s.find("]")] + col_label = s[: s.find(",")] + n_bins = array_split_points.count(",") + 2 + return cls(n_bins, "uniform"), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> KBinsDiscretizer: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # The schema of TRANSFORM output is not available in the model API, so save it during fitting + self._output_names = [name for _, name in compiled_transforms] + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + class OneHotEncoder( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, @@ -308,13 +436,15 @@ def __eq__(self, other: Any) -> bool: and self.max_categories == other.max_categories ) - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" @@ -432,13 +562,15 @@ def __eq__(self, other: Any) -> bool: and self.max_categories == other.max_categories ) - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. 
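The new `KBinsDiscretizer` supports only `strategy="uniform"` and requires `n_bins >= 2`; bin boundaries are computed from each column's min and max in the training frame, which is why `_compile_to_sql` now receives `X`. A usage sketch combining it with `ColumnTransformer`, using the public Iris table purely as sample data:

```
import bigframes.pandas as bpd
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import KBinsDiscretizer, StandardScaler

df = bpd.read_gbq("bigquery-public-data.ml_datasets.iris")
features = df[["sepal_length", "sepal_width", "petal_length"]]

transformer = ColumnTransformer(
    [
        ("scale", StandardScaler(), ["sepal_length", "sepal_width"]),
        ("bin", KBinsDiscretizer(n_bins=4, strategy="uniform"), ["petal_length"]),
    ]
)
transformer.fit(features)
binned = transformer.transform(features)
```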
Returns: a list of tuples of (sql_expression, output_name)""" diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 57c8ba672a..601b271099 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -85,6 +85,15 @@ def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MIN_MAX_SCALER for BQML""" return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_bucketize( + self, + numeric_expr_sql: str, + array_split_points: Iterable[Union[int, float]], + name: str, + ) -> str: + """Encode ML.MIN_MAX_SCALER for BQML""" + return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index add6af57f4..51eaad18b9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -86,7 +86,15 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if pd_dataframe.size < MAX_INLINE_SERIES_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_SERIES_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pd.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py new file mode 100644 index 0000000000..80d51115d0 --- /dev/null +++ b/bigframes/operations/structs.py @@ -0,0 +1,61 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
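The `ML.BUCKETIZE` expression used by `KBinsDiscretizer` comes from the `ml_bucketize` helper above. The SQL generator objects are internal, so this is only a sketch of the string it produces:

```
from bigframes.ml import globals

sql_generator = globals.base_sql_generator()
sql = sql_generator.ml_bucketize(
    "sepal_length", [4.5, 5.5, 6.5], "kbinsdiscretizer_sepal_length"
)
# -> "ML.BUCKETIZE(sepal_length, [4.5, 5.5, 6.5], FALSE) AS kbinsdiscretizer_sepal_length"
```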
+ +from __future__ import annotations + +import typing + +import ibis.expr.types as ibis_types + +import bigframes.dataframe +import bigframes.operations +import bigframes.operations.base +import bigframes.series +import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + + +class StructField(bigframes.operations.UnaryOp): + def __init__(self, name_or_index: str | int): + self._name_or_index = name_or_index + + def _as_ibis(self, x: ibis_types.Value): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(self._name_or_index, str): + name = self._name_or_index + else: + name = struct_value.names[self._name_or_index] + return struct_value[name].name(name) + + +class StructAccessor( + bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor +): + __doc__ = vendoracessors.StructAccessor.__doc__ + + def field(self, name_or_index: str | int) -> bigframes.series.Series: + series = self._apply_unary_op(StructField(name_or_index)) + if isinstance(name_or_index, str): + name = name_or_index + else: + struct_field = self._dtype.pyarrow_dtype[name_or_index] + name = struct_field.name + return series.rename(name) + + def explode(self) -> bigframes.dataframe.DataFrame: + import bigframes.pandas + + pa_type = self._dtype.pyarrow_dtype + return bigframes.pandas.concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/bigframes/series.py b/bigframes/series.py index 47298d59f5..8815a6abde 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.operations.base import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings +import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] @@ -118,6 +119,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + @property + def struct(self) -> structs.StructAccessor: + return structs.StructAccessor(self._block) + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job @@ -882,6 +887,34 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + def idxmax(self) -> blocks.Label: + block = self._block.order_by( + [ + OrderingColumnReference( + self._value_column, direction=OrderingDirection.DESC + ), + *[ + OrderingColumnReference(idx_col) + for idx_col in self._block.index_columns + ], + ] + ) + block = block.slice(0, 1) + return indexes.Index._from_block(block).to_pandas()[0] + + def idxmin(self) -> blocks.Label: + block = self._block.order_by( + [ + OrderingColumnReference(self._value_column), + *[ + OrderingColumnReference(idx_col) + for idx_col in self._block.index_columns + ], + ] + ) + block = block.slice(0, 1) + return indexes.Index._from_block(block).to_pandas()[0] + @property def is_monotonic_increasing(self) -> bool: return typing.cast( diff --git a/bigframes/session.py b/bigframes/session.py index 7b827c7dcf..ac48c977cb 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -449,13 +449,6 @@ def _query_to_destination( index_cols: List[str], api_name: str, ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: - # If there are no index columns, then there's no reason to cache to a - # (clustered) session table, as we'll just have to query it again to - # create a default index & ordering. 
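The `Series.struct` accessor and the new `idxmax`/`idxmin` methods above can be exercised as in this sketch; the inline query is a made-up example of a STRUCT column:

```
import bigframes.pandas as bpd

df = bpd.read_gbq(
    "SELECT 1 AS id, STRUCT('Seattle' AS city, 737015 AS population) AS place"
)
city = df["place"].struct.field("city")   # Series named "city"
flat = df["place"].struct.explode()       # one column per struct sub-field

lengths = bpd.read_gbq("bigquery-public-data.ml_datasets.iris")["sepal_length"]
largest_label = lengths.idxmax()          # index label of the largest value
```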
- if not index_cols: - _, query_job = self._start_query(query) - return query_job.destination, query_job - # If a dry_run indicates this is not a query type job, then don't # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. dry_run_config = bigquery.QueryJobConfig() @@ -465,15 +458,24 @@ def _query_to_destination( _, query_job = self._start_query(query) return query_job.destination, query_job - # Make sure we cluster by the index column(s) so that subsequent - # operations are as speedy as they can be. + # Create a table to workaround BigQuery 10 GB query results limit. See: + # internal issue 303057336. + # Since we have a `statement_type == 'SELECT'`, schema should be populated. + schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema) + temp_table = self._create_session_table_empty(api_name, schema, index_cols) + + job_config = bigquery.QueryJobConfig() + job_config.destination = temp_table + try: - ibis_expr = self.ibis_client.sql(query) - return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None + # Write to temp table to workaround BigQuery 10 GB query results + # limit. See: internal issue 303057336. + _, query_job = self._start_query(query, job_config=job_config) + return query_job.destination, query_job except google.api_core.exceptions.BadRequest: - # Some SELECT statements still aren't compatible with CREATE TEMP - # TABLE ... AS SELECT ... statements. For example, if the query has - # a top-level ORDER BY, this conflicts with our ability to cluster + # Some SELECT statements still aren't compatible with cluster + # tables as the destination. For example, if the query has a + # top-level ORDER BY, this conflicts with our ability to cluster # the table by the index column(s). _, query_job = self._start_query(query) return query_job.destination, query_job @@ -1231,6 +1233,54 @@ def _create_session_table(self) -> bigquery.TableReference: ) return dataset.table(table_name) + def _create_session_table_empty( + self, + api_name: str, + schema: Iterable[bigquery.SchemaField], + cluster_cols: List[str], + ) -> bigquery.TableReference: + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + table = self._create_session_table() + schema_sql = bigframes_io.bq_schema_to_sql(schema) + + clusterable_cols = [ + col.name + for col in schema + if col.name in cluster_cols and _can_cluster_bq(col) + ][:_MAX_CLUSTER_COLUMNS] + + if clusterable_cols: + cluster_cols_sql = ", ".join( + f"`{cluster_col}`" for cluster_col in clusterable_cols + ) + cluster_sql = f"CLUSTER BY {cluster_cols_sql}" + else: + cluster_sql = "" + + ddl_text = f""" + CREATE TEMP TABLE + `_SESSION`.`{table.table_id}` + ({schema_sql}) + {cluster_sql} + """ + + job_config = bigquery.QueryJobConfig() + + # Include a label so that Dataplex Lineage can identify temporary + # tables that BigQuery DataFrames creates. Googlers: See internal issue + # 296779699. We're labeling the job instead of the table because + # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not + # supported`. + job_config.labels = {"source": "bigquery-dataframes-temp"} + job_config.labels["bigframes-api"] = api_name + + _, query_job = self._start_query(ddl_text, job_config=job_config) + + # Use fully-qualified name instead of `_SESSION` name so that the + # created table can be used as the destination table. 
+ return query_job.destination + def _create_sequential_ordering( self, table: ibis_types.Table, @@ -1249,7 +1299,9 @@ def _create_sequential_ordering( cluster_cols=list(index_cols) + [default_ordering_name], api_name=api_name, ) - table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`") + table = self.ibis_client.table( + f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" + ) ordering_reference = core.OrderingColumnReference(default_ordering_name) ordering = core.ExpressionOrdering( ordering_value_columns=[ordering_reference], @@ -1264,55 +1316,13 @@ def _ibis_to_session_table( cluster_cols: Iterable[str], api_name: str, ) -> bigquery.TableReference: - clusterable_cols = [ - col for col in cluster_cols if _can_cluster(table[col].type()) - ][:_MAX_CLUSTER_COLUMNS] - return self._query_to_session_table( + desination, _ = self._query_to_destination( self.ibis_client.compile(table), - cluster_cols=clusterable_cols, + index_cols=list(cluster_cols), api_name=api_name, ) - - def _query_to_session_table( - self, - query_text: str, - cluster_cols: Iterable[str], - api_name: str, - ) -> bigquery.TableReference: - if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS: - raise ValueError( - f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed." - ) - # Can't set a table in _SESSION as destination via query job API, so we - # run DDL, instead. - table = self._create_session_table() - cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols) - - # TODO(swast): This might not support multi-statement SQL queries (scripts). - ddl_text = f""" - CREATE TEMP TABLE `_SESSION`.`{table.table_id}` - CLUSTER BY {cluster_cols_sql} - AS {query_text} - """ - - job_config = bigquery.QueryJobConfig() - - # Include a label so that Dataplex Lineage can identify temporary - # tables that BigQuery DataFrames creates. Googlers: See internal issue - # 296779699. We're labeling the job instead of the table because - # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not - # supported`. - job_config.labels = {"source": "bigquery-dataframes-temp"} - job_config.labels["bigframes-api"] = api_name - - try: - self._start_query( - ddl_text, job_config=job_config - ) # Wait for the job to complete - except google.api_core.exceptions.Conflict: - # Allow query retry to succeed. - pass - return table + # There should always be a destination table for this query type. + return typing.cast(bigquery.TableReference, desination) def remote_function( self, @@ -1494,14 +1504,21 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Sessi return Session(context) -def _can_cluster(ibis_type: ibis_dtypes.DataType): +def _can_cluster_bq(field: bigquery.SchemaField): # https://cloud.google.com/bigquery/docs/clustered-tables # Notably, float is excluded - return ( - ibis_type.is_integer() - or ibis_type.is_string() - or ibis_type.is_decimal() - or ibis_type.is_date() - or ibis_type.is_timestamp() - or ibis_type.is_boolean() + type_ = field.field_type + return type_ in ( + "INTEGER", + "INT64", + "STRING", + "NUMERIC", + "DECIMAL", + "BIGNUMERIC", + "BIGDECIMAL", + "DATE", + "DATETIME", + "TIMESTAMP", + "BOOL", + "BOOLEAN", ) diff --git a/bigframes/version.py b/bigframes/version.py index ad3c3082c5..238b64473a 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
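`_query_to_destination` now materializes query results into a session-owned temporary table, clustered by the index columns where their types allow, to work around the 10 GB query-results limit. Clustering eligibility is decided by `_can_cluster_bq`; a sketch against this private helper, for illustration only:

```
from google.cloud import bigquery

from bigframes.session import _can_cluster_bq

_can_cluster_bq(bigquery.SchemaField("user_id", "INT64"))   # True: integer columns can be clustered
_can_cluster_bq(bigquery.SchemaField("score", "FLOAT64"))   # False: float columns cannot be clustered
```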
-__version__ = "0.5.0" +__version__ = "0.6.0" diff --git a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb new file mode 100644 index 0000000000..598d958f0c --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb @@ -0,0 +1,723 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a pytorch model with Vertex AI SDK 2.0 and Bigframes\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a pytorch model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes\n", + "! pip3 install --upgrade --quiet torch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. 
Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import torch\n", + "import vertexai\n", + "from vertexai.preview import VertexModel\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-pytorch\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b44cdc4e03f1" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"species\"] = df[\"species\"].map(species_categories)\n", + "\n", + "# Assign an index column name\n", + "index_col = \"index\"\n", + "df.index.name = index_col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cb8616b1997" + }, + "outputs": [], + "source": [ + "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", + "label_columns = df[[\"species\"]]\n", + "train_X, test_X, train_y, test_y = bf_train_test_split(\n", + " feature_columns, label_columns, test_size=0.2\n", + ")\n", + "\n", + "print(\"X_train size: \", train_X.size)\n", + "print(\"X_test size: \", test_X.size)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "23fe7b734b08" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5904a0f1bb03" + }, + "source": [ + "## PyTorch remote training with CPU (Custom PyTorch model)\n", + "\n", + "First, train a PyTorch model as a remote training job:\n", + "\n", + "- Reinitialize Vertex AI for remote training.\n", + "- Set TorchLogisticRegression for the remote training job.\n", + "- Invoke TorchLogisticRegression locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2a1b85195a17" + }, + "outputs": [], + "source": [ + "# define the custom model\n", + "class TorchLogisticRegression(VertexModel, torch.nn.Module):\n", + " def __init__(self, input_size: int, output_size: int):\n", + " torch.nn.Module.__init__(self)\n", + " VertexModel.__init__(self)\n", + " self.linear = torch.nn.Linear(input_size, output_size)\n", + " self.softmax = torch.nn.Softmax(dim=1)\n", + "\n", + " def forward(self, x):\n", + " return self.softmax(self.linear(x))\n", + "\n", + " @vertexai.preview.developer.mark.train()\n", + " def train(self, X, y, num_epochs, lr):\n", + " X = X.to(torch.float32)\n", + " y = torch.flatten(y) # necessary to get 1D tensor\n", + " dataloader = torch.utils.data.DataLoader(\n", + " torch.utils.data.TensorDataset(X, y),\n", + " batch_size=10,\n", + " shuffle=True,\n", + " generator=torch.Generator(device=X.device),\n", + " )\n", + "\n", + " criterion = torch.nn.CrossEntropyLoss()\n", + " optimizer = torch.optim.SGD(self.parameters(), lr=lr)\n", + "\n", + " for t in range(num_epochs):\n", + " for batch, (X, y) in enumerate(dataloader):\n", + " optimizer.zero_grad()\n", + " pred = self(X)\n", + " loss = criterion(pred, y)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " @vertexai.preview.developer.mark.predict()\n", + " def predict(self, X):\n", + " X = torch.tensor(X).to(torch.float32)\n", + " with torch.no_grad():\n", + " pred = torch.argmax(self(X), dim=1)\n", + " return pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4e35593f520a" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "# Instantiate model\n", + "model = TorchLogisticRegression(4, 3)\n", + "\n", + "# Set training config\n", + 
"model.train.vertex.remote_config.custom_commands = [\n", + " \"pip install torchdata\",\n", + " \"pip install torcharrow\",\n", + "]\n", + "model.train.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-model\"\n", + "model.train.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Train model on Vertex\n", + "model.train(train_X, train_y, num_epochs=200, lr=0.05)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edf4d0708f02" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "42dfbff0ca15" + }, + "outputs": [], + "source": [ + "vertexai.preview.init(remote=True)\n", + "\n", + "# Set remote config\n", + "model.predict.vertex.remote_config.custom_commands = [\n", + " \"pip install torchdata\",\n", + " \"pip install torcharrow\",\n", + "]\n", + "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-predict\"\n", + "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "predictions = model.predict(test_X)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4340ed8316cd" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Evaluate model results locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eb27a31cec6f" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to torch tensor for local evaluation\n", + "train_X_tensor = torch.from_numpy(\n", + " train_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "train_y_tensor = torch.from_numpy(\n", + " train_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "\n", + "test_X_tensor = torch.from_numpy(\n", + " test_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "test_y_tensor = torch.from_numpy(\n", + " test_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7db44ad81389" + }, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Switch to local mode for evaluation\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's accuracy score\n", + "print(\n", + " f\"Train accuracy: {accuracy_score(train_y_tensor, model.predict(train_X_tensor))}\"\n", + ")\n", + "\n", + "print(f\"Test accuracy: {accuracy_score(test_y_tensor, model.predict(test_X_tensor))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! 
gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_pytorch.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb new file mode 100644 index 0000000000..021c070753 --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a scikit-learn model with Vertex AI SDK 2.0 and Bigframes\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a scikit-learn model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. 
Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import vertexai\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-sklearn\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b44cdc4e03f1" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"species\"] = df[\"species\"].map(species_categories)\n", + "\n", + "# Assign an index column name\n", + "index_col = \"index\"\n", + "df.index.name = index_col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cb8616b1997" + }, + "outputs": [], + "source": [ + "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", + "label_columns = df[[\"species\"]]\n", + "train_X, test_X, train_y, test_y = bf_train_test_split(\n", + " feature_columns, label_columns, test_size=0.2\n", + ")\n", + "\n", + "print(\"X_train size: \", train_X.size)\n", + "print(\"X_test size: \", test_X.size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8306545fcc57" + }, + "source": [ + "## Feature transformation\n", + "\n", + "Next, you do feature transformations on the data using the Vertex AI remote training service.\n", + "\n", + "First, you re-initialize Vertex AI to enable remote training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "55e701c31036" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4a0e9d59b273" + }, + "source": [ + "### Execute remote job for fit_transform() on training data\n", + "\n", + "Next, indicate that the `StandardScalar` class is to be executed remotely. Then set up the data transform and call the `fit_transform()` method is executed remotely." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "90333089d362" + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# Wrap classes to enable Vertex remote execution\n", + "StandardScaler = vertexai.preview.remote(StandardScaler)\n", + "\n", + "# Instantiate transformer\n", + "transformer = StandardScaler()\n", + "\n", + "# Set training config\n", + "transformer.fit_transform.vertex.remote_config.display_name = (\n", + " f\"{REMOTE_JOB_NAME}-fit-transformer-bigframes\"\n", + ")\n", + "transformer.fit_transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Execute transformer on Vertex (train_X is bigframes.dataframe.DataFrame, X_train is np.array)\n", + "X_train = transformer.fit_transform(train_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6bf95574c907" + }, + "source": [ + "### Remote transform on test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "da6eea22a89a" + }, + "outputs": [], + "source": [ + "# Transform test dataset before calculate test score\n", + "transformer.transform.vertex.remote_config.display_name = (\n", + " REMOTE_JOB_NAME + \"-transformer\"\n", + ")\n", + "transformer.transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Execute transformer on Vertex (test_X is bigframes.dataframe.DataFrame, X_test is np.array)\n", + "X_test = transformer.transform(test_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ddf906c886e4" + }, + "source": [ + "## Remote training\n", + "\n", + "First, train the scikit-learn model as a remote training job:\n", + "\n", + "- Set LogisticRegression for the remote training job.\n", + "- Invoke LogisticRegression locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c7b0116fa60c" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Wrap classes to enable Vertex remote execution\n", + "LogisticRegression = vertexai.preview.remote(LogisticRegression)\n", + "\n", + "# Instantiate model, warm_start=True for uptraining\n", + "model = LogisticRegression(warm_start=True)\n", + "\n", + "# Set training config\n", + "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-sklearn-model\"\n", + "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Train model on Vertex\n", + "model.fit(train_X, train_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ffe1d5903bcb" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d00ce35920fa" + }, + "outputs": [], + "source": [ + "# Remote evaluation\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "# Evaluate model's accuracy score\n", + "predictions = model.predict(test_X)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a8cd6cbd4403" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Score model results locally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dc105dafdfb9" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to pandas dataframe for local evaluation\n", + "train_X_pd = train_X.to_pandas().reset_index(drop=True)\n", + "train_y_pd = train_y.to_pandas().reset_index(drop=True)\n", + "\n", + "test_X_pd = test_X.to_pandas().reset_index(drop=True)\n", + "test_y_pd = test_y.to_pandas().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "25fec549de69" + }, + "outputs": [], + "source": [ + "# Switch to local mode for testing\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's accuracy score\n", + "print(f\"Train accuracy: {model.score(train_X_pd, train_y_pd)}\")\n", + "\n", + "print(f\"Test accuracy: {model.score(test_X_pd, test_y_pd)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_sklearn.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb new file mode 100644 index 0000000000..e6843b66b5 --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb @@ -0,0 +1,646 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a Tensorflow Keras model with Vertex AI SDK 2.0 and Bigframes \n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a tensorflow keras model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes\n", + "! pip3 install --upgrade --quiet tensorflow==2.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. 
Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import tensorflow as tf\n", + "import vertexai\n", + "from tensorflow import keras\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-tensorflow\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." 
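If you want to inspect the source table before building the training split, a quick preview is optional but cheap: the computation runs in BigQuery and only a few rows are pulled to the client. This sketch only uses `bf` as imported above.

```
# Optional, illustrative preview of the source table before splitting.
df_preview = bf.read_gbq("bigquery-public-data.ml_datasets.iris")

print(df_preview.shape)                 # (rows, columns), computed in BigQuery
print(df_preview.head(5).to_pandas())   # small client-side sample
```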
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "94576deccd8c" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"target\"] = df[\"species\"].map(species_categories)\n", + "df = df.drop(columns=[\"species\"])\n", + "\n", + "train, test = bf_train_test_split(df, test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cfcbce726efa" + }, + "source": [ + "## Remote training with GPU\n", + "\n", + "First, train a TensorFlow model as a remote training job:\n", + "\n", + "- Reinitialize Vertex AI for remote training.\n", + "- Instantiate the tensorflow keras model for the remote training job.\n", + "- Invoke the tensorflow keras model.fit() locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fd865b0c4e8b" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "keras.Sequential = vertexai.preview.remote(keras.Sequential)\n", + "\n", + "# Instantiate model\n", + "model = keras.Sequential(\n", + " [keras.layers.Dense(5, input_shape=(4,)), keras.layers.Softmax()]\n", + ")\n", + "\n", + "# Specify optimizer and loss function\n", + "model.compile(optimizer=\"adam\", loss=\"mean_squared_error\")\n", + "\n", + "# Set training config\n", + "model.fit.vertex.remote_config.enable_cuda = True\n", + "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-model-gpu\"\n", + "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "model.fit.vertex.remote_config.custom_commands = [\"pip install tensorflow-io==0.32.0\"]\n", + "\n", + "# Manually set compute resources this time\n", + "model.fit.vertex.remote_config.machine_type = \"n1-highmem-4\"\n", + "model.fit.vertex.remote_config.accelerator_type = \"NVIDIA_TESLA_K80\"\n", + "model.fit.vertex.remote_config.accelerator_count = 4\n", + "\n", + "# Train model on Vertex\n", + "model.fit(train, epochs=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f1af94ac1477" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1d75879948b5" + }, + "outputs": [], + "source": [ + "vertexai.preview.init(remote=True)\n", + "\n", + "# Set remote config\n", + "model.predict.vertex.remote_config.enable_cuda = False\n", + "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-predict-cpu\"\n", + "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "model.predict.vertex.remote_config.custom_commands = [\n", + " \"pip install tensorflow-io==0.32.0\"\n", + "]\n", + "\n", + "predictions = model.predict(train)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "798b77c95067" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Evaluate model results locally." 
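Because the model is compiled with a mean-squared-error loss, `model.evaluate()` in the cells below reports only that loss. As an optional add-on, you could derive class predictions from the softmax outputs and compute a plain accuracy; this sketch assumes the numpy conversions below have been run and that Vertex AI is in local mode.

```
# Optional, illustrative: accuracy derived from the softmax outputs.
# Assumes `test_X_np` and `test_y_np` from the conversion cell below.
import numpy as np

probs = model.predict(test_X_np)                   # shape (n_samples, 5)
predicted_class = np.argmax(probs, axis=1)
accuracy = np.mean(predicted_class == test_y_np.ravel())
print(f"Test accuracy (argmax of softmax outputs): {accuracy:.3f}")
```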
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "88e734e30791" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to pandas dataframe for local evaluation\n", + "feature_columns = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", + "label_columns = [\"target\"]\n", + "\n", + "train_X_np = train[feature_columns].to_pandas().values.astype(float)\n", + "train_y_np = train[label_columns].to_pandas().values.astype(float)\n", + "train_ds = tf.data.Dataset.from_tensor_slices((train_X_np, train_y_np))\n", + "\n", + "test_X_np = test[feature_columns].to_pandas().values.astype(float)\n", + "test_y_np = test[label_columns].to_pandas().values.astype(float)\n", + "test_ds = tf.data.Dataset.from_tensor_slices((test_X_np, test_y_np))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cb8637f783ad" + }, + "outputs": [], + "source": [ + "# Switch to local mode for evaluation\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's mean square errors\n", + "print(f\"Train loss: {model.evaluate(train_ds.batch(32))}\")\n", + "\n", + "print(f\"Test loss: {model.evaluate(test_ds.batch(32))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_tensorflow.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..a113e1fcde 100644 --- a/noxfile.py +++ b/noxfile.py @@ -362,7 +362,7 @@ def doctest(session: nox.sessions.Session): run_system( session=session, prefix_name="doctest", - extra_pytest_options=("--doctest-modules",), + extra_pytest_options=("--doctest-modules", "third_party"), test_folder="bigframes", check_cov=True, ) @@ -610,6 +610,9 @@ def notebook(session): "notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb", "notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb", "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. 
"notebooks/experimental/longer_ml_demo.ipynb", diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 34a2ca0101..9294740dd6 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -580,6 +580,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -657,6 +662,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -696,9 +706,19 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + "culmen_length_mm", + ), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + "flipper_length_mm", + ), ] assert transformers == expected @@ -791,6 +811,32 @@ def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id): assert pl_loaded._estimator.fit_intercept is False +def test_pipeline_k_bins_discretizer_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ("transform", preprocessing.KBinsDiscretizer(strategy="uniform")), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_k_bins_discretizer", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.KBinsDiscretizer) + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): pl = pipeline.Pipeline( [ diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index ace943956f..f911dd7eeb 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -23,6 +23,7 @@ import bigframes from bigframes.ml import core +import tests.system.utils def test_model_eval( @@ -224,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index c71bbbe3b0..e31681f4a0 100644 --- a/tests/system/small/ml/test_decomposition.py +++ 
b/tests/system/small/ml/test_decomposition.py @@ -15,6 +15,7 @@ import pandas as pd from bigframes.ml import decomposition +import tests.system.utils def test_pca_predict(penguins_pca_model, new_penguins_df): @@ -129,7 +130,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False, @@ -148,7 +149,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index fc8f3251bd..45548acca3 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -121,7 +121,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. scaler = bigframes.ml.preprocessing.MaxAbsScaler() scaler.fit( penguins_df_default_index[ @@ -211,7 +211,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin pd.testing.assert_frame_equal(result, expected, rtol=1e-3) -def test_min_max_scaler_normalizeds_fit_transform(new_penguins_df): +def test_min_max_scaler_normalized_fit_transform(new_penguins_df): scaler = bigframes.ml.preprocessing.MinMaxScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] @@ -265,7 +265,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod. scaler = bigframes.ml.preprocessing.MinMaxScaler() scaler.fit( penguins_df_default_index[ @@ -304,6 +304,131 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + result = discretizer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... 
+ result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], + "kbinsdiscretizer_culmen_length_mm": ["bin_5", "bin_3", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_5", "bin_2", "bin_4"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_series_normalizes( + penguins_df_default_index, new_penguins_df +): + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer.fit(penguins_df_default_index["culmen_length_mm"]) + + result = discretizer.transform( + penguins_df_default_index["culmen_length_mm"] + ).to_pandas() + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = discretizer.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_normalizes_different_params( + penguins_df_default_index, new_penguins_df +): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer( + n_bins=6, strategy="uniform" + ) + discretizer.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = discretizer.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. 
Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index adf17848ee..b8616a54d6 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -757,7 +757,7 @@ def test_df_isin_dict(scalars_dfs): ("right",), ], ) -def test_merge(scalars_dfs, merge_how): +def test_df_merge(scalars_dfs, merge_how): scalars_df, scalars_pandas_df = scalars_dfs on = "rowindex_2" left_columns = ["int64_col", "float64_col", "rowindex_2"] @@ -782,6 +782,39 @@ def test_merge(scalars_dfs, merge_how): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), + ], +) +def test_df_merge_multi_key(scalars_dfs, left_on, right_on): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "outer", + left_on=left_on, + right_on=right_on, + sort=True, + ) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + @pytest.mark.parametrize( ("merge_how",), [ @@ -884,7 +917,19 @@ def test_get_dtypes_array_struct(session): dtypes = df.dtypes pd.testing.assert_series_equal( dtypes, - pd.Series({"array_column": np.dtype("O"), "struct_column": np.dtype("O")}), + pd.Series( + { + "array_column": np.dtype("O"), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), ) @@ -1211,6 +1256,105 @@ def test_combine( pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("overwrite", "filter_func"), + [ + (True, None), + (False, None), + (True, lambda x: x.isna() | (x % 2 == 0)), + ], + ids=[ + "default", + "overwritefalse", + "customfilter", + ], +) +def test_df_update(overwrite, filter_func): + if pd.__version__.startswith("1."): + pytest.skip("dtype handled differently in pandas 1.x.") + index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") + index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) + pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) + + pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) + + +def test_df_idxmin(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmin().to_pandas() + pd_result = pd_df.idxmin() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_idxmax(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmax().to_pandas() + pd_result = pd_df.idxmax() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("join", "axis"), + [ + ("outer", None), + ("outer", 0), + ("outer", 1), + ("left", 0), + ("right", 1), + ("inner", None), + ("inner", 1), + ], +) +def test_df_align(join, axis): + index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") + index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) + pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) + + # Don't check dtype as pandas does unnecessary float conversion + 
pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + + def test_combine_first( scalars_df_index, scalars_df_2_index, @@ -1232,11 +1376,6 @@ def test_combine_first( pd_df_b.columns = ["b", "a", "d"] pd_result = pd_df_a.combine_first(pd_df_b) - print("pandas") - print(pd_result.to_string()) - print("bigframes") - print(bf_result.to_string()) - # Some dtype inconsistency for all-NULL columns pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @@ -1705,6 +1844,26 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_unstack(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = [ + "rowindex_2", + "int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas() + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("values", "index", "columns"), [ @@ -1922,7 +2081,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index bf_result = scalars_df_index.loc[index] pd_result = scalars_pandas_df_index.loc[index] pd.testing.assert_series_equal( - bf_result.to_pandas().iloc[0, :], + bf_result, pd_result, ) @@ -2439,6 +2598,24 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): ) +def test_iloc_list_multiindex(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df.iloc[index_list] + pd_result = scalars_pandas_df.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): index_list = [] diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1e38b47b4c..19f1c557ef 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -41,6 +41,17 @@ def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index(["bool_col", "int64_too"])[ + "float64_col" + ].idxmin() + pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"])[ + "float64_col" + ].idxmin() + + assert bf_result == pd_result + + def test_binop_series_series_matching_multi_indices( scalars_df_index, scalars_pandas_df_index ): @@ -729,6 +740,26 @@ def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index): ) +def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") + # Need resulting column to be pyarrow string rather than object dtype + 
level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.unstack().to_pandas() + # Shifting sort behavior in stack + pd_result = pd_df.unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + # Column ordering seems to depend on pandas version + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.skip(reason="Pandas fails in newer versions.") def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] @@ -866,6 +897,17 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_multi_index_unstack(hockey_df, hockey_pandas_df): + bf_result = ( + hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas() + ) + pd_result = hockey_pandas_df.set_index( + ["team_name", "season", "position"] + ).unstack() + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + def test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "string_col", "bool_col"] multi_columns = pandas.MultiIndex.from_tuples( diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 6510c4fa27..956b29ae12 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -75,7 +75,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( google.api_core.exceptions.NotFound, - match=f"404 Not found: Dataset {dataset_id_permanent} was not found in location {tokyo_location}", + match=dataset_id_permanent, ): read_method(query) @@ -100,7 +100,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( google.api_core.exceptions.NotFound, - match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US", + match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -146,7 +146,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Doing read_gbq* from a table in another location should fail with pytest.raises( google.api_core.exceptions.NotFound, - match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US", + match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -194,7 +194,7 @@ def test_read_gbq_must_comply_with_set_location_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( google.api_core.exceptions.NotFound, - match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US", + match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -244,7 +244,7 @@ def test_read_gbq_must_comply_with_set_location_non_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( google.api_core.exceptions.NotFound, - match=f"404 Not found: Dataset {dataset_id_permanent} was not found in location {tokyo_location}", + match=dataset_id_permanent, ): read_method(query) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 588dcc2c83..8c1c36720b 
100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -118,7 +118,7 @@ def test_series_get_with_default_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name].get(key) pd_result = scalars_pandas_df[col_name].get(key) - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result @pytest.mark.parametrize( @@ -157,7 +157,7 @@ def test_series___getitem___with_default_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name][key] pd_result = scalars_pandas_df[col_name][key] - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result @pytest.mark.parametrize( @@ -2468,6 +2468,18 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_series_idxmin(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.idxmin() + pd_result = scalars_pandas_df_index.string_col.idxmin() + assert bf_result == pd_result + + +def test_series_idxmax(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_too.idxmax() + pd_result = scalars_pandas_df_index.int64_too.idxmax() + assert bf_result == pd_result + + def test_getattr_attribute_error_when_pandas_has(scalars_df_index): # asof is implemented in pandas but not in bigframes with pytest.raises(AttributeError): @@ -2640,7 +2652,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index index = -2345 bf_result = scalars_df_index.date_col.loc[index] pd_result = scalars_pandas_df_index.date_col.loc[index] - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result def test_series_bool_interpretation_error(scalars_df_index): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 614c953764..53ddfa3c49 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -57,6 +57,7 @@ def test_read_gbq_tokyo( ), pytest.param( """SELECT + t.int64_col + 1 as my_ints, t.float64_col * 2 AS my_floats, CONCAT(t.string_col, "_2") AS my_strings, t.int64_col > 0 AS my_bools, @@ -321,11 +322,10 @@ def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default_index): - scalars_pandas_df_default_index["rowid"] = np.arange( - scalars_pandas_df_default_index.shape[0] - ) + pandas_df = scalars_pandas_df_default_index.copy() + pandas_df["rowid"] = np.arange(pandas_df.shape[0]) - df = session.read_pandas(scalars_pandas_df_default_index) + df = session.read_pandas(pandas_df) total_order_col = df._block._expr._ordering.total_order_col assert total_order_col and total_order_col.column_id == "rowid_2" diff --git a/tests/unit/core/test_io.py b/tests/unit/core/test_io.py index c5074f80c2..afb38a5f75 100644 --- a/tests/unit/core/test_io.py +++ b/tests/unit/core/test_io.py @@ -13,8 +13,10 @@ # limitations under the License. import datetime +from typing import Iterable import google.cloud.bigquery as bigquery +import pytest import bigframes.core.io @@ -47,3 +49,56 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): # Don't need the project ID for _SESSION tables. 
assert "my-test-project" not in sql + + +@pytest.mark.parametrize( + ("schema", "expected"), + ( + ( + [bigquery.SchemaField("My Column", "INTEGER")], + "`My Column` INT64", + ), + ( + [ + bigquery.SchemaField("My Column", "INTEGER"), + bigquery.SchemaField("Float Column", "FLOAT"), + bigquery.SchemaField("Bool Column", "BOOLEAN"), + ], + "`My Column` INT64, `Float Column` FLOAT64, `Bool Column` BOOL", + ), + ( + [ + bigquery.SchemaField("My Column", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"), + ], + "`My Column` ARRAY, `Float Column` ARRAY, `Bool Column` ARRAY", + ), + ( + [ + bigquery.SchemaField( + "My Column", + "RECORD", + mode="REPEATED", + fields=( + bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"), + bigquery.SchemaField( + "Nested Column", + "RECORD", + fields=(bigquery.SchemaField("Int Column", "INTEGER"),), + ), + ), + ), + ], + ( + "`My Column` ARRAY," + + " `Bool Column` ARRAY," + + " `Nested Column` STRUCT<`Int Column` INT64>>>" + ), + ), + ), +) +def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): + pass diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 8c8fbd6ab5..60dcc75b63 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -23,6 +23,7 @@ def test_columntransformer_init_expectedtransforms(): standard_scaler_transformer = preprocessing.StandardScaler() max_abs_scaler_transformer = preprocessing.MaxAbsScaler() min_max_scaler_transformer = preprocessing.MinMaxScaler() + k_bins_discretizer_transformer = preprocessing.KBinsDiscretizer(strategy="uniform") label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -42,6 +43,11 @@ def test_columntransformer_init_expectedtransforms(): min_max_scaler_transformer, ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + k_bins_discretizer_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("label", label_transformer, "species"), ] ) @@ -54,6 +60,8 @@ def test_columntransformer_init_expectedtransforms(): ("max_abs_scale", max_abs_scaler_transformer, "flipper_length_mm"), ("min_max_scale", min_max_scaler_transformer, "culmen_length_mm"), ("min_max_scale", min_max_scaler_transformer, "flipper_length_mm"), + ("k_bins_discretizer", k_bins_discretizer_transformer, "culmen_length_mm"), + ("k_bins_discretizer", k_bins_discretizer_transformer, "flipper_length_mm"), ("label", label_transformer, "species"), ] @@ -81,6 +89,11 @@ def test_columntransformer_repr(): preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -92,6 +105,9 @@ def test_columntransformer_repr(): ('max_abs_scale', MaxAbsScaler(), ['culmen_length_mm', 'flipper_length_mm']), ('min_max_scale', MinMaxScaler(), + ['culmen_length_mm', 'flipper_length_mm']), + ('k_bins_discretizer', + KBinsDiscretizer(strategy='uniform'), ['culmen_length_mm', 'flipper_length_mm'])])""" ) @@ -119,6 +135,11 @@ def test_columntransformer_repr_matches_sklearn(): preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) 
sk_column_transformer = sklearn_compose.ColumnTransformer( @@ -143,6 +164,11 @@ def test_columntransformer_repr_matches_sklearn(): sklearn_preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + sklearn_preprocessing.KBinsDiscretizer(strategy="uniform"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index a3338e762d..34a02edd42 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -95,6 +95,13 @@ def test_min_max_scaler_produces_correct_sql( assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" +def test_k_bins_discretizer_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a") + assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" + + def test_one_hot_encoder_produces_correct_sql( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index bb8ae570dc..3baff2e1f5 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -85,6 +85,70 @@ def test_ibis_float32_raises_unexpected_datatype(): bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32) +IBIS_ARROW_DTYPES = ( + (ibis_dtypes.boolean, pa.bool_()), + (ibis_dtypes.date, pa.date32()), + (ibis_dtypes.Timestamp(), pa.timestamp("us")), + (ibis_dtypes.float64, pa.float64()), + ( + ibis_dtypes.Timestamp(timezone="UTC"), + pa.timestamp("us", tz="UTC"), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ("name", ibis_dtypes.string()), + ("version", ibis_dtypes.int64()), + ] + ), + pa.struct( + [ + ("name", pa.string()), + ("version", pa.int64()), + ] + ), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ( + "nested", + ibis_dtypes.Struct.from_tuples( + [ + ("field", ibis_dtypes.string()), + ] + ), + ), + ] + ), + pa.struct( + [ + ( + "nested", + pa.struct( + [ + ("field", pa.string()), + ] + ), + ), + ] + ), + ), +) + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype) + assert result == ibis_dtype + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype) + assert result == arrow_dtype + + @pytest.mark.parametrize( ["bigframes_dtype", "ibis_dtype"], [ diff --git a/third_party/bigframes_vendored/pandas/core/arrays/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000..8e3ea06a3d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,94 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arrays/arrow/accessors.py +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from bigframes import constants + + +class 
StructAccessor: + """ + Accessor object for structured data properties of the Series values. + """ + + def field(self, name_or_index: str | int): + """ + Extract a child field of a struct as a Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: Int64 + + Args: + name_or_index: + Name (str) or index (int) of the child field to extract. + + Returns: + Series: + The data corresponding to the selected child field. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def explode(self): + """ + Extract all child fields of a struct as a DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract all child fields. + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + + [3 rows x 2 columns] + + Returns: + DataFrame: + The data corresponding to all child fields. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6ce11cd7e9..17d941fbdd 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -503,6 +503,35 @@ def drop( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def align( + self, + other, + join="outer", + axis=None, + ) -> tuple: + """ + Align two objects on their axes with the specified join method. + + Join method is specified for each axis Index. + + Args: + other (DataFrame or Series): + join ({{'outer', 'inner', 'left', 'right'}}, default 'outer'): + Type of alignment to be performed. + left: use only keys from left frame, preserve key order. + right: use only keys from right frame, preserve key order. + outer: use union of keys from both frames, sort keys lexicographically. + inner: use intersection of keys from both frames, + preserve the order of the left keys. + + axis (allowed axis of the other object, default None): + Align on index (0), columns (1), or both (None). + + Returns: + tuple of (DataFrame, type of other): Aligned objects. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rename( self, *, @@ -1265,6 +1294,39 @@ def combine_first(self, other) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def update( + self, other, join: str = "left", overwrite: bool = True, filter_func=None + ) -> DataFrame: + """ + Modify in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. 
+ + Args: + other (DataFrame, or object coercible into a DataFrame): + Should have at least one matching index/column label + with the original DataFrame. If a Series is passed, + its name attribute must be set, and that will be + used as the column name to align with the original DataFrame. + join ({'left'}, default 'left'): + Only left join is implemented, keeping the index and columns of the + original object. + overwrite (bool, default True): + How to handle non-NA values for overlapping keys: + True: overwrite original DataFrame's values + with values from `other`. + False: only update values that are NA in + the original DataFrame. + + filter_func (callable(1d-array) -> bool 1d-array, optional): + Can choose to replace values other than NA. Return True for values + that should be updated. + + Returns: + None: This method directly changes calling object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Data reshaping @@ -1406,14 +1468,14 @@ def merge( ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - on: - Column join on. It must be found in both DataFrames. Either on or left_on + right_on + on (label or list of labels): + Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on must be passed in. - left_on: - Column join on in the left DataFrame. Either on or left_on + right_on + left_on (label or list of labels): + Columns to join on in the left DataFrame. Either on or left_on + right_on must be passed in. - right_on: - Column join on in the right DataFrame. Either on or left_on + right_on + right_on (label or list of labels): + Columns to join on in the right DataFrame. Either on or left_on + right_on must be passed in. sort: Default False. Sort the join keys lexicographically in the @@ -1743,6 +1805,28 @@ def nsmallest(self, n: int, columns, keep: str = "first"): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def idxmin(self): + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Returns: + Series: Indexes of minima along the specified axis. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def idxmax(self): + """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Returns: + Series: Indexes of maxima along the specified axis. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. @@ -1910,6 +1994,21 @@ def stack(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unstack(self): + """ + Pivot a level of the (necessarily hierarchical) index labels. + + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). 
+ + Returns: + DataFrame or Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Add index and columns diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index ee02d698da..cc81de405b 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -51,14 +51,14 @@ def merge( ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - on: - Column join on. It must be found in both DataFrames. Either on or left_on + right_on + on (label or list of labels): + Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on must be passed in. - left_on: - Column join on in the left DataFrame. Either on or left_on + right_on + left_on (label or list of labels): + Columns to join on in the left DataFrame. Either on or left_on + right_on must be passed in. - right_on: - Column join on in the right DataFrame. Either on or left_on + right_on + right_on (label or list of labels): + Columns to join on in the right DataFrame. Either on or left_on + right_on must be passed in. sort: Default False. Sort the join keys lexicographically in the diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index d58c1ccc3b..a41a3454ca 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -8,7 +8,6 @@ import numpy as np from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer -import pandas.io.formats.format as fmt from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame @@ -151,21 +150,6 @@ def to_string( str or None: String representation of Series if ``buf=None``, otherwise None. """ - formatter = fmt.SeriesFormatter( - self, - name=name, - length=length, - header=header, - index=index, - dtype=dtype, - na_rep=na_rep, - float_format=float_format, - min_rows=min_rows, - max_rows=max_rows, - ) - result = formatter.to_string() - - # catch contract violations raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_markdown( @@ -475,6 +459,30 @@ def duplicated(self, keep="first") -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def idxmin(self) -> Hashable: + """ + Return the row label of the minimum value. + + If multiple values equal the minimum, the first row label with that + value is returned. + + Returns: + Index: Label of the minimum value. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def idxmax(self) -> Hashable: + """ + Return the row label of the maximum value. + + If multiple values equal the maximum, the first row label with that + value is returned. + + Returns: + Index: Label of the maximum value. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def round(self, decimals: int = 0) -> Series: """ Round each value in a Series to the given number of decimals. 
diff --git a/third_party/bigframes_vendored/sklearn/__init__.py b/third_party/bigframes_vendored/sklearn/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/__init__.py b/third_party/bigframes_vendored/sklearn/ensemble/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py
new file mode 100644
index 0000000000..0236558dd4
--- /dev/null
+++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py
@@ -0,0 +1,47 @@
+# Author: Henry Lin
+#         Tom Dupré la Tour
+
+# License: BSD
+
+from bigframes import constants
+from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin
+
+
+class KBinsDiscretizer(TransformerMixin, BaseEstimator):
+    """
+    Bin continuous data into intervals.
+
+    Args:
+        n_bins (int, default 5):
+            The number of bins to produce. Raises ValueError if ``n_bins < 2``.
+        strategy ({'uniform', 'quantile'}, default='quantile'):
+            Strategy used to define the widths of the bins. 'uniform': All bins
+            in each feature have identical widths. 'quantile': All bins in each
+            feature have the same number of points. Only 'uniform' is currently supported.
+    """
+
+    def fit(self, X, y=None):
+        """Fit the estimator.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series with training data.
+
+            y (default None):
+                Ignored.
+
+        Returns:
+            KBinsDiscretizer: Fitted discretizer.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def transform(self, X):
+        """Discretize the data.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series to be transformed.
+
+        Returns:
+            bigframes.dataframe.DataFrame: Transformed result."""
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/xgboost/__init__.py b/third_party/bigframes_vendored/xgboost/__init__.py
new file mode 100644
index 0000000000..e69de29bb2