diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index a3da1b0d4c..a9bdb1b7ac 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -13,5 +13,5 @@
# limitations under the License.
docker:
image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
- digest: sha256:3e3800bb100af5d7f9e810d48212b37812c1856d20ffeafb99ebe66461b61fc7
-# created: 2023-08-02T10:53:29.114535628Z
+ digest: sha256:fac304457974bb530cc5396abd4ab25d26a469cd3bc97cbfb18c8d4324c584eb
+# created: 2023-10-02T21:31:03.517640371Z
diff --git a/.gitignore b/.gitignore
index b4243ced74..d083ea1ddc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,7 @@ docs.metadata
# Virtual environment
env/
+venv/
# Test logs
coverage.xml
diff --git a/.kokoro/build.sh b/.kokoro/build.sh
index f91c541c6c..58eaa7fedf 100755
--- a/.kokoro/build.sh
+++ b/.kokoro/build.sh
@@ -15,11 +15,7 @@
set -eo pipefail
-if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then
- PROJECT_SCM="github/python-bigquery-dataframes"
-else
- PROJECT_SCM="git/bigframes"
-fi
+PROJECT_SCM="github/python-bigquery-dataframes"
if [[ -z "${PROJECT_ROOT:-}" ]]; then
PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}"
@@ -30,6 +26,9 @@ cd "${PROJECT_ROOT}"
# Disable buffering, so that the logs stream through.
export PYTHONUNBUFFERED=1
+# Workaround https://github.com/pytest-dev/pytest/issues/9567
+export PY_IGNORE_IMPORTMISMATCH=1
+
# Debug: show build environment
env | grep KOKORO
diff --git a/.kokoro/continuous/common.cfg b/.kokoro/continuous/common.cfg
index 5d40578ac7..97e0651aa9 100644
--- a/.kokoro/continuous/common.cfg
+++ b/.kokoro/continuous/common.cfg
@@ -7,4 +7,4 @@ action {
}
}
-build_file: "bigframes/.kokoro/build.sh"
+build_file: "python-bigquery-dataframes/.kokoro/build.sh"
diff --git a/.kokoro/continuous/nightly.cfg b/.kokoro/continuous/nightly.cfg
index 63c3f51d05..2b7111664f 100644
--- a/.kokoro/continuous/nightly.cfg
+++ b/.kokoro/continuous/nightly.cfg
@@ -1,3 +1,3 @@
# Format: //devtools/kokoro/config/proto/build.proto
-build_file: "bigframes/.kokoro/release-nightly.sh"
+build_file: "python-bigquery-dataframes/.kokoro/release-nightly.sh"
diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt
index 029bd342de..96d593c8c8 100644
--- a/.kokoro/requirements.txt
+++ b/.kokoro/requirements.txt
@@ -113,30 +113,30 @@ commonmark==0.9.1 \
--hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \
--hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9
# via rich
-cryptography==41.0.3 \
- --hash=sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306 \
- --hash=sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84 \
- --hash=sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47 \
- --hash=sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d \
- --hash=sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116 \
- --hash=sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207 \
- --hash=sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81 \
- --hash=sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087 \
- --hash=sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd \
- --hash=sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507 \
- --hash=sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858 \
- --hash=sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae \
- --hash=sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34 \
- --hash=sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906 \
- --hash=sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd \
- --hash=sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922 \
- --hash=sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7 \
- --hash=sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4 \
- --hash=sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574 \
- --hash=sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1 \
- --hash=sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c \
- --hash=sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e \
- --hash=sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de
+cryptography==41.0.4 \
+ --hash=sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67 \
+ --hash=sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311 \
+ --hash=sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8 \
+ --hash=sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13 \
+ --hash=sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143 \
+ --hash=sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f \
+ --hash=sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829 \
+ --hash=sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd \
+ --hash=sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397 \
+ --hash=sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac \
+ --hash=sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d \
+ --hash=sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a \
+ --hash=sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839 \
+ --hash=sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e \
+ --hash=sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6 \
+ --hash=sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9 \
+ --hash=sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860 \
+ --hash=sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca \
+ --hash=sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91 \
+ --hash=sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d \
+ --hash=sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714 \
+ --hash=sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb \
+ --hash=sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f
# via
# gcp-releasetool
# secretstorage
@@ -382,6 +382,7 @@ protobuf==3.20.3 \
# gcp-docuploader
# gcp-releasetool
# google-api-core
+ # googleapis-common-protos
pyasn1==0.4.8 \
--hash=sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d \
--hash=sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e4b2bff3c7..880f791625 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,25 @@
[1]: https://pypi.org/project/bigframes/#history
+## [0.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.5.0...v0.6.0) (2023-10-04)
+
+
+### Features
+
+* Add df.unstack ([#63](https://github.com/googleapis/python-bigquery-dataframes/issues/63)) ([4a84714](https://github.com/googleapis/python-bigquery-dataframes/commit/4a84714e2fb07f70c70c79f8b8da9fcb41096e33))
+* Add idxmin, idxmax to series, dataframe ([#74](https://github.com/googleapis/python-bigquery-dataframes/issues/74)) ([781307e](https://github.com/googleapis/python-bigquery-dataframes/commit/781307ec22d31a7657f8ee5c6eedc0e419450ccd))
+* Add ml.preprocessing.KBinsDiscretizer ([#81](https://github.com/googleapis/python-bigquery-dataframes/issues/81)) ([24c6256](https://github.com/googleapis/python-bigquery-dataframes/commit/24c625638984f6a84191c7a4c8ac9fb6c3cf1dca))
+* Add multi-column dataframe merge ([#73](https://github.com/googleapis/python-bigquery-dataframes/issues/73)) ([c9fa85c](https://github.com/googleapis/python-bigquery-dataframes/commit/c9fa85cc338be5e9a8dde59b255690aedbbc1127))
+* Add update and align methods to dataframe ([#57](https://github.com/googleapis/python-bigquery-dataframes/issues/57)) ([bf050cf](https://github.com/googleapis/python-bigquery-dataframes/commit/bf050cf475ad8a9e3e0ca3f896ddaf96dbe13ae3))
+* Support STRUCT data type with `Series.struct.field` to extract child fields ([#71](https://github.com/googleapis/python-bigquery-dataframes/issues/71)) ([17afac9](https://github.com/googleapis/python-bigquery-dataframes/commit/17afac9ff70a2b93ed70dc7bcce7beb9a53c2ece))
+
+
+### Bug Fixes
+
+* Avoid `403 response too large to return` error with `read_gbq` and large query results ([#77](https://github.com/googleapis/python-bigquery-dataframes/issues/77)) ([8f3b5b2](https://github.com/googleapis/python-bigquery-dataframes/commit/8f3b5b240f0f28fef92465abc53504e875d7335a))
+* Change return type of `Series.loc[scalar]` ([#40](https://github.com/googleapis/python-bigquery-dataframes/issues/40)) ([fff3d45](https://github.com/googleapis/python-bigquery-dataframes/commit/fff3d45f03ffbc7bb23143a1572e3dd157463ca9))
+* Fix df/series.iloc by list with multiindex ([#79](https://github.com/googleapis/python-bigquery-dataframes/issues/79)) ([971d091](https://github.com/googleapis/python-bigquery-dataframes/commit/971d091cac9ad662145a3d43d8f9a785eb0ccc23))
+
## [0.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.4.0...v0.5.0) (2023-09-28)
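The release notes above surface several new public APIs. A minimal usage sketch of the headline features, assuming an existing BigQuery table; the table and column names ("project.dataset.table", "x", "y", "a", "b", "struct_col", "child_name") are hypothetical:

import bigframes.pandas as bpd

df = bpd.read_gbq("project.dataset.table")  # hypothetical table and columns

# idxmin / idxmax: index label of the min / max value for each column.
max_labels = df[["x", "y"]].idxmax()

# unstack: with a single-level index this transposes the frame into a Series.
ser = df[["x", "y"]].unstack()

# Multi-column merge keys are now accepted.
merged = df.merge(df, how="inner", on=["a", "b"], suffixes=("_l", "_r"))

# STRUCT columns: extract a child field as its own Series.
child = df["struct_col"].struct.field("child_name")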
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index c529f83351..8008c1189a 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -963,10 +963,11 @@ def unpivot(
],
*,
passthrough_columns: typing.Sequence[str] = (),
- index_col_id: str = "index",
+ index_col_ids: typing.Sequence[str] = ["index"],
dtype: typing.Union[
bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
] = pandas.Float64Dtype(),
+ how="left",
) -> ArrayValue:
"""
Unpivot ArrayValue columns.
@@ -981,8 +982,11 @@ def unpivot(
Returns:
ArrayValue: The unpivoted ArrayValue
"""
- table = self._to_ibis_expr(ordering_mode="offset_col")
+ if how not in ("left", "right"):
+ raise ValueError("'how' must be 'left' or 'right'")
+ table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
row_n = len(row_labels)
+ hidden_col_ids = self._hidden_ordering_column_names.keys()
if not all(
len(source_columns) == row_n for _, source_columns in unpivot_columns
):
@@ -992,33 +996,44 @@ def unpivot(
unpivot_table = table.cross_join(
ibis.memtable({unpivot_offset_id: range(row_n)})
)
- unpivot_offsets_value = (
- (
- (unpivot_table[ORDER_ID_COLUMN] * row_n)
- + unpivot_table[unpivot_offset_id]
- )
- .cast(ibis_dtypes.int64)
- .name(ORDER_ID_COLUMN),
- )
-
# Use ibis memtable to infer type of rowlabels (if possible)
# TODO: Allow caller to specify dtype
- labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
- labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)
- cases = [
- (
- i,
- bigframes.dtypes.literal_to_ibis_scalar(
- row_labels[i], force_dtype=labels_dtype # type:ignore
- ),
- )
- for i in range(len(row_labels))
+ if isinstance(row_labels[0], tuple):
+ labels_table = ibis.memtable(row_labels)
+ labels_ibis_types = [
+ labels_table[col].type() for col in labels_table.columns
+ ]
+ else:
+ labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()]
+ labels_dtypes = [
+ bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type)
+ for ibis_type in labels_ibis_types
]
- labels_value = (
- typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id])
- .cases(cases, default=None) # type:ignore
- .name(index_col_id)
- )
+
+ label_columns = []
+ for label_part, (col_id, label_dtype) in enumerate(
+ zip(index_col_ids, labels_dtypes)
+ ):
+            # Interpret labels as tuples even if they weren't originally,
+            # so the same logic can be applied to multi-column labels.
+ labels_as_tuples = [
+ label if isinstance(label, tuple) else (label,) for label in row_labels
+ ]
+ cases = [
+ (
+ i,
+ bigframes.dtypes.literal_to_ibis_scalar(
+ label_tuple[label_part], # type:ignore
+ force_dtype=label_dtype, # type:ignore
+ ),
+ )
+ for i, label_tuple in enumerate(labels_as_tuples)
+ ]
+ labels_value = (
+ typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id])
+ .cases(cases, default=None) # type:ignore
+ .name(col_id)
+ )
+ label_columns.append(labels_value)
unpivot_values = []
for j in range(len(unpivot_columns)):
@@ -1042,23 +1057,53 @@ def unpivot(
unpivot_values.append(unpivot_value.name(result_col))
unpivot_table = unpivot_table.select(
- passthrough_columns, labels_value, *unpivot_values, unpivot_offsets_value
+ passthrough_columns,
+ *label_columns,
+ *unpivot_values,
+ *hidden_col_ids,
+ unpivot_offset_id,
)
+ # Extend the original ordering using unpivot_offset_id
+ old_ordering = self._ordering
+ if how == "left":
+ new_ordering = ExpressionOrdering(
+ ordering_value_columns=[
+ *old_ordering.ordering_value_columns,
+ OrderingColumnReference(unpivot_offset_id),
+ ],
+ total_ordering_columns=frozenset(
+ [*old_ordering.total_ordering_columns, unpivot_offset_id]
+ ),
+ )
+ else: # how=="right"
+ new_ordering = ExpressionOrdering(
+ ordering_value_columns=[
+ OrderingColumnReference(unpivot_offset_id),
+ *old_ordering.ordering_value_columns,
+ ],
+ total_ordering_columns=frozenset(
+ [*old_ordering.total_ordering_columns, unpivot_offset_id]
+ ),
+ )
value_columns = [
unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns
]
passthrough_values = [unpivot_table[col] for col in passthrough_columns]
+ hidden_ordering_columns = [
+ unpivot_table[unpivot_offset_id],
+ *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids],
+ ]
return ArrayValue(
session=self._session,
table=unpivot_table,
- columns=[unpivot_table[index_col_id], *value_columns, *passthrough_values],
- hidden_ordering_columns=[unpivot_table[ORDER_ID_COLUMN]],
- ordering=ExpressionOrdering(
- ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
- integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
- total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
- ),
+ columns=[
+ *[unpivot_table[col_id] for col_id in index_col_ids],
+ *value_columns,
+ *passthrough_values,
+ ],
+ hidden_ordering_columns=hidden_ordering_columns,
+ ordering=new_ordering,
)
def assign(self, source_id: str, destination_id: str) -> ArrayValue:
@@ -1153,8 +1198,8 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue:
destination = self._session._ibis_to_session_table(
ibis_expr, cluster_cols=cluster_cols, api_name="cache"
)
- table_expression = self._session.ibis_client.sql(
- f"SELECT * FROM `_SESSION`.`{destination.table_id}`"
+ table_expression = self._session.ibis_client.table(
+ f"{destination.project}.{destination.dataset_id}.{destination.table_id}"
)
new_columns = [table_expression[column] for column in self.column_names]
new_hidden_columns = [
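The reworked `unpivot` accepts a list of `index_col_ids` (so tuple row labels can span several label columns) and a `how` argument that controls whether the new label offsets are appended after ("left") or placed before ("right") the existing ordering columns. Conceptually it is a wide-to-long reshape; a rough pandas analogue of the single-label case, with hypothetical column names:

import pandas as pd

wide = pd.DataFrame({"idx": [0, 1], "a": [1, 2], "b": [3, 4]})

# row_labels=["a", "b"], unpivot_columns=[("value", ["a", "b"])] and
# index_col_ids=["index"] behave roughly like a melt of the value columns.
long = wide.melt(
    id_vars=["idx"],        # passthrough_columns
    value_vars=["a", "b"],  # the source columns for the single value column
    var_name="index",       # index_col_ids[0]
    value_name="value",     # the unpivoted result column
)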
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index d22112417c..30c7902981 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -17,6 +17,7 @@
import pandas as pd
+import bigframes.constants as constants
import bigframes.core as core
import bigframes.core.blocks as blocks
import bigframes.core.ordering as ordering
@@ -504,3 +505,125 @@ def _kurt_from_moments_and_count(
kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None)
)
return block, kurt_id
+
+
+def align(
+ left_block: blocks.Block,
+ right_block: blocks.Block,
+ join: str = "outer",
+ axis: typing.Union[str, int, None] = None,
+) -> typing.Tuple[blocks.Block, blocks.Block]:
+ axis_n = core.utils.get_axis_number(axis) if axis is not None else None
+ # Must align columns first as other way will likely create extra joins
+ if (axis_n is None) or axis_n == 1:
+ left_block, right_block = align_columns(left_block, right_block, join=join)
+ if (axis_n is None) or axis_n == 0:
+ left_block, right_block = align_rows(left_block, right_block, join=join)
+ return left_block, right_block
+
+
+def align_rows(
+ left_block: blocks.Block,
+ right_block: blocks.Block,
+ join: str = "outer",
+):
+ joined_index, (get_column_left, get_column_right) = left_block.index.join(
+ right_block.index, how=join
+ )
+ left_columns = [get_column_left(col) for col in left_block.value_columns]
+ right_columns = [get_column_right(col) for col in right_block.value_columns]
+
+ left_block = joined_index._block.select_columns(left_columns)
+ right_block = joined_index._block.select_columns(right_columns)
+ return left_block, right_block
+
+
+def align_columns(
+ left_block: blocks.Block,
+ right_block: blocks.Block,
+ join: str = "outer",
+):
+ columns, lcol_indexer, rcol_indexer = left_block.column_labels.join(
+ right_block.column_labels, how=join, return_indexers=True
+ )
+ column_indices = zip(
+ lcol_indexer if (lcol_indexer is not None) else range(len(columns)),
+ rcol_indexer if (rcol_indexer is not None) else range(len(columns)),
+ )
+ left_column_ids = []
+ right_column_ids = []
+
+ original_left_block = left_block
+ original_right_block = right_block
+
+ for left_index, right_index in column_indices:
+ if left_index >= 0:
+ left_col_id = original_left_block.value_columns[left_index]
+ else:
+ dtype = right_block.dtypes[right_index]
+ left_block, left_col_id = left_block.create_constant(
+ None, dtype=dtype, label=original_right_block.column_labels[right_index]
+ )
+ left_column_ids.append(left_col_id)
+
+ if right_index >= 0:
+ right_col_id = original_right_block.value_columns[right_index]
+ else:
+ dtype = original_left_block.dtypes[left_index]
+ right_block, right_col_id = right_block.create_constant(
+ None, dtype=dtype, label=left_block.column_labels[left_index]
+ )
+ right_column_ids.append(right_col_id)
+ left_final = left_block.select_columns(left_column_ids)
+ right_final = right_block.select_columns(right_column_ids)
+ return left_final, right_final
+
+
+def idxmin(block: blocks.Block) -> blocks.Block:
+ return _idx_extrema(block, "min")
+
+
+def idxmax(block: blocks.Block) -> blocks.Block:
+ return _idx_extrema(block, "max")
+
+
+def _idx_extrema(
+ block: blocks.Block, min_or_max: typing.Literal["min", "max"]
+) -> blocks.Block:
+ if len(block.index_columns) != 1:
+ # TODO: Need support for tuple dtype
+ raise NotImplementedError(
+            f"idxmin not supported for multi-index. {constants.FEEDBACK_LINK}"
+ )
+
+ original_block = block
+ result_cols = []
+ for value_col in original_block.value_columns:
+ direction = (
+ ordering.OrderingDirection.ASC
+ if min_or_max == "min"
+ else ordering.OrderingDirection.DESC
+ )
+        # Have to find the min/max independently for each value column.
+ order_refs = [
+ ordering.OrderingColumnReference(value_col, direction),
+ *[
+ ordering.OrderingColumnReference(idx_col)
+ for idx_col in original_block.index_columns
+ ],
+ ]
+ window_spec = core.WindowSpec(ordering=order_refs)
+ idx_col = original_block.index_columns[0]
+ block, result_col = block.apply_window_op(
+ idx_col, agg_ops.first_op, window_spec
+ )
+ result_cols.append(result_col)
+
+ block = block.select_columns(result_cols).with_column_labels(
+ original_block.column_labels
+ )
+ # Stack the entire column axis to produce single-column result
+ # Assumption: uniform dtype for stackability
+ return block.aggregate_all_and_stack(
+ agg_ops.AnyValueOp(), dtype=block.dtypes[0]
+ ).with_column_labels([original_block.index.name])
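`_idx_extrema` computes idxmin/idxmax with a window function: each value column is ordered by its own values (ascending for min, descending for max) with the index as a tie-breaker, the first index value is taken per column, and the per-column results are stacked into a single Series. A small pandas sketch of the same idea, assuming a single-level index as the code requires:

import pandas as pd

df = pd.DataFrame({"x": [3, 1, 2], "y": [9, 9, 1]}, index=["a", "b", "c"])

def idxmin_sketch(frame: pd.DataFrame) -> pd.Series:
    out = {}
    for col in frame.columns:
        # Order by (value, index label) and take the first index label,
        # mirroring the window ordering used above.
        ordered = frame[[col]].reset_index().sort_values([col, "index"])
        out[col] = ordered["index"].iloc[0]
    return pd.Series(out)

print(idxmin_sketch(df))  # x -> "b", y -> "c"
print(df.idxmin())        # matches pandas' built-in result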
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index b53c2212c1..0161d17361 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -38,6 +38,7 @@
import bigframes.core as core
import bigframes.core.guid as guid
import bigframes.core.indexes as indexes
+import bigframes.core.joins as joins
import bigframes.core.ordering as ordering
import bigframes.core.utils
import bigframes.core.utils as utils
@@ -838,7 +839,7 @@ def aggregate_all_and_stack(
]
result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot(
row_labels=self.column_labels.to_list(),
- index_col_id="index",
+ index_col_ids=["index"],
unpivot_columns=[(value_col_id, self.value_columns)],
dtype=dtype,
)
@@ -849,7 +850,7 @@ def aggregate_all_and_stack(
expr_with_offsets, offset_col = self.expr.promote_offsets()
stacked_expr = expr_with_offsets.unpivot(
row_labels=self.column_labels.to_list(),
- index_col_id=guid.generate_guid(),
+ index_col_ids=[guid.generate_guid()],
unpivot_columns=[(value_col_id, self.value_columns)],
passthrough_columns=[*self.index_columns, offset_col],
dtype=dtype,
@@ -1041,7 +1042,7 @@ def summarize(
expr = self.expr.aggregate(aggregations).unpivot(
labels,
unpivot_columns=columns,
- index_col_id=label_col_id,
+ index_col_ids=[label_col_id],
)
labels = self._get_labels_for_columns(column_ids)
return Block(expr, column_labels=labels, index_columns=[label_col_id])
@@ -1225,116 +1226,83 @@ def pivot(
return result_block.with_column_labels(column_index)
- def stack(self):
+ def stack(self, how="left", dropna=True, sort=True, levels: int = 1):
"""Unpivot last column axis level into row axis"""
- if isinstance(self.column_labels, pd.MultiIndex):
- return self._stack_multi()
- else:
- return self._stack_mono()
-
- def _stack_mono(self):
- if isinstance(self.column_labels, pd.MultiIndex):
- raise ValueError("Expected single level index")
-
# These are the values that will be turned into rows
- stack_values = self.column_labels.drop_duplicates().sort_values()
- # Get matching columns
- unpivot_columns: List[Tuple[str, List[str]]] = []
- dtypes: List[bigframes.dtypes.Dtype] = []
- col_id = guid.generate_guid("unpivot_")
- dtype = None
- input_columns: Sequence[Optional[str]] = []
- for uvalue in stack_values:
- matching_ids = self.label_to_col_id.get(uvalue, [])
- input_id = matching_ids[0] if len(matching_ids) > 0 else None
- if input_id:
- if dtype and dtype != self._column_type(input_id):
- raise NotImplementedError(
- "Cannot stack columns with non-matching dtypes."
- )
- else:
- dtype = self._column_type(input_id)
- input_columns.append(input_id)
- unpivot_columns.append((col_id, input_columns))
- if dtype:
- dtypes.append(dtype or pd.Float64Dtype())
+ col_labels, row_labels = utils.split_index(self.column_labels, levels=levels)
+ if dropna:
+ row_labels = row_labels.drop_duplicates()
+ if sort:
+ row_labels = row_labels.sort_values()
- added_index_column = col_id = guid.generate_guid()
- unpivot_expr = self._expr.unpivot(
- row_labels=stack_values,
- passthrough_columns=self.index_columns,
- unpivot_columns=unpivot_columns,
- index_col_id=added_index_column,
- dtype=dtypes,
- )
- block = Block(
- unpivot_expr,
- index_columns=[*self.index_columns, added_index_column],
- column_labels=[None],
- index_labels=[*self._index_labels, self.column_labels.names[-1]],
- )
- return block
+ row_label_tuples = utils.index_as_tuples(row_labels)
- def _stack_multi(self):
- if not isinstance(self.column_labels, pd.MultiIndex):
- raise ValueError("Expected multi-index")
-
- # These are the values that will be turned into rows
- stack_values = (
- self.column_labels.get_level_values(-1).drop_duplicates().sort_values()
- )
-
- result_col_labels = (
- self.column_labels.droplevel(-1)
- .drop_duplicates()
- .sort_values()
- .dropna(how="all")
- )
+ if col_labels is not None:
+ result_index = col_labels.drop_duplicates().sort_values().dropna(how="all")
+ result_col_labels = utils.index_as_tuples(result_index)
+ else:
+ result_index = pd.Index([None])
+ result_col_labels = list([()])
# Get matching columns
unpivot_columns: List[Tuple[str, List[str]]] = []
dtypes = []
for val in result_col_labels:
col_id = guid.generate_guid("unpivot_")
- dtype = None
- input_columns: Sequence[Optional[str]] = []
- for uvalue in stack_values:
- # Need to unpack if still a multi-index after dropping 1 level
- label_to_match = (
- (val, uvalue) if result_col_labels.nlevels == 1 else (*val, uvalue)
- )
- matching_ids = self.label_to_col_id.get(label_to_match, [])
- input_id = matching_ids[0] if len(matching_ids) > 0 else None
- if input_id:
- if dtype and dtype != self._column_type(input_id):
- raise NotImplementedError(
- "Cannot stack columns with non-matching dtypes."
- )
- else:
- dtype = self._column_type(input_id)
- input_columns.append(input_id)
- # Input column i is the first one that
+ input_columns, dtype = self._create_stack_column(val, row_label_tuples)
unpivot_columns.append((col_id, input_columns))
if dtype:
dtypes.append(dtype or pd.Float64Dtype())
- added_index_column = col_id = guid.generate_guid()
+ added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)]
unpivot_expr = self._expr.unpivot(
- row_labels=stack_values,
+ row_labels=row_label_tuples,
passthrough_columns=self.index_columns,
unpivot_columns=unpivot_columns,
- index_col_id=added_index_column,
+ index_col_ids=added_index_columns,
dtype=dtypes,
+ how=how,
)
+ new_index_level_names = self.column_labels.names[-levels:]
+ if how == "left":
+ index_columns = [*self.index_columns, *added_index_columns]
+ index_labels = [*self._index_labels, *new_index_level_names]
+ else:
+ index_columns = [*added_index_columns, *self.index_columns]
+ index_labels = [*new_index_level_names, *self._index_labels]
+
block = Block(
unpivot_expr,
- index_columns=[*self.index_columns, added_index_column],
- column_labels=result_col_labels,
- index_labels=[*self._index_labels, self.column_labels.names[-1]],
+ index_columns=index_columns,
+ column_labels=result_index,
+ index_labels=index_labels,
)
return block
+ def _create_stack_column(
+ self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
+ ):
+ dtype = None
+ input_columns: list[Optional[str]] = []
+ for uvalue in stack_labels:
+ label_to_match = (*col_label, *uvalue)
+ label_to_match = (
+ label_to_match[0] if len(label_to_match) == 1 else label_to_match
+ )
+ matching_ids = self.label_to_col_id.get(label_to_match, [])
+ input_id = matching_ids[0] if len(matching_ids) > 0 else None
+ if input_id:
+ if dtype and dtype != self._column_type(input_id):
+ raise NotImplementedError(
+ "Cannot stack columns with non-matching dtypes."
+ )
+ else:
+ dtype = self._column_type(input_id)
+ input_columns.append(input_id)
+        # input_columns[i] is the column id matching the i-th stack label,
+        # or None when that label has no matching column.
+ return input_columns, dtype or pd.Float64Dtype()
+
def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype:
col_offset = self.value_columns.index(col_id)
dtype = self.dtypes[col_offset]
@@ -1436,6 +1404,78 @@ def concat(
result_block = result_block.reset_index()
return result_block
+ def merge(
+ self,
+ other: Block,
+ how: typing.Literal[
+ "inner",
+ "left",
+ "outer",
+ "right",
+ ],
+ left_col_ids: typing.Sequence[str],
+ right_col_ids: typing.Sequence[str],
+ sort: bool,
+ suffixes: tuple[str, str] = ("_x", "_y"),
+ ) -> Block:
+ (
+ joined_expr,
+ coalesced_join_cols,
+ (get_column_left, get_column_right),
+ ) = joins.join_by_column(
+ self.expr,
+ left_col_ids,
+ other.expr,
+ right_col_ids,
+ how=how,
+ sort=sort,
+ )
+
+ # which join key parts should be coalesced
+ merge_join_key_mask = [
+ str(self.col_id_to_label[left_id]) == str(other.col_id_to_label[right_id])
+ for left_id, right_id in zip(left_col_ids, right_col_ids)
+ ]
+ labels_to_coalesce = [
+ self.col_id_to_label[col_id]
+ for i, col_id in enumerate(left_col_ids)
+ if merge_join_key_mask[i]
+ ]
+
+ def left_col_mapping(col_id: str) -> str:
+ if col_id in left_col_ids:
+ join_key_part = left_col_ids.index(col_id)
+ if merge_join_key_mask[join_key_part]:
+ return coalesced_join_cols[join_key_part]
+ return get_column_left(col_id)
+
+ def right_col_mapping(col_id: str) -> typing.Optional[str]:
+ if col_id in right_col_ids:
+ join_key_part = right_col_ids.index(col_id)
+ if merge_join_key_mask[join_key_part]:
+ return None
+ return get_column_right(col_id)
+
+ left_columns = [left_col_mapping(col_id) for col_id in self.value_columns]
+
+ right_columns = [
+ typing.cast(str, right_col_mapping(col_id))
+ for col_id in other.value_columns
+ if right_col_mapping(col_id)
+ ]
+
+ expr = joined_expr.select_columns([*left_columns, *right_columns])
+ labels = utils.merge_column_labels(
+ self.column_labels,
+ other.column_labels,
+ coalesce_labels=labels_to_coalesce,
+ suffixes=suffixes,
+ )
+
+ # Constructs default index
+ expr, offset_index_id = expr.promote_offsets()
+ return Block(expr, index_columns=[offset_index_id], column_labels=labels)
+
def _force_reproject(self) -> Block:
"""Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations."""
return Block(
diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py
index a538c80711..1a88b2abd6 100644
--- a/bigframes/core/indexers.py
+++ b/bigframes/core/indexers.py
@@ -15,7 +15,7 @@
from __future__ import annotations
import typing
-from typing import Tuple
+from typing import Tuple, Union
import ibis
import pandas as pd
@@ -29,20 +29,19 @@
import bigframes.series
if typing.TYPE_CHECKING:
- LocSingleKey = typing.Union[bigframes.series.Series, indexes.Index, slice]
+ LocSingleKey = Union[
+ bigframes.series.Series, indexes.Index, slice, bigframes.core.scalar.Scalar
+ ]
class LocSeriesIndexer:
def __init__(self, series: bigframes.series.Series):
self._series = series
- def __getitem__(self, key) -> bigframes.series.Series:
- """
- Only indexing by a boolean bigframes.series.Series or list of index entries is currently supported
- """
- return typing.cast(
- bigframes.series.Series, _loc_getitem_series_or_dataframe(self._series, key)
- )
+ def __getitem__(
+ self, key
+ ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]:
+ return _loc_getitem_series_or_dataframe(self._series, key)
def __setitem__(self, key, value) -> None:
# TODO(swast): support MultiIndex
@@ -84,7 +83,7 @@ def __init__(self, series: bigframes.series.Series):
def __getitem__(
self, key
- ) -> bigframes.core.scalar.Scalar | bigframes.series.Series:
+ ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]:
"""
Index series using integer offsets. Currently supports index by key type:
@@ -103,13 +102,17 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
self._dataframe = dataframe
@typing.overload
- def __getitem__(self, key: LocSingleKey) -> bigframes.dataframe.DataFrame:
+ def __getitem__(
+ self, key: LocSingleKey
+ ) -> Union[bigframes.dataframe.DataFrame, pd.Series]:
...
# Technically this is wrong since we can have duplicate column labels, but
# this is expected to be rare.
@typing.overload
- def __getitem__(self, key: Tuple[LocSingleKey, str]) -> bigframes.series.Series:
+ def __getitem__(
+ self, key: Tuple[LocSingleKey, str]
+ ) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]:
...
def __getitem__(self, key):
@@ -173,7 +176,7 @@ class ILocDataFrameIndexer:
def __init__(self, dataframe: bigframes.dataframe.DataFrame):
self._dataframe = dataframe
- def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series:
+ def __getitem__(self, key) -> Union[bigframes.dataframe.DataFrame, pd.Series]:
"""
Index dataframe using integer offsets. Currently supports index by key type:
@@ -188,21 +191,26 @@ def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series:
@typing.overload
def _loc_getitem_series_or_dataframe(
series_or_dataframe: bigframes.series.Series, key
-) -> bigframes.series.Series:
+) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]:
...
@typing.overload
def _loc_getitem_series_or_dataframe(
series_or_dataframe: bigframes.dataframe.DataFrame, key
-) -> bigframes.dataframe.DataFrame:
+) -> Union[bigframes.dataframe.DataFrame, pd.Series]:
...
def _loc_getitem_series_or_dataframe(
- series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series,
+ series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
key: LocSingleKey,
-) -> bigframes.dataframe.DataFrame | bigframes.series.Series:
+) -> Union[
+ bigframes.dataframe.DataFrame,
+ bigframes.series.Series,
+ pd.Series,
+ bigframes.core.scalar.Scalar,
+]:
if isinstance(key, bigframes.series.Series) and key.dtype == "boolean":
return series_or_dataframe[key]
elif isinstance(key, bigframes.series.Series):
@@ -222,7 +230,7 @@ def _loc_getitem_series_or_dataframe(
# TODO(henryjsolberg): support MultiIndex
if len(key) == 0: # type: ignore
return typing.cast(
- typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
+ Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
series_or_dataframe.iloc[0:0],
)
@@ -258,11 +266,22 @@ def _loc_getitem_series_or_dataframe(
)
keys_df = keys_df.set_index(index_name, drop=True)
keys_df.index.name = None
- return _perform_loc_list_join(series_or_dataframe, keys_df)
+ result = _perform_loc_list_join(series_or_dataframe, keys_df)
+ pandas_result = result.to_pandas()
+        # Although loc[scalar_key] can match multiple rows when scalar_key is
+        # not unique, we download the result here and, when the key is unique,
+        # return the single value (as a scalar or pandas Series), since unique
+        # index keys are expected to be the common case. loc[[scalar_key]] can
+        # still be used to retrieve a one-item DataFrame or Series.
+ if len(pandas_result) == 1:
+ return pandas_result.iloc[0]
+ # when the key is not unique, we return a bigframes data type
+ # as usual for methods that return dataframes/series
+ return result
else:
raise TypeError(
- "Invalid argument type. loc currently only supports indexing with a "
- "boolean bigframes Series, a list of index entries or a single index entry. "
+ "Invalid argument type. Expected bigframes.Series, bigframes.Index, "
+ "list, : (empty slice), or scalar. "
f"{constants.FEEDBACK_LINK}"
)
@@ -284,9 +303,9 @@ def _perform_loc_list_join(
def _perform_loc_list_join(
- series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series,
+ series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
keys_df: bigframes.dataframe.DataFrame,
-) -> bigframes.series.Series | bigframes.dataframe.DataFrame:
+) -> Union[bigframes.series.Series, bigframes.dataframe.DataFrame]:
# right join based on the old index so that the matching rows from the user's
# original dataframe will be duplicated and reordered appropriately
original_index_names = series_or_dataframe.index.names
@@ -309,20 +328,26 @@ def _perform_loc_list_join(
@typing.overload
def _iloc_getitem_series_or_dataframe(
series_or_dataframe: bigframes.series.Series, key
-) -> bigframes.series.Series | bigframes.core.scalar.Scalar:
+) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]:
...
@typing.overload
def _iloc_getitem_series_or_dataframe(
series_or_dataframe: bigframes.dataframe.DataFrame, key
-) -> bigframes.dataframe.DataFrame | pd.Series:
+) -> Union[bigframes.dataframe.DataFrame, pd.Series]:
...
def _iloc_getitem_series_or_dataframe(
- series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, key
-) -> bigframes.dataframe.DataFrame | bigframes.series.Series | bigframes.core.scalar.Scalar | pd.Series:
+ series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
+ key,
+) -> Union[
+ bigframes.dataframe.DataFrame,
+ bigframes.series.Series,
+ bigframes.core.scalar.Scalar,
+ pd.Series,
+]:
if isinstance(key, int):
internal_slice_result = series_or_dataframe._slice(key, key + 1, 1)
result_pd_df = internal_slice_result.to_pandas()
@@ -332,11 +357,9 @@ def _iloc_getitem_series_or_dataframe(
elif isinstance(key, slice):
return series_or_dataframe._slice(key.start, key.stop, key.step)
elif pd.api.types.is_list_like(key):
- # TODO(henryjsolberg): support MultiIndex
-
if len(key) == 0:
return typing.cast(
- typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
+ Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
series_or_dataframe.iloc[0:0],
)
df = series_or_dataframe
@@ -346,15 +369,18 @@ def _iloc_getitem_series_or_dataframe(
original_series_name if original_series_name is not None else "0"
)
df = series_or_dataframe.to_frame()
- original_index_name = df.index.name
- temporary_index_name = guid.generate_guid(prefix="temp_iloc_index_")
- df = df.rename_axis(temporary_index_name)
+ original_index_names = df.index.names
+ temporary_index_names = [
+ guid.generate_guid(prefix="temp_iloc_index_")
+ for _ in range(len(df.index.names))
+ ]
+ df = df.rename_axis(temporary_index_names)
# set to offset index and use regular loc, then restore index
df = df.reset_index(drop=False)
result = df.loc[key]
- result = result.set_index(temporary_index_name)
- result = result.rename_axis(original_index_name)
+ result = result.set_index(temporary_index_names)
+ result = result.rename_axis(original_index_names)
if isinstance(series_or_dataframe, bigframes.series.Series):
result = result[series_name]
diff --git a/bigframes/core/io.py b/bigframes/core/io.py
index 3c2e5a25f5..d47efbdddc 100644
--- a/bigframes/core/io.py
+++ b/bigframes/core/io.py
@@ -16,7 +16,8 @@
import datetime
import textwrap
-from typing import Dict, Union
+import types
+from typing import Dict, Iterable, Union
import google.cloud.bigquery as bigquery
@@ -89,6 +90,48 @@ def create_snapshot_sql(
)
+# BigQuery REST API returns types in Legacy SQL format
+# https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL
+# names
+# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+BQ_STANDARD_TYPES = types.MappingProxyType(
+ {
+ "BOOLEAN": "BOOL",
+ "INTEGER": "INT64",
+ "FLOAT": "FLOAT64",
+ }
+)
+
+
+def bq_field_to_type_sql(field: bigquery.SchemaField):
+ if field.mode == "REPEATED":
+ nested_type = bq_field_to_type_sql(
+ bigquery.SchemaField(
+ field.name, field.field_type, mode="NULLABLE", fields=field.fields
+ )
+ )
+ return f"ARRAY<{nested_type}>"
+
+ if field.field_type == "RECORD":
+ nested_fields_sql = ", ".join(
+ bq_field_to_sql(child_field) for child_field in field.fields
+ )
+ return f"STRUCT<{nested_fields_sql}>"
+
+ type_ = field.field_type
+ return BQ_STANDARD_TYPES.get(type_, type_)
+
+
+def bq_field_to_sql(field: bigquery.SchemaField):
+ name = field.name
+ type_ = bq_field_to_type_sql(field)
+ return f"`{name}` {type_}"
+
+
+def bq_schema_to_sql(schema: Iterable[bigquery.SchemaField]):
+ return ", ".join(bq_field_to_sql(field) for field in schema)
+
+
def format_option(key: str, value: Union[bool, str]) -> str:
if isinstance(value, bool):
return f"{key}=true" if value else f"{key}=false"
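The new `bq_field_to_sql`/`bq_schema_to_sql` helpers render a schema returned by the BigQuery REST API (legacy type names) as a Standard SQL column list, recursing through REPEATED and RECORD fields. A small example of the expected rendering, with an illustrative schema:

from google.cloud import bigquery

from bigframes.core.io import bq_schema_to_sql

schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("scores", "FLOAT", mode="REPEATED"),
    bigquery.SchemaField(
        "address",
        "RECORD",
        fields=[
            bigquery.SchemaField("city", "STRING"),
            bigquery.SchemaField("zip", "INTEGER"),
        ],
    ),
]

print(bq_schema_to_sql(schema))
# `name` STRING, `scores` ARRAY<FLOAT64>, `address` STRUCT<`city` STRING, `zip` INT64>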
diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py
index 8a9825cf0b..2d616fc3f0 100644
--- a/bigframes/core/joins/single_column.py
+++ b/bigframes/core/joins/single_column.py
@@ -44,7 +44,6 @@ def join_by_column(
"right",
],
sort: bool = False,
- coalesce_join_keys: bool = True,
allow_row_identity_join: bool = True,
) -> Tuple[
core.ArrayValue,
@@ -59,8 +58,6 @@ def join_by_column(
right: Expression for right table to join.
right_column_ids: Column IDs (not label) to join by.
how: The type of join to perform.
- coalesce_join_keys: if set to False, returned column ids will contain
- both left and right join key columns.
allow_row_identity_join (bool):
If True, allow matching by row identity. Set to False to always
perform a true JOIN in generated SQL.
@@ -71,8 +68,6 @@ def join_by_column(
* Sequence[str]: Column IDs of the coalesced join columns. Sometimes either the
left/right table will have missing rows. This column pulls the
non-NULL value from either left/right.
- If coalesce_join_keys is False, will return uncombined left and
- right key columns.
* Tuple[Callable, Callable]: For a given column ID from left or right,
respectively, return the new column id from the combined expression.
"""
@@ -100,9 +95,7 @@ def join_by_column(
right_join_keys = [
combined_expr.get_column(get_column_right(col)) for col in right_column_ids
]
- join_key_cols = get_join_cols(
- left_join_keys, right_join_keys, how, coalesce_join_keys
- )
+ join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
join_key_ids = [col.get_name() for col in join_key_cols]
combined_expr = combined_expr.projection(
[*join_key_cols, *combined_expr.columns]
@@ -182,9 +175,7 @@ def get_column_right(col_id):
right_join_keys = [
combined_table[get_column_right(col)] for col in right_column_ids
]
- join_key_cols = get_join_cols(
- left_join_keys, right_join_keys, how, coalesce_join_keys
- )
+ join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
# We could filter out the original join columns, but predicates/ordering
# might still reference them in implicit joins.
columns = (
@@ -226,46 +217,35 @@ def get_column_right(col_id):
)
-def get_join_cols(
+def get_coalesced_join_cols(
left_join_cols: typing.Iterable[ibis_types.Value],
right_join_cols: typing.Iterable[ibis_types.Value],
how: str,
- coalesce_join_keys: bool = True,
) -> typing.List[ibis_types.Value]:
join_key_cols: list[ibis_types.Value] = []
for left_col, right_col in zip(left_join_cols, right_join_cols):
- if not coalesce_join_keys:
+ if how == "left" or how == "inner":
join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_")))
+ elif how == "right":
join_key_cols.append(right_col.name(guid.generate_guid(prefix="index_")))
- else:
- if how == "left" or how == "inner":
+ elif how == "outer":
+ # The left index and the right index might contain null values, for
+ # example due to an outer join with different numbers of rows. Coalesce
+ # these to take the index value from either column.
+ # Use a random name in case the left index and the right index have the
+ # same name. In such a case, _x and _y suffixes will already be used.
+ # Don't need to coalesce if they are exactly the same column.
+ if left_col.name("index").equals(right_col.name("index")):
join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_")))
- elif how == "right":
- join_key_cols.append(
- right_col.name(guid.generate_guid(prefix="index_"))
- )
- elif how == "outer":
- # The left index and the right index might contain null values, for
- # example due to an outer join with different numbers of rows. Coalesce
- # these to take the index value from either column.
- # Use a random name in case the left index and the right index have the
- # same name. In such a case, _x and _y suffixes will already be used.
- # Don't need to coalesce if they are exactly the same column.
- if left_col.name("index").equals(right_col.name("index")):
- join_key_cols.append(
- left_col.name(guid.generate_guid(prefix="index_"))
- )
- else:
- join_key_cols.append(
- ibis.coalesce(
- left_col,
- right_col,
- ).name(guid.generate_guid(prefix="index_"))
- )
else:
- raise ValueError(
- f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}"
+ join_key_cols.append(
+ ibis.coalesce(
+ left_col,
+ right_col,
+ ).name(guid.generate_guid(prefix="index_"))
)
+ else:
+ raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}")
return join_key_cols
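With `coalesce_join_keys` removed, the key column kept for each join is determined solely by the join type: the left key for left/inner, the right key for right, and a COALESCE of both for outer (skipped when both sides reference the identical column). A condensed sketch of that decision, with plain values standing in for the ibis columns:

def pick_join_key(left_key, right_key, how: str):
    # Mirrors get_coalesced_join_cols: which side supplies the kept key value.
    if how in ("left", "inner"):
        return left_key
    if how == "right":
        return right_key
    if how == "outer":
        # Either side may be NULL after an outer join; take whichever is present.
        return left_key if left_key is not None else right_key
    raise ValueError(f"Unexpected join type: {how}")

assert pick_join_key("L", None, "left") == "L"
assert pick_join_key(None, "R", "outer") == "R"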
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
index 75175690ce..dc7c709011 100644
--- a/bigframes/core/utils.py
+++ b/bigframes/core/utils.py
@@ -49,6 +49,26 @@ def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex:
return multi_index
+def index_as_tuples(index: pd.Index) -> typing.Sequence[typing.Tuple]:
+ if isinstance(index, pd.MultiIndex):
+ return [label for label in index]
+ else:
+ return [(label,) for label in index]
+
+
+def split_index(
+ index: pd.Index, levels: int = 1
+) -> typing.Tuple[typing.Optional[pd.Index], pd.Index]:
+ nlevels = index.nlevels
+ remaining = nlevels - levels
+ if remaining > 0:
+ return index.droplevel(list(range(remaining, nlevels))), index.droplevel(
+ list(range(0, remaining))
+ )
+ else:
+ return (None, index)
+
+
def get_standardized_ids(
col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = ()
) -> tuple[list[str], list[str]]:
@@ -84,3 +104,36 @@ def get_standardized_ids(
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
return col_ids, idx_ids
+
+
+def merge_column_labels(
+ left_labels: pd.Index,
+ right_labels: pd.Index,
+ coalesce_labels: typing.Sequence,
+ suffixes: tuple[str, str] = ("_x", "_y"),
+) -> pd.Index:
+ result_labels = []
+
+ for col_label in left_labels:
+ if col_label in right_labels:
+ if col_label in coalesce_labels:
+                # Merging on the same label returns a single key column,
+                # coalesced from both sides; keep the left label.
+ result_labels.append(col_label)
+ else:
+ result_labels.append(str(col_label) + suffixes[0])
+ else:
+ result_labels.append(col_label)
+
+ for col_label in right_labels:
+ if col_label in left_labels:
+ if col_label in coalesce_labels:
+                # Merging on the same label returns a single key column,
+                # coalesced from both sides; the right label is skipped.
+ pass
+ else:
+ result_labels.append(str(col_label) + suffixes[1])
+ else:
+ result_labels.append(col_label)
+
+ return pd.Index(result_labels)
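`merge_column_labels` reproduces pandas' merge suffix rules: a label present on both sides gets the `_x`/`_y` suffixes unless it is a coalesced join key, in which case it appears exactly once. For example, with illustrative labels:

import pandas as pd

from bigframes.core.utils import merge_column_labels

left = pd.Index(["key", "value"])
right = pd.Index(["key", "value", "extra"])

# "key" is a coalesced join key, so it appears once; "value" collides and is suffixed.
print(list(merge_column_labels(left, right, coalesce_labels=["key"])))
# ['key', 'value_x', 'value_y', 'extra']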
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 113355589b..eea8beb130 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -46,7 +46,6 @@
import bigframes.core.indexers as indexers
import bigframes.core.indexes as indexes
import bigframes.core.io
-import bigframes.core.joins as joins
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.core.window
@@ -161,7 +160,15 @@ def __init__(
columns=columns, # type:ignore
dtype=dtype, # type:ignore
)
- if pd_dataframe.size < MAX_INLINE_DF_SIZE:
+ if (
+ pd_dataframe.size < MAX_INLINE_DF_SIZE
+ # TODO(swast): Workaround data types limitation in inline data.
+ and not any(
+ dt.pyarrow_dtype
+ for dt in pd_dataframe.dtypes
+ if isinstance(dt, pandas.ArrowDtype)
+ )
+ ):
self._block = blocks.block_from_local(
pd_dataframe, session or bigframes.pandas.get_global_session()
)
@@ -745,6 +752,55 @@ def rpow(
__rpow__ = rpow
+ def align(
+ self,
+ other: typing.Union[DataFrame, bigframes.series.Series],
+ join: str = "outer",
+ axis: typing.Union[str, int, None] = None,
+ ) -> typing.Tuple[
+ typing.Union[DataFrame, bigframes.series.Series],
+ typing.Union[DataFrame, bigframes.series.Series],
+ ]:
+ axis_n = utils.get_axis_number(axis) if axis else None
+ if axis_n == 1 and isinstance(other, bigframes.series.Series):
+ raise NotImplementedError(
+ f"align with series and axis=1 not supported. {constants.FEEDBACK_LINK}"
+ )
+ left_block, right_block = block_ops.align(
+ self._block, other._block, join=join, axis=axis
+ )
+ return DataFrame(left_block), other.__class__(right_block)
+
+ def update(self, other, join: str = "left", overwrite=True, filter_func=None):
+ other = other if isinstance(other, DataFrame) else DataFrame(other)
+ if join != "left":
+ raise ValueError("Only 'left' join supported for update")
+
+ if filter_func is not None: # Will always take other if possible
+
+ def update_func(
+ left: bigframes.series.Series, right: bigframes.series.Series
+ ) -> bigframes.series.Series:
+ return left.mask(right.notna() & filter_func(left), right)
+
+ elif overwrite:
+
+ def update_func(
+ left: bigframes.series.Series, right: bigframes.series.Series
+ ) -> bigframes.series.Series:
+ return left.mask(right.notna(), right)
+
+ else:
+
+ def update_func(
+ left: bigframes.series.Series, right: bigframes.series.Series
+ ) -> bigframes.series.Series:
+ return left.mask(left.isna(), right)
+
+ result = self.combine(other, update_func, how=join)
+
+ self._set_block(result._block)
+
def combine(
self,
other: DataFrame,
@@ -753,56 +809,31 @@ def combine(
],
fill_value=None,
overwrite: bool = True,
+ *,
+ how: str = "outer",
) -> DataFrame:
- # Join rows
- joined_index, (get_column_left, get_column_right) = self._block.index.join(
- other._block.index, how="outer"
- )
- columns, lcol_indexer, rcol_indexer = self.columns.join(
- other.columns, how="outer", return_indexers=True
- )
+ l_aligned, r_aligned = block_ops.align(self._block, other._block, join=how)
- column_indices = zip(
- lcol_indexer if (lcol_indexer is not None) else range(len(columns)),
- rcol_indexer if (lcol_indexer is not None) else range(len(columns)),
+ other_missing_labels = self._block.column_labels.difference(
+ other._block.column_labels
)
- block = joined_index._block
+ l_frame = DataFrame(l_aligned)
+ r_frame = DataFrame(r_aligned)
results = []
- for left_index, right_index in column_indices:
- if left_index >= 0 and right_index >= 0: # -1 indices indicate missing
- left_col_id = get_column_left(self._block.value_columns[left_index])
- right_col_id = get_column_right(other._block.value_columns[right_index])
- left_series = bigframes.series.Series(block.select_column(left_col_id))
- right_series = bigframes.series.Series(
- block.select_column(right_col_id)
- )
+ for (label, lseries), (_, rseries) in zip(l_frame.items(), r_frame.items()):
+ if not ((label in other_missing_labels) and not overwrite):
if fill_value is not None:
- left_series = left_series.fillna(fill_value)
- right_series = right_series.fillna(fill_value)
- results.append(func(left_series, right_series))
- elif left_index >= 0:
- # Does not exist in other
- if overwrite:
- dtype = self.dtypes[left_index]
- block, null_col_id = block.create_constant(None, dtype=dtype)
- result = bigframes.series.Series(block.select_column(null_col_id))
- results.append(result)
+ result = func(
+ lseries.fillna(fill_value), rseries.fillna(fill_value)
+ )
else:
- left_col_id = get_column_left(self._block.value_columns[left_index])
- result = bigframes.series.Series(block.select_column(left_col_id))
- if fill_value is not None:
- result = result.fillna(fill_value)
- results.append(result)
- elif right_index >= 0:
- right_col_id = get_column_right(other._block.value_columns[right_index])
- result = bigframes.series.Series(block.select_column(right_col_id))
- if fill_value is not None:
- result = result.fillna(fill_value)
- results.append(result)
+ result = func(lseries, rseries)
else:
- # Should not be possible
- raise ValueError("No right or left index.")
+ result = (
+ lseries.fillna(fill_value) if fill_value is not None else lseries
+ )
+ results.append(result)
if all([isinstance(val, bigframes.series.Series) for val in results]):
import bigframes.core.reshape as rs
@@ -1611,6 +1642,12 @@ def agg(
aggregate = agg
+ def idxmin(self) -> bigframes.series.Series:
+ return bigframes.series.Series(block_ops.idxmin(self._block))
+
+ def idxmax(self) -> bigframes.series.Series:
+ return bigframes.series.Series(block_ops.idxmax(self._block))
+
def describe(self) -> DataFrame:
df_numeric = self._drop_non_numeric(keep_bool=False)
if len(df_numeric.columns) == 0:
@@ -1682,6 +1719,27 @@ def stack(self):
return bigframes.series.Series(result_block)
return DataFrame(result_block)
+ def unstack(self):
+ block = self._block
+ # Special case, unstack with mono-index transpose into a series
+ if self.index.nlevels == 1:
+ block = block.stack(
+ how="right", dropna=False, sort=False, levels=self.columns.nlevels
+ )
+ return bigframes.series.Series(block)
+
+ # Pivot by last level of index
+ index_ids = block.index_columns
+ block = block.reset_index(drop=False)
+ block = block.set_index(index_ids[:-1])
+
+ pivot_block = block.pivot(
+ columns=[index_ids[-1]],
+ values=self._block.value_columns,
+ values_in_index=True,
+ )
+ return DataFrame(pivot_block)
+
def _drop_non_numeric(self, keep_bool=True) -> DataFrame:
types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES)
if not keep_bool:
@@ -1734,12 +1792,10 @@ def merge(
] = "inner",
# TODO(garrettwu): Currently can take inner, outer, left and right. To support
# cross joins
- # TODO(garrettwu): Support "on" list of columns and None. Currently a single
- # column must be provided
- on: Optional[str] = None,
+ on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
*,
- left_on: Optional[str] = None,
- right_on: Optional[str] = None,
+ left_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
+ right_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
sort: bool = False,
suffixes: tuple[str, str] = ("_x", "_y"),
) -> DataFrame:
@@ -1753,97 +1809,41 @@ def merge(
)
left_on, right_on = on, on
- left = self
- left_on_sql = self._sql_names(left_on)
- # 0 elements already throws an exception
- if len(left_on_sql) > 1:
- raise ValueError(f"The column label {left_on} is not unique.")
- left_on_sql = left_on_sql[0]
-
- right_on_sql = right._sql_names(right_on)
- if len(right_on_sql) > 1:
- raise ValueError(f"The column label {right_on} is not unique.")
- right_on_sql = right_on_sql[0]
-
- (
- joined_expr,
- join_key_ids,
- (get_column_left, get_column_right),
- ) = joins.join_by_column(
- left._block.expr,
- [left_on_sql],
- right._block.expr,
- [right_on_sql],
- how=how,
- sort=sort,
- # In merging on the same column, it only returns 1 key column from coalesced both.
- # While if 2 different columns, both will be presented in the result.
- coalesce_join_keys=(left_on == right_on),
- )
- # TODO(swast): Add suffixes to the column labels instead of reusing the
- # column IDs as the new labels.
- # Drop the index column(s) to be consistent with pandas.
- left_columns = [
- join_key_ids[0] if (col_id == left_on_sql) else get_column_left(col_id)
- for col_id in left._block.value_columns
- ]
-
- right_columns = []
- for col_id in right._block.value_columns:
- if col_id == right_on_sql:
- # When left_on == right_on
- if len(join_key_ids) > 1:
- right_columns.append(join_key_ids[1])
- else:
- right_columns.append(get_column_right(col_id))
-
- expr = joined_expr.select_columns([*left_columns, *right_columns])
- labels = self._get_merged_col_labels(
- right, left_on=left_on, right_on=right_on, suffixes=suffixes
- )
+ if utils.is_list_like(left_on):
+ left_on = list(left_on) # type: ignore
+ else:
+ left_on = [left_on]
- # Constructs default index
- expr, offset_index_id = expr.promote_offsets()
- block = blocks.Block(
- expr, index_columns=[offset_index_id], column_labels=labels
+ if utils.is_list_like(right_on):
+ right_on = list(right_on) # type: ignore
+ else:
+ right_on = [right_on]
+
+ left_join_ids = []
+ for label in left_on: # type: ignore
+ left_col_id = self._resolve_label_exact(label)
+ # 0 elements already throws an exception
+ if not left_col_id:
+ raise ValueError(f"No column {label} found in self.")
+ left_join_ids.append(left_col_id)
+
+ right_join_ids = []
+ for label in right_on: # type: ignore
+ right_col_id = right._resolve_label_exact(label)
+ if not right_col_id:
+ raise ValueError(f"No column {label} found in other.")
+ right_join_ids.append(right_col_id)
+
+ block = self._block.merge(
+ right._block,
+ how,
+ left_join_ids,
+ right_join_ids,
+ sort=sort,
+ suffixes=suffixes,
)
return DataFrame(block)
- def _get_merged_col_labels(
- self,
- right: DataFrame,
- left_on: str,
- right_on: str,
- suffixes: tuple[str, str] = ("_x", "_y"),
- ) -> List[blocks.Label]:
- on_col_equal = left_on == right_on
-
- left_col_labels: list[blocks.Label] = []
- for col_label in self._block.column_labels:
- if col_label in right._block.column_labels:
- if on_col_equal and col_label == left_on:
- # Merging on the same column only returns 1 key column from coalesce both.
- # Take the left key column.
- left_col_labels.append(col_label)
- else:
- left_col_labels.append(str(col_label) + suffixes[0])
- else:
- left_col_labels.append(col_label)
-
- right_col_labels: list[blocks.Label] = []
- for col_label in right._block.column_labels:
- if col_label in self._block.column_labels:
- if on_col_equal and col_label == left_on:
- # Merging on the same column only returns 1 key column from coalesce both.
- # Pass the right key column.
- pass
- else:
- right_col_labels.append(str(col_label) + suffixes[1])
- else:
- right_col_labels.append(col_label)
-
- return left_col_labels + right_col_labels
-
def join(
self, other: DataFrame, *, on: Optional[str] = None, how: str = "left"
) -> DataFrame:
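At the DataFrame level these changes mean `merge` accepts lists for `on`/`left_on`/`right_on`, and the new `update` and `align` methods build on the shared block-level alignment. A brief sketch with hypothetical in-memory frames:

import pandas as pd

import bigframes.pandas as bpd

left = bpd.read_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "y"], "v": [10, 20]}))
right = bpd.read_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "z"], "w": [0.1, 0.2]}))

# Join on multiple key columns at once.
merged = left.merge(right, how="inner", on=["a", "b"])

# align returns both frames reindexed to common row and column labels.
left_aligned, right_aligned = left.align(right, join="outer")

# update overwrites left values wherever `right` has non-null entries
# (left join on the index).
left.update(right)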
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 59d3007fab..46a7a1cb50 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -84,10 +84,10 @@
BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = (
(ibis_dtypes.boolean, pd.BooleanDtype()),
+ (ibis_dtypes.date, pd.ArrowDtype(pa.date32())),
(ibis_dtypes.float64, pd.Float64Dtype()),
(ibis_dtypes.int64, pd.Int64Dtype()),
(ibis_dtypes.string, pd.StringDtype(storage="pyarrow")),
- (ibis_dtypes.date, pd.ArrowDtype(pa.date32())),
(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))),
(ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))),
(
@@ -100,6 +100,19 @@
pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS
}
+IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = {
+ ibis_dtypes.boolean: pa.bool_(),
+ ibis_dtypes.date: pa.date32(),
+ ibis_dtypes.float64: pa.float64(),
+ ibis_dtypes.int64: pa.int64(),
+ ibis_dtypes.string: pa.string(),
+ ibis_dtypes.time: pa.time64("us"),
+ ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"),
+ ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"),
+}
+
+ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()}
+
IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = {
ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS
}
@@ -148,11 +161,12 @@ def ibis_dtype_to_bigframes_dtype(
# Special cases: Ibis supports variations on these types, but currently
# our IO returns them as objects. Eventually, we should support them as
# ArrowDType (and update the IO accordingly)
- if isinstance(ibis_dtype, ibis_dtypes.Array) or isinstance(
- ibis_dtype, ibis_dtypes.Struct
- ):
+ if isinstance(ibis_dtype, ibis_dtypes.Array):
return np.dtype("O")
+ if isinstance(ibis_dtype, ibis_dtypes.Struct):
+ return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
+
if ibis_dtype in IBIS_TO_BIGFRAMES:
return IBIS_TO_BIGFRAMES[ibis_dtype]
elif isinstance(ibis_dtype, ibis_dtypes.Null):
@@ -164,6 +178,26 @@ def ibis_dtype_to_bigframes_dtype(
)
+def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType:
+ if isinstance(ibis_dtype, ibis_dtypes.Array):
+ return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type))
+
+ if isinstance(ibis_dtype, ibis_dtypes.Struct):
+ return pa.struct(
+ [
+ (name, ibis_dtype_to_arrow_dtype(dtype))
+ for name, dtype in ibis_dtype.fields.items()
+ ]
+ )
+
+ if ibis_dtype in IBIS_TO_ARROW:
+ return IBIS_TO_ARROW[ibis_dtype]
+ else:
+ raise ValueError(
+ f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}"
+ )
+
+
def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
"""Converts an Ibis expression to canonical type.
@@ -187,6 +221,24 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table:
return table.select(*casted_columns)
+def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
+ if pa.types.is_struct(arrow_dtype):
+ struct_dtype = typing.cast(pa.StructType, arrow_dtype)
+ return ibis_dtypes.Struct.from_tuples(
+ [
+ (field.name, arrow_dtype_to_ibis_dtype(field.type))
+ for field in struct_dtype
+ ]
+ )
+
+ if arrow_dtype in ARROW_TO_IBIS:
+ return ARROW_TO_IBIS[arrow_dtype]
+ else:
+ raise ValueError(
+ f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
+ )
+
+
def bigframes_dtype_to_ibis_dtype(
bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]]
) -> ibis_dtypes.DataType:
@@ -202,6 +254,9 @@ def bigframes_dtype_to_ibis_dtype(
Raises:
ValueError: If passed a dtype not supported by BigQuery DataFrames.
"""
+ if isinstance(bigframes_dtype, pd.ArrowDtype):
+ return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)
+
type_string = str(bigframes_dtype)
if type_string in BIGFRAMES_STRING_TO_BIGFRAMES:
bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[
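A small self-contained sketch (pandas and pyarrow only, no BigQuery session required) of the dtype shape this change produces: a STRUCT column now surfaces as `pd.ArrowDtype` wrapping a `pa.struct`, which is what the new `ibis_dtype_to_arrow_dtype` and `arrow_dtype_to_ibis_dtype` helpers translate between. The column contents are illustrative.

```
import pandas as pd
import pyarrow as pa

# STRUCT<name STRING, age INT64> maps to an ArrowDtype wrapping a pyarrow struct.
person = pa.struct([("name", pa.string()), ("age", pa.int64())])
dtype = pd.ArrowDtype(person)

s = pd.Series([{"name": "Ada", "age": 36}], dtype=dtype)
print(s.dtype)                             # struct<name: string, age: int64>[pyarrow]
print(s.dtype.pyarrow_dtype.field("age"))  # pyarrow.Field<age: int64>
```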
diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
index 9effbf1968..bf046ff691 100644
--- a/bigframes/ml/compose.py
+++ b/bigframes/ml/compose.py
@@ -31,6 +31,7 @@
preprocessing.StandardScaler,
preprocessing.MaxAbsScaler,
preprocessing.MinMaxScaler,
+ preprocessing.KBinsDiscretizer,
preprocessing.LabelEncoder,
]
@@ -91,18 +92,24 @@ def transformers_(
return result
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(
+ self,
+ columns: List[str],
+ X: bpd.DataFrame,
+ ) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
columns (List[str]):
a list of column names to transform
+ X (bpd.DataFrame):
+ The DataFrame with training data.
Returns:
a list of tuples of (sql_expression, output_name)"""
return [
- transformer._compile_to_sql([column])[0]
+ transformer._compile_to_sql([column], X=X)[0]
for column in columns
for _, transformer, target_column in self.transformers_
if column == target_column
@@ -115,7 +122,7 @@ def fit(
) -> ColumnTransformer:
(X,) = utils.convert_to_dataframe(X)
- compiled_transforms = self._compile_to_sql(X.columns.tolist())
+ compiled_transforms = self._compile_to_sql(X.columns.tolist(), X)
transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
self._bqml_model = self._bqml_model_factory.create_model(
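A hypothetical usage sketch (assumes a configured BigQuery session and the public `penguins` table) showing why `_compile_to_sql` now receives the training frame: `KBinsDiscretizer` derives its uniform split points from `X` at fit time, while the other transformers ignore the argument.

```
import bigframes.pandas as bpd
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import KBinsDiscretizer, StandardScaler

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

ct = ColumnTransformer([
    ("scale", StandardScaler(), "culmen_length_mm"),
    ("bin", KBinsDiscretizer(n_bins=4, strategy="uniform"), "body_mass_g"),
])
# fit() compiles each transform to SQL; KBinsDiscretizer reads min/max from X here.
ct.fit(df[["culmen_length_mm", "body_mass_g"]])
```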
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
index 110cbcf493..443b9e7be6 100644
--- a/bigframes/ml/model_selection.py
+++ b/bigframes/ml/model_selection.py
@@ -17,6 +17,7 @@
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection."""
+import typing
from typing import List, Union
from bigframes.ml import utils
@@ -79,9 +80,10 @@ def train_test_split(
train_index = split_dfs[0].index
test_index = split_dfs[1].index
- split_dfs += [
- df.loc[index] for df in dfs[1:] for index in (train_index, test_index)
- ]
+ split_dfs += typing.cast(
+ List[bpd.DataFrame],
+ [df.loc[index] for df in dfs[1:] for index in (train_index, test_index)],
+ )
# convert back to Series.
results: List[Union[bpd.DataFrame, bpd.Series]] = []
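A short usage sketch of `train_test_split` (assumes a BigQuery session); the cast above simply annotates the list of extra frames that are sliced by the same train/test indexes, matching how the notebooks added later in this change use it.

```
import bigframes.pandas as bpd
from bigframes.ml.model_selection import train_test_split

df = bpd.read_gbq("bigquery-public-data.ml_datasets.iris")
X = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = df[["species"]]

# Rows stay aligned across X and y because both are sliced by the same indexes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
```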
diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py
index ac02c39112..ad0b3fae11 100644
--- a/bigframes/ml/pipeline.py
+++ b/bigframes/ml/pipeline.py
@@ -52,6 +52,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]):
preprocessing.OneHotEncoder,
preprocessing.MaxAbsScaler,
preprocessing.MinMaxScaler,
+ preprocessing.KBinsDiscretizer,
preprocessing.LabelEncoder,
),
):
@@ -93,7 +94,7 @@ def fit(
) -> Pipeline:
(X,) = utils.convert_to_dataframe(X)
- compiled_transforms = self._transform._compile_to_sql(X.columns.tolist())
+ compiled_transforms = self._transform._compile_to_sql(X.columns.tolist(), X=X)
transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
if y is not None:
@@ -151,6 +152,7 @@ def _extract_as_column_transformer(
preprocessing.StandardScaler,
preprocessing.MaxAbsScaler,
preprocessing.MinMaxScaler,
+ preprocessing.KBinsDiscretizer,
preprocessing.LabelEncoder,
],
Union[str, List[str]],
@@ -190,6 +192,13 @@ def _extract_as_column_transformer(
*preprocessing.MinMaxScaler._parse_from_sql(transform_sql),
)
)
+ elif transform_sql.startswith("ML.BUCKETIZE"):
+ transformers.append(
+ (
+ "k_bins_discretizer",
+ *preprocessing.KBinsDiscretizer._parse_from_sql(transform_sql),
+ )
+ )
elif transform_sql.startswith("ML.LABEL_ENCODER"):
transformers.append(
(
@@ -213,6 +222,7 @@ def _merge_column_transformer(
preprocessing.OneHotEncoder,
preprocessing.MaxAbsScaler,
preprocessing.MinMaxScaler,
+ preprocessing.KBinsDiscretizer,
preprocessing.LabelEncoder,
]:
"""Try to merge the column transformer to a simple transformer."""
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
index caf4657a63..5f44d40218 100644
--- a/bigframes/ml/preprocessing.py
+++ b/bigframes/ml/preprocessing.py
@@ -23,6 +23,7 @@
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd
import third_party.bigframes_vendored.sklearn.preprocessing._data
+import third_party.bigframes_vendored.sklearn.preprocessing._discretization
import third_party.bigframes_vendored.sklearn.preprocessing._encoder
import third_party.bigframes_vendored.sklearn.preprocessing._label
@@ -44,12 +45,15 @@ def __init__(self):
def __eq__(self, other: Any) -> bool:
return type(other) is StandardScaler and self._bqml_model == other._bqml_model
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
- columns: a list of column names to transform
+ columns:
+ a list of column names to transform.
+ X (default None):
+ Ignored.
Returns: a list of tuples of (sql_expression, output_name)"""
return [
@@ -124,12 +128,15 @@ def __init__(self):
def __eq__(self, other: Any) -> bool:
return type(other) is MaxAbsScaler and self._bqml_model == other._bqml_model
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
- columns: a list of column names to transform
+ columns:
+ a list of column names to transform.
+ X (default None):
+ Ignored.
Returns: a list of tuples of (sql_expression, output_name)"""
return [
@@ -204,12 +211,15 @@ def __init__(self):
def __eq__(self, other: Any) -> bool:
return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
- columns: a list of column names to transform
+ columns:
+ a list of column names to transform.
+ X (default None):
+ Ignored.
Returns: a list of tuples of (sql_expression, output_name)"""
return [
@@ -267,6 +277,124 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
)
+class KBinsDiscretizer(
+ base.Transformer,
+ third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer,
+):
+ __doc__ = (
+ third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer.__doc__
+ )
+
+ def __init__(
+ self,
+ n_bins: int = 5,
+ strategy: Literal["uniform", "quantile"] = "quantile",
+ ):
+ if strategy != "uniform":
+ raise NotImplementedError(
+ f"Only strategy = 'uniform' is supported now, input is {strategy}."
+ )
+ if n_bins < 2:
+ raise ValueError(
+ f"n_bins has to be larger than or equal to 2, input is {n_bins}."
+ )
+ self.n_bins = n_bins
+ self.strategy = strategy
+ self._bqml_model: Optional[core.BqmlModel] = None
+ self._bqml_model_factory = globals.bqml_model_factory()
+ self._base_sql_generator = globals.base_sql_generator()
+
+ # TODO(garrettwu): implement __hash__
+ def __eq__(self, other: Any) -> bool:
+ return (
+ type(other) is KBinsDiscretizer
+ and self.n_bins == other.n_bins
+ and self._bqml_model == other._bqml_model
+ )
+
+ def _compile_to_sql(
+ self,
+ columns: List[str],
+ X: bpd.DataFrame,
+ ) -> List[Tuple[str, str]]:
+ """Compile this transformer to a list of SQL expressions that can be included in
+ a BQML TRANSFORM clause
+
+ Args:
+ columns:
+ a list of column names to transform
+ X:
+ The DataFrame with training data.
+
+ Returns: a list of tuples of (sql_expression, output_name)"""
+ array_split_points = {}
+ if self.strategy == "uniform":
+ for column in columns:
+ min_value = X[column].min()
+ max_value = X[column].max()
+ bin_size = (max_value - min_value) / self.n_bins
+ array_split_points[column] = [
+ min_value + i * bin_size for i in range(self.n_bins - 1)
+ ]
+
+ return [
+ (
+ self._base_sql_generator.ml_bucketize(
+ column, array_split_points[column], f"kbinsdiscretizer_{column}"
+ ),
+ f"kbinsdiscretizer_{column}",
+ )
+ for column in columns
+ ]
+
+ @classmethod
+ def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]:
+ """Parse SQL to tuple(KBinsDiscretizer, column_label).
+
+ Args:
+ sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()"
+
+ Returns:
+ tuple(KBinsDiscretizer, column_label)"""
+ s = sql[sql.find("(") + 1 : sql.find(")")]
+ array_split_points = s[s.find("[") + 1 : s.find("]")]
+ col_label = s[: s.find(",")]
+ n_bins = array_split_points.count(",") + 2
+ return cls(n_bins, "uniform"), col_label
+
+ def fit(
+ self,
+ X: Union[bpd.DataFrame, bpd.Series],
+ y=None, # ignored
+ ) -> KBinsDiscretizer:
+ (X,) = utils.convert_to_dataframe(X)
+
+ compiled_transforms = self._compile_to_sql(X.columns.tolist(), X)
+ transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
+
+ self._bqml_model = self._bqml_model_factory.create_model(
+ X,
+ options={"model_type": "transform_only"},
+ transforms=transform_sqls,
+ )
+
+ # The schema of TRANSFORM output is not available in the model API, so save it during fitting
+ self._output_names = [name for _, name in compiled_transforms]
+ return self
+
+ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+ if not self._bqml_model:
+ raise RuntimeError("Must be fitted before transform")
+
+ (X,) = utils.convert_to_dataframe(X)
+
+ df = self._bqml_model.transform(X)
+ return typing.cast(
+ bpd.DataFrame,
+ df[self._output_names],
+ )
+
+
class OneHotEncoder(
base.Transformer,
third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder,
@@ -308,13 +436,15 @@ def __eq__(self, other: Any) -> bool:
and self.max_categories == other.max_categories
)
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
columns:
- a list of column names to transform
+ a list of column names to transform.
+ X (default None):
+ Ignored.
Returns: a list of tuples of (sql_expression, output_name)"""
@@ -432,13 +562,15 @@ def __eq__(self, other: Any) -> bool:
and self.max_categories == other.max_categories
)
- def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
"""Compile this transformer to a list of SQL expressions that can be included in
a BQML TRANSFORM clause
Args:
columns:
- a list of column names to transform
+ a list of column names to transform.
+ X (default None):
+ Ignored.
Returns: a list of tuples of (sql_expression, output_name)"""
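A pure-Python sketch of the uniform split-point math in `KBinsDiscretizer._compile_to_sql` and of the round-trip that `_parse_from_sql` performs; the min, max, and bin count are made-up numbers.

```
n_bins = 5
min_value, max_value = 0.0, 10.0

bin_size = (max_value - min_value) / n_bins                       # 2.0
split_points = [min_value + i * bin_size for i in range(n_bins - 1)]
print(split_points)                                               # [0.0, 2.0, 4.0, 6.0]

# The compiled transform embeds that list directly:
#   ML.BUCKETIZE(col, [0.0, 2.0, 4.0, 6.0], FALSE) AS kbinsdiscretizer_col
# _parse_from_sql recovers n_bins as the comma count inside the array plus 2:
print(str(split_points).count(",") + 2)                           # 5
```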
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 57c8ba672a..601b271099 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -85,6 +85,15 @@ def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str:
"""Encode ML.MIN_MAX_SCALER for BQML"""
return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}"""
+ def ml_bucketize(
+ self,
+ numeric_expr_sql: str,
+ array_split_points: Iterable[Union[int, float]],
+ name: str,
+ ) -> str:
+ """Encode ML.MIN_MAX_SCALER for BQML"""
+ return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}"""
+
def ml_one_hot_encoder(
self,
numeric_expr_sql: str,
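A minimal sketch of the string `ml_bucketize` emits; no BigQuery call is involved, this just mirrors the f-string above with illustrative arguments.

```
def ml_bucketize(numeric_expr_sql, array_split_points, name):
    # Mirrors BaseSqlGenerator.ml_bucketize above.
    return f"ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}"

print(ml_bucketize("body_mass_g", [3000.0, 4000.0, 5000.0], "kbinsdiscretizer_body_mass_g"))
# ML.BUCKETIZE(body_mass_g, [3000.0, 4000.0, 5000.0], FALSE) AS kbinsdiscretizer_body_mass_g
```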
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index add6af57f4..51eaad18b9 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -86,7 +86,15 @@ def __init__(
if pd_series.name is None:
# to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename
pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1)
- if pd_dataframe.size < MAX_INLINE_SERIES_SIZE:
+ if (
+ pd_dataframe.size < MAX_INLINE_SERIES_SIZE
+ # TODO(swast): Workaround data types limitation in inline data.
+ and not any(
+ dt.pyarrow_dtype
+ for dt in pd_dataframe.dtypes
+ if isinstance(dt, pd.ArrowDtype)
+ )
+ ):
self._block = blocks.block_from_local(
pd_dataframe, session or bigframes.pandas.get_global_session()
)
diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py
new file mode 100644
index 0000000000..80d51115d0
--- /dev/null
+++ b/bigframes/operations/structs.py
@@ -0,0 +1,61 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import typing
+
+import ibis.expr.types as ibis_types
+
+import bigframes.dataframe
+import bigframes.operations
+import bigframes.operations.base
+import bigframes.series
+import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors
+
+
+class StructField(bigframes.operations.UnaryOp):
+ def __init__(self, name_or_index: str | int):
+ self._name_or_index = name_or_index
+
+ def _as_ibis(self, x: ibis_types.Value):
+ struct_value = typing.cast(ibis_types.StructValue, x)
+ if isinstance(self._name_or_index, str):
+ name = self._name_or_index
+ else:
+ name = struct_value.names[self._name_or_index]
+ return struct_value[name].name(name)
+
+
+class StructAccessor(
+ bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor
+):
+ __doc__ = vendoracessors.StructAccessor.__doc__
+
+ def field(self, name_or_index: str | int) -> bigframes.series.Series:
+ series = self._apply_unary_op(StructField(name_or_index))
+ if isinstance(name_or_index, str):
+ name = name_or_index
+ else:
+ struct_field = self._dtype.pyarrow_dtype[name_or_index]
+ name = struct_field.name
+ return series.rename(name)
+
+ def explode(self) -> bigframes.dataframe.DataFrame:
+ import bigframes.pandas
+
+ pa_type = self._dtype.pyarrow_dtype
+ return bigframes.pandas.concat(
+ [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
+ )
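A hypothetical usage sketch of the new `Series.struct` accessor (assumes a BigQuery session; the query just fabricates a single STRUCT value).

```
import bigframes.pandas as bpd

df = bpd.read_gbq("SELECT STRUCT('Ada' AS name, 36 AS age) AS person")

names = df["person"].struct.field("name")   # Series renamed to "name"
ages = df["person"].struct.field(1)         # positional lookup also works
wide = df["person"].struct.explode()        # one column per struct field
```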
diff --git a/bigframes/series.py b/bigframes/series.py
index 47298d59f5..8815a6abde 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -51,6 +51,7 @@
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.strings as strings
+import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
LevelType = typing.Union[str, int]
@@ -118,6 +119,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
self._set_internal_query_job(self._compute_dry_run())
return self._query_job
+ @property
+ def struct(self) -> structs.StructAccessor:
+ return structs.StructAccessor(self._block)
+
def _set_internal_query_job(self, query_job: bigquery.QueryJob):
self._query_job = query_job
@@ -882,6 +887,34 @@ def argmin(self) -> int:
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
)
+ def idxmax(self) -> blocks.Label:
+ block = self._block.order_by(
+ [
+ OrderingColumnReference(
+ self._value_column, direction=OrderingDirection.DESC
+ ),
+ *[
+ OrderingColumnReference(idx_col)
+ for idx_col in self._block.index_columns
+ ],
+ ]
+ )
+ block = block.slice(0, 1)
+ return indexes.Index._from_block(block).to_pandas()[0]
+
+ def idxmin(self) -> blocks.Label:
+ block = self._block.order_by(
+ [
+ OrderingColumnReference(self._value_column),
+ *[
+ OrderingColumnReference(idx_col)
+ for idx_col in self._block.index_columns
+ ],
+ ]
+ )
+ block = block.slice(0, 1)
+ return indexes.Index._from_block(block).to_pandas()[0]
+
@property
def is_monotonic_increasing(self) -> bool:
return typing.cast(
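An illustrative pandas sketch of the semantics `idxmax`/`idxmin` implement above: order by the value (descending for `idxmax`), break ties by the index, and return the first index label. The series is a made-up example.

```
import pandas as pd

s = pd.Series([3, 7, 7, 1], index=["a", "b", "c", "d"])
print(s.idxmax())  # 'b' -- ties broken by index order, matching the block sort
print(s.idxmin())  # 'd'
```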
diff --git a/bigframes/session.py b/bigframes/session.py
index 7b827c7dcf..ac48c977cb 100644
--- a/bigframes/session.py
+++ b/bigframes/session.py
@@ -449,13 +449,6 @@ def _query_to_destination(
index_cols: List[str],
api_name: str,
) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
- # If there are no index columns, then there's no reason to cache to a
- # (clustered) session table, as we'll just have to query it again to
- # create a default index & ordering.
- if not index_cols:
- _, query_job = self._start_query(query)
- return query_job.destination, query_job
-
# If a dry_run indicates this is not a query type job, then don't
# bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
dry_run_config = bigquery.QueryJobConfig()
@@ -465,15 +458,24 @@ def _query_to_destination(
_, query_job = self._start_query(query)
return query_job.destination, query_job
- # Make sure we cluster by the index column(s) so that subsequent
- # operations are as speedy as they can be.
+ # Create a table to workaround BigQuery 10 GB query results limit. See:
+ # internal issue 303057336.
+ # Since we have a `statement_type == 'SELECT'`, schema should be populated.
+ schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema)
+ temp_table = self._create_session_table_empty(api_name, schema, index_cols)
+
+ job_config = bigquery.QueryJobConfig()
+ job_config.destination = temp_table
+
try:
- ibis_expr = self.ibis_client.sql(query)
- return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None
+ # Write to temp table to workaround BigQuery 10 GB query results
+ # limit. See: internal issue 303057336.
+ _, query_job = self._start_query(query, job_config=job_config)
+ return query_job.destination, query_job
except google.api_core.exceptions.BadRequest:
- # Some SELECT statements still aren't compatible with CREATE TEMP
- # TABLE ... AS SELECT ... statements. For example, if the query has
- # a top-level ORDER BY, this conflicts with our ability to cluster
+ # Some SELECT statements still aren't compatible with cluster
+ # tables as the destination. For example, if the query has a
+ # top-level ORDER BY, this conflicts with our ability to cluster
# the table by the index column(s).
_, query_job = self._start_query(query)
return query_job.destination, query_job
@@ -1231,6 +1233,54 @@ def _create_session_table(self) -> bigquery.TableReference:
)
return dataset.table(table_name)
+ def _create_session_table_empty(
+ self,
+ api_name: str,
+ schema: Iterable[bigquery.SchemaField],
+ cluster_cols: List[str],
+ ) -> bigquery.TableReference:
+ # Can't set a table in _SESSION as destination via query job API, so we
+ # run DDL, instead.
+ table = self._create_session_table()
+ schema_sql = bigframes_io.bq_schema_to_sql(schema)
+
+ clusterable_cols = [
+ col.name
+ for col in schema
+ if col.name in cluster_cols and _can_cluster_bq(col)
+ ][:_MAX_CLUSTER_COLUMNS]
+
+ if clusterable_cols:
+ cluster_cols_sql = ", ".join(
+ f"`{cluster_col}`" for cluster_col in clusterable_cols
+ )
+ cluster_sql = f"CLUSTER BY {cluster_cols_sql}"
+ else:
+ cluster_sql = ""
+
+ ddl_text = f"""
+ CREATE TEMP TABLE
+ `_SESSION`.`{table.table_id}`
+ ({schema_sql})
+ {cluster_sql}
+ """
+
+ job_config = bigquery.QueryJobConfig()
+
+ # Include a label so that Dataplex Lineage can identify temporary
+ # tables that BigQuery DataFrames creates. Googlers: See internal issue
+ # 296779699. We're labeling the job instead of the table because
+ # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
+ # supported`.
+ job_config.labels = {"source": "bigquery-dataframes-temp"}
+ job_config.labels["bigframes-api"] = api_name
+
+ _, query_job = self._start_query(ddl_text, job_config=job_config)
+
+ # Use fully-qualified name instead of `_SESSION` name so that the
+ # created table can be used as the destination table.
+ return query_job.destination
+
def _create_sequential_ordering(
self,
table: ibis_types.Table,
@@ -1249,7 +1299,9 @@ def _create_sequential_ordering(
cluster_cols=list(index_cols) + [default_ordering_name],
api_name=api_name,
)
- table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`")
+ table = self.ibis_client.table(
+ f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
+ )
ordering_reference = core.OrderingColumnReference(default_ordering_name)
ordering = core.ExpressionOrdering(
ordering_value_columns=[ordering_reference],
@@ -1264,55 +1316,13 @@ def _ibis_to_session_table(
cluster_cols: Iterable[str],
api_name: str,
) -> bigquery.TableReference:
- clusterable_cols = [
- col for col in cluster_cols if _can_cluster(table[col].type())
- ][:_MAX_CLUSTER_COLUMNS]
- return self._query_to_session_table(
+ destination, _ = self._query_to_destination(
self.ibis_client.compile(table),
- cluster_cols=clusterable_cols,
+ index_cols=list(cluster_cols),
api_name=api_name,
)
-
- def _query_to_session_table(
- self,
- query_text: str,
- cluster_cols: Iterable[str],
- api_name: str,
- ) -> bigquery.TableReference:
- if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS:
- raise ValueError(
- f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed."
- )
- # Can't set a table in _SESSION as destination via query job API, so we
- # run DDL, instead.
- table = self._create_session_table()
- cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols)
-
- # TODO(swast): This might not support multi-statement SQL queries (scripts).
- ddl_text = f"""
- CREATE TEMP TABLE `_SESSION`.`{table.table_id}`
- CLUSTER BY {cluster_cols_sql}
- AS {query_text}
- """
-
- job_config = bigquery.QueryJobConfig()
-
- # Include a label so that Dataplex Lineage can identify temporary
- # tables that BigQuery DataFrames creates. Googlers: See internal issue
- # 296779699. We're labeling the job instead of the table because
- # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
- # supported`.
- job_config.labels = {"source": "bigquery-dataframes-temp"}
- job_config.labels["bigframes-api"] = api_name
-
- try:
- self._start_query(
- ddl_text, job_config=job_config
- ) # Wait for the job to complete
- except google.api_core.exceptions.Conflict:
- # Allow query retry to succeed.
- pass
- return table
+ # There should always be a destination table for this query type.
+ return typing.cast(bigquery.TableReference, destination)
def remote_function(
self,
@@ -1494,14 +1504,21 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Sessi
return Session(context)
-def _can_cluster(ibis_type: ibis_dtypes.DataType):
+def _can_cluster_bq(field: bigquery.SchemaField):
# https://cloud.google.com/bigquery/docs/clustered-tables
# Notably, float is excluded
- return (
- ibis_type.is_integer()
- or ibis_type.is_string()
- or ibis_type.is_decimal()
- or ibis_type.is_date()
- or ibis_type.is_timestamp()
- or ibis_type.is_boolean()
+ type_ = field.field_type
+ return type_ in (
+ "INTEGER",
+ "INT64",
+ "STRING",
+ "NUMERIC",
+ "DECIMAL",
+ "BIGNUMERIC",
+ "BIGDECIMAL",
+ "DATE",
+ "DATETIME",
+ "TIMESTAMP",
+ "BOOL",
+ "BOOLEAN",
)
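A rough sketch of the DDL that `_create_session_table_empty` assembles; the table id, schema SQL, and column names are placeholders (the real values come from `_create_session_table`, `bq_schema_to_sql`, and the dry-run schema), and only clusterable types survive the `_can_cluster_bq` filter.

```
# Placeholder inputs standing in for the dry-run schema and the session table id.
schema_sql = "`rowindex` INT64, `name` STRING, `score` FLOAT64"
clusterable_cols = ["rowindex", "name"]  # FLOAT64 is not clusterable, so `score` is dropped

cluster_sql = "CLUSTER BY " + ", ".join(f"`{col}`" for col in clusterable_cols)
ddl_text = f"""
CREATE TEMP TABLE
`_SESSION`.`bqdf_example_table`
({schema_sql})
{cluster_sql}
"""
print(ddl_text)
```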
diff --git a/bigframes/version.py b/bigframes/version.py
index ad3c3082c5..238b64473a 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "0.5.0"
+__version__ = "0.6.0"
diff --git a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb
new file mode 100644
index 0000000000..598d958f0c
--- /dev/null
+++ b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb
@@ -0,0 +1,723 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ur8xi4C7S06n"
+ },
+ "outputs": [],
+ "source": [
+ "# Copyright 2023 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JAPoU8Sm5E6e"
+ },
+ "source": [
+ "# Train a pytorch model with Vertex AI SDK 2.0 and Bigframes\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Run in Colab\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " \n",
+ " View on GitHub\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " Open in Vertex AI Workbench\n",
+ " \n",
+ " |
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tvgnzT1CKxrO"
+ },
+ "source": [
+ "## Overview\n",
+ "\n",
+ "This tutorial demonstrates how to train a pytorch model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n",
+ "\n",
+ "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "d975e698c9a4"
+ },
+ "source": [
+ "### Objective\n",
+ "\n",
+ "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n",
+ "\n",
+ "\n",
+ "This tutorial uses the following Google Cloud ML services:\n",
+ "\n",
+ "- `Vertex AI Training`\n",
+ "- `Vertex AI Remote Training`\n",
+ "\n",
+ "\n",
+ "The steps performed include:\n",
+ "\n",
+ "- Initialize a dataframe from a BigQuery table and split the dataset\n",
+ "- Perform transformations as a Vertex AI remote training.\n",
+ "- Train the model remotely and evaluate the model locally\n",
+ "\n",
+ "**Local-to-remote training**\n",
+ "\n",
+ "```\n",
+ "import vertexai\n",
+ "from my_module import MyModelClass\n",
+ "\n",
+ "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n",
+ "\n",
+ "# Wrap the model class with `vertex_ai.preview.remote`\n",
+ "MyModelClass = vertexai.preview.remote(MyModelClass)\n",
+ "\n",
+ "# Instantiate the class\n",
+ "model = MyModelClass(...)\n",
+ "\n",
+ "# Optional set remote config\n",
+ "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n",
+ "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n",
+ "\n",
+ "# This `fit` call will be executed remotely\n",
+ "model.fit(...)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "08d289fa873f"
+ },
+ "source": [
+ "### Dataset\n",
+ "\n",
+ "This tutorial uses the IRIS dataset, which predicts the iris species."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aed92deeb4a0"
+ },
+ "source": [
+ "### Costs\n",
+ "\n",
+ "This tutorial uses billable components of Google Cloud:\n",
+ "\n",
+ "* Vertex AI\n",
+ "* BigQuery\n",
+ "* Cloud Storage\n",
+ "\n",
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n",
+ "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n",
+ "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n",
+ "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n",
+ "to generate a cost estimate based on your projected usage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i7EUnXsZhAGF"
+ },
+ "source": [
+ "## Installation\n",
+ "\n",
+ "Install the following packages required to execute this notebook. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2b4ef9b72d43"
+ },
+ "outputs": [],
+ "source": [
+ "# Install the packages\n",
+ "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n",
+ "! pip3 install --upgrade --quiet bigframes\n",
+ "! pip3 install --upgrade --quiet torch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "58707a750154"
+ },
+ "source": [
+ "### Colab only: Uncomment the following cell to restart the kernel."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "f200f10a1da3"
+ },
+ "outputs": [],
+ "source": [
+ "# Automatically restart kernel after installs so that your environment can access the new packages\n",
+ "# import IPython\n",
+ "\n",
+ "# app = IPython.Application.instance()\n",
+ "# app.kernel.do_shutdown(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BF1j6f9HApxa"
+ },
+ "source": [
+ "## Before you begin\n",
+ "\n",
+ "### Set up your Google Cloud project\n",
+ "\n",
+ "**The following steps are required, regardless of your notebook environment.**\n",
+ "\n",
+ "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n",
+ "\n",
+ "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
+ "\n",
+ "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n",
+ "\n",
+ "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WReHDGG5g0XY"
+ },
+ "source": [
+ "#### Set your project ID\n",
+ "\n",
+ "**If you don't know your project ID**, try the following:\n",
+ "* Run `gcloud config list`.\n",
+ "* Run `gcloud projects list`.\n",
+ "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "oM1iC_MfAts1"
+ },
+ "outputs": [],
+ "source": [
+ "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n",
+ "\n",
+ "# Set the project id\n",
+ "! gcloud config set project {PROJECT_ID}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "region"
+ },
+ "source": [
+ "#### Region\n",
+ "\n",
+ "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "region"
+ },
+ "outputs": [],
+ "source": [
+ "REGION = \"us-central1\" # @param {type: \"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sBCra4QMA2wR"
+ },
+ "source": [
+ "### Authenticate your Google Cloud account\n",
+ "\n",
+ "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "74ccc9e52986"
+ },
+ "source": [
+ "**1. Vertex AI Workbench**\n",
+ "* Do nothing as you are already authenticated."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "de775a3773ba"
+ },
+ "source": [
+ "**2. Local JupyterLab instance, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "254614fa0c46"
+ },
+ "outputs": [],
+ "source": [
+ "# ! gcloud auth login"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ef21552ccea8"
+ },
+ "source": [
+ "**3. Colab, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "603adbbf0532"
+ },
+ "outputs": [],
+ "source": [
+ "# from google.colab import auth\n",
+ "# auth.authenticate_user()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "f6b2ccc891ed"
+ },
+ "source": [
+ "**4. Service account or other**\n",
+ "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zgPO1eR3CYjk"
+ },
+ "source": [
+ "### Create a Cloud Storage bucket\n",
+ "\n",
+ "Create a storage bucket to store intermediate artifacts such as datasets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MzGDU7TWdts_"
+ },
+ "outputs": [],
+ "source": [
+ "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-EcIXiGsCePi"
+ },
+ "source": [
+ "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NIq7R4HZCfIc"
+ },
+ "outputs": [],
+ "source": [
+ "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "960505627ddf"
+ },
+ "source": [
+ "### Import libraries and define constants"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PyQmSRbKA8r-"
+ },
+ "outputs": [],
+ "source": [
+ "import bigframes.pandas as bf\n",
+ "import torch\n",
+ "import vertexai\n",
+ "from vertexai.preview import VertexModel\n",
+ "\n",
+ "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n",
+ "bf.options.bigquery.project = PROJECT_ID\n",
+ "\n",
+ "from bigframes.ml.model_selection import \\\n",
+ " train_test_split as bf_train_test_split"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "source": [
+ "## Initialize Vertex AI SDK for Python\n",
+ "\n",
+ "Initialize the Vertex AI SDK for Python for your project and corresponding bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "outputs": [],
+ "source": [
+ "vertexai.init(\n",
+ " project=PROJECT_ID,\n",
+ " location=REGION,\n",
+ " staging_bucket=BUCKET_URI,\n",
+ ")\n",
+ "\n",
+ "REMOTE_JOB_NAME = \"sdk2-bigframes-pytorch\"\n",
+ "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "105334524e96"
+ },
+ "source": [
+ "## Prepare the dataset\n",
+ "\n",
+ "Now load the Iris dataset and split the data into train and test sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "b44cdc4e03f1"
+ },
+ "outputs": [],
+ "source": [
+ "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n",
+ "\n",
+ "species_categories = {\n",
+ " \"versicolor\": 0,\n",
+ " \"virginica\": 1,\n",
+ " \"setosa\": 2,\n",
+ "}\n",
+ "df[\"species\"] = df[\"species\"].map(species_categories)\n",
+ "\n",
+ "# Assign an index column name\n",
+ "index_col = \"index\"\n",
+ "df.index.name = index_col"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9cb8616b1997"
+ },
+ "outputs": [],
+ "source": [
+ "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n",
+ "label_columns = df[[\"species\"]]\n",
+ "train_X, test_X, train_y, test_y = bf_train_test_split(\n",
+ " feature_columns, label_columns, test_size=0.2\n",
+ ")\n",
+ "\n",
+ "print(\"X_train size: \", train_X.size)\n",
+ "print(\"X_test size: \", test_X.size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "23fe7b734b08"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to remote mode for training\n",
+ "vertexai.preview.init(remote=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5904a0f1bb03"
+ },
+ "source": [
+ "## PyTorch remote training with CPU (Custom PyTorch model)\n",
+ "\n",
+ "First, train a PyTorch model as a remote training job:\n",
+ "\n",
+ "- Reinitialize Vertex AI for remote training.\n",
+ "- Set TorchLogisticRegression for the remote training job.\n",
+ "- Invoke TorchLogisticRegression locally which will launch the remote training job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2a1b85195a17"
+ },
+ "outputs": [],
+ "source": [
+ "# define the custom model\n",
+ "class TorchLogisticRegression(VertexModel, torch.nn.Module):\n",
+ " def __init__(self, input_size: int, output_size: int):\n",
+ " torch.nn.Module.__init__(self)\n",
+ " VertexModel.__init__(self)\n",
+ " self.linear = torch.nn.Linear(input_size, output_size)\n",
+ " self.softmax = torch.nn.Softmax(dim=1)\n",
+ "\n",
+ " def forward(self, x):\n",
+ " return self.softmax(self.linear(x))\n",
+ "\n",
+ " @vertexai.preview.developer.mark.train()\n",
+ " def train(self, X, y, num_epochs, lr):\n",
+ " X = X.to(torch.float32)\n",
+ " y = torch.flatten(y) # necessary to get 1D tensor\n",
+ " dataloader = torch.utils.data.DataLoader(\n",
+ " torch.utils.data.TensorDataset(X, y),\n",
+ " batch_size=10,\n",
+ " shuffle=True,\n",
+ " generator=torch.Generator(device=X.device),\n",
+ " )\n",
+ "\n",
+ " criterion = torch.nn.CrossEntropyLoss()\n",
+ " optimizer = torch.optim.SGD(self.parameters(), lr=lr)\n",
+ "\n",
+ " for t in range(num_epochs):\n",
+ " for batch, (X, y) in enumerate(dataloader):\n",
+ " optimizer.zero_grad()\n",
+ " pred = self(X)\n",
+ " loss = criterion(pred, y)\n",
+ " loss.backward()\n",
+ " optimizer.step()\n",
+ "\n",
+ " @vertexai.preview.developer.mark.predict()\n",
+ " def predict(self, X):\n",
+ " X = torch.tensor(X).to(torch.float32)\n",
+ " with torch.no_grad():\n",
+ " pred = torch.argmax(self(X), dim=1)\n",
+ " return pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4e35593f520a"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to remote mode for training\n",
+ "vertexai.preview.init(remote=True)\n",
+ "\n",
+ "# Instantiate model\n",
+ "model = TorchLogisticRegression(4, 3)\n",
+ "\n",
+ "# Set training config\n",
+ "model.train.vertex.remote_config.custom_commands = [\n",
+ " \"pip install torchdata\",\n",
+ " \"pip install torcharrow\",\n",
+ "]\n",
+ "model.train.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-model\"\n",
+ "model.train.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "\n",
+ "# Train model on Vertex\n",
+ "model.train(train_X, train_y, num_epochs=200, lr=0.05)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "edf4d0708f02"
+ },
+ "source": [
+ "## Remote prediction\n",
+ "\n",
+ "Obtain predictions from the trained model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "42dfbff0ca15"
+ },
+ "outputs": [],
+ "source": [
+ "vertexai.preview.init(remote=True)\n",
+ "\n",
+ "# Set remote config\n",
+ "model.predict.vertex.remote_config.custom_commands = [\n",
+ " \"pip install torchdata\",\n",
+ " \"pip install torcharrow\",\n",
+ "]\n",
+ "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-predict\"\n",
+ "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "\n",
+ "predictions = model.predict(test_X)\n",
+ "\n",
+ "print(f\"Remote predictions: {predictions}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4340ed8316cd"
+ },
+ "source": [
+ "## Local evaluation\n",
+ "\n",
+ "Evaluate model results locally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "eb27a31cec6f"
+ },
+ "outputs": [],
+ "source": [
+ "# User must convert bigframes to torch tensor for local evaluation\n",
+ "train_X_tensor = torch.from_numpy(\n",
+ " train_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n",
+ ")\n",
+ "train_y_tensor = torch.from_numpy(\n",
+ " train_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n",
+ ")\n",
+ "\n",
+ "test_X_tensor = torch.from_numpy(\n",
+ " test_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n",
+ ")\n",
+ "test_y_tensor = torch.from_numpy(\n",
+ " test_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7db44ad81389"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Switch to local mode for evaluation\n",
+ "vertexai.preview.init(remote=False)\n",
+ "\n",
+ "# Evaluate model's accuracy score\n",
+ "print(\n",
+ " f\"Train accuracy: {accuracy_score(train_y_tensor, model.predict(train_X_tensor))}\"\n",
+ ")\n",
+ "\n",
+ "print(f\"Test accuracy: {accuracy_score(test_y_tensor, model.predict(test_X_tensor))}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "TpV-iwP9qw9c"
+ },
+ "source": [
+ "## Cleaning up\n",
+ "\n",
+ "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n",
+ "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n",
+ "\n",
+ "Otherwise, you can delete the individual resources you created in this tutorial:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sx_vKniMq9ZX"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Delete Cloud Storage objects that were created\n",
+ "delete_bucket = False\n",
+ "if delete_bucket or os.getenv(\"IS_TESTING\"):\n",
+ " ! gsutil -m rm -r $BUCKET_URI"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "sdk2_bigframes_pytorch.ipynb",
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb
new file mode 100644
index 0000000000..021c070753
--- /dev/null
+++ b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb
@@ -0,0 +1,727 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ur8xi4C7S06n"
+ },
+ "outputs": [],
+ "source": [
+ "# Copyright 2023 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JAPoU8Sm5E6e"
+ },
+ "source": [
+ "# Train a scikit-learn model with Vertex AI SDK 2.0 and Bigframes\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " Run in Colab\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " \n",
+ " View on GitHub\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " Open in Vertex AI Workbench\n",
+ " \n",
+ " |
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tvgnzT1CKxrO"
+ },
+ "source": [
+ "## Overview\n",
+ "\n",
+ "This tutorial demonstrates how to train a scikit-learn model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n",
+ "\n",
+ "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "d975e698c9a4"
+ },
+ "source": [
+ "### Objective\n",
+ "\n",
+ "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n",
+ "\n",
+ "\n",
+ "This tutorial uses the following Google Cloud ML services:\n",
+ "\n",
+ "- `Vertex AI Training`\n",
+ "- `Vertex AI Remote Training`\n",
+ "\n",
+ "\n",
+ "The steps performed include:\n",
+ "\n",
+ "- Initialize a dataframe from a BigQuery table and split the dataset\n",
+ "- Perform transformations as a Vertex AI remote training.\n",
+ "- Train the model remotely and evaluate the model locally\n",
+ "\n",
+ "**Local-to-remote training**\n",
+ "\n",
+ "```\n",
+ "import vertexai\n",
+ "from my_module import MyModelClass\n",
+ "\n",
+ "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n",
+ "\n",
+ "# Wrap the model class with `vertex_ai.preview.remote`\n",
+ "MyModelClass = vertexai.preview.remote(MyModelClass)\n",
+ "\n",
+ "# Instantiate the class\n",
+ "model = MyModelClass(...)\n",
+ "\n",
+ "# Optional set remote config\n",
+ "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n",
+ "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n",
+ "\n",
+ "# This `fit` call will be executed remotely\n",
+ "model.fit(...)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "08d289fa873f"
+ },
+ "source": [
+ "### Dataset\n",
+ "\n",
+ "This tutorial uses the IRIS dataset, which predicts the iris species."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aed92deeb4a0"
+ },
+ "source": [
+ "### Costs\n",
+ "\n",
+ "This tutorial uses billable components of Google Cloud:\n",
+ "\n",
+ "* Vertex AI\n",
+ "* BigQuery\n",
+ "* Cloud Storage\n",
+ "\n",
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n",
+ "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n",
+ "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n",
+ "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n",
+ "to generate a cost estimate based on your projected usage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i7EUnXsZhAGF"
+ },
+ "source": [
+ "## Installation\n",
+ "\n",
+ "Install the following packages required to execute this notebook. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2b4ef9b72d43"
+ },
+ "outputs": [],
+ "source": [
+ "# Install the packages\n",
+ "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n",
+ "! pip3 install --upgrade --quiet bigframes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "58707a750154"
+ },
+ "source": [
+ "### Colab only: Uncomment the following cell to restart the kernel."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "f200f10a1da3"
+ },
+ "outputs": [],
+ "source": [
+ "# Automatically restart kernel after installs so that your environment can access the new packages\n",
+ "# import IPython\n",
+ "\n",
+ "# app = IPython.Application.instance()\n",
+ "# app.kernel.do_shutdown(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BF1j6f9HApxa"
+ },
+ "source": [
+ "## Before you begin\n",
+ "\n",
+ "### Set up your Google Cloud project\n",
+ "\n",
+ "**The following steps are required, regardless of your notebook environment.**\n",
+ "\n",
+ "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n",
+ "\n",
+ "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
+ "\n",
+ "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n",
+ "\n",
+ "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WReHDGG5g0XY"
+ },
+ "source": [
+ "#### Set your project ID\n",
+ "\n",
+ "**If you don't know your project ID**, try the following:\n",
+ "* Run `gcloud config list`.\n",
+ "* Run `gcloud projects list`.\n",
+ "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "oM1iC_MfAts1"
+ },
+ "outputs": [],
+ "source": [
+ "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n",
+ "\n",
+ "# Set the project id\n",
+ "! gcloud config set project {PROJECT_ID}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "region"
+ },
+ "source": [
+ "#### Region\n",
+ "\n",
+ "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "region"
+ },
+ "outputs": [],
+ "source": [
+ "REGION = \"us-central1\" # @param {type: \"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sBCra4QMA2wR"
+ },
+ "source": [
+ "### Authenticate your Google Cloud account\n",
+ "\n",
+ "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "74ccc9e52986"
+ },
+ "source": [
+ "**1. Vertex AI Workbench**\n",
+ "* Do nothing as you are already authenticated."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "de775a3773ba"
+ },
+ "source": [
+ "**2. Local JupyterLab instance, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "254614fa0c46"
+ },
+ "outputs": [],
+ "source": [
+ "# ! gcloud auth login"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ef21552ccea8"
+ },
+ "source": [
+ "**3. Colab, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "603adbbf0532"
+ },
+ "outputs": [],
+ "source": [
+ "# from google.colab import auth\n",
+ "# auth.authenticate_user()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "f6b2ccc891ed"
+ },
+ "source": [
+ "**4. Service account or other**\n",
+ "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zgPO1eR3CYjk"
+ },
+ "source": [
+ "### Create a Cloud Storage bucket\n",
+ "\n",
+ "Create a storage bucket to store intermediate artifacts such as datasets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MzGDU7TWdts_"
+ },
+ "outputs": [],
+ "source": [
+ "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-EcIXiGsCePi"
+ },
+ "source": [
+ "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NIq7R4HZCfIc"
+ },
+ "outputs": [],
+ "source": [
+ "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "960505627ddf"
+ },
+ "source": [
+ "### Import libraries and define constants"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PyQmSRbKA8r-"
+ },
+ "outputs": [],
+ "source": [
+ "import bigframes.pandas as bf\n",
+ "import vertexai\n",
+ "\n",
+ "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n",
+ "bf.options.bigquery.project = PROJECT_ID\n",
+ "\n",
+ "from bigframes.ml.model_selection import \\\n",
+ " train_test_split as bf_train_test_split\n",
+ "\n",
+ "REMOTE_JOB_NAME = \"sdk2-bigframes-sklearn\"\n",
+ "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "source": [
+ "## Initialize Vertex AI SDK for Python\n",
+ "\n",
+ "Initialize the Vertex AI SDK for Python for your project and corresponding bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "outputs": [],
+ "source": [
+ "vertexai.init(\n",
+ " project=PROJECT_ID,\n",
+ " location=REGION,\n",
+ " staging_bucket=BUCKET_URI,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "105334524e96"
+ },
+ "source": [
+ "## Prepare the dataset\n",
+ "\n",
+ "Now load the Iris dataset and split the data into train and test sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "b44cdc4e03f1"
+ },
+ "outputs": [],
+ "source": [
+ "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n",
+ "\n",
+ "species_categories = {\n",
+ " \"versicolor\": 0,\n",
+ " \"virginica\": 1,\n",
+ " \"setosa\": 2,\n",
+ "}\n",
+ "df[\"species\"] = df[\"species\"].map(species_categories)\n",
+ "\n",
+ "# Assign an index column name\n",
+ "index_col = \"index\"\n",
+ "df.index.name = index_col"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9cb8616b1997"
+ },
+ "outputs": [],
+ "source": [
+ "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n",
+ "label_columns = df[[\"species\"]]\n",
+ "train_X, test_X, train_y, test_y = bf_train_test_split(\n",
+ " feature_columns, label_columns, test_size=0.2\n",
+ ")\n",
+ "\n",
+ "print(\"X_train size: \", train_X.size)\n",
+ "print(\"X_test size: \", test_X.size)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8306545fcc57"
+ },
+ "source": [
+ "## Feature transformation\n",
+ "\n",
+ "Next, you do feature transformations on the data using the Vertex AI remote training service.\n",
+ "\n",
+ "First, you re-initialize Vertex AI to enable remote training."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "55e701c31036"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to remote mode for training\n",
+ "vertexai.preview.init(remote=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4a0e9d59b273"
+ },
+ "source": [
+ "### Execute remote job for fit_transform() on training data\n",
+ "\n",
+ "Next, indicate that the `StandardScalar` class is to be executed remotely. Then set up the data transform and call the `fit_transform()` method is executed remotely."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "90333089d362"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Wrap classes to enable Vertex remote execution\n",
+ "StandardScaler = vertexai.preview.remote(StandardScaler)\n",
+ "\n",
+ "# Instantiate transformer\n",
+ "transformer = StandardScaler()\n",
+ "\n",
+ "# Set training config\n",
+ "transformer.fit_transform.vertex.remote_config.display_name = (\n",
+ " f\"{REMOTE_JOB_NAME}-fit-transformer-bigframes\"\n",
+ ")\n",
+ "transformer.fit_transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "\n",
+ "# Execute transformer on Vertex (train_X is bigframes.dataframe.DataFrame, X_train is np.array)\n",
+ "X_train = transformer.fit_transform(train_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6bf95574c907"
+ },
+ "source": [
+ "### Remote transform on test data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "da6eea22a89a"
+ },
+ "outputs": [],
+ "source": [
+ "# Transform test dataset before calculate test score\n",
+ "transformer.transform.vertex.remote_config.display_name = (\n",
+ " REMOTE_JOB_NAME + \"-transformer\"\n",
+ ")\n",
+ "transformer.transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "\n",
+ "# Execute transformer on Vertex (test_X is bigframes.dataframe.DataFrame, X_test is np.array)\n",
+ "X_test = transformer.transform(test_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ddf906c886e4"
+ },
+ "source": [
+ "## Remote training\n",
+ "\n",
+ "First, train the scikit-learn model as a remote training job:\n",
+ "\n",
+ "- Set LogisticRegression for the remote training job.\n",
+ "- Invoke LogisticRegression locally which will launch the remote training job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "c7b0116fa60c"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# Wrap classes to enable Vertex remote execution\n",
+ "LogisticRegression = vertexai.preview.remote(LogisticRegression)\n",
+ "\n",
+ "# Instantiate model, warm_start=True for uptraining\n",
+ "model = LogisticRegression(warm_start=True)\n",
+ "\n",
+ "# Set training config\n",
+ "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-sklearn-model\"\n",
+ "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "\n",
+ "# Train model on Vertex\n",
+ "model.fit(train_X, train_y)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ffe1d5903bcb"
+ },
+ "source": [
+ "## Remote prediction\n",
+ "\n",
+ "Obtain predictions from the trained model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "d00ce35920fa"
+ },
+ "outputs": [],
+ "source": [
+ "# Remote evaluation\n",
+ "vertexai.preview.init(remote=True)\n",
+ "\n",
+ "# Evaluate model's accuracy score\n",
+ "predictions = model.predict(test_X)\n",
+ "\n",
+ "print(f\"Remote predictions: {predictions}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "a8cd6cbd4403"
+ },
+ "source": [
+ "## Local evaluation\n",
+ "\n",
+ "Score model results locally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "dc105dafdfb9"
+ },
+ "outputs": [],
+ "source": [
+ "# User must convert bigframes to pandas dataframe for local evaluation\n",
+ "train_X_pd = train_X.to_pandas().reset_index(drop=True)\n",
+ "train_y_pd = train_y.to_pandas().reset_index(drop=True)\n",
+ "\n",
+ "test_X_pd = test_X.to_pandas().reset_index(drop=True)\n",
+ "test_y_pd = test_y.to_pandas().reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "25fec549de69"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to local mode for testing\n",
+ "vertexai.preview.init(remote=False)\n",
+ "\n",
+ "# Evaluate model's accuracy score\n",
+ "print(f\"Train accuracy: {model.score(train_X_pd, train_y_pd)}\")\n",
+ "\n",
+ "print(f\"Test accuracy: {model.score(test_X_pd, test_y_pd)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "TpV-iwP9qw9c"
+ },
+ "source": [
+ "## Cleaning up\n",
+ "\n",
+ "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n",
+ "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n",
+ "\n",
+ "Otherwise, you can delete the individual resources you created in this tutorial:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sx_vKniMq9ZX"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Delete Cloud Storage objects that were created\n",
+ "delete_bucket = False\n",
+ "if delete_bucket or os.getenv(\"IS_TESTING\"):\n",
+ " ! gsutil -m rm -r $BUCKET_URI"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "sdk2_bigframes_sklearn.ipynb",
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb
new file mode 100644
index 0000000000..e6843b66b5
--- /dev/null
+++ b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb
@@ -0,0 +1,646 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ur8xi4C7S06n"
+ },
+ "outputs": [],
+ "source": [
+ "# Copyright 2023 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JAPoU8Sm5E6e"
+ },
+ "source": [
+ "# Train a Tensorflow Keras model with Vertex AI SDK 2.0 and Bigframes \n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " Run in Colab\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " \n",
+ " View on GitHub\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " Open in Vertex AI Workbench\n",
+ " \n",
+ " |
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tvgnzT1CKxrO"
+ },
+ "source": [
+ "## Overview\n",
+ "\n",
+ "This tutorial demonstrates how to train a tensorflow keras model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n",
+ "\n",
+ "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "d975e698c9a4"
+ },
+ "source": [
+ "### Objective\n",
+ "\n",
+ "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n",
+ "\n",
+ "\n",
+ "This tutorial uses the following Google Cloud ML services:\n",
+ "\n",
+ "- `Vertex AI Training`\n",
+ "- `Vertex AI Remote Training`\n",
+ "\n",
+ "\n",
+ "The steps performed include:\n",
+ "\n",
+ "- Initialize a dataframe from a BigQuery table and split the dataset\n",
+ "- Perform transformations as a Vertex AI remote training.\n",
+ "- Train the model remotely and evaluate the model locally\n",
+ "\n",
+ "**Local-to-remote training**\n",
+ "\n",
+ "```\n",
+ "import vertexai\n",
+ "from my_module import MyModelClass\n",
+ "\n",
+ "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n",
+ "\n",
+ "# Wrap the model class with `vertex_ai.preview.remote`\n",
+ "MyModelClass = vertexai.preview.remote(MyModelClass)\n",
+ "\n",
+ "# Instantiate the class\n",
+ "model = MyModelClass(...)\n",
+ "\n",
+ "# Optional set remote config\n",
+ "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n",
+ "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n",
+ "\n",
+ "# This `fit` call will be executed remotely\n",
+ "model.fit(...)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "08d289fa873f"
+ },
+ "source": [
+ "### Dataset\n",
+ "\n",
+ "This tutorial uses the IRIS dataset, which predicts the iris species."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aed92deeb4a0"
+ },
+ "source": [
+ "### Costs\n",
+ "\n",
+ "This tutorial uses billable components of Google Cloud:\n",
+ "\n",
+ "* Vertex AI\n",
+ "* BigQuery\n",
+ "* Cloud Storage\n",
+ "\n",
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n",
+ "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n",
+ "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n",
+ "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n",
+ "to generate a cost estimate based on your projected usage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i7EUnXsZhAGF"
+ },
+ "source": [
+ "## Installation\n",
+ "\n",
+ "Install the following packages required to execute this notebook. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2b4ef9b72d43"
+ },
+ "outputs": [],
+ "source": [
+ "# Install the packages\n",
+ "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n",
+ "! pip3 install --upgrade --quiet bigframes\n",
+ "! pip3 install --upgrade --quiet tensorflow==2.12.0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "58707a750154"
+ },
+ "source": [
+ "### Colab only: Uncomment the following cell to restart the kernel."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "f200f10a1da3"
+ },
+ "outputs": [],
+ "source": [
+ "# Automatically restart kernel after installs so that your environment can access the new packages\n",
+ "# import IPython\n",
+ "\n",
+ "# app = IPython.Application.instance()\n",
+ "# app.kernel.do_shutdown(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BF1j6f9HApxa"
+ },
+ "source": [
+ "## Before you begin\n",
+ "\n",
+ "### Set up your Google Cloud project\n",
+ "\n",
+ "**The following steps are required, regardless of your notebook environment.**\n",
+ "\n",
+ "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n",
+ "\n",
+ "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
+ "\n",
+ "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n",
+ "\n",
+ "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WReHDGG5g0XY"
+ },
+ "source": [
+ "#### Set your project ID\n",
+ "\n",
+ "**If you don't know your project ID**, try the following:\n",
+ "* Run `gcloud config list`.\n",
+ "* Run `gcloud projects list`.\n",
+ "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "oM1iC_MfAts1"
+ },
+ "outputs": [],
+ "source": [
+ "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n",
+ "\n",
+ "# Set the project id\n",
+ "! gcloud config set project {PROJECT_ID}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "region"
+ },
+ "source": [
+ "#### Region\n",
+ "\n",
+ "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "region"
+ },
+ "outputs": [],
+ "source": [
+ "REGION = \"us-central1\" # @param {type: \"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sBCra4QMA2wR"
+ },
+ "source": [
+ "### Authenticate your Google Cloud account\n",
+ "\n",
+ "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "74ccc9e52986"
+ },
+ "source": [
+ "**1. Vertex AI Workbench**\n",
+ "* Do nothing as you are already authenticated."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "de775a3773ba"
+ },
+ "source": [
+ "**2. Local JupyterLab instance, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "254614fa0c46"
+ },
+ "outputs": [],
+ "source": [
+ "# ! gcloud auth login"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ef21552ccea8"
+ },
+ "source": [
+ "**3. Colab, uncomment and run:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "603adbbf0532"
+ },
+ "outputs": [],
+ "source": [
+ "# from google.colab import auth\n",
+ "# auth.authenticate_user()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "f6b2ccc891ed"
+ },
+ "source": [
+ "**4. Service account or other**\n",
+ "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zgPO1eR3CYjk"
+ },
+ "source": [
+ "### Create a Cloud Storage bucket\n",
+ "\n",
+ "Create a storage bucket to store intermediate artifacts such as datasets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MzGDU7TWdts_"
+ },
+ "outputs": [],
+ "source": [
+ "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-EcIXiGsCePi"
+ },
+ "source": [
+ "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NIq7R4HZCfIc"
+ },
+ "outputs": [],
+ "source": [
+ "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "960505627ddf"
+ },
+ "source": [
+ "### Import libraries and define constants"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PyQmSRbKA8r-"
+ },
+ "outputs": [],
+ "source": [
+ "import bigframes.pandas as bf\n",
+ "import tensorflow as tf\n",
+ "import vertexai\n",
+ "from tensorflow import keras\n",
+ "\n",
+ "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n",
+ "bf.options.bigquery.project = PROJECT_ID\n",
+ "\n",
+ "from bigframes.ml.model_selection import \\\n",
+ " train_test_split as bf_train_test_split"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "source": [
+ "## Initialize Vertex AI SDK for Python\n",
+ "\n",
+ "Initialize the Vertex AI SDK for Python for your project and corresponding bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "init_aip:mbsdk,all"
+ },
+ "outputs": [],
+ "source": [
+ "vertexai.init(\n",
+ " project=PROJECT_ID,\n",
+ " location=REGION,\n",
+ " staging_bucket=BUCKET_URI,\n",
+ ")\n",
+ "\n",
+ "REMOTE_JOB_NAME = \"sdk2-bigframes-tensorflow\"\n",
+ "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "105334524e96"
+ },
+ "source": [
+ "## Prepare the dataset\n",
+ "\n",
+ "Now load the Iris dataset and split the data into train and test sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "94576deccd8c"
+ },
+ "outputs": [],
+ "source": [
+ "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n",
+ "\n",
+ "species_categories = {\n",
+ " \"versicolor\": 0,\n",
+ " \"virginica\": 1,\n",
+ " \"setosa\": 2,\n",
+ "}\n",
+ "df[\"target\"] = df[\"species\"].map(species_categories)\n",
+ "df = df.drop(columns=[\"species\"])\n",
+ "\n",
+ "train, test = bf_train_test_split(df, test_size=0.2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cfcbce726efa"
+ },
+ "source": [
+ "## Remote training with GPU\n",
+ "\n",
+ "First, train a TensorFlow model as a remote training job:\n",
+ "\n",
+ "- Reinitialize Vertex AI for remote training.\n",
+ "- Instantiate the tensorflow keras model for the remote training job.\n",
+ "- Invoke the tensorflow keras model.fit() locally which will launch the remote training job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fd865b0c4e8b"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to remote mode for training\n",
+ "vertexai.preview.init(remote=True)\n",
+ "\n",
+ "keras.Sequential = vertexai.preview.remote(keras.Sequential)\n",
+ "\n",
+ "# Instantiate model\n",
+ "model = keras.Sequential(\n",
+ " [keras.layers.Dense(5, input_shape=(4,)), keras.layers.Softmax()]\n",
+ ")\n",
+ "\n",
+ "# Specify optimizer and loss function\n",
+ "model.compile(optimizer=\"adam\", loss=\"mean_squared_error\")\n",
+ "\n",
+ "# Set training config\n",
+ "model.fit.vertex.remote_config.enable_cuda = True\n",
+ "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-model-gpu\"\n",
+ "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "model.fit.vertex.remote_config.custom_commands = [\"pip install tensorflow-io==0.32.0\"]\n",
+ "\n",
+ "# Manually set compute resources this time\n",
+ "model.fit.vertex.remote_config.machine_type = \"n1-highmem-4\"\n",
+ "model.fit.vertex.remote_config.accelerator_type = \"NVIDIA_TESLA_K80\"\n",
+ "model.fit.vertex.remote_config.accelerator_count = 4\n",
+ "\n",
+ "# Train model on Vertex\n",
+ "model.fit(train, epochs=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "f1af94ac1477"
+ },
+ "source": [
+ "## Remote prediction\n",
+ "\n",
+ "Obtain predictions from the trained model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "1d75879948b5"
+ },
+ "outputs": [],
+ "source": [
+ "vertexai.preview.init(remote=True)\n",
+ "\n",
+ "# Set remote config\n",
+ "model.predict.vertex.remote_config.enable_cuda = False\n",
+ "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-predict-cpu\"\n",
+ "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n",
+ "model.predict.vertex.remote_config.custom_commands = [\n",
+ " \"pip install tensorflow-io==0.32.0\"\n",
+ "]\n",
+ "\n",
+ "predictions = model.predict(train)\n",
+ "\n",
+ "print(f\"Remote predictions: {predictions}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "798b77c95067"
+ },
+ "source": [
+ "## Local evaluation\n",
+ "\n",
+ "Evaluate model results locally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "88e734e30791"
+ },
+ "outputs": [],
+ "source": [
+ "# User must convert bigframes to pandas dataframe for local evaluation\n",
+ "feature_columns = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n",
+ "label_columns = [\"target\"]\n",
+ "\n",
+ "train_X_np = train[feature_columns].to_pandas().values.astype(float)\n",
+ "train_y_np = train[label_columns].to_pandas().values.astype(float)\n",
+ "train_ds = tf.data.Dataset.from_tensor_slices((train_X_np, train_y_np))\n",
+ "\n",
+ "test_X_np = test[feature_columns].to_pandas().values.astype(float)\n",
+ "test_y_np = test[label_columns].to_pandas().values.astype(float)\n",
+ "test_ds = tf.data.Dataset.from_tensor_slices((test_X_np, test_y_np))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "cb8637f783ad"
+ },
+ "outputs": [],
+ "source": [
+ "# Switch to local mode for evaluation\n",
+ "vertexai.preview.init(remote=False)\n",
+ "\n",
+ "# Evaluate model's mean square errors\n",
+ "print(f\"Train loss: {model.evaluate(train_ds.batch(32))}\")\n",
+ "\n",
+ "print(f\"Test loss: {model.evaluate(test_ds.batch(32))}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "TpV-iwP9qw9c"
+ },
+ "source": [
+ "## Cleaning up\n",
+ "\n",
+ "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n",
+ "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n",
+ "\n",
+ "Otherwise, you can delete the individual resources you created in this tutorial:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sx_vKniMq9ZX"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Delete Cloud Storage objects that were created\n",
+ "delete_bucket = False\n",
+ "if delete_bucket or os.getenv(\"IS_TESTING\"):\n",
+ " ! gsutil -m rm -r $BUCKET_URI"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "sdk2_bigframes_tensorflow.ipynb",
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/noxfile.py b/noxfile.py
index 033bbfefe4..a113e1fcde 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -362,7 +362,7 @@ def doctest(session: nox.sessions.Session):
run_system(
session=session,
prefix_name="doctest",
- extra_pytest_options=("--doctest-modules",),
+ extra_pytest_options=("--doctest-modules", "third_party"),
test_folder="bigframes",
check_cov=True,
)
@@ -610,6 +610,9 @@ def notebook(session):
"notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb",
"notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb",
"notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
+ "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",
+ "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",
+ "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",
# The experimental notebooks imagine features that don't yet
# exist or only exist as temporary prototypes.
"notebooks/experimental/longer_ml_demo.ipynb",
diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py
index 34a2ca0101..9294740dd6 100644
--- a/tests/system/large/ml/test_pipeline.py
+++ b/tests/system/large/ml/test_pipeline.py
@@ -580,6 +580,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind
preprocessing.MinMaxScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
(
"label",
preprocessing.LabelEncoder(),
@@ -657,6 +662,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
preprocessing.MinMaxScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
(
"label",
preprocessing.LabelEncoder(),
@@ -696,9 +706,19 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"),
("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"),
("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ "culmen_length_mm",
+ ),
("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"),
("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"),
("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ "flipper_length_mm",
+ ),
]
assert transformers == expected
@@ -791,6 +811,32 @@ def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id):
assert pl_loaded._estimator.fit_intercept is False
+def test_pipeline_k_bins_discretizer_to_gbq(penguins_df_default_index, dataset_id):
+ pl = pipeline.Pipeline(
+ [
+ ("transform", preprocessing.KBinsDiscretizer(strategy="uniform")),
+ ("estimator", linear_model.LinearRegression(fit_intercept=False)),
+ ]
+ )
+
+ df = penguins_df_default_index.dropna()
+ X_train = df[
+ [
+ "culmen_length_mm",
+ ]
+ ]
+ y_train = df[["body_mass_g"]]
+ pl.fit(X_train, y_train)
+
+ pl_loaded = pl.to_gbq(
+ f"{dataset_id}.test_penguins_pipeline_k_bins_discretizer", replace=True
+ )
+ assert isinstance(pl_loaded._transform, preprocessing.KBinsDiscretizer)
+
+ assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
+ assert pl_loaded._estimator.fit_intercept is False
+
+
def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id):
pl = pipeline.Pipeline(
[
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index ace943956f..f911dd7eeb 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -23,6 +23,7 @@
import bigframes
from bigframes.ml import core
+import tests.system.utils
def test_model_eval(
@@ -224,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo
"cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383],
},
)
- pd.testing.assert_frame_equal(
+ tests.system.utils.assert_pandas_df_equal_ignore_ordering(
result,
expected,
check_exact=False,
diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py
index c71bbbe3b0..e31681f4a0 100644
--- a/tests/system/small/ml/test_decomposition.py
+++ b/tests/system/small/ml/test_decomposition.py
@@ -15,6 +15,7 @@
import pandas as pd
from bigframes.ml import decomposition
+import tests.system.utils
def test_pca_predict(penguins_pca_model, new_penguins_df):
@@ -129,7 +130,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA):
"explained_variance": [3.278657, 1.270829, 1.125354],
},
)
- pd.testing.assert_frame_equal(
+ tests.system.utils.assert_pandas_df_equal_ignore_ordering(
result,
expected,
check_exact=False,
@@ -148,7 +149,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA):
"explained_variance_ratio": [0.469357, 0.181926, 0.1611],
},
)
- pd.testing.assert_frame_equal(
+ tests.system.utils.assert_pandas_df_equal_ignore_ordering(
result,
expected,
check_exact=False,
diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py
index fc8f3251bd..45548acca3 100644
--- a/tests/system/small/ml/test_preprocessing.py
+++ b/tests/system/small/ml/test_preprocessing.py
@@ -121,7 +121,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
- # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
+ # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod.
scaler = bigframes.ml.preprocessing.MaxAbsScaler()
scaler.fit(
penguins_df_default_index[
@@ -211,7 +211,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
-def test_min_max_scaler_normalizeds_fit_transform(new_penguins_df):
+def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
scaler = bigframes.ml.preprocessing.MinMaxScaler()
result = scaler.fit_transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
@@ -265,7 +265,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
- # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
+ # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod.
scaler = bigframes.ml.preprocessing.MinMaxScaler()
scaler.fit(
penguins_df_default_index[
@@ -304,6 +304,131 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df):
+ discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform")
+ result = discretizer.fit_transform(
+ new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+ ).to_pandas()
+
+ # TODO: bug? feature columns seem to be in nondeterministic random order
+ # workaround: sort columns by name. Can't repro it in pantheon, so could
+ # be a bigframes issue...
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"],
+ "kbinsdiscretizer_culmen_length_mm": ["bin_5", "bin_3", "bin_2"],
+ "kbinsdiscretizer_flipper_length_mm": ["bin_5", "bin_2", "bin_4"],
+ },
+ dtype="string[pyarrow]",
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_k_bins_discretizer_series_normalizes(
+ penguins_df_default_index, new_penguins_df
+):
+ discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform")
+ discretizer.fit(penguins_df_default_index["culmen_length_mm"])
+
+ result = discretizer.transform(
+ penguins_df_default_index["culmen_length_mm"]
+ ).to_pandas()
+ result = discretizer.transform(new_penguins_df).to_pandas()
+
+ # TODO: bug? feature columns seem to be in nondeterministic random order
+ # workaround: sort columns by name. Can't repro it in pantheon, so could
+ # be a bigframes issue...
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"],
+ },
+ dtype="string[pyarrow]",
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):
+ # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod.
+ discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform")
+ discretizer.fit(
+ penguins_df_default_index[
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+ ]
+ )
+
+ result = discretizer.transform(
+ penguins_df_default_index[
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+ ]
+ ).to_pandas()
+
+ result = discretizer.transform(new_penguins_df).to_pandas()
+
+ # TODO: bug? feature columns seem to be in nondeterministic random order
+ # workaround: sort columns by name. Can't repro it in pantheon, so could
+ # be a bigframes issue...
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"],
+ "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"],
+ "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"],
+ },
+ dtype="string[pyarrow]",
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_k_bins_discretizer_normalizes_different_params(
+ penguins_df_default_index, new_penguins_df
+):
+ # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod.
+ discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(
+ n_bins=6, strategy="uniform"
+ )
+ discretizer.fit(
+ penguins_df_default_index[
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+ ]
+ )
+
+ result = discretizer.transform(
+ penguins_df_default_index[
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+ ]
+ ).to_pandas()
+
+ result = discretizer.transform(new_penguins_df).to_pandas()
+
+ # TODO: bug? feature columns seem to be in nondeterministic random order
+ # workaround: sort columns by name. Can't repro it in pantheon, so could
+ # be a bigframes issue...
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"],
+ "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"],
+ "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"],
+ },
+ dtype="string[pyarrow]",
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
def test_one_hot_encoder_default_params(new_penguins_df):
encoder = bigframes.ml.preprocessing.OneHotEncoder()
encoder.fit(new_penguins_df[["species", "sex"]])
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index adf17848ee..b8616a54d6 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -757,7 +757,7 @@ def test_df_isin_dict(scalars_dfs):
("right",),
],
)
-def test_merge(scalars_dfs, merge_how):
+def test_df_merge(scalars_dfs, merge_how):
scalars_df, scalars_pandas_df = scalars_dfs
on = "rowindex_2"
left_columns = ["int64_col", "float64_col", "rowindex_2"]
@@ -782,6 +782,39 @@ def test_merge(scalars_dfs, merge_how):
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+@pytest.mark.parametrize(
+ ("left_on", "right_on"),
+ [
+ (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]),
+ (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]),
+ (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]),
+ ],
+)
+def test_df_merge_multi_key(scalars_dfs, left_on, right_on):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ left_columns = ["int64_col", "float64_col", "rowindex_2"]
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_columns]
+ # Offset the rows somewhat so that outer join can have an effect.
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2)
+
+ df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True)
+ bf_result = df.to_pandas()
+
+ pd_result = scalars_pandas_df[left_columns].merge(
+ scalars_pandas_df[right_columns].assign(
+ rowindex_2=scalars_pandas_df["rowindex_2"] + 2
+ ),
+ "outer",
+ left_on=left_on,
+ right_on=right_on,
+ sort=True,
+ )
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+
@pytest.mark.parametrize(
("merge_how",),
[
@@ -884,7 +917,19 @@ def test_get_dtypes_array_struct(session):
dtypes = df.dtypes
pd.testing.assert_series_equal(
dtypes,
- pd.Series({"array_column": np.dtype("O"), "struct_column": np.dtype("O")}),
+ pd.Series(
+ {
+ "array_column": np.dtype("O"),
+ "struct_column": pd.ArrowDtype(
+ pa.struct(
+ [
+ ("string_field", pa.string()),
+ ("float_field", pa.float64()),
+ ]
+ )
+ ),
+ }
+ ),
)
@@ -1211,6 +1256,105 @@ def test_combine(
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+@pytest.mark.parametrize(
+ ("overwrite", "filter_func"),
+ [
+ (True, None),
+ (False, None),
+ (True, lambda x: x.isna() | (x % 2 == 0)),
+ ],
+ ids=[
+ "default",
+ "overwritefalse",
+ "customfilter",
+ ],
+)
+def test_df_update(overwrite, filter_func):
+ if pd.__version__.startswith("1."):
+ pytest.skip("dtype handled differently in pandas 1.x.")
+ index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
+ index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
+ pd_df1 = pandas.DataFrame(
+ {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+ )
+ pd_df2 = pandas.DataFrame(
+ {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+ dtype="Int64",
+ index=index2,
+ )
+
+ bf_df1 = dataframe.DataFrame(pd_df1)
+ bf_df2 = dataframe.DataFrame(pd_df2)
+
+ bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func)
+ pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func)
+
+ pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
+
+
+def test_df_idxmin():
+ pd_df = pd.DataFrame(
+ {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+ )
+ bf_df = dataframe.DataFrame(pd_df)
+
+ bf_result = bf_df.idxmin().to_pandas()
+ pd_result = pd_df.idxmin()
+
+ pd.testing.assert_series_equal(
+ bf_result, pd_result, check_index_type=False, check_dtype=False
+ )
+
+
+def test_df_idxmax():
+ pd_df = pd.DataFrame(
+ {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+ )
+ bf_df = dataframe.DataFrame(pd_df)
+
+ bf_result = bf_df.idxmax().to_pandas()
+ pd_result = pd_df.idxmax()
+
+ pd.testing.assert_series_equal(
+ bf_result, pd_result, check_index_type=False, check_dtype=False
+ )
+
+
+@pytest.mark.parametrize(
+ ("join", "axis"),
+ [
+ ("outer", None),
+ ("outer", 0),
+ ("outer", 1),
+ ("left", 0),
+ ("right", 1),
+ ("inner", None),
+ ("inner", 1),
+ ],
+)
+def test_df_align(join, axis):
+ index1 = pandas.Index([1, 2, 3, 4], dtype="Int64")
+ index2 = pandas.Index([1, 2, 4, 5], dtype="Int64")
+ pd_df1 = pandas.DataFrame(
+ {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+ )
+ pd_df2 = pandas.DataFrame(
+ {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+ dtype="Int64",
+ index=index2,
+ )
+
+ bf_df1 = dataframe.DataFrame(pd_df1)
+ bf_df2 = dataframe.DataFrame(pd_df2)
+
+ bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis)
+ pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis)
+
+ # Don't check dtype as pandas does unnecessary float conversion
+ pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False)
+ pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False)
+
+
def test_combine_first(
scalars_df_index,
scalars_df_2_index,
@@ -1232,11 +1376,6 @@ def test_combine_first(
pd_df_b.columns = ["b", "a", "d"]
pd_result = pd_df_a.combine_first(pd_df_b)
- print("pandas")
- print(pd_result.to_string())
- print("bigframes")
- print(bf_result.to_string())
-
# Some dtype inconsistency for all-NULL columns
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
@@ -1705,6 +1844,26 @@ def test_df_stack(scalars_dfs):
pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+def test_df_unstack(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ # To match bigquery dataframes
+ scalars_pandas_df = scalars_pandas_df.copy()
+ scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+ # Can only stack identically-typed columns
+ columns = [
+ "rowindex_2",
+ "int64_col",
+ "int64_too",
+ ]
+
+ # unstack on mono-index produces series
+ bf_result = scalars_df[columns].unstack().to_pandas()
+ pd_result = scalars_pandas_df[columns].unstack()
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+
+
@pytest.mark.parametrize(
("values", "index", "columns"),
[
@@ -1922,7 +2081,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index
bf_result = scalars_df_index.loc[index]
pd_result = scalars_pandas_df_index.loc[index]
pd.testing.assert_series_equal(
- bf_result.to_pandas().iloc[0, :],
+ bf_result,
pd_result,
)
@@ -2439,6 +2598,24 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index):
)
+def test_iloc_list_multiindex(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ scalars_df = scalars_df.copy()
+ scalars_pandas_df = scalars_pandas_df.copy()
+ scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"])
+ scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"])
+
+ index_list = [0, 0, 0, 5, 4, 7]
+
+ bf_result = scalars_df.iloc[index_list]
+ pd_result = scalars_pandas_df.iloc[index_list]
+
+ pd.testing.assert_frame_equal(
+ bf_result.to_pandas(),
+ pd_result,
+ )
+
+
def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index):
index_list = []
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 1e38b47b4c..19f1c557ef 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -41,6 +41,17 @@ def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index):
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.set_index(["bool_col", "int64_too"])[
+ "float64_col"
+ ].idxmin()
+ pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"])[
+ "float64_col"
+ ].idxmin()
+
+ assert bf_result == pd_result
+
+
def test_binop_series_series_matching_multi_indices(
scalars_df_index, scalars_pandas_df_index
):
@@ -729,6 +740,26 @@ def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index):
)
+def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "rowindex_2"]
+ level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]")
+ # Need resulting column to be pyarrow string rather than object dtype
+ level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]")
+ multi_columns = pandas.MultiIndex.from_arrays([level1, level2])
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.unstack().to_pandas()
+ # Shifting sort behavior in stack
+ pd_result = pd_df.unstack()
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ # Column ordering seems to depend on pandas version
+ pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+
+
@pytest.mark.skip(reason="Pandas fails in newer versions.")
def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index):
columns = ["int64_too", "int64_col", "rowindex_2"]
@@ -866,6 +897,17 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_multi_index_unstack(hockey_df, hockey_pandas_df):
+ bf_result = (
+ hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
+ )
+ pd_result = hockey_pandas_df.set_index(
+ ["team_name", "season", "position"]
+ ).unstack()
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
def test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index):
columns = ["int64_too", "string_col", "bool_col"]
multi_columns = pandas.MultiIndex.from_tuples(
diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py
index 6510c4fa27..956b29ae12 100644
--- a/tests/system/small/test_pandas_options.py
+++ b/tests/system/small/test_pandas_options.py
@@ -75,7 +75,7 @@ def test_read_gbq_start_sets_session_location(
# Now read_gbq* from another location should fail
with pytest.raises(
google.api_core.exceptions.NotFound,
- match=f"404 Not found: Dataset {dataset_id_permanent} was not found in location {tokyo_location}",
+ match=dataset_id_permanent,
):
read_method(query)
@@ -100,7 +100,7 @@ def test_read_gbq_start_sets_session_location(
# Now read_gbq* from another location should fail
with pytest.raises(
google.api_core.exceptions.NotFound,
- match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US",
+ match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -146,7 +146,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location(
# Doing read_gbq* from a table in another location should fail
with pytest.raises(
google.api_core.exceptions.NotFound,
- match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US",
+ match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -194,7 +194,7 @@ def test_read_gbq_must_comply_with_set_location_US(
# Starting user journey with read_gbq* from another location should fail
with pytest.raises(
google.api_core.exceptions.NotFound,
- match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US",
+ match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -244,7 +244,7 @@ def test_read_gbq_must_comply_with_set_location_non_US(
# Starting user journey with read_gbq* from another location should fail
with pytest.raises(
google.api_core.exceptions.NotFound,
- match=f"404 Not found: Dataset {dataset_id_permanent} was not found in location {tokyo_location}",
+ match=dataset_id_permanent,
):
read_method(query)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 588dcc2c83..8c1c36720b 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -118,7 +118,7 @@ def test_series_get_with_default_index(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[col_name].get(key)
pd_result = scalars_pandas_df[col_name].get(key)
- assert bf_result.to_pandas().iloc[0] == pd_result
+ assert bf_result == pd_result
@pytest.mark.parametrize(
@@ -157,7 +157,7 @@ def test_series___getitem___with_default_index(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[col_name][key]
pd_result = scalars_pandas_df[col_name][key]
- assert bf_result.to_pandas().iloc[0] == pd_result
+ assert bf_result == pd_result
@pytest.mark.parametrize(
@@ -2468,6 +2468,18 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index):
assert bf_result == pd_result
+def test_series_idxmin(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.string_col.idxmin()
+ pd_result = scalars_pandas_df_index.string_col.idxmin()
+ assert bf_result == pd_result
+
+
+def test_series_idxmax(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.int64_too.idxmax()
+ pd_result = scalars_pandas_df_index.int64_too.idxmax()
+ assert bf_result == pd_result
+
+
def test_getattr_attribute_error_when_pandas_has(scalars_df_index):
# asof is implemented in pandas but not in bigframes
with pytest.raises(AttributeError):
@@ -2640,7 +2652,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index
index = -2345
bf_result = scalars_df_index.date_col.loc[index]
pd_result = scalars_pandas_df_index.date_col.loc[index]
- assert bf_result.to_pandas().iloc[0] == pd_result
+ assert bf_result == pd_result
def test_series_bool_interpretation_error(scalars_df_index):
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 614c953764..53ddfa3c49 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -57,6 +57,7 @@ def test_read_gbq_tokyo(
),
pytest.param(
"""SELECT
+ t.int64_col + 1 as my_ints,
t.float64_col * 2 AS my_floats,
CONCAT(t.string_col, "_2") AS my_strings,
t.int64_col > 0 AS my_bools,
@@ -321,11 +322,10 @@ def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index):
def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default_index):
- scalars_pandas_df_default_index["rowid"] = np.arange(
- scalars_pandas_df_default_index.shape[0]
- )
+ pandas_df = scalars_pandas_df_default_index.copy()
+ pandas_df["rowid"] = np.arange(pandas_df.shape[0])
- df = session.read_pandas(scalars_pandas_df_default_index)
+ df = session.read_pandas(pandas_df)
total_order_col = df._block._expr._ordering.total_order_col
assert total_order_col and total_order_col.column_id == "rowid_2"
diff --git a/tests/unit/core/test_io.py b/tests/unit/core/test_io.py
index c5074f80c2..afb38a5f75 100644
--- a/tests/unit/core/test_io.py
+++ b/tests/unit/core/test_io.py
@@ -13,8 +13,10 @@
# limitations under the License.
import datetime
+from typing import Iterable
import google.cloud.bigquery as bigquery
+import pytest
import bigframes.core.io
@@ -47,3 +49,56 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets():
# Don't need the project ID for _SESSION tables.
assert "my-test-project" not in sql
+
+
+@pytest.mark.parametrize(
+ ("schema", "expected"),
+ (
+ (
+ [bigquery.SchemaField("My Column", "INTEGER")],
+ "`My Column` INT64",
+ ),
+ (
+ [
+ bigquery.SchemaField("My Column", "INTEGER"),
+ bigquery.SchemaField("Float Column", "FLOAT"),
+ bigquery.SchemaField("Bool Column", "BOOLEAN"),
+ ],
+ "`My Column` INT64, `Float Column` FLOAT64, `Bool Column` BOOL",
+ ),
+ (
+ [
+ bigquery.SchemaField("My Column", "INTEGER", mode="REPEATED"),
+ bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"),
+ bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"),
+ ],
+ "`My Column` ARRAY, `Float Column` ARRAY, `Bool Column` ARRAY",
+ ),
+ (
+ [
+ bigquery.SchemaField(
+ "My Column",
+ "RECORD",
+ mode="REPEATED",
+ fields=(
+ bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"),
+ bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"),
+ bigquery.SchemaField(
+ "Nested Column",
+ "RECORD",
+ fields=(bigquery.SchemaField("Int Column", "INTEGER"),),
+ ),
+ ),
+ ),
+ ],
+ (
+ "`My Column` ARRAY,"
+ + " `Bool Column` ARRAY,"
+ + " `Nested Column` STRUCT<`Int Column` INT64>>>"
+ ),
+ ),
+ ),
+)
+def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str):
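+ # Each parametrized case pairs a BigQuery schema with the SQL DDL string it is expected to render to.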
+ pass
diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py
index 8c8fbd6ab5..60dcc75b63 100644
--- a/tests/unit/ml/test_compose.py
+++ b/tests/unit/ml/test_compose.py
@@ -23,6 +23,7 @@ def test_columntransformer_init_expectedtransforms():
standard_scaler_transformer = preprocessing.StandardScaler()
max_abs_scaler_transformer = preprocessing.MaxAbsScaler()
min_max_scaler_transformer = preprocessing.MinMaxScaler()
+ k_bins_discretizer_transformer = preprocessing.KBinsDiscretizer(strategy="uniform")
label_transformer = preprocessing.LabelEncoder()
column_transformer = compose.ColumnTransformer(
[
@@ -42,6 +43,11 @@ def test_columntransformer_init_expectedtransforms():
min_max_scaler_transformer,
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ k_bins_discretizer_transformer,
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
("label", label_transformer, "species"),
]
)
@@ -54,6 +60,8 @@ def test_columntransformer_init_expectedtransforms():
("max_abs_scale", max_abs_scaler_transformer, "flipper_length_mm"),
("min_max_scale", min_max_scaler_transformer, "culmen_length_mm"),
("min_max_scale", min_max_scaler_transformer, "flipper_length_mm"),
+ ("k_bins_discretizer", k_bins_discretizer_transformer, "culmen_length_mm"),
+ ("k_bins_discretizer", k_bins_discretizer_transformer, "flipper_length_mm"),
("label", label_transformer, "species"),
]
@@ -81,6 +89,11 @@ def test_columntransformer_repr():
preprocessing.MinMaxScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
]
)
@@ -92,6 +105,9 @@ def test_columntransformer_repr():
('max_abs_scale', MaxAbsScaler(),
['culmen_length_mm', 'flipper_length_mm']),
('min_max_scale', MinMaxScaler(),
+ ['culmen_length_mm', 'flipper_length_mm']),
+ ('k_bins_discretizer',
+ KBinsDiscretizer(strategy='uniform'),
['culmen_length_mm', 'flipper_length_mm'])])"""
)
@@ -119,6 +135,11 @@ def test_columntransformer_repr_matches_sklearn():
preprocessing.MinMaxScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ preprocessing.KBinsDiscretizer(strategy="uniform"),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
]
)
sk_column_transformer = sklearn_compose.ColumnTransformer(
@@ -143,6 +164,11 @@ def test_columntransformer_repr_matches_sklearn():
sklearn_preprocessing.MinMaxScaler(),
["culmen_length_mm", "flipper_length_mm"],
),
+ (
+ "k_bins_discretizer",
+ sklearn_preprocessing.KBinsDiscretizer(strategy="uniform"),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
]
)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index a3338e762d..34a02edd42 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -95,6 +95,13 @@ def test_min_max_scaler_produces_correct_sql(
assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a"
+def test_k_bins_discretizer_produces_correct_sql(
+ base_sql_generator: ml_sql.BaseSqlGenerator,
+):
+ sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a")
+ assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a"
+
+
def test_one_hot_encoder_produces_correct_sql(
base_sql_generator: ml_sql.BaseSqlGenerator,
):
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py
index bb8ae570dc..3baff2e1f5 100644
--- a/tests/unit/test_dtypes.py
+++ b/tests/unit/test_dtypes.py
@@ -85,6 +85,70 @@ def test_ibis_float32_raises_unexpected_datatype():
bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32)
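+# Pairs of (ibis dtype, equivalent pyarrow dtype) exercised in both conversion directions below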
+IBIS_ARROW_DTYPES = (
+ (ibis_dtypes.boolean, pa.bool_()),
+ (ibis_dtypes.date, pa.date32()),
+ (ibis_dtypes.Timestamp(), pa.timestamp("us")),
+ (ibis_dtypes.float64, pa.float64()),
+ (
+ ibis_dtypes.Timestamp(timezone="UTC"),
+ pa.timestamp("us", tz="UTC"),
+ ),
+ (
+ ibis_dtypes.Struct.from_tuples(
+ [
+ ("name", ibis_dtypes.string()),
+ ("version", ibis_dtypes.int64()),
+ ]
+ ),
+ pa.struct(
+ [
+ ("name", pa.string()),
+ ("version", pa.int64()),
+ ]
+ ),
+ ),
+ (
+ ibis_dtypes.Struct.from_tuples(
+ [
+ (
+ "nested",
+ ibis_dtypes.Struct.from_tuples(
+ [
+ ("field", ibis_dtypes.string()),
+ ]
+ ),
+ ),
+ ]
+ ),
+ pa.struct(
+ [
+ (
+ "nested",
+ pa.struct(
+ [
+ ("field", pa.string()),
+ ]
+ ),
+ ),
+ ]
+ ),
+ ),
+)
+
+
+@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES)
+def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype):
+ result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype)
+ assert result == ibis_dtype
+
+
+@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES)
+def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype):
+ result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype)
+ assert result == arrow_dtype
+
+
@pytest.mark.parametrize(
["bigframes_dtype", "ibis_dtype"],
[
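A quick round-trip sketch of the two new conversion helpers, using one of the pairs from IBIS_ARROW_DTYPES above (this simply restates what the parametrized tests assert):

    import ibis.expr.datatypes as ibis_dtypes
    import pyarrow as pa

    import bigframes.dtypes

    # A timezone-aware Ibis timestamp maps to a microsecond-precision Arrow timestamp.
    arrow_result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtypes.Timestamp(timezone="UTC"))
    assert arrow_result == pa.timestamp("us", tz="UTC")

    # ...and the reverse conversion recovers the Ibis dtype.
    ibis_result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(pa.timestamp("us", tz="UTC"))
    assert ibis_result == ibis_dtypes.Timestamp(timezone="UTC")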
diff --git a/third_party/bigframes_vendored/pandas/core/arrays/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py
new file mode 100644
index 0000000000..8e3ea06a3d
--- /dev/null
+++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py
@@ -0,0 +1,94 @@
+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arrays/arrow/accessors.py
+"""Accessors for arrow-backed data."""
+
+from __future__ import annotations
+
+from bigframes import constants
+
+
+class StructAccessor:
+ """
+ Accessor object for structured data properties of the Series values.
+ """
+
+ def field(self, name_or_index: str | int):
+ """
+ Extract a child field of a struct as a Series.
+
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> import pyarrow as pa
+ >>> bpd.options.display.progress_bar = None
+ >>> s = bpd.Series(
+ ... [
+ ... {"version": 1, "project": "pandas"},
+ ... {"version": 2, "project": "pandas"},
+ ... {"version": 1, "project": "numpy"},
+ ... ],
+ ... dtype=bpd.ArrowDtype(pa.struct(
+ ... [("version", pa.int64()), ("project", pa.string())]
+ ... ))
+ ... )
+
+ Extract by field name.
+
+ >>> s.struct.field("project")
+ 0 pandas
+ 1 pandas
+ 2 numpy
+ Name: project, dtype: string
+
+ Extract by field index.
+
+ >>> s.struct.field(0)
+ 0 1
+ 1 2
+ 2 1
+ Name: version, dtype: Int64
+
+ Args:
+ name_or_index:
+ Name (str) or index (int) of the child field to extract.
+
+ Returns:
+ Series:
+ The data corresponding to the selected child field.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def explode(self):
+ """
+ Extract all child fields of a struct as a DataFrame.
+
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> import pyarrow as pa
+ >>> bpd.options.display.progress_bar = None
+ >>> s = bpd.Series(
+ ... [
+ ... {"version": 1, "project": "pandas"},
+ ... {"version": 2, "project": "pandas"},
+ ... {"version": 1, "project": "numpy"},
+ ... ],
+ ... dtype=bpd.ArrowDtype(pa.struct(
+ ... [("version", pa.int64()), ("project", pa.string())]
+ ... ))
+ ... )
+
+ Extract all child fields.
+
+ >>> s.struct.explode()
+ version project
+ 0 1 pandas
+ 1 2 pandas
+ 2 1 numpy
+
+ [3 rows x 2 columns]
+
+ Returns:
+ DataFrame:
+ The data corresponding to all child fields.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 6ce11cd7e9..17d941fbdd 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -503,6 +503,35 @@ def drop(
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ ) -> tuple:
+ """
+ Align two objects on their axes with the specified join method.
+
+ Join method is specified for each axis Index.
+
+ Args:
+ other (DataFrame or Series):
+ The object to align with.
+ join ({'outer', 'inner', 'left', 'right'}, default 'outer'):
+ Type of alignment to be performed.
+ left: use only keys from left frame, preserve key order.
+ right: use only keys from right frame, preserve key order.
+ outer: use union of keys from both frames, sort keys lexicographically.
+ inner: use intersection of keys from both frames,
+ preserve the order of the left keys.
+
+ axis (allowed axis of the other object, default None):
+ Align on index (0), columns (1), or both (None).
+
+ Returns:
+ tuple of (DataFrame, type of other): Aligned objects.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
def rename(
self,
*,
@@ -1265,6 +1294,39 @@ def combine_first(self, other) -> DataFrame:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def update(
+ self, other, join: str = "left", overwrite: bool = True, filter_func=None
+ ) -> DataFrame:
+ """
+ Modify in place using non-NA values from another DataFrame.
+
+ Aligns on indices. There is no return value.
+
+ Args:
+ other (DataFrame, or object coercible into a DataFrame):
+ Should have at least one matching index/column label
+ with the original DataFrame. If a Series is passed,
+ its name attribute must be set, and that will be
+ used as the column name to align with the original DataFrame.
+ join ({'left'}, default 'left'):
+ Only left join is implemented, keeping the index and columns of the
+ original object.
+ overwrite (bool, default True):
+ How to handle non-NA values for overlapping keys:
+ True: overwrite original DataFrame's values
+ with values from `other`.
+ False: only update values that are NA in
+ the original DataFrame.
+
+ filter_func (callable(1d-array) -> bool 1d-array, optional):
+ Can choose to replace values other than NA. Return True for values
+ that should be updated.
+
+ Returns:
+ None: This method directly changes the calling object.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
# ----------------------------------------------------------------------
# Data reshaping
@@ -1406,14 +1468,14 @@ def merge(
``inner``: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
- on:
- Column join on. It must be found in both DataFrames. Either on or left_on + right_on
+ on (label or list of labels):
+ Columns to join on. These must be found in both DataFrames. Either on or left_on + right_on
must be passed in.
- left_on:
- Column join on in the left DataFrame. Either on or left_on + right_on
+ left_on (label or list of labels):
+ Columns to join on in the left DataFrame. Either on or left_on + right_on
must be passed in.
- right_on:
- Column join on in the right DataFrame. Either on or left_on + right_on
+ right_on (label or list of labels):
+ Columns to join on in the right DataFrame. Either on or left_on + right_on
must be passed in.
sort:
Default False. Sort the join keys lexicographically in the
@@ -1743,6 +1805,28 @@ def nsmallest(self, n: int, columns, keep: str = "first"):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def idxmin(self):
+ """
+ Return index of first occurrence of minimum over requested axis.
+
+ NA/null values are excluded.
+
+ Returns:
+ Series: Indexes of minima along the specified axis.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def idxmax(self):
+ """
+ Return index of first occurrence of maximum over requested axis.
+
+ NA/null values are excluded.
+
+ Returns:
+ Series: Indexes of maxima along the specified axis.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
def nunique(self):
"""
Count number of distinct elements in specified axis.
@@ -1910,6 +1994,21 @@ def stack(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def unstack(self):
+ """
+ Pivot a level of the (necessarily hierarchical) index labels.
+
+ Returns a DataFrame having a new level of column labels whose inner-most level
+ consists of the pivoted index labels.
+
+ If the index is not a MultiIndex, the output will be a Series
+ (the analogue of stack when the columns are not a MultiIndex).
+
+ Returns:
+ DataFrame or Series
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
# ----------------------------------------------------------------------
# Add index and columns
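As an illustration of the align docstring added above, a hedged sketch assuming the pandas-compatible semantics it describes; the DataFrames and column names are made up:

    import bigframes.pandas as bpd

    left = bpd.DataFrame({"a": [1, 2], "b": [3, 4]})
    right = bpd.DataFrame({"b": [10, 20], "c": [30, 40]})

    # join="outer" on axis=1: both results carry the union of columns
    # {'a', 'b', 'c'}, with columns missing on either side filled with NA.
    left_aligned, right_aligned = left.align(right, join="outer", axis=1)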
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
index ee02d698da..cc81de405b 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
@@ -51,14 +51,14 @@ def merge(
``inner``: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
- on:
- Column join on. It must be found in both DataFrames. Either on or left_on + right_on
+ on (label or list of labels):
+ Columns to join on. These must be found in both DataFrames. Either on or left_on + right_on
must be passed in.
- left_on:
- Column join on in the left DataFrame. Either on or left_on + right_on
+ left_on (label or list of labels):
+ Columns to join on in the left DataFrame. Either on or left_on + right_on
must be passed in.
- right_on:
- Column join on in the right DataFrame. Either on or left_on + right_on
+ right_on (label or list of labels):
+ Columns to join on in the right DataFrame. Either on or left_on + right_on
must be passed in.
sort:
Default False. Sort the join keys lexicographically in the
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index d58c1ccc3b..a41a3454ca 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -8,7 +8,6 @@
import numpy as np
from pandas._libs import lib
from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer
-import pandas.io.formats.format as fmt
from bigframes import constants
from third_party.bigframes_vendored.pandas.core.generic import NDFrame
@@ -151,21 +150,6 @@ def to_string(
str or None: String representation of Series if ``buf=None``,
otherwise None.
"""
- formatter = fmt.SeriesFormatter(
- self,
- name=name,
- length=length,
- header=header,
- index=index,
- dtype=dtype,
- na_rep=na_rep,
- float_format=float_format,
- min_rows=min_rows,
- max_rows=max_rows,
- )
- result = formatter.to_string()
-
- # catch contract violations
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_markdown(
@@ -475,6 +459,30 @@ def duplicated(self, keep="first") -> Series:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def idxmin(self) -> Hashable:
+ """
+ Return the row label of the minimum value.
+
+ If multiple values equal the minimum, the first row label with that
+ value is returned.
+
+ Returns:
+ Index: Label of the minimum value.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def idxmax(self) -> Hashable:
+ """
+ Return the row label of the maximum value.
+
+ If multiple values equal the maximum, the first row label with that
+ value is returned.
+
+ Returns:
+ Index: Label of the maximum value.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
def round(self, decimals: int = 0) -> Series:
"""
Round each value in a Series to the given number of decimals.
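A short sketch of the new Series reductions documented above, assuming the pandas behavior the docstrings describe; the data is illustrative:

    import bigframes.pandas as bpd

    s = bpd.Series([30, 10, 20, 10], index=["a", "b", "c", "d"])
    s.idxmin()  # 'b' -- first label at which the minimum (10) occurs
    s.idxmax()  # 'a' -- label of the maximum (30)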
diff --git a/third_party/bigframes_vendored/sklearn/__init__.py b/third_party/bigframes_vendored/sklearn/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/__init__.py b/third_party/bigframes_vendored/sklearn/ensemble/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py
new file mode 100644
index 0000000000..0236558dd4
--- /dev/null
+++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py
@@ -0,0 +1,47 @@
+# Author: Henry Lin
+# Tom Dupré la Tour
+
+# License: BSD
+
+from bigframes import constants
+from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin
+
+
+class KBinsDiscretizer(TransformerMixin, BaseEstimator):
+ """
+ Bin continuous data into intervals.
+
+ Args:
+ n_bins (int, default 5):
+ The number of bins to produce. Raises ValueError if ``n_bins < 2``.
+ strategy ({'uniform', 'quantile'}, default='quantile'):
+ Strategy used to define the widths of the bins. 'uniform': All bins
+ in each feature have identical widths. 'quantile': All bins in each
+ feature have the same number of points. Only 'uniform' is currently supported.
+ """
+
+ def fit(self, X, y=None):
+ """Fit the estimator.
+
+ Args:
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+ The Dataframe or Series with training data.
+
+ y (default None):
+ Ignored.
+
+ Returns:
+ KBinsDiscretizer: Fitted transformer.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def transform(self, X):
+ """Discretize the data.
+
+ Args:
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+ The DataFrame or Series to be transformed.
+
+ Returns:
+ bigframes.dataframe.DataFrame: Transformed result."""
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
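A hedged end-to-end sketch of the new transformer, mirroring the ColumnTransformer tests earlier in this diff; the `bigframes.ml.preprocessing` import path is assumed from those tests, and the penguins table path is the public sample commonly used with them (illustrative here):

    import bigframes.pandas as bpd
    from bigframes.ml import preprocessing

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    discretizer = preprocessing.KBinsDiscretizer(n_bins=5, strategy="uniform")
    discretizer.fit(df[["culmen_length_mm"]])
    binned = discretizer.transform(df[["culmen_length_mm"]])  # one bucketized column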
diff --git a/third_party/bigframes_vendored/xgboost/__init__.py b/third_party/bigframes_vendored/xgboost/__init__.py
new file mode 100644
index 0000000000..e69de29bb2