From 66d1839c3e9a3011c7feb13a59d966b64cf8313f Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:39:47 -0800 Subject: [PATCH 01/20] fix: update the llm_kmeans notebook (#247) --- notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 5f74046fc0..69efb11018 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -371,7 +371,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now have the complaints and their text embeddings as two columns in our combined_df. Recall that complaints with numerically similar text embeddings should have similar meanings semantically. We will now group similar complaints together." + "We now have the complaints and their text embeddings as two columns in our predicted_embeddings DataFrame." ] }, { @@ -426,7 +426,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our dataframe combined_clustered_result now has three complaint columns: the content, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." + "Our DataFrame clustered_result now has an additional column that includes an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." 
] }, { @@ -501,7 +501,7 @@ "source": [ "# The plain English request we will make of PaLM 2\n", "prompt = (\n", - " \"Please highlight the most obvious difference between\"\n", + " \"Please highlight the most obvious difference between \"\n", " \"the two lists of comments:\\n\" + prompt1 + prompt2\n", ")\n", "print(prompt)" From 1737acc51b4fdd9b385bbf91a758efd2e7ead11a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 30 Nov 2023 20:22:16 -0800 Subject: [PATCH 02/20] feat: add DataFrame.select_dtypes method (#242) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 13 +++++++ tests/system/small/test_dataframe.py | 20 +++++++++++ .../bigframes_vendored/pandas/core/frame.py | 36 +++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f7796291b9..c6b28f1b01 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -434,6 +434,19 @@ def info( # TODO: Convert to different units (kb, mb, etc.) 
obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n") + def select_dtypes(self, include=None, exclude=None) -> DataFrame: + # Create empty pandas dataframe with same schema and then leverage actual pandas implementation + as_pandas = pandas.DataFrame( + { + col_id: pandas.Series([], dtype=dtype) + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + } + ) + selected_columns = tuple( + as_pandas.select_dtypes(include=include, exclude=exclude).columns + ) + return DataFrame(self._block.select_columns(selected_columns)) + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9744d3f6e9..5940df590c 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -297,6 +297,26 @@ def test_df_info(scalars_dfs): assert expected == bf_result.getvalue() +@pytest.mark.parametrize( + ("include", "exclude"), + [ + ("Int64", None), + (["int"], None), + ("number", None), + ([pd.Int64Dtype(), pd.BooleanDtype()], None), + (None, [pd.Int64Dtype(), pd.BooleanDtype()]), + ("Int64", ["boolean"]), + ], +) +def test_select_dtypes(scalars_dfs, include, exclude): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) + bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + def test_drop_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 099d8b8e66..3bd90be2e4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True): """ raise 
NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def select_dtypes(self, include=None, exclude=None) -> DataFrame: + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]}) + >>> df.select_dtypes(include=['Int64']) + col1 + 0 1 + 1 2 + + [2 rows x 1 columns] + + >>> df.select_dtypes(exclude=['Int64']) + col2 col3 + 0 hello True + 1 world False + + [2 rows x 2 columns] + + + Args: + include (scalar or list-like): + A selection of dtypes or strings to be included. + exclude (scalar or list-like): + A selection of dtypes or strings to be excluded. + + Returns: + DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # IO methods (to / from other formats) def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: From 0523a31fa0b589f88afe0ad5b447634409ddeb86 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 1 Dec 2023 10:06:23 -0800 Subject: [PATCH 03/20] docs: add examples for dataframe.cummin, dataframe.cummax, dataframe.cumsum, dataframe.cumprod (#243) --- .../bigframes_vendored/pandas/core/frame.py | 96 ++++++++++++++++++- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 3bd90be2e4..6b5a580e99 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3367,40 +3367,128 @@ def nunique(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def 
cummin(self) -> DataFrame: - """Return cumulative minimum over a DataFrame axis. + """Return cumulative minimum over columns. Returns a DataFrame of the same size containing the cumulative minimum. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.cummin() + A B + 0 3 1 + 1 1 1 + 2 1 1 + + [3 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self) -> DataFrame: - """Return cumulative maximum over a DataFrame axis. + """Return cumulative maximum over columns. Returns a DataFrame of the same size containing the cumulative maximum. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.cummax() + A B + 0 3 1 + 1 3 2 + 2 3 3 + + [3 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self) -> DataFrame: - """Return cumulative sum over a DataFrame axis. + """Return cumulative sum over columns. Returns a DataFrame of the same size containing the cumulative sum. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.cumsum() + A B + 0 3 1 + 1 4 3 + 2 6 6 + + [3 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self) -> DataFrame: - """Return cumulative product over a DataFrame axis. + """Return cumulative product over columns. Returns a DataFrame of the same size containing the cumulative product. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.cumprod() + A B + 0 3 1 + 1 3 2 + 2 6 6 + + [3 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Return cumulative product of DataFrame. """ From 8d81e24677613dcf4d275c27a327384b8c17bc85 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 1 Dec 2023 11:58:10 -0800 Subject: [PATCH 04/20] feat: add DataFrame from_dict and from_records methods (#244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 26 +++++++ tests/system/small/test_dataframe.py | 48 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 75 +++++++++++++++++++ 3 files changed, 149 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c6b28f1b01..3b0fd7008a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2381,6 +2381,32 @@ def _split( blocks = self._block._split(ns=ns, fracs=fracs, random_state=random_state) return [DataFrame(block) for block in blocks] + @classmethod + def from_dict( + cls, + data: dict, + orient: str = "columns", + dtype=None, + columns=None, + ) -> DataFrame: + return cls(pandas.DataFrame.from_dict(data, orient, dtype, columns)) # type: ignore + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + return cls( + pandas.DataFrame.from_records( + data, index, exclude, columns, coerce_float, nrows + ) + ) + def to_csv( self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True ) -> None: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5940df590c..9318a5d9d2 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3309,6 +3309,54 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) +def test_df_from_dict_columns_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="columns") + assert_pandas_df_equal( + 
pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_index_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict( + data, orient="index", columns=["col1", "col2"] + ).to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_tight_orient(): + data = { + "index": [("i1", "i2"), ("i3", "i4")], + "columns": ["col1", "col2"], + "data": [[1, 2.6], [3, 4.5]], + "index_names": ["in1", "in2"], + "column_names": ["column_axis"], + } + + bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="tight") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_records(): + records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d")) + + bf_result = dataframe.DataFrame.from_records( + records, columns=["c1", "c2"] + ).to_pandas() + pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): unsupported = ["numeric_col"] # formatted differently bf_result = scalars_df_index.drop(columns=unsupported).to_dict() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6b5a580e99..08fe8e2de0 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -196,6 +196,81 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: # ---------------------------------------------------------------------- # IO methods (to / from other formats) + @classmethod + def from_dict( + cls, + data: dict, + orient="columns", + 
dtype=None, + columns=None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Args: + data (dict): + Of the form {field : array-like} or {field : dict}. + orient ({'columns', 'index', 'tight'}, default 'columns'): + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + dtype (dtype, default None): + Data type to force after DataFrame construction, otherwise infer. + columns (list, default None): + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + """ + Convert structured or record ndarray to DataFrame. + + Creates a DataFrame object from a structured ndarray, sequence of + tuples or dicts, or DataFrame. + + Args: + data (structured ndarray, sequence of tuples or dicts): + Structured input data. + index (str, list of fields, array-like): + Field of array to use as the index, alternately a specific set of + input labels to use. + exclude (sequence, default None): + Columns or fields to exclude. + columns (sequence, default None): + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns). 
+ coerce_float (bool, default False): + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + nrows (int, default None): + Number of rows to read if data is an iterator. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: """ Convert the DataFrame to a NumPy array. From c2829e3d976a43c53251c9288266e3a8ec5304c5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:18:16 -0800 Subject: [PATCH 05/20] docs: correct the params rendering for `ml.remote` and `ml.ensemble` modules (#248) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `ensemble.RandomForestClassifier`: https://screenshot.googleplex.com/4Q88xgdm5hkaYXu - `ensemble.RandomForestRegressor`: https://screenshot.googleplex.com/3CU6pJBjYHQvnDo - `remote.VertexAIModel`: https://screenshot.googleplex.com/8SL2max6GfPMwFe Fixes internal issue 314150462 🦕 --- bigframes/ml/remote.py | 8 +-- docs/templates/toc.yml | 12 ++-- .../sklearn/ensemble/_forest.py | 72 +++++++++---------- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index d4c34bbd0d..8da073802d 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -47,10 +47,10 @@ class VertexAIModel(base.BaseEstimator): Args: endpoint (str): Vertex AI https endpoint. - input ({column_name: column_type}): - Input schema. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". - output ({column_name: column_type}): - Output label schema. Supported the same types as the input. + input (Mapping): + Input schema: `{column_name: column_type}`. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". + output (Mapping): + Output label schema: `{column_name: column_type}`. Supported the same types as the input. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. 
connection_name (str or None): diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 58ac1c0efe..b680a5fc1a 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -108,12 +108,6 @@ - name: PaLM2TextEmbeddingGenerator uid: bigframes.ml.llm.PaLM2TextEmbeddingGenerator name: llm - - items: - - name: Overview - uid: bigframes.ml.remote - - name: VertexAIModel - uid: bigframes.ml.remote.VertexAIModel - name: remote - items: - name: metrics uid: bigframes.ml.metrics @@ -144,6 +138,12 @@ - name: OneHotEncoder uid: bigframes.ml.preprocessing.OneHotEncoder name: preprocessing + - items: + - name: Overview + uid: bigframes.ml.remote + - name: VertexAIModel + uid: bigframes.ml.remote.VertexAIModel + name: remote name: bigframes.ml name: BigQuery DataFrames status: beta diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 6be41bf9aa..63c62274fd 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -47,16 +47,16 @@ def fit(self, X, y): """Build a forest of trees from the training set (X, y). Args: - X: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y: + y (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. Returns: - Fitted Estimator. + ForestModel: Fitted Estimator. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -73,12 +73,12 @@ def predict(self, X): mean predicted regression targets of the trees in the forest. Args: - X: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. Returns: - The predicted values. 
+ bigframes.dataframe.DataFrame: The predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -91,38 +91,38 @@ class RandomForestRegressor(ForestRegressor): to improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", "hist". - min_child_weight : Optional[float] + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. The value should be between 0 and 1. - colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. The value should be between 0 and 1. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 0.8. The value should be between 0 and 1. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 15. The value should be greater than 0 and less than 1. - subsample : Optional[float] + subsample (Optional[float]: Subsample ratio of the training instance. Default to 0.8. The value should be greater than 0 and less than 1. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0. 
- reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. - xgboost_version: Optional[str] + xgboost_version (Optional[str]): Specifies the Xgboost version for model training. Default to "0.9". Possible values: "0.9", "1.1". """ @@ -144,7 +144,7 @@ def predict(self, X): which we want to get the predictions. Returns: - The predicted values. + bigframes.dataframe.DataFrame: The predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -158,37 +158,37 @@ class RandomForestClassifier(ForestClassifier): improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", "hist". - min_child_weight : Optional[float] + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. The value should be between 0 and 1. 
- colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. The value should be between 0 and 1. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 0.8. The value should be between 0 and 1. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 15. The value should be greater than 0 and less than 1. - subsample : Optional[float] + subsample (Optional[float]): Subsample ratio of the training instance. Default to 0.8. The value should be greater than 0 and less than 1. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0. - reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. - xgboost_version: Optional[str] + xgboost_version (Optional[str]): Specifies the Xgboost version for model training. Default to "0.9". 
Possible values: "0.9", "1.1".ß """ From 77074ecbe7f52d1d7d1d1dc537fbe4062b407672 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:15:14 -0800 Subject: [PATCH 06/20] =?UTF-8?q?docs:=20add=20examples=20for=20dataframe.?= =?UTF-8?q?nunique,=20dataframe.diff,=20dataframe.a=E2=80=A6=20(#251)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add examples for dataframe.nunique, dataframe.diff, dataframe.agg, dataframe.describe * update spacing * update ordering --- .../bigframes_vendored/pandas/core/frame.py | 114 +++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 08fe8e2de0..174ab069f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3434,7 +3434,26 @@ def melt(self, id_vars, value_vars, var_name, value_name): def nunique(self): """ - Count number of distinct elements in specified axis. + Count number of distinct elements in each column. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 2 + + [3 rows x 2 columns] + + >>> df.nunique() + A 3.0 + B 2.0 + dtype: Float64 Returns: bigframes.series.Series: Series with number of distinct elements. @@ -3578,6 +3597,40 @@ def diff( Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is element in previous row). 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + Calculating difference with default periods=1: + + >>> df.diff() + A B + 0 + 1 -2 1 + 2 1 1 + + [3 rows x 2 columns] + + Calculating difference with periods=-1: + + >>> df.diff(periods=-1) + A B + 0 2 -1 + 1 -1 -1 + 2 + + [3 rows x 2 columns] + Args: periods (int, default 1): Periods to shift for calculating difference, accepts negative @@ -3590,7 +3643,37 @@ def diff( def agg(self, func): """ - Aggregate using one or more operations over the specified axis. + Aggregate using one or more operations over columns. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + Using a single function: + + >>> df.agg('sum') + A 6.0 + B 6.0 + dtype: Float64 + + Using a list of functions: + + >>> df.agg(['sum', 'mean']) + A B + sum 6.0 6.0 + mean 2.0 2.0 + + [2 rows x 2 columns] Args: func (function): @@ -3623,6 +3706,33 @@ def describe(self): upper percentile is ``75``. The ``50`` percentile is the same as the median. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8]}) + >>> df + A B + 0 3 0 + 1 1 2 + 2 2 8 + + [3 rows x 2 columns] + + >>> df.describe() + A B + count 3.0 3.0 + mean 2.0 3.333333 + std 1.0 4.163332 + min 1.0 0.0 + 25% 1.0 0.0 + 50% 2.0 2.0 + 75% 3.0 8.0 + max 3.0 8.0 + + [8 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Summary statistics of the Series or Dataframe provided. 
""" From 89a1c67fa5cbb76c1cc6ae24d5f919e22514705c Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 5 Dec 2023 02:14:14 +0000 Subject: [PATCH 07/20] docs: Fix return annotation in API docstrings (#253) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 314367409 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 30 +++++++-------- .../bigframes_vendored/pandas/core/series.py | 38 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 174ab069f6..7168572705 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -226,7 +226,7 @@ def from_dict( if used with ``orient='columns'`` or ``orient='tight'``. Returns: - DataFrame + DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -267,7 +267,7 @@ def from_records( Number of rows to read if data is an iterator. Returns: - DataFrame + DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -717,7 +717,7 @@ def to_markdown( These parameters will be passed to `tabulate `_. Returns: - DataFrame in Markdown-friendly format. + DataFrame: DataFrame in Markdown-friendly format. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1272,7 +1272,7 @@ def sort_values( if `first`; `last` puts NaNs at the end. Returns: - DataFrame with sorted values. + DataFrame: DataFrame with sorted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1282,7 +1282,7 @@ def sort_index( """Sort object by labels (along an axis). Returns: - The original DataFrame sorted by the labels. + DataFrame: The original DataFrame sorted by the labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1330,7 +1330,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - Result of the comparison. + DataFrame: Result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1861,7 +1861,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2796,7 +2796,7 @@ def any(self, *, axis=0, bool_only: bool = False): Include only boolean columns. Returns: - Series + bigframes.series.Series: Series indicating if any element is True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2843,7 +2843,7 @@ def all(self, axis=0, *, bool_only: bool = False): Include only boolean columns. Returns: - bigframes.series.Series: Series if all elements are True. + bigframes.series.Series: Series indicating if all elements are True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3184,7 +3184,7 @@ def skew(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series + Series: Series. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3225,7 +3225,7 @@ def kurt(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series + Series: Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3770,7 +3770,7 @@ def pivot(self, *, columns, index=None, values=None): have hierarchically indexed columns. Returns: - Returns reshaped DataFrame. + DataFrame: Returns reshaped DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3808,7 +3808,7 @@ def unstack(self): (the analogue of stack when the columns are not a MultiIndex). Returns: - DataFrame or Series + DataFrame or Series: DataFrame or Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3866,7 +3866,7 @@ def index(self): dtype=object) Returns: - The index labels of the DataFrame. + Index: The index object of the DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4115,7 +4115,7 @@ def dot(self, other): The other object to compute the matrix product with. Returns: - Series or DataFrame + Series or DataFrame: If `other` is a Series, return the matrix product between self and other as a Series. If other is a DataFrame, return the matrix product of self and other in a DataFrame. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1b751ed83b..6b8dd1d64d 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -90,7 +90,7 @@ def index(self): dtype=object) Returns: - The index labels of the Series. + Index: The index object of the Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -138,7 +138,7 @@ def transpose(self) -> Series: Return the transpose, which is by definition self. Returns: - Series + Series: Series. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -593,7 +593,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float: are not yet supported, so a result will be returned for at least two observations. Returns: - float; Will return NaN if there are fewer than two numeric pairs, either series has a + float: Will return NaN if there are fewer than two numeric pairs, either series has a variance or covariance of zero, or any input value is infinite. """ raise NotImplementedError("abstract method") @@ -611,7 +611,7 @@ def diff(self) -> Series: values. Returns: - {klass}: First differences of the Series. + Series: First differences of the Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1317,7 +1317,7 @@ def le(self, other) -> Series: other: Series, or scalar value Returns: - bigframes.series.Series. The result of the comparison. + bigframes.series.Series: The result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1331,7 +1331,7 @@ def lt(self, other) -> Series: Args: other (Series, or scalar value): - Returns: + Returns: bigframes.series.Series: The result of the operation. """ @@ -1588,7 +1588,7 @@ def divmod(self, other) -> Series: other: Series, or scalar value Returns: - 2-Tuple of Series. The result of the operation. The result is always + 2-Tuple of Series: The result of the operation. The result is always consistent with (floordiv, mod) (though pandas may not). """ @@ -1603,7 +1603,7 @@ def rdivmod(self, other) -> Series: other: Series, or scalar value Returns: - 2-Tuple of Series. The result of the operation. The result is always + 2-Tuple of Series: The result of the operation. The result is always consistent with (rfloordiv, rmod) (though pandas may not). """ @@ -1650,7 +1650,7 @@ def max( Returns: - scalar or scalar + scalar: Scalar. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1664,7 +1664,7 @@ def min( of the ``numpy.ndarray`` method ``argmin``. Returns: - scalar or scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1692,7 +1692,7 @@ def var( Normalized by N-1 by default. Returns: - scalar or Series (if level specified) + scalar or Series (if level specified): Variance. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1702,7 +1702,7 @@ def sum(self): This is equivalent to the method ``numpy.sum``. Returns: - scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1710,7 +1710,7 @@ def mean(self): """Return the mean of the values over the requested axis. Returns: - scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1723,7 +1723,7 @@ def median(self, *, exact: bool = False): one. Note: ``exact=True`` not yet supported. Returns: - scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1731,7 +1731,7 @@ def prod(self): """Return the product of the values over the requested axis. Returns: - scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1741,7 +1741,7 @@ def skew(self): Normalized by N-1. Returns: - scalar + scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1932,7 +1932,7 @@ def clip(self): Maximum threshold value. All values above this threshold will be set to it. A missing threshold (e.g NA) will not clip the value. Returns: - Series. + Series: Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2083,7 +2083,7 @@ def is_monotonic_increasing(self) -> bool: Return boolean if values in the object are monotonically increasing. Returns: - bool + bool: Boolean. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2093,7 +2093,7 @@ def is_monotonic_decreasing(self) -> bool: Return boolean if values in the object are monotonically decreasing. Returns: - bool + bool: Boolean. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From c8ec245070402aa0770bc9b2375693de674ca925 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 5 Dec 2023 11:34:15 -0800 Subject: [PATCH 08/20] feat: add nunique method to Series/DataFrameGroupby (#256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/groupby/__init__.py | 6 ++++++ tests/system/small/test_groupby.py | 2 ++ .../pandas/core/groupby/__init__.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 18cb83fa18..a8b8afdae7 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -179,6 +179,9 @@ def any(self) -> df.DataFrame: def count(self) -> df.DataFrame: return self._aggregate_all(agg_ops.count_op) + def nunique(self) -> df.DataFrame: + return self._aggregate_all(agg_ops.nunique_op) + def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("cumsum") @@ -442,6 +445,9 @@ def max(self, *args) -> series.Series: def count(self) -> 
series.Series: return self._aggregate(agg_ops.count_op) + def nunique(self) -> series.Series: + return self._aggregate(agg_ops.nunique_op) + def sum(self, *args) -> series.Series: return self._aggregate(agg_ops.sum_op) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index a24713c2b3..5214905186 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -69,11 +69,13 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): ("operator"), [ (lambda x: x.count()), + (lambda x: x.nunique()), (lambda x: x.any()), (lambda x: x.all()), ], ids=[ "count", + "nunique", "any", "all", ], diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index b05319b4f7..8730cf0007 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -363,6 +363,15 @@ def agg(self, func): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): + """ + Return number of unique elements in the group. + + Returns: + Series: Number of unique values within each group. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class DataFrameGroupBy(GroupBy): def agg(self, func, **kwargs): @@ -391,3 +400,12 @@ def agg(self, func, **kwargs): DataFrame """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def nunique(self): + """ + Return DataFrame with counts of unique elements in each position. 
+ + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From d3fa6f26931d5d0f0ae3fa49baccfc148f870417 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 6 Dec 2023 11:04:15 -0800 Subject: [PATCH 09/20] fix: fix value_counts column label for normalize=True (#245) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/block_transforms.py | 4 +++- tests/system/small/test_dataframe.py | 6 ++---- tests/system/small/test_series.py | 11 ++++------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index ce0fdd219a..df84f70859 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -353,7 +353,9 @@ def value_counts( ) ] ) - return block.select_column(count_id).with_column_labels(["count"]) + return block.select_column(count_id).with_column_labels( + ["proportion" if normalize else "count"] + ) def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9318a5d9d2..45490e00ca 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3453,6 +3453,8 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): ], ) def test_df_value_counts(scalars_dfs, 
subset, normalize, ascending, dropna): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") scalars_df, scalars_pandas_df = scalars_dfs bf_result = ( @@ -3464,10 +3466,6 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): subset, normalize=normalize, ascending=ascending, dropna=dropna ) - # Older pandas version may not have these values, bigframes tries to emulate 2.0+ - pd_result.name = "count" - pd_result.index.names = bf_result.index.names - pd.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d9fc23fad0..92a7b6f099 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1940,16 +1940,14 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index): def test_value_counts(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" bf_result = scalars_df[col_name].value_counts().to_pandas() pd_result = scalars_pandas_df[col_name].value_counts() - # Older pandas version may not have these values, bigframes tries to emulate 2.0+ - pd_result.name = "count" - pd_result.index.name = col_name - pd.testing.assert_series_equal( bf_result, pd_result, @@ -1957,6 +1955,8 @@ def test_value_counts(scalars_dfs): def test_value_counts_w_cut(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("value_counts results different in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -1965,9 +1965,6 @@ def test_value_counts_w_cut(scalars_dfs): bf_result = bf_cut.value_counts().to_pandas() pd_result = pd_cut.value_counts() - # Older pandas version may not have these values, bigframes tries to emulate 2.0+ - pd_result.name = "count" - pd_result.index.name = col_name 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) pd.testing.assert_series_equal( From 5bdcc6594ef2e99e96636341d286ea70420858fe Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 7 Dec 2023 08:24:14 +0000 Subject: [PATCH 10/20] docs: add code samples for `shape` and `head` (#257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `DataFrame.head`: https://screenshot.googleplex.com/BmM7jPxCk3iLuay - `Series.head`: https://screenshot.googleplex.com/7hANtzZCw8SbEKL - `Series.shape`: https://screenshot.googleplex.com/8AJ2xvLY6dmQUZe Fixes internal issue 314875595 🦕 --- .../bigframes_vendored/pandas/core/generic.py | 60 ++++++++++++++++++- .../bigframes_vendored/pandas/core/series.py | 15 ++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 127efe6a3d..607243f844 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -272,17 +272,73 @@ def head(self, n: int = 5): on position. It is useful for quickly testing if your object has the right type of data in it. 
- **Not yet supported** For negative values of `n`, this function returns + For negative values of `n`, this function returns all rows except the last `|n|` rows, equivalent to ``df[:n]``. If n is larger than the number of rows, this function returns all rows. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + [9 rows x 1 columns] + + Viewing the first 5 lines: + + >>> df.head() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + + [5 rows x 1 columns] + + Viewing the first `n` lines (three in this case): + + >>> df.head(3) + animal + 0 alligator + 1 bee + 2 falcon + + [3 rows x 1 columns] + + For negative values of `n`: + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + + [6 rows x 1 columns] + Args: n (int, default 5): Default 5. Number of rows to select. Returns: - The first `n` rows of the caller object. + same type as caller: The first ``n`` rows of the caller object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 6b8dd1d64d..e6af1648fd 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -96,7 +96,20 @@ def index(self): @property def shape(self): - """Return a tuple of the shape of the underlying data.""" + """Return a tuple of the shape of the underlying data. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 4, 9, 16]) + >>> s.shape + (4,) + >>> s = bpd.Series(['Alice', 'Bob', bpd.NA]) + >>> s.shape + (3,) + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property From 68c6fdf78af8b87fa4ef4f832631f24d7433a4d8 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:30:15 -0800 Subject: [PATCH 11/20] fix: ml.sql logic (#262) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/sql.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 1c88eda4ab..5fb40624dd 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -153,14 +153,12 @@ def create_model( ) -> str: """Encode the CREATE OR REPLACE MODEL statement for BQML""" source_sql = source_df.sql - transform_sql = self.transform(*transforms) if transforms is not None else None - options_sql = self.options(**options) parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] - if transform_sql: - parts.append(transform_sql) - if options_sql: - parts.append(options_sql) + if transforms: + parts.append(self.transform(*transforms)) + if options: + parts.append(self.options(**options)) parts.append(f"AS {source_sql}") 
return "\n".join(parts) @@ -189,11 +187,10 @@ def create_imported_model( options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: """Encode the CREATE OR REPLACE MODEL statement for BQML remote model.""" - options_sql = self.options(**options) parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] - if options_sql: - parts.append(options_sql) + if options: + parts.append(self.options(**options)) return "\n".join(parts) From d21c6dd26eadd64c526b0fd35b977a74b8334562 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:52:17 -0800 Subject: [PATCH 12/20] docs: correct the docs for `option_context` (#263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue #315382764 🦕 --- .../bigframes_vendored/pandas/_config/config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py index 8abaca76c7..1b73e649c8 100644 --- a/third_party/bigframes_vendored/pandas/_config/config.py +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -11,11 +11,12 @@ class option_context(contextlib.ContextDecorator): You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. 
- Examples - -------- - >>> import bigframes - >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): - ... pass + **Examples:** + + >>> import bigframes + + >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass """ def __init__(self, *args) -> None: From 9dd63f6dcb6234e1f3aebd63c59e1e5c717099dc Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Sat, 9 Dec 2023 02:10:15 +0000 Subject: [PATCH 13/20] fix: enforce pandas version requirement <2.1.4 (#265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There seems to be a breaking change in pandas release 2.1.4 that is failing tests using `pandas.read_json`. This change is pinning pandas dependency version to <2.1.4 until a proper fix is available. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 315539920 🦕 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abf165b3df..3351542985 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "google-cloud-storage >=2.0.0", # TODO: Relax upper bound once we have fixed `system_prerelease` tests.
"ibis-framework[bigquery] >=6.2.0,<7.0.0dev", - "pandas >=1.5.0", + "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "scikit-learn >=1.2.2", From 99598c7d359f1d1e0671dcf27a5c77094f3c7f67 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Sun, 10 Dec 2023 22:02:15 -0800 Subject: [PATCH 14/20] feat: add ARIMAPlus.predict parameters (#264) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/core.py | 4 +-- bigframes/ml/forecasting.py | 21 +++++++++-- bigframes/ml/sql.py | 6 ++-- tests/system/small/ml/test_core.py | 9 ++--- tests/system/small/ml/test_forecasting.py | 43 +++++++++++++++++++++-- tests/unit/ml/test_sql.py | 16 +++++++++ 6 files changed, 86 insertions(+), 13 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 5aad77a394..1e2224c9bc 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -126,8 +126,8 @@ def generate_text_embedding( ), ) - def forecast(self) -> bpd.DataFrame: - sql = self._model_manipulation_sql_generator.ml_forecast() + def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: + sql = self._model_manipulation_sql_generator.ml_forecast(struct_options=options) return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): diff --git 
a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 995201062b..03b9857cc5 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -86,21 +86,38 @@ def _fit( options=self._bqml_options, ) - def predict(self, X=None) -> bpd.DataFrame: + def predict( + self, X=None, horizon: int = 3, confidence_level: float = 0.95 + ) -> bpd.DataFrame: """Predict the closest cluster for each sample in X. Args: X (default None): ignored, to be compatible with other APIs. + horizon (int, default: 3): + an int value that specifies the number of time points to forecast. + The default value is 3, and the maximum value is 1000. + confidence_level (float, default 0.95): + a float value that specifies percentage of the future values that fall in the prediction interval. + The valid input range is [0.0, 1.0). Returns: bigframes.dataframe.DataFrame: The predicted DataFrames. Which contains 2 columns "forecast_timestamp" and "forecast_value". """ + if horizon < 1 or horizon > 1000: + raise ValueError(f"horizon must be [1, 1000], but is {horizon}.") + if confidence_level < 0.0 or confidence_level >= 1.0: + raise ValueError( + f"confidence_level must be [0.0, 1.0), but is {confidence_level}." 
+ ) + if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - return self._bqml_model.forecast() + return self._bqml_model.forecast( + options={"horizon": horizon, "confidence_level": confidence_level} + ) def score( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 5fb40624dd..25caaf1ac6 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -223,9 +223,11 @@ def ml_predict(self, source_df: bpd.DataFrame) -> str: return f"""SELECT * FROM ML.PREDICT(MODEL `{self._model_name}`, ({self._source_sql(source_df)}))""" - def ml_forecast(self) -> str: + def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" - return f"""SELECT * FROM ML.FORECAST(MODEL `{self._model_name}`)""" + struct_options_sql = self.struct_options(**struct_options) + return f"""SELECT * FROM ML.FORECAST(MODEL `{self._model_name}`, + {struct_options_sql})""" def ml_generate_text( self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 22cbbb1932..915c4aa444 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -336,17 +336,18 @@ def test_model_generate_text( def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel): utc = pytz.utc - forecast = time_series_bqml_arima_plus_model.forecast().to_pandas()[ - ["forecast_timestamp", "forecast_value"] - ] + forecast = time_series_bqml_arima_plus_model.forecast( + {"horizon": 4, "confidence_level": 0.8} + ).to_pandas()[["forecast_timestamp", "forecast_value"]] expected = pd.DataFrame( { "forecast_timestamp": [ datetime(2017, 8, 2, tzinfo=utc), datetime(2017, 8, 3, tzinfo=utc), datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), ], - "forecast_value": [2724.472284, 2593.368389, 2353.613034], + "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071], } ) 
expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype()) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 948db59650..be8d9c2bac 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -18,8 +18,10 @@ import pyarrow as pa import pytz +from bigframes.ml import forecasting -def test_model_predict(time_series_arima_plus_model): + +def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPlus): utc = pytz.utc predictions = time_series_arima_plus_model.predict().to_pandas() assert predictions.shape == (3, 8) @@ -47,7 +49,40 @@ def test_model_predict(time_series_arima_plus_model): ) -def test_model_score(time_series_arima_plus_model, new_time_series_df): +def test_model_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus): + utc = pytz.utc + predictions = time_series_arima_plus_model.predict( + horizon=4, confidence_level=0.9 + ).to_pandas() + assert predictions.shape == (4, 8) + result = predictions[["forecast_timestamp", "forecast_value"]] + expected = pd.DataFrame( + { + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + ], + "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071], + } + ) + expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype()) + expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( + pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ) + + pd.testing.assert_frame_equal( + result, + expected, + rtol=0.1, + check_index_type=False, + ) + + +def test_model_score( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): result = time_series_arima_plus_model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] ).to_pandas() @@ -69,7 +104,9 @@ def 
test_model_score(time_series_arima_plus_model, new_time_series_df): ) -def test_model_score_series(time_series_arima_plus_model, new_time_series_df): +def test_model_score_series( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): result = time_series_arima_plus_model.score( new_time_series_df["parsed_date"], new_time_series_df["total_visits"] ).to_pandas() diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 9223058540..73d19cc0bb 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -293,6 +293,22 @@ def test_ml_centroids_produces_correct_sql( ) +def test_forecast_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_manipulation_sql_generator.ml_forecast( + struct_options={"option_key1": 1, "option_key2": 2.2}, + ) + assert ( + sql + == """SELECT * FROM ML.FORECAST(MODEL `my_project_id.my_dataset_id.my_model_id`, + STRUCT( + 1 AS option_key1, + 2.2 AS option_key2))""" + ) + + def test_ml_generate_text_produces_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, From 3febea99358d10f823d43c3af83ea30458e579a2 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 11 Dec 2023 11:57:30 -0800 Subject: [PATCH 15/20] feat: support dataframe.loc with conditional columns selection (#233) Co-authored-by: Tim Swast --- bigframes/core/indexers.py | 7 ++++++- tests/system/small/test_dataframe.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 69048b6845..12a1303d29 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -156,7 +156,12 @@ def __getitem__(self, key): bigframes.dataframe.DataFrame, _loc_getitem_series_or_dataframe(self._dataframe, key[0]), ) - return df[key[1]] + + columns = key[1] + if isinstance(columns, 
pd.Series) and columns.dtype == "bool": + columns = df.columns[columns] + + return df[columns] return typing.cast( bigframes.dataframe.DataFrame, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 45490e00ca..57115335dc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2474,6 +2474,17 @@ def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): ) +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): scalars_df_index = scalars_df_index.set_index("string_col", drop=False) scalars_pandas_df_index = scalars_pandas_df_index.set_index( From 8c636978f4a21eda2856862100b7a8272797fe42 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:34:31 -0800 Subject: [PATCH 16/20] =?UTF-8?q?docs:=20add=20example=20for=20dataframe.m?= =?UTF-8?q?elt,=20dataframe.pivot,=20dataframe.stac=E2=80=A6=20(#252)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add example for dataframe.melt, dataframe.pivot, dataframe.stack, dataframe.unstack * remove empty line * docstring fix * spacing update * docs: correct the params rendering for `ml.remote` and `ml.ensemble` modules (#248) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `ensemble.RandomForestClassifier`: https://screenshot.googleplex.com/4Q88xgdm5hkaYXu - `ensemble.RandomForestRegressor`: https://screenshot.googleplex.com/3CU6pJBjYHQvnDo - `remote.VertexAIModel`: https://screenshot.googleplex.com/8SL2max6GfPMwFe Fixes internal issue 314150462 🦕 * docs: add examples for dataframe.nunique, dataframe.diff, dataframe.a… (#251) * docs: add examples for dataframe.nunique, dataframe.diff, dataframe.agg, dataframe.describe * update spacing * update ordering * docs: Fix return annotation in API docstrings (#253) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 314367409 🦕 * feat: add nunique method to Series/DataFrameGroupby (#256) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 * docs: add example for dataframe.melt, dataframe.pivot, dataframe.stack, dataframe.unstack * docstring fix --------- Co-authored-by: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Co-authored-by: Shobhit Singh Co-authored-by: TrevorBergeron --- .../bigframes_vendored/pandas/core/frame.py | 179 ++++++++++++++++-- 1 file changed, 165 insertions(+), 14 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 7168572705..5b00385eb8 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3414,18 +3414,75 @@ def melt(self, id_vars, value_vars, var_name, value_name): the row axis, leaving just two non-identifier columns, 'variable' and 'value'. - Parameters - ---------- - id_vars (tuple, list, or ndarray, optional): - Column(s) to use as identifier variables. - value_vars (tuple, list, or ndarray, optional): - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name (scalar): - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name (scalar, default 'value'): - Name to use for the 'value' column. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], + ... "B": [1, 2, 3, 4, 5], + ... 
"C": [None, 3.5, None, 4.5, 5.0]}) + >>> df + A B C + 0 1.0 1 + 1 2 3.5 + 2 3.0 3 + 3 4.0 4 4.5 + 4 5.0 5 5.0 + + [5 rows x 3 columns] + + Using `melt` without optional arguments: + + >>> df.melt() + variable value + 0 A 1.0 + 1 A + 2 A 3.0 + 3 A 4.0 + 4 A 5.0 + 5 B 1.0 + 6 B 2.0 + 7 B 3.0 + 8 B 4.0 + 9 B 5.0 + 10 C + 11 C 3.5 + 12 C + 13 C 4.5 + 14 C 5.0 + + [15 rows x 2 columns] + + Using `melt` with `id_vars` and `value_vars`: + + >>> df.melt(id_vars='A', value_vars=['B', 'C']) + A variable value + 0 1.0 B 1 + 1 B 2 + 2 3.0 B 3 + 3 4.0 B 4 + 4 5.0 B 5 + 5 1.0 C + 6 C 3 + 7 3.0 C + 8 4.0 C 4 + 9 5.0 C 5 + + [10 rows x 3 columns] + + + Args: + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. Returns: DataFrame: Unpivoted DataFrame. @@ -3757,6 +3814,52 @@ def pivot(self, *, columns, index=None, values=None): do not together uniquely identify input rows, the output will be silently non-deterministic. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "foo": ["one", "one", "one", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B"], + ... "baz": [1, 2, 3, 4, 5], + ... "zoo": ['x', 'y', 'z', 'q', 'w'] + ... 
}) + + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + + [5 rows x 4 columns] + + Using `pivot` without optional arguments: + + >>> df.pivot(columns='foo') + bar baz zoo + foo one two one two one two + 0 A 1 x + 1 B 2 y + 2 C 3 z + 3 A 4 q + 4 B 5 w + + [5 rows x 6 columns] + + Using `pivot` with `index` and `values`: + + >>> df.pivot(columns='foo', index='bar', values='baz') + foo one two + bar + A 1 4 + B 2 5 + C 3 + + [3 rows x 2 columns] + Args: columns (str or object or a list of str): Column to use to make new frame's columns. @@ -3774,7 +3877,7 @@ def pivot(self, *, columns, index=None, values=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def stack(self): + def stack(self, level=-1): """ Stack the prescribed level(s) from columns to index. @@ -3792,12 +3895,36 @@ def stack(self): BigQuery DataFrames does not support stack operations that would combine columns of different dtypes. + **Example:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) + >>> df + A B + foo 1 2 + bar 3 4 + + [2 rows x 2 columns] + + >>> df.stack() + foo A 1 + B 2 + bar A 3 + B 4 + dtype: Int64 + + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) to stack from the column axis onto the index axis. + Returns: DataFrame or Series: Stacked dataframe or series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def unstack(self): + def unstack(self, level=-1): """ Pivot a level of the (necessarily hierarchical) index labels. @@ -3807,6 +3934,30 @@ def unstack(self): If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). 
+ **Example:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) + >>> df + A B + foo 1 2 + bar 3 4 + + [2 rows x 2 columns] + + >>> df.unstack() + A foo 1 + bar 3 + B foo 2 + bar 4 + dtype: Int64 + + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) of index to unstack, can pass level name. + Returns: DataFrame or Series: DataFrame or Series. """ From e735412fdc52d034df92dd5462d6956bdc0167be Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 11 Dec 2023 15:54:55 -0800 Subject: [PATCH 17/20] =?UTF-8?q?docs:=20add=20example=20to=20dataframe.nl?= =?UTF-8?q?argest,=20dataframe.nsmallest,=20datafra=E2=80=A6=20(#234)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add example to dataframe.nlargest, dataframe.nsmallest, dataframe.idxmin, dataframe.idxmax * update example output --- .../bigframes_vendored/pandas/core/frame.py | 151 +++++++++++++++++- 1 file changed, 147 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5b00385eb8..4753bfc589 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3324,6 +3324,58 @@ def nlargest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=False).head(n)``, but more performant. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], + ... "B": [5, 6, 3, 4, 1, 2], + ...
"C": ['a', 'b', 'a', 'b', 'a', 'b']}) + >>> df + A B C + 0 1 5 a + 1 1 6 b + 2 3 3 a + 3 3 4 b + 4 5 1 a + 5 5 2 b + + [6 rows x 3 columns] + + Returns rows with the largest value in 'A', including all ties: + + >>> df.nlargest(1, 'A', keep = "all") + A B C + 4 5 1 a + 5 5 2 b + + [2 rows x 3 columns] + + Returns the first row with the largest value in 'A', default behavior in case of ties: + + >>> df.nlargest(1, 'A') + A B C + 4 5 1 a + + [1 rows x 3 columns] + + Returns the last row with the largest value in 'A' in case of ties: + + >>> df.nlargest(1, 'A', keep = "last") + A B C + 5 5 2 b + + [1 rows x 3 columns] + + Returns the row with the largest value in 'A', using 'C' to break ties: + + >>> df.nlargest(1, ['A', 'C']) + A B C + 5 5 2 b + + [1 rows x 3 columns] + Args: n (int): Number of rows to return. @@ -3359,6 +3411,59 @@ def nsmallest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=True).head(n)``, but more performant. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 1, 3, 3, 5, 5], + ... "B": [5, 6, 3, 4, 1, 2], + ... "C": ['a', 'b', 'a', 'b', 'a', 'b']}) + >>> df + A B C + 0 1 5 a + 1 1 6 b + 2 3 3 a + 3 3 4 b + 4 5 1 a + 5 5 2 b + + [6 rows x 3 columns] + + Returns rows with the smallest value in 'A', including all ties: + + >>> df.nsmallest(1, 'A', keep = "all") + A B C + 0 1 5 a + 1 1 6 b + + [2 rows x 3 columns] + + Returns the first row with the smallest value in 'A', default behavior in case of ties: + + >>> df.nsmallest(1, 'A') + A B C + 0 1 5 a + + [1 rows x 3 columns] + + Returns the last row with the smallest value in 'A' in case of ties: + + >>> df.nsmallest(1, 'A', keep = "last") + A B C + 1 1 6 b + + [1 rows x 3 columns] + + Returns the row with the smallest value in 'A', using 'C' to break ties: + + >>> df.nsmallest(1, ['A', 'C']) + A B C + 0 1 5 a + + [1 rows x 3 columns] + + Args: n (int): Number of rows to return.
@@ -3384,23 +3489,61 @@ def nsmallest(self, n: int, columns, keep: str = "first"): def idxmin(self): """ - Return index of first occurrence of minimum over requested axis. + Return index of first occurrence of minimum for each column. NA/null values are excluded. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.idxmin() + A 1 + B 0 + dtype: Int64 + Returns: - Series: Indexes of minima along the specified axis. + Series: Indexes of minima for each column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def idxmax(self): """ - Return index of first occurrence of maximum over requested axis. + Return index of first occurrence of maximum for each column. NA/null values are excluded. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + >>> df.idxmax() + A 0 + B 2 + dtype: Int64 + Returns: - Series: Indexes of maxima along the specified axis. + Series: Indexes of maxima for each column.
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From b02fc2c1843e18d3a8d6894c64763f53e6af1b73 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 12 Dec 2023 02:34:27 +0000 Subject: [PATCH 18/20] fix: pin prerelease tests to pandas 2.1.3 to unblock e2e tests (#268) * fix: pin prerelease tests to pandas 2.1.3 to unblock e2e tests * specify excluded pandas version differently, to automatically test on a release --- noxfile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 3b10a37fc7..2174e27529 100644 --- a/noxfile.py +++ b/noxfile.py @@ -518,7 +518,9 @@ def prerelease(session: nox.sessions.Session, tests_path): "--prefer-binary", "--pre", "--upgrade", - "pandas", + # TODO(shobs): Remove the exclusion of version 2.1.4 after + # https://github.com/pandas-dev/pandas/issues/56463 is resolved + "pandas!=2.1.4", ) already_installed.add("pandas") From 8766ac63f501929577f71e6bd2b523e92c43ba66 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 12 Dec 2023 03:22:15 +0000 Subject: [PATCH 19/20] test: migrate e2e presubmit tests to bigframes-load-testing project (#160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEGIN_COMMIT_OVERRIDE fix: migrate e2e tests to bigframes-load-testing project END_COMMIT_OVERRIDE Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 307809767 🦕 --- .kokoro/continuous/e2e.cfg | 10 + .kokoro/presubmit/e2e.cfg | 10 + CONTRIBUTING.rst | 39 +- bigframes/remote_function.py | 24 +- .../generative_ai/large_language_models.ipynb | 6 +- .../regression/easy_linear_regression.ipynb | 1116 ++++++++--------- owlbot.py | 1 + samples/snippets/gen_ai_model_test.py | 9 +- .../load_data_from_biquery_job_test.py | 11 +- samples/snippets/quickstart_test.py | 9 +- samples/snippets/remote_function_test.py | 9 +- scripts/create_test_model_vertex.py | 71 ++ scripts/setup-project-for-testing.sh | 256 ++++ tests/system/conftest.py | 1 - tests/system/large/ml/test_decomposition.py | 8 +- tests/system/large/ml/test_pipeline.py | 21 +- tests/system/large/test_remote_function.py | 6 +- tests/system/small/ml/conftest.py | 12 +- tests/system/small/ml/test_core.py | 4 +- tests/system/small/ml/test_decomposition.py | 11 +- tests/system/small/ml/test_llm.py | 28 +- tests/system/small/test_remote_function.py | 34 +- tests/system/utils.py | 74 ++ 23 files changed, 1130 insertions(+), 640 deletions(-) create mode 100644 scripts/create_test_model_vertex.py create mode 100755 scripts/setup-project-for-testing.sh diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index 2f93a58212..7479346590 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -5,3 +5,13 @@ env_vars: { key: "NOX_SESSION" value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" } + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: 
"https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index 2f93a58212..7479346590 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -5,3 +5,13 @@ env_vars: { key: "NOX_SESSION" value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" } + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index f9103bfa72..5146b4bc7e 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -155,7 +155,44 @@ Running System Tests auth settings and change some configuration in your project to run all the tests. -- System tests will be run against an actual project. You should use local credentials from gcloud when possible. See `Best practices for application authentication `__. Some tests require a service account. For those tests see `Authenticating as a service account `__. +- System tests will be run against an actual project. A project can be set in + the environment variable ``$GOOGLE_CLOUD_PROJECT``. If not, the project property + set in the `Google Cloud CLI `__ + will be effective, which can be peeked into via ``gcloud config get project``, + or set via ``gcloud config set project ``. The following roles + carry the permissions to run the system tests in the project: + + - `BigQuery User `__ + to be able to create test datasets and run BigQuery jobs in the project. + + - `BigQuery Connection Admin `__ + to be able to use BigQuery connections in the project. + + - `BigQuery Data Editor `__ + to be able to create BigQuery remote functions in the project. 
+ + - `Browser `__ + to be able to get the current IAM policy for the service accounts of the BigQuery connections in the project. + + - `Cloud Functions Developer `__ + to be able to create cloud functions to support BigQuery DataFrames remote functions. + + - `Service Account User `__ + to be able to use the project's service accounts. + + - `Vertex AI User `__ + to be able to use the BigQuery DataFrames' ML integration with Vertex AI. + +- You can run the script ``scripts/setup-project-for-testing.sh <project-id> [<principal>]`` + to set up a project for running system tests and optionally set up the necessary + IAM roles for a principal (user/group/service-account). You need to have the following + IAM permissions to be able to run the setup script successfully: + + - ``serviceusage.services.enable`` + - ``bigquery.connections.create`` + - ``resourcemanager.projects.setIamPolicy`` + +- You should use local credentials from gcloud when possible. See `Best practices for application authentication `__. Some tests require a service account. For those tests see `Authenticating as a service account `__.
************* Test Coverage diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 7280ac7d42..a899ebd371 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -411,13 +411,23 @@ def create_cloud_function(self, def_, cf_name, package_requirements=None): create_function_request.function = function # Create the cloud function and wait for it to be ready to use - operation = self._cloud_functions_client.create_function( - request=create_function_request - ) - operation.result() - - # Cleanup - os.remove(archive_path) + try: + operation = self._cloud_functions_client.create_function( + request=create_function_request + ) + operation.result() + + # Cleanup + os.remove(archive_path) + except google.api_core.exceptions.AlreadyExists: + # If a cloud function with the same name already exists, let's + # update it + update_function_request = functions_v2.UpdateFunctionRequest() + update_function_request.function = function + operation = self._cloud_functions_client.update_function( + request=update_function_request + ) + operation.result() # Fetch the endpoint of the just created function endpoint = self.get_cloud_function_endpoint(cf_name) diff --git a/notebooks/generative_ai/large_language_models.ipynb b/notebooks/generative_ai/large_language_models.ipynb index 45a46c44af..2695ee9dc0 100644 --- a/notebooks/generative_ai/large_language_models.ipynb +++ b/notebooks/generative_ai/large_language_models.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -22,12 +22,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "session = bigframes.pandas.get_global_session()\n", - "connection = \"bigframes-dev.us.bigframes-ml\"" + "connection = f\"{session.bqclient.project}.us.bigframes-default-connection\"" ] }, { diff --git 
a/notebooks/regression/easy_linear_regression.ipynb b/notebooks/regression/easy_linear_regression.ipynb index c441a966ec..fdabd82a4b 100644 --- a/notebooks/regression/easy_linear_regression.ipynb +++ b/notebooks/regression/easy_linear_regression.ipynb @@ -26,48 +26,86 @@ "## 1. Init & load data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import `bigframes.pandas` module and get the default session" + ] + }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas\n", + "session = bigframes.pandas.get_global_session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a dataset for storing BQML model, and create it if it does not exist." + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ffc6d6c7815a4a92903a08a11af6db11", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - "HTML(value='Query job d1e085ba-66d8-4631-bb51-50a17d0a6e51 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 8fe1dc50-9d32-4466-9c2b-76d32cbde7c5 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -75,13 +113,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "042c351aa0944eeeab8b36254f88c072", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 91aa1b30-2b0e-41eb-9bfb-4f6232913b31 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e40d99ae-1b3a-4a12-b4be-e264af8b22e5 is RUNNING. 
" ] }, "metadata": {}, @@ -121,250 +157,250 @@ " \n", " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", + " MALE\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 2\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 47.4\n", + " 14.6\n", + " 212.0\n", + " 4725.0\n", + " FEMALE\n", " \n", " \n", " 3\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.5\n", - " 17.9\n", - " 192.0\n", - " 3500.0\n", + " 42.5\n", + " 16.7\n", + " 187.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", + " MALE\n", " \n", " \n", " 5\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.7\n", + " 15.3\n", + " 219.0\n", + " 5200.0\n", " MALE\n", " \n", " \n", " 6\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 46.9\n", - " 16.6\n", - " 192.0\n", - " 2700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 7\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.5\n", - " 18.4\n", - " 200.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.2\n", + " 13.8\n", + " 215.0\n", + " 4750.0\n", " 
FEMALE\n", " \n", " \n", " 8\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 49.5\n", - " 19.0\n", - " 200.0\n", - " 3800.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " 4550.0\n", + " FEMALE\n", " \n", " \n", " 9\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.5\n", + " 15.2\n", + " 216.0\n", + " 5000.0\n", + " FEMALE\n", " \n", " \n", " 10\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.2\n", + " 15.6\n", + " 221.0\n", + " 5100.0\n", " MALE\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", + " FEMALE\n", " \n", " \n", " 12\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.7\n", + " 15.0\n", + " 223.0\n", + " 5550.0\n", + " MALE\n", " \n", " \n", " 13\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185.0\n", - " 3700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", " 15\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis 
papua)\n", + " Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " 5350.0\n", + " MALE\n", " \n", " \n", " 16\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 45.7\n", - " 17.3\n", - " 193.0\n", - " 3600.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 17\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.6\n", - " 19.4\n", - " 193.0\n", - " 3800.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.8\n", + " 15.4\n", + " 215.0\n", + " 5150.0\n", " MALE\n", " \n", " \n", " 18\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 50.3\n", + " 20.0\n", + " 197.0\n", + " 3300.0\n", " MALE\n", " \n", " \n", " 19\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", " 20\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.6\n", - " 17.8\n", - " 193.0\n", - " 3800.0\n", - " FEMALE\n", + " 51.0\n", + " 18.8\n", + " 203.0\n", + " 4100.0\n", + " MALE\n", " \n", " \n", " 21\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 51.3\n", - " 19.2\n", - " 193.0\n", - " 3650.0\n", - " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", + " FEMALE\n", " \n", " \n", " 22\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.5\n", + " 13.9\n", + " 210.0\n", + " 4200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 
180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 49.6\n", - " 18.2\n", - " 193.0\n", + " 51.7\n", + " 20.3\n", + " 194.0\n", " 3775.0\n", " MALE\n", " \n", @@ -374,74 +410,72 @@ "[344 rows x 7 columns in total]" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + " species island culmen_length_mm \\\n", + "0 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.1 \n", + "1 Adelie Penguin (Pygoscelis adeliae) 
Torgersen 39.1 \n", + "2 Gentoo penguin (Pygoscelis papua) Biscoe 47.4 \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", + "4 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", + "5 Gentoo penguin (Pygoscelis papua) Biscoe 46.7 \n", + "6 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", + "7 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "8 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", + "10 Gentoo penguin (Pygoscelis papua) Biscoe 48.2 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", + "14 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.0 \n", + "15 Gentoo penguin (Pygoscelis papua) Biscoe 48.7 \n", + "16 Adelie Penguin (Pygoscelis adeliae) Torgersen 34.6 \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", + "18 Chinstrap penguin (Pygoscelis antarctica) Dream 50.3 \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", + "21 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream 42.2 \n", + "24 Chinstrap penguin (Pygoscelis antarctica) Dream 51.7 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.4 184.0 3475.0 FEMALE \n", - "1 19.1 184.0 4650.0 MALE \n", - "2 18.9 184.0 3900.0 MALE \n", - "3 17.9 192.0 3500.0 FEMALE \n", - "4 16.8 192.0 3000.0 FEMALE \n", - "5 18.5 192.0 4100.0 MALE \n", - "6 16.6 192.0 2700.0 FEMALE \n", - "7 18.4 200.0 3400.0 FEMALE \n", - "8 19.0 200.0 3800.0 MALE \n", - "9 20.1 200.0 3975.0 MALE \n", - "10 18.9 208.0 4300.0 MALE \n", - "11 18.7 185.0 3650.0 MALE \n", - "12 16.9 185.0 3000.0 FEMALE \n", - "13 17.3 185.0 3700.0 FEMALE \n", - "14 17.1 185.0 3400.0 FEMALE \n", - "15 16.5 185.0 3400.0 FEMALE \n", - "16 17.3 
193.0 3600.0 FEMALE \n", - "17 19.4 193.0 3800.0 MALE \n", - "18 17.9 193.0 4250.0 MALE \n", - "19 18.1 193.0 3750.0 MALE \n", - "20 17.8 193.0 3800.0 FEMALE \n", - "21 19.2 193.0 3650.0 MALE \n", - "22 17.1 193.0 3400.0 FEMALE \n", - "23 18.5 193.0 3500.0 FEMALE \n", - "24 18.2 193.0 3775.0 MALE \n", + "0 18.9 188.0 4300.0 MALE \n", + "1 18.7 181.0 3750.0 MALE \n", + "2 14.6 212.0 4725.0 FEMALE \n", + "3 16.7 187.0 3350.0 FEMALE \n", + "4 19.0 197.0 4775.0 MALE \n", + "5 15.3 219.0 5200.0 MALE \n", + "6 21.1 195.0 4400.0 MALE \n", + "7 13.8 215.0 4750.0 FEMALE \n", + "8 13.5 210.0 4550.0 FEMALE \n", + "9 15.2 216.0 5000.0 FEMALE \n", + "10 15.6 221.0 5100.0 MALE \n", + "11 18.6 190.0 3700.0 FEMALE \n", + "12 15.0 223.0 5550.0 MALE \n", + "13 20.0 190.0 4250.0 MALE \n", + "14 17.9 190.0 3450.0 FEMALE \n", + "15 15.7 208.0 5350.0 MALE \n", + "16 21.1 198.0 4400.0 MALE \n", + "17 15.4 215.0 5150.0 MALE \n", + "18 20.0 197.0 3300.0 MALE \n", + "19 18.1 178.0 3900.0 MALE \n", + "20 18.8 203.0 4100.0 MALE \n", + "21 17.9 187.0 3200.0 FEMALE \n", + "22 13.9 210.0 4200.0 FEMALE \n", + "23 18.5 180.0 3550.0 FEMALE \n", + "24 20.3 194.0 3775.0 MALE \n", "...\n", "\n", "[344 rows x 7 columns]" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import bigframes.pandas\n", - "\n", "# read a BigQuery table to a BigQuery DataFrame\n", - "df = bigframes.pandas.read_gbq(\"bigframes-dev.bqml_tutorial.penguins\")\n", + "df = bigframes.pandas.read_gbq(f\"bigquery-public-data.ml_datasets.penguins\")\n", "\n", "# take a peek at the dataframe\n", "df" @@ -457,18 +491,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0ddb322731fe4b80b2904e1610862c31", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job d2bd7c5e-2652-4c0d-8495-8ef65e89031b is DONE. 
28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 7d289291-5c60-4d8f-b476-e46cb2ab06a7 is DONE. 28.9 kB processed. " ] }, "metadata": {}, @@ -476,13 +508,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9f91e7a3d7ed416096d7660a110e0eab", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 92f0a5e5-bc61-426f-a9ef-213a1c376851 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 8411db98-9ec3-4655-a40f-f9bf272e2403 is RUNNING. " ] }, "metadata": {}, @@ -520,294 +550,294 @@ " \n", " \n", " 0\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", - " \n", - " \n", - " 1\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", " MALE\n", " \n", " \n", - " 2\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", + " 1\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 4\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", - " \n", - " \n", - " 5\n", - " Dream\n", + " Biscoe\n", " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", - " MALE\n", - " \n", - " \n", - " 9\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", " MALE\n", " \n", " \n", - " 10\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " 6\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 11\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", - " \n", - " \n", - " 12\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", - " 14\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", - " 
FEMALE\n", + " 13\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", - " 15\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", + " 14\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", - " 18\n", - " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 16\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 19\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", - " 22\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " 21\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", - " 26\n", + " 30\n", " Dream\n", - " 41.5\n", - " 18.5\n", - " 201.0\n", - " 4000.0\n", + " 39.2\n", + " 21.1\n", + " 196.0\n", + " 4150.0\n", " MALE\n", " \n", " \n", - " 31\n", - " Dream\n", - " 33.1\n", - " 16.1\n", - " 178.0\n", - " 2900.0\n", - " FEMALE\n", + " 32\n", + " Torgersen\n", + " 42.9\n", + " 17.6\n", + " 196.0\n", + " 4700.0\n", + " MALE\n", " \n", " \n", - " 32\n", + " 38\n", " Dream\n", - " 37.2\n", - " 18.1\n", - " 178.0\n", + " 41.1\n", + " 17.5\n", + " 190.0\n", " 3900.0\n", " MALE\n", " \n", " \n", - " 33\n", - " Dream\n", - " 39.5\n", - " 16.7\n", - " 178.0\n", - " 3250.0\n", + " 40\n", + " Torgersen\n", + " 38.6\n", + " 21.2\n", + " 191.0\n", + " 3800.0\n", + " MALE\n", + " \n", + " \n", + " 42\n", + " Biscoe\n", + " 35.5\n", + " 16.2\n", + " 195.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", - " 35\n", + " 44\n", " Dream\n", - " 36.0\n", - " 18.5\n", + " 39.2\n", + " 18.6\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", + " \n", + " \n", + " 45\n", + " Torgersen\n", + " 35.2\n", + " 15.9\n", 
" 186.0\n", - " 3100.0\n", + " 3050.0\n", " FEMALE\n", " \n", " \n", - " 36\n", + " 46\n", " Dream\n", + " 43.2\n", + " 18.5\n", + " 192.0\n", + " 4100.0\n", + " MALE\n", + " \n", + " \n", + " 49\n", + " Biscoe\n", " 39.6\n", - " 18.1\n", + " 17.7\n", " 186.0\n", - " 4450.0\n", - " MALE\n", + " 3500.0\n", + " FEMALE\n", " \n", " \n", - " 38\n", - " Dream\n", - " 41.3\n", + " 53\n", + " Biscoe\n", + " 45.6\n", " 20.3\n", - " 194.0\n", - " 3550.0\n", + " 191.0\n", + " 4600.0\n", " MALE\n", " \n", " \n", - " 41\n", - " Dream\n", - " 35.7\n", - " 18.0\n", - " 202.0\n", - " 3550.0\n", + " 58\n", + " Torgersen\n", + " 40.9\n", + " 16.8\n", + " 191.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", - " 51\n", - " Dream\n", - " 38.1\n", - " 17.6\n", - " 187.0\n", - " 3425.0\n", + " 60\n", + " Torgersen\n", + " 40.3\n", + " 18.0\n", + " 195.0\n", + " 3250.0\n", " FEMALE\n", " \n", " \n", - " 53\n", + " 62\n", " Dream\n", " 36.0\n", - " 17.1\n", - " 187.0\n", - " 3700.0\n", + " 18.5\n", + " 186.0\n", + " 3100.0\n", " FEMALE\n", " \n", + " \n", + " 63\n", + " Torgersen\n", + " 39.3\n", + " 20.6\n", + " 190.0\n", + " 3650.0\n", + " MALE\n", + " \n", " \n", "\n", "

25 rows × 6 columns

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "0 Dream 36.6 18.4 184.0 3475.0 \n", - "1 Dream 39.8 19.1 184.0 4650.0 \n", - "2 Dream 40.9 18.9 184.0 3900.0 \n", - "4 Dream 37.3 16.8 192.0 3000.0 \n", - "5 Dream 43.2 18.5 192.0 4100.0 \n", - "9 Dream 40.2 20.1 200.0 3975.0 \n", - "10 Dream 40.8 18.9 208.0 4300.0 \n", - "11 Dream 39.0 18.7 185.0 3650.0 \n", - "12 Dream 37.0 16.9 185.0 3000.0 \n", - "14 Dream 34.0 17.1 185.0 3400.0 \n", - "15 Dream 37.0 16.5 185.0 3400.0 \n", - "18 Dream 39.7 17.9 193.0 4250.0 \n", - "19 Dream 37.8 18.1 193.0 3750.0 \n", - "22 Dream 40.2 17.1 193.0 3400.0 \n", - "23 Dream 36.8 18.5 193.0 3500.0 \n", - "26 Dream 41.5 18.5 201.0 4000.0 \n", - "31 Dream 33.1 16.1 178.0 2900.0 \n", - "32 Dream 37.2 18.1 178.0 3900.0 \n", - "33 Dream 39.5 16.7 178.0 3250.0 \n", - "35 Dream 36.0 18.5 186.0 3100.0 \n", - "36 Dream 39.6 18.1 186.0 4450.0 \n", - "38 Dream 41.3 20.3 194.0 3550.0 \n", - "41 Dream 35.7 18.0 202.0 3550.0 \n", - "51 Dream 38.1 17.6 187.0 3425.0 \n", - "53 Dream 36.0 17.1 187.0 3700.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "0 Biscoe 40.1 18.9 188.0 \n", + "1 Torgersen 39.1 18.7 181.0 \n", + "4 Biscoe 43.2 19.0 197.0 \n", + "6 Biscoe 41.3 21.1 195.0 \n", + "11 Dream 38.1 18.6 190.0 \n", + "13 Biscoe 37.8 20.0 190.0 \n", + "14 Biscoe 35.0 17.9 190.0 \n", + "16 Torgersen 34.6 21.1 198.0 \n", + "19 Dream 37.2 18.1 178.0 \n", + "21 Biscoe 40.5 17.9 187.0 \n", + "23 Dream 42.2 18.5 180.0 \n", + "30 Dream 39.2 21.1 196.0 \n", + "32 Torgersen 42.9 17.6 196.0 \n", + "38 Dream 41.1 17.5 190.0 \n", + "40 Torgersen 38.6 21.2 191.0 \n", + "42 Biscoe 35.5 16.2 195.0 \n", + "44 Dream 39.2 18.6 190.0 \n", + "45 Torgersen 35.2 15.9 186.0 \n", + "46 Dream 43.2 18.5 192.0 \n", + "49 Biscoe 39.6 17.7 186.0 \n", + "53 Biscoe 45.6 20.3 191.0 \n", + "58 Torgersen 40.9 16.8 191.0 \n", + "60 Torgersen 40.3 18.0 195.0 \n", + "62 
Dream 36.0 18.5 186.0 \n", + "63 Torgersen 39.3 20.6 190.0 \n", "\n", - " sex \n", - "0 FEMALE \n", - "1 MALE \n", - "2 MALE \n", - "4 FEMALE \n", - "5 MALE \n", - "9 MALE \n", - "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "14 FEMALE \n", - "15 FEMALE \n", - "18 MALE \n", - "19 MALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "26 MALE \n", - "31 FEMALE \n", - "32 MALE \n", - "33 FEMALE \n", - "35 FEMALE \n", - "36 MALE \n", - "38 MALE \n", - "41 FEMALE \n", - "51 FEMALE \n", - "53 FEMALE \n", + " body_mass_g sex \n", + "0 4300.0 MALE \n", + "1 3750.0 MALE \n", + "4 4775.0 MALE \n", + "6 4400.0 MALE \n", + "11 3700.0 FEMALE \n", + "13 4250.0 MALE \n", + "14 3450.0 FEMALE \n", + "16 4400.0 MALE \n", + "19 3900.0 MALE \n", + "21 3200.0 FEMALE \n", + "23 3550.0 FEMALE \n", + "30 4150.0 MALE \n", + "32 4700.0 MALE \n", + "38 3900.0 MALE \n", + "40 3800.0 MALE \n", + "42 3350.0 FEMALE \n", + "44 4250.0 MALE \n", + "45 3050.0 FEMALE \n", + "46 4100.0 MALE \n", + "49 3500.0 FEMALE \n", + "53 4600.0 MALE \n", + "58 3700.0 FEMALE \n", + "60 3250.0 FEMALE \n", + "62 3100.0 FEMALE \n", + "63 3650.0 MALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 21, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -828,7 +858,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -850,18 +880,40 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cf14ebed505a4a92b4c72f51c82efe55", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 43c8fdc2-0bc3-4607-a36d-5bee87c894d8 is DONE. 28.9 kB processed.
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 97e0c84d-aa6a-4197-9377-740d973ea44d is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 726b9a5e-48a1-4ced-ac34-fa028dcb2bf4 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job dcef36e5-4bd6-40f8-88c6-72e84360533f is RUNNING. " ] }, "metadata": {}, @@ -873,7 +925,7 @@ "LinearRegression()" ] }, - "execution_count": 23, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -890,60 +942,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "66af192d9a784994b9d4a48a49c70721", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 87895ee3-81d0-4267-8a50-ab00e04664a7 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 2e3a6603-9f0e-44ff-9086-2e14ad50bd25 is RUNNING. " ] }, "metadata": {}, @@ -951,13 +959,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "163d87d9a2274142b31f5aafa145357a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5c71d3d9-0e1c-45bd-866f-1f98f056260d is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 357878f9-b705-4a03-aeeb-818a51873724 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -965,13 +971,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3d307bbbd60e431a8d5bbd2ef7c41e2b", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 890767f7-a83b-469a-9f3e-abd5667f8202 is DONE. 48 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 7d6c2e32-56e7-43ef-9b21-ccd2a25930ea is RUNNING. " ] }, "metadata": {}, @@ -1031,7 +1035,7 @@ "[1 rows x 6 columns]" ] }, - "execution_count": 24, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1043,32 +1047,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3966d9ee16b346cf943305112ce60fb6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job a25c445d-9b60-4a8d-a325-1bfacd32bc8d is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job b881b602-abfa-4c19-a385-2480b3e8b2bd is RUNNING. " ] }, "metadata": {}, @@ -1076,13 +1064,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2084e1cd66ba449081eda92350f72fd0", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5af493aa-96f9-434f-a101-ec855f4de694 is DONE. 8 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 03249520-93d3-4b2e-8976-f49cc4efe520 is RUNNING. " ] }, "metadata": {}, @@ -1090,13 +1076,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "68049943e6ad477988b9e65a962ecdf2", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e2076bc3-3966-4c45-8265-c461756a7782 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 31094013-70ea-415f-8b96-85c1af7ee9c8 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1104,13 +1088,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "341e6796def340cb9e0681ddeb40ff9d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e9cdfca7-30f6-4e93-95fb-244896e7c2ab is DONE. 16 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 0e456f33-4cb7-45a0-88e6-29324175b5a6 is RUNNING. " ] }, "metadata": {}, @@ -1142,8 +1124,8 @@ " \n", " \n", " \n", - " 292\n", - " 3459.735118\n", + " 334\n", + " 5891.735118\n", " \n", " \n", "\n", @@ -1152,12 +1134,12 @@ ], "text/plain": [ " predicted_body_mass_g\n", - "292 3459.735118\n", + "334 5891.735118\n", "\n", "[1 rows x 1 columns]" ] }, - "execution_count": 25, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1177,18 +1159,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "256ff43296a9405f890e78511acc38e5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Copy job cb4ef454-10df-4325-b9cb-6084df3ac9d5 is DONE. Open Job" + ], "text/plain": [ - "HTML(value='Copy job 1a273ccd-212a-4750-a3c1-615256af6d48 is RUNNING. " ] }, "metadata": {}, @@ -1200,14 +1180,14 @@ "LinearRegression(optimize_strategy='NORMAL_EQUATION')" ] }, - "execution_count": 26, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save the model to a permanent location in BigQuery, so we can use it in future sessions (and elsewhere in BQ)\n", - "model.to_gbq(\"bigframes-dev.bqml_tutorial.penguins_model\", replace=True)" + "model.to_gbq(penguins_model, replace=True)" ] }, { @@ -1219,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1228,7 +1208,7 @@ "LinearRegression(optimize_strategy='NORMAL_EQUATION')" ] }, - "execution_count": 27, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1236,7 +1216,7 @@ "source": [ "# WARNING - until b/281709360 is fixed & pipeline is updated, pipelines will load as models,\n", "# and details of their transform steps will be lost (the loaded model will behave the same)\n", - 
"bigframes.pandas.read_gbq_model(\"bigframes-dev.bqml_tutorial.penguins_model\")" + "bigframes.pandas.read_gbq_model(penguins_model)" ] } ], diff --git a/owlbot.py b/owlbot.py index 082970018d..dc84de7d8f 100644 --- a/owlbot.py +++ b/owlbot.py @@ -46,6 +46,7 @@ "noxfile.py", ".pre-commit-config.yaml", "README.rst", + "CONTRIBUTING.rst", ".github/release-trigger.yml", # BigQuery DataFrames manages its own Kokoro cluster for presubmit & continuous tests. ".kokoro/build.sh", diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py index 7cbc90d4c0..e4bead0e46 100644 --- a/samples/snippets/gen_ai_model_test.py +++ b/samples/snippets/gen_ai_model_test.py @@ -14,9 +14,14 @@ def test_llm_model(): - PROJECT_ID = "bigframes-dev" + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") REGION = "us" - CONN_NAME = "bigframes-ml" + CONN_NAME = "bigframes-default-connection" + # [START bigquery_dataframes_gen_ai_model] from bigframes.ml.llm import PaLM2TextGenerator import bigframes.pandas as bpd diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 5271574a49..9a7793a7e5 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -14,10 +14,16 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): - from google.cloud import bigquery + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + your_project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") # Construct a BigQuery client object. 
- client = bigquery.Client(project="bigframes-dev", location="us") + from google.cloud import bigquery + + client = bigquery.Client(project=your_project_id, location="us") query = """ SELECT * @@ -26,7 +32,6 @@ def test_bigquery_dataframes_load_data_from_bigquery_job(): """ query_job = client.query(query) JOB_ID = query_job.job_id - your_project_id = "bigframes-dev" # [START bigquery_dataframes_load_data_from_bigquery_job] from google.cloud import bigquery diff --git a/samples/snippets/quickstart_test.py b/samples/snippets/quickstart_test.py index bbe4a8b3c4..4abc87d011 100644 --- a/samples/snippets/quickstart_test.py +++ b/samples/snippets/quickstart_test.py @@ -25,7 +25,12 @@ def test_quickstart( # We need a fresh session since we're modifying connection options. bigframes.pandas.close_session() - # TODO(swast): Get project from environment so contributors can run tests. - quickstart.run_quickstart("bigframes-dev") + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + your_project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + + quickstart.run_quickstart(your_project_id) out, _ = capsys.readouterr() assert "average_body_mass (df_session):" in out diff --git a/samples/snippets/remote_function_test.py b/samples/snippets/remote_function_test.py index e1317c6ac0..8f891274de 100644 --- a/samples/snippets/remote_function_test.py +++ b/samples/snippets/remote_function_test.py @@ -25,8 +25,13 @@ def test_remote_function_and_read_gbq_function( # We need a fresh session since we're modifying connection options. bigframes.pandas.close_session() - # TODO(swast): Get project from environment so contributors can run tests. 
- remote_function.run_remote_function_and_read_gbq_function("bigframes-dev") + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + your_project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + + remote_function.run_remote_function_and_read_gbq_function(your_project_id) out, _ = capsys.readouterr() assert "Created BQ remote function:" in out assert "Created cloud function:" in out diff --git a/scripts/create_test_model_vertex.py b/scripts/create_test_model_vertex.py new file mode 100644 index 0000000000..946e54773e --- /dev/null +++ b/scripts/create_test_model_vertex.py @@ -0,0 +1,71 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import sys + +import bigframes.ml.linear_model +import bigframes.pandas + + +def create_vertex_model(vertex_model_name): + df = bigframes.pandas.read_gbq("bigquery-public-data.ml_datasets.penguins") + + # filter down to the data we want to analyze + adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"] + + # drop the columns we don't care about + adelie_data = adelie_data.drop(columns=["species"]) + + # drop rows with nulls to get our training data + training_data = adelie_data.dropna() + + feature_columns = training_data["culmen_length_mm"] + label_columns = training_data[["body_mass_g"]] + + # create model + model = bigframes.ml.linear_model.LinearRegression() + model.fit(feature_columns, label_columns) + + # register to Vertex Registry + model.register(vertex_model_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get top APIs for which there are no code samples in the docstring." + ) + parser.add_argument( + "-m", + "--model-name", + type=str, + required=True, + action="store", + help="Name of the model in Vertex.", + ) + parser.add_argument( + "-p", + "--project-id", + type=str, + required=False, + action="store", + help="Project id in which the model should be created. 
" + "By default, a project will be resolved as per https://cloud.google.com/python/docs/reference/google-cloud-core/latest/config#overview.", + ) + + args = parser.parse_args(sys.argv[1:]) + if args.project_id: + bigframes.pandas.options.bigquery.project = args.project_id + + create_vertex_model(args.model_name) diff --git a/scripts/setup-project-for-testing.sh b/scripts/setup-project-for-testing.sh new file mode 100755 index 0000000000..a160784c12 --- /dev/null +++ b/scripts/setup-project-for-testing.sh @@ -0,0 +1,256 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +if [ $# -lt 1 ]; then + echo "USAGE: `basename $0` []" + echo "EXAMPLES:" + echo " `basename $0` my-project" + echo " `basename $0` my-project user:user_id@example.com" + echo " `basename $0` my-project group:group_id@example.com" + echo " `basename $0` my-project serviceAccount:service_account_id@example.com" + exit 1 +fi + +PROJECT_ID=$1 +PRINCIPAL=$2 +BIGFRAMES_DEFAULT_CONNECTION_NAME=bigframes-default-connection +BIGFRAMES_RF_CONNECTION_NAME=bigframes-rf-conn + +if [ "$PRINCIPAL" != "" ]; then + echo $PRINCIPAL | grep -E "(user|group|serviceAccount):" >/dev/null + if [ $? -ne 0 ]; then + echo "principal must have prefix 'user:', 'group:' or 'serviceAccount:'" + exit 1 + fi +fi + +if ! test `which gcloud`; then + echo "gcloud CLI is not installed. 
Install it from https://cloud.google.com/sdk/docs/install." >&2 + exit 1 +fi + +################################################################################ +# Log and execute a command +################################################################################ +function log_and_execute() { + echo Running command: $* + $* +} + + +################################################################################ +# Enable APIs +################################################################################ +function enable_apis() { + for service in aiplatform.googleapis.com \ + bigquery.googleapis.com \ + bigqueryconnection.googleapis.com \ + bigquerystorage.googleapis.com \ + cloudbuild.googleapis.com \ + cloudfunctions.googleapis.com \ + cloudresourcemanager.googleapis.com \ + run.googleapis.com \ + ; do + log_and_execute gcloud --project=$PROJECT_ID services enable $service + if [ $? -ne 0 ]; then + echo "Failed to enable service $service, exiting..." + exit 1 + fi + done +} + + +################################################################################ +# Ensure a BQ connection exists with desired IAM rols +################################################################################ +function ensure_bq_connection_with_iam() { + if [ $# -ne 2 ]; then + echo "USAGE: `basename $0` " + echo "EXAMPLES:" + echo " `basename $0` my-project my-connection" + exit 1 + fi + + location=$1 + connection_name=$2 + + log_and_execute bq show \ + --connection \ + --project_id=$PROJECT_ID \ + --location=$location \ + $connection_name 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "Connection $connection_name doesn't exists in location \"$location\", creating..." + log_and_execute bq mk \ + --connection \ + --project_id=$PROJECT_ID \ + --location=$location \ + --connection_type=CLOUD_RESOURCE \ + $connection_name + if [ $? -ne 0 ]; then + echo "Failed creating connection, exiting." 
+ exit 1 + fi + else + echo "Connection $connection_name already exists in location $location." + fi + + compact_json_info_cmd="bq show --connection \ + --project_id=$PROJECT_ID \ + --location=$location \ + --format=json \ + $connection_name" + compact_json_info_cmd_output=`$compact_json_info_cmd` + if [ $? -ne 0 ]; then + echo "Failed to fetch connection info: $compact_json_info_cmd_output" + exit 1 + fi + + connection_service_account=`echo $compact_json_info_cmd_output | sed -e 's/.*"cloudResource":{"serviceAccountId":"//' -e 's/".*//'` + + # Configure roles for the service accounts associated with the connection + for role in run.invoker aiplatform.user; do + log_and_execute gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member=serviceAccount:$connection_service_account \ + --role=roles/$role + if [ $? -ne 0 ]; then + echo "Failed to set IAM, exiting..." + exit 1 + fi + done +} + + +################################################################################ +# Create the default BQ connection in US location +################################################################################ +function ensure_bq_connections_with_iam() { + ensure_bq_connection_with_iam "us" "$BIGFRAMES_DEFAULT_CONNECTION_NAME" + + # Create commonly used BQ connection in various locations + for location in asia-southeast1 \ + eu \ + europe-west4 \ + southamerica-west1 \ + us \ + us-central1 \ + ; do + ensure_bq_connection_with_iam "$location" "$BIGFRAMES_RF_CONNECTION_NAME" + done +} + + +################################################################################ +# Set up IAM roles for principal +################################################################################ +function setup_iam_roles () { + if [ "$PRINCIPAL" != "" ]; then + for role in aiplatform.user \ + bigquery.user \ + bigquery.connectionAdmin \ + bigquery.dataEditor \ + browser \ + cloudfunctions.developer \ + iam.serviceAccountUser \ + ; do + log_and_execute gcloud projects 
add-iam-policy-binding $PROJECT_ID \ + --member=$PRINCIPAL \ + --role=roles/$role + if [ $? -ne 0 ]; then + echo "Failed to set IAM, exiting..." + exit 1 + fi + done + fi +} + + +################################################################################ +# Create vertex endpoint for test ML model +################################################################################ +function create_bq_model_vertex_endpoint () { + vertex_region=us-central1 + model_name=bigframes-test-linreg2 + endpoint_name=$model_name-endpoint + + # Create vertex model + log_and_execute python scripts/create_test_model_vertex.py \ + -m $model_name \ + -p $PROJECT_ID + if [ $? -ne 0 ]; then + echo "Failed to create model, exiting..." + exit 1 + fi + + # Create vertex endpoint + log_and_execute gcloud ai endpoints create \ + --project=$PROJECT_ID \ + --region=$vertex_region \ + --display-name=$endpoint_name + if [ $? -ne 0 ]; then + echo "Failed to create vertex endpoint, exiting..." + exit 1 + fi + + # Fetch endpoint id + endpoint_id=`gcloud ai endpoints list \ + --project=$PROJECT_ID \ + --region=$vertex_region \ + --filter=display_name=$endpoint_name 2>/dev/null \ + | tail -n1 | cut -d' ' -f 1` + if [ "$endpoint_id" = "" ]; then + echo "Failed to fetch vertex endpoint id, exiting..." + exit 1 + fi + + # Deploy the model to the vertex endpoint + log_and_execute gcloud ai endpoints deploy-model $endpoint_id \ + --project=$PROJECT_ID \ + --region=$vertex_region \ + --model=$model_name \ + --display-name=$model_name + if [ $? -ne 0 ]; then + echo "Failed to deploy model to vertex endpoint, exiting..." + exit 1 + fi + + # Form the endpoint + endpoint_rel_path=`gcloud ai endpoints describe \ + --project=$PROJECT_ID \ + --region=us-central1 \ + $endpoint_id 2>/dev/null \ + | grep "^name:" | cut -d' ' -f2` + if [ "$endpoint_rel_path" = "" ]; then + echo "Failed to fetch vertex endpoint relativr path, exiting..." 
+ exit 1 + fi + endpoint_path=https://$vertex_region-aiplatform.googleapis.com/v1/$endpoint_rel_path + + # Print the endpoint configuration to be used in tests + echo + echo Run following command to set test model vertex endpoint: + echo export BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT=$endpoint_path +} + + +################################################################################ +# Set the things up +################################################################################ +enable_apis +ensure_bq_connections_with_iam +setup_iam_roles +create_bq_model_vertex_endpoint diff --git a/tests/system/conftest.py b/tests/system/conftest.py index f9f69c6c8e..0ad4280497 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -120,7 +120,6 @@ def session() -> bigframes.Session: def session_tokyo(tokyo_location: str) -> bigframes.Session: context = bigframes.BigQueryOptions( location=tokyo_location, - use_regional_endpoints=True, ) return bigframes.Session(context=context) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index a7049d4c18..953287def2 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -15,6 +15,7 @@ import pandas as pd from bigframes.ml import decomposition +import tests.system.utils def test_decomposition_configure_fit_score_predict( @@ -66,9 +67,10 @@ def test_decomposition_configure_fit_score_predict( dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal( - abs(result.sort_index()), # results may differ by a minus sign - abs(expected), + + tests.system.utils.assert_pandas_df_equal_pca( + result, + expected, check_exact=False, rtol=0.1, ) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 2929baf3f7..c128469bd2 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ 
-24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, assert_pandas_df_equal_pca def test_pipeline_linear_regression_fit_score_predict( @@ -430,17 +430,16 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal( - abs( # results may differ by a minus sign - predictions[ - [ - "principal_component_1", - "principal_component_2", - "principal_component_3", - ] + + assert_pandas_df_equal_pca( + predictions[ + [ + "principal_component_1", + "principal_component_2", + "principal_component_3", ] - ), - abs(expected), + ], + expected, check_exact=False, rtol=0.1, ) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 5cb4df188c..4b4c794a05 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -161,8 +161,10 @@ def make_uniq_udf(udf): @pytest.fixture(scope="module") def bq_cf_connection() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn + """Pre-created BQ connection in the test project in US location, used to + invoke cloud function. + + $ bq show --connection --location=us --project_id=PROJECT_ID bigframes-rf-conn """ return "bigframes-rf-conn" diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index c4a1272e44..e3180d2892 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os from typing import cast import uuid @@ -34,8 +35,8 @@ @pytest.fixture(scope="session") -def bq_connection() -> str: - return "bigframes-dev.us.bigframes-rf-conn" +def bq_connection(bigquery_client) -> str: + return f"{bigquery_client.project}.us.bigframes-rf-conn" @pytest.fixture(scope="session") @@ -252,10 +253,15 @@ def palm2_embedding_generator_multilingual_model( def linear_remote_model_params() -> dict: # Pre-deployed endpoint of linear reg model in Vertex. # bigframes-test-linreg2 -> bigframes-test-linreg-endpoint2 + model_vertex_endpoint = os.environ.get( + "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT", + "https://us-central1-aiplatform.googleapis.com/v1/projects/1084210331973/locations/us-central1/endpoints/3193318217619603456", + ) + return { "input": {"culmen_length_mm": "float64"}, "output": {"predicted_body_mass_g": "array"}, - "endpoint": "https://us-central1-aiplatform.googleapis.com/v1/projects/1084210331973/locations/us-central1/endpoints/3193318217619603456", + "endpoint": model_vertex_endpoint, } diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 915c4aa444..eece5ef21d 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -210,12 +210,12 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel) .sort_values(["principal_component_id", "feature"]) .reset_index(drop=True) ) - pd.testing.assert_frame_equal( + + tests.system.utils.assert_pandas_df_equal_pca_components( result, expected, check_exact=False, rtol=0.1, - # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame check_index_type=False, check_dtype=False, ) diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 42fea66cf8..9565b8f7a8 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -29,11 
+29,9 @@ def test_pca_predict(penguins_pca_model, new_penguins_df): dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal( - predictions.sort_index(), - expected, - check_exact=False, - rtol=0.1, + + tests.system.utils.assert_pandas_df_equal_pca( + predictions, expected, check_exact=False, rtol=0.1 ) @@ -115,7 +113,8 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA): .sort_values(["principal_component_id", "feature"]) .reset_index(drop=True) ) - pd.testing.assert_frame_equal( + + tests.system.utils.assert_pandas_df_equal_pca_components( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 306098548e..267a2ed9c1 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -24,14 +24,10 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None -def test_create_text_generator_32k_model(palm2_text_generator_32k_model): - # Model creation doesn't return error - assert palm2_text_generator_32k_model is not None - assert palm2_text_generator_32k_model._bqml_model is not None - - @pytest.mark.flaky(retries=2, delay=120) -def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): +def test_create_text_generator_model_default_session( + bq_connection, llm_text_pandas_df, bigquery_client +): import bigframes.pandas as bpd bpd.close_session() @@ -41,7 +37,10 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan model = llm.PaLM2TextGenerator() assert model is not None assert model._bqml_model is not None - assert model.connection_name.casefold() == "bigframes-dev.us.bigframes-rf-conn" + assert ( + model.connection_name.casefold() + == f"{bigquery_client.project}.us.bigframes-rf-conn" + ) llm_text_df = bpd.read_pandas(llm_text_pandas_df) @@ -54,7 +53,7 @@ def 
test_create_text_generator_model_default_session(bq_connection, llm_text_pan @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_32k_model_default_session( - bq_connection, llm_text_pandas_df + bq_connection, llm_text_pandas_df, bigquery_client ): import bigframes.pandas as bpd @@ -65,7 +64,10 @@ def test_create_text_generator_32k_model_default_session( model = llm.PaLM2TextGenerator(model_name="text-bison-32k") assert model is not None assert model._bqml_model is not None - assert model.connection_name.casefold() == "bigframes-dev.us.bigframes-rf-conn" + assert ( + model.connection_name.casefold() + == f"{bigquery_client.project}.us.bigframes-rf-conn" + ) llm_text_df = bpd.read_pandas(llm_text_pandas_df) @@ -77,7 +79,9 @@ def test_create_text_generator_32k_model_default_session( @pytest.mark.flaky(retries=2, delay=120) -def test_create_text_generator_model_default_connection(llm_text_pandas_df): +def test_create_text_generator_model_default_connection( + llm_text_pandas_df, bigquery_client +): from bigframes import _config import bigframes.pandas as bpd @@ -91,7 +95,7 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): assert model._bqml_model is not None assert ( model.connection_name.casefold() - == "bigframes-dev.us.bigframes-default-connection" + == f"{bigquery_client.project}.us.bigframes-default-connection" ) df = model.predict(llm_text_df).to_pandas() diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 960a384126..a98056d82a 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -23,40 +23,50 @@ @pytest.fixture(scope="module") def bq_cf_connection() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn + """Pre-created BQ connection in the test project in US location, used to + invoke cloud 
function. + + $ bq show --connection --location=us --project_id=PROJECT_ID bigframes-rf-conn """ return "bigframes-rf-conn" @pytest.fixture(scope="module") def bq_cf_connection_location() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn + """Pre-created BQ connection in the test project in US location, in format + LOCATION.CONNECTION_NAME, used to invoke cloud function. + + $ bq show --connection --location=us --project_id=PROJECT_ID bigframes-rf-conn """ return "us.bigframes-rf-conn" @pytest.fixture(scope="module") def bq_cf_connection_location_mismatched() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=eu --project_id=bigframes-dev bigframes-rf-conn + """Pre-created BQ connection in the test project in EU location, in format + LOCATION.CONNECTION_NAME, used to invoke cloud function. + + $ bq show --connection --location=eu --project_id=PROJECT_ID bigframes-rf-conn """ return "eu.bigframes-rf-conn" @pytest.fixture(scope="module") -def bq_cf_connection_location_project() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn +def bq_cf_connection_location_project(bigquery_client) -> str: + """Pre-created BQ connection in the test project in US location, in format + PROJECT_ID.LOCATION.CONNECTION_NAME, used to invoke cloud function.
+ + $ bq show --connection --location=us --project_id=PROJECT_ID bigframes-rf-conn """ - return "bigframes-dev.us.bigframes-rf-conn" + return f"{bigquery_client.project}.us.bigframes-rf-conn" @pytest.fixture(scope="module") def bq_cf_connection_location_project_mismatched() -> str: - """Pre-created BQ connection to invoke cloud function for bigframes-dev - $ bq show --connection --location=eu --project_id=bigframes-metrics bigframes-rf-conn + """Pre-created BQ connection in the bigframes-metrics project in EU location, + in format PROJECT_ID.LOCATION.CONNECTION_NAME, used to invoke cloud function. + + $ bq show --connection --location=eu --project_id=bigframes-metrics bigframes-rf-conn """ return "bigframes-metrics.eu.bigframes-rf-conn" diff --git a/tests/system/utils.py b/tests/system/utils.py index f7831972b8..f49b5ece31 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -143,3 +143,77 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["numeric_col"] = df["numeric_col"].apply( lambda value: decimal.Decimal(str(value)) if value else None # type: ignore ) + + +def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): + """Compare two pandas dataframes representing PCA components. The columns + required to be present in the dataframes are: + numerical_value: numeric, + categorical_value: List[object(category, value)] + + The index types of `actual` and `expected` are ignored in the comparison.
+ + Args: + actual: Actual Pandas DataFrame + + expected: Expected Pandas DataFrame + + kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column + """ + # Compare the index, columns and values separately, as the polarity of the + # PCA vectors can be arbitrary + pd.testing.assert_index_equal( + actual.index, expected.index.astype(actual.index.dtype) + ) # dtype agnostic index comparison + pd.testing.assert_index_equal(actual.columns, expected.columns) + for column in expected.columns: + try: + pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) + except AssertionError: + if column not in {"numerical_value", "categorical_value"}: + raise + + # Allow for sign difference per numeric/categorical column + if column == "numerical_value": + actual_ = -actual[column] + expected_ = expected[column] + else: + # In this column each element is an array of objects, where the + # object has attributes "category" and "value". For the sake of + # comparison let's normalize by flipping the polarity of "value". + def normalize_array_of_objects(arr, reverse_polarity=False): + newarr = [] + for element in arr: + newelement = dict(element) + if reverse_polarity: + newelement["value"] = -newelement["value"] + newarr.append(newelement) + return sorted(newarr, key=lambda d: d["category"]) + + actual_ = actual[column].apply(normalize_array_of_objects, args=(True,)) + expected_ = expected[column].apply(normalize_array_of_objects) + + pd.testing.assert_series_equal(actual_, expected_, **kwargs) + + +def assert_pandas_df_equal_pca(actual, expected, **kwargs): + """Compare two pandas dataframes representing PCA predictions. The columns + in the dataframes are expected to be numeric. 
+ + Args: + actual: Actual Pandas DataFrame + + expected: Expected Pandas DataFrame + + kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column + """ + # Compare the index, columns and values separately, as the polarity of the + # PCA vector can be arbitrary + pd.testing.assert_index_equal(actual.index, expected.index) + pd.testing.assert_index_equal(actual.columns, expected.columns) + for column in expected.columns: + try: + pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) + except AssertionError: + # Allow for sign difference per column + pd.testing.assert_series_equal(-actual[column], expected[column], **kwargs) From 9cde708bb4a94d3ba35ecdf298cc80bc5680e7b4 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 04:59:17 +0000 Subject: [PATCH 20/20] chore(main): release 0.16.0 (#250) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef75a017e0..68ea51707c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,39 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.16.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.15.0...v0.16.0) (2023-12-12) + + +### Features + +* Add ARIMAPlus.predict parameters ([#264](https://github.com/googleapis/python-bigquery-dataframes/issues/264)) ([99598c7](https://github.com/googleapis/python-bigquery-dataframes/commit/99598c7d359f1d1e0671dcf27a5c77094f3c7f67)) +* Add DataFrame from_dict and from_records methods ([#244](https://github.com/googleapis/python-bigquery-dataframes/issues/244))
([8d81e24](https://github.com/googleapis/python-bigquery-dataframes/commit/8d81e24677613dcf4d275c27a327384b8c17bc85)) +* Add DataFrame.select_dtypes method ([#242](https://github.com/googleapis/python-bigquery-dataframes/issues/242)) ([1737acc](https://github.com/googleapis/python-bigquery-dataframes/commit/1737acc51b4fdd9b385bbf91a758efd2e7ead11a)) +* Add nunique method to Series/DataFrameGroupby ([#256](https://github.com/googleapis/python-bigquery-dataframes/issues/256)) ([c8ec245](https://github.com/googleapis/python-bigquery-dataframes/commit/c8ec245070402aa0770bc9b2375693de674ca925)) +* Support dataframe.loc with conditional columns selection ([#233](https://github.com/googleapis/python-bigquery-dataframes/issues/233)) ([3febea9](https://github.com/googleapis/python-bigquery-dataframes/commit/3febea99358d10f823d43c3af83ea30458e579a2)) + + +### Bug Fixes + +* Enforce pandas version requirement <2.1.4 ([#265](https://github.com/googleapis/python-bigquery-dataframes/issues/265)) ([9dd63f6](https://github.com/googleapis/python-bigquery-dataframes/commit/9dd63f6dcb6234e1f3aebd63c59e1e5c717099dc)) +* Exclude pandas 2.1.4 from prerelease tests to unblock e2e tests ([b02fc2c](https://github.com/googleapis/python-bigquery-dataframes/commit/b02fc2c1843e18d3a8d6894c64763f53e6af1b73)) +* Fix value_counts column label for normalize=True ([#245](https://github.com/googleapis/python-bigquery-dataframes/issues/245)) ([d3fa6f2](https://github.com/googleapis/python-bigquery-dataframes/commit/d3fa6f26931d5d0f0ae3fa49baccfc148f870417)) +* Migrate e2e tests to bigframes-load-testing project
([8766ac6](https://github.com/googleapis/python-bigquery-dataframes/commit/8766ac63f501929577f71e6bd2b523e92c43ba66)) +* Ml.sql logic ([#262](https://github.com/googleapis/python-bigquery-dataframes/issues/262)) ([68c6fdf](https://github.com/googleapis/python-bigquery-dataframes/commit/68c6fdf78af8b87fa4ef4f832631f24d7433a4d8)) +* Update the llm_kmeans notebook ([#247](https://github.com/googleapis/python-bigquery-dataframes/issues/247)) ([66d1839](https://github.com/googleapis/python-bigquery-dataframes/commit/66d1839c3e9a3011c7feb13a59d966b64cf8313f)) + + +### Documentation + +* Add code samples for `shape` and `head` ([#257](https://github.com/googleapis/python-bigquery-dataframes/issues/257)) ([5bdcc65](https://github.com/googleapis/python-bigquery-dataframes/commit/5bdcc6594ef2e99e96636341d286ea70420858fe)) +* Add example for dataframe.melt, dataframe.pivot, dataframe.stac… ([#252](https://github.com/googleapis/python-bigquery-dataframes/issues/252)) ([8c63697](https://github.com/googleapis/python-bigquery-dataframes/commit/8c636978f4a21eda2856862100b7a8272797fe42)) +* Add example to dataframe.nlargest, dataframe.nsmallest, datafra… ([#234](https://github.com/googleapis/python-bigquery-dataframes/issues/234)) ([e735412](https://github.com/googleapis/python-bigquery-dataframes/commit/e735412fdc52d034df92dd5462d6956bdc0167be)) +* Add examples for dataframe.cummin, dataframe.cummax, dataframe.cumsum, dataframe.cumprod ([#243](https://github.com/googleapis/python-bigquery-dataframes/issues/243)) ([0523a31](https://github.com/googleapis/python-bigquery-dataframes/commit/0523a31fa0b589f88afe0ad5b447634409ddeb86)) +* Add
examples for dataframe.nunique, dataframe.diff, dataframe.a… ([#251](https://github.com/googleapis/python-bigquery-dataframes/issues/251)) ([77074ec](https://github.com/googleapis/python-bigquery-dataframes/commit/77074ecbe7f52d1d7d1d1dc537fbe4062b407672)) +* Correct the docs for `option_context` ([#263](https://github.com/googleapis/python-bigquery-dataframes/issues/263)) ([d21c6dd](https://github.com/googleapis/python-bigquery-dataframes/commit/d21c6dd26eadd64c526b0fd35b977a74b8334562)) +* Correct the params rendering for `ml.remote` and `ml.ensemble` modules ([#248](https://github.com/googleapis/python-bigquery-dataframes/issues/248)) ([c2829e3](https://github.com/googleapis/python-bigquery-dataframes/commit/c2829e3d976a43c53251c9288266e3a8ec5304c5)) +* Fix return annotation in API docstrings ([#253](https://github.com/googleapis/python-bigquery-dataframes/issues/253)) ([89a1c67](https://github.com/googleapis/python-bigquery-dataframes/commit/89a1c67fa5cbb76c1cc6ae24d5f919e22514705c)) + ## [0.15.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.14.1...v0.15.0) (2023-11-29) diff --git a/bigframes/version.py b/bigframes/version.py index 920cb95c3d..3ddf7e0f79 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.15.0" +__version__ = "0.16.0"