diff --git a/CHANGELOG.md b/CHANGELOG.md index bcb062f08f..a3314c976e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.1.0...v1.2.0) (2024-04-15) + + +### Features + +* Add hasnans, combine_first, update to Series ([#600](https://github.com/googleapis/python-bigquery-dataframes/issues/600)) ([86e0f38](https://github.com/googleapis/python-bigquery-dataframes/commit/86e0f38adc71d76e09dd832e5e33cb7c1aab02ac)) +* Add MultiIndex subclass. ([#596](https://github.com/googleapis/python-bigquery-dataframes/issues/596)) ([5d0f149](https://github.com/googleapis/python-bigquery-dataframes/commit/5d0f149dce5425098fcd154d96a302c1661ce5d3)) +* Add pivot_table for DataFrame. ([#473](https://github.com/googleapis/python-bigquery-dataframes/issues/473)) ([5f1d670](https://github.com/googleapis/python-bigquery-dataframes/commit/5f1d670e6b839a30acdb495a05011c2ce4e0c7a4)) +* Add Series.autocorr ([#605](https://github.com/googleapis/python-bigquery-dataframes/issues/605)) ([4ec8034](https://github.com/googleapis/python-bigquery-dataframes/commit/4ec80340459e675b82b437f6c48b2872d362bafe)) +* Support list of numerics in pandas.cut ([#580](https://github.com/googleapis/python-bigquery-dataframes/issues/580)) ([290f95d](https://github.com/googleapis/python-bigquery-dataframes/commit/290f95dc5198f9ab7cd9d726d40af704250c0449)) + + +### Bug Fixes + +* Address more technical writers feedback ([#581](https://github.com/googleapis/python-bigquery-dataframes/issues/581)) 
([4b08d92](https://github.com/googleapis/python-bigquery-dataframes/commit/4b08d9243272229f71688152dbeb69d0ab7c68b4)) +* Error for object dtype on read_pandas ([#570](https://github.com/googleapis/python-bigquery-dataframes/issues/570)) ([8702dcf](https://github.com/googleapis/python-bigquery-dataframes/commit/8702dcf54c0f2073e21df42eaef51927481da421)) +* Inverting int now does bitwise inversion rather than sign flip ([#574](https://github.com/googleapis/python-bigquery-dataframes/issues/574)) ([5f1db8b](https://github.com/googleapis/python-bigquery-dataframes/commit/5f1db8b270b32ab366be3690761da137d9fe65f5)) +* Loc setitem dtype issue. ([#603](https://github.com/googleapis/python-bigquery-dataframes/issues/603)) ([b94bae9](https://github.com/googleapis/python-bigquery-dataframes/commit/b94bae9892e0fa79dc4bde0f4f1427d00accda6d)) +* Toc menu missing plotting name ([#591](https://github.com/googleapis/python-bigquery-dataframes/issues/591)) ([eed12c1](https://github.com/googleapis/python-bigquery-dataframes/commit/eed12c181ff8724333b1c426a0eb442c627528b8)) + + +### Documentation + +* (Series|Dataframe).dtypes ([#598](https://github.com/googleapis/python-bigquery-dataframes/issues/598)) ([edef48f](https://github.com/googleapis/python-bigquery-dataframes/commit/edef48f7a93e19bc1f6d37fb041dfd6314d881d5)) +* Add code samples for `str` accessor methods ([#594](https://github.com/googleapis/python-bigquery-dataframes/issues/594)) ([a557ea2](https://github.com/googleapis/python-bigquery-dataframes/commit/a557ea2b64633932f730b56688f76806da6195fb)) +* Add docs for `DataFrame` and `Series` dunder methods 
([#562](https://github.com/googleapis/python-bigquery-dataframes/issues/562)) ([8fc26c4](https://github.com/googleapis/python-bigquery-dataframes/commit/8fc26c424b29a8b78542372e402fcc4e8fface7b)) +* Add examples for at/iat ([#582](https://github.com/googleapis/python-bigquery-dataframes/issues/582)) ([3be4a2e](https://github.com/googleapis/python-bigquery-dataframes/commit/3be4a2e784e046ca9a1fac8d386d072537b6c4de)) + ## [1.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.0.0...v1.1.0) (2024-04-04) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 2b849c558a..81ef044f4d 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -21,7 +21,7 @@ @dataclasses.dataclass class ComputeOptions: """ - Encapsulates configuration for compute options. + Encapsulates the configuration for compute options. **Examples:** @@ -39,7 +39,7 @@ class ComputeOptions: Limits the bytes billed for query jobs. Queries that will have bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. - See `maximum_bytes_billed `_. + See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. enable_multi_query_execution (bool, Options): If enabled, large queries may be factored into multiple smaller queries in order to avoid generating queries that are too complex for the query diff --git a/bigframes/constants.py b/bigframes/constants.py index a1ffd2b755..0751501085 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -27,3 +27,68 @@ ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. 
Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}" DEFAULT_EXPIRATION = datetime.timedelta(days=7) + +# https://cloud.google.com/bigquery/docs/locations +ALL_BIGQUERY_LOCATIONS = frozenset( + { + "us-east5", + "us-south1", + "us-central1", + "us-west4", + "us-west2", + "northamerica-northeast1", + "us-east4", + "us-west1", + "us-west3", + "southamerica-east1", + "southamerica-west1", + "us-east1", + "northamerica-northeast2", + "asia-south2", + "asia-east2", + "asia-southeast2", + "australia-southeast2", + "asia-south1", + "asia-northeast2", + "asia-northeast3", + "asia-southeast1", + "australia-southeast1", + "asia-east1", + "asia-northeast1", + "europe-west1", + "europe-west10", + "europe-north1", + "europe-west3", + "europe-west2", + "europe-southwest1", + "europe-west8", + "europe-west4", + "europe-west9", + "europe-west12", + "europe-central2", + "europe-west6", + "me-central2", + "me-central1", + "me-west1", + "me-central2", + "me-central1", + "me-west1", + "africa-south1", + } +) + +# https://cloud.google.com/storage/docs/regional-endpoints +REP_ENABLED_BIGQUERY_LOCATIONS = frozenset( + { + "me-central2", + "europe-west9", + "europe-west3", + "us-east4", + "us-west1", + } +) + +# https://cloud.google.com/storage/docs/locational-endpoints +LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( + ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS +) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9358dab1b1..3fa690ef37 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -106,8 +106,7 @@ def session(self) -> Session: @functools.cached_property def schema(self) -> schemata.ArraySchema: - # TODO: switch to use self.node.schema - return self._compiled_schema + return self.node.schema @functools.cached_property def _compiled_schema(self) -> schemata.ArraySchema: diff --git 
a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c7b41e93eb..5b411e5416 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -24,6 +24,7 @@ import dataclasses import functools import itertools +import os import random import typing from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple @@ -41,10 +42,12 @@ import bigframes.core.guid as guid import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.schema as bf_schema import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.session._io.pandas @@ -411,7 +414,32 @@ def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - return self.session._rows_to_dataframe(result, dtypes) + result_dataframe = self.session._rows_to_dataframe(result, dtypes) + # Runs strict validations to ensure internal type predictions and ibis are completely in sync + # Do not execute these validations outside of testing suite. + if "PYTEST_CURRENT_TEST" in os.environ: + self._validate_result_schema(result_dataframe) + return result_dataframe + + def _validate_result_schema(self, result_df: pd.DataFrame): + ibis_schema = self.expr._compiled_schema + internal_schema = self.expr.node.schema + actual_schema = bf_schema.ArraySchema( + tuple( + bf_schema.SchemaItem(name, dtype) # type: ignore + for name, dtype in result_df.dtypes.items() + ) + ) + if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + return + if internal_schema != actual_schema: + raise ValueError( + f"This error should only occur while testing. 
BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}" + ) + if ibis_schema != actual_schema: + raise ValueError( + f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" + ) def to_pandas( self, @@ -1204,7 +1232,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp # TODO: annotate aggregations themself with this information dtype = self.expr.get_column_type(column_id) stats: list[agg_ops.UnaryAggregateOp] = [agg_ops.count_op] - if dtype not in bigframes.dtypes.UNORDERED_DTYPES: + if bigframes.dtypes.is_orderable(dtype): stats += [agg_ops.min_op, agg_ops.max_op] if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 5c165fa1df..53a25d63ed 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -397,7 +397,7 @@ def expm1_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.invert_op) def invert_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).negate() + return x.__invert__() ## String Operation diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index f1a3d723ac..dbf25891bf 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,8 +23,8 @@ import ibis.expr.types as ibis_types import bigframes.core.compile.compiled as compiled +import bigframes.core.guid as guids import bigframes.core.join_def as join_defs -import bigframes.core.joins as joining import bigframes.core.ordering as orderings @@ -50,9 +50,13 @@ def join_by_column_ordered( finally, all the right columns. 
""" - l_hidden_mapping, r_hidden_mapping = joining.JoinNameRemapper(namespace="hidden")( - left._hidden_column_ids, right._hidden_column_ids - ) + l_hidden_mapping = { + id: guids.generate_guid("hidden_") for id in left._hidden_column_ids + } + r_hidden_mapping = { + id: guids.generate_guid("hidden_") for id in right._hidden_column_ids + } + l_mapping = {**join.get_left_mapping(), **l_hidden_mapping} r_mapping = {**join.get_right_mapping(), **r_hidden_mapping} diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py index 98f854ad72..1ef329b0c7 100644 --- a/bigframes/core/convert.py +++ b/bigframes/core/convert.py @@ -13,13 +13,27 @@ # limitations under the License. from __future__ import annotations +from typing import Optional + import pandas as pd import bigframes.core.indexes as index import bigframes.series as series -def to_bf_series(obj, default_index: index.Index) -> series.Series: +def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series: + """ + Convert an object to a bigframes Series. + + Args: + obj (list-like or Series): + Object to convert to bigframes Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns: + bigframes.pandas.Series + """ if isinstance(obj, series.Series): return obj if isinstance(obj, pd.Series): @@ -35,6 +49,18 @@ def to_bf_series(obj, default_index: index.Index) -> series.Series: def to_pd_series(obj, default_index: pd.Index) -> pd.Series: + """ + Convert an object to a pandas Series. + + Args: + obj (list-like or Series): + Object to convert to pandas Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns: + pandas.Series + """ if isinstance(obj, series.Series): return obj.to_pandas() if isinstance(obj, pd.Series): diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index da6f3f3740..bc03bd1df0 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -192,7 +192,15 @@ def 
__setitem__( and isinstance(key[0], bigframes.series.Series) and key[0].dtype == "boolean" ) and pd.api.types.is_scalar(value): - new_column = key[0].map({True: value, False: None}) + # When an integer scalar is assigned to a new column, the dtype defaults to float. + # When it is assigned to an existing Int64 column, the dtype stays integer. + # So use a different null value (pd.NA vs. None) in each case to match this behavior. + new_column = key[0].map( + { + True: value, + False: pd.NA if key[1] in self._dataframe.columns else None, + } + ) try: original_column = self._dataframe[key[1]] except KeyError: diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index ae6011ffa5..0a95adcd83 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -13,7 +13,9 @@ # limitations under the License. from bigframes.core.indexes.base import Index +from bigframes.core.indexes.multi import MultiIndex __all__ = [ "Index", + "MultiIndex", ] diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index daa52a02b9..46a9e30637 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -42,9 +42,15 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ - - def __init__( - self, + _query_job = None + _block: blocks.Block + _linked_frame: Union[ + bigframes.dataframe.DataFrame, bigframes.series.Series, None + ] = None + + # Overrides __new__ to create subclasses like pandas does + def __new__( + cls, data=None, dtype=None, *, @@ -73,18 +79,30 @@ def __init__( if dtype is not None: index = index.astype(dtype) block = index._block + elif isinstance(data, pandas.Index): + pd_df = pandas.DataFrame(index=data) + block = df.DataFrame(pd_df, session=session)._block else: pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block - self._query_job = None - 
self._block: blocks.Block = block + + # TODO: Support more index subtypes + from bigframes.core.indexes.multi import MultiIndex + + klass = MultiIndex if len(block._index_columns) > 1 else cls + result = typing.cast(Index, object.__new__(klass)) + result._query_job = None + result._block = block + return result @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return FrameIndex(frame) + index = Index(frame._block) + index._linked_frame = frame + return index @property def name(self) -> blocks.Label: @@ -107,6 +125,10 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): new_block = self._block.with_index_labels(values) + if self._linked_frame is not None: + self._linked_frame._set_block( + self._linked_frame._block.with_index_labels(values) + ) self._block = new_block @property @@ -452,26 +474,3 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - - -# Index that mutates the originating dataframe/series -class FrameIndex(Index): - def __init__( - self, - series_or_dataframe: typing.Union[ - bigframes.series.Series, bigframes.dataframe.DataFrame - ], - ): - super().__init__(series_or_dataframe._block) - self._whole_frame = series_or_dataframe - - @property - def names(self) -> typing.Sequence[blocks.Label]: - """Returns the names of the Index.""" - return self._block._index_labels - - @names.setter - def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._whole_frame._get_block().with_index_labels(values) - self._whole_frame._set_block(new_block) - self._block = new_block diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py new file mode 100644 index 0000000000..182d1f101c --- /dev/null +++ b/bigframes/core/indexes/multi.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# 
you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import cast, Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex +import pandas + +from bigframes.core.indexes.base import Index + + +class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): + __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) diff --git a/bigframes/core/joins/__init__.py b/bigframes/core/joins/__init__.py index 415ee4e49d..3c5b9605a3 100644 --- a/bigframes/core/joins/__init__.py +++ b/bigframes/core/joins/__init__.py @@ -15,6 +15,7 @@ """Helpers to join ArrayValue objects.""" from bigframes.core.joins.merge import merge -from bigframes.core.joins.name_resolution import JoinNameRemapper -__all__ = ("merge", "JoinNameRemapper") +__all__ = [ + 
"merge", +] diff --git a/bigframes/core/joins/name_resolution.py b/bigframes/core/joins/name_resolution.py deleted file mode 100644 index f648d28ad2..0000000000 --- a/bigframes/core/joins/name_resolution.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -from typing import Mapping, Sequence, Tuple - - -class JoinNameRemapper: - def __init__(self, namespace: str) -> None: - self._namespace = namespace - - def __call__( - self, left_column_ids: Sequence[str], right_column_ids: Sequence[str] - ) -> Tuple[Mapping[str, str], Mapping[str, str]]: - """ - When joining column ids from different namespaces, this function defines how names are remapped. - - Take care to map value column ids and hidden column ids in separate namespaces. This is important because value - column ids must be deterministic as they are referenced by dependent operators. The generation of hidden ids is - dependent on compilation context, and should be completely separated from value column id mappings. - """ - # This naming strategy depends on the number of value columns in source tables. 
- # This means column id mappings must be adjusted if pushing operations above or below join in transformation - new_left_ids = { - col: f"{self._namespace}_l_{i}" for i, col in enumerate(left_column_ids) - } - new_right_ids = { - col: f"{self._namespace}_r_{i}" for i, col in enumerate(right_column_ids) - } - return new_left_ids, new_right_ids diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index e3ed8edd21..6bcc25319b 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,7 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Tuple, Union +from typing import Iterable, Literal, Optional, Union import pandas as pd @@ -113,7 +113,7 @@ def cut( bins: Union[ int, pd.IntervalIndex, - Iterable[Tuple[Union[int, float], Union[int, float]]], + Iterable, ], *, labels: Optional[bool] = None, @@ -125,9 +125,29 @@ def cut( if isinstance(bins, pd.IntervalIndex): as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) - else: + elif len(list(bins)) == 0: + raise ValueError("`bins` iterable should have at least one item") + elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) + elif pd.api.types.is_number(list(bins)[0]): + bins_list = list(bins) + if len(bins_list) < 2: + raise ValueError( + "`bins` iterable of numeric breaks should have" + " at least two items" + ) + as_index = pd.IntervalIndex.from_breaks(bins_list) + single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list]) + numeric_type = type(bins_list[0]) if single_type else float + bins = tuple( + [ + (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1])) + for i in range(len(bins_list) - 1) + ] + ) + else: + raise ValueError("`bins` iterable should contain tuples or numerics") if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") diff 
--git a/bigframes/dataframe.py b/bigframes/dataframe.py index 460d1056a3..2deef95277 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,7 +17,7 @@ from __future__ import annotations import datetime -import os +import inspect import re import sys import textwrap @@ -175,11 +175,6 @@ def __init__( self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() self._query_job: Optional[bigquery.QueryJob] = None - # Runs strict validations to ensure internal type predictions and ibis are completely in sync - # Do not execute these validations outside of testing suite. - if "PYTEST_CURRENT_TEST" in os.environ: - self._block.expr.validate_schema() - def __dir__(self): return dir(type(self)) + [ label @@ -320,6 +315,8 @@ def __len__(self): rows, _ = self.shape return rows + __len__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__len__) + def __iter__(self): return iter(self.columns) @@ -472,7 +469,6 @@ def __getitem__( bigframes.series.Series, ], ): # No return type annotations (like pandas) as type cannot always be determined statically - """Gets the specified column(s) from the DataFrame.""" # NOTE: This implements the operations described in # https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html @@ -504,6 +500,8 @@ def __getitem__( return DataFrame(self._block.select_columns(selected_ids)) + __getitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__getitem__) + def _getitem_label(self, key: blocks.Label): col_ids = self._block.cols_matching_label(key) if len(col_ids) == 0: @@ -648,14 +646,11 @@ def _repr_html_(self) -> str: return html_string def __setitem__(self, key: str, value: SingleItemValue): - """Modify or insert a column into the DataFrame. - - Note: This does **not** modify the original table the DataFrame was - derived from. 
- """ df = self._assign_single_item(key, value) self._set_block(df._get_block()) + __setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__) + def _apply_binop( self, other: float | int | bigframes.series.Series | DataFrame, @@ -844,32 +839,50 @@ def _apply_dataframe_binop( def eq(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.eq_op, axis=axis) + def __eq__(self, other) -> DataFrame: # type: ignore + return self.eq(other) + + __eq__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__eq__) + def ne(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ne_op, axis=axis) - __eq__ = eq # type: ignore + def __ne__(self, other) -> DataFrame: # type: ignore + return self.ne(other) - __ne__ = ne # type: ignore + __ne__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__ne__) def le(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.le_op, axis=axis) + def __le__(self, other) -> DataFrame: + return self.le(other) + + __le__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__le__) + def lt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.lt_op, axis=axis) + def __lt__(self, other) -> DataFrame: + return self.lt(other) + + __lt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__lt__) + def ge(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.ge_op, axis=axis) - def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: - return self._apply_binop(other, ops.gt_op, axis=axis) + def __ge__(self, other) -> DataFrame: + return self.ge(other) - __lt__ = lt + __ge__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__ge__) - __le__ = le + def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: + return 
self._apply_binop(other, ops.gt_op, axis=axis) - __gt__ = gt + def __gt__(self, other) -> DataFrame: + return self.gt(other) - __ge__ = ge + __gt__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__gt__) def add( self, @@ -880,7 +893,21 @@ def add( # TODO(swast): Support level parameter with MultiIndex. return self._apply_binop(other, ops.add_op, axis=axis) - __radd__ = __add__ = radd = add + def radd( + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", + ) -> DataFrame: + # TODO(swast): Support fill_value parameter. + # TODO(swast): Support level parameter with MultiIndex. + return self.add(other, axis=axis) + + def __add__(self, other) -> DataFrame: + return self.add(other) + + __add__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__add__) + + __radd__ = __add__ def sub( self, @@ -889,7 +916,13 @@ def sub( ) -> DataFrame: return self._apply_binop(other, ops.sub_op, axis=axis) - __sub__ = subtract = sub + subtract = sub + subtract.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.sub) + + def __sub__(self, other): + return self.sub(other) + + __sub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__sub__) def rsub( self, @@ -898,7 +931,10 @@ def rsub( ) -> DataFrame: return self._apply_binop(other, ops.sub_op, axis=axis, reverse=True) - __rsub__ = rsub + def __rsub__(self, other): + return self.rsub(other) + + __rsub__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rsub__) def mul( self, @@ -907,7 +943,25 @@ def mul( ) -> DataFrame: return self._apply_binop(other, ops.mul_op, axis=axis) - __rmul__ = __mul__ = rmul = multiply = mul + multiply = mul + multiply.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.mul) + + def __mul__(self, other): + return self.mul(other) + + __mul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mul__) + + def rmul( + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", + ) -> 
DataFrame: + return self.mul(other, axis=axis) + + def __rmul__(self, other): + return self.rmul(other) + + __rmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmul__) def truediv( self, @@ -916,7 +970,13 @@ def truediv( ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis) - div = divide = __truediv__ = truediv + truediv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.truediv) + div = divide = truediv + + def __truediv__(self, other): + return self.truediv(other) + + __truediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__truediv__) def rtruediv( self, @@ -925,7 +985,13 @@ def rtruediv( ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) - __rtruediv__ = rdiv = rtruediv + rdiv = rtruediv + rdiv.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.rtruediv) + + def __rtruediv__(self, other): + return self.rtruediv(other) + + __rtruediv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rtruediv__) def floordiv( self, @@ -934,7 +1000,10 @@ def floordiv( ) -> DataFrame: return self._apply_binop(other, ops.floordiv_op, axis=axis) - __floordiv__ = floordiv + def __floordiv__(self, other): + return self.floordiv(other) + + __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__floordiv__) def rfloordiv( self, @@ -943,31 +1012,48 @@ def rfloordiv( ) -> DataFrame: return self._apply_binop(other, ops.floordiv_op, axis=axis, reverse=True) - __rfloordiv__ = rfloordiv + def __rfloordiv__(self, other): + return self.rfloordiv(other) + + __rfloordiv__.__doc__ = inspect.getdoc( + vendored_pandas_frame.DataFrame.__rfloordiv__ + ) def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis) + def __mod__(self, other): + return self.mod(other) + + __mod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__mod__) + def rmod(self, other: int | 
bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) - __mod__ = mod + def __rmod__(self, other): + return self.rmod(other) - __rmod__ = rmod + __rmod__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rmod__) def pow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: return self._apply_binop(other, ops.pow_op, axis=axis) + def __pow__(self, other): + return self.pow(other) + + __pow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__pow__) + def rpow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: return self._apply_binop(other, ops.pow_op, axis=axis, reverse=True) - __pow__ = pow + def __rpow__(self, other): + return self.rpow(other) - __rpow__ = rpow + __rpow__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__rpow__) def align( self, @@ -1977,6 +2063,7 @@ def prod( return bigframes.series.Series(block.select_column("values")) product = prod + product.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.prod) def count(self, *, numeric_only: bool = False) -> bigframes.series.Series: if not numeric_only: @@ -2016,6 +2103,7 @@ def agg( ) aggregate = agg + aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) def idxmin(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmin(self._block)) @@ -2089,6 +2177,7 @@ def kurt(self, *, numeric_only: bool = False): return bigframes.series.Series(result_block) kurtosis = kurt + kurtosis.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.kurt) def _pivot( self, @@ -2138,6 +2227,66 @@ def pivot( ) -> DataFrame: return self._pivot(columns=columns, index=index, values=values) + def pivot_table( + self, + values: typing.Optional[ + typing.Union[blocks.Label, Sequence[blocks.Label]] + ] = None, + index: typing.Optional[ + typing.Union[blocks.Label, 
Sequence[blocks.Label]] + ] = None, + columns: typing.Union[blocks.Label, Sequence[blocks.Label]] = None, + aggfunc: str = "mean", + ) -> DataFrame: + if isinstance(index, Iterable) and not ( + isinstance(index, blocks.Label) and index in self.columns + ): + index = list(index) + else: + index = [index] + + if isinstance(columns, Iterable) and not ( + isinstance(columns, blocks.Label) and columns in self.columns + ): + columns = list(columns) + else: + columns = [columns] + + if isinstance(values, Iterable) and not ( + isinstance(values, blocks.Label) and values in self.columns + ): + values = list(values) + else: + values = [values] + + # Unlike pivot, pivot_table has values always ordered. + values.sort() + + keys = index + columns + agged = self.groupby(keys, dropna=True)[values].agg(aggfunc) + + if isinstance(agged, bigframes.series.Series): + agged = agged.to_frame() + + agged = agged.dropna(how="all") + + if len(values) == 1: + agged = agged.rename(columns={agged.columns[0]: values[0]}) + + agged = agged.reset_index() + + pivoted = agged.pivot( + columns=columns, + index=index, + values=values if len(values) > 1 else None, + ).sort_index() + + # TODO: Remove the reordering step once the issue is resolved. + # The pivot_table method results in multi-index columns that are always ordered. + # However, the order of the pivoted result columns is not guaranteed to be sorted. + # Sort and reorder. 
+ return pivoted[pivoted.columns.sort_values()] + def stack(self, level: LevelsType = -1): if not isinstance(self.columns, pandas.MultiIndex): if level not in [0, -1, self.columns.name]: @@ -2488,11 +2637,13 @@ def isna(self) -> DataFrame: return self._apply_unary_op(ops.isnull_op) isnull = isna + isnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.isna) def notna(self) -> DataFrame: return self._apply_unary_op(ops.notnull_op) notnull = notna + notnull.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.notna) def cumsum(self): is_numeric_types = [ @@ -2806,7 +2957,10 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - __array__ = to_numpy + def __array__(self, dtype=None) -> numpy.ndarray: + return self.to_numpy(dtype=dtype) + + __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) def to_parquet( self, @@ -3173,6 +3327,7 @@ def first_valid_index(self): return applymap = map + applymap.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.map) def _slice( self, @@ -3313,4 +3468,7 @@ def get_right_id(id): def plot(self): return plotting.PlotAccessor(self) - __matmul__ = dot + def __matmul__(self, other) -> DataFrame: + return self.dot(other) + + __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index c5bf5db2fe..3b2092bf85 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -57,13 +57,11 @@ TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) +GEO_DTYPE = gpd.array.GeometryDtype() # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE -# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable -UNORDERED_DTYPES = [gpd.array.GeometryDtype()] - # Type hints for dtype strings supported by BigQuery DataFrame DtypeString = Literal[ "boolean", @@ -134,6 +132,12 @@ 
def is_array_like(type: ExpressionType) -> bool: ) +def is_struct_like(type: ExpressionType) -> bool: + return isinstance(type, pd.ArrowDtype) and isinstance( + type.pyarrow_dtype, pa.StructType + ) + + def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE @@ -143,18 +147,18 @@ def is_iterable(type: ExpressionType) -> bool: def is_comparable(type: ExpressionType) -> bool: - return (type is not None) and (type not in UNORDERED_DTYPES) + return (type is not None) and is_orderable(type) -# Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame -ReadOnlyIbisDtype = Union[ - ibis_dtypes.Binary, - ibis_dtypes.JSON, - ibis_dtypes.Decimal, - ibis_dtypes.GeoSpatial, - ibis_dtypes.Array, - ibis_dtypes.Struct, -] +def is_orderable(type: ExpressionType) -> bool: + # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable + return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE) + + +def is_bool_coercable(type: ExpressionType) -> bool: + # TODO: Implement more bool coercions + return (type is None) or is_numeric(type) or is_string_like(type) + BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index c57cb78791..6c81b66e55 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -193,7 +193,7 @@ def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: Saved transformer.""" diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 1035def54d..e63764e7bb 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -177,7 +177,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: KMeans: saved model.""" diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 475b4a046f..0dfb46efaa 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -171,7 +171,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PCA: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: PCA: saved model.""" diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index a8f0329145..b248c295f4 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -192,9 +192,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -345,10 +345,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: - XGBClassifier: saved model.""" + XGBClassifier: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -508,10 +508,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - RandomForestRegressor: saved model.""" + RandomForestRegressor: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -671,10 +671,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestClassifi model_name (str): The name of the model. replace (bool, default False): - Whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - RandomForestClassifier: saved model.""" + RandomForestClassifier: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index e50a8ed35b..a7e0c3c0d9 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -363,10 +363,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: - ARIMAPlus: saved model.""" + ARIMAPlus: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index b551150050..9198b4eafb 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -32,7 +32,7 @@ class TensorFlowModel(base.Predictor): Args: model_path (str): - GCS path that holds the model files. + Cloud Storage path that holds the model files. session (BigQuery Session): BQ session to create the model. """ @@ -69,10 +69,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame): - Input DataFrame, schema is defined by the model. + Input DataFrame. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" if not self._bqml_model: if self.model_path is None: @@ -91,10 +91,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - TensorFlowModel: saved model.""" + TensorFlowModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") @@ -146,7 +146,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, schema is defined by the model. + Input DataFrame or Series. Schema is defined by the model. Returns: bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" @@ -168,10 +168,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: model_name (str): the name of the model. 
replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - ONNXModel: saved model.""" + ONNXModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") @@ -262,10 +262,10 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Input DataFrame or Series, schema is defined by the model. + Input DataFrame or Series. Schema is defined by the model. Returns: - bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + bigframes.dataframe.DataFrame: Output DataFrame. Schema is defined by the model.""" if not self._bqml_model: if self.model_path is None: @@ -284,10 +284,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: - XGBoostModel: saved model.""" + XGBoostModel: Saved model.""" if not self._bqml_model: if self.model_path is None: raise ValueError("Model GCS path must be provided.") diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index c0abe77b9f..63462be09f 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -184,7 +184,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: LinearRegression: saved model.""" @@ -349,7 +349,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LogisticRegression: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: LogisticRegression: saved model.""" diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ffaeb399bb..31c691fd51 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -248,7 +248,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: PaLM2TextGenerator: saved model.""" @@ -415,7 +415,7 @@ def to_gbq( model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. Returns: PaLM2TextEmbeddingGenerator: saved model.""" @@ -595,7 +595,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator: model_name (str): the name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Determine whether to replace if the model already exists. Default to False. 
Returns: GeminiTextGenerator: saved model.""" diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 76aa2a6112..f33dc16e30 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -17,12 +17,13 @@ import abc import dataclasses import typing -from typing import ClassVar, Hashable, Optional, Tuple +from typing import ClassVar, Iterable, Optional import pandas as pd import pyarrow as pa import bigframes.dtypes as dtypes +import bigframes.operations.type as signatures @dataclasses.dataclass(frozen=True) @@ -38,7 +39,7 @@ def handles_ties(self): return False @abc.abstractmethod - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -48,7 +49,7 @@ class UnaryWindowOp(WindowOp): def arguments(self) -> int: return 1 - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return input_types[0] @@ -85,7 +86,9 @@ def arguments(self) -> int: class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if not dtypes.is_numeric(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not numeric") if pd.api.types.is_bool_dtype(input_types[0]): return dtypes.INT_DTYPE else: @@ -96,8 +99,10 @@ def output_type(self, *input_types: dtypes.ExpressionType): class MedianOp(UnaryAggregateOp): name: ClassVar[str] = "median" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: # These will change if median is changed to exact implementation. 
+ if not dtypes.is_orderable(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not orderable") if pd.api.types.is_bool_dtype(input_types[0]): return dtypes.INT_DTYPE else: @@ -112,7 +117,9 @@ class ApproxQuartilesOp(UnaryAggregateOp): def name(self): return f"{self.quartile*25}%" - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if not dtypes.is_orderable(input_types[0]): + raise TypeError(f"Type {input_types[0]} is not orderable") if pd.api.types.is_bool_dtype(input_types[0]) or pd.api.types.is_integer_dtype( input_types[0] ): @@ -125,55 +132,68 @@ def output_type(self, *input_types: dtypes.ExpressionType): class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" - def output_type(self, *input_types: dtypes.ExpressionType): - if pd.api.types.is_bool_dtype(input_types[0]) or pd.api.types.is_integer_dtype( - input_types[0] - ): - return dtypes.FLOAT_DTYPE - else: - return input_types[0] + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class ProductOp(UnaryAggregateOp): name: ClassVar[str] = "product" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class MaxOp(UnaryAggregateOp): name: ClassVar[str] = "max" + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.TypePreserving(dtypes.is_orderable, "orderable").output_type( + input_types[0] + ) + @dataclasses.dataclass(frozen=True) class MinOp(UnaryAggregateOp): name: ClassVar[str] = "min" + def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.TypePreserving(dtypes.is_orderable, "orderable").output_type( + input_types[0] + ) + @dataclasses.dataclass(frozen=True) class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class VarOp(UnaryAggregateOp): name: ClassVar[str] = "var" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class PopVarOp(UnaryAggregateOp): name: ClassVar[str] = "popvar" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -184,14 +204,16 @@ class CountOp(UnaryAggregateOp): def skips_nulls(self): return False - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + lambda x: True, dtypes.INT_DTYPE, "" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? 
- bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]] + bins: typing.Union[int, Iterable] labels: Optional[bool] @property @@ -202,7 +224,7 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if isinstance(self.bins, int) and (self.labels is False): return dtypes.INT_DTYPE else: @@ -210,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType): interval_dtype = ( pa.float64() if isinstance(self.bins, int) - else dtypes.infer_literal_arrow_type(self.bins[0][0]) + else dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) ) pa_type = pa.struct( [ @@ -237,8 +259,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -249,7 +273,7 @@ class NuniqueOp(UnaryAggregateOp): def skips_nulls(self): return False - def output_type(self, *input_types: dtypes.ExpressionType): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return dtypes.INT_DTYPE @@ -276,8 +300,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -290,8 +316,10 @@ def skips_nulls(self): def handles_ties(self): return True - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.INT_DTYPE + def 
output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -340,32 +368,40 @@ def skips_nulls(self): class AllOp(UnaryAggregateOp): name: ClassVar[str] = "all" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.BOOL_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_bool_coercable, dtypes.BOOL_DTYPE, "convertible to boolean" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class AnyOp(UnaryAggregateOp): name: ClassVar[str] = "any" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.BOOL_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.FixedOutputType( + dtypes.is_bool_coercable, dtypes.BOOL_DTYPE, "convertible to boolean" + ).output_type(input_types[0]) @dataclasses.dataclass(frozen=True) class CorrOp(BinaryAggregateOp): name: ClassVar[str] = "corr" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.BINARY_REAL_NUMERIC.output_type( + input_types[0], input_types[1] + ) @dataclasses.dataclass(frozen=True) class CovOp(BinaryAggregateOp): name: ClassVar[str] = "cov" - def output_type(self, *input_types: dtypes.ExpressionType): - return dtypes.FLOAT_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.BINARY_REAL_NUMERIC.output_type( + input_types[0], input_types[1] + ) sum_op = SumOp() diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index abd45a1453..883d19a1e3 100644 --- a/bigframes/operations/strings.py +++ 
b/bigframes/operations/strings.py @@ -53,7 +53,25 @@ def lower(self) -> series.Series: return self._apply_unary_op(ops.lower_op) def reverse(self) -> series.Series: - """Reverse strings in the Series.""" + """Reverse strings in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["apple", "banana", "", bpd.NA]) + >>> s.str.reverse() + 0 elppa + 1 ananab + 2 + 3 <NA> + dtype: string + + Returns: + bigframes.series.Series: A Series of strings in which the characters of + each element appear in reverse order. + """ # reverse method is in ibis, not pandas. return self._apply_unary_op(ops.reverse_op) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 4b0ac4310c..91c3eb603b 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -59,6 +59,7 @@ import bigframes.core.expression as ex import bigframes.core.global_session as global_session import bigframes.core.indexes +import bigframes.core.joins import bigframes.core.reshape import bigframes.core.tools import bigframes.dataframe @@ -707,6 +708,7 @@ def to_datetime( # checking and docstrings. 
DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index +MultiIndex = bigframes.core.indexes.MultiIndex Series = bigframes.series.Series # Other public pandas attributes @@ -760,6 +762,7 @@ def to_datetime( # Class aliases "DataFrame", "Index", + "MultiIndex", "Series", # Other public pandas attributes "NamedAgg", diff --git a/bigframes/series.py b/bigframes/series.py index 185891bc01..2f9123f9a3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -17,12 +17,13 @@ from __future__ import annotations import functools +import inspect import itertools import numbers import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -130,6 +131,11 @@ def ndim(self) -> int: def empty(self) -> bool: return self.shape[0] == 0 + @property + def hasnans(self) -> bool: + # Note, hasnans is actually a null check, and NaNs don't count for nullable float + return self.isnull().any() + @property def values(self) -> numpy.ndarray: return self.to_numpy() @@ -175,6 +181,8 @@ def _set_internal_query_job(self, query_job: bigquery.QueryJob): def __len__(self): return self.shape[0] + __len__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__len__) + def __iter__(self) -> typing.Iterator: self._optimize_query_complexity() return itertools.chain.from_iterable( @@ -418,6 +426,7 @@ def ffill(self, *, limit: typing.Optional[int] = None) -> Series: return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill + pad.__doc__ = inspect.getdoc(vendored_pandas_series.Series.ffill) def bfill(self, *, limit: typing.Optional[int] = None) -> Series: window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) @@ -604,28 +613,38 @@ def isna(self) -> "Series": return self._apply_unary_op(ops.isnull_op) isnull = 
isna + isnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.isna) def notna(self) -> "Series": return self._apply_unary_op(ops.notnull_op) notnull = notna + notnull.__doc__ = inspect.getdoc(vendored_pandas_series.Series.notna) def __and__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.and_op) + __and__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__and__) + __rand__ = __and__ def __or__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.or_op) + __or__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__or__) + __ror__ = __or__ def __add__(self, other: float | int | Series) -> Series: return self.add(other) + __add__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__add__) + def __radd__(self, other: float | int | Series) -> Series: return self.radd(other) + __radd__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__radd__) + def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) @@ -635,9 +654,13 @@ def radd(self, other: float | int | Series) -> Series: def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) + __sub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__sub__) + def __rsub__(self, other: float | int | Series) -> Series: return self.rsub(other) + __rsub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rsub__) + def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) @@ -645,13 +668,18 @@ def rsub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub + subtract.__doc__ = inspect.getdoc(vendored_pandas_series.Series.sub) def __mul__(self, other: float | int | Series) -> Series: return self.mul(other) + __mul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mul__) + def __rmul__(self, other: float | int | Series) -> Series: return 
self.rmul(other) + __rmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmul__) + def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op) @@ -659,31 +687,40 @@ def rmul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul + multiply.__doc__ = inspect.getdoc(vendored_pandas_series.Series.mul) def __truediv__(self, other: float | int | Series) -> Series: return self.truediv(other) + __truediv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__truediv__) + def __rtruediv__(self, other: float | int | Series) -> Series: return self.rtruediv(other) + __rtruediv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rtruediv__) + def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op, reverse=True) - div = truediv - - divide = truediv + truediv.__doc__ = inspect.getdoc(vendored_pandas_series.Series.truediv) + div = divide = truediv rdiv = rtruediv + rdiv.__doc__ = inspect.getdoc(vendored_pandas_series.Series.rtruediv) def __floordiv__(self, other: float | int | Series) -> Series: return self.floordiv(other) + __floordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__floordiv__) + def __rfloordiv__(self, other: float | int | Series) -> Series: return self.rfloordiv(other) + __rfloordiv__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rfloordiv__) + def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) @@ -693,9 +730,13 @@ def rfloordiv(self, other: float | int | Series) -> Series: def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) + __pow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__pow__) + def __rpow__(self, other: float | int | Series) -> Series: return 
self.rpow(other) + __rpow__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rpow__) + def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) @@ -729,9 +770,13 @@ def ge(self, other) -> Series: def __mod__(self, other) -> Series: # type: ignore return self.mod(other) + __mod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__mod__) + def __rmod__(self, other) -> Series: # type: ignore return self.rmod(other) + __rmod__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmod__) + def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) @@ -748,10 +793,32 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore # the output should be dtype float, both floordiv and mod returns dtype int in this case. return (self.rfloordiv(other), self.rmod(other)) - def __matmul__(self, other): + def dot(self, other): return (self * other).sum() - dot = __matmul__ + def __matmul__(self, other): + return self.dot(other) + + __matmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__matmul__) + + def __rmatmul__(self, other): + return self.dot(other) + + __rmatmul__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rmatmul__) + + def combine_first(self, other: Series) -> Series: + result = self._apply_binary_op(other, ops.coalesce_op) + result.name = self.name + return result + + def update(self, other: Union[Series, Sequence, Mapping]) -> None: + import bigframes.core.convert + + other = bigframes.core.convert.to_bf_series(other, default_index=None) + result = self._apply_binary_op( + other, ops.coalesce_op, reverse=True, alignment="left" + ) + self._set_block(result._get_block()) def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) @@ -772,6 +839,9 @@ def corr(self, other: Series, method="pearson", min_periods=None) -> float: ) return self._apply_binary_aggregation(other, agg_ops.CorrOp()) + def autocorr(self, lag: int = 1) -> float: + 
return self.corr(self.shift(lag)) + def cov(self, other: Series) -> float: return self._apply_binary_aggregation(other, agg_ops.CovOp()) @@ -827,6 +897,7 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: ) aggregate = agg + aggregate.__doc__ = inspect.getdoc(vendored_pandas_series.Series.agg) def skew(self): count = self.count() @@ -861,6 +932,7 @@ def kurt(self): return (numerator / denominator) - adjustment kurtosis = kurt + kurtosis.__doc__ = inspect.getdoc(vendored_pandas_series.Series.kurt) def mode(self) -> Series: block = self._block @@ -908,6 +980,7 @@ def prod(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.product_op)) product = prod + product.__doc__ = inspect.getdoc(vendored_pandas_series.Series.prod) def __eq__(self, other: object) -> Series: # type: ignore return self.eq(other) @@ -918,6 +991,8 @@ def __ne__(self, other: object) -> Series: # type: ignore def __invert__(self) -> Series: return self._apply_unary_op(ops.invert_op) + __invert__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__invert__) + def eq(self, other: object) -> Series: # TODO: enforce stricter alignment return self._apply_binary_op(other, ops.eq_op) @@ -1052,6 +1127,8 @@ def __getitem__(self, indexer): return Series(block) return self.loc[indexer] + __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) + def __getattr__(self, key: str): if hasattr(pandas.Series, key): raise AttributeError( @@ -1439,6 +1516,7 @@ def tolist(self) -> list: return self.to_pandas().to_list() to_list = tolist + to_list.__doc__ = inspect.getdoc(vendored_pandas_series.Series.tolist) def to_markdown( self, @@ -1454,7 +1532,10 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - __array__ = to_numpy + def __array__(self, dtype=None) -> numpy.ndarray: + return self.to_numpy(dtype=dtype) + + __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) def 
to_pickle(self, path, **kwargs) -> None: return self.to_pandas().to_pickle(path, **kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 354352f1c9..b6d56006be 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1050,7 +1050,12 @@ def _read_pandas( inline_df = self._read_pandas_inline(pandas_dataframe) if inline_df is not None: return inline_df - return self._read_pandas_load_job(pandas_dataframe, api_name) + try: + return self._read_pandas_load_job(pandas_dataframe, api_name) + except pa.ArrowInvalid as e: + raise pa.ArrowInvalid( + f"Could not convert with a BigQuery type: `{e}`. " + ) from e def _read_pandas_inline( self, pandas_dataframe: pandas.DataFrame @@ -1064,6 +1069,10 @@ def _read_pandas_inline( inline_df = dataframe.DataFrame( blocks.Block.from_local(pandas_dataframe, self) ) + except pa.ArrowInvalid as e: + raise pa.ArrowInvalid( + f"Could not convert with a BigQuery type: `{e}`. " + ) from e except ValueError: # Thrown by ibis for some unhandled types return None except pa.ArrowTypeError: # Thrown by arrow for types without mapping (geo). diff --git a/bigframes/version.py b/bigframes/version.py index 41a3895549..ec2105b648 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.1.0" +__version__ = "1.2.0" diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 3c2c688d78..4573296ec3 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -12,6 +12,8 @@ name: Methods - href: summary_property.html name: Properties and Attributes + - href: supported_pandas_apis.html + name: Supported pandas APIs name: BigQuery DataFrames API - items: - items: @@ -67,6 +69,7 @@ uid: bigframes.operations.plotting - name: PlotAccessor uid: bigframes.operations.plotting.PlotAccessor + name: Plotting - items: - name: Series uid: bigframes.series.Series @@ -79,8 +82,6 @@ name: Series - name: Window uid: bigframes.core.window.Window - - href: supported_pandas_apis.html - name: Supported pandas APIs name: bigframes.pandas - items: - items: diff --git a/noxfile.py b/noxfile.py index 4ac3a81723..fa9c0a57d8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -112,8 +112,7 @@ def lint(session): "--check", *LINT_PATHS, ) - # TODO(tswast): lint all LINT_PATHS - session.run("flake8", "bigframes", "tests") + session.run("flake8", *LINT_PATHS) @nox.session(python=DEFAULT_PYTHON_VERSION) @@ -411,8 +410,8 @@ def samples(session): CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) - # TODO(swast): Use `requirements.txt` files from the samples directories to - # test samples. + # TODO(b/332735129): Remove this session and use python_samples templates + # where each samples directory has its own noxfile.py file, instead. install_test_extra = True install_systemtest_dependencies(session, install_test_extra, "-c", constraints_path) @@ -434,12 +433,12 @@ def cover(session): session.run("coverage", "report", "--show-missing", "--fail-under=90") # Make sure there is no dead code in our test directories. - # TODO(swast): Cleanup dead code in the system tests directory. 
session.run( "coverage", "report", "--show-missing", "--include=tests/unit/*", + "--include=tests/system/small/*", "--fail-under=100", ) @@ -505,7 +504,7 @@ def docfx(session): SPHINX_VERSION, "alabaster", "recommonmark", - "gcp-sphinx-docfx-yaml", + "gcp-sphinx-docfx-yaml==3.0.1", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -714,7 +713,7 @@ def notebook(session: nox.Session): "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. - # TODO(swast): investigate why we get 404 errors, even though + # TODO(b/332737009): investigate why we get 404 errors, even though # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow diff --git a/scripts/get_documentation_coverage.py b/scripts/get_documentation_coverage.py index 0b9417b2d3..a6566cafab 100755 --- a/scripts/get_documentation_coverage.py +++ b/scripts/get_documentation_coverage.py @@ -97,6 +97,10 @@ def get_coverage_summary( if name.startswith("_") and not name.startswith("__"): continue + # ignore constructor + if name == "__init__": + continue + def predicate(impl): return ( # This includes class methods like `from_dict`, `from_records` diff --git a/tests/config.py b/tests/config.py deleted file mode 100644 index a885d7e71d..0000000000 --- a/tests/config.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# https://cloud.google.com/bigquery/docs/locations -ALL_BIGQUERY_LOCATIONS = [ - "us-east5", - "us-south1", - "us-central1", - "us-west4", - "us-west2", - "northamerica-northeast1", - "us-east4", - "us-west1", - "us-west3", - "southamerica-east1", - "southamerica-west1", - "us-east1", - "northamerica-northeast2", - "asia-south2", - "asia-east2", - "asia-southeast2", - "australia-southeast2", - "asia-south1", - "asia-northeast2", - "asia-northeast3", - "asia-southeast1", - "australia-southeast1", - "asia-east1", - "asia-northeast1", - "europe-west1", - "europe-west10", - "europe-north1", - "europe-west3", - "europe-west2", - "europe-southwest1", - "europe-west8", - "europe-west4", - "europe-west9", - "europe-west12", - "europe-central2", - "europe-west6", - "me-central2", - "me-central1", - "me-west1", - "me-central2", - "me-central1", - "me-west1", - "africa-south1", -] - -REP_ENABLED_BIGQUERY_LOCATIONS = [ - "me-central2", - "europe-west9", - "europe-west3", - "us-east4", - "us-west1", -] - -LEP_ENABLED_BIGQUERY_LOCATIONS = sorted( - set(ALL_BIGQUERY_LOCATIONS) - set(REP_ENABLED_BIGQUERY_LOCATIONS) -) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index a4cf8919a0..204c6b7463 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -18,8 +18,8 @@ import pytest import bigframes +import bigframes.constants import bigframes.session.clients -from tests import config def _assert_bq_execution_location(session: bigframes.Session): 
@@ -66,7 +66,11 @@ def test_bq_location_default(): _assert_bq_execution_location(session) -@pytest.mark.parametrize("bigquery_location", config.ALL_BIGQUERY_LOCATIONS) +@pytest.mark.parametrize( + "bigquery_location", + # Sort the set to avoid nondeterminism. + sorted(bigframes.constants.ALL_BIGQUERY_LOCATIONS), +) def test_bq_location(bigquery_location): session = bigframes.Session( context=bigframes.BigQueryOptions(location=bigquery_location) @@ -85,7 +89,8 @@ def test_bq_location(bigquery_location): @pytest.mark.parametrize( "bigquery_location", - config.REP_ENABLED_BIGQUERY_LOCATIONS, + # Sort the set to avoid nondeterminism. + sorted(bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_rep_endpoints(bigquery_location): session = bigframes.Session( @@ -108,7 +113,8 @@ def test_bq_rep_endpoints(bigquery_location): @pytest.mark.parametrize( "bigquery_location", - config.LEP_ENABLED_BIGQUERY_LOCATIONS, + # Sort the set to avoid nondeterminism. + sorted(bigframes.constants.LEP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_lep_endpoints(bigquery_location): # We are not testing BigFrames Session for LEP endpoints because it involves diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 6542ce6de3..faf7cb7e6b 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -27,13 +27,10 @@ def _check_legend_labels(ax, labels): """ assert ax.get_legend() is not None texts = ax.get_legend().get_texts() - if not isinstance(texts, list): - assert texts.get_text() == labels - else: - actual_labels = [t.get_text() for t in texts] - assert len(actual_labels) == len(labels) - for label, e in zip(actual_labels, labels): - assert label == e + actual_labels = [t.get_text() for t in texts] + assert len(actual_labels) == len(labels) + for label, e in zip(actual_labels, labels): + assert label == e def test_series_hist_bins(scalars_dfs): diff --git 
a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5d6a859c11..e70764fcc0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -524,13 +524,6 @@ def test_repr_w_all_rows(scalars_dfs): scalars_df = scalars_df.drop(columns=["numeric_col"]) scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) - if scalars_pandas_df.index.name is None: - # Note: Not quite the same as no index / default index, but hopefully - # simulates it well enough while being consistent enough for string - # comparison to work. - scalars_df = scalars_df.set_index("rowindex", drop=False).sort_index() - scalars_df.index.name = None - # When there are 10 or fewer rows, the outputs should be identical. actual = repr(scalars_df.head(10)) @@ -2613,6 +2606,34 @@ def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("values", "index", "columns", "aggfunc"), + [ + (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), + (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), + ("body_mass_g", "sex", ["island", "species"], "mean"), + ("culmen_depth_mm", "island", "species", "max"), + ], +) +def test_df_pivot_table( + penguins_df_default_index, + penguins_pandas_df_default_index, + values, + index, + columns, + aggfunc, +): + bf_result = penguins_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ).to_pandas() + pd_result = penguins_pandas_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_column_type=False + ) + + def test_ipython_key_completions_with_drop(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = "string_col" @@ -2897,15 +2918,23 @@ def 
test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): ) -def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs): +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): if pd.__version__.startswith("1."): pytest.skip("this loc overload not supported in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = "hello" - pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = "hello" + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value pd.testing.assert_frame_equal( bf_df.to_pandas(), @@ -3956,9 +3985,6 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): ("bottom", "dense", False, False), ], ) -@pytest.mark.skipif( - True, reason="Blocked by possible pandas rank() regression (b/283278923)" -) def test_df_rank_with_nulls( scalars_df_index, scalars_pandas_df_index, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 10d7408790..f26902f084 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -23,7 +23,8 @@ try: import pandas_gbq # type: ignore -except ImportError: +except ImportError: # pragma: NO COVER + # TODO(b/332758806): Run system tests without "extras" pandas_gbq = None import typing @@ -129,12 +130,9 @@ def test_to_csv_index( """Test the `to_csv` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs index_col = None - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_csv_index_{index}*.csv" - if index: - index_col = typing.cast(str, scalars_df.index.name) - else: - path = gcs_folder + f"test_default_index_df_to_csv_index_{index}*.csv" + path = gcs_folder + 
f"test_index_df_to_csv_index_{index}*.csv" + if index: + index_col = typing.cast(str, scalars_df.index.name) # TODO(swast): Support "date_format" parameter and make sure our # DATETIME/TIMESTAMP column export is the same format as pandas by default. @@ -386,11 +384,8 @@ def test_to_json_index_invalid_orient( gcs_folder: str, index: bool, ): - scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" + scalars_df, _ = scalars_dfs + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" with pytest.raises(ValueError): scalars_df.to_json(path, index=index, lines=True) @@ -404,11 +399,8 @@ def test_to_json_index_invalid_lines( gcs_folder: str, index: bool, ): - scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}.jsonl" + scalars_df, _ = scalars_dfs + path = gcs_folder + f"test_index_df_to_json_index_{index}.jsonl" with pytest.raises(NotImplementedError): scalars_df.to_json(path, index=index) @@ -422,14 +414,13 @@ def test_to_json_index_records_orient( gcs_folder: str, index: bool, ): - """Test the `to_json` API with the `index` parameter.""" + """Test the `to_json` API with the `index` parameter. + + Uses the scalable options orient='records' and lines=True. 
+ """ scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" - """ Test the `to_json` API with `orient` is `records` and `lines` is True""" scalars_df.to_json(path, index=index, orient="records", lines=True) gcs_df = pd.read_json( @@ -460,11 +451,7 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): """Test the `to_parquet` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs scalars_pandas_df = scalars_pandas_df.copy() - - if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_parquet_{index}*.parquet" - else: - path = gcs_folder + f"test_default_index_df_to_parquet_{index}*.parquet" + path = gcs_folder + f"test_index_df_to_parquet_{index}*.parquet" # TODO(b/268693993): Type GEOGRAPHY is not currently supported for parquet. 
scalars_df = scalars_df.drop(columns="geography_col") diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 3389e5cd68..eae667dc9d 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -64,8 +64,8 @@ def _assert_bq_table_is_encrypted( def test_session_query_job(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER _, query_job = session_with_bq_cmek._start_query( "SELECT 123", job_config=bigquery.QueryJobConfig(use_query_cache=False) @@ -82,8 +82,8 @@ def test_session_query_job(bq_cmek, session_with_bq_cmek): def test_session_load_job(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Session should have cmek set in the default query and load job configs load_table = bigframes.session._io.bigquery.random_table( @@ -114,8 +114,8 @@ def test_session_load_job(bq_cmek, session_with_bq_cmek): def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read the BQ table df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -125,8 +125,8 @@ def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -152,8 +152,8 @@ def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_read_csv_gcs( bq_cmek, 
session_with_bq_cmek, scalars_df_index, gcs_folder, engine ): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Create a csv in gcs write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv" @@ -170,8 +170,8 @@ def test_read_csv_gcs( def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) @@ -205,8 +205,8 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): def test_read_pandas(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a pandas dataframe df = session_with_bq_cmek.read_pandas(pandas.DataFrame([1])) @@ -216,8 +216,8 @@ def test_read_pandas(bq_cmek, session_with_bq_cmek): def test_read_pandas_large(bq_cmek, session_with_bq_cmek): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER # Read a pandas dataframe large enough to trigger a BQ load job df = session_with_bq_cmek.read_pandas(pandas.DataFrame(range(10_000))) @@ -227,8 +227,8 @@ def test_read_pandas_large(bq_cmek, session_with_bq_cmek): def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): - if not bq_cmek: - pytest.skip("no cmek set for testing") + if not bq_cmek: # pragma: NO COVER + pytest.skip("no cmek set for testing") # pragma: NO COVER model = bigframes.ml.linear_model.LinearRegression() df = session_with_bq_cmek.read_gbq(penguins_table_id).dropna() diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 
6aca7628cf..bb0af52976 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -20,6 +20,31 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_multi_index_from_arrays(): + bf_idx = bpd.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + pd_idx = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + assert bf_idx.names == pd_idx.names + pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + @skip_legacy_pandas def test_read_pandas_multi_index_axes(): index = pandas.MultiIndex.from_arrays( @@ -882,25 +907,6 @@ def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) -@pytest.mark.skip(reason="Pandas fails in newer versions.") -def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_too", "int64_col", "rowindex_2"] - level1 = pandas.Index(["b", pandas.NA, pandas.NA]) - # Need resulting column to be pyarrow string rather than object dtype - level2 = pandas.Index([pandas.NA, "b", "b"], dtype="string[pyarrow]") - multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) - bf_df = scalars_df_index[columns].copy() - bf_df.columns = multi_columns - pd_df = scalars_pandas_df_index[columns].copy() - pd_df.columns = multi_columns - - bf_result = bf_df.stack().to_pandas() - pd_result = pd_df.stack() - - # Pandas produces NaN, where bq dataframes produces pd.NA - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - def 
test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "float64_col", "int64_col"] multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2])) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a080a969c8..d543f92655 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -136,7 +136,7 @@ def test_get_dummies_series(scalars_dfs): # adjust for expected dtype differences for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): - if type_name == "bool": + if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) @@ -157,7 +157,7 @@ def test_get_dummies_series_nameless(scalars_dfs): # adjust for expected dtype differences for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): - if type_name == "bool": + if type_name == "bool": # pragma: NO COVER pd_result[column_name] = pd_result[column_name].astype("boolean") pd_result.columns = pd_result.columns.astype(object) @@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs): ) +@pytest.mark.parametrize( + ("breaks",), + [ + ([0, 5, 10, 15, 20, 100, 1000],), # ints + ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],), # floats + ([0, 5, 10.5, 15.5, 20, 100, 1000.5],), # mixed + ], +) +def test_cut_numeric_breaks(scalars_dfs, breaks): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) + bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, 
pd_result_converted, check_index=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("bins",), + [ + (-1,), # negative integer bins argument + ([],), # empty iterable of bins + (["notabreak"],), # iterable of wrong type + ([1],), # numeric breaks with only one numeric + # this is supported by pandas but not by + # the bigquery operation and a bigframes workaround + # is not yet available. Should return column + # of structs with all NaN values. + ], +) +def test_cut_errors(scalars_dfs, bins): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + bpd.cut(scalars_df["float64_col"], bins) + + @pytest.mark.parametrize( ("bins",), [ diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index ea139b9802..5ccc6db0ac 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -126,13 +126,3 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): ] for string in string_checks: assert string in query_job_repr - - -def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - with bf.option_context("display.repr_mode", "deferred"): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. Computation will process" in line diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index e7e434dbd0..106638cef3 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -105,7 +105,8 @@ def test_remote_function_direct_no_session_param( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. 
+ return x * x # pragma: NO COVER assert square.bigframes_remote_function assert square.bigframes_cloud_function @@ -157,7 +158,8 @@ def test_remote_function_direct_no_session_param_location_specified( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -207,7 +209,8 @@ def test_remote_function_direct_no_session_param_location_mismatched( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -233,7 +236,8 @@ def test_remote_function_direct_no_session_param_location_project_specified( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -283,7 +287,8 @@ def test_remote_function_direct_no_session_param_project_mismatched( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -294,7 +299,8 @@ def test_remote_function_direct_session_param(session_with_bq_connection, scalar session=session_with_bq_connection, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -331,7 +337,8 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # cloud function would be common and quickly reused. @session_with_bq_connection.remote_function([int], int) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. 
+ return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -370,7 +377,8 @@ def test_remote_function_via_session_with_overrides( reuse=True, ) def square(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER scalars_df, scalars_pandas_df = scalars_dfs @@ -497,7 +505,8 @@ def test_skip_bq_connection_check(dataset_id_permanent): @session.remote_function([int], int, dataset=dataset_id_permanent) def add_one(x): - return x + 1 + # This executes on a remote function, where coverage isn't tracked. + return x + 1 # pragma: NO COVER @pytest.mark.flaky(retries=2, delay=120) @@ -534,7 +543,8 @@ def test_read_gbq_function_like_original( reuse=True, ) def square1(x): - return x * x + # This executes on a remote function, where coverage isn't tracked. + return x * x # pragma: NO COVER square2 = rf.read_gbq_function( function_name=square1.bigframes_remote_function, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e350286940..d27cd0a236 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -284,6 +284,21 @@ def test_abs(scalars_dfs, col_name): assert_series_equal(pd_result, bf_result) +@pytest.mark.parametrize( + ("col_name",), + ( + ("bool_col",), + ("int64_col",), + ), +) +def test_series_invert(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (~scalars_df[col_name]).to_pandas() + pd_result = ~scalars_pandas_df[col_name] + + assert_series_equal(pd_result, bf_result) + + def test_fillna(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" @@ -703,6 +718,14 @@ def test_series_corr(scalars_dfs): assert math.isclose(pd_result, bf_result) +@skip_legacy_pandas +def test_series_autocorr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["float64_col"].autocorr(2) + pd_result = scalars_pandas_df["float64_col"].autocorr(2) + 
assert math.isclose(pd_result, bf_result) + + def test_series_cov(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"]) @@ -1246,6 +1269,39 @@ def test_binop_right_filtered(scalars_dfs): ) +@skip_legacy_pandas +def test_series_combine_first(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + bf_result = int64_col.combine_first(float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_result = pd_int64_col.combine_first(pd_float64_col) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_update(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7).copy() + float64_col.update(int64_col) + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() + pd_float64_col.update(pd_int64_col) + + assert_series_equal( + float64_col.to_pandas(), + pd_float64_col, + ) + + def test_mean(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -1276,8 +1332,6 @@ def test_numeric_literal(scalars_dfs): def test_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index & ordering for consistent repr.") col_name = "int64_col" bf_series = scalars_df[col_name] @@ -1405,8 +1459,6 @@ def test_groupby_level_sum(scalars_dfs): # TODO(tbergeron): Use a non-unique index once that becomes possible in tests scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index for groupby level.") bf_series = scalars_df[col_name].groupby(level=0).sum() pd_series = 
scalars_pandas_df[col_name].groupby(level=0).sum() @@ -1421,8 +1473,6 @@ def test_groupby_level_list_sum(scalars_dfs): # TODO(tbergeron): Use a non-unique index once that becomes possible in tests scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - if scalars_pandas_df.index.name != "rowindex": - pytest.skip("Require index for groupby level.") bf_series = scalars_df[col_name].groupby(level=["rowindex"]).sum() pd_series = scalars_pandas_df[col_name].groupby(level=["rowindex"]).sum() @@ -1640,6 +1690,24 @@ def test_size(scalars_dfs): assert pd_result == bf_result +def test_series_hasnans_true(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].hasnans + pd_result = scalars_pandas_df["string_col"].hasnans + + assert pd_result == bf_result + + +def test_series_hasnans_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dropna().hasnans + pd_result = scalars_pandas_df["string_col"].dropna().hasnans + + assert pd_result == bf_result + + def test_empty_false(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs @@ -1710,9 +1778,6 @@ def test_dtypes(scalars_dfs): def test_head(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) @@ -1725,9 +1790,6 @@ def test_head(scalars_dfs): def test_tail(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) @@ -1740,9 +1802,6 @@ def test_tail(scalars_dfs): def test_head_then_scalar_operation(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require 
explicit index for offset ops.") - bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() pd_result = scalars_pandas_df["float64_col"].head(1) + 4 @@ -1755,9 +1814,6 @@ def test_head_then_scalar_operation(scalars_dfs): def test_head_then_series_operation(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - if scalars_df.index.name is None: - pytest.skip("Require explicit index for offset ops.") - bf_result = ( scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) ).to_pandas() @@ -1841,44 +1897,6 @@ def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): ) -@pytest.mark.parametrize( - ("na_option",), - [ - ("keep",), - ("top",), - ("bottom",), - ], -) -@pytest.mark.parametrize( - ("method",), - [ - ("average",), - ("min",), - ("max",), - ("first",), - ("dense",), - ], -) -@pytest.mark.skipif( - True, reason="Blocked by possible pandas rank() regression (b/283278923)" -) -def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, method): - col_name = "bool_col" - bf_result = ( - scalars_df_index[col_name].rank(na_option=na_option, method=method).to_pandas() - ) - pd_result = ( - scalars_pandas_df_index[col_name] - .rank(na_option=na_option, method=method) - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - @pytest.mark.parametrize( ("keep",), [ @@ -3445,9 +3463,8 @@ def foo(x: int, y: int, df): ], ) def test_series_explode(data): - data = [[1, 2, 3], [], numpy.nan, [3, 4]] s = bigframes.pandas.Series(data) - pd_s = pd.Series(data) + pd_s = s.to_pandas() pd.testing.assert_series_equal( s.explode().to_pandas(), pd_s.explode(), diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index eb6a0a8dd9..ce415f9324 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -24,6 +24,7 @@ import google.cloud.bigquery as bigquery import numpy as np import pandas as pd +import pyarrow as pa import 
pytest import bigframes @@ -436,6 +437,11 @@ def test_read_pandas_index(session): pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) +def test_read_pandas_w_unsupported_mixed_dtype(session): + with pytest.raises(pa.ArrowInvalid, match="Could not convert"): + session.read_pandas(pd.DataFrame({"a": [1, "hello"]})) + + def test_read_pandas_inline_respects_location(): options = bigframes.BigQueryOptions(location="europe-west1") session = bigframes.Session(options) @@ -493,10 +499,7 @@ def test_read_pandas_tokyo( @utils.skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" - else: - path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" read_path = utils.get_first_file_from_wildcard(path) scalars_df.to_csv(path, index=False) dtype = scalars_df.dtypes.to_dict() @@ -520,10 +523,7 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs - if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" - else: - path = gcs_folder + "test_read_csv_gcs_bq_engine_wo_index*.csv" + path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" scalars_df.to_csv(path, index=False) df = session.read_csv(path, engine="bigquery") diff --git a/third_party/bigframes_vendored/cpython/_pprint.py b/third_party/bigframes_vendored/cpython/_pprint.py index 617c14df0d..9b586c939b 100644 --- a/third_party/bigframes_vendored/cpython/_pprint.py +++ b/third_party/bigframes_vendored/cpython/_pprint.py @@ -110,6 +110,7 @@ def has_changed(k, v): # try to avoid calling repr on nested estimators if isinstance(v, BaseEstimator) and v.__class__ != init_params[k].__class__: return True + # Use 
repr as a last resort. It may be expensive. def is_scalar_nan(x): return isinstance(x, numbers.Real) and math.isnan(x) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index ce5f8d55f3..0d910cec92 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -9,8 +9,8 @@ def strftime(self, date_format: str): Convert to string Series using specified date_format. Return a Series of formatted strings specified by date_format. Details - of the string format can be found in `BigQuery format elements doc - <%(https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements)s>`__. + of the string format can be found in BigQuery format elements doc: + https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time. **Examples:** diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index a3178e2761..84ab90a322 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -13,7 +13,7 @@ from __future__ import annotations display_options_doc = """ -Encapsulates configuration for displaying objects. +Encapsulates the configuration for displaying objects. **Examples:** @@ -79,7 +79,7 @@ """ sampling_options_doc = """ -Encapsulates configuration for data sampling. +Encapsulates the configuration for data sampling. 
Attributes: max_download_size (int, default 500): diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e5aa47ad3e..6707dc1403 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -339,6 +339,7 @@ def to_gbq( [2 rows x 2 columns] Write a DataFrame to a BigQuery table with clustering columns: + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}) >>> clustering_cols = ['col1', 'col3'] >>> df.to_gbq( @@ -910,28 +911,6 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: # ---------------------------------------------------------------------- # Unsorted - def equals(self, other) -> bool: - """ - Test whether two objects contain the same elements. - - This function allows two Series or DataFrames to be compared against - each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. - - The row/column index do not need to have the same type, as long - as the values are considered equal. Corresponding columns must be of - the same dtype. - - Args: - other (Series or DataFrame): - The other Series or DataFrame to be compared with the first. - - Returns: - bool: True if all elements are the same in both objects, False - otherwise. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -1208,7 +1187,6 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Set the name of the axis for the index. .. note:: - Currently only accepts a single string parameter (the new name of the index). 
Args: @@ -1862,7 +1840,7 @@ def sort_index( raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- - # Arithmetic Methods + # Arithmetic and Logical Methods def eq(self, other, axis: str | int = "columns") -> DataFrame: """ @@ -1890,7 +1868,8 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: rectangle True Name: degrees, dtype: boolean - You can also use arithmetic operator ``==``: + You can also use logical operator `==`: + >>> df["degrees"] == 360 circle True triangle False @@ -1909,6 +1888,39 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __eq__(self, other): + """ + Check equality of DataFrame and other, element-wise, using logical + operator `==`. + + Equivalent to `DataFrame.eq(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 3, 4], + ... 'b': [360, 0, 180] + ... }) + >>> df == 0 + a b + 0 True False + 1 False True + 2 False False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame for equality. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def ne(self, other, axis: str | int = "columns") -> DataFrame: """ Get not equal to of DataFrame and other, element-wise (binary operator `ne`). @@ -1954,6 +1966,39 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __ne__(self, other): + """ + Check inequality of DataFrame and other, element-wise, using logical + operator `!=`. + + Equivalent to `DataFrame.ne(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 3, 4], + ... 'b': [360, 0, 180] + ... }) + >>> df != 0 + a b + 0 False True + 1 True False + 2 True True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame for inequality. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def le(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). @@ -2004,6 +2049,39 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __le__(self, other): + """ + Check whether DataFrame is less than or equal to other, element-wise, + using logical operator `<=`. + + Equivalent to `DataFrame.le(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df <= 0 + a b + 0 True False + 1 True True + 2 False True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def lt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than' of DataFrame and other, element-wise (binary operator `<`). @@ -2054,6 +2132,39 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __lt__(self, other): + """ + Check whether DataFrame is less than other, element-wise, using logical + operator `<`. + + Equivalent to `DataFrame.lt(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df < 0 + a b + 0 False False + 1 True False + 2 False True + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def ge(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`). @@ -2104,6 +2215,39 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __ge__(self, other): + """ + Check whether DataFrame is greater than or equal to other, element-wise, + using logical operator `>=`. + + Equivalent to `DataFrame.ge(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df >= 0 + a b + 0 True True + 1 False True + 2 True False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def gt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than' of DataFrame and other, element-wise (binary operator `>`). @@ -2152,6 +2296,39 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __gt__(self, other): + """ + Check whether DataFrame is greater than other, element-wise, using logical + operator `>`. + + Equivalent to `DataFrame.gt(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, -1, 1], + ... 'b': [1, 0, -1] + ... }) + >>> df > 0 + a b + 0 False True + 1 False False + 2 True False + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to be compared to the DataFrame. + + Returns: + DataFrame: The result of comparing `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def add(self, other, axis: str | int = "columns") -> DataFrame: """Get addition of DataFrame and other, element-wise (binary operator `+`). @@ -2183,7 +2360,126 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: You can also use arithmetic operator ``+``: - >>> df['A'] + (df['B']) + >>> df['A'] + df['B'] + 0 5 + 1 7 + 2 9 + dtype: Int64 + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __add__(self, other) -> DataFrame: + """Get addition of DataFrame and other, column-wise, using arithmetic + operator `+`. + + Equivalent to ``DataFrame.add(other)``. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'height': [1.5, 2.6], + ... 'weight': [500, 800] + ... }, + ... index=['elk', 'moose']) + >>> df + height weight + elk 1.5 500 + moose 2.6 800 + + [2 rows x 2 columns] + + Adding a scalar affects all rows and columns. + + >>> df + 1.5 + height weight + elk 3.0 501.5 + moose 4.1 801.5 + + [2 rows x 2 columns] + + You can add another DataFrame with index and columns aligned.
+ + >>> delta = bpd.DataFrame({ + ... 'height': [0.5, 0.9], + ... 'weight': [50, 80] + ... }, + ... index=['elk', 'moose']) + >>> df + delta + height weight + elk 2.0 550 + moose 3.5 880 + + [2 rows x 2 columns] + + Adding any mis-aligned index and columns will result in invalid values. + + >>> delta = bpd.DataFrame({ + ... 'depth': [0.5, 0.9, 1.0], + ... 'weight': [50, 80, 100] + ... }, + ... index=['elk', 'moose', 'bison']) + >>> df + delta + depth height weight + elk 550 + moose 880 + bison + + [3 rows x 3 columns] + + Args: + other (scalar or DataFrame): + Object to be added to the DataFrame. + + Returns: + DataFrame: The result of adding `other` to DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def radd(self, other, axis: str | int = "columns") -> DataFrame: + """Get addition of DataFrame and other, element-wise (binary operator `+`). + + Equivalent to ``other + dataframe``. With reverse version, `add`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].radd(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + df['B'] 0 5 1 7 2 9 @@ -2250,6 +2546,49 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __sub__(self, other): + """ + Get subtraction of other from DataFrame, element-wise, using operator `-`. + + Equivalent to `DataFrame.sub(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can subtract a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df - 2 + a b + 0 -1 2 + 1 0 3 + 2 1 4 + + [3 rows x 2 columns] + + You can also subtract another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df - df1 + a b + 0 -1 1 + 1 0 2 + 2 1 3 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to subtract from the DataFrame. + + Returns: + DataFrame: The result of the subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rsub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -2296,6 +2635,21 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rsub__(self, other): + """ + Get subtraction of DataFrame from other, element-wise, using operator `-`. + + Equivalent to `DataFrame.rsub(other)`. + + Args: + other (scalar or DataFrame): + Object to subtract the DataFrame from. + + Returns: + DataFrame: The result of the subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mul(self, other, axis: str | int = "columns") -> DataFrame: """Get multiplication of DataFrame and other, element-wise (binary operator `*`). @@ -2345,6 +2699,141 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mul__(self, other): + """ + Get multiplication of DataFrame with other, element-wise, using operator `*`. + + Equivalent to `DataFrame.mul(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df * 3 + a b + 0 3 12 + 1 6 15 + 2 9 18 + + [3 rows x 2 columns] + + You can also multiply with another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df * df1 + a b + 0 2 12 + 1 4 15 + 2 6 18 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to multiply with the DataFrame. + + Returns: + DataFrame: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def rmul(self, other, axis: str | int = "columns") -> DataFrame: + """Get multiplication of DataFrame and other, element-wise (binary operator `*`). + + Equivalent to ``other * dataframe``. With reverse version, `mul`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].rmul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __rmul__(self, other): + """ + Get multiplication of DataFrame with other, element-wise, using operator `*`. + + Equivalent to `DataFrame.rmul(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df * 3 + a b + 0 3 12 + 1 6 15 + 2 9 18 + + [3 rows x 2 columns] + + You can also multiply with another DataFrame with index and column labels + aligned: + + >>> df1 = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df * df1 + a b + 0 2 12 + 1 4 15 + 2 6 18 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to multiply the DataFrame with. + + Returns: + DataFrame: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def truediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -2394,6 +2883,49 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __truediv__(self, other): + """ + Get division of DataFrame by other, element-wise, using operator `/`. + + Equivalent to `DataFrame.truediv(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df / 2 + a b + 0 0.5 2.0 + 1 1.0 2.5 + 2 1.5 3.0 + + [3 rows x 2 columns] + + You can also divide by another DataFrame with index and column labels + aligned: + + >>> denominator = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df / denominator + a b + 0 0.5 1.333333 + 1 1.0 1.666667 + 2 1.5 2.0 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to divide the DataFrame by. + + Returns: + DataFrame: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -2440,6 +2972,21 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rtruediv__(self, other): + """ + Get division of other by DataFrame, element-wise, using operator `/`. + + Equivalent to `DataFrame.rtruediv(other)`. + + Args: + other (scalar or DataFrame): + Object to divide by the DataFrame. + + Returns: + DataFrame: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -2489,6 +3036,49 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __floordiv__(self, other): + """ + Get integer division of DataFrame by other, using arithmetic operator `//`. + + Equivalent to `DataFrame.floordiv(other)`.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> df = bpd.DataFrame({"a": [15, 15, 15], "b": [30, 30, 30]}) + >>> df // 2 + a b + 0 7 15 + 1 7 15 + 2 7 15 + + [3 rows x 2 columns] + + You can also divide by another DataFrame with index and column labels + aligned: + + >>> divisor = bpd.DataFrame({"a": [2, 3, 4], "b": [5, 6, 7]}) + >>> df // divisor + a b + 0 7 6 + 1 5 5 + 2 3 4 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to divide the DataFrame by. + + Returns: + DataFrame: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -2535,6 +3125,21 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rfloordiv__(self, other): + """ + Get integer division of other by DataFrame. + + Equivalent to `DataFrame.rfloordiv(other)`. + + Args: + other (scalar or DataFrame): + Object to divide by the DataFrame. + + Returns: + DataFrame: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -2573,14 +3178,57 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: dtype: Int64 Args: - other: - Any single or multiple element data structure, or list-like object. - axis ({0 or 'index', 1 or 'columns'}): - Whether to compare by the index (0 or 'index') or columns. - (1 or 'columns'). For Series input, axis to match Series index on. + other: + Any single or multiple element data structure, or list-like object.
+ axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __mod__(self, other): + """ + Get modulo of DataFrame with other, element-wise, using operator `%`. + + Equivalent to `DataFrame.mod(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can modulo with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df % 3 + a b + 0 1 1 + 1 2 2 + 2 0 0 + + [3 rows x 2 columns] + + You can also modulo with another DataFrame with index and column labels + aligned: + + >>> modulo = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df % modulo + a b + 0 1 1 + 1 0 2 + 2 1 0 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to modulo the DataFrame by. Returns: - DataFrame: DataFrame result of the arithmetic operation. + DataFrame: The result of the modulo. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2630,6 +3278,21 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmod__(self, other): + """ + Get modulo of other with DataFrame, element-wise, using operator `%`. + + Equivalent to `DataFrame.rmod(other)`. + + Args: + other (scalar or DataFrame): + Object to modulo by the DataFrame. + + Returns: + DataFrame: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pow(self, other, axis: str | int = "columns") -> DataFrame: """Get Exponential power of dataframe and other, element-wise (binary operator `**`).
@@ -2680,6 +3343,50 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __pow__(self, other): + """ + Get exponentiation of DataFrame with other, element-wise, using operator + `**`. + + Equivalent to `DataFrame.pow(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can exponentiate with a scalar: + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df ** 2 + a b + 0 1 16 + 1 4 25 + 2 9 36 + + [3 rows x 2 columns] + + You can also exponentiate with another DataFrame with index and column + labels aligned: + + >>> exponent = bpd.DataFrame({"a": [2, 2, 2], "b": [3, 3, 3]}) + >>> df ** exponent + a b + 0 1 64 + 1 4 125 + 2 9 216 + + [3 rows x 2 columns] + + Args: + other (scalar or DataFrame): + Object to exponentiate the DataFrame with. + + Returns: + DataFrame: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rpow(self, other, axis: str | int = "columns") -> DataFrame: """Get Exponential power of dataframe and other, element-wise (binary operator `rpow`). @@ -2727,6 +3434,22 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rpow__(self, other): + """ + Get exponentiation of other with DataFrame, element-wise, using operator + `**`. + + Equivalent to `DataFrame.rpow(other)`. + + Args: + other (scalar or DataFrame): + Object to exponentiate with the DataFrame. + + Returns: + DataFrame: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine( self, other, func, fill_value=None, overwrite: bool = True ) -> DataFrame: @@ -4102,7 +4825,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): performant. .. note:: - This function cannot be used with all column types. 
For example, when specifying columns with `object` or `category` dtypes, ``TypeError`` is raised. @@ -4711,6 +5433,88 @@ def pivot(self, *, columns, index=None, values=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) + on the index and columns of the result DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'], + ... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'], + ... 'Sales': [100, 200, 150, 100, 200, 150], + ... 'Rating': [3, 5, 4, 3, 3, 5] + ... }) + >>> df + Product Region Sales Rating + 0 Product A East 100 3 + 1 Product B West 200 5 + 2 Product A East 150 4 + 3 Product B West 100 3 + 4 Product A West 200 3 + 5 Product B East 150 5 + + [6 rows x 4 columns] + + Using `pivot_table` with default aggfunc "mean": + + >>> pivot_table = df.pivot_table( + ... values=['Sales', 'Rating'], + ... index='Product', + ... columns='Region' + ... ) + >>> pivot_table + Rating Sales + Region East West East West + Product + Product A 3.5 3.0 125.0 200.0 + Product B 5.0 4.0 150.0 150.0 + + [2 rows x 4 columns] + + Using `pivot_table` with specified aggfunc "max": + + >>> pivot_table = df.pivot_table( + ... values=['Sales', 'Rating'], + ... index='Product', + ... columns='Region', + ... aggfunc="max" + ... ) + >>> pivot_table + Rating Sales + Region East West East West + Product + Product A 4 3 150 200 + Product B 5 5 150 200 + + [2 rows x 4 columns] + + Args: + values (str, object or a list of the previous, optional): + Column(s) to use for populating new frame's values. 
If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + index (str or object or a list of str, optional): + Column to use to make new frame's index. If not given, uses existing index. + + columns (str or object or a list of str): + Column to use to make new frame's columns. + + aggfunc (str, default "mean"): + Aggregation function name to compute summary statistics (e.g., 'sum', 'mean'). + + Returns: + DataFrame: An Excel style pivot table. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def stack(self, level=-1): """ Stack the prescribed level(s) from columns to index. @@ -4992,6 +5796,7 @@ def eval(self, expr: str) -> DataFrame: injection if you pass user input to this function. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -5013,11 +5818,11 @@ def eval(self, expr: str) -> DataFrame: 4 7 dtype: Int64 - Assignment is allowed though by default the original DataFrame is not - modified. + Assignment is allowed though by default the original DataFrame is not + modified. >>> df.eval('C = A + B') - A B C + A B C 0 1 10 11 1 2 8 10 2 3 6 9 @@ -5026,7 +5831,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 3 columns] >>> df - A B + A B 0 1 10 1 2 8 2 3 6 @@ -5035,7 +5840,7 @@ def eval(self, expr: str) -> DataFrame: [5 rows x 2 columns] - Multiple columns can be assigned to using multi-line expressions: + Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' @@ -5043,7 +5848,7 @@ def eval(self, expr: str) -> DataFrame: ... D = A - B ... ''' ... ) - A B C D + A B C D 0 1 10 11 -9 1 2 8 10 -6 2 3 6 9 -3 @@ -5067,6 +5872,7 @@ def query(self, expr: str) -> DataFrame | None: Query the columns of a DataFrame with a boolean expression. 
**Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -5362,6 +6168,30 @@ def loc(self): def iat(self): """Access a single value for a row/column pair by integer position. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + [3 rows x 3 columns] + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + Returns: bigframes.core.indexers.IatDataFrameIndexer: Indexers object. """ @@ -5371,6 +6201,30 @@ def iat(self): def at(self): """Access a single value for a row/column label pair. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + [3 rows x 3 columns] + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Get value within a series + + >>> df.loc[5].at['B'] + 4 + Returns: bigframes.core.indexers.AtDataFrameIndexer: Indexers object. """ @@ -5391,6 +6245,7 @@ def dot(self, other): DataFrame and the index of other must contain the same values, as they will be aligned prior to the multiplication. + .. note:: The dot method for Series computes the inner product, instead of the matrix product here. @@ -5477,6 +6332,59 @@ def dot(self, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __matmul__(self, other): + """ + Compute the matrix multiplication between the DataFrame and other, using + operator `@`. + + Equivalent to `DataFrame.dot(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> left + 0 1 2 3 + 0 0 1 -2 -1 + 1 1 1 1 1 + + [2 rows x 4 columns] + >>> right = bpd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> right + 0 1 + 0 0 1 + 1 1 2 + 2 -1 -1 + 3 2 0 + + [4 rows x 2 columns] + >>> left @ right + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + The operand can be a Series, in which case the result will also be a + Series: + + >>> right = bpd.Series([1, 2, -1,0]) + >>> left @ right + 0 4 + 1 2 + dtype: Int64 + + Args: + other (DataFrame or Series): + Object to be matrix multiplied with the DataFrame. + + Returns: + DataFrame or Series: The result of the matrix multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def plot(self): """ @@ -5487,3 +6395,197 @@ def plot(self): An accessor making plots. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __len__(self): + """Returns number of rows in the DataFrame, serves `len` operator. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'a': [0, 1, 2], + ... 'b': [3, 4, 5] + ... }) + >>> len(df) + 3 + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __array__(self): + """ + Returns the rows as NumPy array. + + Equivalent to `DataFrame.to_numpy(dtype)`. + + Users should not call this directly. Rather, it is invoked by + `numpy.array` and `numpy.asarray`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + + >>> df = bpd.DataFrame({"a": [1, 2, 3], "b": [11, 22, 33]}) + + >>> np.array(df) + array([[1, 11], + [2, 22], + [3, 33]], dtype=object) + + >>> np.asarray(df) + array([[1, 11], + [2, 22], + [3, 33]], dtype=object) + + Args: + dtype (str or numpy.dtype, optional): + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. + + Returns: + numpy.ndarray: + The rows in the DataFrame converted to a `numpy.ndarray` with + the specified dtype. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __getitem__(self, key): + """Gets the specified column(s) from the DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "name" : ["alpha", "beta", "gamma"], + ... "age": [20, 30, 40], + ... "location": ["WA", "NY", "CA"] + ... }) + >>> df + name age location + 0 alpha 20 WA + 1 beta 30 NY + 2 gamma 40 CA + + [3 rows x 3 columns] + + You can specify a column label to retrieve the corresponding Series. + + >>> df["name"] + 0 alpha + 1 beta + 2 gamma + Name: name, dtype: string + + You can specify a list of column labels to retrieve a Dataframe. + + >>> df[["name", "age"]] + name age + 0 alpha 20 + 1 beta 30 + 2 gamma 40 + + [3 rows x 2 columns] + + You can specify a condition as a series of booleans to retrieve matching + rows. + + >>> df[df["age"] > 25] + name age location + 1 beta 30 NY + 2 gamma 40 CA + + [2 rows x 3 columns] + + You can specify a pandas Index with desired column labels. + + >>> import pandas as pd + >>> df[pd.Index(["age", "location"])] + age location + 0 20 WA + 1 30 NY + 2 40 CA + + [3 rows x 2 columns] + + Args: + key (index): + Index or list of indices. 
It can be a column label, a list of + column labels, a Series of booleans or a pandas Index of desired + column labels. + + Returns: + Series or DataFrame: Value(s) at the requested index(es). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __setitem__(self, key, value): + """Modify or insert a column into the DataFrame. + + .. note:: + This does **not** modify the original table the DataFrame was + derived from. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "name" : ["alpha", "beta", "gamma"], + ... "age": [20, 30, 40], + ... "location": ["WA", "NY", "CA"] + ... }) + >>> df + name age location + 0 alpha 20 WA + 1 beta 30 NY + 2 gamma 40 CA + + [3 rows x 3 columns] + + You can assign a constant to a new column. + + >>> df["country"] = "USA" + >>> df + name age location country + 0 alpha 20 WA USA + 1 beta 30 NY USA + 2 gamma 40 CA USA + + [3 rows x 4 columns] + + You can assign a Series to a new column. + + >>> df["new_age"] = df["age"] + 5 + >>> df + name age location country new_age + 0 alpha 20 WA USA 25 + 1 beta 30 NY USA 35 + 2 gamma 40 CA USA 45 + + [3 rows x 5 columns] + + You can assign a Series to an existing column. + + >>> df["new_age"] = bpd.Series([29, 39, 19], index=[1, 2, 0]) + >>> df + name age location country new_age + 0 alpha 20 WA USA 19 + 1 beta 30 NY USA 29 + 2 gamma 40 CA USA 39 + + [3 rows x 5 columns] + + Args: + key (column index): + It can be a new column to be inserted, or an existing column to + be modified. 
+ value (scalar or Series): + Value to be assigned to the column + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 04cc3990a4..9c6120fd6c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -165,13 +165,20 @@ def astype(self, dtype): Args: dtype (str or pandas.ExtensionDtype): - A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, - ``'int64[pyarrow]'``, ``'string'``, ``'string[pyarrow]'``, ``'timestamp[us, tz=UTC][pyarrow]'``, - ``'timestamp\[us\]\[pyarrow\]'``, ``'date32\[day\]\[pyarrow\]'``, ``'time64\[us\]\[pyarrow\]'``. - A pandas.ExtensionDtype include ``pandas.BooleanDtype()``, ``pandas.Float64Dtype()``, - ``pandas.Int64Dtype()``, ``pandas.StringDtype(storage="pyarrow")``, - ``pd.ArrowDtype(pa.date32())``, ``pd.ArrowDtype(pa.time64("us"))``, - ``pd.ArrowDtype(pa.timestamp("us"))``, ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. + A dtype supported by BigQuery DataFrame include ``'boolean'``, + ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, + ``'string'``, ``'string\\[pyarrow\\]'``, + ``'timestamp\\[us, tz=UTC\\]\\[pyarrow\\]'``, + ``'timestamp\\[us\\]\\[pyarrow\\]'``, + ``'date32\\[day\\]\\[pyarrow\\]'``, + ``'time64\\[us\\]\\[pyarrow\\]'``. + A pandas.ExtensionDtype include ``pandas.BooleanDtype()``, + ``pandas.Float64Dtype()``, ``pandas.Int64Dtype()``, + ``pandas.StringDtype(storage="pyarrow")``, + ``pd.ArrowDtype(pa.date32())``, + ``pd.ArrowDtype(pa.time64("us"))``, + ``pd.ArrowDtype(pa.timestamp("us"))``, + ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. Returns: same type as caller @@ -582,6 +589,18 @@ def dtypes(self): The result's index is the original DataFrame's columns. Columns with mixed types aren't supported yet in BigQuery DataFrames. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'float': [1.0], 'int': [1], 'string': ['foo']}) + >>> df.dtypes + float Float64 + int Int64 + string string[pyarrow] + dtype: object + Returns: A *pandas* Series with the data type of each column. """ @@ -643,9 +662,9 @@ def copy(self): >>> df.loc[df["b"] == 2, "b"] = 22 >>> df - a b - 0 1 22.0 - 1 3 4.0 + a b + 0 1 22 + 1 3 4 [2 rows x 2 columns] >>> df_copy @@ -1101,9 +1120,39 @@ def pipe( return common.pipe(self, func, *args, **kwargs) def __nonzero__(self): + """Returns the truth value of the object.""" raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) __bool__ = __nonzero__ + + def __getattr__(self, name: str): + """ + After regular attribute access, try looking up the name + This allows simpler access to columns for interactive use. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def equals(self, other) -> bool: + """ + Test whether two objects contain the same elements. + + This function allows two Series or DataFrames to be compared against + each other to see if they have the same shape and elements. NaNs in + the same location are considered equal. + + The row/column index do not need to have the same type, as long + as the values are considered equal. Corresponding columns must be of + the same dtype. + + Args: + other (Series or DataFrame): + The other Series or DataFrame to be compared with the first. + + Returns: + bool: True if all elements are the same in both objects, False + otherwise. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index e1cc8c5a53..ed4ca66f38 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -24,7 +24,7 @@ def any(self): Returns: Series or DataFrame: DataFrame or Series of boolean values, where a value is True if any element is True within its - respective group, False otherwise. + respective group; otherwise False. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -35,7 +35,7 @@ def all(self): Returns: Series or DataFrame: DataFrame or Series of boolean values, where a value is True if all elements are True within its - respective group, False otherwise. + respective group; otherwise False. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -339,7 +339,7 @@ def expanding(self, *args, **kwargs): Provides expanding functionality. Returns: - Series or DataFrame: A expanding grouper, providing expanding functionality per group. + Series or DataFrame: An expanding grouper, providing expanding functionality per group. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 9490f4608b..3f0175359a 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -8,7 +8,27 @@ class DatetimeProperties: @property def day(self): - """The day of the datetime.""" + """The day of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... 
pd.date_range("2000-01-01", periods=3, freq="D") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-02 00:00:00 + 2 2000-01-03 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.day + 0 1 + 1 2 + 2 3 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -18,62 +38,187 @@ def dayofweek(self): Return the day of the week. It is assumed the week starts on Monday, which is denoted by 0 and ends on Sunday which is denoted - by 6. This method is available on both Series with datetime - values (using the `dt` accessor) or DatetimeIndex. + by 6. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() + ... ) + >>> s.dt.dayofweek + 2016-12-31 00:00:00 5 + 2017-01-01 00:00:00 6 + 2017-01-02 00:00:00 0 + 2017-01-03 00:00:00 1 + 2017-01-04 00:00:00 2 + 2017-01-05 00:00:00 3 + 2017-01-06 00:00:00 4 + 2017-01-07 00:00:00 5 + 2017-01-08 00:00:00 6 + dtype: Int64 Returns: - Series or Index: Containing integers indicating the day number. + Series: Containing integers indicating the day number. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def date(self): - """Returns numpy array of Python :class:`datetime.date` objects. - - Namely, the date part of Timestamps without time and + """Returns a Series with the date part of Timestamps without time and timezone information. .. warning:: This method returns a Series whereas pandas returns a numpy array. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%d/%m/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-01-02 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.date + 0 2020-01-01 + 1 2020-01-02 + dtype: date32[day][pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def hour(self): - """The hours of the datetime.""" + """The hours of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.hour + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def minute(self): - """The minutes of the datetime.""" + """The minutes of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="min") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:01:00 + 2 2000-01-01 00:02:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.minute + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def month(self): - """The month as January=1, December=12.""" + """The month as January=1, December=12. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="M") + ... 
) + >>> s + 0 2000-01-31 00:00:00 + 1 2000-02-29 00:00:00 + 2 2000-03-31 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.month + 0 1 + 1 2 + 2 3 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def second(self): - """The seconds of the datetime.""" + """The seconds of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) + >>> s + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: timestamp[us][pyarrow] + >>> s.dt.second + 0 0 + 1 1 + 2 2 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def time(self): - """Returns numpy array of :class:`datetime.time` objects. - - The time part of the Timestamps. + """Returns a Series with the time part of the Timestamps. .. warning:: This method returns a Series whereas pandas returns a numpy array. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.time + 0 10:00:00 + 1 11:00:00 + dtype: time64[us][pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -82,16 +227,47 @@ def time(self): def quarter(self): """The quarter of the date. - .. warning:: - This method returns a Series whereas pandas returns - a numpy array. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-04-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.quarter + 0 1 + 1 2 + dtype: Int64 """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def year(self): - """The year of the datetime.""" + """The year of the datetime. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="Y") + ... ) + >>> s + 0 2000-12-31 00:00:00 + 1 2001-12-31 00:00:00 + 2 2002-12-31 00:00:00 + dtype: timestamp[us][pyarrow] + >>> s.dt.year + 0 2000 + 1 2001 + 2 2002 + dtype: Int64 + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -99,6 +275,19 @@ def year(self): def tz(self): """Return the timezone. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.tz + datetime.timezone.utc + Returns: datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None """ @@ -109,6 +298,19 @@ def tz(self): def unit(self) -> str: """Returns the unit of time precision. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = bpd.to_datetime(s, utc=True, format="%m/%d/%Y %H:%M:%S%Ez") + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] + >>> s.dt.unit + 'us' + Returns: Unit as string (eg. "us"). """ diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 7f5761e45b..eb6b9161fc 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -83,7 +83,7 @@ def copy( name (Label, optional): Set name for new object. Returns: - Index: Index refer to new object which is a copy of this object. + Index: Index reference to new object, which is a copy of this object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -148,7 +148,7 @@ def isin(self, values): """ Return a boolean array where the index values are in `values`. - Compute boolean array of whether each index value is found in the + Compute boolean array to check whether each index value is found in the passed set of values. The length of the returned boolean array matches the length of the index. @@ -195,7 +195,7 @@ def max(self): def argmin(self) -> int: """ - Return int position of the smallest value in the Series. + Return int position of the smallest value in the series. If the minimum is achieved in multiple locations, the first row position is returned. @@ -264,7 +264,7 @@ def value_counts( Args: normalize (bool, default False): - If True then the object returned will contain the relative + If True, then the object returned will contain the relative frequencies of the unique values. sort (bool, default True): Sort by frequencies. 
@@ -316,7 +316,7 @@ def drop(self, labels) -> Index: labels (array-like or scalar): Returns: - Index: Will be same type as self + Index: Will be same type as self. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py new file mode 100644 index 0000000000..a882aa40e3 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -0,0 +1,88 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/multi.py +from __future__ import annotations + +from typing import Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.base + +from bigframes import constants + + +class MultiIndex(bigframes_vendored.pandas.core.indexes.base.Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + """ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + tuples (list / sequence of tuple-likes): + Each tuple is the index of one row/column. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. 
+ + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + arrays (list / sequence of array-likes): + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. + + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index fbd1d2d052..6ba3950a76 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -76,10 +76,20 @@ def cut( 3 {'left_exclusive': 5, 'right_inclusive': 20} dtype: struct[pyarrow] + Cut with an iterable of ints: + + >>> bins_ints = [0, 1, 5, 20] + >>> bpd.cut(s, bins=bins_ints) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): + bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. int: Defines the number of equal-width bins in the range of `x`. 
The @@ -88,6 +98,10 @@ def cut( pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. It's important to ensure that these bins are non-overlapping. + + Iterable of numerics: Defines the exact bins by using the interval + between each item and its following item. The items must be monotonically + increasing. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 785755a562..46bc9714f8 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -6,6 +6,7 @@ from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING from bigframes_vendored.pandas.core.generic import NDFrame +import numpy import numpy as np from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer @@ -119,13 +120,15 @@ def shape(self): def dtype(self): """ Return the dtype object of the underlying data. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - @property - def dtypes(self): - """ - Return the dtype object of the underlying data. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s.dtype + Int64Dtype() """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -173,6 +176,31 @@ def name(self) -> Hashable: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def hasnans(self) -> bool: + """ + Return True if there are any NaNs. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3, None]) + >>> s + 0 1.0 + 1 2.0 + 2 3.0 + 3 + dtype: Float64 + >>> s.hasnans + True + + Returns: + bool + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def T(self) -> Series: """Return the transpose, which is by definition self. @@ -817,6 +845,38 @@ def corr(self, other, method="pearson", min_periods=None) -> float: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def autocorr(self, lag: int = 1) -> float: + """ + Compute the lag-N autocorrelation. + + This method computes the Pearson correlation between + the Series and its shifted self. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.25, 0.5, 0.2, -0.05]) + >>> s.autocorr() # doctest: +ELLIPSIS + 0.10355... + >>> s.autocorr(lag=2) + -1.0 + + If the Pearson correlation is not well defined, then 'NaN' is returned. + + >>> s = bpd.Series([1, 0, 0, 0]) + >>> s.autocorr() + nan + + Args: + lag (int, default 1): + Number of lags to apply before performing autocorrelation. + + Returns: + float: The Pearson correlation between self and self.shift(lag). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cov( self, other, @@ -902,13 +962,13 @@ def dot(self, other) -> Series | np.ndarray: def __matmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return NotImplemented def __rmatmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. 
""" return NotImplemented @@ -2114,6 +2174,55 @@ def add(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __add__(self, other): + """Get addition of Series and other, element-wise, using operator `+`. + + Equivalent to `Series.add(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + elk 1.5 + moose 2.6 + dtype: Float64 + + You can add a scalar. + + >>> s + 1.5 + elk 3.0 + moose 4.1 + dtype: Float64 + + You can add another Series with index aligned. + + >>> delta = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + delta + elk 3.0 + moose 5.2 + dtype: Float64 + + Adding any mis-aligned index will result in invalid values. + + >>> delta = bpd.Series([1.5, 2.6], index=['moose', 'bison']) + >>> s + delta + elk + moose 4.1 + bison + dtype: Float64 + + Args: + other (scalar or Series): + Object to be added to the Series. + + Returns: + Series: The result of adding `other` to Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def radd(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator radd). @@ -2129,6 +2238,20 @@ def radd(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __radd__(self, other): + """Get addition of Series and other, element-wise, using operator `+`. + + Equivalent to `Series.radd(other)`. + + Args: + other (scalar or Series): + Object to which Series should be added. + + Returns: + Series: The result of adding Series to `other`. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def sub( self, other, @@ -2147,6 +2270,55 @@ def sub( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __sub__(self, other): + """Get subtraction of other from Series, element-wise, using operator `-`. 
+ + Equivalent to `Series.sub(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1.5, 2.6], index=['elk', 'moose']) + >>> s + elk 1.5 + moose 2.6 + dtype: Float64 + + You can subtract a scalar. + + >>> s - 1.5 + elk 0.0 + moose 1.1 + dtype: Float64 + + You can subtract another Series with index aligned. + + >>> delta = bpd.Series([0.5, 1.0], index=['elk', 'moose']) + >>> s - delta + elk 1.0 + moose 1.6 + dtype: Float64 + + Subtracting any mis-aligned index will result in invalid values. + + >>> delta = bpd.Series([0.5, 1.0], index=['moose', 'bison']) + >>> s - delta + elk <NA> + moose 2.1 + bison <NA> + dtype: Float64 + + Args: + other (scalar or Series): + Object to subtract from the Series. + + Returns: + Series: The result of subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rsub(self, other) -> Series: """Return subtraction of Series and other, element-wise (binary operator rsub). @@ -2162,6 +2334,20 @@ def rsub(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rsub__(self, other): + """Get subtraction of Series from other, element-wise, using operator `-`. + + Equivalent to `Series.rsub(other)`. + + Args: + other (scalar or Series): + Object to subtract the Series from. + + Returns: + Series: The result of subtraction. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -2177,6 +2363,44 @@ def mul(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mul__(self, other): + """ + Get multiplication of Series with other, element-wise, using operator `*`. + + Equivalent to `Series.mul(other)`.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can multiply with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s * 3 + 0 3 + 1 6 + 2 9 + dtype: Int64 + + You can also multiply with another Series: + + >>> s1 = bpd.Series([2, 3, 4]) + >>> s * s1 + 0 2 + 1 6 + 2 12 + dtype: Int64 + + Args: + other (scalar or Series): + Object to multiply with the Series. + + Returns: + Series: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rmul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -2191,6 +2415,21 @@ def rmul(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmul__(self, other): + """ + Get multiplication of other with Series, element-wise, using operator `*`. + + Equivalent to `Series.rmul(other)`. + + Args: + other (scalar or Series): + Object to multiply the Series with. + + Returns: + Series: The result of the multiplication. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def truediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator truediv). @@ -2206,6 +2445,44 @@ def truediv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __truediv__(self, other): + """ + Get division of Series by other, element-wise, using operator `/`. + + Equivalent to `Series.truediv(other)`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s / 2 + 0 0.5 + 1 1.0 + 2 1.5 + dtype: Float64 + + You can also divide by another Series: + + >>> denominator = bpd.Series([2, 3, 4]) + >>> s / denominator + 0 0.5 + 1 0.666667 + 2 0.75 + dtype: Float64 + + Args: + other (scalar or Series): + Object to divide the Series by. + + Returns: + Series: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rtruediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator rtruediv). @@ -2221,6 +2498,21 @@ def rtruediv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rtruediv__(self, other): + """ + Get division of other by Series, element-wise, using operator `/`. + + Equivalent to `Series.rtruediv(other)`. + + Args: + other (scalar or Series): + Object to divide by the Series. + + Returns: + Series: The result of the division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def floordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator floordiv). @@ -2236,6 +2528,44 @@ def floordiv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __floordiv__(self, other): + """ + Get integer division of Series by other, using arithmetic operator `//`. + + Equivalent to `Series.floordiv(other)`.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can divide by a scalar: + + >>> s = bpd.Series([15, 30, 45]) + >>> s // 2 + 0 7 + 1 15 + 2 22 + dtype: Int64 + + You can also divide by another Series: + + >>> divisor = bpd.Series([3, 4, 4]) + >>> s // divisor + 0 5 + 1 7 + 2 11 + dtype: Int64 + + Args: + other (scalar or Series): + Object to divide the Series by. + + Returns: + Series: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rfloordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator rfloordiv). @@ -2251,6 +2581,21 @@ def rfloordiv(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rfloordiv__(self, other): + """ + Get integer division of other by Series, using arithmetic operator `//`. + + Equivalent to `Series.rfloordiv(other)`. + + Args: + other (scalar or Series): + Object to divide by the Series. + + Returns: + Series: The result of the integer division. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -2266,6 +2611,44 @@ def mod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __mod__(self, other): + """ + Get modulo of Series with other, element-wise, using operator `%`. + + Equivalent to `Series.mod(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can modulo with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s % 3 + 0 1 + 1 2 + 2 0 + dtype: Int64 + + You can also modulo with another Series: + + >>> modulo = bpd.Series([3, 3, 3]) + >>> s % modulo + 0 1 + 1 2 + 2 0 + dtype: Int64 + + Args: + other (scalar or Series): + Object to modulo the Series by.
+ + Returns: + Series: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rmod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -2281,6 +2664,21 @@ def rmod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rmod__(self, other): + """ + Get modulo of other with Series, element-wise, using operator `%`. + + Equivalent to `Series.rmod(other)`. + + Args: + other (scalar or Series): + Object to modulo by the Series. + + Returns: + Series: The result of the modulo. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pow(self, other) -> Series: """Return Exponential power of series and other, element-wise (binary operator `pow`). @@ -2296,6 +2694,45 @@ def pow(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __pow__(self, other): + """ + Get exponentiation of Series with other, element-wise, using operator + `**`. + + Equivalent to `Series.pow(other)`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can exponentiate with a scalar: + + >>> s = bpd.Series([1, 2, 3]) + >>> s ** 2 + 0 1 + 1 4 + 2 9 + dtype: Int64 + + You can also exponentiate with another Series: + + >>> exponent = bpd.Series([3, 2, 1]) + >>> s ** exponent + 0 1 + 1 4 + 2 3 + dtype: Int64 + + Args: + other (scalar or Series): + Object to exponentiate the Series with. + + Returns: + Series: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rpow(self, other) -> Series: """Return Exponential power of series and other, element-wise (binary operator `rpow`). 
@@ -2311,6 +2748,22 @@ def rpow(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __rpow__(self, other): + """ + Get exponentiation of other with Series, element-wise, using operator + `**`. + + Equivalent to `Series.rpow(other)`. + + Args: + other (scalar or Series): + Object to exponentiate with the Series. + + Returns: + Series: The result of the exponentiation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def divmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator divmod). @@ -2341,6 +2794,119 @@ def rdivmod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine_first(self, other) -> Series: + """ + Update null elements with value in the same location in 'other'. + + Combine two Series objects by filling null values in one Series with + non-null values from the other Series. Result index will be the union + of the two indexes. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series([1, np.nan]) + >>> s2 = bpd.Series([3, 4, 5]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + 2 5.0 + dtype: Float64 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> s1 = bpd.Series({'falcon': np.nan, 'eagle': 160.0}) + >>> s2 = bpd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1.combine_first(s2) + falcon + eagle 160.0 + duck 30.0 + dtype: Float64 + + Args: + other (Series): + The value(s) to be used for filling null values. + + Returns: + Series: The result of combining the provided Series with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def update(self, other) -> None: + """ + Modify Series in place using values from passed Series. + + Uses non-NA values from passed Series to make updates. 
Aligns + on index. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + >>> s = bpd.Series(['a', 'b', 'c']) + >>> s.update(bpd.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: string + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, np.nan, 6], dtype=pd.Int64Dtype())) + >>> s + 0 4 + 1 2 + 2 6 + dtype: Int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update([4, np.nan, 6]) + >>> s + 0 4.0 + 1 2.0 + 2 6.0 + dtype: Float64 + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: Int64 + + Args: + other (Series, or object coercible into Series) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def all( self, ): @@ -2836,7 +3402,7 @@ def unstack(self, level): def argmax(self): """ - Return int position of the smallest value in the Series. + Return int position of the smallest value in the series. If the minimum is achieved in multiple locations, the first row position is returned. @@ -3308,6 +3874,22 @@ def loc(self): def iat(self): """Access a single value for a row/column pair by integer position. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(bpd.Series([1, 2, 3])) + >>> bpd.options.display.progress_bar = None + >>> s + 0 1 + 1 2 + 2 3 + dtype: Int64 + + Get value at specified row number + + >>> s.iat[1] + 2 + Returns: bigframes.core.indexers.IatSeriesIndexer: Indexers object. 
""" @@ -3317,6 +3899,23 @@ def iat(self): def at(self): """Access a single value for a row/column label pair. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series([1, 2, 3], index=['A', 'B', 'C']) + >>> bpd.options.display.progress_bar = None + >>> s + A 1 + B 2 + C 3 + dtype: Int64 + + Get value at specified row label + + >>> s.at['B'] + 2 + + Returns: bigframes.core.indexers.AtSeriesIndexer: Indexers object. """ @@ -3369,3 +3968,172 @@ def size(self) -> int: int: Return the number of elements in the underlying data. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __array__(self, dtype=None) -> numpy.ndarray: + """ + Returns the values as NumPy array. + + Equivalent to `Series.to_numpy(dtype)`. + + Users should not call this directly. Rather, it is invoked by + `numpy.array` and `numpy.asarray`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> import numpy as np + + >>> ser = bpd.Series([1, 2, 3]) + + >>> np.asarray(ser) + array([1, 2, 3]) + + Args: + dtype (str or numpy.dtype, optional): + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. + + Returns: + numpy.ndarray: + The values in the series converted to a `numpy.ndarray` with the + specified dtype. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __len__(self): + """Returns number of values in the Series, serves `len` operator. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> len(s) + 3 + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __invert__(self): + """ + Returns the logical inversion (binary NOT) of the Series, element-wise + using operator `~`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series([True, False, True]) + >>> ~ser + 0 False + 1 True + 2 False + dtype: boolean + + Returns: + Series: The inverted values in the series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __and__(self, other): + """Get bitwise AND of Series and other, element-wise, using operator `&`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + + You can operate with a scalar. + + >>> s & 6 + 0 0 + 1 0 + 2 2 + 3 2 + dtype: Int64 + + You can operate with another Series. + + >>> s1 = bpd.Series([5, 6, 7, 8]) + >>> s & s1 + 0 0 + 1 0 + 2 2 + 3 0 + dtype: Int64 + + Args: + other (scalar or Series): + Object to bitwise AND with the Series. + + Returns: + Series: The result of the operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __or__(self, other): + """Get bitwise OR of Series and other, element-wise, using operator `|`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + + You can operate with a scalar. + + >>> s | 6 + 0 6 + 1 7 + 2 6 + 3 7 + dtype: Int64 + + You can operate with another Series. + + >>> s1 = bpd.Series([5, 6, 7, 8]) + >>> s | s1 + 0 5 + 1 7 + 2 7 + 3 11 + dtype: Int64 + + Args: + other (scalar or Series): + Object to bitwise OR with the Series. + + Returns: + Series: The result of the operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __getitem__(self, indexer): + """Gets the specified index from the Series. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([15, 30, 45]) + >>> s[1] + 30 + >>> s[0:2] + 0 15 + 1 30 + dtype: Int64 + + Args: + indexer (int or slice): + Index or slice of indices. + + Returns: + Series or Value: Value(s) at the requested index(es). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index ecdd9547d5..5bb69dc1f2 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -20,19 +20,57 @@ def extract(self, pat: str, flags: int = 0): For each subject string in the Series, extract groups from the first match of regular expression `pat`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be `NaN`. + + >>> s = bpd.Series(['a1', 'b2', 'c3']) + >>> s.str.extract(r'([ab])(\\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 <NA> <NA> + + [3 rows x 2 columns] + + Named groups will become column names in the result. + + >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\\d)') + letter digit + 0 a 1 + 1 b 2 + 2 <NA> <NA> + + [3 rows x 2 columns] + + A pattern with one group will return a DataFrame with one column. + + >>> s.str.extract(r'[ab](\\d)') + 0 + 0 1 + 1 2 + 2 <NA> + + [3 rows x 1 columns] + Args: - pat: + pat (str): Regular expression pattern with capturing groups. - flags: + flags (int, default 0 (no flags)): Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that modify regular expression matching for things like case, spaces, etc. For more details, see :mod:`re`. Returns: - A DataFrame with one row for each subject string, and one - column for each group.
Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. + bigframes.dataframe.DataFrame: + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -43,12 +81,24 @@ def find(self, sub, start: int = 0, end=None): substring is fully contained between [start:end]. Return -1 on failure. Equivalent to standard :meth:`str.find`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["cow_", "duck_", "do_ve"]) + >>> ser.str.find("_") + 0 3 + 1 4 + 2 2 + dtype: Int64 + Args: - sub: + sub (str): Substring being searched. start (int, default 0): Left edge index. - end (None): + end (int, default None): Right edge index. Returns: @@ -62,6 +112,20 @@ def len(self): The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Returns the length (number of characters) in a string. + + >>> s = bpd.Series(['dog', '', bpd.NA]) + >>> s.str.len() + 0 3 + 1 0 + 2 + dtype: Int64 + Returns: bigframes.series.Series: A Series or Index of integer values indicating the length of each element in the Series or Index. @@ -74,6 +138,22 @@ def lower(self): Equivalent to :meth:`str.lower`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ... 'SwApCaSe']) + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: string + Returns: bigframes.series.Series: Series with lowercase. 
""" @@ -83,6 +163,36 @@ def lower(self): def slice(self, start=None, stop=None): """Slice substrings from each element in the Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["koala", "dog", "chameleon"]) + >>> s + 0 koala + 1 dog + 2 chameleon + dtype: string + + >>> s.str.slice(start=1) + 0 oala + 1 og + 2 hameleon + dtype: string + + >>> s.str.slice(stop=2) + 0 ko + 1 do + 2 ch + dtype: string + + >>> s.str.slice(start=2, stop=5) + 0 ala + 1 g + 2 ame + dtype: string + Args: start (int, optional): Start position for slice operation. @@ -106,6 +216,27 @@ def strip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.strip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.strip() + 0 Ant + 1 Bee + 2 Cat + 3 + dtype: string + Returns: bigframes.series.Series: Series or Index without leading and trailing characters. @@ -118,6 +249,22 @@ def upper(self): Equivalent to :meth:`str.upper`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ... 'SwApCaSe']) + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: string + Returns: bigframes.series.Series: Series with uppercase strings. """ @@ -131,6 +278,19 @@ def isnumeric(self): :meth:`str.isnumeric` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -145,6 +305,19 @@ def isalpha(self): :meth:`str.isalpha` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series with the same length as the originalSeries/Index. """ @@ -158,6 +331,19 @@ def isdigit(self): :meth:`str.isdigit` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['23', '1a', '1/5', '']) + >>> s.str.isdigit() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series with the same length as the originalSeries/Index. """ @@ -171,6 +357,30 @@ def isalnum(self): :meth:`str.isalnum` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series(['one', 'one1', '1', '']) + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: boolean + + Note that checks against characters mixed with any additional + punctuation or whitespace will evaluate to false for an alphanumeric + check. 
+ + >>> s2 = bpd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -185,6 +395,18 @@ def isspace(self): :meth:`str.isspace` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([' ', '\\t\\r\\n ', '']) + >>> s.str.isspace() + 0 True + 1 True + 2 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -199,6 +421,19 @@ def islower(self): :meth:`str.islower` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + >>> s.str.islower() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -213,6 +448,19 @@ def isupper(self): :meth:`str.isupper` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + >>> s.str.isupper() + 0 False + 1 False + 2 True + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -227,6 +475,22 @@ def isdecimal(self): :meth:`str.isdecimal` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + The `isdecimal` method checks for characters used to form numbers in + base 10. + + >>> s = bpd.Series(['23', '³', '⅕', '']) + >>> s.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: boolean + Returns: bigframes.series.Series: Series or Index of boolean values with the same length as the original Series/Index. @@ -242,6 +506,27 @@ def rstrip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.rstrip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.rstrip() + 0 Ant + 1 Bee + 2 Cat + 3 + dtype: string + Returns: bigframes.series.Series: Series without trailing characters. """ @@ -256,6 +541,28 @@ def lstrip(self): Replaces any non-strings in Series with NaNs. Equivalent to :meth:`str.lstrip`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + + >>> s.str.lstrip() + 0 Ant + 1 Bee + 2 Cat + + 3 + dtype: string + Returns: bigframes.series.Series: Series without leading characters. """ @@ -265,6 +572,24 @@ def lstrip(self): def repeat(self, repeats: int): """Duplicate each string in the Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: string + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: string + Args: repeats : int or sequence of int Same value for all (int) or different value per (sequence). @@ -281,6 +606,22 @@ def capitalize(self): Equivalent to :meth:`str.capitalize`. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['lower', + ... 'CAPITALS', + ... 'this is a sentence', + ... 'SwApCaSe']) + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: string + Returns: bigframes.series.Series: Series with capitalized strings. """ @@ -293,8 +634,43 @@ def cat(self, others, *, join): If `others` is specified, this function concatenates the Series/Index and elements of `others` element-wise. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can concatenate each string in a Series to another string. + + >>> s = bpd.Series(['Jane', 'John']) + >>> s.str.cat(" Doe") + 0 Jane Doe + 1 John Doe + dtype: string + + You can concatenate another Series. By default left join is performed to + align the corresponding elements. + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"])) + 0 Jane Doe + 1 John Foe + dtype: string + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"], index=[2, 0, 1])) + 0 Jane Foe + 1 John Roe + dtype: string + + You can enforce an outer join. + + >>> s.str.cat(bpd.Series([" Doe", " Foe", " Roe"]), join="outer") + 0 Jane Doe + 1 John Foe + 2 + dtype: string + Args: - others (Series): + others (str or Series): + A string or a Series of strings. join ({'left', 'outer'}, default 'left'): Determines the join-style between the calling Series and any @@ -315,6 +691,77 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True Return boolean Series or Index based on whether a given pattern or regex is contained within a string of a Series or Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Returning a Series of booleans using only a literal pattern.
+ + >>> s1 = bpd.Series(['Mouse', 'dog', 'house and parrot', '23', None]) + >>> s1.str.contains('og') + 0 False + 1 True + 2 False + 3 False + 4 + dtype: boolean + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True) + 0 False + 1 False + 2 False + 3 False + 4 + dtype: boolean + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 + dtype: boolean + + Ignoring case sensitivity using `flags` with regex. + + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 + dtype: boolean + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 + dtype: boolean + + Ensure `pat` is not a literal pattern when `regex` is set to True. + Note in the following example one might expect only *s2[1]* and *s2[3]* + to return `True`. However, '.0' as a regex matches any character + followed by a 0. + + >>> s2 = bpd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: boolean + Args: pat (str, re.Pattern): Character sequence or regular expression. @@ -348,6 +795,32 @@ def replace( Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + When *pat* is a string and *regex* is True, the given *pat* is compiled + as a regex. When *repl* is a string, it replaces matching regex patterns + as with `re.sub()`.
NaN value(s) in the Series are left as is: + + >>> s = bpd.Series(['foo', 'fuz', bpd.NA]) + >>> s.str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 + dtype: string + + When *pat* is a string and *regex* is False, every *pat* is replaced + with *repl* as with `str.replace()`: + + >>> s = bpd.Series(['f.o', 'fuz', bpd.NA]) + >>> s.str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz + 2 + dtype: string + Args: pat (str, re.Pattern): String can be a character sequence or regular expression. @@ -384,6 +857,33 @@ def startswith( """ Test if the start of each string element matches a pattern. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'Bear', 'caT', bpd.NA]) + >>> s + 0 bat + 1 Bear + 2 caT + 3 + dtype: string + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 + dtype: boolean + + >>> s.str.startswith(('b', 'B')) + 0 True + 1 True + 2 False + 3 + dtype: boolean + Args: pat (str, tuple[str, ...]): Character sequence or tuple of strings. Regular expressions are not @@ -402,6 +902,33 @@ def endswith( """ Test if the end of each string element matches a pattern. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bat', 'bear', 'caT', bpd.NA]) + >>> s + 0 bat + 1 bear + 2 caT + 3 + dtype: string + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 + dtype: boolean + + >>> s.str.endswith(('t', 'T')) + 0 True + 1 False + 2 True + 3 + dtype: boolean + Args: pat (str, tuple[str, ...]): Character sequence or tuple of strings. Regular expressions are not @@ -417,6 +944,18 @@ def match(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string starts with a match of a regular expression. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["horse", "eagle", "donkey"]) + >>> ser.str.match("e") + 0 False + 1 True + 2 False + dtype: boolean + Args: pat (str): Character sequence or regular expression. @@ -434,6 +973,18 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string entirely matches a regular expression. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(["cat", "duck", "dove"]) + >>> ser.str.fullmatch(r'd.+') + 0 False + 1 True + 2 True + dtype: boolean + Args: pat (str): Character sequence or regular expression. @@ -454,6 +1005,18 @@ def get(self, i: int): Extract element from lists, tuples, dict, or strings in each element in the Series/Index. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["apple", "banana", "fig"]) + >>> s.str.get(3) + 0 l + 1 a + 2 + dtype: string + Args: i (int): Position or key of element to extract. @@ -472,6 +1035,32 @@ def pad( """ Pad strings in the Series/Index up to width. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: string + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: string + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: string + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -494,6 +1083,18 @@ def ljust( """ Pad right side of strings in the Series/Index up to width. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.ljust(8, fillchar='.') + 0 dog..... + 1 bird.... + 2 mouse... + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -514,6 +1115,18 @@ def rjust( """ Pad left side of strings in the Series/Index up to width. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.rjust(8, fillchar='.') + 0 .....dog + 1 ....bird + 2 ...mouse + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled @@ -538,6 +1151,26 @@ def zfill( in the Series/Index with length greater or equal to `width` are unchanged. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['-1', '1', '1000', bpd.NA]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 + dtype: string + + >>> s.str.zfill(3) + 0 -01 + 1 001 + 2 1000 + 3 + dtype: string + Args: width (int): Minimum length of resulting string; strings with length less @@ -558,6 +1191,18 @@ def center( Equivalent to :meth:`str.center`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> ser = bpd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.center(8, fillchar='.') + 0 ..dog... + 1 ..bird.. + 2 .mouse.. + dtype: string + Args: width (int): Minimum width of resulting string; additional characters will be filled diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 442220f237..3d460b2b16 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -29,7 +29,7 @@ def to_datetime( .. 
note:: The format strings for specifying datetime representations in BigQuery and pandas are not completely identical. Ensure that the format string provided is compatible - with BigQuery. + with BigQuery (https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time). **Examples:** diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 768328e552..fd8db7a227 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -81,9 +81,9 @@ class ClassifierMixin: def score(self, X, y): """Return the mean accuracy on the given test data and labels. - In multi-label classification, this is the subset accuracy - which is a harsh metric since you require for each sample that - each label set be correctly predicted. + In multi-label classification, this is the subset accuracy, + which is a harsh metric since you require that + each label set be correctly predicted for each sample. .. note:: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index f126e0439d..71e53bf4a9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -22,7 +22,7 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Args: n_components (int, float or None, default None): - Number of components to keep. If n_components is not set all + Number of components to keep. If n_components is not set, all components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. 
svd_solver ("full", "randomized" or "auto", default "auto"): @@ -75,7 +75,7 @@ def predict(self, X): Series or a DataFrame to predict. Returns: - bigframes.dataframe.DataFrame: predicted DataFrames.""" + bigframes.dataframe.DataFrame: Predicted DataFrames.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property @@ -90,7 +90,7 @@ def components_(self): numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. - categorical_value: An list of mappings containing information about categorical features. Each mapping contains the following fields: + categorical_value: A list of mappings containing information about categorical features. Each mapping contains the following fields: categorical_value.category: The name of each category. categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies.