diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index b98d68799a..e098468da6 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -7,15 +7,16 @@ branchProtectionRules: requiresCodeOwnerReviews: true requiresStrictStatusChecks: false requiredStatusCheckContexts: +# TODO(b/347075426): Restore owlbot as required check +# - 'OwlBot Post Processor' - 'conventionalcommits.org' - 'cla/google' - - 'OwlBot Post Processor' - 'docs' - 'lint' - 'unit (3.9)' - 'unit (3.10)' - 'unit (3.11)' - - 'unit (3.12)' + - 'unit (3.12)' - 'cover' - 'Kokoro presubmit' permissionRules: diff --git a/CHANGELOG.md b/CHANGELOG.md index d585b5b1c2..633f9930ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.10.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.9.0...v1.10.0) (2024-06-21) + + +### Features + +* Add dataframe.insert ([#770](https://github.com/googleapis/python-bigquery-dataframes/issues/770)) ([e8bab68](https://github.com/googleapis/python-bigquery-dataframes/commit/e8bab681a2d07636e5809e804f4fd81b0d582685)) +* Add groupby head API ([#791](https://github.com/googleapis/python-bigquery-dataframes/issues/791)) ([44202bc](https://github.com/googleapis/python-bigquery-dataframes/commit/44202bc3541df03154ea0b2cca8eac18094a91a9)) +* Add ml.preprocessing.PolynomialFeatures class ([#793](https://github.com/googleapis/python-bigquery-dataframes/issues/793)) ([b4fbb51](https://github.com/googleapis/python-bigquery-dataframes/commit/b4fbb518711922c09ac6f55f3b8f6ab57c89114b)) +* Bigframes.streaming module for continuous queries ([#703](https://github.com/googleapis/python-bigquery-dataframes/issues/703)) ([0433a1c](https://github.com/googleapis/python-bigquery-dataframes/commit/0433a1cff57fddda26b2c57adc0ea71f3fdd3201)) +* Include index columns in DataFrame.sql if they are named ([#788](https://github.com/googleapis/python-bigquery-dataframes/issues/788)) ([c8d16c0](https://github.com/googleapis/python-bigquery-dataframes/commit/c8d16c0f72a25bce854b80be517114e1603c947e)) + + +### Bug Fixes + +* Allow `__repr__` to work with uninitialed DataFrame/Series/Index ([#778](https://github.com/googleapis/python-bigquery-dataframes/issues/778)) ([e14c7a9](https://github.com/googleapis/python-bigquery-dataframes/commit/e14c7a9e7a9cb8847e0382b135fc06c7b82b872a)) +* Df.loc with the 2nd input as bigframes boolean Series ([#789](https://github.com/googleapis/python-bigquery-dataframes/issues/789)) ([a4ac82e](https://github.com/googleapis/python-bigquery-dataframes/commit/a4ac82e06221581ddfcfc1246a3e3cd65a8bb00e)) +* Ensure numpy version matches in `remote_function` deployment ([#798](https://github.com/googleapis/python-bigquery-dataframes/issues/798)) ([324d93c](https://github.com/googleapis/python-bigquery-dataframes/commit/324d93cb31191520b790bbbc501468b8d1d8467d)) +* Fix temp table creation retries by now throwing if table already exists. 
([#787](https://github.com/googleapis/python-bigquery-dataframes/issues/787)) ([0e57d1f](https://github.com/googleapis/python-bigquery-dataframes/commit/0e57d1f1f8a150ba6faac5f667bb5b4c78f4c0a3)) +* Self-join optimization doesn't needlessly invalidate caching ([#797](https://github.com/googleapis/python-bigquery-dataframes/issues/797)) ([1b96b80](https://github.com/googleapis/python-bigquery-dataframes/commit/1b96b8027a550e1601a5360f2af35d24a8806da9)) + ## [1.9.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.8.0...v1.9.0) (2024-06-10) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 6f841a36b3..ad79543cb8 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -70,6 +70,8 @@ def __init__( application_name: Optional[str] = None, kms_key_name: Optional[str] = None, skip_bq_connection_check: bool = False, + *, + _strictly_ordered: bool = True, ): self._credentials = credentials self._project = project @@ -80,6 +82,8 @@ def __init__( self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check self._session_started = False + # Determines the ordering strictness for the session. For internal use only. + self._strictly_ordered_internal = _strictly_ordered @property def application_name(self) -> Optional[str]: @@ -235,3 +239,8 @@ def kms_key_name(self, value: str): raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="kms_key_name")) self._kms_key_name = value + + @property + def _strictly_ordered(self) -> bool: + """Internal use only. Controls whether total row order is always maintained for DataFrame/Series.""" + return self._strictly_ordered_internal diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e0b63b4a8c..89ef5f525e 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -23,12 +23,11 @@ import warnings import google.cloud.bigquery -import ibis.expr.types as ibis_types import pandas import pyarrow as pa import pyarrow.feather as pa_feather -import bigframes.core.compile as compiling +import bigframes.core.compile import bigframes.core.expression as ex import bigframes.core.guid import bigframes.core.join_def as join_def @@ -60,30 +59,6 @@ class ArrayValue: node: nodes.BigFrameNode - # DO NOT use, on deprecation path - @classmethod - def from_ibis( - cls, - session: Session, - table: ibis_types.Table, - columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Sequence[ibis_types.Value], - ordering: orderings.ExpressionOrdering, - ): - import bigframes.core.compile.ibis_types - - node = nodes.ReadGbqNode( - table=table, - table_session=session, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) - for column in columns - ), - hidden_ordering_columns=tuple(hidden_ordering_columns), - ordering=ordering, - ) - return cls(node) - @classmethod def from_pyarrow(cls, arrow_table: pa.Table, session: Session): adapted_table = local_data.adapt_pa_table(arrow_table) @@ -167,12 +142,7 @@ def schema(self) -> schemata.ArraySchema: @functools.cached_property def _compiled_schema(self) -> schemata.ArraySchema: - compiled = self._compile_unordered() - items = tuple( - schemata.SchemaItem(id, compiled.get_column_type(id)) - for id in compiled.column_ids - ) - return schemata.ArraySchema(items) + return 
bigframes.core.compile.test_only_ibis_inferred_schema(self.node) def as_cached( self: ArrayValue, @@ -194,21 +164,11 @@ def as_cached( def _try_evaluate_local(self): """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" - import ibis - - return ibis.pandas.connect({}).execute( - self._compile_ordered()._to_ibis_expr(ordering_mode="unordered") - ) + return bigframes.core.compile.test_only_try_evaluate(self.node) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self.schema.get_type(key) - def _compile_ordered(self) -> compiling.OrderedIR: - return compiling.compile_ordered_ir(self.node) - - def _compile_unordered(self) -> compiling.UnorderedIR: - return compiling.compile_unordered_ir(self.node) - def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" return ArrayValue(nodes.RowCountNode(child=self.node)) @@ -545,11 +505,11 @@ def try_align_as_projection( join_type: join_def.JoinType, mappings: typing.Tuple[join_def.JoinColumnMapping, ...], ) -> typing.Optional[ArrayValue]: - left_side = bigframes.core.rewrite.SquashedSelect.from_node(self.node) - right_side = bigframes.core.rewrite.SquashedSelect.from_node(other.node) - result = left_side.maybe_merge(right_side, join_type, mappings) + result = bigframes.core.rewrite.join_as_projection( + self.node, other.node, mappings, join_type + ) if result is not None: - return ArrayValue(result.expand()) + return ArrayValue(result) return None def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: @@ -568,7 +528,3 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: The row numbers of result is non-deterministic, avoid to use. """ return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) - - def merge_projections(self) -> ArrayValue: - new_node = bigframes.core.rewrite.maybe_squash_projection(self.node) - return ArrayValue(new_node) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 301bcc20e9..598c32670e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -40,6 +40,7 @@ import bigframes.constants import bigframes.constants as constants import bigframes.core as core +import bigframes.core.compile.googlesql as googlesql import bigframes.core.expression as ex import bigframes.core.expression as scalars import bigframes.core.guid as guid @@ -209,7 +210,7 @@ def shape(self) -> typing.Tuple[int, int]: except Exception: pass - iter, _ = self.session._execute(row_count_expr, sorted=False) + iter, _ = self.session._execute(row_count_expr, ordered=False) row_count = next(iter)[0] return (row_count, len(self.value_columns)) @@ -518,7 +519,7 @@ def to_pandas_batches( dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) _, query_job = self.session._query_to_destination( - self.session._to_sql(self.expr, sorted=True), + self.session._to_sql(self.expr, ordered=self.session._strictly_ordered), list(self.index_columns), api_name="cached", do_clustering=False, @@ -553,7 +554,7 @@ def _materialize_local( """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. 
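A note on the `sorted=` → `ordered=` rename above and the new session-level default: the internal `_strictly_ordered` option now supplies the default ordering behavior for downloads, while `to_pandas` can still override it per call. A minimal sketch of how these knobs interact; the table id is a placeholder, and `_strictly_ordered` is internal-only per the option's docstring:

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table id

# Per-call override: skipping the final ORDER BY can make the download cheaper.
unordered_pdf = df.to_pandas(ordered=False)

# With ordered left as None (the new default), the session-level setting decides,
# i.e. BigQueryOptions(_strictly_ordered=...), which is for internal use only.
default_pdf = df.to_pandas()
```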
_, query_job = self.session._query_to_destination( - self.session._to_sql(self.expr, sorted=materialize_options.ordered), + self.session._to_sql(self.expr, ordered=materialize_options.ordered), list(self.index_columns), api_name="cached", do_clustering=False, @@ -1384,6 +1385,26 @@ def _normalize_expression( raise ValueError("Unexpected number of value columns.") return expr.select_columns([*index_columns, *value_columns]) + def grouped_head( + self, + by_column_ids: typing.Sequence[str], + value_columns: typing.Sequence[str], + n: int, + ): + window_spec = window_specs.cumulative_rows(grouping_keys=tuple(by_column_ids)) + + block, result_id = self.apply_window_op( + value_columns[0], + agg_ops.rank_op, + window_spec=window_spec, + ) + + cond = ops.lt_op.as_expr(result_id, ex.const(n + 1)) + block, cond_id = block.project_expr(cond) + block = block.filter_by_id(cond_id) + if value_columns: + return block.select_columns(value_columns) + def slice( self, start: typing.Optional[int] = None, @@ -1716,7 +1737,7 @@ def transpose( original_row_index = ( original_row_index if original_row_index is not None - else self.index.to_pandas() + else self.index.to_pandas(ordered=True) ) original_row_count = len(original_row_index) if original_row_count > bigframes.constants.MAX_COLUMNS: @@ -2345,7 +2366,6 @@ def _get_rows_as_json_values(self) -> Block: # TODO(shobs): Replace direct SQL manipulation by structured expression # manipulation ordering_column_name = guid.generate_guid() - self.session._cache_with_offsets(self.expr) expr = self.expr.promote_offsets(ordering_column_name) expr_sql = self.session._to_sql(expr) @@ -2398,7 +2418,9 @@ def _get_rows_as_json_values(self) -> Block: select_columns = ( [ordering_column_name] + list(self.index_columns) + [row_json_column_name] ) - select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns]) + select_columns_csv = sql.csv( + [googlesql.identifier(col) for col in select_columns] + ) json_sql = f"""\ With T0 AS ( {textwrap.indent(expr_sql, " ")} @@ -2411,21 +2433,35 @@ def _get_rows_as_json_values(self) -> Block: "values", [{column_references_csv}], "indexlength", {index_columns_count}, "dtype", {pandas_row_dtype} - ) AS {sql.identifier(row_json_column_name)} FROM T0 + ) AS {googlesql.identifier(row_json_column_name)} FROM T0 ) SELECT {select_columns_csv} FROM T1 """ - ibis_table = self.session.ibis_client.sql(json_sql) - order_for_ibis_table = ordering.ExpressionOrdering.from_offset_col( - ordering_column_name - ) - expr = core.ArrayValue.from_ibis( - self.session, - ibis_table, - [ibis_table[col] for col in select_columns if col != ordering_column_name], - hidden_ordering_columns=[ibis_table[ordering_column_name]], - ordering=order_for_ibis_table, + # The only ways this code is used is through df.apply(axis=1) cope path + destination, query_job = self.session._query_to_destination( + json_sql, index_cols=[ordering_column_name], api_name="apply" + ) + if not destination: + raise ValueError(f"Query job {query_job} did not produce result table") + + new_schema = ( + self.expr.schema.select([*self.index_columns]) + .append( + bf_schema.SchemaItem( + row_json_column_name, bigframes.dtypes.STRING_DTYPE + ) + ) + .append( + bf_schema.SchemaItem(ordering_column_name, bigframes.dtypes.INT_DTYPE) + ) ) + + expr = core.ArrayValue.from_table( + self.session.bqclient.get_table(destination), + schema=new_schema, + session=self.session, + offsets_col=ordering_column_name, + ).drop_columns([ordering_column_name]) block = Block( expr, 
index_columns=self.index_columns, @@ -2474,7 +2510,7 @@ def column_ids(self) -> Sequence[str]: """Column(s) to use as row labels.""" return self._block._index_columns - def to_pandas(self) -> pd.Index: + def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: raise bigframes.exceptions.NullIndexError( @@ -2484,7 +2520,12 @@ def to_pandas(self) -> pd.Index: index_columns = list(self._block.index_columns) dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) - results, _ = self.session._execute(expr) + results, _ = self.session._execute( + expr, + ordered=ordered + if (ordered is not None) + else self.session._strictly_ordered, + ) df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index c3e2bd832a..4c105ed03b 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -11,13 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations -from bigframes.core.compile.compiled import OrderedIR, UnorderedIR -from bigframes.core.compile.compiler import compile_ordered_ir, compile_unordered_ir +from bigframes.core.compile.api import ( + compile_ordered, + compile_peek, + compile_raw, + compile_unordered, + test_only_ibis_inferred_schema, + test_only_try_evaluate, +) __all__ = [ - "compile_ordered_ir", - "compile_unordered_ir", - "OrderedIR", - "UnorderedIR", + "compile_peek", + "compile_unordered", + "compile_ordered", + "compile_raw", + "test_only_try_evaluate", + "test_only_ibis_inferred_schema", ] diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py new file mode 100644 index 0000000000..9fba3081ca --- /dev/null +++ b/bigframes/core/compile/api.py @@ -0,0 +1,72 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
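For orientation, a sketch of how the helpers re-exported above (and defined in the new `api.py` whose body follows) might be called; `array_value` stands in for any existing `bigframes.core.ArrayValue`:

```python
import bigframes.core.compile as compiling

node = array_value.node  # assumes an existing bigframes.core.ArrayValue

unordered_sql = compiling.compile_unordered(node)     # no ordering information kept
ordered_sql = compiling.compile_ordered(node)         # rows sorted with ORDER BY
peek_sql = compiling.compile_peek(node, n_rows=10)    # 10 arbitrary rows
raw_sql, ordering = compiling.compile_raw(node)       # keeps hidden ordering-only columns
```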
+from __future__ import annotations + +from typing import Mapping, Tuple, TYPE_CHECKING + +import bigframes.core.compile.compiler as compiler + +if TYPE_CHECKING: + import bigframes.core.nodes + import bigframes.core.ordering + import bigframes.core.schema + + +def compile_peek(node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: + """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" + return compiler.compile_unordered_ir(node).peek_sql(n_rows) + + +def compile_unordered( + node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} +) -> str: + """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" + return compiler.compile_unordered_ir(node).to_sql(col_id_overrides=col_id_overrides) + + +def compile_ordered( + node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} +) -> str: + """Compile node into sql where rows are sorted with ORDER BY.""" + return compiler.compile_ordered_ir(node).to_sql( + col_id_overrides=col_id_overrides, ordered=True + ) + + +def compile_raw( + node: bigframes.core.nodes.BigFrameNode, +) -> Tuple[str, bigframes.core.ordering.ExpressionOrdering]: + """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" + ir = compiler.compile_ordered_ir(node) + sql = ir.raw_sql() + ordering_info = ir._ordering + return sql, ordering_info + + +def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): + """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" + ibis = compiler.compile_ordered_ir(node)._to_ibis_expr(ordering_mode="unordered") + return ibis.pandas.connect({}).execute(ibis) + + +def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): + """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" + import bigframes.core.schema + + compiled = compiler.compile_unordered_ir(node) + items = tuple( + bigframes.core.schema.SchemaItem(id, compiled.get_column_type(id)) + for id in compiled.column_ids + ) + return bigframes.core.schema.ArraySchema(items) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index dac814a08c..907c918efd 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -28,6 +28,7 @@ import pandas import bigframes.core.compile.aggregate_compiler as agg_compiler +import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex @@ -257,9 +258,9 @@ def to_sql( self, offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, + ordered: bool = False, ) -> str: - if offset_column or sorted: + if offset_column or ordered: raise ValueError("Cannot produce sorted sql in unordered mode") sql = ibis_bigquery.Backend().compile( self._to_ibis_expr( @@ -890,9 +891,9 @@ def _reproject_to_table(self) -> OrderedIR: def to_sql( self, col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, + ordered: bool = False, ) -> str: - if sorted: + if ordered: # Need to bake ordering expressions into the selected column in order for our ordering clause builder to work. 
baked_ir = self._bake_ordering() sql = ibis_bigquery.Backend().compile( @@ -905,7 +906,12 @@ def to_sql( output_columns = [ col_id_overrides.get(col, col) for col in baked_ir.column_ids ] - sql = bigframes.core.sql.select_from_subquery(output_columns, sql) + sql = ( + bigframes.core.compile.googlesql.Select() + .from_(sql) + .select(output_columns) + .sql() + ) # Single row frames may not have any ordering columns if len(baked_ir._ordering.all_ordering_columns) > 0: @@ -923,6 +929,15 @@ def to_sql( ) return typing.cast(str, sql) + def raw_sql(self) -> str: + """Return sql with all hidden columns. Used to cache with ordering information.""" + return ibis_bigquery.Backend().compile( + self._to_ibis_expr( + ordering_mode="unordered", + expose_hidden_cols=True, + ) + ) + def _to_ibis_expr( self, *, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 021ec8b176..9272b8ad1b 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -223,22 +223,6 @@ def compile_read_table_ordered(node: nodes.ReadTableNode): ) -@_compile_node.register -def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): - if ordered: - return compiled.OrderedIR( - node.table, - node.columns, - node.hidden_ordering_columns, - node.ordering, - ) - else: - return compiled.UnorderedIR( - node.table, - node.columns, - ) - - @_compile_node.register def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): result = compile_ordered_ir(node.child).promote_offsets(node.col_id) diff --git a/bigframes/core/compile/googlesql/__init__.py b/bigframes/core/compile/googlesql/__init__.py index 32265c0d51..add0c5ec44 100644 --- a/bigframes/core/compile/googlesql/__init__.py +++ b/bigframes/core/compile/googlesql/__init__.py @@ -17,13 +17,17 @@ from __future__ import annotations +from bigframes.core.compile.googlesql.datatype import DataType from bigframes.core.compile.googlesql.expression import ( + _escape_chars, AliasExpression, ColumnExpression, CTEExpression, + identifier, StarExpression, TableExpression, ) +from bigframes.core.compile.googlesql.function import Cast from bigframes.core.compile.googlesql.query import ( AsAlias, FromClause, @@ -36,10 +40,14 @@ ) __all__ = [ + "_escape_chars", + "identifier", "AliasExpression", "AsAlias", + "Cast", "ColumnExpression", "CTEExpression", + "DataType", "FromClause", "FromItem", "NonRecursiveCTE", @@ -48,5 +56,6 @@ "SelectAll", "SelectExpression", "StarExpression", + "StringType", "TableExpression", ] diff --git a/bigframes/core/compile/googlesql/datatype.py b/bigframes/core/compile/googlesql/datatype.py new file mode 100644 index 0000000000..ccf3ff4d41 --- /dev/null +++ b/bigframes/core/compile/googlesql/datatype.py @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import enum + +"""This module represents all GoogleSQL for BigQuery data types: +https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types""" + + +class DataType(enum.Enum): + STRING = 1 + FLOAT64 = 2 diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py index 702aa2c5e5..581ab67718 100644 --- a/bigframes/core/compile/googlesql/expression.py +++ b/bigframes/core/compile/googlesql/expression.py @@ -45,8 +45,8 @@ class ColumnExpression(Expression): def sql(self) -> str: if self.parent is not None: - return f"{self.parent.sql()}.`{self.name}`" - return f"`{self.name}`" + return f"{self.parent.sql()}.{identifier(self.name)}" + return identifier(self.name) @dataclasses.dataclass @@ -72,10 +72,10 @@ def __post_init__(self): def sql(self) -> str: text = [] if self.project_id is not None: - text.append(f"`{self.project_id}`") + text.append(identifier(self.project_id)) if self.dataset_id is not None: - text.append(f"`{self.dataset_id}`") - text.append(f"`{self.table_id}`") + text.append(identifier(self.dataset_id)) + text.append(identifier(self.table_id)) return ".".join(text) @@ -84,7 +84,7 @@ class AliasExpression(Expression): alias: str def sql(self) -> str: - return f"`{self.alias}`" + return identifier(self.alias) @dataclasses.dataclass @@ -92,4 +92,33 @@ class CTEExpression(Expression): name: str def sql(self) -> str: - return f"`{self.name}`" + return identifier(self.name) + + +def identifier(id: str) -> str: + """Return a string representing column reference in a SQL.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + # Just always escape, otherwise need to check against every reserved sql keyword + return f"`{_escape_chars(id)}`" + + +def _escape_chars(value: str): + """Escapes all special charactesrs""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) diff --git a/bigframes/core/compile/googlesql/function.py b/bigframes/core/compile/googlesql/function.py new file mode 100644 index 0000000000..19b61f2fc9 --- /dev/null +++ b/bigframes/core/compile/googlesql/function.py @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
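The `identifier` and `_escape_chars` helpers added above always backtick-quote and escape, so callers no longer need to check names against every reserved SQL keyword. A small sketch of the expected output, based on the translation table above:

```python
import bigframes.core.compile.googlesql as googlesql

print(googlesql.identifier("order"))       # `order`        (always quoted, so keywords are safe)
print(googlesql.identifier("weird`name"))  # `weird\`name`  (embedded backtick escaped)
print(googlesql._escape_chars("a\nb"))     # a\nb           (newline becomes backslash-n)
```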
+ +import dataclasses + +import bigframes.core.compile.googlesql.datatype as datatype +import bigframes.core.compile.googlesql.expression as expr + +# Conversion functions: +# https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions + + +@dataclasses.dataclass +class Cast(expr.Expression): + """This class represents the `cast` function.""" + + expression: expr.ColumnExpression + type: datatype.DataType + + def sql(self) -> str: + return f"CAST ({self.expression.sql()} AS {self.type.name})" diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py index 6210aa67f4..dfe21ef7b2 100644 --- a/bigframes/core/compile/googlesql/query.py +++ b/bigframes/core/compile/googlesql/query.py @@ -17,6 +17,8 @@ import dataclasses import typing +import google.cloud.bigquery as bigquery + import bigframes.core.compile.googlesql.abc as abc import bigframes.core.compile.googlesql.expression as expr @@ -25,6 +27,8 @@ syntax rules outlined in the documentation: https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" +TABLE_SOURCE_TYPE = typing.Union[str, bigquery.TableReference] + @dataclasses.dataclass class QueryExpr(abc.SQLSyntax): @@ -49,16 +53,56 @@ def sql(self) -> str: class Select(abc.SQLSyntax): """This class represents GoogleSQL `select` syntax.""" - select_list: typing.Sequence[typing.Union[SelectExpression, SelectAll]] - from_clause_list: typing.Sequence[FromClause] = () + select_list: typing.Sequence[ + typing.Union[SelectExpression, SelectAll] + ] = dataclasses.field(default_factory=list) + from_clause_list: typing.Sequence[FromClause] = dataclasses.field( + default_factory=list + ) + distinct: bool = False + + def select( + self, + columns: typing.Union[typing.Iterable[str], str, None] = None, + distinct: bool = False, + ) -> Select: + if isinstance(columns, str): + columns = [columns] + self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = ( + [ + SelectExpression(expression=expr.ColumnExpression(name=column)) + for column in columns + ] + if columns + else [SelectAll(expression=expr.StarExpression())] + ) + self.distinct = distinct + return self + + def from_( + self, + sources: typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]], + ) -> Select: + if (not isinstance(sources, typing.Iterable)) or isinstance(sources, str): + sources = [sources] + self.from_clause_list = [ + FromClause(FromItem.from_source(source)) for source in sources + ] + return self def sql(self) -> str: + if (self.select_list is not None) and (not self.select_list): + raise ValueError("Select clause has not been properly initialized.") + text = ["SELECT"] + if self.distinct: + text.append("DISTINCT") + select_list_sql = ",\n".join([select.sql() for select in self.select_list]) text.append(select_list_sql) - if self.from_clause_list is not None: + if self.from_clause_list: from_clauses_sql = ",\n".join( [clause.sql() for clause in self.from_clause_list] ) @@ -104,39 +148,46 @@ def sql(self) -> str: class FromItem(abc.SQLSyntax): """This class represents GoogleSQL `from_item` syntax.""" - table_name: typing.Optional[expr.TableExpression] = None # Note: Temporarily introduces the `str` type to interact with pre-existing, # compiled SQL strings. 
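Taken together, the new `Cast` helper below and the chainable `Select` builder replace the ad-hoc string templates that are being removed from `bigframes/core/sql.py` later in this patch. A hedged sketch of both, matching the definitions in this change; the table reference is a placeholder:

```python
import google.cloud.bigquery as bigquery
import bigframes.core.compile.googlesql as googlesql

# CAST expression, e.g. the string casting used by bigframes.core.sql.cast_as_string.
cast_sql = googlesql.Cast(
    expression=googlesql.ColumnExpression(name="rowindex"),
    type=googlesql.DataType.STRING,
).sql()
# -> CAST (`rowindex` AS STRING)

# Chainable SELECT builder over a subquery string or a bigquery.TableReference.
table = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
select_sql = googlesql.Select().from_(table).select(["a", "b"], distinct=True).sql()
# roughly: SELECT DISTINCT `a`, `b` FROM `my-project`.`my_dataset`.`my_table`
# (whitespace/newlines may differ)
```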
- query_expr: typing.Optional[QueryExpr | str] = None - cte_name: typing.Optional[expr.CTEExpression] = None + expression: typing.Union[expr.TableExpression, QueryExpr, str, expr.CTEExpression] as_alias: typing.Optional[AsAlias] = None - def __post_init__(self): - non_none = sum( - expr is not None - for expr in [ - self.table_name, - self.query_expr, - self.cte_name, - ] - ) - if non_none != 1: - raise ValueError("Exactly one of expressions must be provided.") + @classmethod + def from_source( + cls, + subquery_or_tableref: typing.Union[bigquery.TableReference, str], + as_alias: typing.Optional[AsAlias] = None, + ): + if isinstance(subquery_or_tableref, bigquery.TableReference): + return cls( + expression=expr.TableExpression( + table_id=subquery_or_tableref.table_id, + dataset_id=subquery_or_tableref.dataset_id, + project_id=subquery_or_tableref.project, + ), + as_alias=as_alias, + ) + elif isinstance(subquery_or_tableref, str): + return cls( + expression=subquery_or_tableref, + as_alias=as_alias, + ) + else: + raise ValueError("The source must be bigquery.TableReference or str.") def sql(self) -> str: - if self.table_name is not None: - text = self.table_name.sql() - elif self.query_expr is not None: - text = ( - self.query_expr - if isinstance(self.query_expr, str) - else self.query_expr.sql() - ) - text = f"({text})" - elif self.cte_name is not None: - text = self.cte_name.sql() + if isinstance(self.expression, (expr.TableExpression, expr.CTEExpression)): + text = self.expression.sql() + elif isinstance(self.expression, str): + text = f"({self.expression})" + elif isinstance(self.expression, QueryExpr): + text = f"({self.expression.sql()})" else: - raise ValueError("One of from items must be provided.") + raise ValueError( + f"Unsupported expression type {type(self.expression).__name__};" + "expected one of TableExpression, QueryExpr, str, or CTEExpression." + ) if self.as_alias is None: return text diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f73fce3e4d..f3221f605f 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -74,23 +74,6 @@ BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } - -IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { - ibis_dtypes.boolean: pa.bool_(), - ibis_dtypes.date: pa.date32(), - ibis_dtypes.float64: pa.float64(), - ibis_dtypes.int64: pa.int64(), - ibis_dtypes.string: pa.string(), - ibis_dtypes.time: pa.time64("us"), - ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), - ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), - ibis_dtypes.binary: pa.binary(), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), -} - -ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} - IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, bigframes.dtypes.Dtype] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -248,14 +231,17 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. 
""" - if isinstance(bigframes_dtype, pd.ArrowDtype): - return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) - - type_string = str(bigframes_dtype) - if type_string in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES: + if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES: bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ - cast(bigframes.dtypes.DtypeString, type_string) + cast(bigframes.dtypes.DtypeString, str(bigframes_dtype)) ] + + if bigframes_dtype in BIGFRAMES_TO_IBIS.keys(): + return BIGFRAMES_TO_IBIS[bigframes_dtype] + + elif isinstance(bigframes_dtype, pd.ArrowDtype) and bigframes_dtype.pyarrow_dtype: + return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + else: raise ValueError( textwrap.dedent( @@ -276,8 +262,6 @@ def bigframes_dtype_to_ibis_dtype( ) ) - return BIGFRAMES_TO_IBIS[bigframes_dtype] - def ibis_dtype_to_bigframes_dtype( ibis_dtype: ibis_dtypes.DataType, @@ -348,15 +332,27 @@ def _ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: ] ) - if ibis_dtype in IBIS_TO_ARROW: - return IBIS_TO_ARROW[ibis_dtype] + if ibis_dtype in IBIS_TO_BIGFRAMES: + dtype = IBIS_TO_BIGFRAMES[ibis_dtype] + # Note: arrow mappings are incomplete, no geography type + return bigframes.dtypes.bigframes_dtype_to_arrow_dtype(dtype) else: raise ValueError( f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" ) +_ARROW_TO_IBIS = { + mapping.arrow_dtype: bigframes_dtype_to_ibis_dtype(mapping.dtype) + for mapping in bigframes.dtypes.SIMPLE_TYPES + if mapping.arrow_dtype is not None +} + + def _arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if arrow_dtype == pa.null(): + # Used for empty local dataframes where pyarrow has null type + return ibis_dtypes.float64 if pa.types.is_struct(arrow_dtype): struct_dtype = cast(pa.StructType, arrow_dtype) return ibis_dtypes.Struct.from_tuples( @@ -365,16 +361,15 @@ def _arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType for field in struct_dtype ] ) - - if arrow_dtype in ARROW_TO_IBIS: - return ARROW_TO_IBIS[arrow_dtype] - if arrow_dtype == pa.null(): - # Used for empty local dataframes where pyarrow has null type - return ibis_dtypes.float64 + if pa.types.is_list(arrow_dtype): + list_dtype = cast(pa.ListType, arrow_dtype) + value_dtype = list_dtype.value_type + value_ibis_type = _arrow_dtype_to_ibis_dtype(value_dtype) + return ibis_dtypes.Array(value_type=value_ibis_type) + elif arrow_dtype in _ARROW_TO_IBIS: + return _ARROW_TO_IBIS[arrow_dtype] else: - raise ValueError( - f"Unexpected Arrow data type {arrow_dtype}. 
{constants.FEEDBACK_LINK}" - ) + raise ValueError(f"Unexpected arrow type: {arrow_dtype}") def literal_to_ibis_scalar( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index ee120635d3..9c2bf18caa 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -104,6 +104,18 @@ def __getitem__( dropna=self._dropna, ) + def head(self, n: int = 5) -> df.DataFrame: + block = self._block + if self._dropna: + block = block_ops.dropna(self._block, self._by_col_ids, how="any") + return df.DataFrame( + block.grouped_head( + by_column_ids=self._by_col_ids, + value_columns=self._block.value_columns, + n=n, + ) + ) + def size(self) -> typing.Union[df.DataFrame, series.Series]: agg_block, _ = self._block.aggregate_size( by_column_ids=self._by_col_ids, @@ -498,6 +510,16 @@ def __init__( self._value_name = value_name self._dropna = dropna # Applies to aggregations but not windowing + def head(self, n: int = 5) -> series.Series: + block = self._block + if self._dropna: + block = block_ops.dropna(self._block, self._by_col_ids, how="any") + return series.Series( + block.grouped_head( + by_column_ids=self._by_col_ids, value_columns=[self._value_column], n=n + ) + ) + def all(self) -> series.Series: return self._aggregate(agg_ops.all_op) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 582141d539..dae5eada70 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -159,9 +159,13 @@ def __getitem__(self, key): ) columns = key[1] - if isinstance(columns, pd.Series) and columns.dtype == "bool": - # TODO(b/340892590): fix type error - columns = df.columns[columns] # type: ignore + if isinstance(columns, bigframes.series.Series): + columns = columns.to_pandas() + if isinstance(columns, pd.Series) and columns.dtype in ( + bool, + pd.BooleanDtype(), + ): + columns = df.columns[typing.cast(pd.Series, columns)] return df[columns] @@ -252,7 +256,7 @@ def __getitem__(self, key: tuple) -> bigframes.core.scalar.Scalar: raise ValueError(error_message) if len(key) != 2: raise TypeError(error_message) - block: bigframes.core.blocks.Block = self._dataframe._block # type: ignore + block: bigframes.core.blocks.Block = self._dataframe._block column_block = block.select_columns([block.value_columns[key[1]]]) column = bigframes.series.Series(column_block) return column.iloc[key[0]] @@ -376,14 +380,14 @@ def _perform_loc_list_join( ) result = result.rename(original_name) else: - result = series_or_dataframe._perform_join_by_index(keys_index, how="right") # type: ignore + result = series_or_dataframe._perform_join_by_index(keys_index, how="right") if drop_levels and series_or_dataframe.index.nlevels > keys_index.nlevels: # drop common levels levels_to_drop = [ name for name in series_or_dataframe.index.names if name in keys_index.names ] - result = result.droplevel(levels_to_drop) # type: ignore + result = result.droplevel(levels_to_drop) return result diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0e5082447a..cfb22929c8 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -243,6 +243,11 @@ def query_job(self) -> Optional[bigquery.QueryJob]: return self._query_job def __repr__(self) -> str: + # Protect against errors with uninitialized Series. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + # TODO(swast): Add a timeout here? 
If the query is taking a long time, # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole series by using job @@ -476,7 +481,9 @@ def to_pandas(self) -> pandas.Index: pandas.Index: A pandas Index with all of the labels from this Index. """ - return self._block.index.to_pandas() + return self._block.index.to_pandas( + ordered=self._block.session._strictly_ordered + ) def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: return self.to_pandas().to_numpy(dtype, **kwargs) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 077a362ba0..65a1bd8084 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -34,8 +34,6 @@ import bigframes.operations.aggregations as agg_ops if typing.TYPE_CHECKING: - import ibis.expr.types as ibis_types - import bigframes.core.ordering as orderings import bigframes.session @@ -302,54 +300,6 @@ def transform_children( return self -# TODO: Refactor to take raw gbq object reference -@dataclass(frozen=True) -class ReadGbqNode(BigFrameNode): - table: ibis_types.Table = field() - table_session: bigframes.session.Session = field() - columns: Tuple[ibis_types.Value, ...] = field() - hidden_ordering_columns: Tuple[ibis_types.Value, ...] = field() - ordering: orderings.ExpressionOrdering = field() - - @property - def session(self): - return self.table_session - - def __hash__(self): - return self._node_hash - - @property - def roots(self) -> typing.Set[BigFrameNode]: - return {self} - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - from bigframes.core.compile.ibis_types import ibis_dtype_to_bigframes_dtype - - items = tuple( - schemata.SchemaItem( - value.get_name(), - ibis_dtype_to_bigframes_dtype(value.type()), - ) - for value in self.columns - ) - return schemata.ArraySchema(items) - - @functools.cached_property - def variables_introduced(self) -> int: - return len(self.columns) + len(self.hidden_ordering_columns) - - @property - def relation_ops_created(self) -> int: - # Assume worst case, where readgbq actually has baked in analytic operation to generate index - return 2 - - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: - return self - - ## Put ordering in here or just add order_by node above? @dataclass(frozen=True) class ReadTableNode(BigFrameNode): @@ -425,6 +375,19 @@ class CachedTableNode(BigFrameNode): ordering: typing.Optional[orderings.ExpressionOrdering] = field() + def __post_init__(self): + # enforce invariants + physical_names = set(map(lambda i: i.name, self.physical_schema)) + logical_names = self.original_node.schema.names + if not set(logical_names).issubset(physical_names): + raise ValueError( + f"Requested schema {logical_names} cannot be derived from table schema {self.physical_schema}" + ) + if not set(self.hidden_columns).issubset(physical_names): + raise ValueError( + f"Requested hidden columns {self.hidden_columns} cannot be derived from table schema {self.physical_schema}" + ) + @property def session(self): return self.original_node.session diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 15999c0558..101d5cc882 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -26,10 +26,17 @@ Selection = Tuple[Tuple[scalar_exprs.Expression, str], ...] 
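The `hasattr(self, "_block")` guard above, and the matching `__getattr__` guard added to `DataFrame` later in this patch, exist because a half-initialized object whose `__getattr__` touches `self._block` would otherwise recurse forever. A standalone sketch of the failure mode and the fix, with illustrative names only:

```python
class Sketch:
    """Stand-in for DataFrame/Series/Index; `_block` is normally set in __init__."""

    def __init__(self):
        self._block = {"a": 1}

    def __getattr__(self, key):
        # Guard in the spirit of this change: without it, looking up self._block on an
        # uninitialized instance re-enters __getattr__("_block") endlessly.
        if key == "_block":
            raise AttributeError(key)
        if key in self._block:
            return self._block[key]
        raise AttributeError(key)

    def __repr__(self):
        # Same idea as the Index/DataFrame __repr__ guard above.
        if not hasattr(self, "_block"):
            return object.__repr__(self)
        return f"Sketch({self._block})"


broken = Sketch.__new__(Sketch)  # __init__ never ran, e.g. during unpickling
print(repr(broken))              # falls back to object.__repr__, no RecursionError
print(Sketch().a)                # normal lookups still work -> 1
```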
+REWRITABLE_NODE_TYPES = ( + nodes.ProjectionNode, + nodes.FilterNode, + nodes.ReversedNode, + nodes.OrderByNode, +) + @dataclasses.dataclass(frozen=True) class SquashedSelect: - """Squash together as many nodes as possible, separating out the projection, filter and reordering expressions.""" + """Squash nodes together until target node, separating out the projection, filter and reordering expressions.""" root: nodes.BigFrameNode columns: Tuple[Tuple[scalar_exprs.Expression, str], ...] @@ -38,25 +45,25 @@ class SquashedSelect: reverse_root: bool = False @classmethod - def from_node( - cls, node: nodes.BigFrameNode, projections_only: bool = False + def from_node_span( + cls, node: nodes.BigFrameNode, target: nodes.BigFrameNode ) -> SquashedSelect: - if isinstance(node, nodes.ProjectionNode): - return cls.from_node(node.child, projections_only=projections_only).project( - node.assignments - ) - elif not projections_only and isinstance(node, nodes.FilterNode): - return cls.from_node(node.child).filter(node.predicate) - elif not projections_only and isinstance(node, nodes.ReversedNode): - return cls.from_node(node.child).reverse() - elif not projections_only and isinstance(node, nodes.OrderByNode): - return cls.from_node(node.child).order_with(node.by) - else: + if node == target: selection = tuple( (scalar_exprs.UnboundVariableExpression(id), id) for id in get_node_column_ids(node) ) return cls(node, selection, None, ()) + if isinstance(node, nodes.ProjectionNode): + return cls.from_node_span(node.child, target).project(node.assignments) + elif isinstance(node, nodes.FilterNode): + return cls.from_node_span(node.child, target).filter(node.predicate) + elif isinstance(node, nodes.ReversedNode): + return cls.from_node_span(node.child, target).reverse() + elif isinstance(node, nodes.OrderByNode): + return cls.from_node_span(node.child, target).order_with(node.by) + else: + raise ValueError(f"Cannot rewrite node {node}") @property def column_lookup(self) -> Mapping[str, scalar_exprs.Expression]: @@ -98,9 +105,10 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def can_join( + def can_merge( self, right: SquashedSelect, join_def: join_defs.JoinDefinition ) -> bool: + """Determines whether the two selections can be merged into a single selection.""" if join_def.type == "cross": # Cannot convert cross join to projection return False @@ -116,14 +124,14 @@ def can_join( return False return True - def maybe_merge( + def merge( self, right: SquashedSelect, join_type: join_defs.JoinType, mappings: Tuple[join_defs.JoinColumnMapping, ...], - ) -> Optional[SquashedSelect]: + ) -> SquashedSelect: if self.root != right.root: - return None + raise ValueError("Cannot merge expressions with different roots") # Mask columns and remap names to expected schema lselection = self.columns rselection = right.columns @@ -196,28 +204,40 @@ def expand(self) -> nodes.BigFrameNode: return nodes.ProjectionNode(child=root, assignments=self.columns) -def maybe_squash_projection(node: nodes.BigFrameNode) -> nodes.BigFrameNode: - if isinstance(node, nodes.ProjectionNode) and isinstance( - node.child, nodes.ProjectionNode - ): - # Conservative approach, only squash consecutive projections, even though could also squash filters, reorderings - return SquashedSelect.from_node(node, projections_only=True).expand() - return node - - def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: - left_side = 
SquashedSelect.from_node(join_node.left_child) - right_side = SquashedSelect.from_node(join_node.right_child) - if left_side.can_join(right_side, join_node.join): - merged = left_side.maybe_merge( + rewrite_common_node = common_selection_root( + join_node.left_child, join_node.right_child + ) + if rewrite_common_node is None: + return join_node + left_side = SquashedSelect.from_node_span(join_node.left_child, rewrite_common_node) + right_side = SquashedSelect.from_node_span( + join_node.right_child, rewrite_common_node + ) + if left_side.can_merge(right_side, join_node.join): + return left_side.merge( right_side, join_node.join.type, join_node.join.mappings - ) + ).expand() + return join_node + + +def join_as_projection( + l_node: nodes.BigFrameNode, + r_node: nodes.BigFrameNode, + mappings: Tuple[join_defs.JoinColumnMapping, ...], + how: join_defs.JoinType, +) -> Optional[nodes.BigFrameNode]: + rewrite_common_node = common_selection_root(l_node, r_node) + if rewrite_common_node is not None: + left_side = SquashedSelect.from_node_span(l_node, rewrite_common_node) + right_side = SquashedSelect.from_node_span(r_node, rewrite_common_node) + merged = left_side.merge(right_side, how, mappings) assert ( merged is not None ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." return merged.expand() else: - return join_node + return None def remap_names( @@ -311,3 +331,25 @@ def get_node_column_ids(node: nodes.BigFrameNode) -> Tuple[str, ...]: import bigframes.core return tuple(bigframes.core.ArrayValue(node).column_ids) + + +def common_selection_root( + l_tree: nodes.BigFrameNode, r_tree: nodes.BigFrameNode +) -> Optional[nodes.BigFrameNode]: + """Find common subtree between join subtrees""" + l_node = l_tree + l_nodes: set[nodes.BigFrameNode] = set() + while isinstance(l_node, REWRITABLE_NODE_TYPES): + l_nodes.add(l_node) + l_node = l_node.child + l_nodes.add(l_node) + + r_node = r_tree + while isinstance(r_node, REWRITABLE_NODE_TYPES): + if r_node in l_nodes: + return r_node + r_node = r_node.child + + if r_node in l_nodes: + return r_node + return None diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 01dcebad6e..528c9bcc74 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,12 +19,9 @@ import datetime import math -import textwrap from typing import Iterable, Mapping, TYPE_CHECKING, Union -# Literals and identifiers matching this pattern can be unquoted -unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" - +import bigframes.core.compile.googlesql as googlesql if TYPE_CHECKING: import google.cloud.bigquery as bigquery @@ -38,7 +35,7 @@ def simple_literal(value: str | int | bool | float | datetime.datetime): # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals if isinstance(value, str): # Single quoting seems to work nicer with ibis than double quoting - return f"'{escape_special_characters(value)}'" + return f"'{googlesql._escape_chars(value)}'" elif isinstance(value, (bool, int)): return str(value) elif isinstance(value, float): @@ -61,45 +58,18 @@ def multi_literal(*values: str): return "(" + ", ".join(literal_strings) + ")" -def identifier(id: str) -> str: - """Return a string representing column reference in a SQL.""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers - # Just always escape, otherwise need to check against every reserved sql keyword - return 
f"`{escape_special_characters(id)}`" - - -def escape_special_characters(value: str): - """Escapes all special charactesrs""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals - trans_table = str.maketrans( - { - "\a": r"\a", - "\b": r"\b", - "\f": r"\f", - "\n": r"\n", - "\r": r"\r", - "\t": r"\t", - "\v": r"\v", - "\\": r"\\", - "?": r"\?", - '"': r"\"", - "'": r"\'", - "`": r"\`", - } - ) - return value.translate(trans_table) - - def cast_as_string(column_name: str) -> str: """Return a string representing string casting of a column.""" - return f"CAST({identifier(column_name)} AS STRING)" + return googlesql.Cast( + googlesql.ColumnExpression(column_name), googlesql.DataType.STRING + ).sql() def to_json_string(column_name: str) -> str: """Return a string representing JSON version of a column.""" - return f"TO_JSON_STRING({identifier(column_name)})" + return f"TO_JSON_STRING({googlesql.identifier(column_name)})" def csv(values: Iterable[str]) -> str: @@ -107,46 +77,17 @@ def csv(values: Iterable[str]) -> str: return ", ".join(values) -def table_reference(table_ref: bigquery.TableReference) -> str: - return f"`{escape_special_characters(table_ref.project)}`.`{escape_special_characters(table_ref.dataset_id)}`.`{escape_special_characters(table_ref.table_id)}`" - - def infix_op(opname: str, left_arg: str, right_arg: str): # Maybe should add parentheses?? return f"{left_arg} {opname} {right_arg}" -### Writing SELECT expressions -def select_from_subquery(columns: Iterable[str], subquery: str, distinct: bool = False): - selection = ", ".join(map(identifier, columns)) - distinct_clause = "DISTINCT " if distinct else "" - - return textwrap.dedent( - f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{subquery}\n" ")\n" - ) - - -def select_from_table_ref( - columns: Iterable[str], table_ref: bigquery.TableReference, distinct: bool = False -): - selection = ", ".join(map(identifier, columns)) - distinct_clause = "DISTINCT " if distinct else "" - - return textwrap.dedent( - f"SELECT {distinct_clause}{selection}\nFROM {table_reference(table_ref)}" - ) - - -def select_table(table_ref: bigquery.TableReference): - return textwrap.dedent(f"SELECT * FROM {table_reference(table_ref)}") - - def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str: is_unique_sql = f"""WITH full_table AS ( - {select_from_table_ref(columns, table_ref)} + {googlesql.Select().from_(table_ref).select(columns).sql()} ), distinct_table AS ( - {select_from_table_ref(columns, table_ref, distinct=True)} + {googlesql.Select().from_(table_ref).select(columns, distinct=True).sql()} ) SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f12c346776..75420ca957 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -379,7 +379,8 @@ def _to_sql_query( @property def sql(self) -> str: """Compiles this DataFrame's expression tree to SQL.""" - sql, _, _ = self._to_sql_query(include_index=False) + include_index = self.index.name is not None or len(self.index.names) > 1 + sql, _, _ = self._to_sql_query(include_index=include_index) return sql @property @@ -574,9 +575,18 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame: return DataFrame(block) def __getattr__(self, key: str): + # Protect against recursion errors with uninitialized DataFrame + # objects. 
See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + # and + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html + if key == "_block": + raise AttributeError("_block") + if key in self._block.column_labels: return self.__getitem__(key) - elif hasattr(pandas.DataFrame, key): + + if hasattr(pandas.DataFrame, key): raise AttributeError( textwrap.dedent( f""" @@ -585,8 +595,7 @@ def __getattr__(self, key: str): """ ) ) - else: - raise AttributeError(key) + raise AttributeError(key) def __setattr__(self, key: str, value): if key in ["_block", "_query_job"]: @@ -616,6 +625,11 @@ def __repr__(self) -> str: Only represents the first `bigframes.options.display.max_rows`. """ + # Protect against errors with uninitialized DataFrame. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": @@ -1175,7 +1189,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: bool = True, + ordered: Optional[bool] = None, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -1195,9 +1209,10 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default True): + ordered (bool, default None): Determines whether the resulting pandas dataframe will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. + In some cases, unordered may result in a faster-executing query. If set to a value + other than None, will override Session default. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -1210,7 +1225,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered, + ordered=ordered if ordered is not None else self._session._strictly_ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) @@ -1305,6 +1320,34 @@ def nsmallest( column_ids = self._sql_names(columns) return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep)) + def insert( + self, + loc: int, + column: blocks.Label, + value: SingleItemValue, + allow_duplicates: bool = False, + ): + column_count = len(self.columns) + if loc > column_count: + raise IndexError( + f"Column index {loc} is out of bounds with {column_count} total columns." 
+ ) + if (column in self.columns) and not allow_duplicates: + raise ValueError(f"cannot insert {column}, already exists") + + temp_column = bigframes.core.guid.generate_guid(prefix=str(column)) + df = self._assign_single_item(temp_column, value) + + block = df._get_block() + value_columns = typing.cast(List, block.value_columns) + value_columns, new_column = value_columns[:-1], value_columns[-1] + value_columns.insert(loc, new_column) + + block = block.select_columns(value_columns) + block = block.rename(columns={temp_column: column}) + + self._set_block(block) + def drop( self, labels: typing.Any = None, @@ -3297,7 +3340,7 @@ def _run_io_query( _, query_job = session._execute( export_array, job_config=job_config, - sorted=False, + ordered=False, col_id_overrides=id_overrides, ) self._set_internal_query_job(query_job) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3b9d5bf141..ced1c215e5 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -14,14 +14,14 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" +from dataclasses import dataclass import datetime import decimal import typing -from typing import Any, Dict, Literal, Union +from typing import Dict, Literal, Union -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import geopandas as gpd # type: ignore -import ibis +import google.cloud.bigquery import numpy as np import pandas as pd import pyarrow as pa @@ -41,21 +41,89 @@ # None represents the type of a None scalar. ExpressionType = typing.Optional[Dtype] - +# Convert to arrow when in array or struct INT_DTYPE = pd.Int64Dtype() FLOAT_DTYPE = pd.Float64Dtype() BOOL_DTYPE = pd.BooleanDtype() +# Wrapped arrow dtypes STRING_DTYPE = pd.StringDtype(storage="pyarrow") BYTES_DTYPE = pd.ArrowDtype(pa.binary()) DATE_DTYPE = pd.ArrowDtype(pa.date32()) TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) +NUMERIC_DTYPE = pd.ArrowDtype(pa.decimal128(38, 9)) +BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38)) +# No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE + +# Will have a few dtype variants: simple(eg. int, string, bool), complex (eg. list, struct), and virtual (eg. micro intervals, categorical) +@dataclass(frozen=True) +class SimpleDtypeInfo: + """ + A simple dtype maps 1:1 with a database type and is not parameterized. + """ + + dtype: Dtype + arrow_dtype: typing.Optional[pa.DataType] + type_kind: typing.Tuple[str, ...] 
# Should all correspond to the same db type + logical_bytes: int = ( + 8 # this is approximate only, some types are variably sized, also, compression + ) + + +# TODO: Missing BQ types: INTERVAL, JSON, RANGE +# TODO: Add mappings to python types +SIMPLE_TYPES = ( + SimpleDtypeInfo( + dtype=INT_DTYPE, arrow_dtype=pa.int64(), type_kind=("INT64", "INTEGER") + ), + SimpleDtypeInfo( + dtype=FLOAT_DTYPE, arrow_dtype=pa.float64(), type_kind=("FLOAT64", "FLOAT") + ), + SimpleDtypeInfo( + dtype=BOOL_DTYPE, + arrow_dtype=pa.bool_(), + type_kind=("BOOL", "BOOLEAN"), + logical_bytes=1, + ), + SimpleDtypeInfo(dtype=STRING_DTYPE, arrow_dtype=pa.string(), type_kind=("STRING",)), + SimpleDtypeInfo( + dtype=DATE_DTYPE, arrow_dtype=pa.date32(), type_kind=("DATE",), logical_bytes=4 + ), + SimpleDtypeInfo(dtype=TIME_DTYPE, arrow_dtype=pa.time64("us"), type_kind=("TIME",)), + SimpleDtypeInfo( + dtype=DATETIME_DTYPE, arrow_dtype=pa.timestamp("us"), type_kind=("DATETIME",) + ), + SimpleDtypeInfo( + dtype=TIMESTAMP_DTYPE, + arrow_dtype=pa.timestamp("us", tz="UTC"), + type_kind=("TIMESTAMP",), + ), + SimpleDtypeInfo(dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",)), + SimpleDtypeInfo( + dtype=NUMERIC_DTYPE, + arrow_dtype=pa.decimal128(38, 9), + type_kind=("NUMERIC",), + logical_bytes=16, + ), + SimpleDtypeInfo( + dtype=BIGNUMERIC_DTYPE, + arrow_dtype=pa.decimal256(76, 38), + type_kind=("BIGNUMERIC",), + logical_bytes=32, + ), + # Geo has no corresponding arrow dtype + SimpleDtypeInfo( + dtype=GEO_DTYPE, arrow_dtype=None, type_kind=("GEOGRAPHY",), logical_bytes=40 + ), +) + + # Type hints for dtype strings supported by BigQuery DataFrame DtypeString = Literal[ "boolean", @@ -151,23 +219,9 @@ def is_bool_coercable(type: ExpressionType) -> bool: return (type is None) or is_numeric(type) or is_string_like(type) -_ALL_DTYPES = ( - pd.BooleanDtype(), - pd.ArrowDtype(pa.date32()), - pd.Float64Dtype(), - pd.Int64Dtype(), - pd.StringDtype(storage="pyarrow"), - pd.ArrowDtype(pa.time64("us")), - pd.ArrowDtype(pa.timestamp("us")), - pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - pd.ArrowDtype(pa.binary()), - pd.ArrowDtype(pa.decimal128(38, 9)), - pd.ArrowDtype(pa.decimal256(76, 38)), - gpd.array.GeometryDtype(), -) - BIGFRAMES_STRING_TO_BIGFRAMES: Dict[DtypeString, Dtype] = { - typing.cast(DtypeString, dtype.name): dtype for dtype in _ALL_DTYPES + typing.cast(DtypeString, mapping.dtype.name): mapping.dtype + for mapping in SIMPLE_TYPES } # special case - string[pyarrow] doesn't include the storage in its name, and both @@ -178,18 +232,12 @@ def is_bool_coercable(type: ExpressionType) -> bool: BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = pd.Int64Dtype() # For the purposes of dataframe.memory_usage -# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes DTYPE_BYTE_SIZES = { - pd.BooleanDtype(): 1, - pd.Int64Dtype(): 8, - pd.Float32Dtype(): 8, - pd.StringDtype(): 8, - pd.ArrowDtype(pa.time64("us")): 8, - pd.ArrowDtype(pa.timestamp("us")): 8, - pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8, - pd.ArrowDtype(pa.date32()): 8, + type_info.dtype: type_info.logical_bytes for type_info in SIMPLE_TYPES } +### Conversion Functions + def dtype_for_etype(etype: ExpressionType) -> Dtype: if etype is None: @@ -198,26 +246,141 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype: return etype +# Mapping between arrow and bigframes types are necessary because arrow types are used for structured types, but not all primitive types, +# so conversion are needed 
when data is nested or unnested. Also, sometimes local data is stored as arrow. +_ARROW_TO_BIGFRAMES = { + mapping.arrow_dtype: mapping.dtype + for mapping in SIMPLE_TYPES + if mapping.arrow_dtype is not None +} + + def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: - # TODO: Directly convert instead of using ibis dtype as intermediate step - from bigframes.core.compile.ibis_types import ( - _arrow_dtype_to_ibis_dtype, - ibis_dtype_to_bigframes_dtype, - ) + if arrow_dtype in _ARROW_TO_BIGFRAMES: + return _ARROW_TO_BIGFRAMES[arrow_dtype] + if pa.types.is_list(arrow_dtype): + return pd.ArrowDtype(arrow_dtype) + if pa.types.is_struct(arrow_dtype): + return pd.ArrowDtype(arrow_dtype) + if arrow_dtype == pa.null(): + return DEFAULT_DTYPE + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" + ) - return ibis_dtype_to_bigframes_dtype(_arrow_dtype_to_ibis_dtype(arrow_dtype)) + +_BIGFRAMES_TO_ARROW = { + mapping.dtype: mapping.arrow_dtype + for mapping in SIMPLE_TYPES + if mapping.arrow_dtype is not None +} def bigframes_dtype_to_arrow_dtype( - bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] + bigframes_dtype: Dtype, ) -> pa.DataType: - # TODO: Directly convert instead of using ibis dtype as intermediate step - from bigframes.core.compile.ibis_types import ( - _ibis_dtype_to_arrow_dtype, - bigframes_dtype_to_ibis_dtype, - ) + if bigframes_dtype in _BIGFRAMES_TO_ARROW: + return _BIGFRAMES_TO_ARROW[bigframes_dtype] + if isinstance(bigframes_dtype, pd.ArrowDtype): + if pa.types.is_list(bigframes_dtype.pyarrow_dtype): + return bigframes_dtype.pyarrow_dtype + if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): + return bigframes_dtype.pyarrow_dtype + else: + raise ValueError( + f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" + ) + + +def infer_literal_type(literal) -> typing.Optional[Dtype]: + # Maybe also normalize literal to canonical python representation to remove this burden from compilers? 
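The `_ARROW_TO_BIGFRAMES` and `_BIGFRAMES_TO_ARROW` tables above replace the old ibis round-trip for dtype conversion, and they also back the literal inference below. A minimal sketch of how the two helpers behave for simple and nested types, assuming a bigframes build that includes the `bigframes.dtypes` module as shown in this diff (these are internal helpers, so the exact import path may change):

```python
# Sketch: exercising the direct arrow <-> bigframes dtype conversion.
# Assumes the bigframes.dtypes module from this change is installed.
import pandas as pd
import pyarrow as pa

import bigframes.dtypes as dtypes

# Simple types hit the lookup tables built from SIMPLE_TYPES.
assert dtypes.arrow_dtype_to_bigframes_dtype(pa.int64()) == pd.Int64Dtype()
assert dtypes.bigframes_dtype_to_arrow_dtype(pd.BooleanDtype()) == pa.bool_()

# List and struct types are passed through as pandas ArrowDtype wrappers.
nested = pa.list_(pa.struct([("x", pa.int64()), ("y", pa.float64())]))
print(dtypes.arrow_dtype_to_bigframes_dtype(nested))  # ArrowDtype wrapping list<struct>
```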
+ if pd.api.types.is_list_like(literal): + element_types = [infer_literal_type(i) for i in literal] + common_type = lcd_type(*element_types) + as_arrow = bigframes_dtype_to_arrow_dtype(common_type) + return pd.ArrowDtype(as_arrow) + if pd.api.types.is_dict_like(literal): + fields = [ + (key, bigframes_dtype_to_arrow_dtype(infer_literal_type(literal[key]))) + for key in literal.keys() + ] + return pd.ArrowDtype(pa.struct(fields)) + if pd.isna(literal): + return None # Null value without a definite type + if isinstance(literal, (bool, np.bool_)): + return BOOL_DTYPE + if isinstance(literal, (int, np.integer)): + return INT_DTYPE + if isinstance(literal, (float, np.floating)): + return FLOAT_DTYPE + if isinstance(literal, decimal.Decimal): + return NUMERIC_DTYPE + if isinstance(literal, (str, np.str_)): + return STRING_DTYPE + if isinstance(literal, (bytes, np.bytes_)): + return BYTES_DTYPE + # Make sure to check datetime before date as datetimes are also dates + if isinstance(literal, (datetime.datetime, pd.Timestamp)): + if literal.tzinfo is not None: + return TIMESTAMP_DTYPE + else: + return DATETIME_DTYPE + if isinstance(literal, datetime.date): + return DATE_DTYPE + if isinstance(literal, datetime.time): + return TIME_DTYPE + else: + raise ValueError(f"Unable to infer type for value: {literal}") + - return _ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) +def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]: + if pd.isna(literal): + return None # Null value without a definite type + return bigframes_dtype_to_arrow_dtype(infer_literal_type(literal)) + + +# Don't have dtype for json, so just end up interpreting as STRING +_REMAPPED_TYPEKINDS = {"JSON": "STRING"} +_TK_TO_BIGFRAMES = { + type_kind: mapping.dtype + for mapping in SIMPLE_TYPES + for type_kind in mapping.type_kind +} + + +def convert_schema_field( + field: google.cloud.bigquery.SchemaField, +) -> typing.Tuple[str, Dtype]: + is_repeated = field.mode == "REPEATED" + if field.field_type == "RECORD": + mapped_fields = map(convert_schema_field, field.fields) + pa_struct = pa.struct( + (name, bigframes_dtype_to_arrow_dtype(dtype)) + for name, dtype in mapped_fields + ) + pa_type = pa.list_(pa_struct) if is_repeated else pa_struct + return field.name, pd.ArrowDtype(pa_type) + elif ( + field.field_type in _TK_TO_BIGFRAMES or field.field_type in _REMAPPED_TYPEKINDS + ): + singular_type = _TK_TO_BIGFRAMES[ + _REMAPPED_TYPEKINDS.get(field.field_type, field.field_type) + ] + if is_repeated: + pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(singular_type)) + return field.name, pd.ArrowDtype(pa_type) + else: + return field.name, singular_type + else: + raise ValueError(f"Cannot handle type: {field.field_type}") + + +def bf_type_from_type_kind( + bq_schema: list[google.cloud.bigquery.SchemaField], +) -> typing.Dict[str, Dtype]: + """Converts bigquery sql type to the default bigframes dtype.""" + return {name: dtype for name, dtype in map(convert_schema_field, bq_schema)} def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: @@ -266,6 +429,7 @@ def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool: return False +# Utilities for type coercion, and compatibility def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: """Whether scalar can be compare to items of dtype (though maybe requiring coercion). 
Returns the datatype that must be used for the comparison""" if is_dtype(scalar, dtype): @@ -337,47 +501,7 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: return result -def infer_literal_type(literal) -> typing.Optional[Dtype]: - if pd.isna(literal): - return None # Null value without a definite type - # Temporary logic, use ibis inferred type - from bigframes.core.compile.ibis_types import ( - ibis_dtype_to_bigframes_dtype, - literal_to_ibis_scalar, - ) - - ibis_literal = literal_to_ibis_scalar(literal) - return ibis_dtype_to_bigframes_dtype(ibis_literal.type()) - - -def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]: - if pd.isna(literal): - return None # Null value without a definite type - # Temporary logic, use ibis inferred type - # TODO: Directly convert instead of using ibis dtype as intermediate step - from bigframes.core.compile.ibis_types import ( - _ibis_dtype_to_arrow_dtype, - literal_to_ibis_scalar, - ) - - ibis_literal = literal_to_ibis_scalar(literal) - return _ibis_dtype_to_arrow_dtype(ibis_literal.type()) - - -def bf_type_from_type_kind(bf_schema) -> Dict[str, Dtype]: - """Converts bigquery sql type to the default bigframes dtype.""" - ibis_schema: ibis.Schema = third_party_ibis_bqtypes.BigQuerySchema.to_ibis( - bf_schema - ) - # TODO: Directly convert instead of using ibis dtype as intermediate step - from bigframes.core.compile.ibis_types import ibis_dtype_to_bigframes_dtype - - return { - name: ibis_dtype_to_bigframes_dtype(type) for name, type in ibis_schema.items() - } - - -# Remote functions use only +### Remote functions use only # TODO: Refactor into remote function module # Input and output types supported by BigQuery DataFrames remote functions. diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 472ac07547..920dc7c039 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -39,6 +39,7 @@ import warnings import ibis +import numpy import pandas import pyarrow import requests @@ -280,6 +281,9 @@ def generate_cloud_function_code( if is_row_processor: # bigframes remote function will send an entire row of data as json, # which would be converted to a pandas series and processed + # Ensure numpy versions match to avoid unpickling problems. See + # internal issue b/347934471. 
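For row-processor remote functions, the generated cloud function now pins numpy alongside pandas and pyarrow so the deployed environment matches the client that pickled the data. A rough sketch of how the version-pinned requirements list is assembled; the base requirement shown is hypothetical and the pinned versions depend on the local environment:

```python
# Sketch: building version-pinned requirements for the deployed cloud function.
# "cloudpickle" is a hypothetical base entry for illustration only.
import numpy
import pandas
import pyarrow

requirements = ["cloudpickle"]
requirements.append(f"numpy=={numpy.__version__}")
requirements.append(f"pandas=={pandas.__version__}")
requirements.append(f"pyarrow=={pyarrow.__version__}")
print("\n".join(sorted(requirements)))
```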
+ requirements.append(f"numpy=={numpy.__version__}") requirements.append(f"pandas=={pandas.__version__}") requirements.append(f"pyarrow=={pyarrow.__version__}") if package_requirements: diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 4b1a3fb7b7..f3621d3a33 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -24,6 +24,7 @@ import bigframes_vendored.sklearn.preprocessing._discretization import bigframes_vendored.sklearn.preprocessing._encoder import bigframes_vendored.sklearn.preprocessing._label +import bigframes_vendored.sklearn.preprocessing._polynomial from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils @@ -661,6 +662,109 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger +class PolynomialFeatures( + base.Transformer, + bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures, +): + __doc__ = ( + bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__ + ) + + def __init__(self, degree: int = 2): + self.degree = degree + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is PolynomialFeatures and self._bqml_model == other._bqml_model + ) + + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: + a list of column names to transform. + X (default None): + Ignored. + + Returns: a list of tuples of (sql_expression, output_name)""" + output_name = "poly_feat" + return [ + ( + self._base_sql_generator.ml_polynomial_expand( + columns, self.degree, output_name + ), + output_name, + ) + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]: + """Parse SQL to tuple(PolynomialFeatures, column_label). 
+ + Args: + sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)" + + Returns: + tuple(MaxAbsScaler, column_label)""" + col_label = sql[sql.find("STRUCT(") + 7 : sql.find(")")] + degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")]) + return cls(degree), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> PolynomialFeatures: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist()) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # TODO(garrettwu): generalize the approach to other transformers + output_names = [] + for transform_col in self._bqml_model._model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) + # pass the columns that are not transformed + if "transformSql" not in transform_col_dict: + continue + transform_sql: str = transform_col_dict["transformSql"] + if not transform_sql.startswith("ML."): + continue + + output_names.append(transform_col_dict["name"]) + + self._output_names = output_names + + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + # TODO(garrettwu): to_gbq() + + PreprocessingType = Union[ OneHotEncoder, StandardScaler, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index f060584a11..0399db3a10 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -73,6 +73,11 @@ def struct_options(self, **kwargs: Union[int, float]) -> str: """Encode a BQ STRUCT as options.""" return f"STRUCT({self.build_structs(**kwargs)})" + def struct_columns(self, columns: Iterable[str]) -> str: + """Encode a BQ Table columns to a STRUCT.""" + columns_str = ", ".join(columns) + return f"STRUCT({columns_str})" + def input(self, **kwargs: str) -> str: """Encode a BQML INPUT clause.""" return f"INPUT({self.build_schema(**kwargs)})" @@ -153,6 +158,13 @@ def ml_label_encoder( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + def ml_polynomial_expand( + self, columns: Iterable[str], degree: int, name: str + ) -> str: + """Encode ML.POLYNOMIAL_EXPAND. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand""" + return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {name}""" + def ml_distance( self, col_x: str, diff --git a/bigframes/series.py b/bigframes/series.py index d858060aec..eda95fa1e8 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -281,6 +281,11 @@ def reset_index( return bigframes.dataframe.DataFrame(block) def __repr__(self) -> str: + # Protect against errors with uninitialized Series. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + # TODO(swast): Add a timeout here? 
If the query is taking a long time, # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole series by using job @@ -318,7 +323,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: bool = True, + ordered: Optional[bool] = None, ) -> pandas.Series: """Writes Series to pandas Series. @@ -338,9 +343,10 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default True): + ordered (bool, default None): Determines whether the resulting pandas series will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. + In some cases, unordered may result in a faster-executing query. If set to a value + other than None, will override Session default. Returns: @@ -352,7 +358,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered, + ordered=ordered if ordered is not None else self._session._strictly_ordered, ) self._set_internal_query_job(query_job) series = df.squeeze(axis=1) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4c5ce21153..b0b2a3c418 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -64,7 +64,6 @@ import google.cloud.storage as storage # type: ignore import ibis import ibis.backends.bigquery as ibis_bigquery -import ibis.expr.types as ibis_types import jellyfish import numpy as np import pandas @@ -248,6 +247,8 @@ def __init__( # the ibis client has been created original_default_query_job_config = self.bqclient.default_query_job_config + # Only used to fetch remote function metadata. + # TODO: Remove in favor of raw bq client self.ibis_client = typing.cast( ibis_bigquery.Backend, ibis.bigquery.connect( @@ -296,7 +297,13 @@ def __init__( self._execution_count = 0 # Whether this session treats objects as totally ordered. # Will expose as feature later, only False for internal testing - self._strictly_ordered = True + self._strictly_ordered: bool = context._strictly_ordered + # Sequential index needs total ordering to generate, so use null index with unstrict ordering. 
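With `_strictly_ordered` now read from `BigQueryOptions`, an unordered session also switches the default index to NULL, and `to_pandas` defers to the session's ordering mode when `ordered` is left as `None`. A hedged sketch mirroring the test fixture later in this change; the table id is a placeholder and `_strictly_ordered` is an internal-only option:

```python
# Sketch: opting a whole session into unordered mode.
# The table id below is a placeholder for any readable table.
import bigframes

context = bigframes.BigQueryOptions(location="US", _strictly_ordered=False)
session = bigframes.Session(context=context)

df = session.read_gbq("my-project.my_dataset.my_table")  # placeholder
# ordered=None (the new default) defers to the session, which is unordered here;
# passing ordered=True would still request a deterministic row order.
pdf = df.to_pandas()
session.close()
```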
+ self._default_index_type: bigframes.enums.DefaultIndexKind = ( + bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + if context._strictly_ordered + else bigframes.enums.DefaultIndexKind.NULL + ) @property def bqclient(self): @@ -881,11 +888,11 @@ def _read_gbq_table( # Create Default Sequential Index if still have no index # ---------------------------------------------------- - # If no index columns provided or found, fall back to sequential index + # If no index columns provided or found, fall back to session default if (index_col != bigframes.enums.DefaultIndexKind.NULL) and len( index_cols ) == 0: - index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + index_col = self._default_index_type index_names: Sequence[Hashable] = index_cols if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: @@ -1497,14 +1504,14 @@ def _create_empty_temp_table( ) return bigquery.TableReference.from_string(table) - def _ibis_to_temp_table( + def _sql_to_temp_table( self, - table: ibis_types.Table, + sql: str, cluster_cols: Iterable[str], api_name: str, ) -> bigquery.TableReference: destination, _ = self._query_to_destination( - self.ibis_client.compile(table), + sql, index_cols=list(cluster_cols), api_name=api_name, ) @@ -1847,17 +1854,15 @@ def _cache_with_cluster_cols( # TODO: May want to support some partial ordering info even for non-strict ordering mode keep_order_info = self._strictly_ordered - compiled_value = self._compile_ordered(array_value) - - ibis_expr = compiled_value._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=keep_order_info + sql, ordering_info = bigframes.core.compile.compile_raw( + self._with_cached_executions(array_value.node) ) - tmp_table = self._ibis_to_temp_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cached" + tmp_table = self._sql_to_temp_table( + sql, cluster_cols=cluster_cols, api_name="cached" ) cached_replacement = array_value.as_cached( cache_table=self.bqclient.get_table(tmp_table), - ordering=compiled_value._ordering if keep_order_info else None, + ordering=ordering_info if keep_order_info else None, ).node self._cached_executions[array_value.node] = cached_replacement @@ -1869,13 +1874,14 @@ def _cache_with_offsets(self, array_value: core.ArrayValue): raise ValueError( "Caching with offsets only supported in strictly ordered mode." 
) - compiled_value = self._compile_ordered(array_value) - - ibis_expr = compiled_value._to_ibis_expr( - ordering_mode="offset_col", order_col_name="bigframes_offsets" + sql = bigframes.core.compile.compile_unordered( + self._with_cached_executions( + array_value.promote_offsets("bigframes_offsets").node + ) ) - tmp_table = self._ibis_to_temp_table( - ibis_expr, cluster_cols=["bigframes_offsets"], api_name="cached" + + tmp_table = self._sql_to_temp_table( + sql, cluster_cols=["bigframes_offsets"], api_name="cached" ) cached_replacement = array_value.as_cached( cache_table=self.bqclient.get_table(tmp_table), @@ -1935,14 +1941,14 @@ def _execute( array_value: core.ArrayValue, job_config: Optional[bigquery.job.QueryJobConfig] = None, *, - sorted: bool = True, + ordered: bool = True, dry_run=False, col_id_overrides: Mapping[str, str] = {}, ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: if not dry_run: self._add_execution(1) sql = self._to_sql( - array_value, sorted=sorted, col_id_overrides=col_id_overrides + array_value, ordered=ordered, col_id_overrides=col_id_overrides ) # type:ignore if job_config is None: job_config = bigquery.QueryJobConfig(dry_run=dry_run) @@ -1962,7 +1968,9 @@ def _peek( """A 'peek' efficiently accesses a small number of rows in the dataframe.""" if not tree_properties.peekable(self._with_cached_executions(array_value.node)): warnings.warn("Peeking this value cannot be done efficiently.") - sql = self._compile_unordered(array_value).peek_sql(n_rows) + sql = bigframes.core.compile.compile_peek( + self._with_cached_executions(array_value.node), n_rows + ) # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. @@ -1975,30 +1983,17 @@ def _to_sql( array_value: core.ArrayValue, offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, + ordered: bool = False, ) -> str: if offset_column: array_value = array_value.promote_offsets(offset_column) - if sorted: - return self._compile_ordered(array_value).to_sql( - col_id_overrides=col_id_overrides, sorted=True + node_w_cached = self._with_cached_executions(array_value.node) + if ordered: + return bigframes.core.compile.compile_ordered( + node_w_cached, col_id_overrides=col_id_overrides ) - return self._compile_unordered(array_value).to_sql( - col_id_overrides=col_id_overrides - ) - - def _compile_ordered( - self, array_value: core.ArrayValue - ) -> bigframes.core.compile.OrderedIR: - return bigframes.core.compile.compile_ordered_ir( - self._with_cached_executions(array_value.node) - ) - - def _compile_unordered( - self, array_value: core.ArrayValue - ) -> bigframes.core.compile.UnorderedIR: - return bigframes.core.compile.compile_unordered_ir( - self._with_cached_executions(array_value.node) + return bigframes.core.compile.compile_unordered( + node_w_cached, col_id_overrides=col_id_overrides ) def _get_table_size(self, destination_table): diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index f26ca26c2a..3a33352a67 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -32,6 +32,7 @@ import bigframes from bigframes.core import log_adapter +import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql import bigframes.formatting_helpers as formatting_helpers @@ -146,7 +147,9 @@ def create_temp_table( destination.schema = schema if cluster_columns: destination.clustering_fields = cluster_columns - 
bqclient.create_table(destination) + # Ok if already exists, since this will only happen from retries internal to this method + # as the requested table id has a random UUID4 component. + bqclient.create_table(destination, exists_ok=True) return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" @@ -478,7 +481,7 @@ def compile_filters(filters: third_party_pandas_gbq.FiltersType) -> str: operator_str = valid_operators[operator] - column_ref = bigframes.core.sql.identifier(column) + column_ref = googlesql.identifier(column) if operator_str in ["IN", "NOT IN"]: value_literal = bigframes.core.sql.multi_literal(*value) else: diff --git a/bigframes/streaming/__init__.py b/bigframes/streaming/__init__.py new file mode 100644 index 0000000000..16da677ef5 --- /dev/null +++ b/bigframes/streaming/__init__.py @@ -0,0 +1,139 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module for bigquery continuous queries""" + +import json +from typing import Optional + +from google.cloud import bigquery + +import bigframes + + +def to_bigtable( + query: str, + instance: str, + table: str, + bq_client: Optional[bigquery.Client] = None, + app_profile: Optional[str] = None, + truncate: bool = False, + overwrite: bool = False, + auto_create_column_families: bool = False, + bigtable_options: Optional[dict] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, +) -> bigquery.QueryJob: + """Launches a BigQuery continuous query and returns a + QueryJob object for some management functionality. + + This method requires an existing bigtable preconfigured to + accept the continuous query export statement. For instructions + on export to bigtable, see + https://cloud.google.com/bigquery/docs/export-to-bigtable. + + Args: + query (str): + The sql statement to execute as a continuous function. + For example: "SELECT * FROM dataset.table" + This will be wrapped in an EXPORT DATA statement to + launch a continuous query writing to bigtable. + instance (str): + The name of the bigtable instance to export to. + table (str): + The name of the bigtable table to export to. + bq_client (str, default None): + The Client object to use for the query. This determines + the project id and location of the query. If None, will + default to the bigframes global session default client. + app_profile (str, default None): + The bigtable app profile to export to. If None, no app + profile will be used. 
+ truncate (bool, default False): + The export truncate option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + overwrite (bool, default False): + The export overwrite option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + auto_create_column_families (bool, default False): + The auto_create_column_families option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + bigtable_options (dict, default None): + The bigtable options dict, which will be converted to JSON + using json.dumps, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + If None, no bigtable_options parameter will be passed. + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. 
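The system test added later in this change exercises this function end to end. A condensed usage sketch; the query, instance, and table names are placeholders for resources that must already exist, as the docstring notes:

```python
# Sketch: launching a continuous query that exports rows to Bigtable.
# Instance and table are placeholders for pre-provisioned Bigtable resources.
import bigframes.streaming

sql = """SELECT
  body_mass_g, island AS rowkey
FROM birds.penguins"""

job = bigframes.streaming.to_bigtable(
    sql,
    instance="streaming-testing-instance",
    table="table-testing",
    truncate=True,
    overwrite=True,
    auto_create_column_families=True,
    bigtable_options={},
    job_id_prefix="demo_streaming_",
)
print(job.job_id, job.running())
job.cancel()  # continuous queries run until cancelled
```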
+ """ + # get default client if not passed + if bq_client is None: + bq_client = bigframes.get_global_session().bqclient + + # build export string from parameters + project = bq_client.project + + app_profile_url_string = "" + if app_profile is not None: + app_profile_url_string = f"appProfiles/{app_profile}/" + + bigtable_options_parameter_string = "" + if bigtable_options is not None: + bigtable_options_parameter_string = ( + 'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n' + ) + + sql = ( + "EXPORT DATA\n" + "OPTIONS (\n" + "format = 'CLOUD_BIGTABLE',\n" + f"{bigtable_options_parameter_string}" + f"truncate = {str(truncate)},\n" + f"overwrite = {str(overwrite)},\n" + f"auto_create_column_families = {str(auto_create_column_families)},\n" + f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n' + ")\n" + "AS (\n" + f"{query});" + ) + + # override continuous http parameter + job_config = bigquery.job.QueryJobConfig() + job_config_filled = job_config.from_api_repr({"query": {"continuous": True}}) + + # begin the query job + query_job = bq_client.query( + sql, + job_config=job_config_filled, # type:ignore + # typing error above is in bq client library + # (should accept abstract job_config, only takes concrete) + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + # return the query job to the user for lifetime management + return query_job diff --git a/bigframes/version.py b/bigframes/version.py index 56a1200857..014b064071 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.9.0" +__version__ = "1.10.0" diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py new file mode 100644 index 0000000000..4bbad0048f --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py @@ -0,0 +1,19 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 8: largest two v3 by id6") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = ( + x[~x["v3"].isna()][["id6", "v3"]] + .sort_values("v3", ascending=False) + .groupby("id6", as_index=False, dropna=False) + .head(2) +) +print(ans.shape) +chk = [ans["v3"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/sort b/scripts/benchmark/db-benchmark/sort deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py b/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py new file mode 100644 index 0000000000..45cac7b543 --- /dev/null +++ b/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py @@ -0,0 +1,15 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/sort-pandas.py + +import bigframes.pandas as bpd + +print("Sort benchmark 1: sort by int id2") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") + +ans = x.sort_values("id2") +print(ans.shape) + +chk = [ans["v1"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/create_bigtable.py b/scripts/create_bigtable.py new file mode 100644 index 0000000000..655e4b31ab --- /dev/null +++ b/scripts/create_bigtable.py @@ -0,0 
+1,76 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script create the bigtable resources required for +# bigframes.streaming testing if they don't already exist + +import os +import pathlib +import sys + +import google.cloud.bigtable as bigtable + +REPO_ROOT = pathlib.Path(__file__).parent.parent + +PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") + +if not PROJECT_ID: + print( + "Please set GOOGLE_CLOUD_PROJECT environment variable before running.", + file=sys.stderr, + ) + sys.exit(1) + + +def create_instance(client): + instance_name = "streaming-testing-instance" + instance = bigtable.instance.Instance( + instance_name, + client, + ) + cluster_id = "streaming-testing-instance-c1" + cluster = instance.cluster( + cluster_id, + location_id="us-west1-a", + serve_nodes=1, + ) + if not instance.exists(): + operation = instance.create( + clusters=[cluster], + ) + operation.result(timeout=480) + print(f"Created instance {instance_name}") + return instance + + +def create_table(instance): + table_id = "table-testing" + table = bigtable.table.Table( + table_id, + instance, + ) + if not table.exists(): + table.create() + print(f"Created table {table_id}") + + +def main(): + client = bigtable.Client(project=PROJECT_ID, admin=True) + + instance = create_instance(client) + create_table(instance) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index d5d282d11a..dbd9ce5fc2 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ "gcsfs >=2023.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0dev", + "google-cloud-bigtable >=2.24.0", "google-cloud-bigquery[bqstorage,pandas] >=3.16.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 3c51668655..bbd7bf0069 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,6 +4,7 @@ fsspec==2023.3.0 gcsfs==2023.3.0 geopandas==0.12.2 google-auth==2.15.0 +google-cloud-bigtable==2.24.0 google-cloud-bigquery==3.16.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 06ad73a702..a41e6dc6b7 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -141,11 +141,8 @@ def session() -> Generator[bigframes.Session, None, None]: @pytest.fixture(scope="session") def unordered_session() -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions( - location="US", - ) + context = bigframes.BigQueryOptions(location="US", _strictly_ordered=False) session = bigframes.Session(context=context) - session._strictly_ordered = False yield session session.close() # close generated session at cleanup type diff --git a/tests/system/large/test_streaming.py b/tests/system/large/test_streaming.py new file mode 100644 index 0000000000..48db61e5bf --- /dev/null +++ b/tests/system/large/test_streaming.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +import bigframes.streaming + + +def test_streaming_to_bigtable(): + # launch a continuous query + job_id_prefix = "test_streaming_" + sql = """SELECT + body_mass_g, island as rowkey + FROM birds.penguins""" + query_job = bigframes.streaming.to_bigtable( + sql, + "streaming-testing-instance", + "table-testing", + app_profile=None, + truncate=True, + overwrite=True, + auto_create_column_families=True, + bigtable_options={}, + job_id=None, + job_id_prefix=job_id_prefix, + ) + + try: + # wait 100 seconds in order to ensure the query doesn't stop + # (i.e. it is continuous) + time.sleep(100) + assert query_job.error_result is None + assert query_job.errors is None + assert query_job.running() + assert str(query_job.job_id).startswith(job_id_prefix) + finally: + query_job.cancel() diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index f92207b191..472be3d2ad 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -94,8 +94,7 @@ def test_to_pandas_large_table(): # df will be downloaded locally expected_row_count, expected_column_count = df.shape - # TODO(b/340893653): fix type error - df = df.to_pandas() # type: ignore - row_count, column_count = df.shape + df_converted = df.to_pandas() + row_count, column_count = df_converted.shape assert column_count == expected_column_count assert row_count == expected_row_count diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 5b457cc9c0..73b1855e09 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,6 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing +from tests.system import utils ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) @@ -840,3 +841,69 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id): # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. 
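The tests that follow exercise the new `bigframes.ml.preprocessing.PolynomialFeatures` transformer. A short usage sketch, assuming a penguins-style table with two numeric culmen columns is available; the table id is a placeholder:

```python
# Sketch: expanding two numeric columns into degree-2 polynomial features.
# The table id is a placeholder; any DataFrame with numeric columns works.
import bigframes.pandas as bpd
from bigframes.ml import preprocessing

df = bpd.read_gbq("my-project.my_dataset.penguins")[  # placeholder table
    ["culmen_length_mm", "culmen_depth_mm"]
]

transformer = preprocessing.PolynomialFeatures(degree=2)
transformer.fit(df)

# Output columns are named poly_feat_<col>[_<col>...], one per expanded term.
expanded = transformer.transform(df)
print(expanded.columns.tolist())
```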
+ + +def test_poly_features_default_params(new_penguins_df): + transformer = preprocessing.PolynomialFeatures() + df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]] + transformer.fit(df) + + result = transformer.transform(df).to_pandas() + + expected = pd.DataFrame( + { + "poly_feat_culmen_length_mm": [ + 39.5, + 38.5, + 37.9, + ], + "poly_feat_culmen_length_mm_culmen_length_mm": [ + 1560.25, + 1482.25, + 1436.41, + ], + "poly_feat_culmen_length_mm_culmen_depth_mm": [ + 742.6, + 662.2, + 685.99, + ], + "poly_feat_culmen_depth_mm": [ + 18.8, + 17.2, + 18.1, + ], + "poly_feat_culmen_depth_mm_culmen_depth_mm": [ + 353.44, + 295.84, + 327.61, + ], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) + + +def test_poly_features_params(new_penguins_df): + transformer = preprocessing.PolynomialFeatures(degree=3) + df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]] + transformer.fit(df) + + result = transformer.transform(df).to_pandas() + + utils.check_pandas_df_schema_and_index( + result, + [ + "poly_feat_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm", + ], + [1633, 1672, 1690], + ) diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index e0ef84641c..7be44e0a0f 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from matplotlib.collections import PathCollection import numpy as np import pandas as pd import pandas._testing as tm @@ -258,9 +259,10 @@ def test_scatter_args_s(s): ax = df.plot.scatter(x="a", y="b", s="s") pd_ax = pd_df.plot.scatter(x="a", y="b", s="s") - # TODO(b/340891723): fix type error + + assert isinstance(pd_ax.collections[0], PathCollection) tm.assert_numpy_array_equal( - ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() # type: ignore + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index d5854bd8d0..0aac9e2578 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -270,6 +270,44 @@ def test_get_columns_default(scalars_dfs): assert result == "default_val" +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. 
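`DataFrame.insert` mutates the frame in place, which is why the test copies its inputs first. A minimal sketch of the API; the column values are illustrative:

```python
# Sketch: DataFrame.insert adds a column at a given position, in place.
import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

df.insert(1, "c", 99)  # scalar broadcast, inserted at position 1
df.insert(0, "a", [7, 8, 9], allow_duplicates=True)  # duplicate label allowed explicitly

# Out-of-range positions raise IndexError; duplicate labels without
# allow_duplicates=True raise ValueError, matching pandas behavior.
print(df.columns.tolist())
```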
+ bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + def test_drop_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -346,7 +384,7 @@ def test_df_info(scalars_dfs): " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" - "memory usage: 945 bytes\n" + "memory usage: 1269 bytes\n" ) scalars_df, _ = scalars_dfs @@ -2982,6 +3020,29 @@ def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_in ) +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + # (b/347072677) GEOGRAPH type doesn't support DISTINCT op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 + bf_result = scalars_df_index.loc[ + :, scalars_df_index.nunique() > size_half + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.nunique() > size_half + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): scalars_df_index = scalars_df_index.set_index("string_col", drop=False) scalars_pandas_df_index = scalars_pandas_df_index.set_index( @@ -4267,6 +4328,18 @@ def test_df_cached(scalars_df_index): pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) +def test_df_cache_with_implicit_join(scalars_df_index): + """expectation is that cache will be used, but no explicit join will be performed""" + df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 + df.cache() + bf_result = df + (df * 2) + sql = bf_result.sql + + # Very crude asserts, want sql to not use join and not use base table, only reference cached table + assert "JOIN" not in sql + assert "bigframes_testing" not in sql + + def test_df_dot_inline(session): df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index f36dd64cbe..8adbea88e4 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -16,6 +16,7 @@ import google.api_core.exceptions import pandas as pd +import pandas.testing import pyarrow as pa import pytest @@ -35,6 +36,102 @@ import bigframes.pandas as bpd +def test_sql_executes(scalars_df_default_index, bigquery_client): + """Test that DataFrame.sql returns executable SQL. + + DF.sql is used in public documentation such as + https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools + as a way to pass a DataFrame on to carto without executing the SQL + immediately. + + Make sure that this SQL can be run outside of BigQuery DataFrames (assuming + similar credentials / access to the referenced tables). 
+ """ + # Do some operations to make for more complex SQL. + df = ( + scalars_df_default_index.drop(columns=["geography_col"]) + .groupby("string_col") + .max() + ) + df.index.name = None # Don't include unnamed indexes. + query = df.sql + + bf_result = df.to_pandas().sort_values("rowindex").reset_index(drop=True) + bq_result = ( + bigquery_client.query_and_wait(query) + .to_dataframe() + .sort_values("rowindex") + .reset_index(drop=True) + ) + pandas.testing.assert_frame_equal(bf_result, bq_result, check_dtype=False) + + +def test_sql_executes_and_includes_named_index( + scalars_df_default_index, bigquery_client +): + """Test that DataFrame.sql returns executable SQL. + + DF.sql is used in public documentation such as + https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools + as a way to pass a DataFrame on to carto without executing the SQL + immediately. + + Make sure that this SQL can be run outside of BigQuery DataFrames (assuming + similar credentials / access to the referenced tables). + """ + # Do some operations to make for more complex SQL. + df = ( + scalars_df_default_index.drop(columns=["geography_col"]) + .groupby("string_col") + .max() + ) + query = df.sql + + bf_result = df.to_pandas().sort_values("rowindex") + bq_result = ( + bigquery_client.query_and_wait(query) + .to_dataframe() + .set_index("string_col") + .sort_values("rowindex") + ) + pandas.testing.assert_frame_equal( + bf_result, bq_result, check_dtype=False, check_index_type=False + ) + + +def test_sql_executes_and_includes_named_multiindex( + scalars_df_default_index, bigquery_client +): + """Test that DataFrame.sql returns executable SQL. + + DF.sql is used in public documentation such as + https://cloud.google.com/blog/products/data-analytics/using-bigquery-dataframes-with-carto-geospatial-tools + as a way to pass a DataFrame on to carto without executing the SQL + immediately. + + Make sure that this SQL can be run outside of BigQuery DataFrames (assuming + similar credentials / access to the referenced tables). + """ + # Do some operations to make for more complex SQL. 
+ df = ( + scalars_df_default_index.drop(columns=["geography_col"]) + .groupby(["string_col", "bool_col"]) + .max() + ) + query = df.sql + + bf_result = df.to_pandas().sort_values("rowindex") + bq_result = ( + bigquery_client.query_and_wait(query) + .to_dataframe() + .set_index(["string_col", "bool_col"]) + .sort_values("rowindex") + ) + pandas.testing.assert_frame_equal( + bf_result, bq_result, check_dtype=False, check_index_type=False + ) + + def test_to_pandas_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas() APIs returns the expected dtypes.""" actual = scalars_df_default_index.to_pandas().dtypes diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index b332d48574..960dc10948 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -53,6 +53,13 @@ def test_dataframe_groupby_numeric_aggregate( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_head(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + bf_result = scalars_df_index[col_names].groupby("bool_col").head(2).to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").head(2) + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( @@ -442,6 +449,19 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) +@pytest.mark.parametrize("dropna", [True, False]) +def test_series_groupby_head(scalars_df_index, scalars_pandas_df_index, dropna): + bf_result = ( + scalars_df_index.groupby("bool_col", dropna=dropna)["int64_too"] + .head(1) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.groupby("bool_col", dropna=dropna)[ + "int64_too" + ].head(1) + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): bf_result = ( scalars_df_index["int64_too"] diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index de631ee20e..ab2a9c19b8 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -850,14 +850,13 @@ def test_column_multi_index_stack(level): bf_result = bf_df.stack(level=level).to_pandas() # BigFrames emulates future_stack impl - # TODO(b/340884387): fix type error - pd_result = pd_df.stack(level=level, future_stack=True) # type: ignore + pd_result = pd_df.stack(level=level, future_stack=True) # Pandas produces NaN, where bq dataframes produces pd.NA # Column ordering seems to depend on pandas version - # TODO(b/340884387): fix type error + assert isinstance(pd_result, pandas.DataFrame) pandas.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False # type: ignore + bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -891,11 +890,9 @@ def test_column_multi_index_melt(): def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] - # TODO(b/340884387): fix type error - level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") # type: ignore + level1: pandas.Index = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") # Need resulting column to be pyarrow string rather than object dtype 
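The `groupby(...).head(n)` tests added above mirror pandas semantics: the first n rows of each group are kept without aggregating, and the API is available on both DataFrame and Series group-bys. A tiny sketch:

```python
# Sketch: groupby head keeps the first n rows per group.
import bigframes.pandas as bpd

df = bpd.DataFrame(
    {
        "key": ["a", "a", "a", "b", "b"],
        "value": [1, 2, 3, 4, 5],
    }
)

# First two rows of each 'key' group; also available on SeriesGroupBy.
print(df.groupby("key").head(2).to_pandas())
print(df.groupby("key", dropna=False)["value"].head(1).to_pandas())
```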
- # TODO(b/340884387): fix type error - level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") # type: ignore + level2: pandas.Index = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) bf_df = scalars_df_index[columns].copy() bf_df.columns = multi_columns @@ -1189,10 +1186,12 @@ def test_explode_w_multi_index(): df = bpd.DataFrame(data, columns=multi_level_columns) pd_df = df.to_pandas() - # TODO(b/340884387): fix type error + + assert isinstance(pd_df, pandas.DataFrame) + assert isinstance(pd_df["col0"], pandas.DataFrame) pandas.testing.assert_frame_equal( df["col0"].explode("col00").to_pandas(), - pd_df["col0"].explode("col00"), # type: ignore + pd_df["col0"].explode("col00"), check_dtype=False, check_index_type=False, ) @@ -1202,8 +1201,7 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "c", "d"]) # Need resulting column to be pyarrow string rather than object dtype - # TODO(b/340884387): fix type error - level2 = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") # type: ignore + level2: pandas.Index = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) bf_df = scalars_df_index[columns].copy() bf_df.columns = multi_columns diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d84d520988..5838ad75b0 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -742,109 +742,6 @@ def test_read_gbq_function_enforces_explicit_types( ) -@pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1(session, scalars_dfs): - columns = [ - "bool_col", - "int64_col", - "int64_too", - "float64_col", - "string_col", - "bytes_col", - ] - scalars_df, scalars_pandas_df = scalars_dfs - - def add_ints(row): - return row["int64_col"] + row["int64_too"] - - with pytest.warns( - bigframes.exceptions.PreviewWarning, - match="input_types=Series is in preview.", - ): - add_ints_remote = session.remote_function( - bigframes.series.Series, - int, - )(add_ints) - - with pytest.warns( - bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." - ): - bf_result = scalars_df[columns].apply(add_ints_remote, axis=1).to_pandas() - - pd_result = scalars_pandas_df[columns].apply(add_ints, axis=1) - - # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this - # mismatch by using check_dtype=False. - # - # bf_result.to_numpy() produces an array of numpy.float64's - # (in system_prerelease tests), while pd_result.to_numpy() produces an - # array of ints, ignore this mismatch by using check_exact=False. 
- pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_ordering(session, scalars_dfs): - columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] - ordering_columns = ["bool_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - - def add_ints(row): - return row["int64_col"] + row["int64_too"] - - add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints) - - bf_result = ( - scalars_df[columns] - .sort_values(ordering_columns) - .apply(add_ints_remote, axis=1) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df[columns].sort_values(ordering_columns).apply(add_ints, axis=1) - ) - - # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this - # mismatch by using check_dtype=False. - # - # bf_result.to_numpy() produces an array of numpy.float64's - # (in system_prerelease tests), while pd_result.to_numpy() produces an - # array of ints, ignore this mismatch by using check_exact=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=False - ) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_multiindex(session): - pd_df = pd.DataFrame( - {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]}, - index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]), - ) - bf_df = session.read_pandas(pd_df) - - def add_numbers(row): - return row["x"] + row["y"] - - add_numbers_remote = session.remote_function(bigframes.series.Series, float)( - add_numbers - ) - - bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() - pd_result = pd_df.apply(add_numbers, axis=1) - - # bf_result.dtype is 'Float64' while pd_result.dtype is 'float64', ignore this - # mismatch by using check_dtype=False. - # - # bf_result.index[0].dtype is 'string[pyarrow]' while - # pd_result.index[0].dtype is 'object', ignore this mismatch by using - # check_index_type=False. - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - def test_df_apply_axis_1_unsupported_callable(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 9631e0c7ab..5d53a5af17 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import io import random import re @@ -22,6 +21,7 @@ from typing import List, Optional, Sequence import warnings +import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq import google import google.cloud.bigquery as bigquery import numpy as np @@ -302,10 +302,13 @@ def test_read_gbq_w_primary_keys_table_and_filters( df = session.read_gbq( f"{table.project}.{table.dataset_id}.{table.table_id}", - filters=[ - ("name", "LIKE", "W%"), - ("total_people", ">", 100), - ], # type: ignore + filters=typing.cast( + vendored_pandas_gbq.FiltersType, + [ + ("name", "LIKE", "W%"), + ("total_people", ">", 100), + ], + ), ) result = df.to_pandas() @@ -397,7 +400,10 @@ def test_read_gbq_on_linked_dataset_warns(session): def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): df = session.read_gbq_table( "bigquery-public-data.cloud_storage_geo_index.landsat_index", - filters=[[("sensor_id", "LIKE", "OLI%")], [("sensor_id", "LIKE", "%TIRS")]], # type: ignore + filters=typing.cast( + vendored_pandas_gbq.FiltersType, + [[("sensor_id", "LIKE", "OLI%")], [("sensor_id", "LIKE", "%TIRS")]], + ), columns=["sensor_id"], ) sensors = df.groupby(["sensor_id"]).agg("count").to_pandas(ordered=False) @@ -581,8 +587,8 @@ def test_read_pandas(session, scalars_dfs): def test_read_pandas_series(session): - # TODO(b/340887657): fix type error - idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) # type: ignore + + idx: pd.Index = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) bf_series = session.read_pandas(pd_series) @@ -590,8 +596,8 @@ def test_read_pandas_series(session): def test_read_pandas_index(session): - # TODO(b/340887657): fix type error - pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) # type: ignore + + pd_idx: pd.Index = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) bf_idx = session.read_pandas(pd_idx) pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) @@ -609,8 +615,9 @@ def test_read_pandas_inline_respects_location(): df = session.read_pandas(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) repr(df) - # TODO(b/340887657): fix type error - table = session.bqclient.get_table(df.query_job.destination) # type: ignore + assert df.query_job is not None + + table = session.bqclient.get_table(df.query_job.destination) assert table.location == "europe-west1" diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 12c0d6e259..d555cedcc0 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pandas as pd +import pyarrow as pa import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas def test_unordered_mode_cache_aggregate(unordered_session): @@ -26,3 +27,35 @@ def test_unordered_mode_cache_aggregate(unordered_session): pd_result = pd_df - pd_df.mean() assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@skip_legacy_pandas +def test_unordered_mode_read_gbq(unordered_session): + df = unordered_session.read_gbq( + """SELECT + [1, 3, 2] AS array_column, + STRUCT( + "a" AS string_field, + 1.2 AS float_field) AS struct_column""" + ) + expected = pd.DataFrame( + { + "array_column": pd.Series( + [[1, 3, 2]], + dtype=(pd.ArrowDtype(pa.list_(pa.int64()))), + ), + "struct_column": pd.Series( + [{"string_field": "a", "float_field": 1.2}], + dtype=pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + ), + } + ) + # Don't need ignore_order as there is only 1 row + assert_pandas_df_equal(df.to_pandas(), expected) diff --git a/tests/unit/core/compiler/__init__.py b/tests/unit/core/compile/__init__.py similarity index 100% rename from tests/unit/core/compiler/__init__.py rename to tests/unit/core/compile/__init__.py diff --git a/tests/unit/core/compile/googlesql/__init__.py b/tests/unit/core/compile/googlesql/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/unit/core/compile/googlesql/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compile/googlesql/test_expression.py b/tests/unit/core/compile/googlesql/test_expression.py new file mode 100644 index 0000000000..e72598b176 --- /dev/null +++ b/tests/unit/core/compile/googlesql/test_expression.py @@ -0,0 +1,37 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.core.compile.googlesql as sql + + +@pytest.mark.parametrize( + ("table_id", "dataset_id", "project_id", "expected"), + [ + pytest.param("a", None, None, "`a`"), + pytest.param("a", "b", None, "`b`.`a`"), + pytest.param("a", "b", "c", "`c`.`b`.`a`"), + pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), + ], +) +def test_table_expression(table_id, dataset_id, project_id, expected): + expr = sql.TableExpression( + table_id=table_id, dataset_id=dataset_id, project_id=project_id + ) + assert expr.sql() == expected + + +def test_escape_chars(): + assert sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`" diff --git a/tests/unit/core/compile/googlesql/test_function.py b/tests/unit/core/compile/googlesql/test_function.py new file mode 100644 index 0000000000..4edfda6f34 --- /dev/null +++ b/tests/unit/core/compile/googlesql/test_function.py @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.core.compile.googlesql as sql + + +def test_cast(): + col = sql.ColumnExpression("col") + assert sql.Cast(col, sql.DataType.STRING).sql() == "CAST (`col` AS STRING)" + assert sql.Cast(col, sql.DataType.FLOAT64).sql() == "CAST (`col` AS FLOAT64)" diff --git a/tests/unit/core/compiler/test_googlesql.py b/tests/unit/core/compile/googlesql/test_query.py similarity index 61% rename from tests/unit/core/compiler/test_googlesql.py rename to tests/unit/core/compile/googlesql/test_query.py index 70ca5cfa12..b8d1d024e2 100644 --- a/tests/unit/core/compiler/test_googlesql.py +++ b/tests/unit/core/compile/googlesql/test_query.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from unittest.mock import MagicMock + +import google.cloud.bigquery as bigquery import pytest import bigframes.core.compile.googlesql as sql @@ -36,16 +39,13 @@ def test_table_expression(table_id, dataset_id, project_id, expected): @pytest.mark.parametrize( ("table_name", "alias", "expected"), [ - pytest.param(None, None, None, marks=pytest.mark.xfail(raises=ValueError)), pytest.param("a", None, "`a`"), pytest.param("a", "aa", "`a` AS `aa`"), ], ) def test_from_item_w_table_name(table_name, alias, expected): expr = sql.FromItem( - table_name=None - if table_name is None - else sql.TableExpression(table_id=table_name), + sql.TableExpression(table_id=table_name), as_alias=None if alias is None else sql.AsAlias(sql.AliasExpression(alias=alias)), @@ -55,7 +55,7 @@ def test_from_item_w_table_name(table_name, alias, expected): def test_from_item_w_query_expr(): from_clause = sql.FromClause( - sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + sql.FromItem(expression=sql.TableExpression(table_id="table_a")) ) select = sql.Select( select_list=[sql.SelectAll(sql.StarExpression())], @@ -65,19 +65,30 @@ def test_from_item_w_query_expr(): expected = "SELECT\n*\nFROM\n`table_a`" # A QueryExpr object - expr = sql.FromItem(query_expr=query_expr) + expr = sql.FromItem(expression=query_expr) assert expr.sql() == f"({expected})" # A str object - expr = sql.FromItem(query_expr=expected) + expr = sql.FromItem(expression=expected) assert expr.sql() == f"({expected})" def test_from_item_w_cte(): - expr = sql.FromItem(cte_name=sql.CTEExpression("test")) + expr = sql.FromItem(expression=sql.CTEExpression("test")) assert expr.sql() == "`test`" +def test_from_item_w_table_ref(): + mock_table_ref = MagicMock(spec=bigquery.TableReference) + mock_table_ref.table_id = "mock_table" + mock_table_ref.dataset_id = "mock_dataset" + mock_table_ref.project = "mock_project" + + from_item = sql.FromItem.from_source(mock_table_ref) + + assert from_item.sql() == "`mock_project`.`mock_dataset`.`mock_table`" + + @pytest.mark.parametrize( ("col_name", "alias", "expected"), [ @@ -98,9 +109,9 @@ def test_select(): select_2 = sql.SelectExpression( expression=sql.ColumnExpression("b"), alias=sql.AliasExpression(alias="bb") ) - from_1 = sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + from_1 = sql.FromItem(expression=sql.TableExpression(table_id="table_a")) from_2 = sql.FromItem( - query_expr="SELECT * FROM project.table_b", + expression="SELECT * FROM project.table_b", as_alias=sql.AsAlias(sql.AliasExpression(alias="table_b")), ) expr = sql.Select( @@ -112,10 +123,58 @@ def test_select(): assert expr.sql() == expected +@pytest.mark.parametrize( + "columns, source, expected", + [ + ( + ["a", "b", "c"], + "select * from test", + "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n(select * from test)", + ), + ( + "a", + "select * from test", + "SELECT\nDISTINCT\n`a`\nFROM\n(select * from test)", + ), + ], +) +def test_select_from_str(columns, source, expected): + expr = sql.Select().from_(source).select(columns, distinct=True) + assert expr.sql() == expected + + +@pytest.mark.parametrize( + ("columns", "distinct", "expected"), + [ + pytest.param( + ["a", "b", "c"], + True, + "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", + ), + pytest.param( + None, + True, + "SELECT\nDISTINCT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", + ), + pytest.param( + None, False, "SELECT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`" + ), + ], +) +def 
test_select_from_table_ref(columns, distinct, expected): + mock_table_ref = MagicMock(spec=bigquery.TableReference) + mock_table_ref.table_id = "mock_table" + mock_table_ref.dataset_id = "mock_dataset" + mock_table_ref.project = "mock_project" + + expr = sql.Select().from_(mock_table_ref).select(columns, distinct=distinct) + assert expr.sql() == expected + + def test_query_expr_w_cte(): # Test a simple SELECT query. from_clause1 = sql.FromClause( - sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + sql.FromItem(expression=sql.TableExpression(table_id="table_a")) ) select1 = sql.Select( select_list=[sql.SelectAll(sql.StarExpression())], @@ -143,13 +202,22 @@ def test_query_expr_w_cte(): sql.SelectAll(sql.StarExpression(parent=cte2.cte_name)), ], from_clause_list=[ - sql.FromClause(sql.FromItem(cte_name=cte1.cte_name)), - sql.FromClause(sql.FromItem(cte_name=cte2.cte_name)), + sql.FromClause(sql.FromItem(expression=cte1.cte_name)), + sql.FromClause(sql.FromItem(expression=cte2.cte_name)), ], + distinct=True, ) - select2_sql = "SELECT\n`a`.`column_x`,\n`b`.*\nFROM\n`a`,\n`b`" + select2_sql = "SELECT\nDISTINCT\n`a`.`column_x`,\n`b`.*\nFROM\n`a`,\n`b`" assert select2.sql() == select2_sql query2 = sql.QueryExpr(select=select2, with_cte_list=with_cte_list) query2_sql = f"WITH {cte1_sql},\n{cte2_sql}\n{select2_sql}" assert query2.sql() == query2_sql + + +def test_identifier(): + assert sql.identifier("\aa") == r"`\aa`" + + +def test_escape_chars(): + assert sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`" diff --git a/tests/unit/core/test_indexes.py b/tests/unit/core/test_indexes.py new file mode 100644 index 0000000000..6e739c9dc9 --- /dev/null +++ b/tests/unit/core/test_indexes.py @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.core.indexes + + +def test_index_repr_with_uninitialized_object(): + """Ensures Index.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + index = object.__new__(bigframes.core.indexes.Index) + got = repr(index) + assert "Index" in got + + +def test_multiindex_repr_with_uninitialized_object(): + """Ensures MultiIndex.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. 
+ # https://stackoverflow.com/a/6384982/101923 + index = object.__new__(bigframes.core.indexes.MultiIndex) + got = repr(index) + assert "MultiIndex" in got diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 01f173812c..e90146565d 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -145,6 +145,13 @@ def test_label_encoder_correct( assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" +def test_polynomial_expand( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_polynomial_expand(["col_a", "col_b"], 2, "poly_exp") + assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp" + + def test_distance_correct( base_sql_generator: ml_sql.BaseSqlGenerator, mock_df: bpd.DataFrame, diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 84699459e6..d45da82ab9 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -20,6 +20,7 @@ import google.cloud.bigquery import ibis import pandas +import pyarrow as pa import pytest import bigframes @@ -130,18 +131,9 @@ def create_arrayvalue( df: pandas.DataFrame, total_ordering_columns: List[str] ) -> core.ArrayValue: session = create_pandas_session({"test_table": df}) - ibis_table = session.ibis_client.table("test_table") - columns = tuple(ibis_table[key] for key in ibis_table.columns) - ordering = bigframes.core.ordering.ExpressionOrdering( - tuple( - [core.orderings.ascending_over(column) for column in total_ordering_columns] - ), - total_ordering_columns=frozenset(total_ordering_columns), - ) - return core.ArrayValue.from_ibis( + return core.ArrayValue.from_pyarrow( + arrow_table=pa.Table.from_pandas(df, preserve_index=False), session=session, - table=ibis_table, - columns=columns, - hidden_ordering_columns=(), - ordering=ordering, + ).order_by( + [bigframes.core.ordering.ascending_over(col) for col in total_ordering_columns] ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py deleted file mode 100644 index 0a2fc61418..0000000000 --- a/tests/unit/test_core.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ibis.expr.types as ibis_types -import pandas - -import bigframes.core as core -import bigframes.core.expression as ex -import bigframes.core.ordering as order -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops - -from . 
import resources - - -def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): - session = resources.create_pandas_session( - { - "test_table": pandas.DataFrame( - { - "col1": [1, 2, 3], - "not_included": [True, False, True], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ) - } - ) - ibis_table = session.ibis_client.table("test_table") - columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"]) - ordering = order.ExpressionOrdering( - tuple([order.ascending_over("col1")]), - total_ordering_columns=frozenset(["col1"]), - ) - actual = core.ArrayValue.from_ibis( - session=session, - table=ibis_table, - columns=columns, - ordering=ordering, - hidden_ordering_columns=(), - ) - assert actual._compile_ordered()._table is ibis_table - assert len(actual.column_ids) == 3 - - -def test_arrayvalue_with_get_column_type(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - col1_type = value.get_column_type("col1") - col2_type = value.get_column_type("col2") - col3_type = value.get_column_type("col3") - assert isinstance(col1_type, pandas.Int64Dtype) - assert isinstance(col2_type, pandas.StringDtype) - assert isinstance(col3_type, pandas.Float64Dtype) - - -def test_arrayvalue_with_get_column(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - col1 = value._compile_ordered()._get_ibis_column("col1") - assert isinstance(col1, ibis_types.Value) - assert col1.get_name() == "col1" - assert col1.type().is_int64() - - -def test_arrayvalues_to_ibis_expr_with_get_column(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value._compile_ordered()._get_ibis_column("col1") - assert expr.get_name() == "col1" - assert expr.type().is_int64() - - -def test_arrayvalues_to_ibis_expr_with_concat(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.concat([value]) - actual = expr._compile_unordered()._to_ibis_expr() - assert len(actual.columns) == 3 - # TODO(ashleyxu, b/299631930): test out the union expression - assert actual.columns[0] == "column_0" - assert actual.columns[1] == "column_1" - assert actual.columns[2] == "column_2" - - -def test_arrayvalues_to_ibis_expr_with_project_unary_op(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.project_to_id( - ops.AsTypeOp("string").as_expr("col1"), output_id="col1" - )._compile_ordered() - assert value._compile_ordered().columns[0].type().is_int64() - assert expr.columns[0].type().is_string() - - -def test_arrayvalues_to_ibis_expr_with_project_binary_op(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": [0.2, 0.3, 0.4], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.project_to_id( - ops.add_op.as_expr("col2", "col3"), "col4" - )._compile_ordered() - assert expr.columns[3].type().is_float64() - actual = 
expr._to_ibis_expr(ordering_mode="unordered") - assert len(expr.columns) == 4 - assert actual.columns[3] == "col4" - - -def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": [0.2, 0.3, 0.4], - "col3": [True, False, False], - "col4": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.project_to_id( - ops.where_op.as_expr("col2", "col3", "col4"), "col5" - )._compile_ordered() - assert expr.columns[4].type().is_float64() - actual = expr._to_ibis_expr(ordering_mode="unordered") - assert len(expr.columns) == 5 - assert actual.columns[4] == "col5" - - -def test_arrayvalue_to_ibis_expr_with_aggregate(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.aggregate( - aggregations=( - (ex.UnaryAggregation(agg_ops.sum_op, ex.free_var("col1")), "col4"), - ), - by_column_ids=["col1"], - dropna=False, - )._compile_ordered() - actual = expr._to_ibis_expr(ordering_mode="unordered") - assert len(expr.columns) == 2 - assert actual.columns[0] == "col1" - assert actual.columns[1] == "col4" - assert expr.columns[1].type().is_int64() diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 17a8290889..6370d1b987 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -15,9 +15,23 @@ import google.cloud.bigquery import pytest +import bigframes.dataframe + from . import resources +def test_dataframe_repr_with_uninitialized_object(): + """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + dataframe = bigframes.dataframe.DataFrame.__new__(bigframes.dataframe.DataFrame) + got = repr(dataframe) + assert "DataFrame" in got + + def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch): dataframe = resources.create_dataframe(monkeypatch) diff --git a/tests/unit/test_series.py b/tests/unit/test_series.py new file mode 100644 index 0000000000..1409209c6c --- /dev/null +++ b/tests/unit/test_series.py @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.series + + +def test_series_repr_with_uninitialized_object(): + """Ensures Series.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. 
+ # https://stackoverflow.com/a/6384982/101923 + series = bigframes.series.Series.__new__(bigframes.series.Series) + got = repr(series) + assert "Series" in got diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d46fa4cfc7..f8088f8060 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1067,6 +1067,51 @@ def reindex_like(self, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def insert(self, loc, column, value, allow_duplicates=False): + """Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + + Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. + + >>> df.insert(1, 'col3', 5) + >>> df + col1 col3 col2 + 0 1 5 3 + 1 2 5 4 + + [2 rows x 3 columns] + + Insert another column named 'col2' at the beginning of the DataFrame with values [5, 6] + + >>> df.insert(0, 'col2', [5, 6], allow_duplicates=True) + >>> df + col2 col1 col3 col2 + 0 5 1 5 3 + 1 6 2 5 4 + + [2 rows x 4 columns] + + Args: + loc (int): + Insertion index. Must verify 0 <= loc <= len(columns). + column (str, number, or hashable object): + Label of the inserted column. + value (Scalar, Series, or array-like): + Content of the inserted column. + allow_duplicates (bool, default False): + Allow duplicate column labels to be created. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> DataFrame | None: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000..4e4624ba84 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,38 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin + +from bigframes import constants + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features.""" + + def fit(self, X, y=None): + """Compute number of output features. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Dataframe or Series with training data. + + y (default None): + Ignored. + + Returns: + PolynomialFeatures: Fitted transformer. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Transform data to polynomial features. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: Transformed result. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
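The vendored PolynomialFeatures stub above only documents the fit/transform surface; the concrete transformer is expected to live in bigframes.ml.preprocessing. The following is a minimal usage sketch, not an excerpt from this change: it assumes the concrete class mirrors the vendored fit/transform interface and, like its scikit-learn counterpart, accepts a degree argument, and that a BigQuery session/project is already configured.

# Illustrative sketch only. Assumes bigframes.ml.preprocessing.PolynomialFeatures
# implements the fit/transform interface documented in the vendored stub above
# and accepts a scikit-learn-style `degree` argument.
import bigframes.pandas as bpd
from bigframes.ml.preprocessing import PolynomialFeatures

# Small in-memory frame; column names here are placeholders.
df = bpd.DataFrame({"col_a": [1.0, 2.0, 3.0], "col_b": [4.0, 5.0, 6.0]})

poly = PolynomialFeatures(degree=2)
poly.fit(df)                   # computes the set of output features
expanded = poly.transform(df)  # DataFrame with polynomial/interaction terms
print(expanded.to_pandas())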