Skip to content

Commit 6ef878d

Browse files
Merge remote-tracking branch 'github/main' into unordered_mode
2 parents bfcdeb9 + b4fbb51 commit 6ef878d

File tree

23 files changed

+622
-130
lines changed

23 files changed

+622
-130
lines changed

bigframes/core/__init__.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,11 @@ def try_align_as_projection(
507507
join_type: join_def.JoinType,
508508
mappings: typing.Tuple[join_def.JoinColumnMapping, ...],
509509
) -> typing.Optional[ArrayValue]:
510-
left_side = bigframes.core.rewrite.SquashedSelect.from_node(self.node)
511-
right_side = bigframes.core.rewrite.SquashedSelect.from_node(other.node)
512-
result = left_side.maybe_merge(right_side, join_type, mappings)
510+
result = bigframes.core.rewrite.join_as_projection(
511+
self.node, other.node, mappings, join_type
512+
)
513513
if result is not None:
514-
return ArrayValue(result.expand())
514+
return ArrayValue(result)
515515
return None
516516

517517
def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
@@ -530,7 +530,3 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
530530
The row numbers of result is non-deterministic, avoid to use.
531531
"""
532532
return ArrayValue(nodes.RandomSampleNode(self.node, fraction))
533-
534-
def merge_projections(self) -> ArrayValue:
535-
new_node = bigframes.core.rewrite.maybe_squash_projection(self.node)
536-
return ArrayValue(new_node)

bigframes/core/blocks.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import bigframes.constants
4141
import bigframes.constants as constants
4242
import bigframes.core as core
43+
import bigframes.core.compile.googlesql as googlesql
4344
import bigframes.core.expression as ex
4445
import bigframes.core.expression as scalars
4546
import bigframes.core.guid as guid
@@ -2417,7 +2418,9 @@ def _get_rows_as_json_values(self) -> Block:
24172418
select_columns = (
24182419
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
24192420
)
2420-
select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns])
2421+
select_columns_csv = sql.csv(
2422+
[googlesql.identifier(col) for col in select_columns]
2423+
)
24212424
json_sql = f"""\
24222425
With T0 AS (
24232426
{textwrap.indent(expr_sql, " ")}
@@ -2430,7 +2433,7 @@ def _get_rows_as_json_values(self) -> Block:
24302433
"values", [{column_references_csv}],
24312434
"indexlength", {index_columns_count},
24322435
"dtype", {pandas_row_dtype}
2433-
) AS {sql.identifier(row_json_column_name)} FROM T0
2436+
) AS {googlesql.identifier(row_json_column_name)} FROM T0
24342437
)
24352438
SELECT {select_columns_csv} FROM T1
24362439
"""

bigframes/core/compile/compiled.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import pandas
2929

3030
import bigframes.core.compile.aggregate_compiler as agg_compiler
31+
import bigframes.core.compile.googlesql
3132
import bigframes.core.compile.ibis_types
3233
import bigframes.core.compile.scalar_op_compiler as op_compilers
3334
import bigframes.core.expression as ex
@@ -905,7 +906,12 @@ def to_sql(
905906
output_columns = [
906907
col_id_overrides.get(col, col) for col in baked_ir.column_ids
907908
]
908-
sql = bigframes.core.sql.select_from_subquery(output_columns, sql)
909+
sql = (
910+
bigframes.core.compile.googlesql.Select()
911+
.from_(sql)
912+
.select(output_columns)
913+
.sql()
914+
)
909915

910916
# Single row frames may not have any ordering columns
911917
if len(baked_ir._ordering.all_ordering_columns) > 0:

bigframes/core/compile/googlesql/__init__.py

+7
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,17 @@
1717

1818
from __future__ import annotations
1919

20+
from bigframes.core.compile.googlesql.datatype import DataType
2021
from bigframes.core.compile.googlesql.expression import (
2122
_escape_chars,
2223
AliasExpression,
2324
ColumnExpression,
2425
CTEExpression,
26+
identifier,
2527
StarExpression,
2628
TableExpression,
2729
)
30+
from bigframes.core.compile.googlesql.function import Cast
2831
from bigframes.core.compile.googlesql.query import (
2932
AsAlias,
3033
FromClause,
@@ -38,10 +41,13 @@
3841

3942
__all__ = [
4043
"_escape_chars",
44+
"identifier",
4145
"AliasExpression",
4246
"AsAlias",
47+
"Cast",
4348
"ColumnExpression",
4449
"CTEExpression",
50+
"DataType",
4551
"FromClause",
4652
"FromItem",
4753
"NonRecursiveCTE",
@@ -50,5 +56,6 @@
5056
"SelectAll",
5157
"SelectExpression",
5258
"StarExpression",
59+
"StringType",
5360
"TableExpression",
5461
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import enum
16+
17+
"""This module represents all GoogleSQL for BigQuery data types:
18+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types"""
19+
20+
21+
class DataType(enum.Enum):
22+
STRING = 1
23+
FLOAT64 = 2

bigframes/core/compile/googlesql/expression.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ class ColumnExpression(Expression):
4545

4646
def sql(self) -> str:
4747
if self.parent is not None:
48-
return f"{self.parent.sql()}.`{self.name}`"
49-
return f"`{self.name}`"
48+
return f"{self.parent.sql()}.{identifier(self.name)}"
49+
return identifier(self.name)
5050

5151

5252
@dataclasses.dataclass
@@ -72,10 +72,10 @@ def __post_init__(self):
7272
def sql(self) -> str:
7373
text = []
7474
if self.project_id is not None:
75-
text.append(f"`{_escape_chars(self.project_id)}`")
75+
text.append(identifier(self.project_id))
7676
if self.dataset_id is not None:
77-
text.append(f"`{_escape_chars(self.dataset_id)}`")
78-
text.append(f"`{_escape_chars(self.table_id)}`")
77+
text.append(identifier(self.dataset_id))
78+
text.append(identifier(self.table_id))
7979
return ".".join(text)
8080

8181

@@ -84,15 +84,22 @@ class AliasExpression(Expression):
8484
alias: str
8585

8686
def sql(self) -> str:
87-
return f"`{_escape_chars(self.alias)}`"
87+
return identifier(self.alias)
8888

8989

9090
@dataclasses.dataclass
9191
class CTEExpression(Expression):
9292
name: str
9393

9494
def sql(self) -> str:
95-
return f"`{_escape_chars(self.name)}`"
95+
return identifier(self.name)
96+
97+
98+
def identifier(id: str) -> str:
99+
"""Return a string representing column reference in a SQL."""
100+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
101+
# Just always escape, otherwise need to check against every reserved sql keyword
102+
return f"`{_escape_chars(id)}`"
96103

97104

98105
def _escape_chars(value: str):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import dataclasses
16+
17+
import bigframes.core.compile.googlesql.datatype as datatype
18+
import bigframes.core.compile.googlesql.expression as expr
19+
20+
# Conversion functions:
21+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions
22+
23+
24+
@dataclasses.dataclass
25+
class Cast(expr.Expression):
26+
"""This class represents the `cast` function."""
27+
28+
expression: expr.ColumnExpression
29+
type: datatype.DataType
30+
31+
def sql(self) -> str:
32+
return f"CAST ({self.expression.sql()} AS {self.type.name})"

bigframes/core/compile/googlesql/query.py

+61-17
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,19 @@
1616

1717
import dataclasses
1818
import typing
19-
from typing import TYPE_CHECKING
19+
20+
import google.cloud.bigquery as bigquery
2021

2122
import bigframes.core.compile.googlesql.abc as abc
2223
import bigframes.core.compile.googlesql.expression as expr
2324

24-
if TYPE_CHECKING:
25-
import google.cloud.bigquery as bigquery
26-
2725
"""This module provides a structured representation of GoogleSQL syntax using nodes.
2826
Each node's name and child nodes are designed to strictly follow the official GoogleSQL
2927
syntax rules outlined in the documentation:
3028
https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax"""
3129

30+
TABLE_SOURCE_TYPE = typing.Union[str, bigquery.TableReference]
31+
3232

3333
@dataclasses.dataclass
3434
class QueryExpr(abc.SQLSyntax):
@@ -53,11 +53,47 @@ def sql(self) -> str:
5353
class Select(abc.SQLSyntax):
5454
"""This class represents GoogleSQL `select` syntax."""
5555

56-
select_list: typing.Sequence[typing.Union[SelectExpression, SelectAll]]
57-
from_clause_list: typing.Sequence[FromClause] = ()
56+
select_list: typing.Sequence[
57+
typing.Union[SelectExpression, SelectAll]
58+
] = dataclasses.field(default_factory=list)
59+
from_clause_list: typing.Sequence[FromClause] = dataclasses.field(
60+
default_factory=list
61+
)
5862
distinct: bool = False
5963

64+
def select(
65+
self,
66+
columns: typing.Union[typing.Iterable[str], str, None] = None,
67+
distinct: bool = False,
68+
) -> Select:
69+
if isinstance(columns, str):
70+
columns = [columns]
71+
self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = (
72+
[
73+
SelectExpression(expression=expr.ColumnExpression(name=column))
74+
for column in columns
75+
]
76+
if columns
77+
else [SelectAll(expression=expr.StarExpression())]
78+
)
79+
self.distinct = distinct
80+
return self
81+
82+
def from_(
83+
self,
84+
sources: typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]],
85+
) -> Select:
86+
if (not isinstance(sources, typing.Iterable)) or isinstance(sources, str):
87+
sources = [sources]
88+
self.from_clause_list = [
89+
FromClause(FromItem.from_source(source)) for source in sources
90+
]
91+
return self
92+
6093
def sql(self) -> str:
94+
if (self.select_list is not None) and (not self.select_list):
95+
raise ValueError("Select clause has not been properly initialized.")
96+
6197
text = ["SELECT"]
6298

6399
if self.distinct:
@@ -66,7 +102,7 @@ def sql(self) -> str:
66102
select_list_sql = ",\n".join([select.sql() for select in self.select_list])
67103
text.append(select_list_sql)
68104

69-
if self.from_clause_list is not None:
105+
if self.from_clause_list:
70106
from_clauses_sql = ",\n".join(
71107
[clause.sql() for clause in self.from_clause_list]
72108
)
@@ -118,19 +154,27 @@ class FromItem(abc.SQLSyntax):
118154
as_alias: typing.Optional[AsAlias] = None
119155

120156
@classmethod
121-
def from_table_ref(
157+
def from_source(
122158
cls,
123-
table_ref: bigquery.TableReference,
159+
subquery_or_tableref: typing.Union[bigquery.TableReference, str],
124160
as_alias: typing.Optional[AsAlias] = None,
125161
):
126-
return cls(
127-
expression=expr.TableExpression(
128-
table_id=table_ref.table_id,
129-
dataset_id=table_ref.dataset_id,
130-
project_id=table_ref.project,
131-
),
132-
as_alias=as_alias,
133-
)
162+
if isinstance(subquery_or_tableref, bigquery.TableReference):
163+
return cls(
164+
expression=expr.TableExpression(
165+
table_id=subquery_or_tableref.table_id,
166+
dataset_id=subquery_or_tableref.dataset_id,
167+
project_id=subquery_or_tableref.project,
168+
),
169+
as_alias=as_alias,
170+
)
171+
elif isinstance(subquery_or_tableref, str):
172+
return cls(
173+
expression=subquery_or_tableref,
174+
as_alias=as_alias,
175+
)
176+
else:
177+
raise ValueError("The source must be bigquery.TableReference or str.")
134178

135179
def sql(self) -> str:
136180
if isinstance(self.expression, (expr.TableExpression, expr.CTEExpression)):

0 commit comments

Comments
 (0)