Skip to content

feat: Add filters and columns arguments to read_gbq for enhanced data querying #198

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7a006b0
feat: Add filters argument to read_gbq for enhanced data querying
Genesis929 Nov 13, 2023
37794a3
feat: Add filters argument to read_gbq for enhanced data querying
Genesis929 Nov 13, 2023
499bdcd
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 13, 2023
300263e
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
fdc539d
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 14, 2023
6ed4194
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
ad6d37f
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
3473780
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
276bfd0
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
8a4e940
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
b29e9b7
Merge branch 'main' into b299514019-read-gbq-filter
Genesis929 Nov 14, 2023
c00a05e
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
dd94369
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
54ca688
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
95e318b
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 20, 2023
ced491f
feat: Add filters and columns arguments to read_gbq for enhanced data…
gcf-owl-bot[bot] Nov 13, 2023
0f2840d
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 20, 2023
771d093
Merge branch 'main' into b299514019-read-gbq-filter
Genesis929 Dec 12, 2023
82f74fd
update docstring
Genesis929 Dec 12, 2023
1c038b5
Merge branch 'main' into b299514019-read-gbq-filter
tswast Dec 13, 2023
354fd8e
remove columns input
Genesis929 Dec 13, 2023
434c559
make filter_to_query run only when there are filters
Genesis929 Dec 13, 2023
c17b815
remove named input
Genesis929 Dec 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 0 additions & 58 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,64 +309,6 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str):
assert df["statement_type"][0] == "SCRIPT"


@pytest.mark.parametrize(
    ("query_or_table", "filters", "validator"),
    [
        # A flat list of predicates is a single AND group applied to a query.
        pytest.param(
            """SELECT
rowindex,
string_col,
FROM `{scalars_table_id}` AS t
""",
            [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")],
            lambda row: row["rowindex"] < 4 and row["string_col"] == "Hello, World!",
            id="query_input",
        ),
        # The same filter syntax applied directly to a table id.
        pytest.param(
            "{scalars_table_id}",
            [("date_col", ">", "2022-10-20")],
            lambda row: pd.to_datetime(row["date_col"]) > pd.to_datetime("2022-10-20"),
            id="table_input",
        ),
        # Each inner tuple is a one-predicate AND group; the outer list ORs the
        # groups. The trailing commas matter: without them the extra
        # parentheses are no-ops and the predicates collapse into one AND
        # group, so the OR path would never be exercised.
        pytest.param(
            "{scalars_table_id}",
            [
                (("rowindex", "not in", [0, 6]),),
                (("string_col", "in", ["Hello, World!", "こんにちは"]),),
            ],
            lambda row: row["rowindex"] not in [0, 6]
            or row["string_col"] in ["Hello, World!", "こんにちは"],
            id="or_operation",
        ),
        # A bare list of strings is not a valid filter; a ValueError is expected.
        pytest.param(
            "{scalars_table_id}",
            ["date_col", ">", "2022-10-20"],
            None,
            marks=pytest.mark.xfail(
                raises=ValueError,
            ),
            id="raise_error",
        ),
    ],
)
def test_read_gbq_with_filters(
    session, scalars_table_id: str, query_or_table, filters, validator
):
    """Check that every row returned by ``read_gbq(..., filters=...)`` matches.

    Args:
        session: Session fixture whose ``read_gbq`` is under test.
        scalars_table_id: Table id substituted into the ``query_or_table``
            template.
        query_or_table: SQL text or table id template containing a
            ``{scalars_table_id}`` placeholder.
        filters: Filter specification forwarded to ``read_gbq``.
        validator: Row predicate that must hold for every returned row
            (``None`` for the case that is expected to raise).
    """
    df = session.read_gbq(
        query_or_table.format(scalars_table_id=scalars_table_id),
        filters=filters,
    )

    # Assert row by row so a failure points at the first offending row.
    for _, row in df.iterrows():
        assert validator(row)


def test_read_gbq_with_columns_filter(session, scalars_table_id: str):
    """Reading with ``columns`` yields exactly those columns, in the given order."""
    requested_columns = ["int64_too", "string_col", "date_col"]
    result = session.read_gbq(scalars_table_id, columns=requested_columns)
    assert list(result.columns) == requested_columns


def test_read_gbq_model(session, penguins_linear_model_name):
model = session.read_gbq_model(penguins_linear_model_name)
assert isinstance(model, bigframes.ml.linear_model.LinearRegression)
Expand Down
57 changes: 57 additions & 0 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,60 @@ def test_session_init_fails_with_no_project():
credentials=mock.Mock(spec=google.auth.credentials.Credentials)
)
)


@pytest.mark.parametrize(
    ("query_or_table", "columns", "filters", "expected_output"),
    [
        # SQL input: the query is wrapped as a sub-select and the flat list of
        # predicates is joined with AND.
        pytest.param(
            """SELECT
rowindex,
string_col,
FROM `test_table` AS t
""",
            [],
            [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")],
            """SELECT * FROM (SELECT
rowindex,
string_col,
FROM `test_table` AS t
) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""",
            id="query_input",
        ),
        # Table input with a single predicate and no column selection.
        pytest.param(
            "test_table",
            [],
            [("date_col", ">", "2022-10-20")],
            "SELECT * FROM test_table AS sub WHERE `date_col` > '2022-10-20'",
            id="table_input",
        ),
        # Two one-predicate groups are joined with OR; explicit columns
        # replace the ``*`` projection.
        pytest.param(
            "test_table",
            ["row_index", "string_col"],
            [
                (("rowindex", "not in", [0, 6]),),
                (("string_col", "in", ["Hello, World!", "こんにちは"]),),
            ],
            (
                "SELECT `row_index`, `string_col` FROM test_table AS sub WHERE "
                "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', "
                "'こんにちは')"
            ),
            id="or_operation",
        ),
        # A bare list of strings is not a valid filter; a ValueError is expected.
        pytest.param(
            "test_table",
            [],
            ["date_col", ">", "2022-10-20"],
            None,
            marks=pytest.mark.xfail(
                raises=ValueError,
            ),
            id="raise_error",
        ),
    ],
)
def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output):
    """Compare the SQL built by ``Session._filters_to_query`` with the expected text."""
    bq_session = resources.create_bigquery_session()
    generated_sql = bq_session._filters_to_query(query_or_table, columns, filters)
    assert generated_sql == expected_output
32 changes: 10 additions & 22 deletions third_party/bigframes_vendored/pandas/io/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,8 @@

from bigframes import constants

# A single predicate: (column name, comparison operator, comparison value).
FilterType = Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any]
# A collection of predicates. Each entry is either one predicate or a group of
# predicates; per the read_gbq docstring, predicates inside a group are ANDed
# and the groups are ORed.
# NOTE: keep this a bare subscripted type. Wrapping it in parentheses with a
# trailing comma would bind a 1-tuple instead of a type.
FiltersType = Iterable[Union[FilterType, Iterable[FilterType]]]


class GBQIOMixin:
Expand Down Expand Up @@ -100,15 +88,15 @@ def read_gbq(
max_results (Optional[int], default None):
If set, limit the maximum number of rows to fetch from the
query results.
columns(Iterable[str], default ()): If not empty, only these columns
columns (Iterable[str], default ()): If not empty, only these columns
will be read from table.
filters (List[Tuple], default ()): To filter out data. Filter syntax:
[[(column, op, val), …],…] where op is [==, >, >=, <, <=, !=, in,
not in] The innermost tuples are transposed into a set of filters
applied through an AND operation. The outer list combines these
sets of filters through an OR operation. A single list of tuples
can also be used, meaning that no OR operation between set of
filters is to be conducted.
filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To filter out data.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this would be Iterable[Union[Tuple, Iterable[Tuple]]]

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be great to add a code sample (in the EXAMPLES section)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sample added.

Filter syntax: [[(column, op, val), …],…] where op is [==, >, >=,
<, <=, !=, in, not in]. The innermost tuples are transposed into a
set of filters applied through an AND operation. The outer list
combines these sets of filters through an OR operation. A single
list of tuples can also be used, meaning that no OR operation
between sets of filters is to be conducted.

Returns:
bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.
Expand Down