Skip to content

Commit 1737acc

Browse files
feat: add DataFrame.select_dtypes method (#242)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent 66d1839 commit 1737acc

File tree

3 files changed

+69
-0
lines changed

3 files changed

+69
-0
lines changed

bigframes/dataframe.py

+13
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,19 @@ def info(
434434
# TODO: Convert to different units (kb, mb, etc.)
435435
obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
436436

437+
def select_dtypes(self, include=None, exclude=None) -> DataFrame:
    # Dtype matching is delegated to pandas: build a zero-row pandas frame
    # whose columns carry this frame's column ids and dtypes, ask pandas
    # which of those columns survive the include/exclude filter, and then
    # select exactly those columns from the underlying block.
    empty_columns = {}
    for column_id, column_dtype in zip(
        self._block.value_columns, self._block.dtypes
    ):
        empty_columns[column_id] = pandas.Series([], dtype=column_dtype)
    schema_frame = pandas.DataFrame(empty_columns)

    matched = schema_frame.select_dtypes(include=include, exclude=exclude)
    kept_column_ids = tuple(matched.columns)
    return DataFrame(self._block.select_columns(kept_column_ids))
449+
437450
def _set_internal_query_job(self, query_job: bigquery.QueryJob):
438451
self._query_job = query_job
439452

tests/system/small/test_dataframe.py

+20
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,26 @@ def test_df_info(scalars_dfs):
297297
assert expected == bf_result.getvalue()
298298

299299

300+
@pytest.mark.parametrize(
    ("include", "exclude"),
    [
        ("Int64", None),
        (["int"], None),
        ("number", None),
        ([pd.Int64Dtype(), pd.BooleanDtype()], None),
        (None, [pd.Int64Dtype(), pd.BooleanDtype()]),
        ("Int64", ["boolean"]),
    ],
)
def test_select_dtypes(scalars_dfs, include, exclude):
    """Check select_dtypes against pandas for each include/exclude pair."""
    bf_df, pd_df = scalars_dfs

    # Run the same selection on the pandas frame and on the frame under
    # test, materializing the latter with to_pandas() for comparison.
    expected = pd_df.select_dtypes(include=include, exclude=exclude)
    actual = bf_df.select_dtypes(include=include, exclude=exclude).to_pandas()

    pd.testing.assert_frame_equal(expected, actual)
318+
319+
300320
def test_drop_index(scalars_dfs):
301321
scalars_df, scalars_pandas_df = scalars_dfs
302322

third_party/bigframes_vendored/pandas/core/frame.py

+36
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True):
158158
"""
159159
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
160160

161+
def select_dtypes(self, include=None, exclude=None) -> DataFrame:
    """
    Return a subset of the DataFrame's columns based on the column dtypes.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]})
        >>> df.select_dtypes(include=['Int64'])
           col1
        0     1
        1     2
        <BLANKLINE>
        [2 rows x 1 columns]

        >>> df.select_dtypes(exclude=['Int64'])
            col2   col3
        0  hello   True
        1  world  False
        <BLANKLINE>
        [2 rows x 2 columns]


    Args:
        include (scalar or list-like, default None):
            A selection of dtypes or strings to be included.
        exclude (scalar or list-like, default None):
            A selection of dtypes or strings to be excluded.

    Returns:
        DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``.
    """
    # Interface stub: this definition only documents the method and always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
196+
161197
# ----------------------------------------------------------------------
162198
# IO methods (to / from other formats)
163199
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:

0 commit comments

Comments
 (0)