feat: add DataFrame.select_dtypes method (#242)

TrevorBergeron · web-flow · commit 1737acc51b4f · 2023-12-01T04:22:16.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -434,6 +434,19 @@ def info(
             # TODO: Convert to different units (kb, mb, etc.)
             obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        # Create empty pandas dataframe with same schema and then leverage actual pandas implementation
+        as_pandas = pandas.DataFrame(
+            {
+                col_id: pandas.Series([], dtype=dtype)
+                for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            }
+        )
+        selected_columns = tuple(
+            as_pandas.select_dtypes(include=include, exclude=exclude).columns
+        )
+        return DataFrame(self._block.select_columns(selected_columns))
+
     def _set_internal_query_job(self, query_job: bigquery.QueryJob):
         self._query_job = query_job
 
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -297,6 +297,26 @@ def test_df_info(scalars_dfs):
     assert expected == bf_result.getvalue()
 
 
+@pytest.mark.parametrize(
+    ("include", "exclude"),
+    [
+        ("Int64", None),
+        (["int"], None),
+        ("number", None),
+        ([pd.Int64Dtype(), pd.BooleanDtype()], None),
+        (None, [pd.Int64Dtype(), pd.BooleanDtype()]),
+        ("Int64", ["boolean"]),
+    ],
+)
+def test_select_dtypes(scalars_dfs, include, exclude):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude)
+    bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
 def test_drop_index(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        """
+        Return a subset of the DataFrame's columns based on the column dtypes.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]})
+            >>> df.select_dtypes(include=['Int64'])
+               col1
+            0     1
+            1     2
+            <BLANKLINE>
+            [2 rows x 1 columns]
+
+            >>> df.select_dtypes(exclude=['Int64'])
+                col2   col3
+            0  hello   True
+            1  world  False
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+
+        Args:
+            include (scalar or list-like):
+                A selection of dtypes or strings to be included.
+            exclude (scalar or list-like):
+                A selection of dtypes or strings to be excluded.
+
+        Returns:
+            DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
     def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: