Skip to content

Commit a6e32aa

Browse files
feat: Add more index methods (#54)
1 parent 3502f83 commit a6e32aa

File tree

6 files changed

+373
-23
lines changed

6 files changed

+373
-23
lines changed

bigframes/core/block_transforms.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -218,13 +218,17 @@ def rank(
218218
return block.select_columns(rownum_col_ids).with_column_labels(labels)
219219

220220

221-
def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
221+
def dropna(
222+
block: blocks.Block,
223+
column_ids: typing.Sequence[str],
224+
how: typing.Literal["all", "any"] = "any",
225+
):
222226
"""
223227
Drop na entries from block
224228
"""
225229
if how == "any":
226230
filtered_block = block
227-
for column in block.value_columns:
231+
for column in column_ids:
228232
filtered_block, result_id = filtered_block.apply_unary_op(
229233
column, ops.notnull_op
230234
)
@@ -234,7 +238,7 @@ def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
234238
else: # "all"
235239
filtered_block = block
236240
predicate = None
237-
for column in block.value_columns:
241+
for column in column_ids:
238242
filtered_block, partial_predicate = filtered_block.apply_unary_op(
239243
column, ops.notnull_op
240244
)

bigframes/core/indexes/index.py

+103-15
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@
2424

2525
import bigframes.constants as constants
2626
import bigframes.core as core
27+
import bigframes.core.block_transforms as block_ops
2728
import bigframes.core.blocks as blocks
2829
import bigframes.core.joins as joins
30+
import bigframes.core.ordering as order
2931
import bigframes.core.utils as utils
3032
import bigframes.dtypes
3133
import bigframes.dtypes as bf_dtypes
@@ -149,6 +151,27 @@ def has_duplicates(self) -> bool:
149151
def _block(self) -> blocks.Block:
150152
return self._data._get_block()
151153

154+
@property
def T(self) -> Index:
    """Return the transposed index by delegating to ``transpose()``."""
    transposed = self.transpose()
    return transposed
157+
158+
def transpose(self) -> Index:
    """Return this index unchanged.

    The same object is handed back; no copy is made.
    """
    unchanged = self
    return unchanged
160+
161+
def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
    """Return a new index ordered by its own values.

    Every column listed in the block's ``index_columns`` becomes a sort key,
    in that order, and all keys share one direction and one null placement.

    Args:
        ascending: Sort ascending when truthy, descending otherwise.
        na_position: Either ``"first"`` or ``"last"``; selects where nulls go.

    Returns:
        An ``Index`` built from the reordered block.

    Raises:
        ValueError: If ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    if na_position not in ["first", "last"]:
        raise ValueError("Param na_position must be one of 'first' or 'last'")

    if ascending:
        sort_direction = order.OrderingDirection.ASC
    else:
        sort_direction = order.OrderingDirection.DESC
    nulls_last = na_position == "last"

    sort_keys = []
    for level_id in self._block.index_columns:
        sort_keys.append(
            order.OrderingColumnReference(
                level_id, direction=sort_direction, na_last=nulls_last
            )
        )
    return Index._from_block(self._block.order_by(sort_keys))
174+
152175
def astype(
153176
self,
154177
dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
@@ -176,6 +199,57 @@ def max(self) -> typing.Any:
176199
def min(self) -> typing.Any:
177200
return self._apply_aggregation(agg_ops.min_op)
178201

202+
def argmax(self) -> int:
    """Return the offset of the row that sorts first in descending index order.

    The block is extended with the column produced by ``promote_offsets()``,
    ordered by every index column descending and then by that offsets column,
    and the first offset of the ordered result is returned.
    """
    # Function-scope import, as in the rest of this module.
    import bigframes.series as series

    with_offsets, offsets_id = self._block.promote_offsets()
    sort_keys = [
        order.OrderingColumnReference(
            level_id, direction=order.OrderingDirection.DESC
        )
        for level_id in self._block.index_columns
    ]
    # The offsets column is the last sort key, so it only breaks ties.
    sort_keys.append(order.OrderingColumnReference(offsets_id))
    ordered = with_offsets.order_by(sort_keys)

    first_offset = series.Series(ordered.select_column(offsets_id)).iloc[0]
    return typing.cast(int, first_offset)
218+
219+
def argmin(self) -> int:
    """Return the offset of the row that sorts first in default index order.

    The block is extended with the column produced by ``promote_offsets()``,
    ordered by every index column (default ordering direction) and then by
    that offsets column, and the first offset of the ordered result is
    returned.
    """
    # Function-scope import, as in the rest of this module.
    import bigframes.series as series

    with_offsets, offsets_id = self._block.promote_offsets()
    sort_keys = [
        order.OrderingColumnReference(level_id)
        for level_id in self._block.index_columns
    ]
    # The offsets column is the last sort key, so it only breaks ties.
    sort_keys.append(order.OrderingColumnReference(offsets_id))
    ordered = with_offsets.order_by(sort_keys)

    first_offset = series.Series(ordered.select_column(offsets_id)).iloc[0]
    return typing.cast(int, first_offset)
233+
234+
def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    *,
    dropna: bool = True,
):
    """Count occurrences of each distinct index value.

    Counting is delegated to ``block_ops.value_counts`` over all index
    columns of the underlying block.

    Args:
        normalize: Forwarded to ``block_ops.value_counts``.
        sort: Forwarded to ``block_ops.value_counts``. Previously this
            argument was accepted but never passed on, so ``sort=False``
            had no effect.
        ascending: Forwarded to ``block_ops.value_counts``.
        dropna: Forwarded to ``block_ops.value_counts``.

    Returns:
        A ``bigframes.series.Series`` wrapping the resulting block.
    """
    # NOTE(review): assumes block_ops.value_counts accepts a ``sort`` keyword
    # (its signature is outside this view) — confirm before merging.
    block = block_ops.value_counts(
        self._block,
        self._block.index_columns,
        normalize=normalize,
        sort=sort,
        ascending=ascending,
        dropna=dropna,
    )
    # Function-scope import, as in the rest of this module.
    import bigframes.series as series

    return series.Series(block)
252+
179253
def fillna(self, value=None) -> Index:
180254
if self.nlevels > 1:
181255
raise TypeError("Multiindex does not support 'fillna'")
@@ -185,10 +259,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index:
185259
names = [name] if isinstance(name, str) else list(name)
186260
if len(names) != self.nlevels:
187261
raise ValueError("'name' must be same length as levels")
188-
189-
import bigframes.dataframe as df
190-
191-
return Index(df.DataFrame(self._block.with_index_labels(names)))
262+
return Index._from_block(self._block.with_index_labels(names))
192263

193264
def drop(
194265
self,
@@ -210,9 +281,28 @@ def drop(
210281
)
211282
block = block.filter(condition_id, keep_null=True)
212283
block = block.drop_columns([condition_id])
213-
import bigframes.dataframe as df
284+
return Index._from_block(block)
285+
286+
def dropna(self, how: str = "any") -> Index:
    """Return a new index with null entries removed.

    Args:
        how: ``"any"`` or ``"all"``; forwarded to ``block_ops.dropna``, which
            is applied to the index columns of the underlying block.

    Raises:
        ValueError: If ``how`` is not ``"any"`` or ``"all"``.
    """
    accepted_modes = ("any", "all")
    if how not in accepted_modes:
        raise ValueError("'how' must be one of 'any', 'all'")
    filtered = block_ops.dropna(
        self._block,
        self._block.index_columns,
        how=how,  # type: ignore
    )
    return Index._from_block(filtered)
291+
292+
def drop_duplicates(self, *, keep: str = "first") -> Index:
    """Return a new index with duplicate entries removed.

    Args:
        keep: Passed through unchanged to ``block_ops.drop_duplicates``,
            which is applied to the index columns of the underlying block.
    """
    deduplicated = block_ops.drop_duplicates(
        self._block,
        self._block.index_columns,
        keep,
    )
    return Index._from_block(deduplicated)
295+
296+
def isin(self, values) -> Index:
# Applies ops.IsInOp(values, match_nulls=True) to the index via
# _apply_unary_op and replaces nulls in the result with False.
# Raises TypeError when utils.is_list_like(values) is false.
297+
if not utils.is_list_like(values):
298+
raise TypeError(
299+
"only list-like objects are allowed to be passed to "
300+
f"isin(), you passed a [{type(values).__name__}]"
301+
)
214302

215-
return Index(df.DataFrame(block.select_columns([])))
303+
# fillna(value=False) is called on the Index returned by _apply_unary_op.
return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna(
304+
value=False
305+
)
216306

217307
def _apply_unary_op(
218308
self,
@@ -226,9 +316,7 @@ def _apply_unary_op(
226316
result_ids.append(result_id)
227317

228318
block = block.set_index(result_ids, index_labels=self._block.index_labels)
229-
import bigframes.dataframe as df
230-
231-
return Index(df.DataFrame(block))
319+
return Index._from_block(block)
232320

233321
def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:
234322
if self.nlevels > 1:
@@ -262,6 +350,12 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:
262350
def __len__(self):
263351
return self.shape[0]
264352

353+
@classmethod
def _from_block(cls, block: blocks.Block) -> Index:
    """Build an ``Index`` from a block by wrapping it in a ``DataFrame``.

    The result is always constructed as ``Index`` itself, not as ``cls``.
    """
    # Function-scope import, as in the rest of this module.
    import bigframes.dataframe as df

    frame = df.DataFrame(block)
    return Index(frame)
358+
265359

266360
class IndexValue:
267361
"""An immutable index."""
@@ -356,12 +450,6 @@ def resolve_level_name(self: IndexValue, label: blocks.Label) -> str:
356450
def is_uniquely_named(self: IndexValue):
357451
return len(set(self.names)) == len(self.names)
358452

359-
def _set_block(self, block: blocks.Block):
360-
self._block = block
361-
362-
def _get_block(self) -> blocks.Block:
363-
return self._block
364-
365453

366454
def join_mono_indexed(
367455
left: IndexValue,

bigframes/dataframe.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1440,7 +1440,7 @@ def dropna(
14401440
axis_n = utils.get_axis_number(axis)
14411441

14421442
if axis_n == 0:
1443-
result = block_ops.dropna(self._block, how=how) # type: ignore
1443+
result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore
14441444
if ignore_index:
14451445
result = result.reset_index()
14461446
return DataFrame(result)
@@ -1674,7 +1674,10 @@ def pivot(
16741674
def stack(self):
16751675
# TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack.
16761676
# TODO: match impl to pandas future_stack as described in pandas 2.1 release notes
1677-
result_block = block_ops.dropna(self._block.stack(), how="all")
1677+
stack_block = self._block.stack()
1678+
result_block = block_ops.dropna(
1679+
stack_block, stack_block.value_columns, how="all"
1680+
)
16781681
if not isinstance(self.columns, pandas.MultiIndex):
16791682
return bigframes.series.Series(result_block)
16801683
return DataFrame(result_block)

bigframes/series.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ def dropna(
459459
) -> Series:
460460
if inplace:
461461
raise NotImplementedError("'inplace'=True not supported")
462-
result = block_ops.dropna(self._block, how="any")
462+
result = block_ops.dropna(self._block, [self._value_column], how="any")
463463
if ignore_index:
464464
result = result.reset_index()
465465
return Series(result)
@@ -856,7 +856,7 @@ def clip(self, lower, upper):
856856
)
857857
return Series(block.select_column(result_id).with_column_labels([self.name]))
858858

859-
def argmax(self) -> scalars.Scalar:
859+
def argmax(self) -> int:
860860
block, row_nums = self._block.promote_offsets()
861861
block = block.order_by(
862862
[
@@ -870,7 +870,7 @@ def argmax(self) -> scalars.Scalar:
870870
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
871871
)
872872

873-
def argmin(self) -> scalars.Scalar:
873+
def argmin(self) -> int:
874874
block, row_nums = self._block.promote_offsets()
875875
block = block.order_by(
876876
[

tests/system/small/test_index.py

+120
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import numpy
1616
import pandas as pd
17+
import pytest
1718

1819
from tests.system.utils import assert_pandas_index_equal_ignore_index_type
1920

@@ -174,3 +175,122 @@ def test_is_monotonic_decreasing(scalars_df_index, scalars_pandas_df_index):
174175
scalars_df_index.index.is_monotonic_increasing
175176
== scalars_pandas_df_index.index.is_monotonic_increasing
176177
)
178+
179+
180+
def test_index_argmin(scalars_df_index, scalars_pandas_df_index):
    """argmin on a two-level index matches pandas (skipped on pandas 1.x)."""
    if pd.__version__.startswith("1."):
        pytest.skip("doesn't work in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index

    bf_result = bf_index.argmin()
    pd_result = pd_index.argmin()
    assert bf_result == pd_result
188+
189+
190+
def test_index_argmax(scalars_df_index, scalars_pandas_df_index):
    """argmax on a two-level index matches pandas (skipped on pandas 1.x)."""
    if pd.__version__.startswith("1."):
        pytest.skip("doesn't work in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index

    bf_result = bf_index.argmax()
    pd_result = pd_index.argmax()
    assert bf_result == pd_result
198+
199+
200+
@pytest.mark.parametrize(
    "ascending, na_position",
    [
        (True, "first"),
        (True, "last"),
        (False, "first"),
        (False, "last"),
    ],
)
def test_index_sort_values(
    scalars_df_index, scalars_pandas_df_index, ascending, na_position
):
    """sort_values on a two-level index matches pandas for each option pair."""
    # Test needs values to be unique
    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    bf_sorted = bf_index.sort_values(ascending=ascending, na_position=na_position)
    bf_result = bf_sorted.to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    pd_result = pd_index.sort_values(ascending=ascending, na_position=na_position)

    pd.testing.assert_index_equal(bf_result, pd_result)
226+
227+
228+
def test_index_value_counts(scalars_df_index, scalars_pandas_df_index):
    """value_counts on a two-level index matches pandas (skipped on 1.x)."""
    if pd.__version__.startswith("1."):
        pytest.skip("value_counts results different in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    bf_result = bf_index.value_counts().to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    pd_result = pd_index.value_counts()

    pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
241+
242+
243+
@pytest.mark.parametrize("how", ["any", "all"])
def test_index_dropna(scalars_df_index, scalars_pandas_df_index, how):
    """dropna on a two-level index matches pandas for each ``how`` mode."""
    bf_index = scalars_df_index.set_index(["int64_col", "float64_col"]).index
    bf_result = bf_index.dropna(how=how).to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_col", "float64_col"]).index
    pd_result = pd_index.dropna(how=how)

    pd.testing.assert_index_equal(pd_result, bf_result)
260+
261+
262+
@pytest.mark.parametrize("keep", ["first", "last", False])
def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep):
    """drop_duplicates on a single-level index matches pandas for each ``keep``."""
    bf_index = scalars_df_index.set_index("int64_col").index
    bf_deduped = bf_index.drop_duplicates(keep=keep).to_pandas()

    pd_index = scalars_pandas_df_index.set_index("int64_col").index
    pd_deduped = pd_index.drop_duplicates(keep=keep)

    pd.testing.assert_index_equal(pd_deduped, bf_deduped)
283+
284+
285+
def test_index_isin(scalars_df_index, scalars_pandas_df_index):
    """isin on a single-level index matches pandas, ignoring index names."""
    bf_index = scalars_df_index.set_index("int64_col").index
    bf_membership = bf_index.isin([2, 55555, 4]).to_pandas()

    pd_index = scalars_pandas_df_index.set_index("int64_col").index
    pd_membership = pd_index.isin([2, 55555, 4])

    # The pandas result is wrapped in pd.Index before comparing.
    expected = pd.Index(pd_membership)
    pd.testing.assert_index_equal(expected, bf_membership, check_names=False)

0 commit comments

Comments
 (0)