diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml
index 57748fef1a2e5..c72abd0c19516 100644
--- a/ci/environment-dev.yaml
+++ b/ci/environment-dev.yaml
@@ -10,5 +10,5 @@ dependencies:
- python-dateutil>=2.5.0
- python=3
- pytz
- - setuptools
+ - setuptools>=3.3
- sphinx
diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
index e9840388203b1..82f8de277c57b 100644
--- a/ci/requirements_dev.txt
+++ b/ci/requirements_dev.txt
@@ -6,5 +6,5 @@ moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
-setuptools
-sphinx
\ No newline at end of file
+setuptools>=3.3
+sphinx
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 8152af84228b8..0b54980d2bc87 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -15,7 +15,7 @@ requirements:
- python
- cython
- numpy x.x
- - setuptools
+ - setuptools >=3.3
run:
- python
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 66e16808f6af9..49ac516af6d37 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -253,12 +253,6 @@ accept the following arguments:
result is NA)
- ``center``: boolean, whether to set the labels at the center (default is False)
-.. warning::
-
- The ``freq`` and ``how`` arguments were in the API prior to 0.18.0 changes. These are deprecated in the new API. You can simply resample the input prior to creating a window function.
-
- For example, instead of ``s.rolling(window=5,freq='D').max()`` to get the max value on a rolling 5 Day window, one could use ``s.resample('D').max().rolling(window=5).max()``, which first resamples the data to daily data, then provides a rolling 5 day window.
-
We can then call methods on these ``rolling`` objects. These return like-indexed objects:
.. ipython:: python
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 105135fd2c454..979d5afd0a04f 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -194,7 +194,7 @@ installed), make sure you have `pytest
Dependencies
------------
-* `setuptools `__
+* `setuptools `__: 3.3.0 or higher
* `NumPy `__: 1.9.0 or higher
* `python-dateutil /https://dateutil.readthedocs.io/en/stable/>`__: 2.5.0 or higher
* `pytz `__
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 314cfbb903b73..db3a1dc4d39e8 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -186,6 +186,7 @@ Other API Changes
- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
+- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
.. _whatsnew_0220.deprecations:
@@ -209,8 +210,8 @@ Removal of prior version deprecations/changes
- The ``SparseList`` class has been removed (:issue:`14007`)
- The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`)
- ``Categorical.from_array`` has been removed (:issue:`13854`)
-- The ``freq`` parameter has been removed from the ``rolling``/``expanding``/``ewm`` methods of DataFrame
- and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:18601)
+- The ``freq`` and ``how`` parameters have been removed from the ``rolling``/``expanding``/``ewm`` methods of DataFrame
+ and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:18601 & :issue:18668)
- ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`)
.. _whatsnew_0220.performance:
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 807f8bfa12674..5ad8d20cc03e2 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -106,17 +106,17 @@ def validate(self):
raise ValueError("closed must be 'right', 'left', 'both' or "
"'neither'")
- def _convert_freq(self, how=None):
+ def _convert_freq(self):
""" resample according to the how, return a new object """
obj = self._selected_obj
index = None
return obj, index
- def _create_blocks(self, how):
+ def _create_blocks(self):
""" split data into blocks & return conformed data """
- obj, index = self._convert_freq(how)
+ obj, index = self._convert_freq()
if index is not None:
index = self._on
@@ -196,7 +196,7 @@ def _get_index(self, index=None):
return index, index.asi8
return index, index
- def _prep_values(self, values=None, kill_inf=True, how=None):
+ def _prep_values(self, values=None, kill_inf=True):
if values is None:
values = getattr(self._selected_obj, 'values', self._selected_obj)
@@ -320,22 +320,10 @@ def aggregate(self, arg, *args, **kwargs):
agg = aggregate
_shared_docs['sum'] = dedent("""
- %(name)s sum
-
- Parameters
- ----------
- how : string, default None
- .. deprecated:: 0.18.0
- Method for down- or re-sampling""")
+ %(name)s sum""")
_shared_docs['mean'] = dedent("""
- %(name)s mean
-
- Parameters
- ----------
- how : string, default None
- .. deprecated:: 0.18.0
- Method for down- or re-sampling""")
+ %(name)s mean""")
class Window(_Window):
@@ -549,7 +537,7 @@ def _pop_args(win_type, arg_names, kwargs):
# GH #15662. `False` makes symmetric window, rather than periodic.
return sig.get_window(win_type, window, False).astype(float)
- def _apply_window(self, mean=True, how=None, **kwargs):
+ def _apply_window(self, mean=True, **kwargs):
"""
Applies a moving window of type ``window_type`` on the data.
@@ -557,9 +545,6 @@ def _apply_window(self, mean=True, how=None, **kwargs):
----------
mean : boolean, default True
If True computes weighted mean, else weighted sum
- how : string, default to None
- .. deprecated:: 0.18.0
- how to resample
Returns
-------
@@ -569,7 +554,7 @@ def _apply_window(self, mean=True, how=None, **kwargs):
window = self._prep_window(**kwargs)
center = self.center
- blocks, obj, index = self._create_blocks(how=how)
+ blocks, obj, index = self._create_blocks()
results = []
for b in blocks:
try:
@@ -686,7 +671,7 @@ def __init__(self, obj, *args, **kwargs):
cov = GroupByMixin._dispatch('cov', other=None, pairwise=None)
def _apply(self, func, name, window=None, center=None,
- check_minp=None, how=None, **kwargs):
+ check_minp=None, **kwargs):
"""
dispatch to apply; we are stripping all of the _apply kwargs and
performing the original function call on the grouped object
@@ -710,7 +695,7 @@ def _constructor(self):
return Rolling
def _apply(self, func, name=None, window=None, center=None,
- check_minp=None, how=None, **kwargs):
+ check_minp=None, **kwargs):
"""
Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
@@ -723,9 +708,6 @@ def _apply(self, func, name=None, window=None, center=None,
window : int/array, default to _get_window()
center : boolean, default to self.center
check_minp : function, default to _use_window
- how : string, default to None
- .. deprecated:: 0.18.0
- how to resample
Returns
-------
@@ -739,7 +721,7 @@ def _apply(self, func, name=None, window=None, center=None,
if check_minp is None:
check_minp = _use_window
- blocks, obj, index = self._create_blocks(how=how)
+ blocks, obj, index = self._create_blocks()
index, indexi = self._get_index(index=index)
results = []
for b in blocks:
@@ -803,7 +785,7 @@ class _Rolling_and_Expanding(_Rolling):
def count(self):
- blocks, obj, index = self._create_blocks(how=None)
+ blocks, obj, index = self._create_blocks()
index, indexi = self._get_index(index=index)
window = self._get_window()
@@ -849,29 +831,19 @@ def sum(self, *args, **kwargs):
_shared_docs['max'] = dedent("""
%(name)s maximum
+ """)
- Parameters
- ----------
- how : string, default 'max'
- .. deprecated:: 0.18.0
- Method for down- or re-sampling""")
-
- def max(self, how=None, *args, **kwargs):
+ def max(self, *args, **kwargs):
nv.validate_window_func('max', args, kwargs)
- return self._apply('roll_max', 'max', how=how, **kwargs)
+ return self._apply('roll_max', 'max', **kwargs)
_shared_docs['min'] = dedent("""
%(name)s minimum
+ """)
- Parameters
- ----------
- how : string, default 'min'
- .. deprecated:: 0.18.0
- Method for down- or re-sampling""")
-
- def min(self, how=None, *args, **kwargs):
+ def min(self, *args, **kwargs):
nv.validate_window_func('min', args, kwargs)
- return self._apply('roll_min', 'min', how=how, **kwargs)
+ return self._apply('roll_min', 'min', **kwargs)
def mean(self, *args, **kwargs):
nv.validate_window_func('mean', args, kwargs)
@@ -879,15 +851,10 @@ def mean(self, *args, **kwargs):
_shared_docs['median'] = dedent("""
%(name)s median
+ """)
- Parameters
- ----------
- how : string, default 'median'
- .. deprecated:: 0.18.0
- Method for down- or re-sampling""")
-
- def median(self, how=None, **kwargs):
- return self._apply('roll_median_c', 'median', how=how, **kwargs)
+ def median(self, **kwargs):
+ return self._apply('roll_median_c', 'median', **kwargs)
_shared_docs['std'] = dedent("""
%(name)s standard deviation
@@ -1709,23 +1676,20 @@ def aggregate(self, arg, *args, **kwargs):
agg = aggregate
- def _apply(self, func, how=None, **kwargs):
+ def _apply(self, func, **kwargs):
"""Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
Parameters
----------
func : string/callable to apply
- how : string, default to None
- .. deprecated:: 0.18.0
- how to resample
Returns
-------
y : type of input argument
"""
- blocks, obj, index = self._create_blocks(how=how)
+ blocks, obj, index = self._create_blocks()
results = []
for b in blocks:
try:
diff --git a/pandas/tests/categorical/__init__.py b/pandas/tests/categorical/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/categorical/common.py b/pandas/tests/categorical/common.py
new file mode 100644
index 0000000000000..9462482553ed8
--- /dev/null
+++ b/pandas/tests/categorical/common.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+
+from pandas import Categorical
+
+
+class TestCategorical(object):
+
+ def setup_method(self, method):
+ self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
+ ordered=True)
diff --git a/pandas/tests/categorical/test_analytics.py b/pandas/tests/categorical/test_analytics.py
new file mode 100644
index 0000000000000..53d0e596a1d99
--- /dev/null
+++ b/pandas/tests/categorical/test_analytics.py
@@ -0,0 +1,320 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+import sys
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import Categorical, Index, Series
+
+from pandas.compat import PYPY
+
+
+class TestCategoricalAnalytics(object):
+
+ def test_min_max(self):
+
+ # unordered cats have no min/max
+ cat = Categorical(["a", "b", "c", "d"], ordered=False)
+ pytest.raises(TypeError, lambda: cat.min())
+ pytest.raises(TypeError, lambda: cat.max())
+
+ cat = Categorical(["a", "b", "c", "d"], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "a"
+ assert _max == "d"
+
+ cat = Categorical(["a", "b", "c", "d"],
+ categories=['d', 'c', 'b', 'a'], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "d"
+ assert _max == "a"
+
+ cat = Categorical([np.nan, "b", "c", np.nan],
+ categories=['d', 'c', 'b', 'a'], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == "b"
+
+ _min = cat.min(numeric_only=True)
+ assert _min == "c"
+ _max = cat.max(numeric_only=True)
+ assert _max == "b"
+
+ cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1],
+ ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == 1
+
+ _min = cat.min(numeric_only=True)
+ assert _min == 2
+ _max = cat.max(numeric_only=True)
+ assert _max == 1
+
+ @pytest.mark.parametrize("values,categories,exp_mode", [
+ ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
+ ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
+ ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
+ ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
+ ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
+ ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4])])
+ def test_mode(self, values, categories, exp_mode):
+ s = Categorical(values, categories=categories, ordered=True)
+ res = s.mode()
+ exp = Categorical(exp_mode, categories=categories, ordered=True)
+ tm.assert_categorical_equal(res, exp)
+
+ def test_searchsorted(self):
+ # https://github.com/pandas-dev/pandas/issues/8420
+ # https://github.com/pandas-dev/pandas/issues/14522
+
+ c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
+ categories=['cheese', 'milk', 'apple', 'bread'],
+ ordered=True)
+ s1 = Series(c1)
+ c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
+ categories=['cheese', 'milk', 'apple', 'bread'],
+ ordered=False)
+ s2 = Series(c2)
+
+ # Searching for single item argument, side='left' (default)
+ res_cat = c1.searchsorted('apple')
+ res_ser = s1.searchsorted('apple')
+ exp = np.array([2], dtype=np.intp)
+ tm.assert_numpy_array_equal(res_cat, exp)
+ tm.assert_numpy_array_equal(res_ser, exp)
+
+ # Searching for single item array, side='left' (default)
+ res_cat = c1.searchsorted(['bread'])
+ res_ser = s1.searchsorted(['bread'])
+ exp = np.array([3], dtype=np.intp)
+ tm.assert_numpy_array_equal(res_cat, exp)
+ tm.assert_numpy_array_equal(res_ser, exp)
+
+ # Searching for several items array, side='right'
+ res_cat = c1.searchsorted(['apple', 'bread'], side='right')
+ res_ser = s1.searchsorted(['apple', 'bread'], side='right')
+ exp = np.array([3, 5], dtype=np.intp)
+ tm.assert_numpy_array_equal(res_cat, exp)
+ tm.assert_numpy_array_equal(res_ser, exp)
+
+ # Searching for a single value that is not from the Categorical
+ pytest.raises(ValueError, lambda: c1.searchsorted('cucumber'))
+ pytest.raises(ValueError, lambda: s1.searchsorted('cucumber'))
+
+ # Searching for multiple values one of each is not from the Categorical
+ pytest.raises(ValueError,
+ lambda: c1.searchsorted(['bread', 'cucumber']))
+ pytest.raises(ValueError,
+ lambda: s1.searchsorted(['bread', 'cucumber']))
+
+ # searchsorted call for unordered Categorical
+ pytest.raises(ValueError, lambda: c2.searchsorted('apple'))
+ pytest.raises(ValueError, lambda: s2.searchsorted('apple'))
+
+ with tm.assert_produces_warning(FutureWarning):
+ res = c1.searchsorted(v=['bread'])
+ exp = np.array([3], dtype=np.intp)
+ tm.assert_numpy_array_equal(res, exp)
+
+ def test_unique(self):
+ # categories are reordered based on value when ordered=False
+ cat = Categorical(["a", "b"])
+ exp = Index(["a", "b"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ tm.assert_categorical_equal(res, cat)
+
+ cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ tm.assert_categorical_equal(res, Categorical(exp))
+
+ cat = Categorical(["c", "a", "b", "a", "a"],
+ categories=["a", "b", "c"])
+ exp = Index(["c", "a", "b"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ exp_cat = Categorical(exp, categories=['c', 'a', 'b'])
+ tm.assert_categorical_equal(res, exp_cat)
+
+ # nan must be removed
+ cat = Categorical(["b", np.nan, "b", np.nan, "a"],
+ categories=["a", "b", "c"])
+ res = cat.unique()
+ exp = Index(["b", "a"])
+ tm.assert_index_equal(res.categories, exp)
+ exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
+ tm.assert_categorical_equal(res, exp_cat)
+
+ def test_unique_ordered(self):
+ # keep categories order when ordered=True
+ cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'],
+ ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ def test_unique_index_series(self):
+ c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
+ # Categorical.unique sorts categories by appearance order
+ # if ordered=False
+ exp = Categorical([3, 1, 2], categories=[3, 1, 2])
+ tm.assert_categorical_equal(c.unique(), exp)
+
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
+ exp = Categorical([1, 2], categories=[1, 2])
+ tm.assert_categorical_equal(c.unique(), exp)
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
+ # Categorical.unique keeps categories order if ordered=True
+ exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
+ tm.assert_categorical_equal(c.unique(), exp)
+
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ def test_shift(self):
+ # GH 9416
+ cat = Categorical(['a', 'b', 'c', 'd', 'a'])
+
+ # shift forward
+ sp1 = cat.shift(1)
+ xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd'])
+ tm.assert_categorical_equal(sp1, xp1)
+ tm.assert_categorical_equal(cat[:-1], sp1[1:])
+
+ # shift back
+ sn2 = cat.shift(-2)
+ xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan],
+ categories=['a', 'b', 'c', 'd'])
+ tm.assert_categorical_equal(sn2, xp2)
+ tm.assert_categorical_equal(cat[2:], sn2[:-2])
+
+ # shift by zero
+ tm.assert_categorical_equal(cat, cat.shift(0))
+
+ def test_nbytes(self):
+ cat = Categorical([1, 2, 3])
+ exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
+ assert cat.nbytes == exp
+
+ def test_memory_usage(self):
+ cat = Categorical([1, 2, 3])
+
+ # .categories is an index, so we include the hashtable
+ assert 0 < cat.nbytes <= cat.memory_usage()
+ assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
+
+ cat = Categorical(['foo', 'foo', 'bar'])
+ assert cat.memory_usage(deep=True) > cat.nbytes
+
+ if not PYPY:
+ # sys.getsizeof will call the .memory_usage with
+ # deep=True, and add on some GC overhead
+ diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
+ assert abs(diff) < 100
+
+ def test_map(self):
+ c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True)
+ result = c.map(lambda x: x.lower())
+ exp = Categorical(list('ababc'), categories=list('cba'), ordered=True)
+ tm.assert_categorical_equal(result, exp)
+
+ c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False)
+ result = c.map(lambda x: x.lower())
+ exp = Categorical(list('ababc'), categories=list('abc'), ordered=False)
+ tm.assert_categorical_equal(result, exp)
+
+ result = c.map(lambda x: 1)
+ # GH 12766: Return an index not an array
+ tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
+
+ def test_validate_inplace(self):
+ cat = Categorical(['A', 'B', 'B', 'C', 'A'])
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ cat.set_ordered(value=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.as_ordered(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.as_unordered(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.rename_categories(['X', 'Y', 'Z'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.reorder_categories(
+ ['X', 'Y', 'Z'], ordered=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.add_categories(
+ new_categories=['D', 'E', 'F'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.remove_categories(removals=['D', 'E', 'F'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.remove_unused_categories(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.sort_values(inplace=value)
+
+ def test_repeat(self):
+ # GH10183
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
+ res = cat.repeat(2)
+ tm.assert_categorical_equal(res, exp)
+
+ def test_numpy_repeat(self):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
+ tm.assert_categorical_equal(np.repeat(cat, 2), exp)
+
+ msg = "the 'axis' parameter is not supported"
+ tm.assert_raises_regex(ValueError, msg, np.repeat, cat, 2, axis=1)
+
+ def test_isna(self):
+ exp = np.array([False, False, True])
+ c = Categorical(["a", "b", np.nan])
+ res = c.isna()
+
+ tm.assert_numpy_array_equal(res, exp)
diff --git a/pandas/tests/categorical/test_api.py b/pandas/tests/categorical/test_api.py
new file mode 100644
index 0000000000000..7cc0aafaf05b6
--- /dev/null
+++ b/pandas/tests/categorical/test_api.py
@@ -0,0 +1,522 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import Categorical, CategoricalIndex, Index, Series, DataFrame
+
+from pandas.core.categorical import _recode_for_categories
+from pandas.tests.categorical.common import TestCategorical
+
+
+class TestCategoricalAPI(object):
+
+ def test_ordered_api(self):
+ # GH 9347
+ cat1 = Categorical(list('acb'), ordered=False)
+ tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
+ assert not cat1.ordered
+
+ cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False)
+ tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
+ assert not cat2.ordered
+
+ cat3 = Categorical(list('acb'), ordered=True)
+ tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
+ assert cat3.ordered
+
+ cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True)
+ tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
+ assert cat4.ordered
+
+ def test_set_ordered(self):
+
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ cat2 = cat.as_unordered()
+ assert not cat2.ordered
+ cat2 = cat.as_ordered()
+ assert cat2.ordered
+ cat2.as_unordered(inplace=True)
+ assert not cat2.ordered
+ cat2.as_ordered(inplace=True)
+ assert cat2.ordered
+
+ assert cat2.set_ordered(True).ordered
+ assert not cat2.set_ordered(False).ordered
+ cat2.set_ordered(True, inplace=True)
+ assert cat2.ordered
+ cat2.set_ordered(False, inplace=True)
+ assert not cat2.ordered
+
+ # removed in 0.19.0
+ msg = "can\'t set attribute"
+ with tm.assert_raises_regex(AttributeError, msg):
+ cat.ordered = True
+ with tm.assert_raises_regex(AttributeError, msg):
+ cat.ordered = False
+
+ def test_rename_categories(self):
+ cat = Categorical(["a", "b", "c", "a"])
+
+ # inplace=False: the old one must not be changed
+ res = cat.rename_categories([1, 2, 3])
+ tm.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1],
+ dtype=np.int64))
+ tm.assert_index_equal(res.categories, Index([1, 2, 3]))
+
+ exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ exp_cat = Index(["a", "b", "c"])
+ tm.assert_index_equal(cat.categories, exp_cat)
+ res = cat.rename_categories([1, 2, 3], inplace=True)
+
+ # and now inplace
+ assert res is None
+ tm.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1],
+ dtype=np.int64))
+ tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
+
+ # Lengthen
+ with pytest.raises(ValueError):
+ cat.rename_categories([1, 2, 3, 4])
+
+ # Shorten
+ with pytest.raises(ValueError):
+ cat.rename_categories([1, 2])
+
+ def test_rename_categories_series(self):
+ # https://github.com/pandas-dev/pandas/issues/17981
+ c = Categorical(['a', 'b'])
+ xpr = "Treating Series 'new_categories' as a list-like "
+ with tm.assert_produces_warning(FutureWarning) as rec:
+ result = c.rename_categories(Series([0, 1]))
+
+ assert len(rec) == 1
+ assert xpr in str(rec[0].message)
+ expected = Categorical([0, 1])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_rename_categories_dict(self):
+ # GH 17336
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1})
+ expected = Index([4, 3, 2, 1])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for inplace
+ res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1},
+ inplace=True)
+ assert res is None
+ tm.assert_index_equal(cat.categories, expected)
+
+ # Test for dicts of smaller length
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 1, 'c': 3})
+
+ expected = Index([1, 'b', 3, 'd'])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for dicts with bigger length
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3,
+ 'd': 4, 'e': 5, 'f': 6})
+ expected = Index([1, 2, 3, 4])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for dicts with no items from old categories
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'f': 1, 'g': 3})
+
+ expected = Index(['a', 'b', 'c', 'd'])
+ tm.assert_index_equal(res.categories, expected)
+
+ def test_reorder_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"],
+ ordered=True)
+
+ # first inplace == False
+ res = cat.reorder_categories(["c", "b", "a"])
+ # cat must be the same as before
+ tm.assert_categorical_equal(cat, old)
+ # only res is changed
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.reorder_categories(["c", "b", "a"], inplace=True)
+ assert res is None
+ tm.assert_categorical_equal(cat, new)
+
+ # not all "old" included in "new"
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+
+ def f():
+ cat.reorder_categories(["a"])
+
+ pytest.raises(ValueError, f)
+
+ # still not all "old" in "new"
+ def f():
+ cat.reorder_categories(["a", "b", "d"])
+
+ pytest.raises(ValueError, f)
+
+ # all "old" included in "new", but too long
+ def f():
+ cat.reorder_categories(["a", "b", "c", "d"])
+
+ pytest.raises(ValueError, f)
+
+ def test_add_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", "c", "a"],
+ categories=["a", "b", "c", "d"], ordered=True)
+
+ # first inplace == False
+ res = cat.add_categories("d")
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ res = cat.add_categories(["d"])
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.add_categories("d", inplace=True)
+ tm.assert_categorical_equal(cat, new)
+ assert res is None
+
+ # new is in old categories
+ def f():
+ cat.add_categories(["d"])
+
+ pytest.raises(ValueError, f)
+
+ # GH 9927
+ cat = Categorical(list("abc"), ordered=True)
+ expected = Categorical(
+ list("abc"), categories=list("abcde"), ordered=True)
+ # test with Series, np.array, index, list
+ res = cat.add_categories(Series(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(np.array(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(Index(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(["d", "e"])
+ tm.assert_categorical_equal(res, expected)
+
+ def test_set_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ exp_categories = Index(["c", "b", "a"])
+ exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
+
+ res = cat.set_categories(["c", "b", "a"], inplace=True)
+ tm.assert_index_equal(cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_values)
+ assert res is None
+
+ res = cat.set_categories(["a", "b", "c"])
+ # cat must be the same as before
+ tm.assert_index_equal(cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_values)
+ # only res is changed
+ exp_categories_back = Index(["a", "b", "c"])
+ tm.assert_index_equal(res.categories, exp_categories_back)
+ tm.assert_numpy_array_equal(res.__array__(), exp_values)
+
+ # not all "old" included in "new" -> all not included ones are now
+ # np.nan
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ res = cat.set_categories(["a"])
+ tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0],
+ dtype=np.int8))
+
+ # still not all "old" in "new"
+ res = cat.set_categories(["a", "b", "d"])
+ tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+ tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
+
+ # all "old" included in "new"
+ cat = cat.set_categories(["a", "b", "c", "d"])
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_index_equal(cat.categories, exp_categories)
+
+ # internals...
+ c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0],
+ dtype=np.int8))
+ tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
+
+ exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(c.get_values(), exp)
+
+ # all "pointers" to '4' must be changed from 3 to 0,...
+ c = c.set_categories([4, 3, 2, 1])
+
+ # positions are changed
+ tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3],
+ dtype=np.int8))
+
+ # categories are now in new order
+ tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
+
+ # output is the same
+ exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(c.get_values(), exp)
+ assert c.min() == 4
+ assert c.max() == 1
+
+ # set_categories should set the ordering if specified
+ c2 = c.set_categories([4, 3, 2, 1], ordered=False)
+ assert not c2.ordered
+
+ tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
+
+ # set_categories should pass thru the ordering
+ c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
+ assert not c2.ordered
+
+ tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
+
+ @pytest.mark.parametrize('values, categories, new_categories', [
+ # No NaNs, same cats, same order
+ (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+ # Same, unsorted
+ (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+ # NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+ # Introduce NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+ # No overlap
+ (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_set_categories_many(self, values, categories, new_categories,
+ ordered):
+ c = Categorical(values, categories)
+ expected = Categorical(values, new_categories, ordered)
+ result = c.set_categories(new_categories, ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_set_categories_private(self):
+ cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+ cat._set_categories(['a', 'c', 'd', 'e'])
+ expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+ tm.assert_categorical_equal(cat, expected)
+
+ # fastpath
+ cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+ cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
+ expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+ tm.assert_categorical_equal(cat, expected)
+
+ def test_remove_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
+ ordered=True)
+
+ # first inplace == False
+ res = cat.remove_categories("c")
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ res = cat.remove_categories(["c"])
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.remove_categories("c", inplace=True)
+ tm.assert_categorical_equal(cat, new)
+ assert res is None
+
+ # removal is not in categories
+ def f():
+ cat.remove_categories(["c"])
+
+ pytest.raises(ValueError, f)
+
+ def test_remove_unused_categories(self):
+ c = Categorical(["a", "b", "c", "d", "a"],
+ categories=["a", "b", "c", "d", "e"])
+ exp_categories_all = Index(["a", "b", "c", "d", "e"])
+ exp_categories_dropped = Index(["a", "b", "c", "d"])
+
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ res = c.remove_unused_categories()
+ tm.assert_index_equal(res.categories, exp_categories_dropped)
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ res = c.remove_unused_categories(inplace=True)
+ tm.assert_index_equal(c.categories, exp_categories_dropped)
+ assert res is None
+
+ # with NaN values (GH11599)
+ c = Categorical(["a", "b", "c", np.nan],
+ categories=["a", "b", "c", "d", "e"])
+ res = c.remove_unused_categories()
+ tm.assert_index_equal(res.categories,
+ Index(np.array(["a", "b", "c"])))
+ exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(res.codes, exp_codes)
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
+ cat = Categorical(values=val, categories=list('ABCDEFG'))
+ out = cat.remove_unused_categories()
+ tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
+ exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(out.codes, exp_codes)
+ assert out.get_values().tolist() == val
+
+ alpha = list('abcdefghijklmnopqrstuvwxyz')
+ val = np.random.choice(alpha[::2], 10000).astype('object')
+ val[np.random.choice(len(val), 100)] = np.nan
+
+ cat = Categorical(values=val, categories=alpha)
+ out = cat.remove_unused_categories()
+ assert out.get_values().tolist() == val.tolist()
+
+ def test_deprecated_labels(self):
+ # TODO: labels is deprecated and should be removed in 0.18 or 2017,
+ # whatever is earlier
+ cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
+ exp = cat.codes
+ with tm.assert_produces_warning(FutureWarning):
+ res = cat.labels
+ tm.assert_numpy_array_equal(res, exp)
+
+
+class TestCategoricalAPIWithFactor(TestCategorical):
+
+ def test_describe(self):
+ # string type
+ desc = self.factor.describe()
+ assert self.factor.ordered
+ exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories',
+ ordered=self.factor.ordered)
+ expected = DataFrame({'counts': [3, 2, 3],
+ 'freqs': [3 / 8., 2 / 8., 3 / 8.]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # check unused categories
+ cat = self.factor.copy()
+ cat.set_categories(["a", "b", "c", "d"], inplace=True)
+ desc = cat.describe()
+
+ exp_index = CategoricalIndex(
+ list('abcd'), ordered=self.factor.ordered, name='categories')
+ expected = DataFrame({'counts': [3, 2, 3, 0],
+ 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # check an integer one
+ cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
+ desc = cat.describe()
+ exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered,
+ name='categories')
+ expected = DataFrame({'counts': [5, 3, 3],
+ 'freqs': [5 / 11., 3 / 11., 3 / 11.]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # https://github.com/pandas-dev/pandas/issues/3678
+ # describe should work with NaN
+ cat = Categorical([np.nan, 1, 2, 2])
+ desc = cat.describe()
+ expected = DataFrame({'counts': [1, 2, 1],
+ 'freqs': [1 / 4., 2 / 4., 1 / 4.]},
+ index=CategoricalIndex([1, 2, np.nan],
+ categories=[1, 2],
+ name='categories'))
+ tm.assert_frame_equal(desc, expected)
+
+ def test_set_categories_inplace(self):
+ cat = self.factor.copy()
+ cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
+ tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd']))
+
+
+class TestPrivateCategoricalAPI(object):
+
+ def test_codes_immutable(self):
+
+ # Codes should be read only
+ c = Categorical(["a", "b", "c", "a", np.nan])
+ exp = np.array([0, 1, 2, 0, -1], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ # Assignments to codes should raise
+ def f():
+ c.codes = np.array([0, 1, 2, 0, 1], dtype='int8')
+
+ pytest.raises(ValueError, f)
+
+ # changes in the codes array should raise
+ # np 1.6.1 raises RuntimeError rather than ValueError
+ codes = c.codes
+
+ def f():
+ codes[4] = 1
+
+ pytest.raises(ValueError, f)
+
+ # But even after getting the codes, the original array should still be
+ # writeable!
+ c[4] = "a"
+ exp = np.array([0, 1, 2, 0, 0], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+ c._codes[4] = 2
+ exp = np.array([0, 1, 2, 0, 2], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ @pytest.mark.parametrize('codes, old, new, expected', [
+ ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
+ ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
+ ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
+ ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
+ ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
+ ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
+ ([-1, -1], [], ['a', 'b'], [-1, -1]),
+ ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
+ ])
+ def test_recode_to_categories(self, codes, old, new, expected):
+ codes = np.asanyarray(codes, dtype=np.int8)
+ expected = np.asanyarray(expected, dtype=np.int8)
+ old = Index(old)
+ new = Index(new)
+ result = _recode_for_categories(codes, old, new)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_recode_to_categories_large(self):
+ N = 1000
+ codes = np.arange(N)
+ old = Index(codes)
+ expected = np.arange(N - 1, -1, -1, dtype=np.int16)
+ new = Index(expected)
+ result = _recode_for_categories(codes, old, new)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/categorical/test_constructors.py b/pandas/tests/categorical/test_constructors.py
new file mode 100644
index 0000000000000..abea7e9a0e0b4
--- /dev/null
+++ b/pandas/tests/categorical/test_constructors.py
@@ -0,0 +1,517 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+from datetime import datetime
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas import (Categorical, Index, Series, Timestamp,
+ CategoricalIndex, date_range, DatetimeIndex,
+ period_range, timedelta_range, NaT,
+ Interval, IntervalIndex)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
+
+
+class TestCategoricalConstructors(object):
+
+ def test_validate_ordered(self):
+ # see gh-14058
+ exp_msg = "'ordered' must either be 'True' or 'False'"
+ exp_err = TypeError
+
+ # This should be a boolean.
+ ordered = np.array([0, 1, 2])
+
+ with tm.assert_raises_regex(exp_err, exp_msg):
+ Categorical([1, 2, 3], ordered=ordered)
+
+ with tm.assert_raises_regex(exp_err, exp_msg):
+ Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'],
+ ordered=ordered)
+
+ def test_constructor_empty(self):
+ # GH 17248
+ c = Categorical([])
+ expected = Index([])
+ tm.assert_index_equal(c.categories, expected)
+
+ c = Categorical([], categories=[1, 2, 3])
+ expected = pd.Int64Index([1, 2, 3])
+ tm.assert_index_equal(c.categories, expected)
+
+ def test_constructor_tuples(self):
+ values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
+ result = Categorical(values)
+ expected = Index([(1,), (1, 2)], tupleize_cols=False)
+ tm.assert_index_equal(result.categories, expected)
+ assert result.ordered is False
+
+ def test_constructor_tuples_datetimes(self):
+ # numpy will auto reshape when all of the tuples are the
+ # same len, so add an extra one with 2 items and slice it off
+ values = np.array([(Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),),
+ (Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),),
+ ('a', 'b')], dtype=object)[:-1]
+ result = Categorical(values)
+ expected = Index([(Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),)], tupleize_cols=False)
+ tm.assert_index_equal(result.categories, expected)
+
+ def test_constructor_unsortable(self):
+
+ # it works!
+ arr = np.array([1, 2, 3, datetime.now()], dtype='O')
+ factor = Categorical(arr, ordered=False)
+ assert not factor.ordered
+
+ # this however will raise as cannot be sorted
+ pytest.raises(
+ TypeError, lambda: Categorical(arr, ordered=True))
+
+ def test_constructor_interval(self):
+ result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)],
+ ordered=True)
+ ii = IntervalIndex.from_intervals([Interval(1, 2),
+ Interval(2, 3),
+ Interval(3, 6)])
+ exp = Categorical(ii, ordered=True)
+ tm.assert_categorical_equal(result, exp)
+ tm.assert_index_equal(result.categories, ii)
+
+ def test_constructor(self):
+
+ exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
+ c1 = Categorical(exp_arr)
+ tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
+ c2 = Categorical(exp_arr, categories=["a", "b", "c"])
+ tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
+ c2 = Categorical(exp_arr, categories=["c", "b", "a"])
+ tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
+
+ # categories must be unique
+ def f():
+ Categorical([1, 2], [1, 2, 2])
+
+ pytest.raises(ValueError, f)
+
+ def f():
+ Categorical(["a", "b"], ["a", "b", "b"])
+
+ pytest.raises(ValueError, f)
+
+ # The default should be unordered
+ c1 = Categorical(["a", "b", "c", "a"])
+ assert not c1.ordered
+
+ # Categorical as input
+ c1 = Categorical(["a", "b", "c", "a"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(c1, categories=["a", "b", "c"])
+ tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
+ tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
+
+ # Series of dtype category
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(Series(c1))
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(Series(c1))
+ tm.assert_categorical_equal(c1, c2)
+
+ # Series
+ c1 = Categorical(["a", "b", "c", "a"])
+ c2 = Categorical(Series(["a", "b", "c", "a"]))
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(Series(["a", "b", "c", "a"]),
+ categories=["a", "b", "c", "d"])
+ tm.assert_categorical_equal(c1, c2)
+
+ # This should result in integer categories, not float!
+ cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
+ assert is_integer_dtype(cat.categories)
+
+ # https://github.com/pandas-dev/pandas/issues/3678
+ cat = Categorical([np.nan, 1, 2, 3])
+ assert is_integer_dtype(cat.categories)
+
+ # this should result in floats
+ cat = Categorical([np.nan, 1, 2., 3])
+ assert is_float_dtype(cat.categories)
+
+ cat = Categorical([np.nan, 1., 2., 3.])
+ assert is_float_dtype(cat.categories)
+
+ # This doesn't work -> this would probably need some kind of "remember
+ # the original type" feature to try to cast the array interface result
+ # to...
+
+ # vals = np.asarray(cat[cat.notna()])
+ # assert is_integer_dtype(vals)
+
+ # corner cases
+ cat = Categorical([1])
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == 1
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ cat = Categorical(["a"])
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == "a"
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ # Scalars should be converted to lists
+ cat = Categorical(1)
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == 1
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ # two arrays
+ # - when the first is an integer dtype and the second is not
+ # - when the resulting codes are all -1/NaN
+ with tm.assert_produces_warning(None):
+ c_old = Categorical([0, 1, 2, 0, 1, 2],
+ categories=["a", "b", "c"]) # noqa
+
+ with tm.assert_produces_warning(None):
+ c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa
+ categories=[3, 4, 5])
+
+ # the next one are from the old docs
+ with tm.assert_produces_warning(None):
+ c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa
+ cat = Categorical([1, 2], categories=[1, 2, 3])
+
+ # this is a legitimate constructor
+ with tm.assert_produces_warning(None):
+ c = Categorical(np.array([], dtype='int64'), # noqa
+ categories=[3, 2, 1], ordered=True)
+
+ def test_constructor_not_sequence(self):
+ # https://github.com/pandas-dev/pandas/issues/16022
+ with pytest.raises(TypeError):
+ Categorical(['a', 'b'], categories='a')
+
+ def test_constructor_with_null(self):
+
+ # Cannot have NaN in categories
+ with pytest.raises(ValueError):
+ Categorical([np.nan, "a", "b", "c"],
+ categories=[np.nan, "a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ Categorical([None, "a", "b", "c"],
+ categories=[None, "a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ Categorical(DatetimeIndex(['nat', '20160101']),
+ categories=[NaT, Timestamp('20160101')])
+
+ def test_constructor_with_index(self):
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ tm.assert_categorical_equal(ci.values, Categorical(ci))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ tm.assert_categorical_equal(ci.values,
+ Categorical(ci.astype(object),
+ categories=ci.categories))
+
+ def test_constructor_with_generator(self):
+ # This was raising an Error in isna(single_val).any() because isna
+ # returned a scalar for a generator
+ xrange = range
+
+ exp = Categorical([0, 1, 2])
+ cat = Categorical((x for x in [0, 1, 2]))
+ tm.assert_categorical_equal(cat, exp)
+ cat = Categorical(xrange(3))
+ tm.assert_categorical_equal(cat, exp)
+
+ # This uses xrange internally
+ from pandas.core.index import MultiIndex
+ MultiIndex.from_product([range(5), ['a', 'b', 'c']])
+
+ # check that categories accept generators and sequences
+ cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
+ tm.assert_categorical_equal(cat, exp)
+ cat = Categorical([0, 1, 2], categories=xrange(3))
+ tm.assert_categorical_equal(cat, exp)
+
+ def test_constructor_with_datetimelike(self):
+
+ # 12077
+ # constructor wwth a datetimelike and NaT
+
+ for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
+ date_range('1995-01-01 00:00:00', periods=5,
+ freq='s', tz='US/Eastern'),
+ timedelta_range('1 day', periods=5, freq='s')]:
+
+ s = Series(dtl)
+ c = Categorical(s)
+ expected = type(dtl)(s)
+ expected.freq = None
+ tm.assert_index_equal(c.categories, expected)
+ tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))
+
+ # with NaT
+ s2 = s.copy()
+ s2.iloc[-1] = NaT
+ c = Categorical(s2)
+ expected = type(dtl)(s2.dropna())
+ expected.freq = None
+ tm.assert_index_equal(c.categories, expected)
+
+ exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ result = repr(c)
+ assert 'NaT' in result
+
+ def test_constructor_from_index_series_datetimetz(self):
+ idx = date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern')
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_from_index_series_timedelta(self):
+ idx = timedelta_range('1 days', freq='D', periods=3)
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_from_index_series_period(self):
+ idx = period_range('2015-01-01', freq='D', periods=3)
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_invariant(self):
+ # GH 14190
+ vals = [
+ np.array([1., 1.2, 1.8, np.nan]),
+ np.array([1, 2, 3], dtype='int64'),
+ ['a', 'b', 'c', np.nan],
+ [pd.Period('2014-01'), pd.Period('2014-02'), NaT],
+ [Timestamp('2014-01-01'), Timestamp('2014-01-02'), NaT],
+ [Timestamp('2014-01-01', tz='US/Eastern'),
+ Timestamp('2014-01-02', tz='US/Eastern'), NaT],
+ ]
+ for val in vals:
+ c = Categorical(val)
+ c2 = Categorical(c)
+ tm.assert_categorical_equal(c, c2)
+
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_constructor_with_dtype(self, ordered):
+ categories = ['b', 'a', 'c']
+ dtype = CategoricalDtype(categories, ordered=ordered)
+ result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype)
+ expected = Categorical(['a', 'b', 'a', 'c'], categories=categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+ assert result.ordered is ordered
+
+ def test_constructor_dtype_and_others_raises(self):
+ dtype = CategoricalDtype(['a', 'b'], ordered=True)
+ with tm.assert_raises_regex(ValueError, "Cannot"):
+ Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
+
+ with tm.assert_raises_regex(ValueError, "Cannot"):
+ Categorical(['a', 'b'], ordered=True, dtype=dtype)
+
+ with tm.assert_raises_regex(ValueError, "Cannot"):
+ Categorical(['a', 'b'], ordered=False, dtype=dtype)
+
+ @pytest.mark.parametrize('categories', [
+ None, ['a', 'b'], ['a', 'c'],
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_constructor_str_category(self, categories, ordered):
+ result = Categorical(['a', 'b'], categories=categories,
+ ordered=ordered, dtype='category')
+ expected = Categorical(['a', 'b'], categories=categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_str_unknown(self):
+ with tm.assert_raises_regex(ValueError, "Unknown `dtype`"):
+ Categorical([1, 2], dtype="foo")
+
+ def test_constructor_from_categorical_with_dtype(self):
+ dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True)
+ values = Categorical(['a', 'b', 'd'])
+ result = Categorical(values, dtype=dtype)
+ # We use dtype.categories, not values.categories
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_from_categorical_with_unknown_dtype(self):
+ dtype = CategoricalDtype(None, ordered=True)
+ values = Categorical(['a', 'b', 'd'])
+ result = Categorical(values, dtype=dtype)
+ # We use values.categories, not dtype.categories
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_contructor_from_categorical_string(self):
+ values = Categorical(['a', 'b', 'd'])
+ # use categories, ordered
+ result = Categorical(values, categories=['a', 'b', 'c'], ordered=True,
+ dtype='category')
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ # No string
+ result = Categorical(values, categories=['a', 'b', 'c'], ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_with_categorical_categories(self):
+ # GH17884
+ expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+
+ result = Categorical(
+ ['a', 'b'], categories=Categorical(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ result = Categorical(
+ ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_codes(self):
+
+ # too few categories
+ def f():
+ Categorical.from_codes([1, 2], [1, 2])
+
+ pytest.raises(ValueError, f)
+
+ # no int codes
+ def f():
+ Categorical.from_codes(["a"], [1, 2])
+
+ pytest.raises(ValueError, f)
+
+ # no unique categories
+ def f():
+ Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
+
+ pytest.raises(ValueError, f)
+
+ # NaN categories included
+ def f():
+ Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
+
+ pytest.raises(ValueError, f)
+
+ # too negative
+ def f():
+ Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
+
+ pytest.raises(ValueError, f)
+
+ exp = Categorical(["a", "b", "c"], ordered=False)
+ res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
+ tm.assert_categorical_equal(exp, res)
+
+ # Not available in earlier numpy versions
+ if hasattr(np.random, "choice"):
+ codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
+ Categorical.from_codes(codes, categories=["train", "test"])
+
+ def test_from_codes_with_categorical_categories(self):
+ # GH17884
+ expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+
+ result = Categorical.from_codes(
+ [0, 1], categories=Categorical(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ result = Categorical.from_codes(
+ [0, 1], categories=CategoricalIndex(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ # non-unique Categorical still raises
+ with pytest.raises(ValueError):
+ Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
+
+ @pytest.mark.parametrize('dtype', [None, 'category'])
+ def test_from_inferred_categories(self, dtype):
+ cats = ['a', 'b']
+ codes = np.array([0, 0, 1, 1], dtype='i8')
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical.from_codes(codes, cats)
+ tm.assert_categorical_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, 'category'])
+ def test_from_inferred_categories_sorts(self, dtype):
+ cats = ['b', 'a']
+ codes = np.array([0, 1, 1, 1], dtype='i8')
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_inferred_categories_dtype(self):
+ cats = ['a', 'b', 'd']
+ codes = np.array([0, 1, 0, 2], dtype='i8')
+ dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical(['a', 'b', 'a', 'd'],
+ categories=['c', 'b', 'a'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_inferred_categories_coerces(self):
+ cats = ['1', '2', 'bad']
+ codes = np.array([0, 0, 1, 2], dtype='i8')
+ dtype = CategoricalDtype([1, 2])
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical([1, 1, 2, np.nan])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_construction_with_ordered(self):
+ # GH 9347, 9190
+ cat = Categorical([0, 1, 2])
+ assert not cat.ordered
+ cat = Categorical([0, 1, 2], ordered=False)
+ assert not cat.ordered
+ cat = Categorical([0, 1, 2], ordered=True)
+ assert cat.ordered
+
+ @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
+ def test_constructor_imaginary(self):
+ values = [1, 2, 3 + 1j]
+ c1 = Categorical(values)
+ tm.assert_index_equal(c1.categories, Index(values))
+ tm.assert_numpy_array_equal(np.array(c1), np.array(values))
diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py
new file mode 100644
index 0000000000000..0a41b628bc057
--- /dev/null
+++ b/pandas/tests/categorical/test_dtypes.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas import Categorical, Index, CategoricalIndex
+
+
+class TestCategoricalDtypes(object):
+
+ def test_is_equal_dtype(self):
+
+ # test dtype comparisons between cats
+
+ c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
+ c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
+ c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
+ assert c1.is_dtype_equal(c1)
+ assert c2.is_dtype_equal(c2)
+ assert c3.is_dtype_equal(c3)
+ assert c1.is_dtype_equal(c2)
+ assert not c1.is_dtype_equal(c3)
+ assert not c1.is_dtype_equal(Index(list('aabca')))
+ assert not c1.is_dtype_equal(c1.astype(object))
+ assert c1.is_dtype_equal(CategoricalIndex(c1))
+ assert (c1.is_dtype_equal(
+ CategoricalIndex(c1, categories=list('cab'))))
+ assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
+
+ def test_set_dtype_same(self):
+ c = Categorical(['a', 'b', 'c'])
+ result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, c)
+
+ def test_set_dtype_new_categories(self):
+ c = Categorical(['a', 'b', 'c'])
+ result = c._set_dtype(CategoricalDtype(list('abcd')))
+ tm.assert_numpy_array_equal(result.codes, c.codes)
+ tm.assert_index_equal(result.dtype.categories, Index(list('abcd')))
+
+ @pytest.mark.parametrize('values, categories, new_categories', [
+ # No NaNs, same cats, same order
+ (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+ # Same, unsorted
+ (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+ # NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+ # Introduce NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+ # No overlap
+ (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_set_dtype_many(self, values, categories, new_categories,
+ ordered):
+ c = Categorical(values, categories)
+ expected = Categorical(values, new_categories, ordered)
+ result = c._set_dtype(expected.dtype)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_set_dtype_no_overlap(self):
+ c = Categorical(['a', 'b', 'c'], ['d', 'e'])
+ result = c._set_dtype(CategoricalDtype(['a', 'b']))
+ expected = Categorical([None, None, None], categories=['a', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_codes_dtypes(self):
+
+ # GH 8453
+ result = Categorical(['foo', 'bar', 'baz'])
+ assert result.codes.dtype == 'int8'
+
+ result = Categorical(['foo%05d' % i for i in range(400)])
+ assert result.codes.dtype == 'int16'
+
+ result = Categorical(['foo%05d' % i for i in range(40000)])
+ assert result.codes.dtype == 'int32'
+
+ # adding cats
+ result = Categorical(['foo', 'bar', 'baz'])
+ assert result.codes.dtype == 'int8'
+ result = result.add_categories(['foo%05d' % i for i in range(400)])
+ assert result.codes.dtype == 'int16'
+
+ # removing cats
+ result = result.remove_categories(['foo%05d' % i for i in range(300)])
+ assert result.codes.dtype == 'int8'
+
+ def test_astype_categorical(self):
+
+ cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ tm.assert_categorical_equal(cat, cat.astype('category'))
+ tm.assert_almost_equal(np.array(cat), cat.astype('object'))
+
+ pytest.raises(ValueError, lambda: cat.astype(float))
diff --git a/pandas/tests/categorical/test_indexing.py b/pandas/tests/categorical/test_indexing.py
new file mode 100644
index 0000000000000..9c27b1101e5ca
--- /dev/null
+++ b/pandas/tests/categorical/test_indexing.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import Categorical, Index, PeriodIndex
+from pandas.tests.categorical.common import TestCategorical
+
+
+class TestCategoricalIndexingWithFactor(TestCategorical):
+
+ def test_getitem(self):
+ assert self.factor[0] == 'a'
+ assert self.factor[-1] == 'c'
+
+ subf = self.factor[[0, 1, 2]]
+ tm.assert_numpy_array_equal(subf._codes,
+ np.array([0, 1, 1], dtype=np.int8))
+
+ subf = self.factor[np.asarray(self.factor) == 'c']
+ tm.assert_numpy_array_equal(subf._codes,
+ np.array([2, 2, 2], dtype=np.int8))
+
+ def test_setitem(self):
+
+ # int/positional
+ c = self.factor.copy()
+ c[0] = 'b'
+ assert c[0] == 'b'
+ c[-1] = 'a'
+ assert c[-1] == 'a'
+
+ # boolean
+ c = self.factor.copy()
+ indexer = np.zeros(len(c), dtype='bool')
+ indexer[0] = True
+ indexer[-1] = True
+ c[indexer] = 'c'
+ expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
+ ordered=True)
+
+ tm.assert_categorical_equal(c, expected)
+
+
+class TestCategoricalIndexing(object):
+
+ def test_getitem_listlike(self):
+
+ # GH 9469
+ # properly coerce the input indexers
+ np.random.seed(1)
+ c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
+ result = c.codes[np.array([100000]).astype(np.int64)]
+ expected = c[np.array([100000]).astype(np.int64)].codes
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_periodindex(self):
+ idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+ '2014-03', '2014-03'], freq='M')
+
+ cat1 = Categorical(idx1)
+ str(cat1)
+ exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
+ exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+ tm.assert_numpy_array_equal(cat1._codes, exp_arr)
+ tm.assert_index_equal(cat1.categories, exp_idx)
+
+ idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+ '2014-03', '2014-01'], freq='M')
+ cat2 = Categorical(idx2, ordered=True)
+ str(cat2)
+ exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
+ exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+ tm.assert_numpy_array_equal(cat2._codes, exp_arr)
+ tm.assert_index_equal(cat2.categories, exp_idx2)
+
+ idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
+ '2013-08', '2013-07', '2013-05'], freq='M')
+ cat3 = Categorical(idx3, ordered=True)
+ exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
+ exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
+ '2013-10', '2013-11', '2013-12'], freq='M')
+ tm.assert_numpy_array_equal(cat3._codes, exp_arr)
+ tm.assert_index_equal(cat3.categories, exp_idx)
+
+ def test_categories_assigments(self):
+ s = Categorical(["a", "b", "c", "a"])
+ exp = np.array([1, 2, 3, 1], dtype=np.int64)
+ s.categories = [1, 2, 3]
+ tm.assert_numpy_array_equal(s.__array__(), exp)
+ tm.assert_index_equal(s.categories, Index([1, 2, 3]))
+
+ # lengthen
+ def f():
+ s.categories = [1, 2, 3, 4]
+
+ pytest.raises(ValueError, f)
+
+ # shorten
+ def f():
+ s.categories = [1, 2]
+
+ pytest.raises(ValueError, f)
diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py
new file mode 100644
index 0000000000000..79758dee5cfda
--- /dev/null
+++ b/pandas/tests/categorical/test_missing.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import (Categorical, Index, isna)
+from pandas.compat import lrange
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+
+class TestCategoricalMissing(object):
+
+ def test_na_flags_int_categories(self):
+ # #1457
+
+ categories = lrange(10)
+ labels = np.random.randint(0, 10, 20)
+ labels[::5] = -1
+
+ cat = Categorical(labels, categories, fastpath=True)
+ repr(cat)
+
+ tm.assert_numpy_array_equal(isna(cat), labels == -1)
+
+ def test_nan_handling(self):
+
+ # Nans are represented as -1 in codes
+ c = Categorical(["a", "b", np.nan, "a"])
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+ c[1] = np.nan
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0],
+ dtype=np.int8))
+
+ # Adding nan to categories should make assigned nan point to the
+ # category!
+ c = Categorical(["a", "b", np.nan, "a"])
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+
+ def test_set_dtype_nans(self):
+ c = Categorical(['a', 'b', np.nan])
+ result = c._set_dtype(CategoricalDtype(['a', 'c']))
+ tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
+ dtype='int8'))
+
+ def test_set_item_nan(self):
+ cat = Categorical([1, 2, 3])
+ cat[1] = np.nan
+
+ exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
+ tm.assert_categorical_equal(cat, exp)
diff --git a/pandas/tests/categorical/test_operators.py b/pandas/tests/categorical/test_operators.py
new file mode 100644
index 0000000000000..09a0607b67a88
--- /dev/null
+++ b/pandas/tests/categorical/test_operators.py
@@ -0,0 +1,286 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas as pd
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import Categorical, Series, DataFrame, date_range
+from pandas.tests.categorical.common import TestCategorical
+
+
+class TestCategoricalOpsWithFactor(TestCategorical):
+
+ def test_categories_none_comparisons(self):
+ factor = Categorical(['a', 'b', 'b', 'a',
+ 'a', 'c', 'c', 'c'], ordered=True)
+ tm.assert_categorical_equal(factor, self.factor)
+
+ def test_comparisons(self):
+
+ result = self.factor[self.factor == 'a']
+ expected = self.factor[np.asarray(self.factor) == 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor != 'a']
+ expected = self.factor[np.asarray(self.factor) != 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor < 'c']
+ expected = self.factor[np.asarray(self.factor) < 'c']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor > 'a']
+ expected = self.factor[np.asarray(self.factor) > 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor >= 'b']
+ expected = self.factor[np.asarray(self.factor) >= 'b']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor <= 'b']
+ expected = self.factor[np.asarray(self.factor) <= 'b']
+ tm.assert_categorical_equal(result, expected)
+
+ n = len(self.factor)
+
+ other = self.factor[np.random.permutation(n)]
+ result = self.factor == other
+ expected = np.asarray(self.factor) == np.asarray(other)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = self.factor == 'd'
+ expected = np.repeat(False, len(self.factor))
+ tm.assert_numpy_array_equal(result, expected)
+
+ # comparisons with categoricals
+ cat_rev = Categorical(
+ ["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
+ cat_rev_base = Categorical(
+ ["b", "b", "b"], categories=["c", "b", "a"], ordered=True)
+ cat = Categorical(["a", "b", "c"], ordered=True)
+ cat_base = Categorical(
+ ["b", "b", "b"], categories=cat.categories, ordered=True)
+
+ # comparisons need to take categories ordering into account
+ res_rev = cat_rev > cat_rev_base
+ exp_rev = np.array([True, False, False])
+ tm.assert_numpy_array_equal(res_rev, exp_rev)
+
+ res_rev = cat_rev < cat_rev_base
+ exp_rev = np.array([False, False, True])
+ tm.assert_numpy_array_equal(res_rev, exp_rev)
+
+ res = cat > cat_base
+ exp = np.array([False, False, True])
+ tm.assert_numpy_array_equal(res, exp)
+
+ # Only categories with same categories can be compared
+ def f():
+ cat > cat_rev
+
+ pytest.raises(TypeError, f)
+
+ cat_rev_base2 = Categorical(
+ ["b", "b", "b"], categories=["c", "b", "a", "d"])
+
+ def f():
+ cat_rev > cat_rev_base2
+
+ pytest.raises(TypeError, f)
+
+ # Only categories with same ordering information can be compared
+ cat_unorderd = cat.set_ordered(False)
+ assert not (cat > cat).any()
+
+ def f():
+ cat > cat_unorderd
+
+ pytest.raises(TypeError, f)
+
+ # comparison (in both directions) with Series will raise
+ s = Series(["b", "b", "b"])
+ pytest.raises(TypeError, lambda: cat > s)
+ pytest.raises(TypeError, lambda: cat_rev > s)
+ pytest.raises(TypeError, lambda: s < cat)
+ pytest.raises(TypeError, lambda: s < cat_rev)
+
+ # comparison with numpy.array will raise in both direction, but only on
+ # newer numpy versions
+ a = np.array(["b", "b", "b"])
+ pytest.raises(TypeError, lambda: cat > a)
+ pytest.raises(TypeError, lambda: cat_rev > a)
+
+ # Make sure that unequal comparison take the categories order in
+ # account
+ cat_rev = Categorical(
+ list("abc"), categories=list("cba"), ordered=True)
+ exp = np.array([True, False, False])
+ res = cat_rev > "b"
+ tm.assert_numpy_array_equal(res, exp)
+
+
+class TestCategoricalOps(object):
+
+ def test_datetime_categorical_comparison(self):
+ dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
+ tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
+ np.array([False, True, True]))
+ tm.assert_numpy_array_equal(dt_cat[0] < dt_cat,
+ np.array([False, True, True]))
+
+ def test_reflected_comparison_with_scalars(self):
+ # GH8658
+ cat = Categorical([1, 2, 3], ordered=True)
+ tm.assert_numpy_array_equal(cat > cat[0],
+ np.array([False, True, True]))
+ tm.assert_numpy_array_equal(cat[0] < cat,
+ np.array([False, True, True]))
+
+ def test_comparison_with_unknown_scalars(self):
+ # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+ # and following comparisons with scalars not in categories should raise
+ # for unequal comps, but not for equal/not equal
+ cat = Categorical([1, 2, 3], ordered=True)
+
+ pytest.raises(TypeError, lambda: cat < 4)
+ pytest.raises(TypeError, lambda: cat > 4)
+ pytest.raises(TypeError, lambda: 4 < cat)
+ pytest.raises(TypeError, lambda: 4 > cat)
+
+ tm.assert_numpy_array_equal(cat == 4,
+ np.array([False, False, False]))
+ tm.assert_numpy_array_equal(cat != 4,
+ np.array([True, True, True]))
+
+ @pytest.mark.parametrize('data,reverse,base', [
+ (list("abc"), list("cba"), list("bbb")),
+ ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
+ )
+ def test_comparisons(self, data, reverse, base):
+ cat_rev = Series(
+ Categorical(data, categories=reverse, ordered=True))
+ cat_rev_base = Series(
+ Categorical(base, categories=reverse, ordered=True))
+ cat = Series(Categorical(data, ordered=True))
+ cat_base = Series(
+ Categorical(base, categories=cat.cat.categories, ordered=True))
+ s = Series(base)
+ a = np.array(base)
+
+ # comparisons need to take categories ordering into account
+ res_rev = cat_rev > cat_rev_base
+ exp_rev = Series([True, False, False])
+ tm.assert_series_equal(res_rev, exp_rev)
+
+ res_rev = cat_rev < cat_rev_base
+ exp_rev = Series([False, False, True])
+ tm.assert_series_equal(res_rev, exp_rev)
+
+ res = cat > cat_base
+ exp = Series([False, False, True])
+ tm.assert_series_equal(res, exp)
+
+ scalar = base[1]
+ res = cat > scalar
+ exp = Series([False, False, True])
+ exp2 = cat.values > scalar
+ tm.assert_series_equal(res, exp)
+ tm.assert_numpy_array_equal(res.values, exp2)
+ res_rev = cat_rev > scalar
+ exp_rev = Series([True, False, False])
+ exp_rev2 = cat_rev.values > scalar
+ tm.assert_series_equal(res_rev, exp_rev)
+ tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
+
+ # Only categories with same categories can be compared
+ def f():
+ cat > cat_rev
+
+ pytest.raises(TypeError, f)
+
+ # categorical cannot be compared to Series or numpy array, and also
+ # not the other way around
+ pytest.raises(TypeError, lambda: cat > s)
+ pytest.raises(TypeError, lambda: cat_rev > s)
+ pytest.raises(TypeError, lambda: cat > a)
+ pytest.raises(TypeError, lambda: cat_rev > a)
+
+ pytest.raises(TypeError, lambda: s < cat)
+ pytest.raises(TypeError, lambda: s < cat_rev)
+
+ pytest.raises(TypeError, lambda: a < cat)
+ pytest.raises(TypeError, lambda: a < cat_rev)
+
+ @pytest.mark.parametrize('ctor', [
+ lambda *args, **kwargs: Categorical(*args, **kwargs),
+ lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
+ ])
+ def test_unordered_different_order_equal(self, ctor):
+ # https://github.com/pandas-dev/pandas/issues/16014
+ c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
+ assert (c1 == c2).all()
+
+ c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
+ assert (c1 != c2).all()
+
+ c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
+ assert (c1 != c2).all()
+
+ c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
+ result = c1 == c2
+ tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
+
+ def test_unordered_different_categories_raises(self):
+ c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
+ with tm.assert_raises_regex(TypeError,
+ "Categoricals can only be compared"):
+ c1 == c2
+
+ def test_compare_different_lengths(self):
+ c1 = Categorical([], categories=['a', 'b'])
+ c2 = Categorical([], categories=['a'])
+ msg = "Categories are different lengths"
+ with tm.assert_raises_regex(TypeError, msg):
+ c1 == c2
+
+ def test_numeric_like_ops(self):
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ # numeric ops should not succeed
+ for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
+ pytest.raises(TypeError,
+ lambda: getattr(df, op)(df))
+
+ # reduction ops should not succeed (unless specifically defined, e.g.
+ # min/max)
+ s = df['value_group']
+ for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
+ pytest.raises(TypeError,
+ lambda: getattr(s, op)(numeric_only=False))
+
+ # mad technically works because it takes always the numeric data
+
+ # numpy ops
+ s = Series(Categorical([1, 2, 3, 4]))
+ pytest.raises(TypeError, lambda: np.sum(s))
+
+ # numeric ops on a Series
+ for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
+ pytest.raises(TypeError, lambda: getattr(s, op)(2))
+
+ # invalid ufunc
+ pytest.raises(TypeError, lambda: np.log(s))
diff --git a/pandas/tests/categorical/test_repr.py b/pandas/tests/categorical/test_repr.py
new file mode 100644
index 0000000000000..0cadf66b24d46
--- /dev/null
+++ b/pandas/tests/categorical/test_repr.py
@@ -0,0 +1,517 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pandas import (Categorical, Series, CategoricalIndex, date_range,
+ period_range, timedelta_range)
+from pandas.compat import u, PY3
+from pandas.core.config import option_context
+from pandas.tests.categorical.common import TestCategorical
+
+
+class TestCategoricalReprWithFactor(TestCategorical):
+
+ def test_print(self):
+ expected = ["[a, b, b, a, a, c, c, c]",
+ "Categories (3, object): [a < b < c]"]
+ expected = "\n".join(expected)
+ actual = repr(self.factor)
+ assert actual == expected
+
+
+class TestCategoricalRepr(object):
+
+ def test_big_print(self):
+ factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'],
+ fastpath=True)
+ expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600",
+ "Categories (3, object): [a, b, c]"]
+ expected = "\n".join(expected)
+
+ actual = repr(factor)
+
+ assert actual == expected
+
+ def test_empty_print(self):
+ factor = Categorical([], ["a", "b", "c"])
+ expected = ("[], Categories (3, object): [a, b, c]")
+ # hack because array_repr changed in numpy > 1.6.x
+ actual = repr(factor)
+ assert actual == expected
+
+ assert expected == actual
+ factor = Categorical([], ["a", "b", "c"], ordered=True)
+ expected = ("[], Categories (3, object): [a < b < c]")
+ actual = repr(factor)
+ assert expected == actual
+
+ factor = Categorical([], [])
+ expected = ("[], Categories (0, object): []")
+ assert expected == repr(factor)
+
+ def test_print_none_width(self):
+ # GH10087
+ a = Series(Categorical([1, 2, 3, 4]))
+ exp = u("0 1\n1 2\n2 3\n3 4\n" +
+ "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
+
+ with option_context("display.width", None):
+ assert exp == repr(a)
+
+ def test_unicode_print(self):
+ if PY3:
+ _rep = repr
+ else:
+ _rep = unicode # noqa
+
+ c = Categorical(['aaaaa', 'bb', 'cccc'] * 20)
+ expected = u"""\
+[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
+Length: 60
+Categories (3, object): [aaaaa, bb, cccc]"""
+
+ assert _rep(c) == expected
+
+ c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
+ expected = u"""\
+[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
+Length: 60
+Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
+
+ assert _rep(c) == expected
+
+ # unicode option should not affect to Categorical, as it doesn't care
+ # the repr width
+ with option_context('display.unicode.east_asian_width', True):
+
+ c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
+ expected = u"""[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
+Length: 60
+Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
+
+ assert _rep(c) == expected
+
+ def test_categorical_repr(self):
+ c = Categorical([1, 2, 3])
+ exp = """[1, 2, 3]
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
+ exp = """[1, 2, 3, 1, 2, 3]
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 4, 5] * 10)
+ exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
+Length: 50
+Categories (5, int64): [1, 2, 3, 4, 5]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(np.arange(20))
+ exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
+Length: 20
+Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_ordered(self):
+ c = Categorical([1, 2, 3], ordered=True)
+ exp = """[1, 2, 3]
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
+ exp = """[1, 2, 3, 1, 2, 3]
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
+ exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
+Length: 50
+Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(np.arange(20), ordered=True)
+ exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
+Length: 20
+Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx)
+
+ # TODO(wesm): exceeding 80 characters in the console is not good
+ # behavior
+ exp = (
+ "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
+ "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
+ "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
+ " 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]""")
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = (
+ "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
+ "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]\n"
+ "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
+ " 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]")
+
+ assert repr(c) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ c = Categorical(idx)
+ exp = (
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
+ "2011-01-01 13:00:00-05:00]\n"
+ "Categories (5, datetime64[ns, US/Eastern]): "
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
+ " "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
+ " "
+ "2011-01-01 13:00:00-05:00]")
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = (
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
+ "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
+ "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
+ "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
+ "Categories (5, datetime64[ns, US/Eastern]): "
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
+ " "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
+ " "
+ "2011-01-01 13:00:00-05:00]")
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_datetime_ordered(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_period(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ c = Categorical(idx)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ c = Categorical(idx)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = timedelta_range('1 hours', periods=20)
+ c = Categorical(idx)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 20
+Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
+ 18 days 01:00:00, 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 40
+Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
+ 18 days 01:00:00, 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = timedelta_range('1 hours', periods=20)
+ c = Categorical(idx, ordered=True)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 20
+Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
+ 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 40
+Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
+ 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_index_repr(self):
+ idx = CategoricalIndex(Categorical([1, 2, 3]))
+ exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa
+ assert repr(idx) == exp
+
+ i = CategoricalIndex(Categorical(np.arange(10)))
+ exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_ordered(self):
+ i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
+ exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(np.arange(10), ordered=True))
+ exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
+ '2011-01-01 11:00:00', '2011-01-01 12:00:00',
+ '2011-01-01 13:00:00'],
+ categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_datetime_ordered(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
+ '2011-01-01 11:00:00', '2011-01-01 12:00:00',
+ '2011-01-01 13:00:00'],
+ categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
+ '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
+ '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_period(self):
+ # test all length
+ idx = period_range('2011-01-01 09:00', freq='H', periods=1)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=2)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=3)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(idx.append(idx)))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
+ '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
+ '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
+ '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
+ '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
+ '9 days 01:00:00'],
+ categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
+ '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
+ '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
+ '9 days 01:00:00'],
+ categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
diff --git a/pandas/tests/categorical/test_sorting.py b/pandas/tests/categorical/test_sorting.py
new file mode 100644
index 0000000000000..88edb6c8f1348
--- /dev/null
+++ b/pandas/tests/categorical/test_sorting.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import Categorical, Index
+
+
+class TestCategoricalSort(object):
+
+ def test_argsort(self):
+ c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+ expected = np.array([2, 4, 1, 3, 0])
+ tm.assert_numpy_array_equal(c.argsort(ascending=True), expected,
+ check_dtype=False)
+
+ expected = expected[::-1]
+ tm.assert_numpy_array_equal(c.argsort(ascending=False), expected,
+ check_dtype=False)
+
+ def test_numpy_argsort(self):
+ c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+ expected = np.array([2, 4, 1, 3, 0])
+ tm.assert_numpy_array_equal(np.argsort(c), expected,
+ check_dtype=False)
+
+ tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected,
+ check_dtype=False)
+
+ msg = "the 'axis' parameter is not supported"
+ tm.assert_raises_regex(ValueError, msg, np.argsort,
+ c, axis=0)
+
+ msg = "the 'order' parameter is not supported"
+ tm.assert_raises_regex(ValueError, msg, np.argsort,
+ c, order='C')
+
+ def test_sort_values(self):
+
+ # unordered cats are sortable
+ cat = Categorical(["a", "b", "b", "a"], ordered=False)
+ cat.sort_values()
+
+ cat = Categorical(["a", "c", "b", "d"], ordered=True)
+
+ # sort_values
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ cat = Categorical(["a", "c", "b", "d"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ res = cat.sort_values(ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ # sort (inplace order)
+ cat1 = cat.copy()
+ cat1.sort_values(inplace=True)
+ exp = np.array(["a", "b", "c", "d"], dtype=object)
+ tm.assert_numpy_array_equal(cat1.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ # reverse
+ cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
+ res = cat.sort_values(ascending=False)
+ exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ def test_sort_values_na_position(self):
+ # see gh-12882
+ cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
+ exp_categories = Index([2, 5])
+
+ exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+ res = cat.sort_values() # default arguments
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
+ res = cat.sort_values(ascending=True, na_position='first')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
+ res = cat.sort_values(ascending=False, na_position='first')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+ res = cat.sort_values(ascending=True, na_position='last')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
+ res = cat.sort_values(ascending=False, na_position='last')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+ res = cat.sort_values(ascending=False, na_position='last')
+ exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+ res = cat.sort_values(ascending=False, na_position='first')
+ exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
diff --git a/pandas/tests/categorical/test_subclass.py b/pandas/tests/categorical/test_subclass.py
new file mode 100644
index 0000000000000..4060d2ebf633a
--- /dev/null
+++ b/pandas/tests/categorical/test_subclass.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+
+from pandas import Categorical
+
+import pandas.util.testing as tm
+
+
+class TestCategoricalSubclassing(object):
+
+ def test_constructor(self):
+ sc = tm.SubclassedCategorical(['a', 'b', 'c'])
+ assert isinstance(sc, tm.SubclassedCategorical)
+ tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))
+
+ def test_from_codes(self):
+ sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
+ assert isinstance(sc, tm.SubclassedCategorical)
+ exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
+ tm.assert_categorical_equal(sc, exp)
+
+ def test_map(self):
+ sc = tm.SubclassedCategorical(['a', 'b', 'c'])
+ res = sc.map(lambda x: x.upper())
+ assert isinstance(res, tm.SubclassedCategorical)
+ exp = Categorical(['A', 'B', 'C'])
+ tm.assert_categorical_equal(res, exp)
diff --git a/pandas/tests/categorical/test_warnings.py b/pandas/tests/categorical/test_warnings.py
new file mode 100644
index 0000000000000..91278580254aa
--- /dev/null
+++ b/pandas/tests/categorical/test_warnings.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas.util.testing as tm
+
+
+class TestCategoricalWarnings(object):
+ def test_tab_complete_warning(self, ip):
+ # https://github.com/pandas-dev/pandas/issues/16409
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.completer import provisionalcompleter
+
+ code = "import pandas as pd; c = Categorical([])"
+ ip.run_code(code)
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('c.', 1))
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 8086eda1fd58a..4bba6d7601ae8 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -15,7 +15,7 @@
from pandas.compat import lrange, product
from pandas import (compat, isna, notna, DataFrame, Series,
- MultiIndex, date_range, Timestamp)
+ MultiIndex, date_range, Timestamp, Categorical)
import pandas as pd
import pandas.core.nanops as nanops
import pandas.core.algorithms as algorithms
@@ -306,6 +306,36 @@ def test_describe_bool_frame(self):
index=['count', 'unique', 'top', 'freq'])
tm.assert_frame_equal(result, expected)
+ def test_describe_categorical(self):
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+ cat = df
+
+ # Categoricals should not show up together with numerical columns
+ result = cat.describe()
+ assert len(result.columns) == 1
+
+ # In a frame, describe() for the cat should be the same as for string
+ # arrays (count, unique, top, freq)
+
+ cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
+ ordered=True)
+ s = Series(cat)
+ result = s.describe()
+ expected = Series([4, 2, "b", 3],
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_series_equal(result, expected)
+
+ cat = Series(Categorical(["a", "b", "c", "c"]))
+ df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
+ res = df3.describe()
+ tm.assert_numpy_array_equal(res["cat"].values, res["s"].values)
+
def test_describe_categorical_columns(self):
# GH 11558
columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 3d2bee9e01d34..7e952a87688bc 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -15,7 +15,7 @@
from numpy.random import randn
import numpy as np
-from pandas import DataFrame, Series, date_range, timedelta_range
+from pandas import DataFrame, Series, date_range, timedelta_range, Categorical
import pandas as pd
from pandas.util.testing import (assert_almost_equal,
@@ -240,6 +240,29 @@ def test_itertuples(self):
assert not hasattr(tup3, '_fields')
assert isinstance(tup3, tuple)
+ def test_sequence_like_with_categorical(self):
+
+ # GH 7839
+ # make sure can iterate
+ df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
+ "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
+ df['grade'] = Categorical(df['raw_grade'])
+
+ # basic sequencing testing
+ result = list(df.grade.values)
+ expected = np.array(df.grade.values).tolist()
+ tm.assert_almost_equal(result, expected)
+
+ # iteration
+ for t in df.itertuples(index=False):
+ str(t)
+
+ for row, s in df.iterrows():
+ str(s)
+
+ for c, col in df.iteritems():
+ str(s)
+
def test_len(self):
assert len(self.frame) == len(self.frame.index)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 8fd196bfc4d2a..22ad2258e70bc 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -19,7 +19,7 @@
from pandas import compat
from pandas import (DataFrame, Index, Series, isna,
MultiIndex, Timedelta, Timestamp,
- date_range)
+ date_range, Categorical)
import pandas as pd
import pandas._libs.lib as lib
import pandas.util.testing as tm
@@ -1562,6 +1562,79 @@ def test_constructor_lists_to_object_dtype(self):
assert d['a'].dtype == np.object_
assert not d['a'][1]
+ def test_constructor_categorical(self):
+
+ # GH8626
+
+ # dict creation
+ df = DataFrame({'A': list('abc')}, dtype='category')
+ expected = Series(list('abc'), dtype='category', name='A')
+ tm.assert_series_equal(df['A'], expected)
+
+ # to_frame
+ s = Series(list('abc'), dtype='category')
+ result = s.to_frame()
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(result[0], expected)
+ result = s.to_frame(name='foo')
+ expected = Series(list('abc'), dtype='category', name='foo')
+ tm.assert_series_equal(result['foo'], expected)
+
+ # list-like creation
+ df = DataFrame(list('abc'), dtype='category')
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(df[0], expected)
+
+ # ndim != 1
+ df = DataFrame([Categorical(list('abc'))])
+ expected = DataFrame({0: Series(list('abc'), dtype='category')})
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))])
+ expected = DataFrame({0: Series(list('abc'), dtype='category'),
+ 1: Series(list('abd'), dtype='category')},
+ columns=[0, 1])
+ tm.assert_frame_equal(df, expected)
+
+ # mixed
+ df = DataFrame([Categorical(list('abc')), list('def')])
+ expected = DataFrame({0: Series(list('abc'), dtype='category'),
+ 1: list('def')}, columns=[0, 1])
+ tm.assert_frame_equal(df, expected)
+
+ # invalid (shape)
+ pytest.raises(ValueError,
+ lambda: DataFrame([Categorical(list('abc')),
+ Categorical(list('abdefg'))]))
+
+ # ndim > 1
+ pytest.raises(NotImplementedError,
+ lambda: Categorical(np.array([list('abcd')])))
+
+ def test_constructor_categorical_series(self):
+
+ l = [1, 2, 3, 1]
+ exp = Series(l).astype('category')
+ res = Series(l, dtype='category')
+ tm.assert_series_equal(res, exp)
+
+ l = ["a", "b", "c", "a"]
+ exp = Series(l).astype('category')
+ res = Series(l, dtype='category')
+ tm.assert_series_equal(res, exp)
+
+ # insert into frame with different index
+ # GH 8076
+ index = date_range('20000101', periods=3)
+ expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
+ categories=['a', 'b', 'c']))
+ expected.index = index
+
+ expected = DataFrame({'x': expected})
+ df = DataFrame(
+ {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index)
+ tm.assert_frame_equal(df, expected)
+
def test_from_records_to_records(self):
# from numpy documentation
arr = np.zeros((2,), dtype=('i4,f4,a10'))
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
index 7d2d18db8d41c..024de8bc13f72 100644
--- a/pandas/tests/frame/test_convert_to.py
+++ b/pandas/tests/frame/test_convert_to.py
@@ -151,6 +151,27 @@ def test_to_records_with_unicode_column_names(self):
)
tm.assert_almost_equal(result, expected)
+ def test_to_records_with_categorical(self):
+
+ # GH8626
+
+ # dict creation
+ df = DataFrame({'A': list('abc')}, dtype='category')
+ expected = Series(list('abc'), dtype='category', name='A')
+ tm.assert_series_equal(df['A'], expected)
+
+ # list-like creation
+ df = DataFrame(list('abc'), dtype='category')
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(df[0], expected)
+
+ # to record array
+ # this coerces
+ result = df.to_records()
+ expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
+ dtype=[('index', '=i8'), ('0', 'O')])
+ tm.assert_almost_equal(result, expected)
+
@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 7591f1f1459be..610b9f7bdbf6c 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -619,6 +619,13 @@ def test_astype_duplicate_col(self):
expected = concat([a1_str, b, a2_str], axis=1)
assert_frame_equal(result, expected)
+ @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
+ def test_categorical_astype_ndim_raises(self, columns):
+ # GH 18004
+ msg = '> 1 ndim Categorical are not supported at this time'
+ with tm.assert_raises_regex(NotImplementedError, msg):
+ DataFrame(columns=columns).astype('category')
+
@pytest.mark.parametrize("cls", [
pd.api.types.CategoricalDtype,
pd.api.types.DatetimeTZDtype,
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
index 33a3495dcd71e..62bc0eada9d89 100644
--- a/pandas/tests/frame/test_indexing.py
+++ b/pandas/tests/frame/test_indexing.py
@@ -17,7 +17,9 @@
import pandas.core.common as com
from pandas import (DataFrame, Index, Series, notna, isna,
MultiIndex, DatetimeIndex, Timestamp,
- date_range)
+ date_range, Categorical)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
import pandas as pd
from pandas._libs.tslib import iNaT
@@ -3075,3 +3077,372 @@ def test_transpose(self):
expected = DataFrame(self.df.values.T)
expected.index = ['A', 'B']
assert_frame_equal(result, expected)
+
+
+class TestDataFrameIndexingCategorical(object):
+
+ def test_assignment(self):
+ # assignment
+ df = DataFrame({'value': np.array(
+ np.random.randint(0, 10000, 100), dtype='int32')})
+ labels = Categorical(["{0} - {1}".format(i, i + 499)
+ for i in range(0, 10000, 500)])
+
+ df = df.sort_values(by=['value'], ascending=True)
+ s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
+ d = s.values
+ df['D'] = d
+ str(df)
+
+ result = df.dtypes
+ expected = Series(
+ [np.dtype('int32'), CategoricalDtype(categories=labels,
+ ordered=False)],
+ index=['value', 'D'])
+ tm.assert_series_equal(result, expected)
+
+ df['E'] = s
+ str(df)
+
+ result = df.dtypes
+ expected = Series([np.dtype('int32'),
+ CategoricalDtype(categories=labels, ordered=False),
+ CategoricalDtype(categories=labels, ordered=False)],
+ index=['value', 'D', 'E'])
+ tm.assert_series_equal(result, expected)
+
+ result1 = df['D']
+ result2 = df['E']
+ tm.assert_categorical_equal(result1._data._block.values, d)
+
+ # sorting
+ s.name = 'E'
+ tm.assert_series_equal(result2.sort_index(), s.sort_index())
+
+ cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
+ df = DataFrame(Series(cat))
+
+ def test_assigning_ops(self):
+ # systematically test the assigning operations:
+ # for all slicing ops:
+ # for value in categories and value not in categories:
+
+ # - assign a single value -> exp_single_cats_value
+
+ # - assign a complete row (mixed values) -> exp_single_row
+
+ # assign multiple rows (mixed values) (-> array) -> exp_multi_row
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+
+ cats = Categorical(["a", "a", "a", "a", "a", "a", "a"],
+ categories=["a", "b"])
+ idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values = [1, 1, 1, 1, 1, 1, 1]
+ orig = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ # the expected values
+ # changed single row
+ cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"],
+ categories=["a", "b"])
+ idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values1 = [1, 1, 2, 1, 1, 1, 1]
+ exp_single_row = DataFrame({"cats": cats1,
+ "values": values1}, index=idx1)
+
+ # changed multiple rows
+ cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"],
+ categories=["a", "b"])
+ idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values2 = [1, 1, 2, 2, 1, 1, 1]
+ exp_multi_row = DataFrame({"cats": cats2,
+ "values": values2}, index=idx2)
+
+ # changed part of the cats column
+ cats3 = Categorical(
+ ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
+ idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values3 = [1, 1, 1, 1, 1, 1, 1]
+ exp_parts_cats_col = DataFrame({"cats": cats3,
+ "values": values3}, index=idx3)
+
+ # changed single value in cats col
+ cats4 = Categorical(
+ ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
+ idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values4 = [1, 1, 1, 1, 1, 1, 1]
+ exp_single_cats_value = DataFrame({"cats": cats4,
+ "values": values4}, index=idx4)
+
+ # iloc
+ # ###############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.iloc[2, 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.iloc[df.index == "j", 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ def f():
+ df = orig.copy()
+ df.iloc[2, 0] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.iloc[2, :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ def f():
+ df = orig.copy()
+ df.iloc[2, :] = ["c", 2]
+
+ pytest.raises(ValueError, f)
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.iloc[2:4, :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ def f():
+ df = orig.copy()
+ df.iloc[2:4, :] = [["c", 2], ["c", 2]]
+
+ pytest.raises(ValueError, f)
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(list('bb'), categories=list('abc'))
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(list('cc'), categories=list('abc'))
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.iloc[2:4, 0] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.iloc[2:4, 0] = ["c", "c"]
+
+ # loc
+ # ##############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.loc["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.loc[df.index == "j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ def f():
+ df = orig.copy()
+ df.loc["j", "cats"] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.loc["j", :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ def f():
+ df = orig.copy()
+ df.loc["j", :] = ["c", 2]
+
+ pytest.raises(ValueError, f)
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ def f():
+ df = orig.copy()
+ df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+ pytest.raises(ValueError, f)
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["b", "b"], categories=["a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["c", "c"], categories=["a", "b", "c"])
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.loc["j":"k", "cats"] = ["c", "c"]
+
+ # loc
+ # ##############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.loc["j", df.columns[0]] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.loc[df.index == "j", df.columns[0]] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ def f():
+ df = orig.copy()
+ df.loc["j", df.columns[0]] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.loc["j", :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ def f():
+ df = orig.copy()
+ df.loc["j", :] = ["c", 2]
+
+ pytest.raises(ValueError, f)
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ def f():
+ df = orig.copy()
+ df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+ pytest.raises(ValueError, f)
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["b", "b"], categories=["a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["c", "c"], categories=["a", "b", "c"])
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.loc["j":"k", df.columns[0]] = ["c", "c"]
+
+ # iat
+ df = orig.copy()
+ df.iat[2, 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ def f():
+ df = orig.copy()
+ df.iat[2, 0] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # at
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.at["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ def f():
+ df = orig.copy()
+ df.at["j", "cats"] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # fancy indexing
+ catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"],
+ categories=["a", "b", "c"])
+ idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
+ valuesf = [1, 1, 3, 3, 1, 1, 1]
+ df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
+
+ exp_fancy = exp_multi_row.copy()
+ exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
+
+ df[df["cats"] == "c"] = ["b", 2]
+ # category c is kept in .categories
+ tm.assert_frame_equal(df, exp_fancy)
+
+ # set_value
+ df = orig.copy()
+ df.at["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ def f():
+ df = orig.copy()
+ df.at["j", "cats"] = "c"
+
+ pytest.raises(ValueError, f)
+
+ # Assigning a Category to parts of a int/... column uses the values of
+ # the Catgorical
+ df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
+ exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
+ df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
+ df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp)
+
+ def test_functions_no_warnings(self):
+ df = DataFrame({'value': np.random.randint(0, 100, 20)})
+ labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
+ with tm.assert_produces_warning(False):
+ df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
+ labels=labels)
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 37f8c0cc85b23..8c46dc30a0f5f 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -10,7 +10,8 @@
import numpy as np
import pytest
-from pandas import (DataFrame, compat, option_context)
+from pandas import (DataFrame, Series, compat, option_context,
+ date_range, period_range, Categorical)
from pandas.compat import StringIO, lrange, u, PYPY
import pandas.io.formats.format as fmt
import pandas as pd
@@ -471,3 +472,34 @@ def test_info_categorical(self):
buf = StringIO()
df.info(buf=buf)
+
+ def test_info_categorical_column(self):
+
+ # make sure it works
+ n = 2500
+ df = DataFrame({'int64': np.random.randint(100, size=n)})
+ df['category'] = Series(np.array(list('abcdefghij')).take(
+ np.random.randint(0, 10, size=n))).astype('category')
+ df.isna()
+ buf = StringIO()
+ df.info(buf=buf)
+
+ df2 = df[df['category'] == 'd']
+ buf = compat.StringIO()
+ df2.info(buf=buf)
+
+ def test_repr_categorical_dates_periods(self):
+ # normal DataFrame
+ dt = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ p = period_range('2011-01', freq='M', periods=5)
+ df = DataFrame({'dt': dt, 'p': p})
+ exp = """ dt p
+0 2011-01-01 09:00:00-05:00 2011-01
+1 2011-01-01 10:00:00-05:00 2011-02
+2 2011-01-01 11:00:00-05:00 2011-03
+3 2011-01-01 12:00:00-05:00 2011-04
+4 2011-01-01 13:00:00-05:00 2011-05"""
+
+ df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
+ assert repr(df) == exp
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index fdc03acd3e931..c73423921898d 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -17,6 +17,142 @@
class TestGroupByCategorical(MixIn):
+ def test_groupby(self):
+
+ cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+ exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
+ expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
+ result = data.groupby("b").mean()
+ tm.assert_frame_equal(result, expected)
+
+ raw_cat1 = Categorical(["a", "a", "b", "b"],
+ categories=["a", "b", "z"], ordered=True)
+ raw_cat2 = Categorical(["c", "d", "c", "d"],
+ categories=["c", "d", "y"], ordered=True)
+ df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
+
+ # single grouper
+ gb = df.groupby("A")
+ exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
+ expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)})
+ result = gb.sum()
+ tm.assert_frame_equal(result, expected)
+
+ # multiple groupers
+ gb = df.groupby(['A', 'B'])
+ exp_index = pd.MultiIndex.from_product(
+ [Categorical(["a", "b", "z"], ordered=True),
+ Categorical(["c", "d", "y"], ordered=True)],
+ names=['A', 'B'])
+ expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan,
+ np.nan, np.nan, np.nan]},
+ index=exp_index)
+ result = gb.sum()
+ tm.assert_frame_equal(result, expected)
+
+ # multiple groupers with a non-cat
+ df = df.copy()
+ df['C'] = ['foo', 'bar'] * 2
+ gb = df.groupby(['A', 'B', 'C'])
+ exp_index = pd.MultiIndex.from_product(
+ [Categorical(["a", "b", "z"], ordered=True),
+ Categorical(["c", "d", "y"], ordered=True),
+ ['foo', 'bar']],
+ names=['A', 'B', 'C'])
+ expected = DataFrame({'values': Series(
+ np.nan, index=exp_index)}).sort_index()
+ expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4]
+ result = gb.sum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 8623
+ x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
+ [1, 'John P. Doe']],
+ columns=['person_id', 'person_name'])
+ x['person_name'] = Categorical(x.person_name)
+
+ g = x.groupby(['person_id'])
+ result = g.transform(lambda x: x)
+ tm.assert_frame_equal(result, x[['person_name']])
+
+ result = x.drop_duplicates('person_name')
+ expected = x.iloc[[0, 1]]
+ tm.assert_frame_equal(result, expected)
+
+ def f(x):
+ return x.drop_duplicates('person_name').iloc[0]
+
+ result = g.apply(f)
+ expected = x.iloc[[0, 1]].copy()
+ expected.index = Index([1, 2], name='person_id')
+ expected['person_name'] = expected['person_name'].astype('object')
+ tm.assert_frame_equal(result, expected)
+
+ # GH 9921
+ # Monotonic
+ df = DataFrame({"a": [5, 15, 25]})
+ c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
+
+ result = df.a.groupby(c).transform(sum)
+ tm.assert_series_equal(result, df['a'])
+
+ tm.assert_series_equal(
+ df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
+ tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+ tm.assert_frame_equal(
+ df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']])
+
+ # Filter
+ tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a'])
+ tm.assert_frame_equal(df.groupby(c).filter(np.all), df)
+
+ # Non-monotonic
+ df = DataFrame({"a": [5, 15, 25, -5]})
+ c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
+
+ result = df.a.groupby(c).transform(sum)
+ tm.assert_series_equal(result, df['a'])
+
+ tm.assert_series_equal(
+ df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
+ tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+ tm.assert_frame_equal(
+ df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
+
+ # GH 9603
+ df = DataFrame({'a': [1, 0, 0, 0]})
+ c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
+ result = df.groupby(c).apply(len)
+
+ exp_index = CategoricalIndex(
+ c.values.categories, ordered=c.values.ordered)
+ expected = Series([1, 0, 0, 0], index=exp_index)
+ expected.index.name = 'a'
+ tm.assert_series_equal(result, expected)
+
+ def test_groupby_sort(self):
+
+ # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
+ # This should result in a properly sorted Series so that the plot
+ # has a sorted x axis
+ # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ res = df.groupby(['value_group'])['value_group'].count()
+ exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
+ exp.index = CategoricalIndex(exp.index, name=exp.index.name)
+ tm.assert_series_equal(res, exp)
+
def test_level_groupby_get_group(self):
# GH15155
df = DataFrame(data=np.arange(2, 22, 2),
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 22b3fd9073bab..f2182687d047f 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -3,12 +3,15 @@
import pytest
import pandas as pd
+import pandas.compat as compat
import numpy as np
-from pandas import (Series, DataFrame, Timestamp,
- Categorical, CategoricalIndex)
+from pandas import (Series, DataFrame, Timestamp, Categorical,
+ CategoricalIndex, Interval, Index)
from pandas.util.testing import assert_series_equal, assert_frame_equal
from pandas.util import testing as tm
+from pandas.core.dtypes.common import is_categorical_dtype
from pandas.api.types import CategoricalDtype as CDT
+from pandas.core.dtypes.dtypes import CategoricalDtype
class TestCategoricalIndex(object):
@@ -75,6 +78,229 @@ def test_getitem_scalar(self):
result = s[cats[0]]
assert result == expected
+ def test_slicing_directly(self):
+ cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
+ sliced = cat[3]
+ assert sliced == "d"
+ sliced = cat[3:5]
+ expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd'])
+ tm.assert_numpy_array_equal(sliced._codes, expected._codes)
+ tm.assert_index_equal(sliced.categories, expected.categories)
+
+ def test_slicing(self):
+ cat = Series(Categorical([1, 2, 3, 4]))
+ reversed = cat[::-1]
+ exp = np.array([4, 3, 2, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(reversed.__array__(), exp)
+
+ df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
+ df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
+
+ expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
+ result = df.iloc[10]
+ tm.assert_series_equal(result, expected)
+
+ expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
+ index=np.arange(10, 20).astype('int64'))
+ expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
+ result = df.iloc[10:20]
+ tm.assert_frame_equal(result, expected)
+
+ expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
+ result = df.loc[8]
+ tm.assert_series_equal(result, expected)
+
+ def test_slicing_and_getting_ops(self):
+
+ # systematically test the slicing operations:
+ # for all slicing ops:
+ # - returning a dataframe
+ # - returning a column
+ # - returning a row
+ # - returning a single value
+
+ cats = Categorical(
+ ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"])
+ idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values = [1, 2, 3, 4, 5, 6, 7]
+ df = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ # the expected values
+ cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
+ idx2 = Index(["j", "k"])
+ values2 = [3, 4]
+
+ # 2:4,: | "j":"k",:
+ exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
+
+ # :,"cats" | :,0
+ exp_col = Series(cats, index=idx, name='cats')
+
+ # "j",: | 2,:
+ exp_row = Series(["b", 3], index=["cats", "values"], dtype="object",
+ name="j")
+
+ # "j","cats | 2,0
+ exp_val = "b"
+
+ # iloc
+ # frame
+ res_df = df.iloc[2:4, :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.iloc[2, :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.iloc[:, 0]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.iloc[2, 0]
+ assert res_val == exp_val
+
+ # loc
+ # frame
+ res_df = df.loc["j":"k", :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.loc["j", :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.loc[:, "cats"]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.loc["j", "cats"]
+ assert res_val == exp_val
+
+ # ix
+ # frame
+ # res_df = df.loc["j":"k",[0,1]] # doesn't work?
+ res_df = df.loc["j":"k", :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.loc["j", :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.loc[:, "cats"]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.loc["j", df.columns[0]]
+ assert res_val == exp_val
+
+ # iat
+ res_val = df.iat[2, 0]
+ assert res_val == exp_val
+
+ # at
+ res_val = df.at["j", "cats"]
+ assert res_val == exp_val
+
+ # fancy indexing
+ exp_fancy = df.iloc[[2]]
+
+ res_fancy = df[df["cats"] == "b"]
+ tm.assert_frame_equal(res_fancy, exp_fancy)
+ res_fancy = df[df["values"] == 3]
+ tm.assert_frame_equal(res_fancy, exp_fancy)
+
+ # get_value
+ res_val = df.at["j", "cats"]
+ assert res_val == exp_val
+
+ # i : int, slice, or sequence of integers
+ res_row = df.iloc[2]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ res_df = df.iloc[slice(2, 4)]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_df = df.iloc[[2, 3]]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_col = df.iloc[:, 0]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ res_df = df.iloc[:, slice(0, 2)]
+ tm.assert_frame_equal(res_df, df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_df = df.iloc[:, [0, 1]]
+ tm.assert_frame_equal(res_df, df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ def test_slicing_doc_examples(self):
+
+ # GH 7918
+ cats = Categorical(["a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c"])
+ idx = Index(["h", "i", "j", "k", "l", "m", "n", ])
+ values = [1, 2, 2, 2, 3, 4, 5]
+ df = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ result = df.iloc[2:4, :]
+ expected = DataFrame(
+ {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']),
+ "values": [2, 2]}, index=['j', 'k'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[2:4, :].dtypes
+ expected = Series(['category', 'int64'], ['cats', 'values'])
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc["h":"j", "cats"]
+ expected = Series(Categorical(['a', 'b', 'b'],
+ categories=['a', 'b', 'c']),
+ index=['h', 'i', 'j'], name='cats')
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc["h":"j", df.columns[0:1]]
+ expected = DataFrame({'cats': Categorical(['a', 'b', 'b'],
+ categories=['a', 'b', 'c'])},
+ index=['h', 'i', 'j'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_getitem_category_type(self):
+ # GH 14580
+ # test iloc() on Series with Categorical data
+
+ s = Series([1, 2, 3]).astype('category')
+
+ # get slice
+ result = s.iloc[0:2]
+ expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
+ # get list of indexes
+ result = s.iloc[[0, 1]]
+ expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
+ # get boolean array
+ result = s.iloc[[True, False, False]]
+ expected = Series([1]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
def test_loc_listlike(self):
# list of labels
@@ -217,6 +443,21 @@ def test_getitem_with_listlike(self):
result = dummies[[c for c in dummies.columns]]
assert_frame_equal(result, expected)
+ def test_setitem_listlike(self):
+
+ # GH 9469
+ # properly coerce the input indexers
+ np.random.seed(1)
+ c = Categorical(np.random.randint(0, 5, size=150000).astype(
+ np.int8)).add_categories([-1000])
+ indexer = np.array([100000]).astype(np.int64)
+ c[indexer] = -1000
+
+ # we are asserting the code result here
+ # which maps to the -1000 category
+ result = c.codes[np.array([100000]).astype(np.int64)]
+ tm.assert_numpy_array_equal(result, np.array([5], dtype='int8'))
+
def test_ix_categorical_index(self):
# GH 12531
df = DataFrame(np.random.randn(3, 3),
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index cd0701e3864fc..6f2d2ce2a8583 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1504,6 +1504,57 @@ def test_basic(self, left, right):
index=['X', 'Y', 'Z'])
assert_series_equal(result, expected)
+ def test_merge_categorical(self):
+ # GH 9426
+
+ right = DataFrame({'c': {0: 'a',
+ 1: 'b',
+ 2: 'c',
+ 3: 'd',
+ 4: 'e'},
+ 'd': {0: 'null',
+ 1: 'null',
+ 2: 'null',
+ 3: 'null',
+ 4: 'null'}})
+ left = DataFrame({'a': {0: 'f',
+ 1: 'f',
+ 2: 'f',
+ 3: 'f',
+ 4: 'f'},
+ 'b': {0: 'g',
+ 1: 'g',
+ 2: 'g',
+ 3: 'g',
+ 4: 'g'}})
+ df = pd.merge(left, right, how='left', left_on='b', right_on='c')
+
+ # object-object
+ expected = df.copy()
+
+ # object-cat
+ # note that we propagate the category
+ # because we don't have any matching rows
+ cright = right.copy()
+ cright['d'] = cright['d'].astype('category')
+ result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+ expected['d'] = expected['d'].astype(CategoricalDtype(['null']))
+ tm.assert_frame_equal(result, expected)
+
+ # cat-object
+ cleft = left.copy()
+ cleft['b'] = cleft['b'].astype('category')
+ result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+ tm.assert_frame_equal(result, expected)
+
+ # cat-cat
+ cright = right.copy()
+ cright['d'] = cright['d'].astype('category')
+ cleft = left.copy()
+ cleft['b'] = cleft['b'].astype('category')
+ result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+ tm.assert_frame_equal(result, expected)
+
def test_other_columns(self, left, right):
# non-merge columns should preserve if possible
right = right.assign(Z=right.Z.astype('category'))
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index ae41502f237f1..bdbf2a0ee2f68 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -10,7 +10,8 @@
from pandas import (DataFrame, concat,
read_csv, isna, Series, date_range,
Index, Panel, MultiIndex, Timestamp,
- DatetimeIndex)
+ DatetimeIndex, Categorical)
+from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.util import testing as tm
from pandas.util.testing import (assert_frame_equal,
makeCustomDataframe as mkdf)
@@ -1927,6 +1928,144 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
result_no_copy = pd.concat(example_dict, names=['testname'])
tm.assert_frame_equal(result_no_copy, expected)
+ def test_categorical_concat_append(self):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ vals = [1, 2]
+ df = DataFrame({"cats": cat, "vals": vals})
+ cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
+ vals2 = [1, 2, 1, 2]
+ exp = DataFrame({"cats": cat2, "vals": vals2},
+ index=Index([0, 1, 0, 1]))
+
+ tm.assert_frame_equal(pd.concat([df, df]), exp)
+ tm.assert_frame_equal(df.append(df), exp)
+
+ # GH 13524 can concat different categories
+ cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
+ vals3 = [1, 2]
+ df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
+
+ res = pd.concat([df, df_different_categories], ignore_index=True)
+ exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]})
+ tm.assert_frame_equal(res, exp)
+
+ res = df.append(df_different_categories, ignore_index=True)
+ tm.assert_frame_equal(res, exp)
+
+ def test_categorical_concat_dtypes(self):
+
+ # GH8143
+ index = ['cat', 'obj', 'num']
+ cat = Categorical(['a', 'b', 'c'])
+ obj = Series(['a', 'b', 'c'])
+ num = Series([1, 2, 3])
+ df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
+
+ result = df.dtypes == 'object'
+ expected = Series([False, True, False], index=index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.dtypes == 'int64'
+ expected = Series([False, False, True], index=index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.dtypes == 'category'
+ expected = Series([True, False, False], index=index)
+ tm.assert_series_equal(result, expected)
+
+ def test_categorical_concat(self):
+ # See GH 10177
+ df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
+ columns=["a", "b", "c"])
+
+ df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
+ columns=["a", "c"])
+
+ cat_values = ["one", "one", "two", "one", "two", "two", "one"]
+ df2['h'] = Series(Categorical(cat_values))
+
+ res = pd.concat((df1, df2), axis=0, ignore_index=True)
+ exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
+ 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan,
+ np.nan, np.nan, np.nan, np.nan],
+ 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
+ 'h': [None] * 6 + cat_values})
+ tm.assert_frame_equal(res, exp)
+
+ def test_categorical_concat_gh7864(self):
+ # GH 7864
+ # make sure ordering is preserverd
+ df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')})
+ df["grade"] = Categorical(df["raw_grade"])
+ df['grade'].cat.set_categories(['e', 'a', 'b'])
+
+ df1 = df[0:3]
+ df2 = df[3:]
+
+ tm.assert_index_equal(df['grade'].cat.categories,
+ df1['grade'].cat.categories)
+ tm.assert_index_equal(df['grade'].cat.categories,
+ df2['grade'].cat.categories)
+
+ dfx = pd.concat([df1, df2])
+ tm.assert_index_equal(df['grade'].cat.categories,
+ dfx['grade'].cat.categories)
+
+ dfa = df1.append(df2)
+ tm.assert_index_equal(df['grade'].cat.categories,
+ dfa['grade'].cat.categories)
+
+ def test_categorical_concat_preserve(self):
+
+ # GH 8641 series concat not preserving category dtype
+ # GH 13524 can concat different categories
+ s = Series(list('abc'), dtype='category')
+ s2 = Series(list('abd'), dtype='category')
+
+ exp = Series(list('abcabd'))
+ res = pd.concat([s, s2], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(list('abcabc'), dtype='category')
+ res = pd.concat([s, s], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2],
+ dtype='category')
+ res = pd.concat([s, s])
+ tm.assert_series_equal(res, exp)
+
+ a = Series(np.arange(6, dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A': a,
+ 'B': b.astype(CategoricalDtype(list('cab')))})
+ res = pd.concat([df2, df2])
+ exp = DataFrame(
+ {'A': pd.concat([a, a]),
+ 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))})
+ tm.assert_frame_equal(res, exp)
+
+ def test_categorical_index_preserver(self):
+
+ a = Series(np.arange(6, dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A': a,
+ 'B': b.astype(CategoricalDtype(list('cab')))
+ }).set_index('B')
+ result = pd.concat([df2, df2])
+ expected = DataFrame(
+ {'A': pd.concat([a, a]),
+ 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))
+ }).set_index('B')
+ tm.assert_frame_equal(result, expected)
+
+ # wrong catgories
+ df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe'))
+ }).set_index('B')
+ pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
+
def test_concat_categoricalindex(self):
# GH 16111, categories that aren't lexsorted
categories = [9, 0, 1, 2, 3]
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 857116c8f8f78..c8b7ae044b71c 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -9,7 +9,7 @@
from collections import OrderedDict
import pandas as pd
from pandas import (DataFrame, Series, Index, MultiIndex,
- Grouper, date_range, concat)
+ Grouper, date_range, concat, Categorical)
from pandas.core.reshape.pivot import pivot_table, crosstab
from pandas.compat import range, product
import pandas.util.testing as tm
@@ -91,6 +91,24 @@ def test_pivot_table_dropna(self):
tm.assert_index_equal(pv_col.columns, m)
tm.assert_index_equal(pv_ind.index, m)
+ def test_pivot_table_categorical(self):
+
+ raw_cat1 = Categorical(["a", "a", "b", "b"],
+ categories=["a", "b", "z"], ordered=True)
+ raw_cat2 = Categorical(["c", "d", "c", "d"],
+ categories=["c", "d", "y"], ordered=True)
+ df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
+ result = pd.pivot_table(df, values='values', index=['A', 'B'])
+
+ exp_index = pd.MultiIndex.from_product(
+ [Categorical(["a", "b", "z"], ordered=True),
+ Categorical(["c", "d", "y"], ordered=True)],
+ names=['A', 'B'])
+ expected = DataFrame(
+ {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]},
+ index=exp_index)
+ tm.assert_frame_equal(result, expected)
+
def test_pivot_table_dropna_categoricals(self):
# GH 15193
categories = ['a', 'b', 'c', 'd']
diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
index 3e68ff7cf2f59..f472c6ae9383c 100644
--- a/pandas/tests/reshape/test_reshape.py
+++ b/pandas/tests/reshape/test_reshape.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
+from warnings import catch_warnings
import pytest
from pandas import DataFrame, Series
@@ -11,7 +12,7 @@
from pandas.util.testing import assert_frame_equal
-from pandas import get_dummies
+from pandas import get_dummies, Categorical, Index
import pandas.util.testing as tm
from pandas.compat import u
@@ -454,6 +455,63 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
tm.assert_frame_equal(result, expected)
+class TestCategoricalReshape(object):
+
+ def test_reshaping_panel_categorical(self):
+
+ with catch_warnings(record=True):
+ p = tm.makePanel()
+ p['str'] = 'foo'
+ df = p.to_frame()
+
+ df['category'] = df['str'].astype('category')
+ result = df['category'].unstack()
+
+ c = Categorical(['foo'] * len(p.major_axis))
+ expected = DataFrame({'A': c.copy(),
+ 'B': c.copy(),
+ 'C': c.copy(),
+ 'D': c.copy()},
+ columns=Index(list('ABCD'), name='minor'),
+ index=p.major_axis.set_names('major'))
+ tm.assert_frame_equal(result, expected)
+
+ def test_reshape_categorical(self):
+ cat = Categorical([], categories=["a", "b"])
+ tm.assert_produces_warning(FutureWarning, cat.reshape, 0)
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical([], categories=["a", "b"])
+ tm.assert_categorical_equal(cat.reshape(0), cat)
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical([], categories=["a", "b"])
+ tm.assert_categorical_equal(cat.reshape((5, -1)), cat)
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ tm.assert_categorical_equal(cat.reshape(cat.shape), cat)
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ tm.assert_categorical_equal(cat.reshape(cat.size), cat)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ msg = "can only specify one unknown dimension"
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ tm.assert_raises_regex(ValueError, msg, cat.reshape, (-2, -1))
+
+ def test_reshape_categorical_numpy(self):
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ tm.assert_categorical_equal(np.reshape(cat, cat.shape), cat)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ msg = "the 'order' parameter is not supported"
+ tm.assert_raises_regex(ValueError, msg, np.reshape,
+ cat, cat.shape, order='F')
+
+
class TestMakeAxisDummies(object):
def test_preserve_categorical_dtype(self):
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index caaa122024cba..289b5c01c1263 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -11,7 +11,8 @@
import pandas as pd
from pandas import (Series, Categorical, DataFrame, isna, notna,
- bdate_range, date_range, _np_version_under1p10)
+ bdate_range, date_range, _np_version_under1p10,
+ CategoricalIndex)
from pandas.core.index import MultiIndex
from pandas.core.indexes.datetimes import Timestamp
from pandas.core.indexes.timedeltas import Timedelta
@@ -1865,3 +1866,218 @@ def test_n(self, n):
result = s.nsmallest(n)
expected = s.sort_values().head(n)
assert_series_equal(result, expected)
+
+
+class TestCategoricalSeriesAnalytics(object):
+
+ def test_count(self):
+
+ s = Series(Categorical([np.nan, 1, 2, np.nan],
+ categories=[5, 4, 3, 2, 1], ordered=True))
+ result = s.count()
+ assert result == 2
+
+ def test_min_max(self):
+ # unordered cats have no min/max
+ cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
+ pytest.raises(TypeError, lambda: cat.min())
+ pytest.raises(TypeError, lambda: cat.max())
+
+ cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "a"
+ assert _max == "d"
+
+ cat = Series(Categorical(["a", "b", "c", "d"], categories=[
+ 'd', 'c', 'b', 'a'], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "d"
+ assert _max == "a"
+
+ cat = Series(Categorical(
+ [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
+ ], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == "b"
+
+ cat = Series(Categorical(
+ [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == 1
+
+ def test_mode(self):
+ s = Series(Categorical([1, 1, 2, 4, 5, 5, 5],
+ categories=[5, 4, 3, 2, 1], ordered=True))
+ res = s.mode()
+ exp = Series(Categorical([5], categories=[
+ 5, 4, 3, 2, 1], ordered=True))
+ tm.assert_series_equal(res, exp)
+ s = Series(Categorical([1, 1, 1, 4, 5, 5, 5],
+ categories=[5, 4, 3, 2, 1], ordered=True))
+ res = s.mode()
+ exp = Series(Categorical([5, 1], categories=[
+ 5, 4, 3, 2, 1], ordered=True))
+ tm.assert_series_equal(res, exp)
+ s = Series(Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
+ ordered=True))
+ res = s.mode()
+ exp = Series(Categorical([5, 4, 3, 2, 1], categories=[5, 4, 3, 2, 1],
+ ordered=True))
+ tm.assert_series_equal(res, exp)
+
+ def test_value_counts(self):
+ # GH 12835
+ cats = Categorical(list('abcccb'), categories=list('cabd'))
+ s = Series(cats, name='xxx')
+ res = s.value_counts(sort=False)
+
+ exp_index = CategoricalIndex(list('cabd'), categories=cats.categories)
+ exp = Series([3, 1, 2, 0], name='xxx', index=exp_index)
+ tm.assert_series_equal(res, exp)
+
+ res = s.value_counts(sort=True)
+
+ exp_index = CategoricalIndex(list('cbad'), categories=cats.categories)
+ exp = Series([3, 2, 1, 0], name='xxx', index=exp_index)
+ tm.assert_series_equal(res, exp)
+
+ # check object dtype handles the Series.name as the same
+ # (tested in test_base.py)
+ s = Series(["a", "b", "c", "c", "c", "b"], name='xxx')
+ res = s.value_counts()
+ exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"])
+ tm.assert_series_equal(res, exp)
+
+ def test_value_counts_with_nan(self):
+ # see gh-9443
+
+ # sanity check
+ s = Series(["a", "b", "a"], dtype="category")
+ exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+ # same Series via two different constructions --> same behaviour
+ series = [
+ Series(["a", "b", None, "a", None, None], dtype="category"),
+ Series(Categorical(["a", "b", None, "a", None, None],
+ categories=["a", "b"]))
+ ]
+
+ for s in series:
+ # None is a NaN value, so we exclude its count here
+ exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+ # we don't exclude the count of None and sort by counts
+ exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
+ res = s.value_counts(dropna=False)
+ tm.assert_series_equal(res, exp)
+
+ # When we aren't sorting by counts, and np.nan isn't a
+ # category, it should be last.
+ exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
+ res = s.value_counts(dropna=False, sort=False)
+ tm.assert_series_equal(res, exp)
+
+ @pytest.mark.parametrize(
+ "dtype",
+ ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
+ pytest.param("datetime64[D]",
+ marks=pytest.mark.xfail(reason="issue7996"))]
+ )
+ @pytest.mark.parametrize("is_ordered", [True, False])
+ def test_drop_duplicates_categorical_non_bool(self, dtype, is_ordered):
+ cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
+
+ # Test case 1
+ input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
+ tc1 = Series(Categorical(input1, categories=cat_array,
+ ordered=is_ordered))
+
+ expected = Series([False, False, False, True])
+ tm.assert_series_equal(tc1.duplicated(), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ expected = Series([False, False, True, False])
+ tm.assert_series_equal(tc1.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(keep='last'),
+ tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ expected = Series([False, False, True, True])
+ tm.assert_series_equal(tc1.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ # Test case 2
+ input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
+ tc2 = Series(Categorical(
+ input2, categories=cat_array, ordered=is_ordered)
+ )
+
+ expected = Series([False, False, False, False, True, True, False])
+ tm.assert_series_equal(tc2.duplicated(), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
+
+ expected = Series([False, True, True, False, False, False, False])
+ tm.assert_series_equal(tc2.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(keep='last'),
+ tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
+
+ expected = Series([False, True, True, False, True, True, False])
+ tm.assert_series_equal(tc2.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
+
+ @pytest.mark.parametrize("is_ordered", [True, False])
+ def test_drop_duplicates_categorical_bool(self, is_ordered):
+ tc = Series(Categorical([True, False, True, False],
+ categories=[True, False], ordered=is_ordered))
+
+ expected = Series([False, False, True, True])
+ tm.assert_series_equal(tc.duplicated(), expected)
+ tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+ expected = Series([True, True, False, False])
+ tm.assert_series_equal(tc.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+ expected = Series([True, True, True, True])
+ tm.assert_series_equal(tc.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index 9aae40e1b8dbb..2898ace27f535 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -11,7 +11,8 @@
from pandas.core.indexes.datetimes import Timestamp
from pandas.compat import range
-from pandas import compat
+from pandas import (compat, Categorical, period_range, timedelta_range,
+ DatetimeIndex, PeriodIndex, TimedeltaIndex)
import pandas.io.formats.printing as printing
from pandas.util.testing import (assert_series_equal,
ensure_clean)
@@ -234,6 +235,21 @@ def test_tab_completion(self):
assert 'str' not in dir(s)
assert 'dt' in dir(s) # as it is a datetime categorical
+ def test_tab_completion_with_categorical(self):
+ # test the tab completion display
+ ok_for_cat = ['categories', 'codes', 'ordered', 'set_categories',
+ 'add_categories', 'remove_categories',
+ 'rename_categories', 'reorder_categories',
+ 'remove_unused_categories', 'as_ordered', 'as_unordered']
+
+ def get_dir(s):
+ results = [r for r in s.cat.__dir__() if not r.startswith('_')]
+ return list(sorted(set(results)))
+
+ s = Series(list('aabbcde')).astype('category')
+ results = get_dir(s)
+ tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))
+
def test_not_hashable(self):
s_empty = Series()
s = Series([1])
@@ -427,3 +443,277 @@ def test_tab_complete_warning(self, ip):
with tm.assert_produces_warning(None):
with provisionalcompleter('ignore'):
list(ip.Completer.completions('s.', 1))
+
+
+class TestCategoricalSeries(object):
+
+ @pytest.mark.parametrize(
+ "method",
+ [
+ lambda x: x.cat.set_categories([1, 2, 3]),
+ lambda x: x.cat.reorder_categories([2, 3, 1], ordered=True),
+ lambda x: x.cat.rename_categories([1, 2, 3]),
+ lambda x: x.cat.remove_unused_categories(),
+ lambda x: x.cat.remove_categories([2]),
+ lambda x: x.cat.add_categories([4]),
+ lambda x: x.cat.as_ordered(),
+ lambda x: x.cat.as_unordered(),
+ ])
+ def test_getname_categorical_accessor(self, method):
+ # GH 17509
+ s = Series([1, 2, 3], name='A').astype('category')
+ expected = 'A'
+ result = method(s).name
+ assert result == expected
+
+ def test_cat_accessor(self):
+ s = Series(Categorical(["a", "b", np.nan, "a"]))
+ tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+ assert not s.cat.ordered, False
+
+ exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
+ s.cat.set_categories(["b", "a"], inplace=True)
+ tm.assert_categorical_equal(s.values, exp)
+
+ res = s.cat.set_categories(["b", "a"])
+ tm.assert_categorical_equal(res.values, exp)
+
+ s[:] = "a"
+ s = s.cat.remove_unused_categories()
+ tm.assert_index_equal(s.cat.categories, Index(["a"]))
+
+ def test_cat_accessor_api(self):
+ # GH 9322
+ from pandas.core.categorical import CategoricalAccessor
+ assert Series.cat is CategoricalAccessor
+ s = Series(list('aabbcde')).astype('category')
+ assert isinstance(s.cat, CategoricalAccessor)
+
+ invalid = Series([1])
+ with tm.assert_raises_regex(AttributeError,
+ "only use .cat accessor"):
+ invalid.cat
+ assert not hasattr(invalid, 'cat')
+
+ def test_cat_accessor_no_new_attributes(self):
+ # https://github.com/pandas-dev/pandas/issues/10673
+ c = Series(list('aabbcde')).astype('category')
+ with tm.assert_raises_regex(AttributeError,
+ "You cannot add any new attribute"):
+ c.cat.xlabel = "a"
+
+ def test_categorical_delegations(self):
+
+ # invalid accessor
+ pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat)
+ tm.assert_raises_regex(
+ AttributeError,
+ r"Can only use .cat accessor with a 'category' dtype",
+ lambda: Series([1, 2, 3]).cat)
+ pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat)
+ pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat)
+ pytest.raises(AttributeError,
+ lambda: Series([Timestamp('20130101')]).cat)
+
+ # Series should delegate calls to '.categories', '.codes', '.ordered'
+ # and the methods '.set_categories()' 'drop_unused_categories()' to the
+ # categorical# -*- coding: utf-8 -*-
+ s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
+ exp_categories = Index(["a", "b", "c"])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ s.cat.categories = [1, 2, 3]
+ exp_categories = Index([1, 2, 3])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+
+ exp_codes = Series([0, 1, 2, 0], dtype='int8')
+ tm.assert_series_equal(s.cat.codes, exp_codes)
+
+ assert s.cat.ordered
+ s = s.cat.as_unordered()
+ assert not s.cat.ordered
+ s.cat.as_ordered(inplace=True)
+ assert s.cat.ordered
+
+ # reorder
+ s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
+ exp_categories = Index(["c", "b", "a"])
+ exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ s = s.cat.set_categories(["c", "b", "a"])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
+ tm.assert_numpy_array_equal(s.__array__(), exp_values)
+
+ # remove unused categories
+ s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"
+ ]))
+ exp_categories = Index(["a", "b"])
+ exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
+ s = s.cat.remove_unused_categories()
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
+ tm.assert_numpy_array_equal(s.__array__(), exp_values)
+
+ # This method is likely to be confused, so test that it raises an error
+ # on wrong inputs:
+ def f():
+ s.set_categories([4, 3, 2, 1])
+
+ pytest.raises(Exception, f)
+ # right: s.cat.set_categories([4,3,2,1])
+
+ def test_str_accessor_api_for_categorical(self):
+ # https://github.com/pandas-dev/pandas/issues/10661
+ from pandas.core.strings import StringMethods
+ s = Series(list('aabb'))
+ s = s + " " + s
+ c = s.astype('category')
+ assert isinstance(c.str, StringMethods)
+
+ # str functions, which need special arguments
+ special_func_defs = [
+ ('cat', (list("zyxw"),), {"sep": ","}),
+ ('center', (10,), {}),
+ ('contains', ("a",), {}),
+ ('count', ("a",), {}),
+ ('decode', ("UTF-8",), {}),
+ ('encode', ("UTF-8",), {}),
+ ('endswith', ("a",), {}),
+ ('extract', ("([a-z]*) ",), {"expand": False}),
+ ('extract', ("([a-z]*) ",), {"expand": True}),
+ ('extractall', ("([a-z]*) ",), {}),
+ ('find', ("a",), {}),
+ ('findall', ("a",), {}),
+ ('index', (" ",), {}),
+ ('ljust', (10,), {}),
+ ('match', ("a"), {}), # deprecated...
+ ('normalize', ("NFC",), {}),
+ ('pad', (10,), {}),
+ ('partition', (" ",), {"expand": False}), # not default
+ ('partition', (" ",), {"expand": True}), # default
+ ('repeat', (3,), {}),
+ ('replace', ("a", "z"), {}),
+ ('rfind', ("a",), {}),
+ ('rindex', (" ",), {}),
+ ('rjust', (10,), {}),
+ ('rpartition', (" ",), {"expand": False}), # not default
+ ('rpartition', (" ",), {"expand": True}), # default
+ ('slice', (0, 1), {}),
+ ('slice_replace', (0, 1, "z"), {}),
+ ('split', (" ",), {"expand": False}), # default
+ ('split', (" ",), {"expand": True}), # not default
+ ('startswith', ("a",), {}),
+ ('wrap', (2,), {}),
+ ('zfill', (10,), {})
+ ]
+ _special_func_names = [f[0] for f in special_func_defs]
+
+ # * get, join: they need a individual elements of type lists, but
+ # we can't make a categorical with lists as individual categories.
+ # -> `s.str.split(" ").astype("category")` will error!
+ # * `translate` has different interfaces for py2 vs. py3
+ _ignore_names = ["get", "join", "translate"]
+
+ str_func_names = [f for f in dir(s.str) if not (
+ f.startswith("_") or
+ f in _special_func_names or
+ f in _ignore_names)]
+
+ func_defs = [(f, (), {}) for f in str_func_names]
+ func_defs.extend(special_func_defs)
+
+ for func, args, kwargs in func_defs:
+ res = getattr(c.str, func)(*args, **kwargs)
+ exp = getattr(s.str, func)(*args, **kwargs)
+
+ if isinstance(res, DataFrame):
+ tm.assert_frame_equal(res, exp)
+ else:
+ tm.assert_series_equal(res, exp)
+
+ invalid = Series([1, 2, 3]).astype('category')
+ with tm.assert_raises_regex(AttributeError,
+ "Can only use .str "
+ "accessor with string"):
+ invalid.str
+ assert not hasattr(invalid, 'str')
+
+ def test_dt_accessor_api_for_categorical(self):
+ # https://github.com/pandas-dev/pandas/issues/10661
+ from pandas.core.indexes.accessors import Properties
+
+ s_dr = Series(date_range('1/1/2015', periods=5, tz="MET"))
+ c_dr = s_dr.astype("category")
+
+ s_pr = Series(period_range('1/1/2015', freq='D', periods=5))
+ c_pr = s_pr.astype("category")
+
+ s_tdr = Series(timedelta_range('1 days', '10 days'))
+ c_tdr = s_tdr.astype("category")
+
+ # only testing field (like .day)
+ # and bool (is_month_start)
+ get_ops = lambda x: x._datetimelike_ops
+
+ test_data = [
+ ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr),
+ ("Period", get_ops(PeriodIndex), s_pr, c_pr),
+ ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr)]
+
+ assert isinstance(c_dr.dt, Properties)
+
+ special_func_defs = [
+ ('strftime', ("%Y-%m-%d",), {}),
+ ('tz_convert', ("EST",), {}),
+ ('round', ("D",), {}),
+ ('floor', ("D",), {}),
+ ('ceil', ("D",), {}),
+ ('asfreq', ("D",), {}),
+ # ('tz_localize', ("UTC",), {}),
+ ]
+ _special_func_names = [f[0] for f in special_func_defs]
+
+ # the series is already localized
+ _ignore_names = ['tz_localize', 'components']
+
+ for name, attr_names, s, c in test_data:
+ func_names = [f
+ for f in dir(s.dt)
+ if not (f.startswith("_") or f in attr_names or f in
+ _special_func_names or f in _ignore_names)]
+
+ func_defs = [(f, (), {}) for f in func_names]
+ for f_def in special_func_defs:
+ if f_def[0] in dir(s.dt):
+ func_defs.append(f_def)
+
+ for func, args, kwargs in func_defs:
+ res = getattr(c.dt, func)(*args, **kwargs)
+ exp = getattr(s.dt, func)(*args, **kwargs)
+
+ if isinstance(res, DataFrame):
+ tm.assert_frame_equal(res, exp)
+ elif isinstance(res, Series):
+ tm.assert_series_equal(res, exp)
+ else:
+ tm.assert_almost_equal(res, exp)
+
+ for attr in attr_names:
+ try:
+ res = getattr(c.dt, attr)
+ exp = getattr(s.dt, attr)
+ except Exception as e:
+ print(name, attr)
+ raise e
+
+ if isinstance(res, DataFrame):
+ tm.assert_frame_equal(res, exp)
+ elif isinstance(res, Series):
+ tm.assert_series_equal(res, exp)
+ else:
+ tm.assert_almost_equal(res, exp)
+
+ invalid = Series([1, 2, 3]).astype('category')
+ with tm.assert_raises_regex(
+ AttributeError, "Can only use .dt accessor with datetimelike"):
+ invalid.dt
+ assert not hasattr(invalid, 'str')
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index c814cade77e5c..f9842514ed5e5 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -17,7 +17,7 @@
is_datetime64tz_dtype)
from pandas import (Index, Series, isna, date_range, Timestamp,
NaT, period_range, timedelta_range, MultiIndex,
- IntervalIndex)
+ IntervalIndex, Categorical, DataFrame)
from pandas._libs import lib
from pandas._libs.tslib import iNaT
@@ -184,6 +184,60 @@ def test_constructor_categorical(self):
assert is_categorical_dtype(s)
assert is_categorical_dtype(s.dtype)
+ def test_constructor_categorical_with_coercion(self):
+ factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ # test basic creation / coercion of categoricals
+ s = Series(factor, name='A')
+ assert s.dtype == 'category'
+ assert len(s) == len(factor)
+ str(s.values)
+ str(s)
+
+ # in a frame
+ df = DataFrame({'A': factor})
+ result = df['A']
+ tm.assert_series_equal(result, s)
+ result = df.iloc[:, 0]
+ tm.assert_series_equal(result, s)
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ df = DataFrame({'A': s})
+ result = df['A']
+ tm.assert_series_equal(result, s)
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ # multiples
+ df = DataFrame({'A': s, 'B': s, 'C': 1})
+ result1 = df['A']
+ result2 = df['B']
+ tm.assert_series_equal(result1, s)
+ tm.assert_series_equal(result2, s, check_names=False)
+ assert result2.name == 'B'
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ # GH8623
+ x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
+ [1, 'John P. Doe']],
+ columns=['person_id', 'person_name'])
+ x['person_name'] = Categorical(x.person_name
+ ) # doing this breaks transform
+
+ expected = x.iloc[0].person_name
+ result = x.person_name.iloc[0]
+ assert result == expected
+
+ result = x.person_name[0]
+ assert result == expected
+
+ result = x.person_name.loc[0]
+ assert result == expected
+
def test_constructor_categorical_dtype(self):
result = pd.Series(['a', 'b'],
dtype=CategoricalDtype(['a', 'b', 'c'],
@@ -197,6 +251,40 @@ def test_constructor_categorical_dtype(self):
tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
assert result.cat.ordered is False
+ def test_categorical_sideeffects_free(self):
+ # Passing a categorical to a Series and then changing values in either
+ # the series or the categorical should not change the values in the
+ # other one, IF you specify copy!
+ cat = Categorical(["a", "b", "c", "a"])
+ s = Series(cat, copy=True)
+ assert s.cat is not cat
+ s.cat.categories = [1, 2, 3]
+ exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+ exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ # setting
+ s[0] = 2
+ exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ # however, copy is False by default
+ # so this WILL change values
+ cat = Categorical(["a", "b", "c", "a"])
+ s = Series(cat)
+ assert s.values is cat
+ s.cat.categories = [1, 2, 3]
+ exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_s)
+
+ s[0] = 2
+ exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_s2)
+
def test_unordered_compare_equal(self):
left = pd.Series(['a', 'b', 'c'],
dtype=CategoricalDtype(['a', 'b']))
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index 163950b75bc34..12d0267005f19 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -10,6 +10,7 @@
import warnings
from numpy import nan
+import pandas as pd
import numpy as np
from pandas import (
@@ -229,6 +230,98 @@ def test_astype_categories_deprecation(self):
result = s.astype('category', categories=['a', 'b'], ordered=True)
tm.assert_series_equal(result, expected)
+ def test_astype_from_categorical(self):
+ l = ["a", "b", "c", "a"]
+ s = Series(l)
+ exp = Series(Categorical(l))
+ res = s.astype('category')
+ tm.assert_series_equal(res, exp)
+
+ l = [1, 2, 3, 1]
+ s = Series(l)
+ exp = Series(Categorical(l))
+ res = s.astype('category')
+ tm.assert_series_equal(res, exp)
+
+ df = DataFrame({"cats": [1, 2, 3, 4, 5, 6],
+ "vals": [1, 2, 3, 4, 5, 6]})
+ cats = Categorical([1, 2, 3, 4, 5, 6])
+ exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
+ df["cats"] = df["cats"].astype("category")
+ tm.assert_frame_equal(exp_df, df)
+
+ df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
+ "vals": [1, 2, 3, 4, 5, 6]})
+ cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
+ exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
+ df["cats"] = df["cats"].astype("category")
+ tm.assert_frame_equal(exp_df, df)
+
+ # with keywords
+ l = ["a", "b", "c", "a"]
+ s = Series(l)
+ exp = Series(Categorical(l, ordered=True))
+ res = s.astype(CategoricalDtype(None, ordered=True))
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(Categorical(l, categories=list('abcdef'), ordered=True))
+ res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
+ tm.assert_series_equal(res, exp)
+
+ def test_astype_categorical_to_other(self):
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ s = df['value_group']
+ expected = s
+ tm.assert_series_equal(s.astype('category'), expected)
+ tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
+ pytest.raises(ValueError, lambda: s.astype('float64'))
+
+ cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
+ exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ tm.assert_series_equal(cat.astype('str'), exp)
+ s2 = Series(Categorical(['1', '2', '3', '4']))
+ exp2 = Series([1, 2, 3, 4]).astype(int)
+ tm.assert_series_equal(s2.astype('int'), exp2)
+
+ # object don't sort correctly, so just compare that we have the same
+ # values
+ def cmp(a, b):
+ tm.assert_almost_equal(
+ np.sort(np.unique(a)), np.sort(np.unique(b)))
+
+ expected = Series(np.array(s.values), name='value_group')
+ cmp(s.astype('object'), expected)
+ cmp(s.astype(np.object_), expected)
+
+ # array conversion
+ tm.assert_almost_equal(np.array(s), np.array(s.values))
+
+ # valid conversion
+ for valid in [lambda x: x.astype('category'),
+ lambda x: x.astype(CategoricalDtype()),
+ lambda x: x.astype('object').astype('category'),
+ lambda x: x.astype('object').astype(
+ CategoricalDtype())
+ ]:
+
+ result = valid(s)
+ # compare series values
+ # internal .categories can't be compared because it is sorted
+ tm.assert_series_equal(result, s, check_categorical=False)
+
+ # invalid conversion (these are NOT a dtype)
+ for invalid in [lambda x: x.astype(Categorical),
+ lambda x: x.astype('object').astype(Categorical)]:
+ pytest.raises(TypeError, lambda: invalid(s))
+
def test_astype_categoricaldtype(self):
s = Series(['a', 'b', 'a'])
result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index c0ef5a3694bf3..b7381c6b8fd30 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -13,7 +13,8 @@
from pandas.core.dtypes.common import is_integer, is_scalar
from pandas import (Index, Series, DataFrame, isna,
date_range, NaT, MultiIndex,
- Timestamp, DatetimeIndex, Timedelta)
+ Timestamp, DatetimeIndex, Timedelta,
+ Categorical)
from pandas.core.indexing import IndexingError
from pandas.tseries.offsets import BDay
from pandas._libs import tslib, lib
@@ -2237,6 +2238,31 @@ def test_reindex_fill_value(self):
expected = Series([False, True, False], index=[1, 2, 3])
assert_series_equal(result, expected)
+ def test_reindex_categorical(self):
+
+ index = date_range('20000101', periods=3)
+
+ # reindexing to an invalid Categorical
+ s = Series(['a', 'b', 'c'], dtype='category')
+ result = s.reindex(index)
+ expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
+ categories=['a', 'b', 'c']))
+ expected.index = index
+ tm.assert_series_equal(result, expected)
+
+ # partial reindexing
+ expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b',
+ 'c']))
+ expected.index = [1, 2]
+ result = s.reindex([1, 2])
+ tm.assert_series_equal(result, expected)
+
+ expected = Series(Categorical(
+ values=['c', np.nan], categories=['a', 'b', 'c']))
+ expected.index = [2, 3]
+ result = s.reindex([2, 3])
+ tm.assert_series_equal(result, expected)
+
def test_rename(self):
# GH 17407
@@ -2337,6 +2363,41 @@ def test_setitem_slice_into_readonly_backing_data(self):
assert not array.any()
+ def test_categorial_assigning_ops(self):
+ orig = Series(Categorical(["b", "b"], categories=["a", "b"]))
+ s = orig.copy()
+ s[:] = "a"
+ exp = Series(Categorical(["a", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[1] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[s.index > 0] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[[False, True]] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.index = ["x", "y"]
+ s["y"] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]),
+ index=["x", "y"])
+ tm.assert_series_equal(s, exp)
+
+ # ensure that one can set something to np.nan
+ s = Series(Categorical([1, 2, 3]))
+ exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
+ s[1] = np.nan
+ tm.assert_series_equal(s, exp)
+
class TestTimeSeriesDuplicates(object):
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 594049d3d3bb9..b23dc37016b69 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -365,6 +365,20 @@ def test_fillna_raise(self):
with pytest.raises(ValueError):
s.fillna(1, limit=limit, method=method)
+ def test_categorical_nan_equality(self):
+ cat = Series(Categorical(["a", "b", "c", np.nan]))
+ exp = Series([True, True, True, False])
+ res = (cat == cat)
+ tm.assert_series_equal(res, exp)
+
+ def test_categorical_nan_handling(self):
+
+ # NaNs are represented as -1 in labels
+ s = Series(Categorical(["a", "b", np.nan, "a"]))
+ tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(s.values.codes,
+ np.array([0, 1, -1, 0], dtype=np.int8))
+
@pytest.mark.parametrize('fill_value, expected_output', [
('a', ['a', 'a', 'b', 'a', 'a']),
({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']),
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index 0d064983fb546..6cc866a35514f 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -14,7 +14,7 @@
import pandas as pd
from pandas import (Index, Series, DataFrame, isna, bdate_range,
- NaT, date_range, timedelta_range)
+ NaT, date_range, timedelta_range, Categorical)
from pandas.core.indexes.datetimes import Timestamp
from pandas.core.indexes.timedeltas import Timedelta
import pandas.core.nanops as nanops
@@ -931,6 +931,58 @@ def test_object_comparisons(self):
expected = -(s == 'a')
assert_series_equal(result, expected)
+ def test_categorical_comparisons(self):
+
+ # GH 8938
+ # allow equality comparisons
+ a = Series(list('abc'), dtype="category")
+ b = Series(list('abc'), dtype="object")
+ c = Series(['a', 'b', 'cc'], dtype="object")
+ d = Series(list('acb'), dtype="object")
+ e = Categorical(list('abc'))
+ f = Categorical(list('acb'))
+
+ # vs scalar
+ assert not (a == 'a').all()
+ assert ((a != 'a') == ~(a == 'a')).all()
+
+ assert not ('a' == a).all()
+ assert (a == 'a')[0]
+ assert ('a' == a)[0]
+ assert not ('a' != a)[0]
+
+ # vs list-like
+ assert (a == a).all()
+ assert not (a != a).all()
+
+ assert (a == list(a)).all()
+ assert (a == b).all()
+ assert (b == a).all()
+ assert ((~(a == b)) == (a != b)).all()
+ assert ((~(b == a)) == (b != a)).all()
+
+ assert not (a == c).all()
+ assert not (c == a).all()
+ assert not (a == d).all()
+ assert not (d == a).all()
+
+ # vs a cat-like
+ assert (a == e).all()
+ assert (e == a).all()
+ assert not (a == f).all()
+ assert not (f == a).all()
+
+ assert ((~(a == e) == (a != e)).all())
+ assert ((~(e == a) == (e != a)).all())
+ assert ((~(a == f) == (a != f)).all())
+ assert ((~(f == a) == (f != a)).all())
+
+ # non-equality is not comparable
+ pytest.raises(TypeError, lambda: a < b)
+ pytest.raises(TypeError, lambda: b < a)
+ pytest.raises(TypeError, lambda: a > b)
+ pytest.raises(TypeError, lambda: b > a)
+
def test_comparison_tuples(self):
# GH11339
# comparisons vs tuple
@@ -1036,6 +1088,34 @@ def test_comparison_invalid(self):
pytest.raises(TypeError, lambda: x < y)
pytest.raises(TypeError, lambda: x <= y)
+ def test_unequal_categorical_comparison_raises_type_error(self):
+ # unequal comparison should raise for unordered cats
+ cat = Series(Categorical(list("abc")))
+
+ def f():
+ cat > "b"
+
+ pytest.raises(TypeError, f)
+ cat = Series(Categorical(list("abc"), ordered=False))
+
+ def f():
+ cat > "b"
+
+ pytest.raises(TypeError, f)
+
+ # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+ # and following comparisons with scalars not in categories should raise
+ # for unequal comps, but not for equal/not equal
+ cat = Series(Categorical(list("abc"), ordered=True))
+
+ pytest.raises(TypeError, lambda: cat < "d")
+ pytest.raises(TypeError, lambda: cat > "d")
+ pytest.raises(TypeError, lambda: "d" < cat)
+ pytest.raises(TypeError, lambda: "d" > cat)
+
+ tm.assert_series_equal(cat == "d", Series([False, False, False]))
+ tm.assert_series_equal(cat != "d", Series([True, True, True]))
+
def test_more_na_comparisons(self):
for dtype in [None, object]:
left = Series(['a', np.nan, 'c'], dtype=dtype)
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py
index c22e2ca8e0dc8..bf3e584657763 100644
--- a/pandas/tests/series/test_repr.py
+++ b/pandas/tests/series/test_repr.py
@@ -8,7 +8,8 @@
import numpy as np
import pandas as pd
-from pandas import (Index, Series, DataFrame, date_range, option_context)
+from pandas import (Index, Series, DataFrame, date_range, option_context,
+ Categorical, period_range, timedelta_range)
from pandas.core.index import MultiIndex
from pandas.compat import lrange, range, u
@@ -198,3 +199,251 @@ def test_latex_repr(self):
assert result == s._repr_latex_()
assert s._repr_latex_() is None
+
+
+class TestCategoricalRepr(object):
+
+ def test_categorical_repr(self):
+ a = Series(Categorical([1, 2, 3, 4]))
+ exp = u("0 1\n1 2\n2 3\n3 4\n" +
+ "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
+
+ assert exp == a.__unicode__()
+
+ a = Series(Categorical(["a", "b"] * 25))
+ exp = u("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" +
+ "Length: 50, dtype: category\nCategories (2, object): [a, b]")
+ with option_context("display.max_rows", 5):
+ assert exp == repr(a)
+
+ levs = list("abcdefghijklmnopqrstuvwxyz")
+ a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
+ exp = u("0 a\n1 b\n" + "dtype: category\n"
+ "Categories (26, object): [a < b < c < d ... w < x < y < z]")
+ assert exp == a.__unicode__()
+
+ def test_categorical_series_repr(self):
+ s = Series(Categorical([1, 2, 3]))
+ exp = """0 1
+1 2
+2 3
+dtype: category
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(s) == exp
+
+ s = Series(Categorical(np.arange(10)))
+ exp = """0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+dtype: category
+Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_ordered(self):
+ s = Series(Categorical([1, 2, 3], ordered=True))
+ exp = """0 1
+1 2
+2 3
+dtype: category
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(s) == exp
+
+ s = Series(Categorical(np.arange(10), ordered=True))
+ exp = """0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+dtype: category
+Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00:00
+1 2011-01-01 10:00:00
+2 2011-01-01 11:00:00
+3 2011-01-01 12:00:00
+4 2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00,
+ 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00:00-05:00
+1 2011-01-01 10:00:00-05:00
+2 2011-01-01 11:00:00-05:00
+3 2011-01-01 12:00:00-05:00
+4 2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,
+ 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_datetime_ordered(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00:00
+1 2011-01-01 10:00:00
+2 2011-01-01 11:00:00
+3 2011-01-01 12:00:00
+4 2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00:00-05:00
+1 2011-01-01 10:00:00-05:00
+2 2011-01-01 11:00:00-05:00
+3 2011-01-01 12:00:00-05:00
+4 2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_period(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00
+1 2011-01-01 10:00
+2 2011-01-01 11:00
+3 2011-01-01 12:00
+4 2011-01-01 13:00
+dtype: category
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01
+1 2011-02
+2 2011-03
+3 2011-04
+4 2011-05
+dtype: category
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00
+1 2011-01-01 10:00
+2 2011-01-01 11:00
+3 2011-01-01 12:00
+4 2011-01-01 13:00
+dtype: category
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01
+1 2011-02
+2 2011-03
+3 2011-04
+4 2011-05
+dtype: category
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 1 days
+1 2 days
+2 3 days
+3 4 days
+4 5 days
+dtype: category
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
+
+ assert repr(s) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ s = Series(Categorical(idx))
+ exp = """0 0 days 01:00:00
+1 1 days 01:00:00
+2 2 days 01:00:00
+3 3 days 01:00:00
+4 4 days 01:00:00
+5 5 days 01:00:00
+6 6 days 01:00:00
+7 7 days 01:00:00
+8 8 days 01:00:00
+9 9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00,
+ 8 days 01:00:00, 9 days 01:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 1 days
+1 2 days
+2 3 days
+3 4 days
+4 5 days
+dtype: category
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 0 days 01:00:00
+1 1 days 01:00:00
+2 2 days 01:00:00
+3 3 days 01:00:00
+4 4 days 01:00:00
+5 5 days 01:00:00
+6 6 days 01:00:00
+7 7 days 01:00:00
+8 8 days 01:00:00
+9 9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 <
+ 8 days 01:00:00 < 9 days 01:00:00]""" # noqa
+
+ assert repr(s) == exp
diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py
index 40b0280de3719..310412e53bd1c 100644
--- a/pandas/tests/series/test_sorting.py
+++ b/pandas/tests/series/test_sorting.py
@@ -5,7 +5,7 @@
import numpy as np
import random
-from pandas import DataFrame, Series, MultiIndex, IntervalIndex
+from pandas import DataFrame, Series, MultiIndex, IntervalIndex, Categorical
from pandas.util.testing import assert_series_equal, assert_almost_equal
import pandas.util.testing as tm
@@ -195,3 +195,72 @@ def test_sort_index_intervals(self):
[3, 2, 1, 0],
[4, 3, 2, 1]))
assert_series_equal(result, expected)
+
+ def test_sort_values_categorical(self):
+
+ c = Categorical(["a", "b", "b", "a"], ordered=False)
+ cat = Series(c.copy())
+
+ # sort in the categories order
+ expected = Series(
+ Categorical(["a", "a", "b", "b"],
+ ordered=False), index=[0, 3, 1, 2])
+ result = cat.sort_values()
+ tm.assert_series_equal(result, expected)
+
+ cat = Series(Categorical(["a", "c", "b", "d"], ordered=True))
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ cat = Series(Categorical(["a", "c", "b", "d"], categories=[
+ "a", "b", "c", "d"], ordered=True))
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ res = cat.sort_values(ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ raw_cat1 = Categorical(["a", "b", "c", "d"],
+ categories=["a", "b", "c", "d"], ordered=False)
+ raw_cat2 = Categorical(["a", "b", "c", "d"],
+ categories=["d", "c", "b", "a"], ordered=True)
+ s = ["a", "b", "c", "d"]
+ df = DataFrame({"unsort": raw_cat1,
+ "sort": raw_cat2,
+ "string": s,
+ "values": [1, 2, 3, 4]})
+
+ # Cats must be sorted in a dataframe
+ res = df.sort_values(by=["string"], ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp)
+ assert res["sort"].dtype == "category"
+
+ res = df.sort_values(by=["sort"], ascending=False)
+ exp = df.sort_values(by=["string"], ascending=True)
+ tm.assert_series_equal(res["values"], exp["values"])
+ assert res["sort"].dtype == "category"
+ assert res["unsort"].dtype == "category"
+
+ # unordered cat, but we allow this
+ df.sort_values(by=["unsort"], ascending=False)
+
+ # multi-columns sort
+ # GH 7848
+ df = DataFrame({"id": [6, 5, 4, 3, 2, 1],
+ "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
+ df["grade"] = Categorical(df["raw_grade"], ordered=True)
+ df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a'])
+
+ # sorts 'grade' according to the order of the categories
+ result = df.sort_values(by=['grade'])
+ expected = df.iloc[[1, 2, 5, 0, 3, 4]]
+ tm.assert_frame_equal(result, expected)
+
+ # multi
+ result = df.sort_values(by=['grade', 'id'])
+ expected = df.iloc[[2, 1, 5, 4, 3, 0]]
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
deleted file mode 100644
index 17f34fdf3604c..0000000000000
--- a/pandas/tests/test_categorical.py
+++ /dev/null
@@ -1,4826 +0,0 @@
-# -*- coding: utf-8 -*-
-# pylint: disable=E1101,E1103,W0232
-
-from warnings import catch_warnings
-import pytest
-import sys
-from datetime import datetime
-from distutils.version import LooseVersion
-
-import numpy as np
-
-from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_float_dtype,
- is_integer_dtype)
-
-import pandas as pd
-import pandas.compat as compat
-import pandas.util.testing as tm
-from pandas import (Categorical, Index, Series, DataFrame, Timestamp,
- CategoricalIndex, isna, date_range, DatetimeIndex,
- period_range, PeriodIndex, timedelta_range,
- TimedeltaIndex, NaT, Interval, IntervalIndex)
-from pandas.compat import range, lrange, u, PY3, PYPY
-from pandas.core.config import option_context
-from pandas.core.categorical import _recode_for_categories
-
-
-class TestCategorical(object):
-
- def setup_method(self, method):
- self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
- ordered=True)
-
- def test_getitem(self):
- assert self.factor[0] == 'a'
- assert self.factor[-1] == 'c'
-
- subf = self.factor[[0, 1, 2]]
- tm.assert_numpy_array_equal(subf._codes,
- np.array([0, 1, 1], dtype=np.int8))
-
- subf = self.factor[np.asarray(self.factor) == 'c']
- tm.assert_numpy_array_equal(subf._codes,
- np.array([2, 2, 2], dtype=np.int8))
-
- def test_getitem_listlike(self):
-
- # GH 9469
- # properly coerce the input indexers
- np.random.seed(1)
- c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
- result = c.codes[np.array([100000]).astype(np.int64)]
- expected = c[np.array([100000]).astype(np.int64)].codes
- tm.assert_numpy_array_equal(result, expected)
-
- @pytest.mark.parametrize(
- "method",
- [
- lambda x: x.cat.set_categories([1, 2, 3]),
- lambda x: x.cat.reorder_categories([2, 3, 1], ordered=True),
- lambda x: x.cat.rename_categories([1, 2, 3]),
- lambda x: x.cat.remove_unused_categories(),
- lambda x: x.cat.remove_categories([2]),
- lambda x: x.cat.add_categories([4]),
- lambda x: x.cat.as_ordered(),
- lambda x: x.cat.as_unordered(),
- ])
- def test_getname_categorical_accessor(self, method):
- # GH 17509
- s = Series([1, 2, 3], name='A').astype('category')
- expected = 'A'
- result = method(s).name
- assert result == expected
-
- def test_getitem_category_type(self):
- # GH 14580
- # test iloc() on Series with Categorical data
-
- s = Series([1, 2, 3]).astype('category')
-
- # get slice
- result = s.iloc[0:2]
- expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
- tm.assert_series_equal(result, expected)
-
- # get list of indexes
- result = s.iloc[[0, 1]]
- expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
- tm.assert_series_equal(result, expected)
-
- # get boolean array
- result = s.iloc[[True, False, False]]
- expected = Series([1]).astype(CategoricalDtype([1, 2, 3]))
- tm.assert_series_equal(result, expected)
-
- def test_setitem(self):
-
- # int/positional
- c = self.factor.copy()
- c[0] = 'b'
- assert c[0] == 'b'
- c[-1] = 'a'
- assert c[-1] == 'a'
-
- # boolean
- c = self.factor.copy()
- indexer = np.zeros(len(c), dtype='bool')
- indexer[0] = True
- indexer[-1] = True
- c[indexer] = 'c'
- expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
- ordered=True)
-
- tm.assert_categorical_equal(c, expected)
-
- def test_setitem_listlike(self):
-
- # GH 9469
- # properly coerce the input indexers
- np.random.seed(1)
- c = Categorical(np.random.randint(0, 5, size=150000).astype(
- np.int8)).add_categories([-1000])
- indexer = np.array([100000]).astype(np.int64)
- c[indexer] = -1000
-
- # we are asserting the code result here
- # which maps to the -1000 category
- result = c.codes[np.array([100000]).astype(np.int64)]
- tm.assert_numpy_array_equal(result, np.array([5], dtype='int8'))
-
- def test_constructor_empty(self):
- # GH 17248
- c = Categorical([])
- expected = Index([])
- tm.assert_index_equal(c.categories, expected)
-
- c = Categorical([], categories=[1, 2, 3])
- expected = pd.Int64Index([1, 2, 3])
- tm.assert_index_equal(c.categories, expected)
-
- def test_constructor_tuples(self):
- values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
- result = Categorical(values)
- expected = Index([(1,), (1, 2)], tupleize_cols=False)
- tm.assert_index_equal(result.categories, expected)
- assert result.ordered is False
-
- def test_constructor_tuples_datetimes(self):
- # numpy will auto reshape when all of the tuples are the
- # same len, so add an extra one with 2 items and slice it off
- values = np.array([(Timestamp('2010-01-01'),),
- (Timestamp('2010-01-02'),),
- (Timestamp('2010-01-01'),),
- (Timestamp('2010-01-02'),),
- ('a', 'b')], dtype=object)[:-1]
- result = Categorical(values)
- expected = Index([(Timestamp('2010-01-01'),),
- (Timestamp('2010-01-02'),)], tupleize_cols=False)
- tm.assert_index_equal(result.categories, expected)
-
- def test_constructor_unsortable(self):
-
- # it works!
- arr = np.array([1, 2, 3, datetime.now()], dtype='O')
- factor = Categorical(arr, ordered=False)
- assert not factor.ordered
-
- # this however will raise as cannot be sorted
- pytest.raises(
- TypeError, lambda: Categorical(arr, ordered=True))
-
- def test_constructor_interval(self):
- result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)],
- ordered=True)
- ii = IntervalIndex.from_intervals([Interval(1, 2),
- Interval(2, 3),
- Interval(3, 6)])
- exp = Categorical(ii, ordered=True)
- tm.assert_categorical_equal(result, exp)
- tm.assert_index_equal(result.categories, ii)
-
- def test_is_equal_dtype(self):
-
- # test dtype comparisons between cats
-
- c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
- c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
- c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
- assert c1.is_dtype_equal(c1)
- assert c2.is_dtype_equal(c2)
- assert c3.is_dtype_equal(c3)
- assert c1.is_dtype_equal(c2)
- assert not c1.is_dtype_equal(c3)
- assert not c1.is_dtype_equal(Index(list('aabca')))
- assert not c1.is_dtype_equal(c1.astype(object))
- assert c1.is_dtype_equal(CategoricalIndex(c1))
- assert (c1.is_dtype_equal(
- CategoricalIndex(c1, categories=list('cab'))))
- assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
-
- def test_constructor(self):
-
- exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
- c1 = Categorical(exp_arr)
- tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
- c2 = Categorical(exp_arr, categories=["a", "b", "c"])
- tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
- c2 = Categorical(exp_arr, categories=["c", "b", "a"])
- tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
-
- # categories must be unique
- def f():
- Categorical([1, 2], [1, 2, 2])
-
- pytest.raises(ValueError, f)
-
- def f():
- Categorical(["a", "b"], ["a", "b", "b"])
-
- pytest.raises(ValueError, f)
-
- # The default should be unordered
- c1 = Categorical(["a", "b", "c", "a"])
- assert not c1.ordered
-
- # Categorical as input
- c1 = Categorical(["a", "b", "c", "a"])
- c2 = Categorical(c1)
- tm.assert_categorical_equal(c1, c2)
-
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
- c2 = Categorical(c1)
- tm.assert_categorical_equal(c1, c2)
-
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
- c2 = Categorical(c1)
- tm.assert_categorical_equal(c1, c2)
-
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
- c2 = Categorical(c1, categories=["a", "b", "c"])
- tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
- tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
-
- # Series of dtype category
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
- c2 = Categorical(Series(c1))
- tm.assert_categorical_equal(c1, c2)
-
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
- c2 = Categorical(Series(c1))
- tm.assert_categorical_equal(c1, c2)
-
- # Series
- c1 = Categorical(["a", "b", "c", "a"])
- c2 = Categorical(Series(["a", "b", "c", "a"]))
- tm.assert_categorical_equal(c1, c2)
-
- c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
- c2 = Categorical(Series(["a", "b", "c", "a"]),
- categories=["a", "b", "c", "d"])
- tm.assert_categorical_equal(c1, c2)
-
- # This should result in integer categories, not float!
- cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
- assert is_integer_dtype(cat.categories)
-
- # https://github.com/pandas-dev/pandas/issues/3678
- cat = Categorical([np.nan, 1, 2, 3])
- assert is_integer_dtype(cat.categories)
-
- # this should result in floats
- cat = Categorical([np.nan, 1, 2., 3])
- assert is_float_dtype(cat.categories)
-
- cat = Categorical([np.nan, 1., 2., 3.])
- assert is_float_dtype(cat.categories)
-
- # This doesn't work -> this would probably need some kind of "remember
- # the original type" feature to try to cast the array interface result
- # to...
-
- # vals = np.asarray(cat[cat.notna()])
- # assert is_integer_dtype(vals)
-
- # corner cases
- cat = Categorical([1])
- assert len(cat.categories) == 1
- assert cat.categories[0] == 1
- assert len(cat.codes) == 1
- assert cat.codes[0] == 0
-
- cat = Categorical(["a"])
- assert len(cat.categories) == 1
- assert cat.categories[0] == "a"
- assert len(cat.codes) == 1
- assert cat.codes[0] == 0
-
- # Scalars should be converted to lists
- cat = Categorical(1)
- assert len(cat.categories) == 1
- assert cat.categories[0] == 1
- assert len(cat.codes) == 1
- assert cat.codes[0] == 0
-
- # two arrays
- # - when the first is an integer dtype and the second is not
- # - when the resulting codes are all -1/NaN
- with tm.assert_produces_warning(None):
- c_old = Categorical([0, 1, 2, 0, 1, 2],
- categories=["a", "b", "c"]) # noqa
-
- with tm.assert_produces_warning(None):
- c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa
- categories=[3, 4, 5])
-
- # the next one are from the old docs
- with tm.assert_produces_warning(None):
- c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa
- cat = Categorical([1, 2], categories=[1, 2, 3])
-
- # this is a legitimate constructor
- with tm.assert_produces_warning(None):
- c = Categorical(np.array([], dtype='int64'), # noqa
- categories=[3, 2, 1], ordered=True)
-
- def test_constructor_not_sequence(self):
- # https://github.com/pandas-dev/pandas/issues/16022
- with pytest.raises(TypeError):
- Categorical(['a', 'b'], categories='a')
-
- def test_constructor_with_null(self):
-
- # Cannot have NaN in categories
- with pytest.raises(ValueError):
- Categorical([np.nan, "a", "b", "c"],
- categories=[np.nan, "a", "b", "c"])
-
- with pytest.raises(ValueError):
- Categorical([None, "a", "b", "c"],
- categories=[None, "a", "b", "c"])
-
- with pytest.raises(ValueError):
- Categorical(DatetimeIndex(['nat', '20160101']),
- categories=[NaT, Timestamp('20160101')])
-
- def test_constructor_with_index(self):
- ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
- tm.assert_categorical_equal(ci.values, Categorical(ci))
-
- ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
- tm.assert_categorical_equal(ci.values,
- Categorical(ci.astype(object),
- categories=ci.categories))
-
- def test_constructor_with_generator(self):
- # This was raising an Error in isna(single_val).any() because isna
- # returned a scalar for a generator
- xrange = range
-
- exp = Categorical([0, 1, 2])
- cat = Categorical((x for x in [0, 1, 2]))
- tm.assert_categorical_equal(cat, exp)
- cat = Categorical(xrange(3))
- tm.assert_categorical_equal(cat, exp)
-
- # This uses xrange internally
- from pandas.core.index import MultiIndex
- MultiIndex.from_product([range(5), ['a', 'b', 'c']])
-
- # check that categories accept generators and sequences
- cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
- tm.assert_categorical_equal(cat, exp)
- cat = Categorical([0, 1, 2], categories=xrange(3))
- tm.assert_categorical_equal(cat, exp)
-
- def test_constructor_with_datetimelike(self):
-
- # 12077
- # constructor wwth a datetimelike and NaT
-
- for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
- date_range('1995-01-01 00:00:00', periods=5,
- freq='s', tz='US/Eastern'),
- timedelta_range('1 day', periods=5, freq='s')]:
-
- s = Series(dtl)
- c = Categorical(s)
- expected = type(dtl)(s)
- expected.freq = None
- tm.assert_index_equal(c.categories, expected)
- tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))
-
- # with NaT
- s2 = s.copy()
- s2.iloc[-1] = NaT
- c = Categorical(s2)
- expected = type(dtl)(s2.dropna())
- expected.freq = None
- tm.assert_index_equal(c.categories, expected)
-
- exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
- tm.assert_numpy_array_equal(c.codes, exp)
-
- result = repr(c)
- assert 'NaT' in result
-
- def test_constructor_from_index_series_datetimetz(self):
- idx = date_range('2015-01-01 10:00', freq='D', periods=3,
- tz='US/Eastern')
- result = Categorical(idx)
- tm.assert_index_equal(result.categories, idx)
-
- result = Categorical(Series(idx))
- tm.assert_index_equal(result.categories, idx)
-
- def test_constructor_from_index_series_timedelta(self):
- idx = timedelta_range('1 days', freq='D', periods=3)
- result = Categorical(idx)
- tm.assert_index_equal(result.categories, idx)
-
- result = Categorical(Series(idx))
- tm.assert_index_equal(result.categories, idx)
-
- def test_constructor_from_index_series_period(self):
- idx = period_range('2015-01-01', freq='D', periods=3)
- result = Categorical(idx)
- tm.assert_index_equal(result.categories, idx)
-
- result = Categorical(Series(idx))
- tm.assert_index_equal(result.categories, idx)
-
- def test_constructor_invariant(self):
- # GH 14190
- vals = [
- np.array([1., 1.2, 1.8, np.nan]),
- np.array([1, 2, 3], dtype='int64'),
- ['a', 'b', 'c', np.nan],
- [pd.Period('2014-01'), pd.Period('2014-02'), NaT],
- [Timestamp('2014-01-01'), Timestamp('2014-01-02'), NaT],
- [Timestamp('2014-01-01', tz='US/Eastern'),
- Timestamp('2014-01-02', tz='US/Eastern'), NaT],
- ]
- for val in vals:
- c = Categorical(val)
- c2 = Categorical(c)
- tm.assert_categorical_equal(c, c2)
-
- @pytest.mark.parametrize('ordered', [True, False])
- def test_constructor_with_dtype(self, ordered):
- categories = ['b', 'a', 'c']
- dtype = CategoricalDtype(categories, ordered=ordered)
- result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype)
- expected = Categorical(['a', 'b', 'a', 'c'], categories=categories,
- ordered=ordered)
- tm.assert_categorical_equal(result, expected)
- assert result.ordered is ordered
-
- def test_constructor_dtype_and_others_raises(self):
- dtype = CategoricalDtype(['a', 'b'], ordered=True)
- with tm.assert_raises_regex(ValueError, "Cannot"):
- Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
-
- with tm.assert_raises_regex(ValueError, "Cannot"):
- Categorical(['a', 'b'], ordered=True, dtype=dtype)
-
- with tm.assert_raises_regex(ValueError, "Cannot"):
- Categorical(['a', 'b'], ordered=False, dtype=dtype)
-
- @pytest.mark.parametrize('categories', [
- None, ['a', 'b'], ['a', 'c'],
- ])
- @pytest.mark.parametrize('ordered', [True, False])
- def test_constructor_str_category(self, categories, ordered):
- result = Categorical(['a', 'b'], categories=categories,
- ordered=ordered, dtype='category')
- expected = Categorical(['a', 'b'], categories=categories,
- ordered=ordered)
- tm.assert_categorical_equal(result, expected)
-
- def test_constructor_str_unknown(self):
- with tm.assert_raises_regex(ValueError, "Unknown `dtype`"):
- Categorical([1, 2], dtype="foo")
-
- def test_constructor_from_categorical_with_dtype(self):
- dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True)
- values = Categorical(['a', 'b', 'd'])
- result = Categorical(values, dtype=dtype)
- # We use dtype.categories, not values.categories
- expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
- ordered=True)
- tm.assert_categorical_equal(result, expected)
-
- def test_constructor_from_categorical_with_unknown_dtype(self):
- dtype = CategoricalDtype(None, ordered=True)
- values = Categorical(['a', 'b', 'd'])
- result = Categorical(values, dtype=dtype)
- # We use values.categories, not dtype.categories
- expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'],
- ordered=True)
- tm.assert_categorical_equal(result, expected)
-
- def test_contructor_from_categorical_string(self):
- values = Categorical(['a', 'b', 'd'])
- # use categories, ordered
- result = Categorical(values, categories=['a', 'b', 'c'], ordered=True,
- dtype='category')
- expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
- ordered=True)
- tm.assert_categorical_equal(result, expected)
-
- # No string
- result = Categorical(values, categories=['a', 'b', 'c'], ordered=True)
- tm.assert_categorical_equal(result, expected)
-
- def test_constructor_with_categorical_categories(self):
- # GH17884
- expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
-
- result = Categorical(
- ['a', 'b'], categories=Categorical(['a', 'b', 'c']))
- tm.assert_categorical_equal(result, expected)
-
- result = Categorical(
- ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c']))
- tm.assert_categorical_equal(result, expected)
-
- def test_from_codes(self):
-
- # too few categories
- def f():
- Categorical.from_codes([1, 2], [1, 2])
-
- pytest.raises(ValueError, f)
-
- # no int codes
- def f():
- Categorical.from_codes(["a"], [1, 2])
-
- pytest.raises(ValueError, f)
-
- # no unique categories
- def f():
- Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
-
- pytest.raises(ValueError, f)
-
- # NaN categories included
- def f():
- Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
-
- pytest.raises(ValueError, f)
-
- # too negative
- def f():
- Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
-
- pytest.raises(ValueError, f)
-
- exp = Categorical(["a", "b", "c"], ordered=False)
- res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
- tm.assert_categorical_equal(exp, res)
-
- # Not available in earlier numpy versions
- if hasattr(np.random, "choice"):
- codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
- Categorical.from_codes(codes, categories=["train", "test"])
-
- def test_from_codes_with_categorical_categories(self):
- # GH17884
- expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
-
- result = Categorical.from_codes(
- [0, 1], categories=Categorical(['a', 'b', 'c']))
- tm.assert_categorical_equal(result, expected)
-
- result = Categorical.from_codes(
- [0, 1], categories=CategoricalIndex(['a', 'b', 'c']))
- tm.assert_categorical_equal(result, expected)
-
- # non-unique Categorical still raises
- with pytest.raises(ValueError):
- Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
-
- @pytest.mark.parametrize('dtype', [None, 'category'])
- def test_from_inferred_categories(self, dtype):
- cats = ['a', 'b']
- codes = np.array([0, 0, 1, 1], dtype='i8')
- result = Categorical._from_inferred_categories(cats, codes, dtype)
- expected = Categorical.from_codes(codes, cats)
- tm.assert_categorical_equal(result, expected)
-
- @pytest.mark.parametrize('dtype', [None, 'category'])
- def test_from_inferred_categories_sorts(self, dtype):
- cats = ['b', 'a']
- codes = np.array([0, 1, 1, 1], dtype='i8')
- result = Categorical._from_inferred_categories(cats, codes, dtype)
- expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
- tm.assert_categorical_equal(result, expected)
-
- def test_from_inferred_categories_dtype(self):
- cats = ['a', 'b', 'd']
- codes = np.array([0, 1, 0, 2], dtype='i8')
- dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
- result = Categorical._from_inferred_categories(cats, codes, dtype)
- expected = Categorical(['a', 'b', 'a', 'd'],
- categories=['c', 'b', 'a'],
- ordered=True)
- tm.assert_categorical_equal(result, expected)
-
- def test_from_inferred_categories_coerces(self):
- cats = ['1', '2', 'bad']
- codes = np.array([0, 0, 1, 2], dtype='i8')
- dtype = CategoricalDtype([1, 2])
- result = Categorical._from_inferred_categories(cats, codes, dtype)
- expected = Categorical([1, 1, 2, np.nan])
- tm.assert_categorical_equal(result, expected)
-
- def test_validate_ordered(self):
- # see gh-14058
- exp_msg = "'ordered' must either be 'True' or 'False'"
- exp_err = TypeError
-
- # This should be a boolean.
- ordered = np.array([0, 1, 2])
-
- with tm.assert_raises_regex(exp_err, exp_msg):
- Categorical([1, 2, 3], ordered=ordered)
-
- with tm.assert_raises_regex(exp_err, exp_msg):
- Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'],
- ordered=ordered)
-
- def test_comparisons(self):
-
- result = self.factor[self.factor == 'a']
- expected = self.factor[np.asarray(self.factor) == 'a']
- tm.assert_categorical_equal(result, expected)
-
- result = self.factor[self.factor != 'a']
- expected = self.factor[np.asarray(self.factor) != 'a']
- tm.assert_categorical_equal(result, expected)
-
- result = self.factor[self.factor < 'c']
- expected = self.factor[np.asarray(self.factor) < 'c']
- tm.assert_categorical_equal(result, expected)
-
- result = self.factor[self.factor > 'a']
- expected = self.factor[np.asarray(self.factor) > 'a']
- tm.assert_categorical_equal(result, expected)
-
- result = self.factor[self.factor >= 'b']
- expected = self.factor[np.asarray(self.factor) >= 'b']
- tm.assert_categorical_equal(result, expected)
-
- result = self.factor[self.factor <= 'b']
- expected = self.factor[np.asarray(self.factor) <= 'b']
- tm.assert_categorical_equal(result, expected)
-
- n = len(self.factor)
-
- other = self.factor[np.random.permutation(n)]
- result = self.factor == other
- expected = np.asarray(self.factor) == np.asarray(other)
- tm.assert_numpy_array_equal(result, expected)
-
- result = self.factor == 'd'
- expected = np.repeat(False, len(self.factor))
- tm.assert_numpy_array_equal(result, expected)
-
- # comparisons with categoricals
- cat_rev = Categorical(
- ["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
- cat_rev_base = Categorical(
- ["b", "b", "b"], categories=["c", "b", "a"], ordered=True)
- cat = Categorical(["a", "b", "c"], ordered=True)
- cat_base = Categorical(
- ["b", "b", "b"], categories=cat.categories, ordered=True)
-
- # comparisons need to take categories ordering into account
- res_rev = cat_rev > cat_rev_base
- exp_rev = np.array([True, False, False])
- tm.assert_numpy_array_equal(res_rev, exp_rev)
-
- res_rev = cat_rev < cat_rev_base
- exp_rev = np.array([False, False, True])
- tm.assert_numpy_array_equal(res_rev, exp_rev)
-
- res = cat > cat_base
- exp = np.array([False, False, True])
- tm.assert_numpy_array_equal(res, exp)
-
- # Only categories with same categories can be compared
- def f():
- cat > cat_rev
-
- pytest.raises(TypeError, f)
-
- cat_rev_base2 = Categorical(
- ["b", "b", "b"], categories=["c", "b", "a", "d"])
-
- def f():
- cat_rev > cat_rev_base2
-
- pytest.raises(TypeError, f)
-
- # Only categories with same ordering information can be compared
- cat_unorderd = cat.set_ordered(False)
- assert not (cat > cat).any()
-
- def f():
- cat > cat_unorderd
-
- pytest.raises(TypeError, f)
-
- # comparison (in both directions) with Series will raise
- s = Series(["b", "b", "b"])
- pytest.raises(TypeError, lambda: cat > s)
- pytest.raises(TypeError, lambda: cat_rev > s)
- pytest.raises(TypeError, lambda: s < cat)
- pytest.raises(TypeError, lambda: s < cat_rev)
-
- # comparison with numpy.array will raise in both direction, but only on
- # newer numpy versions
- a = np.array(["b", "b", "b"])
- pytest.raises(TypeError, lambda: cat > a)
- pytest.raises(TypeError, lambda: cat_rev > a)
-
- # The following work via '__array_priority__ = 1000'
- # works only on numpy >= 1.7.1
- if LooseVersion(np.__version__) > LooseVersion("1.7.1"):
- pytest.raises(TypeError, lambda: a < cat)
- pytest.raises(TypeError, lambda: a < cat_rev)
-
- # Make sure that unequal comparison take the categories order in
- # account
- cat_rev = Categorical(
- list("abc"), categories=list("cba"), ordered=True)
- exp = np.array([True, False, False])
- res = cat_rev > "b"
- tm.assert_numpy_array_equal(res, exp)
-
- def test_argsort(self):
- c = Categorical([5, 3, 1, 4, 2], ordered=True)
-
- expected = np.array([2, 4, 1, 3, 0])
- tm.assert_numpy_array_equal(c.argsort(ascending=True), expected,
- check_dtype=False)
-
- expected = expected[::-1]
- tm.assert_numpy_array_equal(c.argsort(ascending=False), expected,
- check_dtype=False)
-
- def test_numpy_argsort(self):
- c = Categorical([5, 3, 1, 4, 2], ordered=True)
-
- expected = np.array([2, 4, 1, 3, 0])
- tm.assert_numpy_array_equal(np.argsort(c), expected,
- check_dtype=False)
-
- tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected,
- check_dtype=False)
-
- msg = "the 'axis' parameter is not supported"
- tm.assert_raises_regex(ValueError, msg, np.argsort,
- c, axis=0)
-
- msg = "the 'order' parameter is not supported"
- tm.assert_raises_regex(ValueError, msg, np.argsort,
- c, order='C')
-
- def test_na_flags_int_categories(self):
- # #1457
-
- categories = lrange(10)
- labels = np.random.randint(0, 10, 20)
- labels[::5] = -1
-
- cat = Categorical(labels, categories, fastpath=True)
- repr(cat)
-
- tm.assert_numpy_array_equal(isna(cat), labels == -1)
-
- def test_categories_none(self):
- factor = Categorical(['a', 'b', 'b', 'a',
- 'a', 'c', 'c', 'c'], ordered=True)
- tm.assert_categorical_equal(factor, self.factor)
-
- def test_set_categories_inplace(self):
- cat = self.factor.copy()
- cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
- tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd']))
-
- @pytest.mark.parametrize(
- "dtype",
- ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
- pytest.param("datetime64[D]",
- marks=pytest.mark.xfail(reason="issue7996"))]
- )
- @pytest.mark.parametrize("is_ordered", [True, False])
- def test_drop_duplicates_non_bool(self, dtype, is_ordered):
- cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
-
- # Test case 1
- input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
- tc1 = Series(Categorical(input1, categories=cat_array,
- ordered=is_ordered))
-
- expected = Series([False, False, False, True])
- tm.assert_series_equal(tc1.duplicated(), expected)
- tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
- sc = tc1.copy()
- sc.drop_duplicates(inplace=True)
- tm.assert_series_equal(sc, tc1[~expected])
-
- expected = Series([False, False, True, False])
- tm.assert_series_equal(tc1.duplicated(keep='last'), expected)
- tm.assert_series_equal(tc1.drop_duplicates(keep='last'),
- tc1[~expected])
- sc = tc1.copy()
- sc.drop_duplicates(keep='last', inplace=True)
- tm.assert_series_equal(sc, tc1[~expected])
-
- expected = Series([False, False, True, True])
- tm.assert_series_equal(tc1.duplicated(keep=False), expected)
- tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
- sc = tc1.copy()
- sc.drop_duplicates(keep=False, inplace=True)
- tm.assert_series_equal(sc, tc1[~expected])
-
- # Test case 2
- input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
- tc2 = Series(Categorical(
- input2, categories=cat_array, ordered=is_ordered)
- )
-
- expected = Series([False, False, False, False, True, True, False])
- tm.assert_series_equal(tc2.duplicated(), expected)
- tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
- sc = tc2.copy()
- sc.drop_duplicates(inplace=True)
- tm.assert_series_equal(sc, tc2[~expected])
-
- expected = Series([False, True, True, False, False, False, False])
- tm.assert_series_equal(tc2.duplicated(keep='last'), expected)
- tm.assert_series_equal(tc2.drop_duplicates(keep='last'),
- tc2[~expected])
- sc = tc2.copy()
- sc.drop_duplicates(keep='last', inplace=True)
- tm.assert_series_equal(sc, tc2[~expected])
-
- expected = Series([False, True, True, False, True, True, False])
- tm.assert_series_equal(tc2.duplicated(keep=False), expected)
- tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
- sc = tc2.copy()
- sc.drop_duplicates(keep=False, inplace=True)
- tm.assert_series_equal(sc, tc2[~expected])
-
- @pytest.mark.parametrize("is_ordered", [True, False])
- def test_drop_duplicates_bool(self, is_ordered):
- tc = Series(Categorical([True, False, True, False],
- categories=[True, False], ordered=is_ordered))
-
- expected = Series([False, False, True, True])
- tm.assert_series_equal(tc.duplicated(), expected)
- tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
- sc = tc.copy()
- sc.drop_duplicates(inplace=True)
- tm.assert_series_equal(sc, tc[~expected])
-
- expected = Series([True, True, False, False])
- tm.assert_series_equal(tc.duplicated(keep='last'), expected)
- tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
- sc = tc.copy()
- sc.drop_duplicates(keep='last', inplace=True)
- tm.assert_series_equal(sc, tc[~expected])
-
- expected = Series([True, True, True, True])
- tm.assert_series_equal(tc.duplicated(keep=False), expected)
- tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
- sc = tc.copy()
- sc.drop_duplicates(keep=False, inplace=True)
- tm.assert_series_equal(sc, tc[~expected])
-
- def test_describe(self):
- # string type
- desc = self.factor.describe()
- assert self.factor.ordered
- exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories',
- ordered=self.factor.ordered)
- expected = DataFrame({'counts': [3, 2, 3],
- 'freqs': [3 / 8., 2 / 8., 3 / 8.]},
- index=exp_index)
- tm.assert_frame_equal(desc, expected)
-
- # check unused categories
- cat = self.factor.copy()
- cat.set_categories(["a", "b", "c", "d"], inplace=True)
- desc = cat.describe()
-
- exp_index = CategoricalIndex(
- list('abcd'), ordered=self.factor.ordered, name='categories')
- expected = DataFrame({'counts': [3, 2, 3, 0],
- 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
- index=exp_index)
- tm.assert_frame_equal(desc, expected)
-
- # check an integer one
- cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
- desc = cat.describe()
- exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered,
- name='categories')
- expected = DataFrame({'counts': [5, 3, 3],
- 'freqs': [5 / 11., 3 / 11., 3 / 11.]},
- index=exp_index)
- tm.assert_frame_equal(desc, expected)
-
- # https://github.com/pandas-dev/pandas/issues/3678
- # describe should work with NaN
- cat = Categorical([np.nan, 1, 2, 2])
- desc = cat.describe()
- expected = DataFrame({'counts': [1, 2, 1],
- 'freqs': [1 / 4., 2 / 4., 1 / 4.]},
- index=CategoricalIndex([1, 2, np.nan],
- categories=[1, 2],
- name='categories'))
- tm.assert_frame_equal(desc, expected)
-
- def test_print(self):
- expected = ["[a, b, b, a, a, c, c, c]",
- "Categories (3, object): [a < b < c]"]
- expected = "\n".join(expected)
- actual = repr(self.factor)
- assert actual == expected
-
- def test_big_print(self):
- factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'],
- fastpath=True)
- expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600",
- "Categories (3, object): [a, b, c]"]
- expected = "\n".join(expected)
-
- actual = repr(factor)
-
- assert actual == expected
-
- def test_empty_print(self):
- factor = Categorical([], ["a", "b", "c"])
- expected = ("[], Categories (3, object): [a, b, c]")
- # hack because array_repr changed in numpy > 1.6.x
- actual = repr(factor)
- assert actual == expected
-
- assert expected == actual
- factor = Categorical([], ["a", "b", "c"], ordered=True)
- expected = ("[], Categories (3, object): [a < b < c]")
- actual = repr(factor)
- assert expected == actual
-
- factor = Categorical([], [])
- expected = ("[], Categories (0, object): []")
- assert expected == repr(factor)
-
- def test_print_none_width(self):
- # GH10087
- a = Series(Categorical([1, 2, 3, 4]))
- exp = u("0 1\n1 2\n2 3\n3 4\n" +
- "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
-
- with option_context("display.width", None):
- assert exp == repr(a)
-
- def test_unicode_print(self):
- if PY3:
- _rep = repr
- else:
- _rep = unicode # noqa
-
- c = Categorical(['aaaaa', 'bb', 'cccc'] * 20)
- expected = u"""\
-[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
-Length: 60
-Categories (3, object): [aaaaa, bb, cccc]"""
-
- assert _rep(c) == expected
-
- c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
- expected = u"""\
-[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
-Length: 60
-Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
-
- assert _rep(c) == expected
-
- # unicode option should not affect to Categorical, as it doesn't care
- # the repr width
- with option_context('display.unicode.east_asian_width', True):
-
- c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
- expected = u"""[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
-Length: 60
-Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
-
- assert _rep(c) == expected
-
- def test_tab_complete_warning(self, ip):
- # https://github.com/pandas-dev/pandas/issues/16409
- pytest.importorskip('IPython', minversion="6.0.0")
- from IPython.core.completer import provisionalcompleter
-
- code = "import pandas as pd; c = Categorical([])"
- ip.run_code(code)
- with tm.assert_produces_warning(None):
- with provisionalcompleter('ignore'):
- list(ip.Completer.completions('c.', 1))
-
- def test_periodindex(self):
- idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
- '2014-03', '2014-03'], freq='M')
-
- cat1 = Categorical(idx1)
- str(cat1)
- exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
- exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
- tm.assert_numpy_array_equal(cat1._codes, exp_arr)
- tm.assert_index_equal(cat1.categories, exp_idx)
-
- idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
- '2014-03', '2014-01'], freq='M')
- cat2 = Categorical(idx2, ordered=True)
- str(cat2)
- exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
- exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
- tm.assert_numpy_array_equal(cat2._codes, exp_arr)
- tm.assert_index_equal(cat2.categories, exp_idx2)
-
- idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
- '2013-08', '2013-07', '2013-05'], freq='M')
- cat3 = Categorical(idx3, ordered=True)
- exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
- exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
- '2013-10', '2013-11', '2013-12'], freq='M')
- tm.assert_numpy_array_equal(cat3._codes, exp_arr)
- tm.assert_index_equal(cat3.categories, exp_idx)
-
- def test_categories_assigments(self):
- s = Categorical(["a", "b", "c", "a"])
- exp = np.array([1, 2, 3, 1], dtype=np.int64)
- s.categories = [1, 2, 3]
- tm.assert_numpy_array_equal(s.__array__(), exp)
- tm.assert_index_equal(s.categories, Index([1, 2, 3]))
-
- # lengthen
- def f():
- s.categories = [1, 2, 3, 4]
-
- pytest.raises(ValueError, f)
-
- # shorten
- def f():
- s.categories = [1, 2]
-
- pytest.raises(ValueError, f)
-
- def test_construction_with_ordered(self):
- # GH 9347, 9190
- cat = Categorical([0, 1, 2])
- assert not cat.ordered
- cat = Categorical([0, 1, 2], ordered=False)
- assert not cat.ordered
- cat = Categorical([0, 1, 2], ordered=True)
- assert cat.ordered
-
- def test_ordered_api(self):
- # GH 9347
- cat1 = Categorical(list('acb'), ordered=False)
- tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
- assert not cat1.ordered
-
- cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False)
- tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
- assert not cat2.ordered
-
- cat3 = Categorical(list('acb'), ordered=True)
- tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
- assert cat3.ordered
-
- cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True)
- tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
- assert cat4.ordered
-
- def test_set_dtype_same(self):
- c = Categorical(['a', 'b', 'c'])
- result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
- tm.assert_categorical_equal(result, c)
-
- def test_set_dtype_new_categories(self):
- c = Categorical(['a', 'b', 'c'])
- result = c._set_dtype(CategoricalDtype(list('abcd')))
- tm.assert_numpy_array_equal(result.codes, c.codes)
- tm.assert_index_equal(result.dtype.categories, Index(list('abcd')))
-
- def test_set_dtype_nans(self):
- c = Categorical(['a', 'b', np.nan])
- result = c._set_dtype(CategoricalDtype(['a', 'c']))
- tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
- dtype='int8'))
-
- def test_set_categories_private(self):
- cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
- cat._set_categories(['a', 'c', 'd', 'e'])
- expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
- tm.assert_categorical_equal(cat, expected)
-
- # fastpath
- cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
- cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
- expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
- tm.assert_categorical_equal(cat, expected)
-
- @pytest.mark.parametrize('values, categories, new_categories', [
- # No NaNs, same cats, same order
- (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
- # No NaNs, same cats, different order
- (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
- # Same, unsorted
- (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
- # No NaNs, same cats, different order
- (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
- # NaNs
- (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
- (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
- (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
- (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
- # Introduce NaNs
- (['a', 'b', 'c'], ['a', 'b'], ['a']),
- (['a', 'b', 'c'], ['a', 'b'], ['b']),
- (['b', 'a', 'c'], ['a', 'b'], ['a']),
- (['b', 'a', 'c'], ['a', 'b'], ['a']),
- # No overlap
- (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
- ])
- @pytest.mark.parametrize('ordered', [True, False])
- def test_set_dtype_many(self, values, categories, new_categories,
- ordered):
- c = Categorical(values, categories)
- expected = Categorical(values, new_categories, ordered)
- result = c._set_dtype(expected.dtype)
- tm.assert_categorical_equal(result, expected)
-
- def test_set_dtype_no_overlap(self):
- c = Categorical(['a', 'b', 'c'], ['d', 'e'])
- result = c._set_dtype(CategoricalDtype(['a', 'b']))
- expected = Categorical([None, None, None], categories=['a', 'b'])
- tm.assert_categorical_equal(result, expected)
-
- def test_set_ordered(self):
-
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- cat2 = cat.as_unordered()
- assert not cat2.ordered
- cat2 = cat.as_ordered()
- assert cat2.ordered
- cat2.as_unordered(inplace=True)
- assert not cat2.ordered
- cat2.as_ordered(inplace=True)
- assert cat2.ordered
-
- assert cat2.set_ordered(True).ordered
- assert not cat2.set_ordered(False).ordered
- cat2.set_ordered(True, inplace=True)
- assert cat2.ordered
- cat2.set_ordered(False, inplace=True)
- assert not cat2.ordered
-
- # removed in 0.19.0
- msg = "can\'t set attribute"
- with tm.assert_raises_regex(AttributeError, msg):
- cat.ordered = True
- with tm.assert_raises_regex(AttributeError, msg):
- cat.ordered = False
-
- def test_set_categories(self):
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- exp_categories = Index(["c", "b", "a"])
- exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
-
- res = cat.set_categories(["c", "b", "a"], inplace=True)
- tm.assert_index_equal(cat.categories, exp_categories)
- tm.assert_numpy_array_equal(cat.__array__(), exp_values)
- assert res is None
-
- res = cat.set_categories(["a", "b", "c"])
- # cat must be the same as before
- tm.assert_index_equal(cat.categories, exp_categories)
- tm.assert_numpy_array_equal(cat.__array__(), exp_values)
- # only res is changed
- exp_categories_back = Index(["a", "b", "c"])
- tm.assert_index_equal(res.categories, exp_categories_back)
- tm.assert_numpy_array_equal(res.__array__(), exp_values)
-
- # not all "old" included in "new" -> all not included ones are now
- # np.nan
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- res = cat.set_categories(["a"])
- tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0],
- dtype=np.int8))
-
- # still not all "old" in "new"
- res = cat.set_categories(["a", "b", "d"])
- tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0],
- dtype=np.int8))
- tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
-
- # all "old" included in "new"
- cat = cat.set_categories(["a", "b", "c", "d"])
- exp_categories = Index(["a", "b", "c", "d"])
- tm.assert_index_equal(cat.categories, exp_categories)
-
- # internals...
- c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
- tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0],
- dtype=np.int8))
- tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
-
- exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(c.get_values(), exp)
-
- # all "pointers" to '4' must be changed from 3 to 0,...
- c = c.set_categories([4, 3, 2, 1])
-
- # positions are changed
- tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3],
- dtype=np.int8))
-
- # categories are now in new order
- tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
-
- # output is the same
- exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(c.get_values(), exp)
- assert c.min() == 4
- assert c.max() == 1
-
- # set_categories should set the ordering if specified
- c2 = c.set_categories([4, 3, 2, 1], ordered=False)
- assert not c2.ordered
-
- tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
-
- # set_categories should pass thru the ordering
- c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
- assert not c2.ordered
-
- tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
-
- def test_rename_categories(self):
- cat = Categorical(["a", "b", "c", "a"])
-
- # inplace=False: the old one must not be changed
- res = cat.rename_categories([1, 2, 3])
- tm.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1],
- dtype=np.int64))
- tm.assert_index_equal(res.categories, Index([1, 2, 3]))
-
- exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
- tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
-
- exp_cat = Index(["a", "b", "c"])
- tm.assert_index_equal(cat.categories, exp_cat)
- res = cat.rename_categories([1, 2, 3], inplace=True)
-
- # and now inplace
- assert res is None
- tm.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1],
- dtype=np.int64))
- tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
-
- # Lengthen
- with pytest.raises(ValueError):
- cat.rename_categories([1, 2, 3, 4])
-
- # Shorten
- with pytest.raises(ValueError):
- cat.rename_categories([1, 2])
-
- def test_rename_categories_series(self):
- # https://github.com/pandas-dev/pandas/issues/17981
- c = Categorical(['a', 'b'])
- xpr = "Treating Series 'new_categories' as a list-like "
- with tm.assert_produces_warning(FutureWarning) as rec:
- result = c.rename_categories(Series([0, 1]))
-
- assert len(rec) == 1
- assert xpr in str(rec[0].message)
- expected = Categorical([0, 1])
- tm.assert_categorical_equal(result, expected)
-
- def test_rename_categories_dict(self):
- # GH 17336
- cat = Categorical(['a', 'b', 'c', 'd'])
- res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1})
- expected = Index([4, 3, 2, 1])
- tm.assert_index_equal(res.categories, expected)
-
- # Test for inplace
- res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1},
- inplace=True)
- assert res is None
- tm.assert_index_equal(cat.categories, expected)
-
- # Test for dicts of smaller length
- cat = Categorical(['a', 'b', 'c', 'd'])
- res = cat.rename_categories({'a': 1, 'c': 3})
-
- expected = Index([1, 'b', 3, 'd'])
- tm.assert_index_equal(res.categories, expected)
-
- # Test for dicts with bigger length
- cat = Categorical(['a', 'b', 'c', 'd'])
- res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3,
- 'd': 4, 'e': 5, 'f': 6})
- expected = Index([1, 2, 3, 4])
- tm.assert_index_equal(res.categories, expected)
-
- # Test for dicts with no items from old categories
- cat = Categorical(['a', 'b', 'c', 'd'])
- res = cat.rename_categories({'f': 1, 'g': 3})
-
- expected = Index(['a', 'b', 'c', 'd'])
- tm.assert_index_equal(res.categories, expected)
-
- @pytest.mark.parametrize('codes, old, new, expected', [
- ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
- ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
- ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
- ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
- ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
- ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
- ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
- ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
- ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
- ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
- ([-1, -1], [], ['a', 'b'], [-1, -1]),
- ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
- ])
- def test_recode_to_categories(self, codes, old, new, expected):
- codes = np.asanyarray(codes, dtype=np.int8)
- expected = np.asanyarray(expected, dtype=np.int8)
- old = Index(old)
- new = Index(new)
- result = _recode_for_categories(codes, old, new)
- tm.assert_numpy_array_equal(result, expected)
-
- def test_recode_to_categories_large(self):
- N = 1000
- codes = np.arange(N)
- old = Index(codes)
- expected = np.arange(N - 1, -1, -1, dtype=np.int16)
- new = Index(expected)
- result = _recode_for_categories(codes, old, new)
- tm.assert_numpy_array_equal(result, expected)
-
- @pytest.mark.parametrize('values, categories, new_categories', [
- # No NaNs, same cats, same order
- (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
- # No NaNs, same cats, different order
- (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
- # Same, unsorted
- (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
- # No NaNs, same cats, different order
- (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
- # NaNs
- (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
- (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
- (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
- (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
- # Introduce NaNs
- (['a', 'b', 'c'], ['a', 'b'], ['a']),
- (['a', 'b', 'c'], ['a', 'b'], ['b']),
- (['b', 'a', 'c'], ['a', 'b'], ['a']),
- (['b', 'a', 'c'], ['a', 'b'], ['a']),
- # No overlap
- (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
- ])
- @pytest.mark.parametrize('ordered', [True, False])
- def test_set_categories_many(self, values, categories, new_categories,
- ordered):
- c = Categorical(values, categories)
- expected = Categorical(values, new_categories, ordered)
- result = c.set_categories(new_categories, ordered=ordered)
- tm.assert_categorical_equal(result, expected)
-
- def test_reorder_categories(self):
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- old = cat.copy()
- new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"],
- ordered=True)
-
- # first inplace == False
- res = cat.reorder_categories(["c", "b", "a"])
- # cat must be the same as before
- tm.assert_categorical_equal(cat, old)
- # only res is changed
- tm.assert_categorical_equal(res, new)
-
- # inplace == True
- res = cat.reorder_categories(["c", "b", "a"], inplace=True)
- assert res is None
- tm.assert_categorical_equal(cat, new)
-
- # not all "old" included in "new"
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
-
- def f():
- cat.reorder_categories(["a"])
-
- pytest.raises(ValueError, f)
-
- # still not all "old" in "new"
- def f():
- cat.reorder_categories(["a", "b", "d"])
-
- pytest.raises(ValueError, f)
-
- # all "old" included in "new", but too long
- def f():
- cat.reorder_categories(["a", "b", "c", "d"])
-
- pytest.raises(ValueError, f)
-
- def test_add_categories(self):
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- old = cat.copy()
- new = Categorical(["a", "b", "c", "a"],
- categories=["a", "b", "c", "d"], ordered=True)
-
- # first inplace == False
- res = cat.add_categories("d")
- tm.assert_categorical_equal(cat, old)
- tm.assert_categorical_equal(res, new)
-
- res = cat.add_categories(["d"])
- tm.assert_categorical_equal(cat, old)
- tm.assert_categorical_equal(res, new)
-
- # inplace == True
- res = cat.add_categories("d", inplace=True)
- tm.assert_categorical_equal(cat, new)
- assert res is None
-
- # new is in old categories
- def f():
- cat.add_categories(["d"])
-
- pytest.raises(ValueError, f)
-
- # GH 9927
- cat = Categorical(list("abc"), ordered=True)
- expected = Categorical(
- list("abc"), categories=list("abcde"), ordered=True)
- # test with Series, np.array, index, list
- res = cat.add_categories(Series(["d", "e"]))
- tm.assert_categorical_equal(res, expected)
- res = cat.add_categories(np.array(["d", "e"]))
- tm.assert_categorical_equal(res, expected)
- res = cat.add_categories(Index(["d", "e"]))
- tm.assert_categorical_equal(res, expected)
- res = cat.add_categories(["d", "e"])
- tm.assert_categorical_equal(res, expected)
-
- def test_remove_categories(self):
- cat = Categorical(["a", "b", "c", "a"], ordered=True)
- old = cat.copy()
- new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
- ordered=True)
-
- # first inplace == False
- res = cat.remove_categories("c")
- tm.assert_categorical_equal(cat, old)
- tm.assert_categorical_equal(res, new)
-
- res = cat.remove_categories(["c"])
- tm.assert_categorical_equal(cat, old)
- tm.assert_categorical_equal(res, new)
-
- # inplace == True
- res = cat.remove_categories("c", inplace=True)
- tm.assert_categorical_equal(cat, new)
- assert res is None
-
- # removal is not in categories
- def f():
- cat.remove_categories(["c"])
-
- pytest.raises(ValueError, f)
-
- def test_remove_unused_categories(self):
- c = Categorical(["a", "b", "c", "d", "a"],
- categories=["a", "b", "c", "d", "e"])
- exp_categories_all = Index(["a", "b", "c", "d", "e"])
- exp_categories_dropped = Index(["a", "b", "c", "d"])
-
- tm.assert_index_equal(c.categories, exp_categories_all)
-
- res = c.remove_unused_categories()
- tm.assert_index_equal(res.categories, exp_categories_dropped)
- tm.assert_index_equal(c.categories, exp_categories_all)
-
- res = c.remove_unused_categories(inplace=True)
- tm.assert_index_equal(c.categories, exp_categories_dropped)
- assert res is None
-
- # with NaN values (GH11599)
- c = Categorical(["a", "b", "c", np.nan],
- categories=["a", "b", "c", "d", "e"])
- res = c.remove_unused_categories()
- tm.assert_index_equal(res.categories,
- Index(np.array(["a", "b", "c"])))
- exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
- tm.assert_numpy_array_equal(res.codes, exp_codes)
- tm.assert_index_equal(c.categories, exp_categories_all)
-
- val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
- cat = Categorical(values=val, categories=list('ABCDEFG'))
- out = cat.remove_unused_categories()
- tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
- exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
- tm.assert_numpy_array_equal(out.codes, exp_codes)
- assert out.get_values().tolist() == val
-
- alpha = list('abcdefghijklmnopqrstuvwxyz')
- val = np.random.choice(alpha[::2], 10000).astype('object')
- val[np.random.choice(len(val), 100)] = np.nan
-
- cat = Categorical(values=val, categories=alpha)
- out = cat.remove_unused_categories()
- assert out.get_values().tolist() == val.tolist()
-
- def test_nan_handling(self):
-
- # Nans are represented as -1 in codes
- c = Categorical(["a", "b", np.nan, "a"])
- tm.assert_index_equal(c.categories, Index(["a", "b"]))
- tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
- dtype=np.int8))
- c[1] = np.nan
- tm.assert_index_equal(c.categories, Index(["a", "b"]))
- tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0],
- dtype=np.int8))
-
- # Adding nan to categories should make assigned nan point to the
- # category!
- c = Categorical(["a", "b", np.nan, "a"])
- tm.assert_index_equal(c.categories, Index(["a", "b"]))
- tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
- dtype=np.int8))
-
- def test_isna(self):
- exp = np.array([False, False, True])
- c = Categorical(["a", "b", np.nan])
- res = c.isna()
-
- tm.assert_numpy_array_equal(res, exp)
-
- def test_codes_immutable(self):
-
- # Codes should be read only
- c = Categorical(["a", "b", "c", "a", np.nan])
- exp = np.array([0, 1, 2, 0, -1], dtype='int8')
- tm.assert_numpy_array_equal(c.codes, exp)
-
- # Assignments to codes should raise
- def f():
- c.codes = np.array([0, 1, 2, 0, 1], dtype='int8')
-
- pytest.raises(ValueError, f)
-
- # changes in the codes array should raise
- # np 1.6.1 raises RuntimeError rather than ValueError
- codes = c.codes
-
- def f():
- codes[4] = 1
-
- pytest.raises(ValueError, f)
-
- # But even after getting the codes, the original array should still be
- # writeable!
- c[4] = "a"
- exp = np.array([0, 1, 2, 0, 0], dtype='int8')
- tm.assert_numpy_array_equal(c.codes, exp)
- c._codes[4] = 2
- exp = np.array([0, 1, 2, 0, 2], dtype='int8')
- tm.assert_numpy_array_equal(c.codes, exp)
-
- def test_min_max(self):
-
- # unordered cats have no min/max
- cat = Categorical(["a", "b", "c", "d"], ordered=False)
- pytest.raises(TypeError, lambda: cat.min())
- pytest.raises(TypeError, lambda: cat.max())
- cat = Categorical(["a", "b", "c", "d"], ordered=True)
- _min = cat.min()
- _max = cat.max()
- assert _min == "a"
- assert _max == "d"
- cat = Categorical(["a", "b", "c", "d"],
- categories=['d', 'c', 'b', 'a'], ordered=True)
- _min = cat.min()
- _max = cat.max()
- assert _min == "d"
- assert _max == "a"
- cat = Categorical([np.nan, "b", "c", np.nan],
- categories=['d', 'c', 'b', 'a'], ordered=True)
- _min = cat.min()
- _max = cat.max()
- assert np.isnan(_min)
- assert _max == "b"
-
- _min = cat.min(numeric_only=True)
- assert _min == "c"
- _max = cat.max(numeric_only=True)
- assert _max == "b"
-
- cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1],
- ordered=True)
- _min = cat.min()
- _max = cat.max()
- assert np.isnan(_min)
- assert _max == 1
-
- _min = cat.min(numeric_only=True)
- assert _min == 2
- _max = cat.max(numeric_only=True)
- assert _max == 1
-
- def test_unique(self):
- # categories are reordered based on value when ordered=False
- cat = Categorical(["a", "b"])
- exp = Index(["a", "b"])
- res = cat.unique()
- tm.assert_index_equal(res.categories, exp)
- tm.assert_categorical_equal(res, cat)
-
- cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
- res = cat.unique()
- tm.assert_index_equal(res.categories, exp)
- tm.assert_categorical_equal(res, Categorical(exp))
-
- cat = Categorical(["c", "a", "b", "a", "a"],
- categories=["a", "b", "c"])
- exp = Index(["c", "a", "b"])
- res = cat.unique()
- tm.assert_index_equal(res.categories, exp)
- exp_cat = Categorical(exp, categories=['c', 'a', 'b'])
- tm.assert_categorical_equal(res, exp_cat)
-
- # nan must be removed
- cat = Categorical(["b", np.nan, "b", np.nan, "a"],
- categories=["a", "b", "c"])
- res = cat.unique()
- exp = Index(["b", "a"])
- tm.assert_index_equal(res.categories, exp)
- exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
- tm.assert_categorical_equal(res, exp_cat)
-
- def test_unique_ordered(self):
- # keep categories order when ordered=True
- cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
- res = cat.unique()
- exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
- tm.assert_categorical_equal(res, exp_cat)
-
- cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'],
- ordered=True)
- res = cat.unique()
- exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'],
- ordered=True)
- tm.assert_categorical_equal(res, exp_cat)
-
- cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'],
- ordered=True)
- res = cat.unique()
- exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
- tm.assert_categorical_equal(res, exp_cat)
-
- cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'],
- ordered=True)
- res = cat.unique()
- exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'],
- ordered=True)
- tm.assert_categorical_equal(res, exp_cat)
-
- def test_unique_index_series(self):
- c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
- # Categorical.unique sorts categories by appearance order
- # if ordered=False
- exp = Categorical([3, 1, 2], categories=[3, 1, 2])
- tm.assert_categorical_equal(c.unique(), exp)
-
- tm.assert_index_equal(Index(c).unique(), Index(exp))
- tm.assert_categorical_equal(Series(c).unique(), exp)
-
- c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
- exp = Categorical([1, 2], categories=[1, 2])
- tm.assert_categorical_equal(c.unique(), exp)
- tm.assert_index_equal(Index(c).unique(), Index(exp))
- tm.assert_categorical_equal(Series(c).unique(), exp)
-
- c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
- # Categorical.unique keeps categories order if ordered=True
- exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
- tm.assert_categorical_equal(c.unique(), exp)
-
- tm.assert_index_equal(Index(c).unique(), Index(exp))
- tm.assert_categorical_equal(Series(c).unique(), exp)
-
- def test_mode(self):
- s = Categorical([1, 1, 2, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1],
- ordered=True)
- res = s.mode()
- exp = Categorical([5], categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
- s = Categorical([1, 1, 1, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1],
- ordered=True)
- res = s.mode()
- exp = Categorical([5, 1], categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
- s = Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
- ordered=True)
- res = s.mode()
- exp = Categorical([5, 4, 3, 2, 1],
- categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
- # NaN should not become the mode!
- s = Categorical([np.nan, np.nan, np.nan, 4, 5],
- categories=[5, 4, 3, 2, 1], ordered=True)
- res = s.mode()
- exp = Categorical([5, 4], categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
- s = Categorical([np.nan, np.nan, np.nan, 4, 5, 4],
- categories=[5, 4, 3, 2, 1], ordered=True)
- res = s.mode()
- exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
- s = Categorical([np.nan, np.nan, 4, 5, 4], categories=[5, 4, 3, 2, 1],
- ordered=True)
- res = s.mode()
- exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
- tm.assert_categorical_equal(res, exp)
-
- def test_sort_values(self):
-
- # unordered cats are sortable
- cat = Categorical(["a", "b", "b", "a"], ordered=False)
- cat.sort_values()
-
- cat = Categorical(["a", "c", "b", "d"], ordered=True)
-
- # sort_values
- res = cat.sort_values()
- exp = np.array(["a", "b", "c", "d"], dtype=object)
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, cat.categories)
-
- cat = Categorical(["a", "c", "b", "d"],
- categories=["a", "b", "c", "d"], ordered=True)
- res = cat.sort_values()
- exp = np.array(["a", "b", "c", "d"], dtype=object)
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, cat.categories)
-
- res = cat.sort_values(ascending=False)
- exp = np.array(["d", "c", "b", "a"], dtype=object)
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, cat.categories)
-
- # sort (inplace order)
- cat1 = cat.copy()
- cat1.sort_values(inplace=True)
- exp = np.array(["a", "b", "c", "d"], dtype=object)
- tm.assert_numpy_array_equal(cat1.__array__(), exp)
- tm.assert_index_equal(res.categories, cat.categories)
-
- # reverse
- cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
- res = cat.sort_values(ascending=False)
- exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
- exp_categories = Index(["a", "b", "c", "d"])
- tm.assert_numpy_array_equal(res.__array__(), exp_val)
- tm.assert_index_equal(res.categories, exp_categories)
-
- def test_sort_values_na_position(self):
- # see gh-12882
- cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
- exp_categories = Index([2, 5])
-
- exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
- res = cat.sort_values() # default arguments
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, exp_categories)
-
- exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
- res = cat.sort_values(ascending=True, na_position='first')
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, exp_categories)
-
- exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
- res = cat.sort_values(ascending=False, na_position='first')
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, exp_categories)
-
- exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
- res = cat.sort_values(ascending=True, na_position='last')
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, exp_categories)
-
- exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
- res = cat.sort_values(ascending=False, na_position='last')
- tm.assert_numpy_array_equal(res.__array__(), exp)
- tm.assert_index_equal(res.categories, exp_categories)
-
- cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
- res = cat.sort_values(ascending=False, na_position='last')
- exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
- exp_categories = Index(["a", "b", "c", "d"])
- tm.assert_numpy_array_equal(res.__array__(), exp_val)
- tm.assert_index_equal(res.categories, exp_categories)
-
- cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
- res = cat.sort_values(ascending=False, na_position='first')
- exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
- exp_categories = Index(["a", "b", "c", "d"])
- tm.assert_numpy_array_equal(res.__array__(), exp_val)
- tm.assert_index_equal(res.categories, exp_categories)
-
- def test_slicing_directly(self):
- cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
- sliced = cat[3]
- assert sliced == "d"
- sliced = cat[3:5]
- expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd'])
- tm.assert_numpy_array_equal(sliced._codes, expected._codes)
- tm.assert_index_equal(sliced.categories, expected.categories)
-
- def test_set_item_nan(self):
- cat = Categorical([1, 2, 3])
- cat[1] = np.nan
-
- exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
- tm.assert_categorical_equal(cat, exp)
-
- def test_shift(self):
- # GH 9416
- cat = Categorical(['a', 'b', 'c', 'd', 'a'])
-
- # shift forward
- sp1 = cat.shift(1)
- xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd'])
- tm.assert_categorical_equal(sp1, xp1)
- tm.assert_categorical_equal(cat[:-1], sp1[1:])
-
- # shift back
- sn2 = cat.shift(-2)
- xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan],
- categories=['a', 'b', 'c', 'd'])
- tm.assert_categorical_equal(sn2, xp2)
- tm.assert_categorical_equal(cat[2:], sn2[:-2])
-
- # shift by zero
- tm.assert_categorical_equal(cat, cat.shift(0))
-
- def test_nbytes(self):
- cat = Categorical([1, 2, 3])
- exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
- assert cat.nbytes == exp
-
- def test_memory_usage(self):
- cat = Categorical([1, 2, 3])
-
- # .categories is an index, so we include the hashtable
- assert 0 < cat.nbytes <= cat.memory_usage()
- assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
-
- cat = Categorical(['foo', 'foo', 'bar'])
- assert cat.memory_usage(deep=True) > cat.nbytes
-
- if not PYPY:
- # sys.getsizeof will call the .memory_usage with
- # deep=True, and add on some GC overhead
- diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
- assert abs(diff) < 100
-
- def test_searchsorted(self):
- # https://github.com/pandas-dev/pandas/issues/8420
- # https://github.com/pandas-dev/pandas/issues/14522
-
- c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
- categories=['cheese', 'milk', 'apple', 'bread'],
- ordered=True)
- s1 = Series(c1)
- c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
- categories=['cheese', 'milk', 'apple', 'bread'],
- ordered=False)
- s2 = Series(c2)
-
- # Searching for single item argument, side='left' (default)
- res_cat = c1.searchsorted('apple')
- res_ser = s1.searchsorted('apple')
- exp = np.array([2], dtype=np.intp)
- tm.assert_numpy_array_equal(res_cat, exp)
- tm.assert_numpy_array_equal(res_ser, exp)
-
- # Searching for single item array, side='left' (default)
- res_cat = c1.searchsorted(['bread'])
- res_ser = s1.searchsorted(['bread'])
- exp = np.array([3], dtype=np.intp)
- tm.assert_numpy_array_equal(res_cat, exp)
- tm.assert_numpy_array_equal(res_ser, exp)
-
- # Searching for several items array, side='right'
- res_cat = c1.searchsorted(['apple', 'bread'], side='right')
- res_ser = s1.searchsorted(['apple', 'bread'], side='right')
- exp = np.array([3, 5], dtype=np.intp)
- tm.assert_numpy_array_equal(res_cat, exp)
- tm.assert_numpy_array_equal(res_ser, exp)
-
- # Searching for a single value that is not from the Categorical
- pytest.raises(ValueError, lambda: c1.searchsorted('cucumber'))
- pytest.raises(ValueError, lambda: s1.searchsorted('cucumber'))
-
- # Searching for multiple values one of each is not from the Categorical
- pytest.raises(ValueError,
- lambda: c1.searchsorted(['bread', 'cucumber']))
- pytest.raises(ValueError,
- lambda: s1.searchsorted(['bread', 'cucumber']))
-
- # searchsorted call for unordered Categorical
- pytest.raises(ValueError, lambda: c2.searchsorted('apple'))
- pytest.raises(ValueError, lambda: s2.searchsorted('apple'))
-
- with tm.assert_produces_warning(FutureWarning):
- res = c1.searchsorted(v=['bread'])
- exp = np.array([3], dtype=np.intp)
- tm.assert_numpy_array_equal(res, exp)
-
- def test_deprecated_labels(self):
- # TODO: labels is deprecated and should be removed in 0.18 or 2017,
- # whatever is earlier
- cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
- exp = cat.codes
- with tm.assert_produces_warning(FutureWarning):
- res = cat.labels
- tm.assert_numpy_array_equal(res, exp)
-
- def test_datetime_categorical_comparison(self):
- dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
- tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
- np.array([False, True, True]))
- tm.assert_numpy_array_equal(dt_cat[0] < dt_cat,
- np.array([False, True, True]))
-
- def test_reflected_comparison_with_scalars(self):
- # GH8658
- cat = Categorical([1, 2, 3], ordered=True)
- tm.assert_numpy_array_equal(cat > cat[0],
- np.array([False, True, True]))
- tm.assert_numpy_array_equal(cat[0] < cat,
- np.array([False, True, True]))
-
- def test_comparison_with_unknown_scalars(self):
- # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
- # and following comparisons with scalars not in categories should raise
- # for unequal comps, but not for equal/not equal
- cat = Categorical([1, 2, 3], ordered=True)
-
- pytest.raises(TypeError, lambda: cat < 4)
- pytest.raises(TypeError, lambda: cat > 4)
- pytest.raises(TypeError, lambda: 4 < cat)
- pytest.raises(TypeError, lambda: 4 > cat)
-
- tm.assert_numpy_array_equal(cat == 4,
- np.array([False, False, False]))
- tm.assert_numpy_array_equal(cat != 4,
- np.array([True, True, True]))
-
- def test_map(self):
- c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True)
- result = c.map(lambda x: x.lower())
- exp = Categorical(list('ababc'), categories=list('cba'), ordered=True)
- tm.assert_categorical_equal(result, exp)
-
- c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False)
- result = c.map(lambda x: x.lower())
- exp = Categorical(list('ababc'), categories=list('abc'), ordered=False)
- tm.assert_categorical_equal(result, exp)
-
- result = c.map(lambda x: 1)
- # GH 12766: Return an index not an array
- tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
-
- def test_validate_inplace(self):
- cat = Categorical(['A', 'B', 'B', 'C', 'A'])
- invalid_values = [1, "True", [1, 2, 3], 5.0]
-
- for value in invalid_values:
- with pytest.raises(ValueError):
- cat.set_ordered(value=True, inplace=value)
-
- with pytest.raises(ValueError):
- cat.as_ordered(inplace=value)
-
- with pytest.raises(ValueError):
- cat.as_unordered(inplace=value)
-
- with pytest.raises(ValueError):
- cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value)
-
- with pytest.raises(ValueError):
- cat.rename_categories(['X', 'Y', 'Z'], inplace=value)
-
- with pytest.raises(ValueError):
- cat.reorder_categories(
- ['X', 'Y', 'Z'], ordered=True, inplace=value)
-
- with pytest.raises(ValueError):
- cat.add_categories(
- new_categories=['D', 'E', 'F'], inplace=value)
-
- with pytest.raises(ValueError):
- cat.remove_categories(removals=['D', 'E', 'F'], inplace=value)
-
- with pytest.raises(ValueError):
- cat.remove_unused_categories(inplace=value)
-
- with pytest.raises(ValueError):
- cat.sort_values(inplace=value)
-
- @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
- def test_imaginary(self):
- values = [1, 2, 3 + 1j]
- c1 = Categorical(values)
- tm.assert_index_equal(c1.categories, Index(values))
- tm.assert_numpy_array_equal(np.array(c1), np.array(values))
-
-
-class TestCategoricalAsBlock(object):
-
- def setup_method(self, method):
- self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
-
- df = DataFrame({'value': np.random.randint(0, 10000, 100)})
- labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
- cat_labels = Categorical(labels, labels)
-
- df = df.sort_values(by=['value'], ascending=True)
- df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
- right=False, labels=cat_labels)
- self.cat = df
-
- def test_dtypes(self):
-
- # GH8143
- index = ['cat', 'obj', 'num']
- cat = Categorical(['a', 'b', 'c'])
- obj = Series(['a', 'b', 'c'])
- num = Series([1, 2, 3])
- df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
-
- result = df.dtypes == 'object'
- expected = Series([False, True, False], index=index)
- tm.assert_series_equal(result, expected)
-
- result = df.dtypes == 'int64'
- expected = Series([False, False, True], index=index)
- tm.assert_series_equal(result, expected)
-
- result = df.dtypes == 'category'
- expected = Series([True, False, False], index=index)
- tm.assert_series_equal(result, expected)
-
- def test_codes_dtypes(self):
-
- # GH 8453
- result = Categorical(['foo', 'bar', 'baz'])
- assert result.codes.dtype == 'int8'
-
- result = Categorical(['foo%05d' % i for i in range(400)])
- assert result.codes.dtype == 'int16'
-
- result = Categorical(['foo%05d' % i for i in range(40000)])
- assert result.codes.dtype == 'int32'
-
- # adding cats
- result = Categorical(['foo', 'bar', 'baz'])
- assert result.codes.dtype == 'int8'
- result = result.add_categories(['foo%05d' % i for i in range(400)])
- assert result.codes.dtype == 'int16'
-
- # removing cats
- result = result.remove_categories(['foo%05d' % i for i in range(300)])
- assert result.codes.dtype == 'int8'
-
- def test_basic(self):
-
- # test basic creation / coercion of categoricals
- s = Series(self.factor, name='A')
- assert s.dtype == 'category'
- assert len(s) == len(self.factor)
- str(s.values)
- str(s)
-
- # in a frame
- df = DataFrame({'A': self.factor})
- result = df['A']
- tm.assert_series_equal(result, s)
- result = df.iloc[:, 0]
- tm.assert_series_equal(result, s)
- assert len(df) == len(self.factor)
- str(df.values)
- str(df)
-
- df = DataFrame({'A': s})
- result = df['A']
- tm.assert_series_equal(result, s)
- assert len(df) == len(self.factor)
- str(df.values)
- str(df)
-
- # multiples
- df = DataFrame({'A': s, 'B': s, 'C': 1})
- result1 = df['A']
- result2 = df['B']
- tm.assert_series_equal(result1, s)
- tm.assert_series_equal(result2, s, check_names=False)
- assert result2.name == 'B'
- assert len(df) == len(self.factor)
- str(df.values)
- str(df)
-
- # GH8623
- x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
- [1, 'John P. Doe']],
- columns=['person_id', 'person_name'])
- x['person_name'] = Categorical(x.person_name
- ) # doing this breaks transform
-
- expected = x.iloc[0].person_name
- result = x.person_name.iloc[0]
- assert result == expected
-
- result = x.person_name[0]
- assert result == expected
-
- result = x.person_name.loc[0]
- assert result == expected
-
- def test_creation_astype(self):
- l = ["a", "b", "c", "a"]
- s = Series(l)
- exp = Series(Categorical(l))
- res = s.astype('category')
- tm.assert_series_equal(res, exp)
-
- l = [1, 2, 3, 1]
- s = Series(l)
- exp = Series(Categorical(l))
- res = s.astype('category')
- tm.assert_series_equal(res, exp)
-
- df = DataFrame({"cats": [1, 2, 3, 4, 5, 6],
- "vals": [1, 2, 3, 4, 5, 6]})
- cats = Categorical([1, 2, 3, 4, 5, 6])
- exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
- df["cats"] = df["cats"].astype("category")
- tm.assert_frame_equal(exp_df, df)
-
- df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
- "vals": [1, 2, 3, 4, 5, 6]})
- cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
- exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
- df["cats"] = df["cats"].astype("category")
- tm.assert_frame_equal(exp_df, df)
-
- # with keywords
- l = ["a", "b", "c", "a"]
- s = Series(l)
- exp = Series(Categorical(l, ordered=True))
- res = s.astype(CategoricalDtype(None, ordered=True))
- tm.assert_series_equal(res, exp)
-
- exp = Series(Categorical(l, categories=list('abcdef'), ordered=True))
- res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
- tm.assert_series_equal(res, exp)
-
- @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
- def test_empty_astype(self, columns):
- # GH 18004
- msg = '> 1 ndim Categorical are not supported at this time'
- with tm.assert_raises_regex(NotImplementedError, msg):
- DataFrame(columns=columns).astype('category')
-
- def test_construction_series(self):
-
- l = [1, 2, 3, 1]
- exp = Series(l).astype('category')
- res = Series(l, dtype='category')
- tm.assert_series_equal(res, exp)
-
- l = ["a", "b", "c", "a"]
- exp = Series(l).astype('category')
- res = Series(l, dtype='category')
- tm.assert_series_equal(res, exp)
-
- # insert into frame with different index
- # GH 8076
- index = date_range('20000101', periods=3)
- expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
- categories=['a', 'b', 'c']))
- expected.index = index
-
- expected = DataFrame({'x': expected})
- df = DataFrame(
- {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index)
- tm.assert_frame_equal(df, expected)
-
- def test_construction_frame(self):
-
- # GH8626
-
- # dict creation
- df = DataFrame({'A': list('abc')}, dtype='category')
- expected = Series(list('abc'), dtype='category', name='A')
- tm.assert_series_equal(df['A'], expected)
-
- # to_frame
- s = Series(list('abc'), dtype='category')
- result = s.to_frame()
- expected = Series(list('abc'), dtype='category', name=0)
- tm.assert_series_equal(result[0], expected)
- result = s.to_frame(name='foo')
- expected = Series(list('abc'), dtype='category', name='foo')
- tm.assert_series_equal(result['foo'], expected)
-
- # list-like creation
- df = DataFrame(list('abc'), dtype='category')
- expected = Series(list('abc'), dtype='category', name=0)
- tm.assert_series_equal(df[0], expected)
-
- # ndim != 1
- df = DataFrame([Categorical(list('abc'))])
- expected = DataFrame({0: Series(list('abc'), dtype='category')})
- tm.assert_frame_equal(df, expected)
-
- df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))])
- expected = DataFrame({0: Series(list('abc'), dtype='category'),
- 1: Series(list('abd'), dtype='category')},
- columns=[0, 1])
- tm.assert_frame_equal(df, expected)
-
- # mixed
- df = DataFrame([Categorical(list('abc')), list('def')])
- expected = DataFrame({0: Series(list('abc'), dtype='category'),
- 1: list('def')}, columns=[0, 1])
- tm.assert_frame_equal(df, expected)
-
- # invalid (shape)
- pytest.raises(ValueError,
- lambda: DataFrame([Categorical(list('abc')),
- Categorical(list('abdefg'))]))
-
- # ndim > 1
- pytest.raises(NotImplementedError,
- lambda: Categorical(np.array([list('abcd')])))
-
- def test_reshaping(self):
-
- with catch_warnings(record=True):
- p = tm.makePanel()
- p['str'] = 'foo'
- df = p.to_frame()
-
- df['category'] = df['str'].astype('category')
- result = df['category'].unstack()
-
- c = Categorical(['foo'] * len(p.major_axis))
- expected = DataFrame({'A': c.copy(),
- 'B': c.copy(),
- 'C': c.copy(),
- 'D': c.copy()},
- columns=Index(list('ABCD'), name='minor'),
- index=p.major_axis.set_names('major'))
- tm.assert_frame_equal(result, expected)
-
- def test_reindex(self):
-
- index = date_range('20000101', periods=3)
-
- # reindexing to an invalid Categorical
- s = Series(['a', 'b', 'c'], dtype='category')
- result = s.reindex(index)
- expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
- categories=['a', 'b', 'c']))
- expected.index = index
- tm.assert_series_equal(result, expected)
-
- # partial reindexing
- expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b',
- 'c']))
- expected.index = [1, 2]
- result = s.reindex([1, 2])
- tm.assert_series_equal(result, expected)
-
- expected = Series(Categorical(
- values=['c', np.nan], categories=['a', 'b', 'c']))
- expected.index = [2, 3]
- result = s.reindex([2, 3])
- tm.assert_series_equal(result, expected)
-
- def test_sideeffects_free(self):
- # Passing a categorical to a Series and then changing values in either
- # the series or the categorical should not change the values in the
- # other one, IF you specify copy!
- cat = Categorical(["a", "b", "c", "a"])
- s = Series(cat, copy=True)
- assert s.cat is not cat
- s.cat.categories = [1, 2, 3]
- exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
- exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
- tm.assert_numpy_array_equal(s.__array__(), exp_s)
- tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
-
- # setting
- s[0] = 2
- exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(s.__array__(), exp_s2)
- tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
-
- # however, copy is False by default
- # so this WILL change values
- cat = Categorical(["a", "b", "c", "a"])
- s = Series(cat)
- assert s.values is cat
- s.cat.categories = [1, 2, 3]
- exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(s.__array__(), exp_s)
- tm.assert_numpy_array_equal(cat.__array__(), exp_s)
-
- s[0] = 2
- exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(s.__array__(), exp_s2)
- tm.assert_numpy_array_equal(cat.__array__(), exp_s2)
-
- def test_nan_handling(self):
-
- # NaNs are represented as -1 in labels
- s = Series(Categorical(["a", "b", np.nan, "a"]))
- tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
- tm.assert_numpy_array_equal(s.values.codes,
- np.array([0, 1, -1, 0], dtype=np.int8))
-
- def test_cat_accessor(self):
- s = Series(Categorical(["a", "b", np.nan, "a"]))
- tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
- assert not s.cat.ordered, False
-
- exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
- s.cat.set_categories(["b", "a"], inplace=True)
- tm.assert_categorical_equal(s.values, exp)
-
- res = s.cat.set_categories(["b", "a"])
- tm.assert_categorical_equal(res.values, exp)
-
- s[:] = "a"
- s = s.cat.remove_unused_categories()
- tm.assert_index_equal(s.cat.categories, Index(["a"]))
-
- def test_sequence_like(self):
-
- # GH 7839
- # make sure can iterate
- df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
- "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
- df['grade'] = Categorical(df['raw_grade'])
-
- # basic sequencing testing
- result = list(df.grade.values)
- expected = np.array(df.grade.values).tolist()
- tm.assert_almost_equal(result, expected)
-
- # iteration
- for t in df.itertuples(index=False):
- str(t)
-
- for row, s in df.iterrows():
- str(s)
-
- for c, col in df.iteritems():
- str(s)
-
- def test_series_delegations(self):
-
- # invalid accessor
- pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat)
- tm.assert_raises_regex(
- AttributeError,
- r"Can only use .cat accessor with a 'category' dtype",
- lambda: Series([1, 2, 3]).cat)
- pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat)
- pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat)
- pytest.raises(AttributeError,
- lambda: Series([Timestamp('20130101')]).cat)
-
- # Series should delegate calls to '.categories', '.codes', '.ordered'
- # and the methods '.set_categories()' 'drop_unused_categories()' to the
- # categorical
- s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
- exp_categories = Index(["a", "b", "c"])
- tm.assert_index_equal(s.cat.categories, exp_categories)
- s.cat.categories = [1, 2, 3]
- exp_categories = Index([1, 2, 3])
- tm.assert_index_equal(s.cat.categories, exp_categories)
-
- exp_codes = Series([0, 1, 2, 0], dtype='int8')
- tm.assert_series_equal(s.cat.codes, exp_codes)
-
- assert s.cat.ordered
- s = s.cat.as_unordered()
- assert not s.cat.ordered
- s.cat.as_ordered(inplace=True)
- assert s.cat.ordered
-
- # reorder
- s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
- exp_categories = Index(["c", "b", "a"])
- exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
- s = s.cat.set_categories(["c", "b", "a"])
- tm.assert_index_equal(s.cat.categories, exp_categories)
- tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
- tm.assert_numpy_array_equal(s.__array__(), exp_values)
-
- # remove unused categories
- s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"
- ]))
- exp_categories = Index(["a", "b"])
- exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
- s = s.cat.remove_unused_categories()
- tm.assert_index_equal(s.cat.categories, exp_categories)
- tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
- tm.assert_numpy_array_equal(s.__array__(), exp_values)
-
- # This method is likely to be confused, so test that it raises an error
- # on wrong inputs:
- def f():
- s.set_categories([4, 3, 2, 1])
-
- pytest.raises(Exception, f)
- # right: s.cat.set_categories([4,3,2,1])
-
- def test_series_functions_no_warnings(self):
- df = DataFrame({'value': np.random.randint(0, 100, 20)})
- labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
- with tm.assert_produces_warning(False):
- df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
- labels=labels)
-
- def test_assignment_to_dataframe(self):
- # assignment
- df = DataFrame({'value': np.array(
- np.random.randint(0, 10000, 100), dtype='int32')})
- labels = Categorical(["{0} - {1}".format(i, i + 499)
- for i in range(0, 10000, 500)])
-
- df = df.sort_values(by=['value'], ascending=True)
- s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
- d = s.values
- df['D'] = d
- str(df)
-
- result = df.dtypes
- expected = Series(
- [np.dtype('int32'), CategoricalDtype(categories=labels,
- ordered=False)],
- index=['value', 'D'])
- tm.assert_series_equal(result, expected)
-
- df['E'] = s
- str(df)
-
- result = df.dtypes
- expected = Series([np.dtype('int32'),
- CategoricalDtype(categories=labels, ordered=False),
- CategoricalDtype(categories=labels, ordered=False)],
- index=['value', 'D', 'E'])
- tm.assert_series_equal(result, expected)
-
- result1 = df['D']
- result2 = df['E']
- tm.assert_categorical_equal(result1._data._block.values, d)
-
- # sorting
- s.name = 'E'
- tm.assert_series_equal(result2.sort_index(), s.sort_index())
-
- cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
- df = DataFrame(Series(cat))
-
- def test_describe(self):
-
- # Categoricals should not show up together with numerical columns
- result = self.cat.describe()
- assert len(result.columns) == 1
-
- # In a frame, describe() for the cat should be the same as for string
- # arrays (count, unique, top, freq)
-
- cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
- ordered=True)
- s = Series(cat)
- result = s.describe()
- expected = Series([4, 2, "b", 3],
- index=['count', 'unique', 'top', 'freq'])
- tm.assert_series_equal(result, expected)
-
- cat = Series(Categorical(["a", "b", "c", "c"]))
- df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
- res = df3.describe()
- tm.assert_numpy_array_equal(res["cat"].values, res["s"].values)
-
- def test_repr(self):
- a = Series(Categorical([1, 2, 3, 4]))
- exp = u("0 1\n1 2\n2 3\n3 4\n" +
- "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
-
- assert exp == a.__unicode__()
-
- a = Series(Categorical(["a", "b"] * 25))
- exp = u("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" +
- "Length: 50, dtype: category\nCategories (2, object): [a, b]")
- with option_context("display.max_rows", 5):
- assert exp == repr(a)
-
- levs = list("abcdefghijklmnopqrstuvwxyz")
- a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
- exp = u("0 a\n1 b\n" + "dtype: category\n"
- "Categories (26, object): [a < b < c < d ... w < x < y < z]")
- assert exp == a.__unicode__()
-
- def test_categorical_repr(self):
- c = Categorical([1, 2, 3])
- exp = """[1, 2, 3]
-Categories (3, int64): [1, 2, 3]"""
-
- assert repr(c) == exp
-
- c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
- exp = """[1, 2, 3, 1, 2, 3]
-Categories (3, int64): [1, 2, 3]"""
-
- assert repr(c) == exp
-
- c = Categorical([1, 2, 3, 4, 5] * 10)
- exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
-Length: 50
-Categories (5, int64): [1, 2, 3, 4, 5]"""
-
- assert repr(c) == exp
-
- c = Categorical(np.arange(20))
- exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
-Length: 20
-Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
-
- assert repr(c) == exp
-
- def test_categorical_repr_ordered(self):
- c = Categorical([1, 2, 3], ordered=True)
- exp = """[1, 2, 3]
-Categories (3, int64): [1 < 2 < 3]"""
-
- assert repr(c) == exp
-
- c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
- exp = """[1, 2, 3, 1, 2, 3]
-Categories (3, int64): [1 < 2 < 3]"""
-
- assert repr(c) == exp
-
- c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
- exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
-Length: 50
-Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
-
- assert repr(c) == exp
-
- c = Categorical(np.arange(20), ordered=True)
- exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
-Length: 20
-Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
-
- assert repr(c) == exp
-
- def test_categorical_repr_datetime(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- c = Categorical(idx)
-
- # TODO(wesm): exceeding 80 characters in the console is not good
- # behavior
- exp = (
- "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
- "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
- "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
- "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
- " 2011-01-01 12:00:00, "
- "2011-01-01 13:00:00]""")
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = (
- "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
- "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
- "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
- "2011-01-01 13:00:00]\n"
- "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
- "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
- " 2011-01-01 12:00:00, "
- "2011-01-01 13:00:00]")
-
- assert repr(c) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- c = Categorical(idx)
- exp = (
- "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
- "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
- "2011-01-01 13:00:00-05:00]\n"
- "Categories (5, datetime64[ns, US/Eastern]): "
- "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
- " "
- "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
- " "
- "2011-01-01 13:00:00-05:00]")
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = (
- "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
- "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
- "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
- "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
- "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
- "Categories (5, datetime64[ns, US/Eastern]): "
- "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
- " "
- "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
- " "
- "2011-01-01 13:00:00-05:00]")
-
- assert repr(c) == exp
-
- def test_categorical_repr_datetime_ordered(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- c = Categorical(idx, ordered=True)
- exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
-Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
- 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
-Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
- 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
-
- assert repr(c) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- c = Categorical(idx, ordered=True)
- exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
-Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
- 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
- 2011-01-01 13:00:00-05:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
-Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
- 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
- 2011-01-01 13:00:00-05:00]""" # noqa
-
- assert repr(c) == exp
-
- def test_categorical_repr_period(self):
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- c = Categorical(idx)
- exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
-Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
- 2011-01-01 13:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
-Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
- 2011-01-01 13:00]""" # noqa
-
- assert repr(c) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- c = Categorical(idx)
- exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
-Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
-Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa
-
- assert repr(c) == exp
-
- def test_categorical_repr_period_ordered(self):
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- c = Categorical(idx, ordered=True)
- exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
-Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
- 2011-01-01 13:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
-Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
- 2011-01-01 13:00]""" # noqa
-
- assert repr(c) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- c = Categorical(idx, ordered=True)
- exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
-Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
-Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa
-
- assert repr(c) == exp
-
- def test_categorical_repr_timedelta(self):
- idx = timedelta_range('1 days', periods=5)
- c = Categorical(idx)
- exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
-Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
-Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa
-
- assert repr(c) == exp
-
- idx = timedelta_range('1 hours', periods=20)
- c = Categorical(idx)
- exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
-Length: 20
-Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
- 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
- 18 days 01:00:00, 19 days 01:00:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx)
- exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
-Length: 40
-Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
- 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
- 18 days 01:00:00, 19 days 01:00:00]""" # noqa
-
- assert repr(c) == exp
-
- def test_categorical_repr_timedelta_ordered(self):
- idx = timedelta_range('1 days', periods=5)
- c = Categorical(idx, ordered=True)
- exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
-Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
-Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
-
- assert repr(c) == exp
-
- idx = timedelta_range('1 hours', periods=20)
- c = Categorical(idx, ordered=True)
- exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
-Length: 20
-Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
- 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
- 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
-
- assert repr(c) == exp
-
- c = Categorical(idx.append(idx), categories=idx, ordered=True)
- exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
-Length: 40
-Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
- 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
- 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
-
- assert repr(c) == exp
-
- def test_categorical_series_repr(self):
- s = Series(Categorical([1, 2, 3]))
- exp = """0 1
-1 2
-2 3
-dtype: category
-Categories (3, int64): [1, 2, 3]"""
-
- assert repr(s) == exp
-
- s = Series(Categorical(np.arange(10)))
- exp = """0 0
-1 1
-2 2
-3 3
-4 4
-5 5
-6 6
-7 7
-8 8
-9 9
-dtype: category
-Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_ordered(self):
- s = Series(Categorical([1, 2, 3], ordered=True))
- exp = """0 1
-1 2
-2 3
-dtype: category
-Categories (3, int64): [1 < 2 < 3]"""
-
- assert repr(s) == exp
-
- s = Series(Categorical(np.arange(10), ordered=True))
- exp = """0 0
-1 1
-2 2
-3 3
-4 4
-5 5
-6 6
-7 7
-8 8
-9 9
-dtype: category
-Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_datetime(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- s = Series(Categorical(idx))
- exp = """0 2011-01-01 09:00:00
-1 2011-01-01 10:00:00
-2 2011-01-01 11:00:00
-3 2011-01-01 12:00:00
-4 2011-01-01 13:00:00
-dtype: category
-Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00,
- 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa
-
- assert repr(s) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- s = Series(Categorical(idx))
- exp = """0 2011-01-01 09:00:00-05:00
-1 2011-01-01 10:00:00-05:00
-2 2011-01-01 11:00:00-05:00
-3 2011-01-01 12:00:00-05:00
-4 2011-01-01 13:00:00-05:00
-dtype: category
-Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,
- 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,
- 2011-01-01 13:00:00-05:00]""" # noqa
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_datetime_ordered(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- s = Series(Categorical(idx, ordered=True))
- exp = """0 2011-01-01 09:00:00
-1 2011-01-01 10:00:00
-2 2011-01-01 11:00:00
-3 2011-01-01 12:00:00
-4 2011-01-01 13:00:00
-dtype: category
-Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
- 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
-
- assert repr(s) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- s = Series(Categorical(idx, ordered=True))
- exp = """0 2011-01-01 09:00:00-05:00
-1 2011-01-01 10:00:00-05:00
-2 2011-01-01 11:00:00-05:00
-3 2011-01-01 12:00:00-05:00
-4 2011-01-01 13:00:00-05:00
-dtype: category
-Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
- 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
- 2011-01-01 13:00:00-05:00]""" # noqa
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_period(self):
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- s = Series(Categorical(idx))
- exp = """0 2011-01-01 09:00
-1 2011-01-01 10:00
-2 2011-01-01 11:00
-3 2011-01-01 12:00
-4 2011-01-01 13:00
-dtype: category
-Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
- 2011-01-01 13:00]""" # noqa
-
- assert repr(s) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- s = Series(Categorical(idx))
- exp = """0 2011-01
-1 2011-02
-2 2011-03
-3 2011-04
-4 2011-05
-dtype: category
-Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_period_ordered(self):
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- s = Series(Categorical(idx, ordered=True))
- exp = """0 2011-01-01 09:00
-1 2011-01-01 10:00
-2 2011-01-01 11:00
-3 2011-01-01 12:00
-4 2011-01-01 13:00
-dtype: category
-Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
- 2011-01-01 13:00]""" # noqa
-
- assert repr(s) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- s = Series(Categorical(idx, ordered=True))
- exp = """0 2011-01
-1 2011-02
-2 2011-03
-3 2011-04
-4 2011-05
-dtype: category
-Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_timedelta(self):
- idx = timedelta_range('1 days', periods=5)
- s = Series(Categorical(idx))
- exp = """0 1 days
-1 2 days
-2 3 days
-3 4 days
-4 5 days
-dtype: category
-Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
-
- assert repr(s) == exp
-
- idx = timedelta_range('1 hours', periods=10)
- s = Series(Categorical(idx))
- exp = """0 0 days 01:00:00
-1 1 days 01:00:00
-2 2 days 01:00:00
-3 3 days 01:00:00
-4 4 days 01:00:00
-5 5 days 01:00:00
-6 6 days 01:00:00
-7 7 days 01:00:00
-8 8 days 01:00:00
-9 9 days 01:00:00
-dtype: category
-Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
- 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00,
- 8 days 01:00:00, 9 days 01:00:00]""" # noqa
-
- assert repr(s) == exp
-
- def test_categorical_series_repr_timedelta_ordered(self):
- idx = timedelta_range('1 days', periods=5)
- s = Series(Categorical(idx, ordered=True))
- exp = """0 1 days
-1 2 days
-2 3 days
-3 4 days
-4 5 days
-dtype: category
-Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
-
- assert repr(s) == exp
-
- idx = timedelta_range('1 hours', periods=10)
- s = Series(Categorical(idx, ordered=True))
- exp = """0 0 days 01:00:00
-1 1 days 01:00:00
-2 2 days 01:00:00
-3 3 days 01:00:00
-4 4 days 01:00:00
-5 5 days 01:00:00
-6 6 days 01:00:00
-7 7 days 01:00:00
-8 8 days 01:00:00
-9 9 days 01:00:00
-dtype: category
-Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
- 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 <
- 8 days 01:00:00 < 9 days 01:00:00]""" # noqa
-
- assert repr(s) == exp
-
- def test_categorical_index_repr(self):
- idx = CategoricalIndex(Categorical([1, 2, 3]))
- exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa
- assert repr(idx) == exp
-
- i = CategoricalIndex(Categorical(np.arange(10)))
- exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- def test_categorical_index_repr_ordered(self):
- i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
- exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa
- assert repr(i) == exp
-
- i = CategoricalIndex(Categorical(np.arange(10), ordered=True))
- exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa
- assert repr(i) == exp
-
- def test_categorical_index_repr_datetime(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
- '2011-01-01 11:00:00', '2011-01-01 12:00:00',
- '2011-01-01 13:00:00'],
- categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
- '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
- '2011-01-01 13:00:00-05:00'],
- categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- def test_categorical_index_repr_datetime_ordered(self):
- idx = date_range('2011-01-01 09:00', freq='H', periods=5)
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
- '2011-01-01 11:00:00', '2011-01-01 12:00:00',
- '2011-01-01 13:00:00'],
- categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- idx = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
- '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
- '2011-01-01 13:00:00-05:00'],
- categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
- exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
- '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
- '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
- '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
- '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
- categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- def test_categorical_index_repr_period(self):
- # test all length
- idx = period_range('2011-01-01 09:00', freq='H', periods=1)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- idx = period_range('2011-01-01 09:00', freq='H', periods=2)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- idx = period_range('2011-01-01 09:00', freq='H', periods=3)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
- '2011-01-01 12:00', '2011-01-01 13:00'],
- categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- i = CategoricalIndex(Categorical(idx.append(idx)))
- exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
- '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
- '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
- '2011-01-01 13:00'],
- categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- def test_categorical_index_repr_period_ordered(self):
- idx = period_range('2011-01-01 09:00', freq='H', periods=5)
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
- '2011-01-01 12:00', '2011-01-01 13:00'],
- categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- idx = period_range('2011-01', freq='M', periods=5)
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa
- assert repr(i) == exp
-
- def test_categorical_index_repr_timedelta(self):
- idx = timedelta_range('1 days', periods=5)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa
- assert repr(i) == exp
-
- idx = timedelta_range('1 hours', periods=10)
- i = CategoricalIndex(Categorical(idx))
- exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
- '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
- '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
- '9 days 01:00:00'],
- categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- def test_categorical_index_repr_timedelta_ordered(self):
- idx = timedelta_range('1 days', periods=5)
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa
- assert repr(i) == exp
-
- idx = timedelta_range('1 hours', periods=10)
- i = CategoricalIndex(Categorical(idx, ordered=True))
- exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
- '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
- '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
- '9 days 01:00:00'],
- categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa
-
- assert repr(i) == exp
-
- def test_categorical_frame(self):
- # normal DataFrame
- dt = date_range('2011-01-01 09:00', freq='H', periods=5,
- tz='US/Eastern')
- p = period_range('2011-01', freq='M', periods=5)
- df = DataFrame({'dt': dt, 'p': p})
- exp = """ dt p
-0 2011-01-01 09:00:00-05:00 2011-01
-1 2011-01-01 10:00:00-05:00 2011-02
-2 2011-01-01 11:00:00-05:00 2011-03
-3 2011-01-01 12:00:00-05:00 2011-04
-4 2011-01-01 13:00:00-05:00 2011-05"""
-
- df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
- assert repr(df) == exp
-
- def test_info(self):
-
- # make sure it works
- n = 2500
- df = DataFrame({'int64': np.random.randint(100, size=n)})
- df['category'] = Series(np.array(list('abcdefghij')).take(
- np.random.randint(0, 10, size=n))).astype('category')
- df.isna()
- buf = compat.StringIO()
- df.info(buf=buf)
-
- df2 = df[df['category'] == 'd']
- buf = compat.StringIO()
- df2.info(buf=buf)
-
- def test_groupby_sort(self):
-
- # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
- # This should result in a properly sorted Series so that the plot
- # has a sorted x axis
- # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
-
- res = self.cat.groupby(['value_group'])['value_group'].count()
- exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
- exp.index = CategoricalIndex(exp.index, name=exp.index.name)
- tm.assert_series_equal(res, exp)
-
- def test_min_max(self):
- # unordered cats have no min/max
- cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
- pytest.raises(TypeError, lambda: cat.min())
- pytest.raises(TypeError, lambda: cat.max())
-
- cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
- _min = cat.min()
- _max = cat.max()
- assert _min == "a"
- assert _max == "d"
-
- cat = Series(Categorical(["a", "b", "c", "d"], categories=[
- 'd', 'c', 'b', 'a'], ordered=True))
- _min = cat.min()
- _max = cat.max()
- assert _min == "d"
- assert _max == "a"
-
- cat = Series(Categorical(
- [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
- ], ordered=True))
- _min = cat.min()
- _max = cat.max()
- assert np.isnan(_min)
- assert _max == "b"
-
- cat = Series(Categorical(
- [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
- _min = cat.min()
- _max = cat.max()
- assert np.isnan(_min)
- assert _max == 1
-
- def test_mode(self):
- s = Series(Categorical([1, 1, 2, 4, 5, 5, 5],
- categories=[5, 4, 3, 2, 1], ordered=True))
- res = s.mode()
- exp = Series(Categorical([5], categories=[
- 5, 4, 3, 2, 1], ordered=True))
- tm.assert_series_equal(res, exp)
- s = Series(Categorical([1, 1, 1, 4, 5, 5, 5],
- categories=[5, 4, 3, 2, 1], ordered=True))
- res = s.mode()
- exp = Series(Categorical([5, 1], categories=[
- 5, 4, 3, 2, 1], ordered=True))
- tm.assert_series_equal(res, exp)
- s = Series(Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
- ordered=True))
- res = s.mode()
- exp = Series(Categorical([5, 4, 3, 2, 1], categories=[5, 4, 3, 2, 1],
- ordered=True))
- tm.assert_series_equal(res, exp)
-
- def test_value_counts(self):
- # GH 12835
- cats = Categorical(list('abcccb'), categories=list('cabd'))
- s = Series(cats, name='xxx')
- res = s.value_counts(sort=False)
-
- exp_index = CategoricalIndex(list('cabd'), categories=cats.categories)
- exp = Series([3, 1, 2, 0], name='xxx', index=exp_index)
- tm.assert_series_equal(res, exp)
-
- res = s.value_counts(sort=True)
-
- exp_index = CategoricalIndex(list('cbad'), categories=cats.categories)
- exp = Series([3, 2, 1, 0], name='xxx', index=exp_index)
- tm.assert_series_equal(res, exp)
-
- # check object dtype handles the Series.name as the same
- # (tested in test_base.py)
- s = Series(["a", "b", "c", "c", "c", "b"], name='xxx')
- res = s.value_counts()
- exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"])
- tm.assert_series_equal(res, exp)
-
- def test_value_counts_with_nan(self):
- # see gh-9443
-
- # sanity check
- s = Series(["a", "b", "a"], dtype="category")
- exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
-
- res = s.value_counts(dropna=True)
- tm.assert_series_equal(res, exp)
-
- res = s.value_counts(dropna=True)
- tm.assert_series_equal(res, exp)
-
- # same Series via two different constructions --> same behaviour
- series = [
- Series(["a", "b", None, "a", None, None], dtype="category"),
- Series(Categorical(["a", "b", None, "a", None, None],
- categories=["a", "b"]))
- ]
-
- for s in series:
- # None is a NaN value, so we exclude its count here
- exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
- res = s.value_counts(dropna=True)
- tm.assert_series_equal(res, exp)
-
- # we don't exclude the count of None and sort by counts
- exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
- res = s.value_counts(dropna=False)
- tm.assert_series_equal(res, exp)
-
- # When we aren't sorting by counts, and np.nan isn't a
- # category, it should be last.
- exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
- res = s.value_counts(dropna=False, sort=False)
- tm.assert_series_equal(res, exp)
-
- def test_groupby(self):
-
- cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
- categories=["a", "b", "c", "d"], ordered=True)
- data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
-
- exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
- expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
- result = data.groupby("b").mean()
- tm.assert_frame_equal(result, expected)
-
- raw_cat1 = Categorical(["a", "a", "b", "b"],
- categories=["a", "b", "z"], ordered=True)
- raw_cat2 = Categorical(["c", "d", "c", "d"],
- categories=["c", "d", "y"], ordered=True)
- df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
-
- # single grouper
- gb = df.groupby("A")
- exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
- expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)})
- result = gb.sum()
- tm.assert_frame_equal(result, expected)
-
- # multiple groupers
- gb = df.groupby(['A', 'B'])
- exp_index = pd.MultiIndex.from_product(
- [Categorical(["a", "b", "z"], ordered=True),
- Categorical(["c", "d", "y"], ordered=True)],
- names=['A', 'B'])
- expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan,
- np.nan, np.nan, np.nan]},
- index=exp_index)
- result = gb.sum()
- tm.assert_frame_equal(result, expected)
-
- # multiple groupers with a non-cat
- df = df.copy()
- df['C'] = ['foo', 'bar'] * 2
- gb = df.groupby(['A', 'B', 'C'])
- exp_index = pd.MultiIndex.from_product(
- [Categorical(["a", "b", "z"], ordered=True),
- Categorical(["c", "d", "y"], ordered=True),
- ['foo', 'bar']],
- names=['A', 'B', 'C'])
- expected = DataFrame({'values': Series(
- np.nan, index=exp_index)}).sort_index()
- expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4]
- result = gb.sum()
- tm.assert_frame_equal(result, expected)
-
- # GH 8623
- x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
- [1, 'John P. Doe']],
- columns=['person_id', 'person_name'])
- x['person_name'] = Categorical(x.person_name)
-
- g = x.groupby(['person_id'])
- result = g.transform(lambda x: x)
- tm.assert_frame_equal(result, x[['person_name']])
-
- result = x.drop_duplicates('person_name')
- expected = x.iloc[[0, 1]]
- tm.assert_frame_equal(result, expected)
-
- def f(x):
- return x.drop_duplicates('person_name').iloc[0]
-
- result = g.apply(f)
- expected = x.iloc[[0, 1]].copy()
- expected.index = Index([1, 2], name='person_id')
- expected['person_name'] = expected['person_name'].astype('object')
- tm.assert_frame_equal(result, expected)
-
- # GH 9921
- # Monotonic
- df = DataFrame({"a": [5, 15, 25]})
- c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
-
- result = df.a.groupby(c).transform(sum)
- tm.assert_series_equal(result, df['a'])
-
- tm.assert_series_equal(
- df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
- tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
- tm.assert_frame_equal(
- df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']])
-
- # Filter
- tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a'])
- tm.assert_frame_equal(df.groupby(c).filter(np.all), df)
-
- # Non-monotonic
- df = DataFrame({"a": [5, 15, 25, -5]})
- c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
-
- result = df.a.groupby(c).transform(sum)
- tm.assert_series_equal(result, df['a'])
-
- tm.assert_series_equal(
- df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
- tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
- tm.assert_frame_equal(
- df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
-
- # GH 9603
- df = DataFrame({'a': [1, 0, 0, 0]})
- c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
- result = df.groupby(c).apply(len)
-
- exp_index = CategoricalIndex(
- c.values.categories, ordered=c.values.ordered)
- expected = Series([1, 0, 0, 0], index=exp_index)
- expected.index.name = 'a'
- tm.assert_series_equal(result, expected)
-
- def test_pivot_table(self):
-
- raw_cat1 = Categorical(["a", "a", "b", "b"],
- categories=["a", "b", "z"], ordered=True)
- raw_cat2 = Categorical(["c", "d", "c", "d"],
- categories=["c", "d", "y"], ordered=True)
- df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
- result = pd.pivot_table(df, values='values', index=['A', 'B'])
-
- exp_index = pd.MultiIndex.from_product(
- [Categorical(["a", "b", "z"], ordered=True),
- Categorical(["c", "d", "y"], ordered=True)],
- names=['A', 'B'])
- expected = DataFrame(
- {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]},
- index=exp_index)
- tm.assert_frame_equal(result, expected)
-
- def test_count(self):
-
- s = Series(Categorical([np.nan, 1, 2, np.nan],
- categories=[5, 4, 3, 2, 1], ordered=True))
- result = s.count()
- assert result == 2
-
- def test_sort_values(self):
-
- c = Categorical(["a", "b", "b", "a"], ordered=False)
- cat = Series(c.copy())
-
- # sort in the categories order
- expected = Series(
- Categorical(["a", "a", "b", "b"],
- ordered=False), index=[0, 3, 1, 2])
- result = cat.sort_values()
- tm.assert_series_equal(result, expected)
-
- cat = Series(Categorical(["a", "c", "b", "d"], ordered=True))
- res = cat.sort_values()
- exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
- tm.assert_numpy_array_equal(res.__array__(), exp)
-
- cat = Series(Categorical(["a", "c", "b", "d"], categories=[
- "a", "b", "c", "d"], ordered=True))
- res = cat.sort_values()
- exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
- tm.assert_numpy_array_equal(res.__array__(), exp)
-
- res = cat.sort_values(ascending=False)
- exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
- tm.assert_numpy_array_equal(res.__array__(), exp)
-
- raw_cat1 = Categorical(["a", "b", "c", "d"],
- categories=["a", "b", "c", "d"], ordered=False)
- raw_cat2 = Categorical(["a", "b", "c", "d"],
- categories=["d", "c", "b", "a"], ordered=True)
- s = ["a", "b", "c", "d"]
- df = DataFrame({"unsort": raw_cat1,
- "sort": raw_cat2,
- "string": s,
- "values": [1, 2, 3, 4]})
-
- # Cats must be sorted in a dataframe
- res = df.sort_values(by=["string"], ascending=False)
- exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
- tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp)
- assert res["sort"].dtype == "category"
-
- res = df.sort_values(by=["sort"], ascending=False)
- exp = df.sort_values(by=["string"], ascending=True)
- tm.assert_series_equal(res["values"], exp["values"])
- assert res["sort"].dtype == "category"
- assert res["unsort"].dtype == "category"
-
- # unordered cat, but we allow this
- df.sort_values(by=["unsort"], ascending=False)
-
- # multi-columns sort
- # GH 7848
- df = DataFrame({"id": [6, 5, 4, 3, 2, 1],
- "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
- df["grade"] = Categorical(df["raw_grade"], ordered=True)
- df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a'])
-
- # sorts 'grade' according to the order of the categories
- result = df.sort_values(by=['grade'])
- expected = df.iloc[[1, 2, 5, 0, 3, 4]]
- tm.assert_frame_equal(result, expected)
-
- # multi
- result = df.sort_values(by=['grade', 'id'])
- expected = df.iloc[[2, 1, 5, 4, 3, 0]]
- tm.assert_frame_equal(result, expected)
-
- def test_slicing(self):
- cat = Series(Categorical([1, 2, 3, 4]))
- reversed = cat[::-1]
- exp = np.array([4, 3, 2, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(reversed.__array__(), exp)
-
- df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
- df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
-
- expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
- result = df.iloc[10]
- tm.assert_series_equal(result, expected)
-
- expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
- index=np.arange(10, 20).astype('int64'))
- expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
- result = df.iloc[10:20]
- tm.assert_frame_equal(result, expected)
-
- expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
- result = df.loc[8]
- tm.assert_series_equal(result, expected)
-
- def test_slicing_and_getting_ops(self):
-
- # systematically test the slicing operations:
- # for all slicing ops:
- # - returning a dataframe
- # - returning a column
- # - returning a row
- # - returning a single value
-
- cats = Categorical(
- ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"])
- idx = Index(["h", "i", "j", "k", "l", "m", "n"])
- values = [1, 2, 3, 4, 5, 6, 7]
- df = DataFrame({"cats": cats, "values": values}, index=idx)
-
- # the expected values
- cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
- idx2 = Index(["j", "k"])
- values2 = [3, 4]
-
- # 2:4,: | "j":"k",:
- exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
-
- # :,"cats" | :,0
- exp_col = Series(cats, index=idx, name='cats')
-
- # "j",: | 2,:
- exp_row = Series(["b", 3], index=["cats", "values"], dtype="object",
- name="j")
-
- # "j","cats | 2,0
- exp_val = "b"
-
- # iloc
- # frame
- res_df = df.iloc[2:4, :]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"])
-
- # row
- res_row = df.iloc[2, :]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], compat.string_types)
-
- # col
- res_col = df.iloc[:, 0]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col)
-
- # single value
- res_val = df.iloc[2, 0]
- assert res_val == exp_val
-
- # loc
- # frame
- res_df = df.loc["j":"k", :]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"])
-
- # row
- res_row = df.loc["j", :]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], compat.string_types)
-
- # col
- res_col = df.loc[:, "cats"]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col)
-
- # single value
- res_val = df.loc["j", "cats"]
- assert res_val == exp_val
-
- # ix
- # frame
- # res_df = df.loc["j":"k",[0,1]] # doesn't work?
- res_df = df.loc["j":"k", :]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"])
-
- # row
- res_row = df.loc["j", :]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], compat.string_types)
-
- # col
- res_col = df.loc[:, "cats"]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col)
-
- # single value
- res_val = df.loc["j", df.columns[0]]
- assert res_val == exp_val
-
- # iat
- res_val = df.iat[2, 0]
- assert res_val == exp_val
-
- # at
- res_val = df.at["j", "cats"]
- assert res_val == exp_val
-
- # fancy indexing
- exp_fancy = df.iloc[[2]]
-
- res_fancy = df[df["cats"] == "b"]
- tm.assert_frame_equal(res_fancy, exp_fancy)
- res_fancy = df[df["values"] == 3]
- tm.assert_frame_equal(res_fancy, exp_fancy)
-
- # get_value
- res_val = df.at["j", "cats"]
- assert res_val == exp_val
-
- # i : int, slice, or sequence of integers
- res_row = df.iloc[2]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], compat.string_types)
-
- res_df = df.iloc[slice(2, 4)]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"])
-
- res_df = df.iloc[[2, 3]]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"])
-
- res_col = df.iloc[:, 0]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col)
-
- res_df = df.iloc[:, slice(0, 2)]
- tm.assert_frame_equal(res_df, df)
- assert is_categorical_dtype(res_df["cats"])
-
- res_df = df.iloc[:, [0, 1]]
- tm.assert_frame_equal(res_df, df)
- assert is_categorical_dtype(res_df["cats"])
-
- def test_slicing_doc_examples(self):
-
- # GH 7918
- cats = Categorical(["a", "b", "b", "b", "c", "c", "c"],
- categories=["a", "b", "c"])
- idx = Index(["h", "i", "j", "k", "l", "m", "n", ])
- values = [1, 2, 2, 2, 3, 4, 5]
- df = DataFrame({"cats": cats, "values": values}, index=idx)
-
- result = df.iloc[2:4, :]
- expected = DataFrame(
- {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']),
- "values": [2, 2]}, index=['j', 'k'])
- tm.assert_frame_equal(result, expected)
-
- result = df.iloc[2:4, :].dtypes
- expected = Series(['category', 'int64'], ['cats', 'values'])
- tm.assert_series_equal(result, expected)
-
- result = df.loc["h":"j", "cats"]
- expected = Series(Categorical(['a', 'b', 'b'],
- categories=['a', 'b', 'c']),
- index=['h', 'i', 'j'], name='cats')
- tm.assert_series_equal(result, expected)
-
- result = df.loc["h":"j", df.columns[0:1]]
- expected = DataFrame({'cats': Categorical(['a', 'b', 'b'],
- categories=['a', 'b', 'c'])},
- index=['h', 'i', 'j'])
- tm.assert_frame_equal(result, expected)
-
- def test_assigning_ops(self):
- # systematically test the assigning operations:
- # for all slicing ops:
- # for value in categories and value not in categories:
-
- # - assign a single value -> exp_single_cats_value
-
- # - assign a complete row (mixed values) -> exp_single_row
-
- # assign multiple rows (mixed values) (-> array) -> exp_multi_row
-
- # assign a part of a column with dtype == categorical ->
- # exp_parts_cats_col
-
- # assign a part of a column with dtype != categorical ->
- # exp_parts_cats_col
-
- cats = Categorical(["a", "a", "a", "a", "a", "a", "a"],
- categories=["a", "b"])
- idx = Index(["h", "i", "j", "k", "l", "m", "n"])
- values = [1, 1, 1, 1, 1, 1, 1]
- orig = DataFrame({"cats": cats, "values": values}, index=idx)
-
- # the expected values
- # changed single row
- cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"],
- categories=["a", "b"])
- idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
- values1 = [1, 1, 2, 1, 1, 1, 1]
- exp_single_row = DataFrame({"cats": cats1,
- "values": values1}, index=idx1)
-
- # changed multiple rows
- cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"],
- categories=["a", "b"])
- idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
- values2 = [1, 1, 2, 2, 1, 1, 1]
- exp_multi_row = DataFrame({"cats": cats2,
- "values": values2}, index=idx2)
-
- # changed part of the cats column
- cats3 = Categorical(
- ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
- idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
- values3 = [1, 1, 1, 1, 1, 1, 1]
- exp_parts_cats_col = DataFrame({"cats": cats3,
- "values": values3}, index=idx3)
-
- # changed single value in cats col
- cats4 = Categorical(
- ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
- idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
- values4 = [1, 1, 1, 1, 1, 1, 1]
- exp_single_cats_value = DataFrame({"cats": cats4,
- "values": values4}, index=idx4)
-
- # iloc
- # ###############
- # - assign a single value -> exp_single_cats_value
- df = orig.copy()
- df.iloc[2, 0] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- df = orig.copy()
- df.iloc[df.index == "j", 0] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- # - assign a single value not in the current categories set
- def f():
- df = orig.copy()
- df.iloc[2, 0] = "c"
-
- pytest.raises(ValueError, f)
-
- # - assign a complete row (mixed values) -> exp_single_row
- df = orig.copy()
- df.iloc[2, :] = ["b", 2]
- tm.assert_frame_equal(df, exp_single_row)
-
- # - assign a complete row (mixed values) not in categories set
- def f():
- df = orig.copy()
- df.iloc[2, :] = ["c", 2]
-
- pytest.raises(ValueError, f)
-
- # - assign multiple rows (mixed values) -> exp_multi_row
- df = orig.copy()
- df.iloc[2:4, :] = [["b", 2], ["b", 2]]
- tm.assert_frame_equal(df, exp_multi_row)
-
- def f():
- df = orig.copy()
- df.iloc[2:4, :] = [["c", 2], ["c", 2]]
-
- pytest.raises(ValueError, f)
-
- # assign a part of a column with dtype == categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- # different categories -> not sure if this should fail or pass
- df = orig.copy()
- df.iloc[2:4, 0] = Categorical(list('bb'), categories=list('abc'))
-
- with pytest.raises(ValueError):
- # different values
- df = orig.copy()
- df.iloc[2:4, 0] = Categorical(list('cc'), categories=list('abc'))
-
- # assign a part of a column with dtype != categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.iloc[2:4, 0] = ["b", "b"]
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- df.iloc[2:4, 0] = ["c", "c"]
-
- # loc
- # ##############
- # - assign a single value -> exp_single_cats_value
- df = orig.copy()
- df.loc["j", "cats"] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- df = orig.copy()
- df.loc[df.index == "j", "cats"] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- # - assign a single value not in the current categories set
- def f():
- df = orig.copy()
- df.loc["j", "cats"] = "c"
-
- pytest.raises(ValueError, f)
-
- # - assign a complete row (mixed values) -> exp_single_row
- df = orig.copy()
- df.loc["j", :] = ["b", 2]
- tm.assert_frame_equal(df, exp_single_row)
-
- # - assign a complete row (mixed values) not in categories set
- def f():
- df = orig.copy()
- df.loc["j", :] = ["c", 2]
-
- pytest.raises(ValueError, f)
-
- # - assign multiple rows (mixed values) -> exp_multi_row
- df = orig.copy()
- df.loc["j":"k", :] = [["b", 2], ["b", 2]]
- tm.assert_frame_equal(df, exp_multi_row)
-
- def f():
- df = orig.copy()
- df.loc["j":"k", :] = [["c", 2], ["c", 2]]
-
- pytest.raises(ValueError, f)
-
- # assign a part of a column with dtype == categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.loc["j":"k", "cats"] = Categorical(
- ["b", "b"], categories=["a", "b"])
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- # different categories -> not sure if this should fail or pass
- df = orig.copy()
- df.loc["j":"k", "cats"] = Categorical(
- ["b", "b"], categories=["a", "b", "c"])
-
- with pytest.raises(ValueError):
- # different values
- df = orig.copy()
- df.loc["j":"k", "cats"] = Categorical(
- ["c", "c"], categories=["a", "b", "c"])
-
- # assign a part of a column with dtype != categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.loc["j":"k", "cats"] = ["b", "b"]
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- df.loc["j":"k", "cats"] = ["c", "c"]
-
- # loc
- # ##############
- # - assign a single value -> exp_single_cats_value
- df = orig.copy()
- df.loc["j", df.columns[0]] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- df = orig.copy()
- df.loc[df.index == "j", df.columns[0]] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- # - assign a single value not in the current categories set
- def f():
- df = orig.copy()
- df.loc["j", df.columns[0]] = "c"
-
- pytest.raises(ValueError, f)
-
- # - assign a complete row (mixed values) -> exp_single_row
- df = orig.copy()
- df.loc["j", :] = ["b", 2]
- tm.assert_frame_equal(df, exp_single_row)
-
- # - assign a complete row (mixed values) not in categories set
- def f():
- df = orig.copy()
- df.loc["j", :] = ["c", 2]
-
- pytest.raises(ValueError, f)
-
- # - assign multiple rows (mixed values) -> exp_multi_row
- df = orig.copy()
- df.loc["j":"k", :] = [["b", 2], ["b", 2]]
- tm.assert_frame_equal(df, exp_multi_row)
-
- def f():
- df = orig.copy()
- df.loc["j":"k", :] = [["c", 2], ["c", 2]]
-
- pytest.raises(ValueError, f)
-
- # assign a part of a column with dtype == categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.loc["j":"k", df.columns[0]] = Categorical(
- ["b", "b"], categories=["a", "b"])
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- # different categories -> not sure if this should fail or pass
- df = orig.copy()
- df.loc["j":"k", df.columns[0]] = Categorical(
- ["b", "b"], categories=["a", "b", "c"])
-
- with pytest.raises(ValueError):
- # different values
- df = orig.copy()
- df.loc["j":"k", df.columns[0]] = Categorical(
- ["c", "c"], categories=["a", "b", "c"])
-
- # assign a part of a column with dtype != categorical ->
- # exp_parts_cats_col
- df = orig.copy()
- df.loc["j":"k", df.columns[0]] = ["b", "b"]
- tm.assert_frame_equal(df, exp_parts_cats_col)
-
- with pytest.raises(ValueError):
- df.loc["j":"k", df.columns[0]] = ["c", "c"]
-
- # iat
- df = orig.copy()
- df.iat[2, 0] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- # - assign a single value not in the current categories set
- def f():
- df = orig.copy()
- df.iat[2, 0] = "c"
-
- pytest.raises(ValueError, f)
-
- # at
- # - assign a single value -> exp_single_cats_value
- df = orig.copy()
- df.at["j", "cats"] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- # - assign a single value not in the current categories set
- def f():
- df = orig.copy()
- df.at["j", "cats"] = "c"
-
- pytest.raises(ValueError, f)
-
- # fancy indexing
- catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"],
- categories=["a", "b", "c"])
- idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
- valuesf = [1, 1, 3, 3, 1, 1, 1]
- df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
-
- exp_fancy = exp_multi_row.copy()
- exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
-
- df[df["cats"] == "c"] = ["b", 2]
- # category c is kept in .categories
- tm.assert_frame_equal(df, exp_fancy)
-
- # set_value
- df = orig.copy()
- df.at["j", "cats"] = "b"
- tm.assert_frame_equal(df, exp_single_cats_value)
-
- def f():
- df = orig.copy()
- df.at["j", "cats"] = "c"
-
- pytest.raises(ValueError, f)
-
- # Assigning a Category to parts of a int/... column uses the values of
- # the Catgorical
- df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
- exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
- df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
- df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
- tm.assert_frame_equal(df, exp)
-
- # Series
- orig = Series(Categorical(["b", "b"], categories=["a", "b"]))
- s = orig.copy()
- s[:] = "a"
- exp = Series(Categorical(["a", "a"], categories=["a", "b"]))
- tm.assert_series_equal(s, exp)
-
- s = orig.copy()
- s[1] = "a"
- exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
- tm.assert_series_equal(s, exp)
-
- s = orig.copy()
- s[s.index > 0] = "a"
- exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
- tm.assert_series_equal(s, exp)
-
- s = orig.copy()
- s[[False, True]] = "a"
- exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
- tm.assert_series_equal(s, exp)
-
- s = orig.copy()
- s.index = ["x", "y"]
- s["y"] = "a"
- exp = Series(Categorical(["b", "a"], categories=["a", "b"]),
- index=["x", "y"])
- tm.assert_series_equal(s, exp)
-
- # ensure that one can set something to np.nan
- s = Series(Categorical([1, 2, 3]))
- exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
- s[1] = np.nan
- tm.assert_series_equal(s, exp)
-
- def test_comparisons(self):
- tests_data = [(list("abc"), list("cba"), list("bbb")),
- ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
- for data, reverse, base in tests_data:
- cat_rev = Series(
- Categorical(data, categories=reverse, ordered=True))
- cat_rev_base = Series(
- Categorical(base, categories=reverse, ordered=True))
- cat = Series(Categorical(data, ordered=True))
- cat_base = Series(
- Categorical(base, categories=cat.cat.categories, ordered=True))
- s = Series(base)
- a = np.array(base)
-
- # comparisons need to take categories ordering into account
- res_rev = cat_rev > cat_rev_base
- exp_rev = Series([True, False, False])
- tm.assert_series_equal(res_rev, exp_rev)
-
- res_rev = cat_rev < cat_rev_base
- exp_rev = Series([False, False, True])
- tm.assert_series_equal(res_rev, exp_rev)
-
- res = cat > cat_base
- exp = Series([False, False, True])
- tm.assert_series_equal(res, exp)
-
- scalar = base[1]
- res = cat > scalar
- exp = Series([False, False, True])
- exp2 = cat.values > scalar
- tm.assert_series_equal(res, exp)
- tm.assert_numpy_array_equal(res.values, exp2)
- res_rev = cat_rev > scalar
- exp_rev = Series([True, False, False])
- exp_rev2 = cat_rev.values > scalar
- tm.assert_series_equal(res_rev, exp_rev)
- tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
-
- # Only categories with same categories can be compared
- def f():
- cat > cat_rev
-
- pytest.raises(TypeError, f)
-
- # categorical cannot be compared to Series or numpy array, and also
- # not the other way around
- pytest.raises(TypeError, lambda: cat > s)
- pytest.raises(TypeError, lambda: cat_rev > s)
- pytest.raises(TypeError, lambda: cat > a)
- pytest.raises(TypeError, lambda: cat_rev > a)
-
- pytest.raises(TypeError, lambda: s < cat)
- pytest.raises(TypeError, lambda: s < cat_rev)
-
- pytest.raises(TypeError, lambda: a < cat)
- pytest.raises(TypeError, lambda: a < cat_rev)
-
- # unequal comparison should raise for unordered cats
- cat = Series(Categorical(list("abc")))
-
- def f():
- cat > "b"
-
- pytest.raises(TypeError, f)
- cat = Series(Categorical(list("abc"), ordered=False))
-
- def f():
- cat > "b"
-
- pytest.raises(TypeError, f)
-
- # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
- # and following comparisons with scalars not in categories should raise
- # for unequal comps, but not for equal/not equal
- cat = Series(Categorical(list("abc"), ordered=True))
-
- pytest.raises(TypeError, lambda: cat < "d")
- pytest.raises(TypeError, lambda: cat > "d")
- pytest.raises(TypeError, lambda: "d" < cat)
- pytest.raises(TypeError, lambda: "d" > cat)
-
- tm.assert_series_equal(cat == "d", Series([False, False, False]))
- tm.assert_series_equal(cat != "d", Series([True, True, True]))
-
- # And test NaN handling...
- cat = Series(Categorical(["a", "b", "c", np.nan]))
- exp = Series([True, True, True, False])
- res = (cat == cat)
- tm.assert_series_equal(res, exp)
-
- def test_cat_equality(self):
-
- # GH 8938
- # allow equality comparisons
- a = Series(list('abc'), dtype="category")
- b = Series(list('abc'), dtype="object")
- c = Series(['a', 'b', 'cc'], dtype="object")
- d = Series(list('acb'), dtype="object")
- e = Categorical(list('abc'))
- f = Categorical(list('acb'))
-
- # vs scalar
- assert not (a == 'a').all()
- assert ((a != 'a') == ~(a == 'a')).all()
-
- assert not ('a' == a).all()
- assert (a == 'a')[0]
- assert ('a' == a)[0]
- assert not ('a' != a)[0]
-
- # vs list-like
- assert (a == a).all()
- assert not (a != a).all()
-
- assert (a == list(a)).all()
- assert (a == b).all()
- assert (b == a).all()
- assert ((~(a == b)) == (a != b)).all()
- assert ((~(b == a)) == (b != a)).all()
-
- assert not (a == c).all()
- assert not (c == a).all()
- assert not (a == d).all()
- assert not (d == a).all()
-
- # vs a cat-like
- assert (a == e).all()
- assert (e == a).all()
- assert not (a == f).all()
- assert not (f == a).all()
-
- assert ((~(a == e) == (a != e)).all())
- assert ((~(e == a) == (e != a)).all())
- assert ((~(a == f) == (a != f)).all())
- assert ((~(f == a) == (f != a)).all())
-
- # non-equality is not comparable
- pytest.raises(TypeError, lambda: a < b)
- pytest.raises(TypeError, lambda: b < a)
- pytest.raises(TypeError, lambda: a > b)
- pytest.raises(TypeError, lambda: b > a)
-
- @pytest.mark.parametrize('ctor', [
- lambda *args, **kwargs: Categorical(*args, **kwargs),
- lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
- ])
- def test_unordered_different_order_equal(self, ctor):
- # https://github.com/pandas-dev/pandas/issues/16014
- c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
- c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
- assert (c1 == c2).all()
-
- c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
- c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
- assert (c1 != c2).all()
-
- c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
- c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
- assert (c1 != c2).all()
-
- c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
- c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
- result = c1 == c2
- tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
-
- def test_unordered_different_categories_raises(self):
- c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
- c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
- with tm.assert_raises_regex(TypeError,
- "Categoricals can only be compared"):
- c1 == c2
-
- def test_compare_different_lengths(self):
- c1 = Categorical([], categories=['a', 'b'])
- c2 = Categorical([], categories=['a'])
- msg = "Categories are different lengths"
- with tm.assert_raises_regex(TypeError, msg):
- c1 == c2
-
- def test_concat_append(self):
- cat = Categorical(["a", "b"], categories=["a", "b"])
- vals = [1, 2]
- df = DataFrame({"cats": cat, "vals": vals})
- cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
- vals2 = [1, 2, 1, 2]
- exp = DataFrame({"cats": cat2, "vals": vals2},
- index=Index([0, 1, 0, 1]))
-
- tm.assert_frame_equal(pd.concat([df, df]), exp)
- tm.assert_frame_equal(df.append(df), exp)
-
- # GH 13524 can concat different categories
- cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
- vals3 = [1, 2]
- df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
-
- res = pd.concat([df, df_different_categories], ignore_index=True)
- exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]})
- tm.assert_frame_equal(res, exp)
-
- res = df.append(df_different_categories, ignore_index=True)
- tm.assert_frame_equal(res, exp)
-
- def test_concat_append_gh7864(self):
- # GH 7864
- # make sure ordering is preserverd
- df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')})
- df["grade"] = Categorical(df["raw_grade"])
- df['grade'].cat.set_categories(['e', 'a', 'b'])
-
- df1 = df[0:3]
- df2 = df[3:]
-
- tm.assert_index_equal(df['grade'].cat.categories,
- df1['grade'].cat.categories)
- tm.assert_index_equal(df['grade'].cat.categories,
- df2['grade'].cat.categories)
-
- dfx = pd.concat([df1, df2])
- tm.assert_index_equal(df['grade'].cat.categories,
- dfx['grade'].cat.categories)
-
- dfa = df1.append(df2)
- tm.assert_index_equal(df['grade'].cat.categories,
- dfa['grade'].cat.categories)
-
- def test_concat_preserve(self):
-
- # GH 8641 series concat not preserving category dtype
- # GH 13524 can concat different categories
- s = Series(list('abc'), dtype='category')
- s2 = Series(list('abd'), dtype='category')
-
- exp = Series(list('abcabd'))
- res = pd.concat([s, s2], ignore_index=True)
- tm.assert_series_equal(res, exp)
-
- exp = Series(list('abcabc'), dtype='category')
- res = pd.concat([s, s], ignore_index=True)
- tm.assert_series_equal(res, exp)
-
- exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2],
- dtype='category')
- res = pd.concat([s, s])
- tm.assert_series_equal(res, exp)
-
- a = Series(np.arange(6, dtype='int64'))
- b = Series(list('aabbca'))
-
- df2 = DataFrame({'A': a,
- 'B': b.astype(CategoricalDtype(list('cab')))})
- res = pd.concat([df2, df2])
- exp = DataFrame(
- {'A': pd.concat([a, a]),
- 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))})
- tm.assert_frame_equal(res, exp)
-
- def test_categorical_index_preserver(self):
-
- a = Series(np.arange(6, dtype='int64'))
- b = Series(list('aabbca'))
-
- df2 = DataFrame({'A': a,
- 'B': b.astype(CategoricalDtype(list('cab')))
- }).set_index('B')
- result = pd.concat([df2, df2])
- expected = DataFrame(
- {'A': pd.concat([a, a]),
- 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))
- }).set_index('B')
- tm.assert_frame_equal(result, expected)
-
- # wrong catgories
- df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe'))
- }).set_index('B')
- pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
-
- def test_merge(self):
- # GH 9426
-
- right = DataFrame({'c': {0: 'a',
- 1: 'b',
- 2: 'c',
- 3: 'd',
- 4: 'e'},
- 'd': {0: 'null',
- 1: 'null',
- 2: 'null',
- 3: 'null',
- 4: 'null'}})
- left = DataFrame({'a': {0: 'f',
- 1: 'f',
- 2: 'f',
- 3: 'f',
- 4: 'f'},
- 'b': {0: 'g',
- 1: 'g',
- 2: 'g',
- 3: 'g',
- 4: 'g'}})
- df = pd.merge(left, right, how='left', left_on='b', right_on='c')
-
- # object-object
- expected = df.copy()
-
- # object-cat
- # note that we propagate the category
- # because we don't have any matching rows
- cright = right.copy()
- cright['d'] = cright['d'].astype('category')
- result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
- expected['d'] = expected['d'].astype(CategoricalDtype(['null']))
- tm.assert_frame_equal(result, expected)
-
- # cat-object
- cleft = left.copy()
- cleft['b'] = cleft['b'].astype('category')
- result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
- tm.assert_frame_equal(result, expected)
-
- # cat-cat
- cright = right.copy()
- cright['d'] = cright['d'].astype('category')
- cleft = left.copy()
- cleft['b'] = cleft['b'].astype('category')
- result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
- tm.assert_frame_equal(result, expected)
-
- def test_repeat(self):
- # GH10183
- cat = Categorical(["a", "b"], categories=["a", "b"])
- exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
- res = cat.repeat(2)
- tm.assert_categorical_equal(res, exp)
-
- def test_numpy_repeat(self):
- cat = Categorical(["a", "b"], categories=["a", "b"])
- exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
- tm.assert_categorical_equal(np.repeat(cat, 2), exp)
-
- msg = "the 'axis' parameter is not supported"
- tm.assert_raises_regex(ValueError, msg, np.repeat, cat, 2, axis=1)
-
- def test_reshape(self):
- cat = Categorical([], categories=["a", "b"])
- tm.assert_produces_warning(FutureWarning, cat.reshape, 0)
-
- with tm.assert_produces_warning(FutureWarning):
- cat = Categorical([], categories=["a", "b"])
- tm.assert_categorical_equal(cat.reshape(0), cat)
-
- with tm.assert_produces_warning(FutureWarning):
- cat = Categorical([], categories=["a", "b"])
- tm.assert_categorical_equal(cat.reshape((5, -1)), cat)
-
- with tm.assert_produces_warning(FutureWarning):
- cat = Categorical(["a", "b"], categories=["a", "b"])
- tm.assert_categorical_equal(cat.reshape(cat.shape), cat)
-
- with tm.assert_produces_warning(FutureWarning):
- cat = Categorical(["a", "b"], categories=["a", "b"])
- tm.assert_categorical_equal(cat.reshape(cat.size), cat)
-
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
- msg = "can only specify one unknown dimension"
- cat = Categorical(["a", "b"], categories=["a", "b"])
- tm.assert_raises_regex(ValueError, msg, cat.reshape, (-2, -1))
-
- def test_numpy_reshape(self):
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
- cat = Categorical(["a", "b"], categories=["a", "b"])
- tm.assert_categorical_equal(np.reshape(cat, cat.shape), cat)
-
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
- msg = "the 'order' parameter is not supported"
- tm.assert_raises_regex(ValueError, msg, np.reshape,
- cat, cat.shape, order='F')
-
- def test_astype_to_other(self):
-
- s = self.cat['value_group']
- expected = s
- tm.assert_series_equal(s.astype('category'), expected)
- tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
- pytest.raises(ValueError, lambda: s.astype('float64'))
-
- cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
- exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
- tm.assert_series_equal(cat.astype('str'), exp)
- s2 = Series(Categorical(['1', '2', '3', '4']))
- exp2 = Series([1, 2, 3, 4]).astype(int)
- tm.assert_series_equal(s2.astype('int'), exp2)
-
- # object don't sort correctly, so just compare that we have the same
- # values
- def cmp(a, b):
- tm.assert_almost_equal(
- np.sort(np.unique(a)), np.sort(np.unique(b)))
-
- expected = Series(np.array(s.values), name='value_group')
- cmp(s.astype('object'), expected)
- cmp(s.astype(np.object_), expected)
-
- # array conversion
- tm.assert_almost_equal(np.array(s), np.array(s.values))
-
- # valid conversion
- for valid in [lambda x: x.astype('category'),
- lambda x: x.astype(CategoricalDtype()),
- lambda x: x.astype('object').astype('category'),
- lambda x: x.astype('object').astype(
- CategoricalDtype())
- ]:
-
- result = valid(s)
- # compare series values
- # internal .categories can't be compared because it is sorted
- tm.assert_series_equal(result, s, check_categorical=False)
-
- # invalid conversion (these are NOT a dtype)
- for invalid in [lambda x: x.astype(Categorical),
- lambda x: x.astype('object').astype(Categorical)]:
- pytest.raises(TypeError, lambda: invalid(s))
-
- def test_astype_categorical(self):
-
- cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
- tm.assert_categorical_equal(cat, cat.astype('category'))
- tm.assert_almost_equal(np.array(cat), cat.astype('object'))
-
- pytest.raises(ValueError, lambda: cat.astype(float))
-
- def test_to_records(self):
-
- # GH8626
-
- # dict creation
- df = DataFrame({'A': list('abc')}, dtype='category')
- expected = Series(list('abc'), dtype='category', name='A')
- tm.assert_series_equal(df['A'], expected)
-
- # list-like creation
- df = DataFrame(list('abc'), dtype='category')
- expected = Series(list('abc'), dtype='category', name=0)
- tm.assert_series_equal(df[0], expected)
-
- # to record array
- # this coerces
- result = df.to_records()
- expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
- dtype=[('index', '=i8'), ('0', 'O')])
- tm.assert_almost_equal(result, expected)
-
- def test_numeric_like_ops(self):
-
- # numeric ops should not succeed
- for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
- pytest.raises(TypeError,
- lambda: getattr(self.cat, op)(self.cat))
-
- # reduction ops should not succeed (unless specifically defined, e.g.
- # min/max)
- s = self.cat['value_group']
- for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
- pytest.raises(TypeError,
- lambda: getattr(s, op)(numeric_only=False))
-
- # mad technically works because it takes always the numeric data
-
- # numpy ops
- s = Series(Categorical([1, 2, 3, 4]))
- pytest.raises(TypeError, lambda: np.sum(s))
-
- # numeric ops on a Series
- for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
- pytest.raises(TypeError, lambda: getattr(s, op)(2))
-
- # invalid ufunc
- pytest.raises(TypeError, lambda: np.log(s))
-
- def test_cat_tab_completition(self):
- # test the tab completion display
- ok_for_cat = ['categories', 'codes', 'ordered', 'set_categories',
- 'add_categories', 'remove_categories',
- 'rename_categories', 'reorder_categories',
- 'remove_unused_categories', 'as_ordered', 'as_unordered']
-
- def get_dir(s):
- results = [r for r in s.cat.__dir__() if not r.startswith('_')]
- return list(sorted(set(results)))
-
- s = Series(list('aabbcde')).astype('category')
- results = get_dir(s)
- tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))
-
- def test_cat_accessor_api(self):
- # GH 9322
- from pandas.core.categorical import CategoricalAccessor
- assert Series.cat is CategoricalAccessor
- s = Series(list('aabbcde')).astype('category')
- assert isinstance(s.cat, CategoricalAccessor)
-
- invalid = Series([1])
- with tm.assert_raises_regex(AttributeError,
- "only use .cat accessor"):
- invalid.cat
- assert not hasattr(invalid, 'cat')
-
- def test_cat_accessor_no_new_attributes(self):
- # https://github.com/pandas-dev/pandas/issues/10673
- c = Series(list('aabbcde')).astype('category')
- with tm.assert_raises_regex(AttributeError,
- "You cannot add any new attribute"):
- c.cat.xlabel = "a"
-
- def test_str_accessor_api_for_categorical(self):
- # https://github.com/pandas-dev/pandas/issues/10661
- from pandas.core.strings import StringMethods
- s = Series(list('aabb'))
- s = s + " " + s
- c = s.astype('category')
- assert isinstance(c.str, StringMethods)
-
- # str functions, which need special arguments
- special_func_defs = [
- ('cat', (list("zyxw"),), {"sep": ","}),
- ('center', (10,), {}),
- ('contains', ("a",), {}),
- ('count', ("a",), {}),
- ('decode', ("UTF-8",), {}),
- ('encode', ("UTF-8",), {}),
- ('endswith', ("a",), {}),
- ('extract', ("([a-z]*) ",), {"expand": False}),
- ('extract', ("([a-z]*) ",), {"expand": True}),
- ('extractall', ("([a-z]*) ",), {}),
- ('find', ("a",), {}),
- ('findall', ("a",), {}),
- ('index', (" ",), {}),
- ('ljust', (10,), {}),
- ('match', ("a"), {}), # deprecated...
- ('normalize', ("NFC",), {}),
- ('pad', (10,), {}),
- ('partition', (" ",), {"expand": False}), # not default
- ('partition', (" ",), {"expand": True}), # default
- ('repeat', (3,), {}),
- ('replace', ("a", "z"), {}),
- ('rfind', ("a",), {}),
- ('rindex', (" ",), {}),
- ('rjust', (10,), {}),
- ('rpartition', (" ",), {"expand": False}), # not default
- ('rpartition', (" ",), {"expand": True}), # default
- ('slice', (0, 1), {}),
- ('slice_replace', (0, 1, "z"), {}),
- ('split', (" ",), {"expand": False}), # default
- ('split', (" ",), {"expand": True}), # not default
- ('startswith', ("a",), {}),
- ('wrap', (2,), {}),
- ('zfill', (10,), {})
- ]
- _special_func_names = [f[0] for f in special_func_defs]
-
- # * get, join: they need a individual elements of type lists, but
- # we can't make a categorical with lists as individual categories.
- # -> `s.str.split(" ").astype("category")` will error!
- # * `translate` has different interfaces for py2 vs. py3
- _ignore_names = ["get", "join", "translate"]
-
- str_func_names = [f for f in dir(s.str) if not (
- f.startswith("_") or
- f in _special_func_names or
- f in _ignore_names)]
-
- func_defs = [(f, (), {}) for f in str_func_names]
- func_defs.extend(special_func_defs)
-
- for func, args, kwargs in func_defs:
- res = getattr(c.str, func)(*args, **kwargs)
- exp = getattr(s.str, func)(*args, **kwargs)
-
- if isinstance(res, DataFrame):
- tm.assert_frame_equal(res, exp)
- else:
- tm.assert_series_equal(res, exp)
-
- invalid = Series([1, 2, 3]).astype('category')
- with tm.assert_raises_regex(AttributeError,
- "Can only use .str "
- "accessor with string"):
- invalid.str
- assert not hasattr(invalid, 'str')
-
- def test_dt_accessor_api_for_categorical(self):
- # https://github.com/pandas-dev/pandas/issues/10661
- from pandas.core.indexes.accessors import Properties
-
- s_dr = Series(date_range('1/1/2015', periods=5, tz="MET"))
- c_dr = s_dr.astype("category")
-
- s_pr = Series(period_range('1/1/2015', freq='D', periods=5))
- c_pr = s_pr.astype("category")
-
- s_tdr = Series(timedelta_range('1 days', '10 days'))
- c_tdr = s_tdr.astype("category")
-
- # only testing field (like .day)
- # and bool (is_month_start)
- get_ops = lambda x: x._datetimelike_ops
-
- test_data = [
- ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr),
- ("Period", get_ops(PeriodIndex), s_pr, c_pr),
- ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr)]
-
- assert isinstance(c_dr.dt, Properties)
-
- special_func_defs = [
- ('strftime', ("%Y-%m-%d",), {}),
- ('tz_convert', ("EST",), {}),
- ('round', ("D",), {}),
- ('floor', ("D",), {}),
- ('ceil', ("D",), {}),
- ('asfreq', ("D",), {}),
- # ('tz_localize', ("UTC",), {}),
- ]
- _special_func_names = [f[0] for f in special_func_defs]
-
- # the series is already localized
- _ignore_names = ['tz_localize', 'components']
-
- for name, attr_names, s, c in test_data:
- func_names = [f
- for f in dir(s.dt)
- if not (f.startswith("_") or f in attr_names or f in
- _special_func_names or f in _ignore_names)]
-
- func_defs = [(f, (), {}) for f in func_names]
- for f_def in special_func_defs:
- if f_def[0] in dir(s.dt):
- func_defs.append(f_def)
-
- for func, args, kwargs in func_defs:
- res = getattr(c.dt, func)(*args, **kwargs)
- exp = getattr(s.dt, func)(*args, **kwargs)
-
- if isinstance(res, DataFrame):
- tm.assert_frame_equal(res, exp)
- elif isinstance(res, Series):
- tm.assert_series_equal(res, exp)
- else:
- tm.assert_almost_equal(res, exp)
-
- for attr in attr_names:
- try:
- res = getattr(c.dt, attr)
- exp = getattr(s.dt, attr)
- except Exception as e:
- print(name, attr)
- raise e
-
- if isinstance(res, DataFrame):
- tm.assert_frame_equal(res, exp)
- elif isinstance(res, Series):
- tm.assert_series_equal(res, exp)
- else:
- tm.assert_almost_equal(res, exp)
-
- invalid = Series([1, 2, 3]).astype('category')
- with tm.assert_raises_regex(
- AttributeError, "Can only use .dt accessor with datetimelike"):
- invalid.dt
- assert not hasattr(invalid, 'str')
-
- def test_concat_categorical(self):
- # See GH 10177
- df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
- columns=["a", "b", "c"])
-
- df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
- columns=["a", "c"])
-
- cat_values = ["one", "one", "two", "one", "two", "two", "one"]
- df2['h'] = Series(Categorical(cat_values))
-
- res = pd.concat((df1, df2), axis=0, ignore_index=True)
- exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
- 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan,
- np.nan, np.nan, np.nan, np.nan],
- 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
- 'h': [None] * 6 + cat_values})
- tm.assert_frame_equal(res, exp)
-
-
-class TestCategoricalSubclassing(object):
-
- def test_constructor(self):
- sc = tm.SubclassedCategorical(['a', 'b', 'c'])
- assert isinstance(sc, tm.SubclassedCategorical)
- tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))
-
- def test_from_codes(self):
- sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
- assert isinstance(sc, tm.SubclassedCategorical)
- exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
- tm.assert_categorical_equal(sc, exp)
-
- def test_map(self):
- sc = tm.SubclassedCategorical(['a', 'b', 'c'])
- res = sc.map(lambda x: x.upper())
- assert isinstance(res, tm.SubclassedCategorical)
- exp = Categorical(['A', 'B', 'C'])
- tm.assert_categorical_equal(res, exp)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index db94cd08b0050..661c7dfcc50fc 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -3019,7 +3019,7 @@ def test_rolling_max_gh6297(self):
x = series.resample('D').max().rolling(window=1).max()
tm.assert_series_equal(expected, x)
- def test_rolling_max_how_resample(self):
+ def test_rolling_max_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
@@ -3040,17 +3040,17 @@ def test_rolling_max_how_resample(self):
# Now specify median (10.0)
expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
- x = series.resample('D').median().rolling(window=1).max(how='median')
+ x = series.resample('D').median().rolling(window=1).max()
tm.assert_series_equal(expected, x)
# Now specify mean (4+10+20)/3
v = (4.0 + 10.0 + 20.0) / 3.0
expected = Series([0.0, 1.0, 2.0, 3.0, v],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
- x = series.resample('D').mean().rolling(window=1).max(how='mean')
+ x = series.resample('D').mean().rolling(window=1).max()
tm.assert_series_equal(expected, x)
- def test_rolling_min_how_resample(self):
+ def test_rolling_min_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
@@ -3068,7 +3068,7 @@ def test_rolling_min_how_resample(self):
r = series.resample('D').min().rolling(window=1)
tm.assert_series_equal(expected, r.min())
- def test_rolling_median_how_resample(self):
+ def test_rolling_median_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
diff --git a/setup.py b/setup.py
index e6480cfedaee0..515e1660fa6de 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
import sys
import shutil
from distutils.version import LooseVersion
-from setuptools import setup, Command
+from setuptools import setup, Command, find_packages
# versioning
import versioneer
@@ -716,99 +716,12 @@ def pxd(name):
setup(name=DISTNAME,
maintainer=AUTHOR,
version=versioneer.get_version(),
- packages=['pandas',
- 'pandas.api',
- 'pandas.api.types',
- 'pandas.compat',
- 'pandas.compat.numpy',
- 'pandas.core',
- 'pandas.core.dtypes',
- 'pandas.core.indexes',
- 'pandas.core.computation',
- 'pandas.core.reshape',
- 'pandas.core.sparse',
- 'pandas.core.tools',
- 'pandas.core.util',
- 'pandas.computation',
- 'pandas.errors',
- 'pandas.formats',
- 'pandas.io',
- 'pandas.io.json',
- 'pandas.io.sas',
- 'pandas.io.msgpack',
- 'pandas.io.formats',
- 'pandas.io.clipboard',
- 'pandas._libs',
- 'pandas._libs.tslibs',
- 'pandas.plotting',
- 'pandas.stats',
- 'pandas.types',
- 'pandas.util',
- 'pandas.tests',
- 'pandas.tests.api',
- 'pandas.tests.dtypes',
- 'pandas.tests.computation',
- 'pandas.tests.sparse',
- 'pandas.tests.frame',
- 'pandas.tests.generic',
- 'pandas.tests.indexing',
- 'pandas.tests.indexes',
- 'pandas.tests.indexes.datetimes',
- 'pandas.tests.indexes.timedeltas',
- 'pandas.tests.indexes.period',
- 'pandas.tests.internals',
- 'pandas.tests.io',
- 'pandas.tests.io.json',
- 'pandas.tests.io.parser',
- 'pandas.tests.io.sas',
- 'pandas.tests.io.msgpack',
- 'pandas.tests.io.formats',
- 'pandas.tests.groupby',
- 'pandas.tests.reshape',
- 'pandas.tests.reshape.merge',
- 'pandas.tests.series',
- 'pandas.tests.scalar',
- 'pandas.tests.tseries',
- 'pandas.tests.tseries.offsets',
- 'pandas.tests.plotting',
- 'pandas.tests.tools',
- 'pandas.tests.util',
- 'pandas.tools',
- 'pandas.tseries',
- ],
- package_data={'pandas.tests': ['data/*.csv'],
- 'pandas.tests.indexes': ['data/*.pickle'],
+ packages=find_packages(include=['pandas', 'pandas.*']),
+ package_data={'': ['data/*', 'templates/*'],
'pandas.tests.io': ['data/legacy_hdf/*.h5',
'data/legacy_pickle/*/*.pickle',
'data/legacy_msgpack/*/*.msgpack',
- 'data/*.csv*',
- 'data/*.dta',
- 'data/*.pickle',
- 'data/*.txt',
- 'data/*.xls',
- 'data/*.xlsx',
- 'data/*.xlsm',
- 'data/*.table',
- 'parser/data/*.csv',
- 'parser/data/*.gz',
- 'parser/data/*.bz2',
- 'parser/data/*.txt',
- 'parser/data/*.tar',
- 'parser/data/*.zip',
- 'parser/data/*.tar.gz',
- 'sas/data/*.csv',
- 'sas/data/*.xpt',
- 'sas/data/*.sas7bdat',
- 'data/*.html',
- 'data/html_encoding/*.html',
- 'json/data/*.json*'],
- 'pandas.tests.io.formats': ['data/*.csv'],
- 'pandas.tests.io.msgpack': ['data/*.mp'],
- 'pandas.tests.reshape': ['data/*.csv'],
- 'pandas.tests.reshape.merge': ['data/*.csv'],
- 'pandas.tests.tseries.offsets': ['data/*.pickle'],
- 'pandas.io.formats': ['templates/*.tpl']
- },
+ 'data/html_encoding/*.html']},
ext_modules=extensions,
maintainer_email=EMAIL,
description=DESCRIPTION,