diff --git a/.travis.yml b/.travis.yml
index bc87853b26d6e..0d143d7f7133b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,6 +22,7 @@ matrix:
- LOCALE_OVERRIDE="it_IT.UTF-8"
- BUILD_TYPE=conda
- JOB_NAME: "26_nslow_nnet"
+ - INSTALL_TEST=true
- python: 2.7
env:
- NOSE_ARGS="slow and not network and not disabled"
@@ -183,6 +184,7 @@ script:
# nothing here, or failed tests won't fail travis
after_script:
+ - ci/install_test.sh
- if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi
- source activate pandas && ci/print_versions.py
- ci/print_skipped.py /tmp/nosetests.xml
diff --git a/README.md b/README.md
index cea7e8c6bfd72..c76fbe7df9e6b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# pandas: powerful Python data analysis toolkit
-
+[](https://travis-ci.org/pydata/pandas)
## What is it
diff --git a/ci/install_test.sh b/ci/install_test.sh
new file mode 100755
index 0000000000000..e01ad7b94a349
--- /dev/null
+++ b/ci/install_test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+echo "inside $0"
+
+if [ "$INSTALL_TEST" ]; then
+ source activate pandas
+ echo "Starting installation test."
+ conda uninstall cython || exit 1
+ python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1
+ pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1
+ nosetests --exe -A "$NOSE_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml
+else
+ echo "Skipping installation test."
+fi
+RET="$?"
+
+exit "$RET"
diff --git a/doc/_templates/api_redirect.html b/doc/_templates/api_redirect.html
new file mode 100644
index 0000000000000..24bdd8363830f
--- /dev/null
+++ b/doc/_templates/api_redirect.html
@@ -0,0 +1,15 @@
+{% set pgn = pagename.split('.') -%}
+{% if pgn[-2][0].isupper() -%}
+ {% set redirect = ["pandas", pgn[-2], pgn[-1], 'html']|join('.') -%}
+{% else -%}
+ {% set redirect = ["pandas", pgn[-1], 'html']|join('.') -%}
+{% endif -%}
+
+
+
+ This API page has moved
+
+
+ This API page has moved here.
+
+
\ No newline at end of file
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 1749409c863df..688935c6b104d 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -594,6 +594,95 @@ faster than fancy indexing.
timeit ser.ix[indexer]
timeit ser.take(indexer)
+.. _indexing.categoricalindex:
+
+CategoricalIndex
+----------------
+
+.. versionadded:: 0.16.1
+
+We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting
+indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0)
+and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1,
+setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to a regular object-based ``Index``.
+
+.. ipython:: python
+
+ df = DataFrame({'A' : np.arange(6),
+ 'B' : Series(list('aabbca')).astype('category',
+ categories=list('cab'))
+ })
+ df
+ df.dtypes
+ df.B.cat.categories
+
+Setting the index will create a ``CategoricalIndex``
+
+.. ipython:: python
+
+ df2 = df.set_index('B')
+ df2.index
+
+Indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates.
+The indexers MUST be in the category or the operation will raise.
+
+.. ipython:: python
+
+ df2.loc['a']
+
+These PRESERVE the ``CategoricalIndex``
+
+.. ipython:: python
+
+ df2.loc['a'].index
+
+Sorting will order by the order of the categories
+
+.. ipython:: python
+
+ df2.sort_index()
+
+Groupby operations on the index will preserve the index nature as well
+
+.. ipython:: python
+
+ df2.groupby(level=0).sum()
+ df2.groupby(level=0).sum().index
+
+Reindexing operations, will return a resulting index based on the type of the passed
+indexer, meaning that passing a list will return a plain-old-``Index``; indexing with
+a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories
+of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with
+values NOT in the categories, similarly to how you can reindex ANY pandas index.
+
+.. ipython :: python
+
+ df2.reindex(['a','e'])
+ df2.reindex(['a','e']).index
+ df2.reindex(pd.Categorical(['a','e'],categories=list('abcde')))
+ df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index
+
+.. warning::
+
+  Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories
+ or a ``TypeError`` will be raised.
+
+ .. code-block:: python
+
+ In [10]: df3 = DataFrame({'A' : np.arange(6),
+ 'B' : Series(list('aabbca')).astype('category',
+ categories=list('abc'))
+ }).set_index('B')
+
+ In [11]: df3.index
+ Out[11]:
+ CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'],
+ categories=[u'a', u'b', u'c'],
+ ordered=False)
+
+   In [12]: pd.concat([df2,df3])
+ TypeError: categories must match existing categories when appending
+
.. _indexing.float64index:
Float64Index
@@ -706,4 +795,3 @@ Of course if you need integer based selection, then use ``iloc``
.. ipython:: python
dfir.iloc[0:5]
-
diff --git a/doc/source/api.rst b/doc/source/api.rst
index af9f8c84388bd..87e9b20f97e69 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -553,6 +553,7 @@ strings and apply several methods to it. These can be acccessed like
Series.str.swapcase
Series.str.title
Series.str.upper
+ Series.str.wrap
Series.str.zfill
Series.str.isalnum
Series.str.isalpha
@@ -1291,6 +1292,34 @@ Selecting
Index.slice_indexer
Index.slice_locs
+.. _api.categoricalindex:
+
+CategoricalIndex
+----------------
+
+.. autosummary::
+ :toctree: generated/
+
+ CategoricalIndex
+
+Categorical Components
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autosummary::
+ :toctree: generated/
+
+ CategoricalIndex.codes
+ CategoricalIndex.categories
+ CategoricalIndex.ordered
+ CategoricalIndex.rename_categories
+ CategoricalIndex.reorder_categories
+ CategoricalIndex.add_categories
+ CategoricalIndex.remove_categories
+ CategoricalIndex.remove_unused_categories
+ CategoricalIndex.set_categories
+ CategoricalIndex.as_ordered
+ CategoricalIndex.as_unordered
+
.. _api.datetimeindex:
DatetimeIndex
@@ -1521,230 +1550,3 @@ Working with options
get_option
set_option
option_context
-
-
-..
- HACK - see github issue #4539. To ensure old links remain valid, include
- here the autosummaries with previous currentmodules as a comment and add
- them to a hidden toctree (to avoid warnings):
-
-.. toctree::
- :hidden:
-
- generated/pandas.core.common.isnull
- generated/pandas.core.common.notnull
- generated/pandas.core.reshape.get_dummies
- generated/pandas.io.clipboard.read_clipboard
- generated/pandas.io.excel.ExcelFile.parse
- generated/pandas.io.excel.read_excel
- generated/pandas.io.html.read_html
- generated/pandas.io.json.read_json
- generated/pandas.io.parsers.read_csv
- generated/pandas.io.parsers.read_fwf
- generated/pandas.io.parsers.read_table
- generated/pandas.io.pickle.read_pickle
- generated/pandas.io.pytables.HDFStore.append
- generated/pandas.io.pytables.HDFStore.get
- generated/pandas.io.pytables.HDFStore.put
- generated/pandas.io.pytables.HDFStore.select
- generated/pandas.io.pytables.read_hdf
- generated/pandas.io.sql.read_sql
- generated/pandas.io.sql.read_frame
- generated/pandas.io.sql.write_frame
- generated/pandas.io.stata.read_stata
- generated/pandas.stats.moments.ewma
- generated/pandas.stats.moments.ewmcorr
- generated/pandas.stats.moments.ewmcov
- generated/pandas.stats.moments.ewmstd
- generated/pandas.stats.moments.ewmvar
- generated/pandas.stats.moments.expanding_apply
- generated/pandas.stats.moments.expanding_corr
- generated/pandas.stats.moments.expanding_count
- generated/pandas.stats.moments.expanding_cov
- generated/pandas.stats.moments.expanding_kurt
- generated/pandas.stats.moments.expanding_mean
- generated/pandas.stats.moments.expanding_median
- generated/pandas.stats.moments.expanding_quantile
- generated/pandas.stats.moments.expanding_skew
- generated/pandas.stats.moments.expanding_std
- generated/pandas.stats.moments.expanding_sum
- generated/pandas.stats.moments.expanding_var
- generated/pandas.stats.moments.rolling_apply
- generated/pandas.stats.moments.rolling_corr
- generated/pandas.stats.moments.rolling_count
- generated/pandas.stats.moments.rolling_cov
- generated/pandas.stats.moments.rolling_kurt
- generated/pandas.stats.moments.rolling_mean
- generated/pandas.stats.moments.rolling_median
- generated/pandas.stats.moments.rolling_quantile
- generated/pandas.stats.moments.rolling_skew
- generated/pandas.stats.moments.rolling_std
- generated/pandas.stats.moments.rolling_sum
- generated/pandas.stats.moments.rolling_var
- generated/pandas.tools.merge.concat
- generated/pandas.tools.merge.merge
- generated/pandas.tools.pivot.pivot_table
- generated/pandas.tseries.tools.to_datetime
-
-..
- .. currentmodule:: pandas.io.pickle
-
- .. autosummary::
- :toctree: generated/
-
- read_pickle
-
- .. currentmodule:: pandas.io.parsers
-
- .. autosummary::
- :toctree: generated/
-
- read_table
- read_csv
- read_fwf
-
- .. currentmodule:: pandas.io.clipboard
-
- .. autosummary::
- :toctree: generated/
-
- read_clipboard
-
- .. currentmodule:: pandas.io.excel
-
- .. autosummary::
- :toctree: generated/
-
- read_excel
- ExcelFile.parse
-
- .. currentmodule:: pandas.io.json
-
- .. autosummary::
- :toctree: generated/
-
- read_json
-
- .. currentmodule:: pandas.io.html
-
- .. autosummary::
- :toctree: generated/
-
- read_html
-
- .. currentmodule:: pandas.io.pytables
-
- .. autosummary::
- :toctree: generated/
-
- read_hdf
- HDFStore.put
- HDFStore.append
- HDFStore.get
- HDFStore.select
-
- .. currentmodule:: pandas.io.sql
-
- .. autosummary::
- :toctree: generated/
-
- read_sql
- read_frame
- write_frame
-
- .. currentmodule:: pandas.io.stata
-
- .. autosummary::
- :toctree: generated/
-
- read_stata
- StataReader.data
- StataReader.data_label
- StataReader.value_labels
- StataReader.variable_labels
- StataWriter.write_file
-
- .. currentmodule:: pandas.tools.pivot
-
- .. autosummary::
- :toctree: generated/
-
- pivot_table
-
- .. currentmodule:: pandas.tools.merge
-
- .. autosummary::
- :toctree: generated/
-
- merge
- concat
-
- .. currentmodule:: pandas.core.reshape
-
- .. autosummary::
- :toctree: generated/
-
- get_dummies
-
- .. currentmodule:: pandas.core.common
-
- .. autosummary::
- :toctree: generated/
-
- isnull
- notnull
-
- .. currentmodule:: pandas.tseries.tools
-
- .. autosummary::
- :toctree: generated/
-
- to_datetime
-
-
- .. currentmodule:: pandas.stats.moments
-
- .. autosummary::
- :toctree: generated/
-
- rolling_count
- rolling_sum
- rolling_mean
- rolling_median
- rolling_var
- rolling_std
- rolling_corr
- rolling_cov
- rolling_skew
- rolling_kurt
- rolling_apply
- rolling_quantile
-
-
- .. currentmodule:: pandas.stats.moments
-
- .. autosummary::
- :toctree: generated/
-
- expanding_count
- expanding_sum
- expanding_mean
- expanding_median
- expanding_var
- expanding_std
- expanding_corr
- expanding_cov
- expanding_skew
- expanding_kurt
- expanding_apply
- expanding_quantile
-
-
- .. autosummary::
- :toctree: generated/
-
- ewma
- ewmstd
- ewmvar
- ewmcorr
- ewmcov
diff --git a/doc/source/conf.py b/doc/source/conf.py
index fcb9c3fdd0016..08fc8483762ab 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -211,7 +211,30 @@
# Additional templates that should be rendered to pages, maps page names to
# template names.
-# html_additional_pages = {}
+
+# Add redirect for previously existing API pages (which are now included in
+# the API pages as top-level functions) based on a template (GH9911)
+moved_api_pages = [
+ 'pandas.core.common.isnull', 'pandas.core.common.notnull', 'pandas.core.reshape.get_dummies',
+ 'pandas.tools.merge.concat', 'pandas.tools.merge.merge', 'pandas.tools.pivot.pivot_table',
+ 'pandas.tseries.tools.to_datetime', 'pandas.io.clipboard.read_clipboard', 'pandas.io.excel.ExcelFile.parse',
+ 'pandas.io.excel.read_excel', 'pandas.io.html.read_html', 'pandas.io.json.read_json',
+ 'pandas.io.parsers.read_csv', 'pandas.io.parsers.read_fwf', 'pandas.io.parsers.read_table',
+ 'pandas.io.pickle.read_pickle', 'pandas.io.pytables.HDFStore.append', 'pandas.io.pytables.HDFStore.get',
+ 'pandas.io.pytables.HDFStore.put', 'pandas.io.pytables.HDFStore.select', 'pandas.io.pytables.read_hdf',
+ 'pandas.io.sql.read_sql', 'pandas.io.sql.read_frame', 'pandas.io.sql.write_frame',
+ 'pandas.io.stata.read_stata', 'pandas.stats.moments.ewma', 'pandas.stats.moments.ewmcorr',
+ 'pandas.stats.moments.ewmcov', 'pandas.stats.moments.ewmstd', 'pandas.stats.moments.ewmvar',
+ 'pandas.stats.moments.expanding_apply', 'pandas.stats.moments.expanding_corr', 'pandas.stats.moments.expanding_count',
+ 'pandas.stats.moments.expanding_cov', 'pandas.stats.moments.expanding_kurt', 'pandas.stats.moments.expanding_mean',
+ 'pandas.stats.moments.expanding_median', 'pandas.stats.moments.expanding_quantile', 'pandas.stats.moments.expanding_skew',
+ 'pandas.stats.moments.expanding_std', 'pandas.stats.moments.expanding_sum', 'pandas.stats.moments.expanding_var',
+ 'pandas.stats.moments.rolling_apply', 'pandas.stats.moments.rolling_corr', 'pandas.stats.moments.rolling_count',
+ 'pandas.stats.moments.rolling_cov', 'pandas.stats.moments.rolling_kurt', 'pandas.stats.moments.rolling_mean',
+ 'pandas.stats.moments.rolling_median', 'pandas.stats.moments.rolling_quantile', 'pandas.stats.moments.rolling_skew',
+ 'pandas.stats.moments.rolling_std', 'pandas.stats.moments.rolling_sum', 'pandas.stats.moments.rolling_var']
+
+html_additional_pages = {'generated/' + page: 'api_redirect.html' for page in moved_api_pages}
# If false, no module index is generated.
html_use_modindex = True
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 0e6386955a653..f69f926296020 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -1006,6 +1006,9 @@ The :ref:`HDFStores ` docs
`Merging on-disk tables with millions of rows
`__
+`Avoiding inconsistencies when writing to a store from multiple processes/threads
+`__
+
De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from
csv file and creating a store by chunks, with date parsing as well.
`See here
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index e6b735173110b..d007446a5b922 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -66,7 +66,7 @@ Here's the function in pure python:
s += f(a + i * dx)
return s * dx
-We achieve our result by by using ``apply`` (row-wise):
+We achieve our result by using ``apply`` (row-wise):
.. ipython:: python
@@ -86,7 +86,7 @@ hence we'll concentrate our efforts cythonizing these two functions.
.. note::
In python 2 replacing the ``range`` with its generator counterpart (``xrange``)
- would mean the ``range`` line would vanish. In python 3 range is already a generator.
+ would mean the ``range`` line would vanish. In python 3 ``range`` is already a generator.
.. _enhancingperf.plain:
@@ -248,7 +248,7 @@ efforts here.
More advanced techniques
~~~~~~~~~~~~~~~~~~~~~~~~
-There is still scope for improvement, here's an example of using some more
+There is still hope for improvement. Here's an example of using some more
advanced cython techniques:
.. ipython::
@@ -373,7 +373,7 @@ This Python syntax is **not** allowed:
:func:`~pandas.eval` Examples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-:func:`pandas.eval` works well with expressions containing large arrays
+:func:`pandas.eval` works well with expressions containing large arrays.
First let's create a few decent-sized arrays to play with:
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
index 467ec02b55f20..20762e3fc039f 100644
--- a/doc/source/faq.rst
+++ b/doc/source/faq.rst
@@ -369,3 +369,4 @@ just a thin layer around the ``QTableView``.
mw = MainWidget()
mw.show()
app.exec_()
+
diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 9418ca5265f1a..bc1189a8961d6 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -95,3 +95,155 @@ constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.
+.. _:
+
+Subclassing pandas Data Structures
+----------------------------------
+
+.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures.
+
+ 1. Monkey-patching: See :ref:`Adding Features to your pandas Installation `.
+
+ 2. Use *composition*. See `here `_.
+
+This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention:
+
+1. Override constructor properties.
+2. Define original properties
+
+.. note:: You can find a nice example in `geopandas `_ project.
+
+Override Constructor Properties
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each data structure has constructor properties to specify data constructors. By overriding these properties, you can retain defined classes through ``pandas`` data manipulations.
+
+There are 3 constructors to be defined:
+
+- ``_constructor``: Used when a manipulation result has the same dimensions as the original.
+- ``_constructor_sliced``: Used when a manipulation result has one lower dimension than the original, such as ``DataFrame`` single columns slicing.
+- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
+
+The following table shows how ``pandas`` data structures define constructor properties by default.
+
+=========================== ======================= =================== =======================
+Property Attributes ``Series`` ``DataFrame`` ``Panel``
+=========================== ======================= =================== =======================
+``_constructor`` ``Series`` ``DataFrame`` ``Panel``
+``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame``
+``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError``
+=========================== ======================= =================== =======================
+
+The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.
+
+.. code-block:: python
+
+ class SubclassedSeries(Series):
+
+ @property
+ def _constructor(self):
+ return SubclassedSeries
+
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedDataFrame
+
+ class SubclassedDataFrame(DataFrame):
+
+ @property
+ def _constructor(self):
+ return SubclassedDataFrame
+
+ @property
+ def _constructor_sliced(self):
+ return SubclassedSeries
+
+.. code-block:: python
+
+ >>> s = SubclassedSeries([1, 2, 3])
+ >>> type(s)
+
+
+ >>> to_framed = s.to_frame()
+ >>> type(to_framed)
+
+
+    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+ >>> df
+ A B C
+ 0 1 4 7
+ 1 2 5 8
+ 2 3 6 9
+
+ >>> type(df)
+
+
+ >>> sliced1 = df[['A', 'B']]
+ >>> sliced1
+ A B
+ 0 1 4
+ 1 2 5
+ 2 3 6
+ >>> type(sliced1)
+
+
+ >>> sliced2 = df['A']
+ >>> sliced2
+ 0 1
+ 1 2
+ 2 3
+ Name: A, dtype: int64
+ >>> type(sliced2)
+
+
+Define Original Properties
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways:
+
+1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
+2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
+
+Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property
+
+.. code-block:: python
+
+ class SubclassedDataFrame2(DataFrame):
+
+ # temporary properties
+ _internal_names = DataFrame._internal_names + ['internal_cache']
+ _internal_names_set = set(_internal_names)
+
+ # normal properties
+ _metadata = ['added_property']
+
+ @property
+ def _constructor(self):
+ return SubclassedDataFrame2
+
+.. code-block:: python
+
+    >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+ >>> df
+ A B C
+ 0 1 4 7
+ 1 2 5 8
+ 2 3 6 9
+
+ >>> df.internal_cache = 'cached'
+ >>> df.added_property = 'property'
+
+ >>> df.internal_cache
+ cached
+ >>> df.added_property
+ property
+
+    # properties defined in _internal_names are reset after manipulation
+ >>> df[['A', 'B']].internal_cache
+ AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
+
+ # properties defined in _metadata are retained
+ >>> df[['A', 'B']].added_property
+ property
+
+
diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst
index ac9b6c9aecc4a..65fcf600cdfd2 100644
--- a/doc/source/remote_data.rst
+++ b/doc/source/remote_data.rst
@@ -49,7 +49,7 @@ Yahoo! Finance
import datetime
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)
- f=web.DataReader("F", 'yahoo', start, end)
+ f = web.DataReader("F", 'yahoo', start, end)
f.ix['2010-01-04']
.. _remote_data.yahoo_options:
@@ -58,10 +58,10 @@ Yahoo! Finance Options
----------------------
***Experimental***
-The Options class allows the download of options data from Yahoo! Finance.
+The ``Options`` class allows the download of options data from Yahoo! Finance.
The ``get_all_data`` method downloads and caches option data for all expiry months
-and provides a formatted ``DataFrame`` with a hierarchical index, so its easy to get
+and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get
to the specific option you want.
.. ipython:: python
@@ -71,10 +71,10 @@ to the specific option you want.
data = aapl.get_all_data()
data.iloc[0:5, 0:5]
- #Show the $100 strike puts at all expiry dates:
+ # Show the $100 strike puts at all expiry dates:
data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5]
- #Show the volume traded of $100 strike puts at all expiry dates:
+ # Show the volume traded of $100 strike puts at all expiry dates:
data.loc[(100, slice(None), 'put'),'Vol'].head()
If you don't want to download all the data, more specific requests can be made.
@@ -121,7 +121,7 @@ Google Finance
import datetime
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)
- f=web.DataReader("F", 'google', start, end)
+ f = web.DataReader("F", 'google', start, end)
f.ix['2010-01-04']
.. _remote_data.fred:
@@ -152,7 +152,7 @@ Dataset names are listed at `Fama/French Data Library
.. ipython:: python
import pandas.io.data as web
- ip=web.DataReader("5_Industry_Portfolios", "famafrench")
+ ip = web.DataReader("5_Industry_Portfolios", "famafrench")
ip[4].ix[192607]
.. _remote_data.wb:
@@ -302,9 +302,8 @@ Problematic Country Codes & Indicators
:func:`wb.download()` is more flexible. To achieve this, the warning
and exception logic changed.
-The world bank converts some country codes,
-in their response, which makes error checking by pandas difficult.
-Retired indicators still persist in the search.
+The world bank converts some country codes in their response, which makes error
+checking by pandas difficult. Retired indicators still persist in the search.
Given the new flexibility of 0.15.1, improved error handling by the user
may be necessary for fringe cases.
@@ -377,13 +376,13 @@ The following will fetch users and pageviews (metrics) data per day of the week,
filters = "pagePath=~aboutus;ga:country==France",
)
-The only mandatory arguments are ``metrics,`` ``dimensions`` and ``start_date``. We can only strongly recommend you to always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics.
+The only mandatory arguments are ``metrics``, ``dimensions`` and ``start_date``. We strongly recommend that you always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics.
The ``index_col`` argument indicates which dimension(s) has to be taken as index.
-The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page has URL has to contain ``aboutus`` AND the visitors country has to be France.
+The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page URL has to contain ``aboutus`` AND the visitors country has to be France.
-Detailed informations in the followings:
+Detailed information in the following:
* `pandas & google analytics, by yhat `__
* `Google Analytics integration in pandas, by Chang She `__
diff --git a/doc/source/text.rst b/doc/source/text.rst
index f417f56f51fbc..dea40fb48748d 100644
--- a/doc/source/text.rst
+++ b/doc/source/text.rst
@@ -266,7 +266,7 @@ Method Summary
:meth:`~Series.str.upper`,Equivalent to ``str.upper``
:meth:`~Series.str.find`,Equivalent to ``str.find``
:meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
- :meth:`~Series.str.capicalize`,Equivalent to ``str.capitalize``
+ :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
:meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
:meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum``
:meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha``
@@ -276,4 +276,4 @@ Method Summary
:meth:`~Series.str.isupper`,Equivalent to ``str.isupper``
:meth:`~Series.str.istitle`,Equivalent to ``str.istitle``
:meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric``
- :meth:`~Series.str.isnumeric`,Equivalent to ``str.isdecimal``
+ :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal``
diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst
index 786a46d343be1..8215414e425fe 100644
--- a/doc/source/timedeltas.rst
+++ b/doc/source/timedeltas.rst
@@ -29,13 +29,13 @@ Time Deltas
Starting in v0.15.0, we introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner,
but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes.
-Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds.
+Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes, seconds.
They can be both positive and negative.
Parsing
-------
-You can construct a ``Timedelta`` scalar thru various arguments:
+You can construct a ``Timedelta`` scalar through various arguments:
.. ipython:: python
@@ -46,7 +46,7 @@ You can construct a ``Timedelta`` scalar thru various arguments:
Timedelta('-1 days 2 min 3us')
# like datetime.timedelta
- # note: these MUST be specified as keyword argments
+ # note: these MUST be specified as keyword arguments
Timedelta(days=1,seconds=1)
# integers with a unit
@@ -100,7 +100,7 @@ It will construct Series if the input is a Series, a scalar if the input is scal
Operations
----------
-You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series thru
+You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series through
subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``.
.. ipython:: python
@@ -290,7 +290,7 @@ TimedeltaIndex
.. versionadded:: 0.15.0
-To generate an index with time delta, you can use either the TimedeltaIndex or
+To generate an index with time delta, you can use either the ``TimedeltaIndex`` or
the ``timedelta_range`` constructor.
Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``,
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 9d4cba2e5ee8c..43fa6ea759b33 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -267,7 +267,7 @@ You can pass other keywords supported by matplotlib ``hist``. For example, horiz
plt.close('all')
See the :meth:`hist ` method and the
-`matplotlib hist documenation `__ for more.
+`matplotlib hist documentation `__ for more.
The existing interface ``DataFrame.hist`` to plot histogram still can be used.
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index 659aa6786b366..b42c22364ef16 100755
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -7,6 +7,10 @@ This is a minor bug-fix release from 0.16.0 and includes a a large number of
bug fixes along several new features, enhancements, and performance improvements.
We recommend that all users upgrade to this version.
+Highlights include:
+
+- Support for a ``CategoricalIndex``, a category based index, see :ref:`here `
+
.. contents:: What's new in v0.16.1
:local:
:backlinks: none
@@ -20,17 +24,18 @@ Enhancements
- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
- The `.str` accessor is now available for both `Series` and `Index`.
+ The ``.str`` accessor is now available for both ``Series`` and ``Index``.
.. ipython:: python
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip()
- One special case for the `.str` accessor on `Index` is that if a string method returns `bool`, the `.str` accessor
- will return a `np.array` instead of a boolean `Index` (:issue:`8875`). This enables the following expression
+ One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor
+ will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
to work naturally:
+
.. ipython:: python
idx = Index(['a1', 'a2', 'b1', 'b2'])
@@ -40,7 +45,8 @@ Enhancements
s[s.index.str.startswith('a')]
- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
-- ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of label does not exist in the target data. (:issue:`6736`)
+
+- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any label does not exist in the target data. (:issue:`6736`)
.. ipython:: python
@@ -55,6 +61,79 @@ Enhancements
- Allow Panel.shift with ``axis='items'`` (:issue:`9890`)
- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`)
+- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`)
+
+- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`)
+
+- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here `
+
+.. _whatsnew_0161.enhancements.categoricalindex:
+
+CategoricalIndex
+^^^^^^^^^^^^^^^^
+
+We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting
+indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0)
+and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1,
+setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to a regular object-based ``Index``.
+
+.. ipython :: python
+
+ df = DataFrame({'A' : np.arange(6),
+ 'B' : Series(list('aabbca')).astype('category',
+ categories=list('cab'))
+ })
+ df
+ df.dtypes
+ df.B.cat.categories
+
+setting the index will create a ``CategoricalIndex``
+
+.. ipython :: python
+
+ df2 = df.set_index('B')
+ df2.index
+
+indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an Index with duplicates.
+The indexers MUST be in the category or the operation will raise.
+
+.. ipython :: python
+
+ df2.loc['a']
+
+and preserves the ``CategoricalIndex``
+
+.. ipython :: python
+
+ df2.loc['a'].index
+
+sorting will order by the order of the categories
+
+.. ipython :: python
+
+ df2.sort_index()
+
+groupby operations on the index will preserve the index nature as well
+
+.. ipython :: python
+
+ df2.groupby(level=0).sum()
+ df2.groupby(level=0).sum().index
+
+reindexing operations, will return a resulting index based on the type of the passed
+indexer, meaning that passing a list will return a plain-old-``Index``; indexing with
+a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories
+of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with
+values NOT in the categories, similarly to how you can reindex ANY pandas index.
+
+.. ipython :: python
+
+ df2.reindex(['a','e'])
+ df2.reindex(['a','e']).index
+ df2.reindex(pd.Categorical(['a','e'],categories=list('abcde')))
+ df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index
+
+See the :ref:`documentation ` for more. (:issue:`7629`)
.. _whatsnew_0161.api:
@@ -87,7 +166,8 @@ API changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~
-
+- Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`)
+- Improved csv write performance generally by 2x (:issue:`9940`)
@@ -99,15 +179,19 @@ Bug Fixes
- Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated.
- Bug in json serialization when frame has length zero.(:issue:`9805`)
-- Bug in `read_csv` where missing trailing delimiters would cause segfault. (:issue:`5664`)
+- Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`)
- Bug in retaining index name on appending (:issue:`9862`)
- Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`)
- Fixed bug in ``StataWriter`` resulting in changes to input ``DataFrame`` upon save (:issue:`9795`).
- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`)
- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`)
+
- Bug in ``read_sql_table`` error when reading postgres table with timezone (:issue:`7139`)
- Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`)
- Bug where ``TimdeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`)
+
+- Bug in ``groupby.apply()`` that would raise if a passed user defined function returned only ``None`` for all input. (:issue:`9685`)
+
- Bug in plotting continuously using ``secondary_y`` may not show legend properly. (:issue:`9610`, :issue:`9779`)
- Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`)
- Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`)
@@ -118,24 +202,22 @@ Bug Fixes
- Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`)
- Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`)
- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
+
- Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`)
- Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`)
-- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9875`)
+- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`)
- Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`)
- Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`)
-- Bug in unequal comparisons between a ``Series`` of dtype `"category"` and a scalar (e.g. ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``, which wouldn't use the order of the categories but use the lexicographical order. (:issue:`9848`)
-
-
-
-
-
-
+- Bug in ``GroupBy.size`` not attaching index name properly if grouped by ``TimeGrouper`` (:issue:`9925`)
+- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`)
+- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
+- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)
@@ -143,7 +225,7 @@ Bug Fixes
- Bug in unequal comparisons between categorical data and a scalar, which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``. This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`)
- Bug in DataFrame ``__setitem__`` when right hand side is a dictionary (:issue:`9874`)
- Bug in ``where`` when dtype is ``datetime64/timedelta64``, but dtype of other is not (:issue:`9804`)
-- Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9875`)
+- Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9856`)
- Bug in which ``groupby.transform`` incorrectly enforced output dtypes to match input dtypes. (:issue:`9807`)
- Bug in bar plot with ``log=True`` raises ``TypeError`` if all values are less than 1 (:issue:`9905`)
@@ -161,3 +243,10 @@ Bug Fixes
- Changed caching in ``AbstractHolidayCalendar`` to be at the instance level rather than at the class level as the latter can result in unexpected behaviour. (:issue:`9552`)
- Fixed latex output for multi-indexed dataframes (:issue:`9778`)
+- Bug causing an exception when setting an empty range using ``DataFrame.loc`` (:issue:`9596`)
+
+
+- Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`)
+
+- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
+
diff --git a/pandas/core/api.py b/pandas/core/api.py
index a8b10342593ce..fde9bc77c4bd9 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -8,7 +8,7 @@
from pandas.core.categorical import Categorical
from pandas.core.groupby import Grouper
from pandas.core.format import set_eng_float_format
-from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex
+from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
from pandas.core.series import Series, TimeSeries
from pandas.core.frame import DataFrame
diff --git a/pandas/core/base.py b/pandas/core/base.py
index a25651a73f507..9c27f3c7a2cc3 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -86,16 +86,22 @@ def __unicode__(self):
# Should be overwritten by base classes
return object.__repr__(self)
- def _local_dir(self):
- """ provide addtional __dir__ for this object """
- return []
+ def _dir_additions(self):
+ """ add addtional __dir__ for this object """
+ return set()
+
+ def _dir_deletions(self):
+ """ delete unwanted __dir__ for this object """
+ return set()
def __dir__(self):
"""
Provide method name lookup and completion
Only provide 'public' methods
"""
- return list(sorted(list(set(dir(type(self)) + self._local_dir()))))
+ rv = set(dir(type(self)))
+ rv = (rv - self._dir_deletions()) | self._dir_additions()
+ return sorted(rv)
def _reset_cache(self, key=None):
"""
@@ -121,7 +127,7 @@ def _delegate_method(self, name, *args, **kwargs):
raise TypeError("You cannot call method {name}".format(name=name))
@classmethod
- def _add_delegate_accessors(cls, delegate, accessors, typ):
+ def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False):
"""
add accessors to cls from the delegate class
@@ -131,6 +137,8 @@ def _add_delegate_accessors(cls, delegate, accessors, typ):
delegate : the class to get methods/properties & doc-strings
acccessors : string list of accessors to add
typ : 'property' or 'method'
+ overwrite : boolean, default False
+ overwrite the method/property in the target class if it exists
"""
@@ -164,7 +172,7 @@ def f(self, *args, **kwargs):
f = _create_delegator_method(name)
# don't overwrite existing methods/properties
- if not hasattr(cls, name):
+ if overwrite or not hasattr(cls, name):
setattr(cls,name,f)
@@ -516,6 +524,16 @@ def _make_str_accessor(self):
str = AccessorProperty(StringMethods, _make_str_accessor)
+ def _dir_additions(self):
+ return set()
+
+ def _dir_deletions(self):
+ try:
+ getattr(self, 'str')
+ except AttributeError:
+ return set(['str'])
+ return set()
+
_shared_docs['drop_duplicates'] = (
"""Return %(klass)s with duplicate values removed
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 0d66a89b0a585..caf706fcbcbbd 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -9,12 +9,11 @@
from pandas.core.algorithms import factorize
from pandas.core.base import PandasObject, PandasDelegate
-from pandas.core.index import Index, _ensure_index
-from pandas.tseries.period import PeriodIndex
import pandas.core.common as com
from pandas.util.decorators import cache_readonly
-from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull,
+from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
+ isnull, notnull, is_dtype_equal,
is_categorical_dtype, is_integer_dtype, is_object_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds,
is_list_like, is_sequence, is_null_slice, is_bool,
@@ -22,7 +21,6 @@
_coerce_indexer_dtype, _values_from_object, take_1d)
from pandas.util.terminal import get_terminal_size
from pandas.core.config import get_option
-from pandas.core import format as fmt
def _cat_compare_op(op):
def f(self, other):
@@ -86,7 +84,7 @@ def f(self, other):
def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
- if isinstance(array, ABCSeries):
+ if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array.values
return array
@@ -236,15 +234,17 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
# sanitize input
if is_categorical_dtype(values):
- # we are either a Series or a Categorical
- cat = values
- if isinstance(values, ABCSeries):
- cat = values.values
+ # we are either a Series or a CategoricalIndex
+ if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
+ values = values.values
+
+ if ordered is None:
+ ordered = values.ordered
if categories is None:
- categories = cat.categories
+ categories = values.categories
values = values.__array__()
- elif isinstance(values, Index):
+ elif isinstance(values, ABCIndexClass):
pass
else:
@@ -295,11 +295,11 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
warn("Values and categories have different dtypes. Did you mean to use\n"
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning)
- if is_integer_dtype(values) and (codes == -1).all():
+ if len(values) and is_integer_dtype(values) and (codes == -1).all():
warn("None of the categories were found in values. Did you mean to use\n"
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning)
- self.set_ordered(ordered, inplace=True)
+ self.set_ordered(ordered or False, inplace=True)
self.categories = categories
self.name = name
self._codes = _coerce_indexer_dtype(codes, categories)
@@ -309,11 +309,27 @@ def copy(self):
return Categorical(values=self._codes.copy(),categories=self.categories,
name=self.name, ordered=self.ordered, fastpath=True)
+ def astype(self, dtype):
+ """ coerce this type to another dtype """
+ if is_categorical_dtype(dtype):
+ return self
+ return np.array(self, dtype=dtype)
+
@cache_readonly
def ndim(self):
"""Number of dimensions of the Categorical """
return self._codes.ndim
+ @cache_readonly
+ def size(self):
+ """ return the len of myself """
+ return len(self)
+
+ @cache_readonly
+ def itemsize(self):
+ """ return the size of a single category """
+ return self.categories.itemsize
+
def reshape(self, new_shape, **kwargs):
""" compat with .reshape """
return self
@@ -395,7 +411,8 @@ def _set_codes(self, codes):
codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc)
def _get_labels(self):
- """ Get the category labels (deprecated).
+ """
+ Get the category labels (deprecated).
Deprecated, use .codes!
"""
@@ -409,8 +426,10 @@ def _get_labels(self):
@classmethod
def _validate_categories(cls, categories):
- """" Validates that we have good categories """
- if not isinstance(categories, Index):
+ """
+ Validates that we have good categories
+ """
+ if not isinstance(categories, ABCIndexClass):
dtype = None
if not hasattr(categories, "dtype"):
categories = _convert_to_list_like(categories)
@@ -421,6 +440,8 @@ def _validate_categories(cls, categories):
with_na = np.array(categories)
if with_na.dtype != without_na.dtype:
dtype = "object"
+
+ from pandas import Index
categories = Index(categories, dtype=dtype)
if not categories.is_unique:
raise ValueError('Categorical categories must be unique')
@@ -687,7 +708,7 @@ def add_categories(self, new_categories, inplace=False):
if len(already_included) != 0:
msg = "new categories must not include old categories: %s" % str(already_included)
raise ValueError(msg)
- new_categories = list(self._categories) + (new_categories)
+ new_categories = list(self._categories) + list(new_categories)
new_categories = self._validate_categories(new_categories)
cat = self if inplace else self.copy()
cat._categories = new_categories
@@ -761,6 +782,8 @@ def remove_unused_categories(self, inplace=False):
cat = self if inplace else self.copy()
_used = sorted(np.unique(cat._codes))
new_categories = cat.categories.take(_ensure_platform_int(_used))
+
+ from pandas.core.index import _ensure_index
new_categories = _ensure_index(new_categories)
cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
cat._categories = new_categories
@@ -790,7 +813,8 @@ def shape(self):
return tuple([len(self._codes)])
def __array__(self, dtype=None):
- """ The numpy array interface.
+ """
+ The numpy array interface.
Returns
-------
@@ -799,7 +823,7 @@ def __array__(self, dtype=None):
dtype as categorical.categories.dtype
"""
ret = take_1d(self.categories.values, self._codes)
- if dtype and dtype != self.categories.dtype:
+ if dtype and not is_dtype_equal(dtype,self.categories.dtype):
return np.asarray(ret, dtype)
return ret
@@ -997,7 +1021,7 @@ def get_values(self):
"""
# if we are a period index, return a string repr
- if isinstance(self.categories, PeriodIndex):
+ if isinstance(self.categories, ABCPeriodIndex):
return take_1d(np.array(self.categories.to_native_types(), dtype=object),
self._codes)
@@ -1243,7 +1267,8 @@ def __iter__(self):
"""Returns an Iterator over the values of this Categorical."""
return iter(np.array(self))
- def _tidy_repr(self, max_vals=10):
+ def _tidy_repr(self, max_vals=10, footer=True):
+ """ a short repr displaying only max_vals and an optional (but default footer) """
num = max_vals // 2
head = self[:num]._get_repr(length=False, name=False, footer=False)
tail = self[-(max_vals - num):]._get_repr(length=False,
@@ -1251,23 +1276,31 @@ def _tidy_repr(self, max_vals=10):
footer=False)
result = '%s, ..., %s' % (head[:-1], tail[1:])
- result = '%s\n%s' % (result, self._repr_footer())
+ if footer:
+ result = '%s\n%s' % (result, self._repr_footer())
return compat.text_type(result)
- def _repr_categories_info(self):
- """ Returns a string representation of the footer."""
-
+ def _repr_categories(self):
+ """ return the base repr for the categories """
max_categories = (10 if get_option("display.max_categories") == 0
else get_option("display.max_categories"))
+ from pandas.core import format as fmt
category_strs = fmt.format_array(self.categories.get_values(), None)
if len(category_strs) > max_categories:
num = max_categories // 2
head = category_strs[:num]
tail = category_strs[-(max_categories - num):]
category_strs = head + ["..."] + tail
+
# Strip all leading spaces, which format_array adds for columns...
category_strs = [x.strip() for x in category_strs]
+ return category_strs
+
+ def _repr_categories_info(self):
+ """ Returns a string representation of the footer."""
+
+ category_strs = self._repr_categories()
levheader = "Categories (%d, %s): " % (len(self.categories),
self.categories.dtype)
width, height = get_terminal_size()
@@ -1299,8 +1332,11 @@ def _repr_footer(self):
len(self), self._repr_categories_info())
def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True):
- formatter = fmt.CategoricalFormatter(self, name=name,
- length=length, na_rep=na_rep,
+ from pandas.core import format as fmt
+ formatter = fmt.CategoricalFormatter(self,
+ name=name,
+ length=length,
+ na_rep=na_rep,
footer=footer)
result = formatter.to_string()
return compat.text_type(result)
@@ -1315,9 +1351,9 @@ def __unicode__(self):
name=True)
else:
result = '[], %s' % self._get_repr(name=True,
- length=False,
- footer=True,
- ).replace("\n",", ")
+ length=False,
+ footer=True,
+ ).replace("\n",", ")
return result
@@ -1358,6 +1394,8 @@ def __setitem__(self, key, value):
"categories")
rvalue = value if is_list_like(value) else [value]
+
+ from pandas import Index
to_add = Index(rvalue).difference(self.categories)
# no assignments of values not in categories, but it's always ok to set something to np.nan
@@ -1516,11 +1554,27 @@ def equals(self, other):
-------
are_equal : boolean
"""
- if not isinstance(other, Categorical):
- return False
# TODO: should this also test if name is equal?
- return (self.categories.equals(other.categories) and self.ordered == other.ordered and
- np.array_equal(self._codes, other._codes))
+ return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes)
+
+ def is_dtype_equal(self, other):
+ """
+ Returns True if categoricals are the same dtype
+ same categories, and same ordered
+
+ Parameters
+ ----------
+ other : Categorical
+
+ Returns
+ -------
+ are_equal : boolean
+ """
+
+ try:
+ return self.categories.equals(other.categories) and self.ordered == other.ordered
+ except (AttributeError, TypeError):
+ return False
def describe(self):
""" Describes this Categorical
@@ -1604,18 +1658,20 @@ def _delegate_method(self, name, *args, **kwargs):
##### utility routines #####
def _get_codes_for_values(values, categories):
- """"
+ """
utility routine to turn values into codes given the specified categories
"""
from pandas.core.algorithms import _get_data_algo, _hashtables
- if values.dtype != categories.dtype:
+ if not is_dtype_equal(values.dtype,categories.dtype):
values = _ensure_object(values)
categories = _ensure_object(categories)
+
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
- t = hash_klass(len(categories))
- t.map_locations(_values_from_object(categories))
- return _coerce_indexer_dtype(t.lookup(values), categories)
+ (_, _), cats = _get_data_algo(categories, _hashtables)
+ t = hash_klass(len(cats))
+ t.map_locations(cats)
+ return _coerce_indexer_dtype(t.lookup(vals), cats)
def _convert_to_list_like(list_like):
if hasattr(list_like, "dtype"):
diff --git a/pandas/core/common.py b/pandas/core/common.py
index ffe12d0c1546c..3d23aeff942dc 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -83,6 +83,16 @@ def _check(cls, inst):
ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",))
ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",))
ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",))
+ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",))
+ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index",
+ "int64index",
+ "float64index",
+ "multiindex",
+ "datetimeindex",
+ "timedeltaindex",
+ "periodindex",
+ "categoricalindex"))
+
ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",))
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
@@ -2455,11 +2465,27 @@ def _get_dtype_type(arr_or_dtype):
return np.dtype(arr_or_dtype).type
elif isinstance(arr_or_dtype, CategoricalDtype):
return CategoricalDtypeType
+ elif isinstance(arr_or_dtype, compat.string_types):
+ if is_categorical_dtype(arr_or_dtype):
+ return CategoricalDtypeType
+ return _get_dtype_type(np.dtype(arr_or_dtype))
try:
return arr_or_dtype.dtype.type
except AttributeError:
raise ValueError('%r is not a dtype' % arr_or_dtype)
+def is_dtype_equal(source, target):
+ """ return a boolean if the dtypes are equal """
+ source = _get_dtype_type(source)
+ target = _get_dtype_type(target)
+
+ try:
+ return source == target
+ except TypeError:
+
+ # invalid comparison
+ # object == category will hit this
+ return False
def is_any_int_dtype(arr_or_dtype):
tipo = _get_dtype_type(arr_or_dtype)
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 06e1fab27cd6d..6e632e6ea741b 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -14,15 +14,14 @@
from pandas.core.config import get_option, set_option
import pandas.core.common as com
import pandas.lib as lib
-from pandas.tslib import iNaT, Timestamp, Timedelta
-
+from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
+from pandas.tseries.index import DatetimeIndex
+from pandas.tseries.period import PeriodIndex
import numpy as np
import itertools
import csv
-from pandas.tseries.period import PeriodIndex, DatetimeIndex
-
docstring_to_string = """
Parameters
----------
@@ -1259,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
- date_format=date_format)
+ date_format=date_format,
+ quoting=self.quoting)
else:
- cols = list(cols)
+ cols = np.asarray(list(cols))
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
@@ -1270,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
- date_format=date_format)
+ date_format=date_format,
+ quoting=self.quoting)
else:
- cols = list(cols)
+ cols = np.asarray(list(cols))
# save it
self.cols = cols
@@ -1371,8 +1372,10 @@ def strftime_with_nulls(x):
values = self.obj.copy()
values.index = data_index
values.columns = values.columns.to_native_types(
- na_rep=na_rep, float_format=float_format,
- date_format=date_format)
+ na_rep=na_rep,
+ float_format=float_format,
+ date_format=date_format,
+ quoting=self.quoting)
values = values[cols]
series = {}
@@ -1543,18 +1546,22 @@ def _save_chunk(self, start_i, end_i):
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
- d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+ d = b.to_native_types(slicer=slicer,
+ na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
- date_format=self.date_format)
+ date_format=self.date_format,
+ quoting=self.quoting)
for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col
- ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+ ix = data_index.to_native_types(slicer=slicer,
+ na_rep=self.na_rep,
float_format=self.float_format,
- date_format=self.date_format)
+ date_format=self.date_format,
+ quoting=self.quoting)
lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
@@ -2030,16 +2037,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
self.date_format = date_format
def _format_strings(self):
- formatter = (self.formatter or
- _get_format_datetime64_from_values(self.values,
- nat_rep=self.nat_rep,
- date_format=self.date_format))
- fmt_values = [formatter(x) for x in self.values]
+ # we may have a tz, if so, then need to process element-by-element
+ # when DatetimeBlockWithTimezones is a reality this could be fixed
+ values = self.values
+ if not isinstance(values, DatetimeIndex):
+ values = DatetimeIndex(values)
+
+ if values.tz is None:
+ fmt_values = format_array_from_datetime(values.asi8.ravel(),
+ format=_get_format_datetime64_from_values(values, self.date_format),
+ na_rep=self.nat_rep).reshape(values.shape)
+ fmt_values = fmt_values.tolist()
+
+ else:
+
+ values = values.asobject
+ is_dates_only = _is_dates_only(values)
+ formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format))
+ fmt_values = [ formatter(x) for x in self.values ]
return fmt_values
+def _is_dates_only(values):
+ # return a boolean if we are only dates (and don't have a timezone)
+ values = DatetimeIndex(values)
+ if values.tz is not None:
+ return False
+
+ values_int = values.asi8
+ consider_values = values_int != iNaT
+ one_day_nanos = (86400 * 1e9)
+ even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
+ if even_days:
+ return True
+ return False
+
def _format_datetime64(x, tz=None, nat_rep='NaT'):
if x is None or lib.checknull(x):
return nat_rep
@@ -2062,22 +2096,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
else:
return x._date_repr
-
-def _is_dates_only(values):
- # return a boolean if we are only dates (and don't have a timezone)
- from pandas import DatetimeIndex
- values = DatetimeIndex(values)
- if values.tz is not None:
- return False
-
- values_int = values.asi8
- consider_values = values_int != iNaT
- one_day_nanos = (86400 * 1e9)
- even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
- if even_days:
- return True
- return False
-
def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
if is_dates_only:
@@ -2088,13 +2106,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
-def _get_format_datetime64_from_values(values,
- nat_rep='NaT',
- date_format=None):
+def _get_format_datetime64_from_values(values, date_format):
+ """ given values and a date_format, return a string format """
is_dates_only = _is_dates_only(values)
- return _get_format_datetime64(is_dates_only=is_dates_only,
- nat_rep=nat_rep,
- date_format=date_format)
+ if is_dates_only:
+ return date_format or "%Y-%m-%d"
+ return None
class Timedelta64Formatter(GenericArrayFormatter):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4f7bc11cbf03c..272c401c18761 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -191,6 +191,11 @@ def _constructor(self):
_constructor_sliced = Series
+ @property
+ def _constructor_expanddim(self):
+ from pandas.core.panel import Panel
+ return Panel
+
def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=False):
if data is None:
@@ -1061,8 +1066,6 @@ def to_panel(self):
-------
panel : Panel
"""
- from pandas.core.panel import Panel
-
# only support this kind for now
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
len(self.index.levels) != 2):
@@ -1100,7 +1103,7 @@ def to_panel(self):
shape=shape,
ref_items=selfsorted.columns)
- return Panel(new_mgr)
+ return self._constructor_expanddim(new_mgr)
to_wide = deprecate('to_wide', to_panel)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8bd85a008f077..e58bdbfa346a4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -146,15 +146,19 @@ def __unicode__(self):
prepr = '[%s]' % ','.join(map(com.pprint_thing, self))
return '%s(%s)' % (self.__class__.__name__, prepr)
- def _local_dir(self):
+ def _dir_additions(self):
""" add the string-like attributes from the info_axis """
- return [c for c in self._info_axis
- if isinstance(c, string_types) and isidentifier(c)]
+ return set([c for c in self._info_axis
+ if isinstance(c, string_types) and isidentifier(c)])
@property
def _constructor_sliced(self):
raise AbstractMethodError(self)
+ @property
+ def _constructor_expanddim(self):
+ raise NotImplementedError
+
#----------------------------------------------------------------------
# Axis
@@ -3337,10 +3341,18 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
matches = (new_other == np.array(other))
if matches is False or not matches.all():
- other = np.array(other)
+
+ # coerce other to a common dtype if we can
+ if com.needs_i8_conversion(self.dtype):
+ try:
+ other = np.array(other, dtype=self.dtype)
+ except:
+ other = np.array(other)
+ else:
+ other = np.asarray(other)
+ other = np.asarray(other, dtype=np.common_type(other, new_other))
- # we can't use our existing dtype
- # because of incompatibilities
+ # we need to use the new dtype
try_quick = False
else:
other = new_other
@@ -3397,19 +3409,31 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
else:
other = self._constructor(other, **self._construct_axes_dict())
+ if axis is None:
+ axis = 0
+
+ if self.ndim == getattr(other, 'ndim', 0):
+ align = True
+ else:
+ align = (self._get_axis_number(axis) == 1)
+
+ block_axis = self._get_block_manager_axis(axis)
+
if inplace:
# we may have different type blocks come out of putmask, so
# reconstruct the block manager
self._check_inplace_setting(other)
- new_data = self._data.putmask(mask=cond, new=other, align=axis is None,
- inplace=True)
+ new_data = self._data.putmask(mask=cond, new=other, align=align,
+ inplace=True, axis=block_axis,
+ transpose=self._AXIS_REVERSED)
self._update_inplace(new_data)
else:
- new_data = self._data.where(other=other, cond=cond, align=axis is None,
+ new_data = self._data.where(other=other, cond=cond, align=align,
raise_on_error=raise_on_error,
- try_cast=try_cast)
+ try_cast=try_cast, axis=block_axis,
+ transpose=self._AXIS_REVERSED)
return self._constructor(new_data).__finalize__(self)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 6b2c9639ac71f..38619229f1086 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -14,7 +14,7 @@
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
-from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes
+from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index, _union_indexes
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
@@ -498,8 +498,8 @@ def _set_result_index_ordered(self, result):
result.index = self.obj.index
return result
- def _local_dir(self):
- return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
+ def _dir_additions(self):
+ return self.obj._dir_additions() | self._apply_whitelist
def __getattr__(self, attr):
if attr in self._internal_names_set:
@@ -1780,12 +1780,14 @@ def size(self):
Compute group sizes
"""
- base = Series(np.zeros(len(self.result_index), dtype=np.int64),
- index=self.result_index)
+ index = self.result_index
+ base = Series(np.zeros(len(index), dtype=np.int64), index=index)
indices = self.indices
for k, v in compat.iteritems(indices):
indices[k] = len(v)
bin_counts = Series(indices, dtype=np.int64)
+ # make bin_counts.index have the same name so it is preserved
+ bin_counts.index.name = index.name
result = base.add(bin_counts, fill_value=0)
# addition with fill_value changes dtype to float64
result = result.astype(np.int64)
@@ -1926,7 +1928,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.grouper = com._asarray_tuplesafe(self.grouper)
# a passed Categorical
- elif isinstance(self.grouper, Categorical):
+ elif is_categorical_dtype(self.grouper):
# must have an ordered categorical
if self.sort:
@@ -1940,8 +1942,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# fix bug #GH8868 sort=False being ignored in categorical groupby
else:
self.grouper = self.grouper.reorder_categories(self.grouper.unique())
+
+ # we make a CategoricalIndex out of the cat grouper
+ # preserving the categories / ordered attributes
self._labels = self.grouper.codes
- self._group_index = self.grouper.categories
+
+ c = self.grouper.categories
+ self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)),
+ categories=c,
+ ordered=self.grouper.ordered))
if self.name is None:
self.name = self.grouper.name
@@ -2129,8 +2138,8 @@ def is_in_obj(gpr):
else:
in_axis, name = False, None
- if isinstance(gpr, Categorical) and len(gpr) != len(obj):
- raise ValueError("Categorical grouper must have len(grouper) == len(data)")
+ if is_categorical_dtype(gpr) and len(gpr) != len(obj):
+ raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)")
ping = Grouping(group_axis, gpr, obj=obj, name=name,
level=level, sort=sort, in_axis=in_axis)
@@ -2813,7 +2822,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
# make Nones an empty object
if com._count_not_none(*values) != len(values):
- v = next(v for v in values if v is not None)
+ try:
+ v = next(v for v in values if v is not None)
+ except StopIteration:
+ # If all values are None, then this will throw an error.
+ # We'd prefer it return an empty dataframe.
+ return DataFrame()
if v is None:
return DataFrame()
elif isinstance(v, NDFrame):
@@ -3250,7 +3264,7 @@ def _reindex_output(self, result):
return result
elif len(groupings) == 1:
return result
- elif not any([isinstance(ping.grouper, Categorical)
+ elif not any([isinstance(ping.grouper, (Categorical, CategoricalIndex))
for ping in groupings]):
return result
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 0a3adbd19ae92..8b650fea9b440 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -2,6 +2,7 @@
import datetime
import warnings
import operator
+
from functools import partial
from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map
from pandas import compat
@@ -13,13 +14,13 @@
import pandas.algos as _algos
import pandas.index as _index
from pandas.lib import Timestamp, Timedelta, is_datetime_array
-from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
+from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate)
-from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
-from pandas.core.common import (_values_from_object, is_float, is_integer,
- ABCSeries, _ensure_object, _ensure_int64, is_bool_indexer,
+from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
+ _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
+ ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer,
is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype)
from pandas.core.config import get_option
from pandas.io.common import PerformanceWarning
@@ -44,26 +45,6 @@ def _try_get_item(x):
except AttributeError:
return x
-def _indexOp(opname):
- """
- Wrapper function for index comparison operations, to avoid
- code duplication.
- """
- def wrapper(self, other):
- func = getattr(self.values, opname)
- result = func(np.asarray(other))
-
- # technically we could support bool dtyped Index
- # for now just return the indexing array directly
- if is_bool_dtype(result):
- return result
- try:
- return Index(result)
- except: # pragma: no cover
- return result
- return wrapper
-
-
class InvalidIndexError(Exception):
pass
@@ -162,6 +143,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
return Float64Index(data, copy=copy, dtype=dtype, name=name)
elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
subarr = data.astype('object')
+ elif is_categorical_dtype(data) or is_categorical_dtype(dtype):
+ return CategoricalIndex(data, copy=copy, name=name, **kwargs)
else:
subarr = com._asarray_tuplesafe(data, dtype=object)
@@ -170,6 +153,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
if copy:
subarr = subarr.copy()
+ elif is_categorical_dtype(data) or is_categorical_dtype(dtype):
+ return CategoricalIndex(data, copy=copy, name=name, **kwargs)
elif hasattr(data, '__array__'):
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name,
**kwargs)
@@ -258,7 +243,7 @@ def __len__(self):
"""
return len(self._data)
- def __array__(self, result=None):
+ def __array__(self, dtype=None):
""" the array interface, return my values """
return self._data.view(np.ndarray)
@@ -282,9 +267,6 @@ def get_values(self):
""" return the underlying data as an ndarray """
return self.values
- def _array_values(self):
- return self._data
-
# ops compat
def tolist(self):
"""
@@ -410,8 +392,7 @@ def __unicode__(self):
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
py2/py3.
"""
- prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'),
- quote_strings=True)
+ prepr = default_pprint(self)
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
def to_series(self, **kwargs):
@@ -429,9 +410,10 @@ def to_series(self, **kwargs):
def _to_embed(self, keep_tz=False):
"""
+ *this is an internal non-public method*
+
return an array repr of this object, potentially casting to object
- This is for internal compat
"""
return self.values
@@ -623,7 +605,10 @@ def is_numeric(self):
return self.inferred_type in ['integer', 'floating']
def is_object(self):
- return self.dtype == np.object_
+ return is_object_dtype(self.dtype)
+
+ def is_categorical(self):
+ return self.inferred_type in ['categorical']
def is_mixed(self):
return 'mixed' in self.inferred_type
@@ -772,14 +757,11 @@ def is_int(v):
return indexer
- def _convert_list_indexer(self, key, kind=None):
- """ convert a list indexer. these should be locations """
- return key
-
- def _convert_list_indexer_for_mixed(self, keyarr, kind=None):
- """ passed a key that is tuplesafe that is integer based
- and we have a mixed index (e.g. number/labels). figure out
- the indexer. return None if we can't help
+ def _convert_list_indexer(self, keyarr, kind=None):
+ """
+ passed a key that is tuplesafe that is integer based
+ and we have a mixed index (e.g. number/labels). figure out
+ the indexer. return None if we can't help
"""
if (kind is None or kind in ['iloc','ix']) and (is_integer_dtype(keyarr) and not self.is_floating()):
if self.inferred_type != 'integer':
@@ -954,17 +936,13 @@ def __getitem__(self, key):
else:
return result
- def append(self, other):
+ def _ensure_compat_append(self, other):
"""
- Append a collection of Index options together
-
- Parameters
- ----------
- other : Index or list/tuple of indices
+ prepare the append
Returns
-------
- appended : Index
+ list of to_concat, name of result Index
"""
name = self.name
to_concat = [self]
@@ -984,7 +962,21 @@ def append(self, other):
to_concat = self._ensure_compat_concat(to_concat)
to_concat = [x.values if isinstance(x, Index) else x
for x in to_concat]
+ return to_concat, name
+ def append(self, other):
+ """
+ Append a collection of Index options together
+
+ Parameters
+ ----------
+ other : Index or list/tuple of indices
+
+ Returns
+ -------
+ appended : Index
+ """
+ to_concat, name = self._ensure_compat_append(other)
return Index(np.concatenate(to_concat), name=name)
@staticmethod
@@ -1046,10 +1038,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
from pandas.core.format import format_array
- if values.dtype == np.object_:
+ if is_categorical_dtype(values.dtype):
+ values = np.array(values)
+ elif is_object_dtype(values.dtype):
values = lib.maybe_convert_objects(values, safe=1)
- if values.dtype == np.object_:
+ if is_object_dtype(values.dtype):
result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))
for x in values]
@@ -1071,12 +1065,16 @@ def to_native_types(self, slicer=None, **kwargs):
values = values[slicer]
return values._format_native_types(**kwargs)
- def _format_native_types(self, na_rep='', **kwargs):
+ def _format_native_types(self, na_rep='', quoting=None, **kwargs):
""" actually format my specific types """
mask = isnull(self)
- values = np.array(self, dtype=object, copy=True)
+ if not self.is_object() and not quoting:
+ values = np.asarray(self).astype(str)
+ else:
+ values = np.array(self, dtype=object, copy=True)
+
values[mask] = na_rep
- return values.tolist()
+ return values
def equals(self, other):
"""
@@ -1088,9 +1086,6 @@ def equals(self, other):
if not isinstance(other, Index):
return False
- if type(other) != Index:
- return other.equals(self)
-
return array_equivalent(_values_from_object(self), _values_from_object(other))
def identical(self, other):
@@ -1197,13 +1192,6 @@ def __sub__(self, other):
"use .difference()",FutureWarning)
return self.difference(other)
- __eq__ = _indexOp('__eq__')
- __ne__ = _indexOp('__ne__')
- __lt__ = _indexOp('__lt__')
- __gt__ = _indexOp('__gt__')
- __le__ = _indexOp('__le__')
- __ge__ = _indexOp('__ge__')
-
def __and__(self, other):
return self.intersection(other)
@@ -1236,7 +1224,7 @@ def union(self, other):
self._assert_can_do_setop(other)
- if self.dtype != other.dtype:
+ if not is_dtype_equal(self.dtype,other.dtype):
this = self.astype('O')
other = other.astype('O')
return this.union(other)
@@ -1310,7 +1298,7 @@ def intersection(self, other):
if self.equals(other):
return self
- if self.dtype != other.dtype:
+ if not is_dtype_equal(self.dtype,other.dtype):
this = self.astype('O')
other = other.astype('O')
return this.intersection(other)
@@ -1469,7 +1457,7 @@ def get_value(self, series, key):
raise
except TypeError:
# generator/iterator-like
- if com.is_iterator(key):
+ if is_iterator(key):
raise InvalidIndexError(key)
else:
raise e1
@@ -1544,7 +1532,7 @@ def get_indexer(self, target, method=None, limit=None):
if pself is not self or ptarget is not target:
return pself.get_indexer(ptarget, method=method, limit=limit)
- if self.dtype != target.dtype:
+ if not is_dtype_equal(self.dtype,target.dtype):
this = self.astype(object)
target = target.astype(object)
return this.get_indexer(target, method=method, limit=limit)
@@ -1643,7 +1631,8 @@ def get_indexer_for(self, target, **kwargs):
""" guaranteed return of an indexer even when non-unique """
if self.is_unique:
return self.get_indexer(target, **kwargs)
- return self.get_indexer_non_unique(target, **kwargs)[0]
+ indexer, _ = self.get_indexer_non_unique(target, **kwargs)
+ return indexer
def _possibly_promote(self, other):
# A hack, but it works
@@ -1651,7 +1640,7 @@ def _possibly_promote(self, other):
if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
return DatetimeIndex(self), other
elif self.inferred_type == 'boolean':
- if self.dtype != 'object':
+ if not is_object_dtype(self.dtype):
return self.astype('object'), other.astype('object')
return self, other
@@ -1703,12 +1692,35 @@ def isin(self, values, level=None):
value_set = set(values)
if level is not None:
self._validate_index_level(level)
- return lib.ismember(self._array_values(), value_set)
+ return lib.ismember(np.array(self), value_set)
+
+ def _can_reindex(self, indexer):
+ """
+ *this is an internal non-public method*
+
+ Check if we are allowing reindexing with this particular indexer
+
+ Parameters
+ ----------
+ indexer : an integer indexer
+
+ Raises
+ ------
+ ValueError if its a duplicate axis
+ """
+
+ # trying to reindex on an axis with duplicates
+ if not self.is_unique and len(indexer):
+ raise ValueError("cannot reindex from a duplicate axis")
def reindex(self, target, method=None, level=None, limit=None):
"""
Create index with target's values (move/add/delete values as necessary)
+ Parameters
+ ----------
+ target : an iterable
+
Returns
-------
new_index : pd.Index
@@ -1729,6 +1741,7 @@ def reindex(self, target, method=None, level=None, limit=None):
target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs)
else:
target = _ensure_index(target)
+
if level is not None:
if method is not None:
raise TypeError('Fill method not supported if level passed')
@@ -1753,9 +1766,72 @@ def reindex(self, target, method=None, level=None, limit=None):
return target, indexer
+ def _reindex_non_unique(self, target):
+ """
+ *this is an internal non-public method*
+
+ Create a new index with target's values (move/add/delete values as necessary)
+ use with non-unique Index and a possibly non-unique target
+
+ Parameters
+ ----------
+ target : an iterable
+
+ Returns
+ -------
+ new_index : pd.Index
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
+ """
+
+ target = _ensure_index(target)
+ indexer, missing = self.get_indexer_non_unique(target)
+ check = indexer != -1
+ new_labels = self.take(indexer[check])
+ new_indexer = None
+
+ if len(missing):
+ l = np.arange(len(indexer))
+
+ missing = com._ensure_platform_int(missing)
+ missing_labels = target.take(missing)
+ missing_indexer = com._ensure_int64(l[~check])
+ cur_labels = self.take(indexer[check]).values
+ cur_indexer = com._ensure_int64(l[check])
+
+ new_labels = np.empty(tuple([len(indexer)]), dtype=object)
+ new_labels[cur_indexer] = cur_labels
+ new_labels[missing_indexer] = missing_labels
+
+ # a unique indexer
+ if target.is_unique:
+
+ # see GH5553, make sure we use the right indexer
+ new_indexer = np.arange(len(indexer))
+ new_indexer[cur_indexer] = np.arange(len(cur_labels))
+ new_indexer[missing_indexer] = -1
+
+ # we have a non_unique selector, need to use the original
+ # indexer here
+ else:
+
+ # need to retake to have the same size as the indexer
+ indexer = indexer.values
+ indexer[~check] = 0
+
+ # reset the new indexer to account for the new size
+ new_indexer = np.arange(len(self.take(indexer)))
+ new_indexer[~check] = -1
+
+ return self._shallow_copy(new_labels), indexer, new_indexer
+
def join(self, other, how='left', level=None, return_indexers=False):
"""
- Internal API method. Compute join_index and indexers to conform data
+ *this is an internal non-public method*
+
+ Compute join_index and indexers to conform data
structures to the new index.
Parameters
@@ -1814,7 +1890,7 @@ def join(self, other, how='left', level=None, return_indexers=False):
result = x, z, y
return result
- if self.dtype != other.dtype:
+ if not is_dtype_equal(self.dtype,other.dtype):
this = self.astype('O')
other = other.astype('O')
return this.join(other, how=how,
@@ -2365,6 +2441,34 @@ def _evaluate_with_timedelta_like(self, other, op, opstr):
def _evaluate_with_datetime_like(self, other, op, opstr):
raise TypeError("can only perform ops with datetime like values")
+ @classmethod
+ def _add_comparison_methods(cls):
+ """ add in comparison methods """
+
+ def _make_compare(op):
+
+ def _evaluate_compare(self, other):
+ func = getattr(self.values, op)
+ result = func(np.asarray(other))
+
+ # technically we could support bool dtyped Index
+ # for now just return the indexing array directly
+ if is_bool_dtype(result):
+ return result
+ try:
+ return Index(result)
+ except TypeError:
+ return result
+
+ return _evaluate_compare
+
+ cls.__eq__ = _make_compare('__eq__')
+ cls.__ne__ = _make_compare('__ne__')
+ cls.__lt__ = _make_compare('__lt__')
+ cls.__gt__ = _make_compare('__gt__')
+ cls.__le__ = _make_compare('__le__')
+ cls.__ge__ = _make_compare('__ge__')
+
@classmethod
def _add_numeric_methods_disabled(cls):
""" add in numeric methods to disable """
@@ -2419,7 +2523,7 @@ def _evaluate_numeric_binop(self, other):
elif isinstance(other, (Timestamp, np.datetime64)):
return self._evaluate_with_datetime_like(other, op, opstr)
else:
- if not (com.is_float(other) or com.is_integer(other)):
+ if not (is_float(other) or is_integer(other)):
raise TypeError("can only perform ops with scalar values")
# if we are a reversed non-communative op
@@ -2483,7 +2587,7 @@ def _make_logical_function(name, desc, f):
@Appender(_doc)
def logical_func(self, *args, **kwargs):
result = f(self.values)
- if isinstance(result, (np.ndarray, com.ABCSeries, Index)) \
+ if isinstance(result, (np.ndarray, ABCSeries, Index)) \
and result.ndim == 0:
# return NumPy type
return result.dtype.type(result.item())
@@ -2515,6 +2619,539 @@ def invalid_op(self, other=None):
Index._add_numeric_methods_disabled()
Index._add_logical_methods()
+Index._add_comparison_methods()
+
+class CategoricalIndex(Index, PandasDelegate):
+ """
+
+ Immutable Index implementing an ordered, sliceable set. CategoricalIndex
+ represents a sparsely populated Index with an underlying Categorical.
+
+ Parameters
+ ----------
+ data : array-like or Categorical, (1-dimensional)
+ categories : optional, array-like
+ categories for the CategoricalIndex
+ ordered : boolean,
+ designating if the categories are ordered
+ copy : bool
+ Make a copy of input ndarray
+ name : object
+ Name to be stored in the index
+
+ """
+
+ _typ = 'categoricalindex'
+ _engine_type = _index.Int64Engine
+ _attributes = ['name','categories','ordered']
+
+ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
+
+ if fastpath:
+ return cls._simple_new(data, name=name)
+
+ if isinstance(data, ABCCategorical):
+ data = cls._create_categorical(cls, data, categories, ordered)
+ elif isinstance(data, CategoricalIndex):
+ data = data._data
+ data = cls._create_categorical(cls, data, categories, ordered)
+ else:
+
+ # don't allow scalars
+ # if data is None, then categories must be provided
+ if lib.isscalar(data):
+ if data is not None or categories is None:
+ cls._scalar_data_error(data)
+ data = []
+ data = cls._create_categorical(cls, data, categories, ordered)
+
+ if copy:
+ data = data.copy()
+
+ return cls._simple_new(data, name=name)
+
+ def _create_from_codes(self, codes, categories=None, ordered=None, name=None):
+ """
+ *this is an internal non-public method*
+
+ create the correct categorical from codes
+
+ Parameters
+ ----------
+ codes : new codes
+ categories : optional categories, defaults to existing
+ ordered : optional ordered attribute, defaults to existing
+ name : optional name attribute, defaults to existing
+
+ Returns
+ -------
+ CategoricalIndex
+ """
+
+ from pandas.core.categorical import Categorical
+ if categories is None:
+ categories = self.categories
+ if ordered is None:
+ ordered = self.ordered
+ if name is None:
+ name = self.name
+        cat = Categorical.from_codes(codes, categories=categories, ordered=ordered)
+ return CategoricalIndex(cat, name=name)
+
+ @staticmethod
+ def _create_categorical(self, data, categories=None, ordered=None):
+ """
+ *this is an internal non-public method*
+
+ create the correct categorical from data and the properties
+
+ Parameters
+ ----------
+ data : data for new Categorical
+ categories : optional categories, defaults to existing
+ ordered : optional ordered attribute, defaults to existing
+
+ Returns
+ -------
+ Categorical
+ """
+
+ if not isinstance(data, ABCCategorical):
+ from pandas.core.categorical import Categorical
+ data = Categorical(data, categories=categories, ordered=ordered)
+ else:
+ if categories is not None:
+ data = data.set_categories(categories)
+ if ordered is not None:
+ data = data.set_ordered(ordered)
+ return data
+
+ @classmethod
+ def _simple_new(cls, values, name=None, categories=None, ordered=None, **kwargs):
+ result = object.__new__(cls)
+
+ values = cls._create_categorical(cls, values, categories, ordered)
+ result._data = values
+ result.name = name
+ for k, v in compat.iteritems(kwargs):
+ setattr(result,k,v)
+
+ result._reset_identity()
+ return result
+
+ def _is_dtype_compat(self, other):
+ """
+ *this is an internal non-public method*
+
+ provide a comparison between the dtype of self and other (coercing if needed)
+
+ Raises
+ ------
+ TypeError if the dtypes are not compatible
+ """
+
+ if is_categorical_dtype(other):
+ if isinstance(other, CategoricalIndex):
+ other = other.values
+ if not other.is_dtype_equal(self):
+ raise TypeError("categories must match existing categories when appending")
+ else:
+ values = other
+ other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered))
+ if not other.isin(values).all():
+ raise TypeError("cannot append a non-category item to a CategoricalIndex")
+
+ return other
+
+ def equals(self, other):
+ """
+        Determines if two CategoricalIndex objects contain the same elements.
+ """
+ if self.is_(other):
+ return True
+
+ try:
+ other = self._is_dtype_compat(other)
+ return array_equivalent(self._data, other)
+ except (TypeError, ValueError):
+ pass
+
+ return False
+
+ def __unicode__(self):
+ """
+ Return a string representation for this object.
+
+ Invoked by unicode(df) in py2 only. Yields a Unicode String in both
+ py2/py3.
+ """
+
+ # currently doesn't use the display.max_categories, or display.max_seq_len
+ # for head/tail printing
+ values = default_pprint(self.values.get_values())
+ cats = default_pprint(self.categories.get_values())
+ space = ' ' * (len(self.__class__.__name__) + 1)
+ name = self.name
+ if name is not None:
+ name = default_pprint(name)
+
+ result = u("{klass}({values},\n{space}categories={categories},\n{space}ordered={ordered},\n{space}name={name})").format(
+ klass=self.__class__.__name__,
+ values=values,
+ categories=cats,
+ ordered=self.ordered,
+ name=name,
+ space=space)
+
+ return result
+
+ @property
+ def inferred_type(self):
+ return 'categorical'
+
+ @property
+ def values(self):
+ """ return the underlying data, which is a Categorical """
+ return self._data
+
+ @property
+ def codes(self):
+ return self._data.codes
+
+ @property
+ def categories(self):
+ return self._data.categories
+
+ @property
+ def ordered(self):
+ return self._data.ordered
+
+ def __contains__(self, key):
+ hash(key)
+ return key in self.values
+
+ def __array__(self, dtype=None):
+ """ the array interface, return my values """
+ return np.array(self._data, dtype=dtype)
+
+ def argsort(self, *args, **kwargs):
+ return self.values.argsort(*args, **kwargs)
+
+ @cache_readonly
+ def _engine(self):
+
+ # we are going to look things up with the codes themselves
+ return self._engine_type(lambda: self.codes.astype('i8'), len(self))
+
+ @cache_readonly
+ def is_unique(self):
+ return not self.duplicated().any()
+
+ @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+ def duplicated(self, take_last=False):
+ from pandas.hashtable import duplicated_int64
+ return duplicated_int64(self.codes.astype('i8'), take_last)
+
+ def get_loc(self, key, method=None):
+ """
+ Get integer location for requested label
+
+ Parameters
+ ----------
+ key : label
+ method : {None}
+ * default: exact matches only.
+
+ Returns
+ -------
+ loc : int if unique index, possibly slice or mask if not
+ """
+ codes = self.categories.get_loc(key)
+ if (codes == -1):
+ raise KeyError(key)
+ indexer, _ = self._engine.get_indexer_non_unique(np.array([codes]))
+ if (indexer==-1).any():
+ raise KeyError(key)
+
+ return indexer
+
+ def _can_reindex(self, indexer):
+ """ always allow reindexing """
+ pass
+
+ def reindex(self, target, method=None, level=None, limit=None):
+ """
+ Create index with target's values (move/add/delete values as necessary)
+
+ Returns
+ -------
+ new_index : pd.Index
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
+ """
+
+ if method is not None:
+ raise NotImplementedError("argument method is not implemented for CategoricalIndex.reindex")
+ if level is not None:
+ raise NotImplementedError("argument level is not implemented for CategoricalIndex.reindex")
+ if limit is not None:
+ raise NotImplementedError("argument limit is not implemented for CategoricalIndex.reindex")
+
+ target = _ensure_index(target)
+
+ if not is_categorical_dtype(target) and not target.is_unique:
+ raise ValueError("cannot reindex with a non-unique indexer")
+
+ indexer, missing = self.get_indexer_non_unique(np.array(target))
+ new_target = self.take(indexer)
+
+
+ # filling in missing if needed
+ if len(missing):
+ cats = self.categories.get_indexer(target)
+ if (cats==-1).any():
+
+ # coerce to a regular index here!
+ result = Index(np.array(self),name=self.name)
+ new_target, indexer, _ = result._reindex_non_unique(np.array(target))
+
+ else:
+
+ codes = new_target.codes.copy()
+ codes[indexer==-1] = cats[missing]
+ new_target = self._create_from_codes(codes)
+
+ # we always want to return an Index type here
+ # to be consistent with .reindex for other index types (e.g. they don't coerce
+ # based on the actual values, only on the dtype)
+        # unless we had an initial Categorical to begin with
+ # in which case we are going to conform to the passed Categorical
+ new_target = np.asarray(new_target)
+ if is_categorical_dtype(target):
+ new_target = target._shallow_copy(new_target, name=self.name)
+ else:
+ new_target = Index(new_target, name=self.name)
+
+ return new_target, indexer
+
+ def _reindex_non_unique(self, target):
+        """ reindex from a non-unique index; CategoricalIndexes are almost always non-unique """
+ new_target, indexer = self.reindex(target)
+ new_indexer = None
+
+ check = indexer==-1
+ if check.any():
+ new_indexer = np.arange(len(self.take(indexer)))
+ new_indexer[check] = -1
+
+ return new_target, indexer, new_indexer
+
+ def get_indexer(self, target, method=None, limit=None):
+ """
+ Compute indexer and mask for new index given the current index. The
+ indexer should be then used as an input to ndarray.take to align the
+ current data to the new index. The mask determines whether labels are
+ found or not in the current index
+
+ Parameters
+ ----------
+ target : MultiIndex or Index (of tuples)
+ method : {'pad', 'ffill', 'backfill', 'bfill'}
+ pad / ffill: propagate LAST valid observation forward to next valid
+ backfill / bfill: use NEXT valid observation to fill gap
+
+ Notes
+ -----
+ This is a low-level method and probably should be used at your own risk
+
+ Examples
+ --------
+ >>> indexer, mask = index.get_indexer(new_index)
+ >>> new_values = cur_values.take(indexer)
+ >>> new_values[-mask] = np.nan
+
+ Returns
+ -------
+ (indexer, mask) : (ndarray, ndarray)
+ """
+ method = com._clean_reindex_fill_method(method)
+ target = _ensure_index(target)
+
+ if isinstance(target, CategoricalIndex):
+ target = target.categories
+
+ if method == 'pad' or method == 'backfill':
+ raise NotImplementedError("method='pad' and method='backfill' not implemented yet "
+ 'for CategoricalIndex')
+ elif method == 'nearest':
+ raise NotImplementedError("method='nearest' not implemented yet "
+ 'for CategoricalIndex')
+ else:
+
+ codes = self.categories.get_indexer(target)
+ indexer, _ = self._engine.get_indexer_non_unique(codes)
+
+ return com._ensure_platform_int(indexer)
+
+ def get_indexer_non_unique(self, target):
+ """ this is the same for a CategoricalIndex for get_indexer; the API returns the missing values as well """
+ target = _ensure_index(target)
+
+ if isinstance(target, CategoricalIndex):
+ target = target.categories
+
+ codes = self.categories.get_indexer(target)
+ return self._engine.get_indexer_non_unique(codes)
+
+ def _convert_list_indexer(self, keyarr, kind=None):
+ """
+ we are passed a list indexer.
+        Return our indexer or raise if any of the values are not included in the categories
+ """
+ codes = self.categories.get_indexer(keyarr)
+ if (codes==-1).any():
+ raise KeyError("a list-indexer must only include values that are in the categories")
+
+ return None
+
+ def take(self, indexer, axis=0):
+ """
+ return a new CategoricalIndex of the values selected by the indexer
+
+ See also
+ --------
+ numpy.ndarray.take
+ """
+
+ indexer = com._ensure_platform_int(indexer)
+ taken = self.codes.take(indexer)
+ return self._create_from_codes(taken)
+
+ def delete(self, loc):
+ """
+ Make new Index with passed location(-s) deleted
+
+ Returns
+ -------
+ new_index : Index
+ """
+ return self._create_from_codes(np.delete(self.codes, loc))
+
+ def insert(self, loc, item):
+ """
+ Make new Index inserting new item at location. Follows
+ Python list.append semantics for negative values
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+
+ Returns
+ -------
+ new_index : Index
+
+ Raises
+ ------
+ ValueError if the item is not in the categories
+
+ """
+ code = self.categories.get_indexer([item])
+ if (code == -1):
+ raise TypeError("cannot insert an item into a CategoricalIndex that is not already an existing category")
+
+ codes = self.codes
+ codes = np.concatenate(
+ (codes[:loc], code, codes[loc:]))
+ return self._create_from_codes(codes)
+
+ def append(self, other):
+ """
+ Append a collection of CategoricalIndex options together
+
+ Parameters
+ ----------
+ other : Index or list/tuple of indices
+
+ Returns
+ -------
+ appended : Index
+
+ Raises
+ ------
+ ValueError if other is not in the categories
+ """
+ to_concat, name = self._ensure_compat_append(other)
+ to_concat = [ self._is_dtype_compat(c) for c in to_concat ]
+ codes = np.concatenate([ c.codes for c in to_concat ])
+ return self._create_from_codes(codes, name=name)
+
+ @classmethod
+ def _add_comparison_methods(cls):
+ """ add in comparison methods """
+
+ def _make_compare(op):
+
+ def _evaluate_compare(self, other):
+
+ # if we have a Categorical type, then must have the same categories
+ if isinstance(other, CategoricalIndex):
+ other = other.values
+ elif isinstance(other, Index):
+ other = self._create_categorical(self, other.values, categories=self.categories, ordered=self.ordered)
+
+ if isinstance(other, ABCCategorical):
+ if not (self.values.is_dtype_equal(other) and len(self.values) == len(other)):
+ raise TypeError("categorical index comparisions must have the same categories and ordered attributes")
+
+ return getattr(self.values, op)(other)
+
+ return _evaluate_compare
+
+ cls.__eq__ = _make_compare('__eq__')
+ cls.__ne__ = _make_compare('__ne__')
+ cls.__lt__ = _make_compare('__lt__')
+ cls.__gt__ = _make_compare('__gt__')
+ cls.__le__ = _make_compare('__le__')
+ cls.__ge__ = _make_compare('__ge__')
+
+
+ def _delegate_method(self, name, *args, **kwargs):
+ """ method delegation to the .values """
+ method = getattr(self.values, name)
+ if 'inplace' in kwargs:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+ res = method(*args, **kwargs)
+ if lib.isscalar(res):
+ return res
+ return CategoricalIndex(res, name=self.name)
+
+ @classmethod
+ def _add_accessors(cls):
+ """ add in Categorical accessor methods """
+
+ from pandas.core.categorical import Categorical
+ CategoricalIndex._add_delegate_accessors(delegate=Categorical,
+ accessors=["rename_categories",
+ "reorder_categories",
+ "add_categories",
+ "remove_categories",
+ "remove_unused_categories",
+ "set_categories",
+ "as_ordered",
+ "as_unordered",
+ "min",
+ "max"],
+ typ='method',
+ overwrite=True)
+
+
+CategoricalIndex._add_numeric_methods_disabled()
+CategoricalIndex._add_logical_methods_disabled()
+CategoricalIndex._add_comparison_methods()
+CategoricalIndex._add_accessors()
class NumericIndex(Index):
@@ -2787,7 +3424,7 @@ def equals(self, other):
try:
if not isinstance(other, Float64Index):
other = self._constructor(other)
- if self.dtype != other.dtype or self.shape != other.shape:
+ if not is_dtype_equal(self.dtype,other.dtype) or self.shape != other.shape:
return False
left, right = self.values, other.values
return ((left == right) | (self._isnan & other._isnan)).all()
@@ -2853,7 +3490,7 @@ def isin(self, values, level=None):
value_set = set(values)
if level is not None:
self._validate_index_level(level)
- return lib.ismember_nans(self._array_values(), value_set,
+ return lib.ismember_nans(np.array(self), value_set,
isnull(list(value_set)).any())
@@ -3193,7 +3830,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None,
verify_integrity=False,
_set_identity=_set_identity)
- def __array__(self, result=None):
+ def __array__(self, dtype=None):
""" the array interface, return my values """
return self.values
@@ -3205,10 +3842,6 @@ def view(self, cls=None):
_shallow_copy = view
- def _array_values(self):
- # hack for various methods
- return self.values
-
@cache_readonly
def dtype(self):
return np.dtype('O')
@@ -3298,7 +3931,7 @@ def _reference_duplicate_name(self, name):
return np.sum(name == np.asarray(self.names)) > 1
def _format_native_types(self, **kwargs):
- return self.tolist()
+ return self.values
@property
def _constructor(self):
@@ -3355,7 +3988,7 @@ def values(self):
taken = com.take_1d(lev._box_values(lev.values), lab,
fill_value=_get_na_value(lev.dtype.type))
else:
- taken = com.take_1d(lev.values, lab)
+ taken = com.take_1d(np.asarray(lev.values), lab)
values.append(taken)
self._tuples = lib.fast_zip(values)
@@ -3420,7 +4053,7 @@ def _try_mi(k):
raise
except TypeError:
# generator/iterator-like
- if com.is_iterator(key):
+ if is_iterator(key):
raise InvalidIndexError(key)
else:
raise e1
@@ -4091,7 +4724,7 @@ def get_indexer(self, target, method=None, limit=None):
if isinstance(target, MultiIndex):
target_index = target._tuple_index
- if target_index.dtype != object:
+ if not is_object_dtype(target_index.dtype):
return np.ones(len(target_index)) * -1
if not self.is_unique:
@@ -4650,9 +5283,9 @@ def equals(self, other):
return False
for i in range(self.nlevels):
- svalues = com.take_nd(self.levels[i].values, self.labels[i],
+ svalues = com.take_nd(np.asarray(self.levels[i].values), self.labels[i],
allow_fill=False)
- ovalues = com.take_nd(other.levels[i].values, other.labels[i],
+ ovalues = com.take_nd(np.asarray(other.levels[i].values), other.labels[i],
allow_fill=False)
if not array_equivalent(svalues, ovalues):
return False
@@ -4768,7 +5401,7 @@ def _assert_can_do_setop(self, other):
pass
def astype(self, dtype):
- if np.dtype(dtype) != np.object_:
+ if not is_object_dtype(np.dtype(dtype)):
raise TypeError('Setting %s dtype to anything other than object '
'is not supported' % self.__class__)
return self._shallow_copy()
@@ -4848,7 +5481,7 @@ def _wrap_joined_index(self, joined, other):
@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
if level is None:
- return lib.ismember(self._array_values(), set(values))
+ return lib.ismember(np.array(self), set(values))
else:
num = self._get_level_number(level)
levs = self.levels[num]
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 8154eb1bb6c8b..7c373b0a2b01d 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -253,7 +253,7 @@ def _setitem_with_indexer(self, indexer, value):
# just replacing the block manager here
# so the object is the same
index = self.obj._get_axis(i)
- labels = safe_append_to_index(index, key)
+ labels = index.insert(len(index),key)
self.obj._data = self.obj.reindex_axis(labels, i)._data
self.obj._maybe_update_cacher(clear=True)
self.obj.is_copy=None
@@ -274,10 +274,7 @@ def _setitem_with_indexer(self, indexer, value):
# and set inplace
if self.ndim == 1:
index = self.obj.index
- if len(index) == 0:
- new_index = Index([indexer])
- else:
- new_index = safe_append_to_index(index, indexer)
+ new_index = index.insert(len(index),indexer)
# this preserves dtype of the value
new_values = Series([value]).values
@@ -928,24 +925,6 @@ def _getitem_iterable(self, key, axis=0):
labels = self.obj._get_axis(axis)
- def _reindex(keys, level=None):
-
- try:
- result = self.obj.reindex_axis(keys, axis=axis, level=level)
- except AttributeError:
- # Series
- if axis != 0:
- raise AssertionError('axis must be 0')
- return self.obj.reindex(keys, level=level)
-
- # this is an error as we are trying to find
- # keys in a multi-index that don't exist
- if isinstance(labels, MultiIndex) and level is not None:
- if hasattr(result,'ndim') and not np.prod(result.shape) and len(keys):
- raise KeyError("cannot index a multi-index axis with these keys")
-
- return result
-
if is_bool_indexer(key):
key = check_bool_indexer(labels, key)
inds, = key.nonzero()
@@ -958,8 +937,9 @@ def _reindex(keys, level=None):
# asarray can be unsafe, NumPy strings are weird
keyarr = _asarray_tuplesafe(key)
- # handle a mixed integer scenario
- indexer = labels._convert_list_indexer_for_mixed(keyarr, kind=self.name)
+ # have the index handle the indexer and possibly return
+ # an indexer or raise
+ indexer = labels._convert_list_indexer(keyarr, kind=self.name)
if indexer is not None:
return self.obj.take(indexer, axis=axis)
@@ -970,65 +950,48 @@ def _reindex(keys, level=None):
else:
level = None
- keyarr_is_unique = Index(keyarr).is_unique
+ # existing labels are unique and indexer is unique
+ if labels.is_unique and Index(keyarr).is_unique:
+
+ try:
+ result = self.obj.reindex_axis(keyarr, axis=axis, level=level)
+
+ # this is an error as we are trying to find
+ # keys in a multi-index that don't exist
+ if isinstance(labels, MultiIndex) and level is not None:
+ if hasattr(result,'ndim') and not np.prod(result.shape) and len(keyarr):
+ raise KeyError("cannot index a multi-index axis with these keys")
+
+ return result
- # existing labels are unique and indexer is unique
- if labels.is_unique and keyarr_is_unique:
- return _reindex(keyarr, level=level)
+ except AttributeError:
+ # Series
+ if axis != 0:
+ raise AssertionError('axis must be 0')
+ return self.obj.reindex(keyarr, level=level)
+
+ # existing labels are non-unique
else:
- indexer, missing = labels.get_indexer_non_unique(keyarr)
- check = indexer != -1
- result = self.obj.take(indexer[check], axis=axis,
- convert=False)
-
- # need to merge the result labels and the missing labels
- if len(missing):
- l = np.arange(len(indexer))
-
- missing = com._ensure_platform_int(missing)
- missing_labels = keyarr.take(missing)
- missing_indexer = com._ensure_int64(l[~check])
- cur_labels = result._get_axis(axis).values
- cur_indexer = com._ensure_int64(l[check])
-
- new_labels = np.empty(tuple([len(indexer)]), dtype=object)
- new_labels[cur_indexer] = cur_labels
- new_labels[missing_indexer] = missing_labels
-
- # reindex with the specified axis
- ndim = self.obj.ndim
- if axis + 1 > ndim:
- raise AssertionError("invalid indexing error with "
- "non-unique index")
-
- # a unique indexer
- if keyarr_is_unique:
-
- # see GH5553, make sure we use the right indexer
- new_indexer = np.arange(len(indexer))
- new_indexer[cur_indexer] = np.arange(
- len(result._get_axis(axis))
- )
- new_indexer[missing_indexer] = -1
- # we have a non_unique selector, need to use the original
- # indexer here
- else:
+ # reindex with the specified axis
+ if axis + 1 > self.obj.ndim:
+ raise AssertionError("invalid indexing error with "
+ "non-unique index")
- # need to retake to have the same size as the indexer
- rindexer = indexer.values
- rindexer[~check] = 0
- result = self.obj.take(rindexer, axis=axis,
- convert=False)
+ new_target, indexer, new_indexer = labels._reindex_non_unique(keyarr)
- # reset the new indexer to account for the new size
- new_indexer = np.arange(len(result))
- new_indexer[~check] = -1
+ if new_indexer is not None:
+ result = self.obj.take(indexer[indexer!=-1], axis=axis,
+ convert=False)
result = result._reindex_with_indexers({
- axis: [new_labels, new_indexer]
- }, copy=True, allow_dups=True)
+ axis: [new_target, new_indexer]
+ }, copy=True, allow_dups=True)
+
+ else:
+ result = self.obj.take(indexer, axis=axis,
+ convert=False)
return result
@@ -1105,8 +1068,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False):
else:
objarr = _asarray_tuplesafe(obj)
- # If have integer labels, defer to label-based indexing
- indexer = labels._convert_list_indexer_for_mixed(objarr, kind=self.name)
+ # The index may want to handle a list indexer differently
+ # by returning an indexer or raising
+ indexer = labels._convert_list_indexer(objarr, kind=self.name)
if indexer is not None:
return indexer
@@ -1627,8 +1591,8 @@ def length_of_indexer(indexer, target=None):
if step is None:
step = 1
elif step < 0:
- step = abs(step)
- return (stop - start) / step
+ step = -step
+ return (stop - start + step-1) // step
elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)):
return len(indexer)
elif not is_list_like_indexer(indexer):
@@ -1719,19 +1683,6 @@ def get_indexer(_i, _idx):
return tuple([get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)])
-def safe_append_to_index(index, key):
- """ a safe append to an index, if incorrect type, then catch and recreate
- """
- try:
- return index.insert(len(index), key)
- except:
-
- # raise here as this is basically an unsafe operation and we want
- # it to be obvious that you are doing something wrong
- raise ValueError("unsafe appending to index of type {0} with a key "
- "{1}".format(index.__class__.__name__, key))
-
-
def maybe_convert_indices(indices, n):
""" if we have negative indicies, translate to postive here
if have indicies that are out-of-bounds, raise an IndexError
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 4d0f8394fbd2a..276b42cde68bc 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None):
def _try_fill(self, value):
return value
- def to_native_types(self, slicer=None, na_rep='', **kwargs):
+ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[:, slicer]
- values = np.array(values, dtype=object)
mask = isnull(values)
+
+ if not self.is_object and not quoting:
+ values = values.astype(str)
+ else:
+ values = np.array(values, dtype='object')
+
values[mask] = na_rep
- return values.tolist()
+ return values
# block actions ####
def copy(self, deep=True):
@@ -582,7 +587,7 @@ def _is_empty_indexer(indexer):
if arr_value.ndim == 1:
if not isinstance(indexer, tuple):
indexer = tuple([indexer])
- return all([ isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer ])
+ return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
return False
# empty indexers
@@ -627,7 +632,8 @@ def _is_empty_indexer(indexer):
return [self]
- def putmask(self, mask, new, align=True, inplace=False):
+ def putmask(self, mask, new, align=True, inplace=False,
+ axis=0, transpose=False):
""" putmask the data to the block; it is possible that we may create a
new dtype of block
@@ -639,37 +645,55 @@ def putmask(self, mask, new, align=True, inplace=False):
new : a ndarray/object
align : boolean, perform alignment on other/cond, default is True
inplace : perform inplace modification, default is False
+ axis : int
+ transpose : boolean
+ Set to True if self is stored with axes reversed
Returns
-------
- a new block(s), the result of the putmask
+ a list of new blocks, the result of the putmask
"""
new_values = self.values if inplace else self.values.copy()
- # may need to align the new
if hasattr(new, 'reindex_axis'):
- new = new.values.T
+ new = new.values
- # may need to align the mask
if hasattr(mask, 'reindex_axis'):
- mask = mask.values.T
+ mask = mask.values
# if we are passed a scalar None, convert it here
if not is_list_like(new) and isnull(new) and not self.is_object:
new = self.fill_value
if self._can_hold_element(new):
+ if transpose:
+ new_values = new_values.T
+
new = self._try_cast(new)
- # pseudo-broadcast
- if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1:
- new = np.repeat(new, self.shape[-1]).reshape(self.shape)
+ # If the default repeat behavior in np.putmask would go in the wrong
+ # direction, then explicitly repeat and reshape new instead
+ if getattr(new, 'ndim', 0) >= 1:
+ if self.ndim - 1 == new.ndim and axis == 1:
+ new = np.repeat(new, new_values.shape[-1]).reshape(self.shape)
np.putmask(new_values, mask, new)
# maybe upcast me
elif mask.any():
+ if transpose:
+ mask = mask.T
+ if isinstance(new, np.ndarray):
+ new = new.T
+ axis = new_values.ndim - axis - 1
+
+ # Pseudo-broadcast
+ if getattr(new, 'ndim', 0) >= 1:
+ if self.ndim - 1 == new.ndim:
+ new_shape = list(new.shape)
+ new_shape.insert(axis, 1)
+ new = new.reshape(tuple(new_shape))
# need to go column by column
new_blocks = []
@@ -680,14 +704,15 @@ def putmask(self, mask, new, align=True, inplace=False):
# need a new block
if m.any():
-
- n = new[i] if isinstance(
- new, np.ndarray) else np.array(new)
+ if isinstance(new, np.ndarray):
+ n = np.squeeze(new[i % new.shape[0]])
+ else:
+ n = np.array(new)
# type of the new block
dtype, _ = com._maybe_promote(n.dtype)
- # we need to exiplicty astype here to make a copy
+ # we need to explicitly astype here to make a copy
n = n.astype(dtype)
nv = _putmask_smart(v, m, n)
@@ -713,8 +738,10 @@ def putmask(self, mask, new, align=True, inplace=False):
if inplace:
return [self]
- return [make_block(new_values,
- placement=self.mgr_locs, fastpath=True)]
+ if transpose:
+ new_values = new_values.T
+
+ return [make_block(new_values, placement=self.mgr_locs, fastpath=True)]
def interpolate(self, method='pad', axis=0, index=None,
values=None, inplace=False, limit=None,
@@ -998,7 +1025,7 @@ def handle_error():
fastpath=True, placement=self.mgr_locs)]
def where(self, other, cond, align=True, raise_on_error=True,
- try_cast=False):
+ try_cast=False, axis=0, transpose=False):
"""
evaluate the block; return result block(s) from the result
@@ -1009,6 +1036,9 @@ def where(self, other, cond, align=True, raise_on_error=True,
align : boolean, perform alignment on other/cond
raise_on_error : if True, raise when I can't perform the function,
False by default (and just return the data that we had coming in)
+ axis : int
+ transpose : boolean
+ Set to True if self is stored with axes reversed
Returns
-------
@@ -1016,43 +1046,23 @@ def where(self, other, cond, align=True, raise_on_error=True,
"""
values = self.values
+ if transpose:
+ values = values.T
- # see if we can align other
if hasattr(other, 'reindex_axis'):
other = other.values
- # make sure that we can broadcast
- is_transposed = False
- if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
- if values.ndim != other.ndim or values.shape == other.shape[::-1]:
-
- # if its symmetric are ok, no reshaping needed (GH 7506)
- if (values.shape[0] == np.array(values.shape)).all():
- pass
-
- # pseodo broadcast (its a 2d vs 1d say and where needs it in a
- # specific direction)
- elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and
- values.shape[0] != other.shape[0]):
- other = _block_shape(other).T
- else:
- values = values.T
- is_transposed = True
-
- # see if we can align cond
- if not hasattr(cond, 'shape'):
- raise ValueError(
- "where must have a condition that is ndarray like")
-
if hasattr(cond, 'reindex_axis'):
cond = cond.values
- # may need to undo transpose of values
- if hasattr(values, 'ndim'):
- if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
+ # If the default broadcasting would go in the wrong direction, then
+ # explicitly reshape other instead
+ if getattr(other, 'ndim', 0) >= 1:
+ if values.ndim - 1 == other.ndim and axis == 1:
+ other = other.reshape(tuple(other.shape + (1,)))
- values = values.T
- is_transposed = not is_transposed
+ if not hasattr(cond, 'shape'):
+ raise ValueError("where must have a condition that is ndarray like")
other = _maybe_convert_string_to_object(other)
@@ -1085,15 +1095,14 @@ def func(c, v, o):
raise TypeError('Could not compare [%s] with block values'
% repr(other))
- if is_transposed:
+ if transpose:
result = result.T
# try to cast if requested
if try_cast:
result = self._try_cast_result(result)
- return make_block(result,
- ndim=self.ndim, placement=self.mgr_locs)
+ return make_block(result, ndim=self.ndim, placement=self.mgr_locs)
# might need to separate out blocks
axis = cond.ndim - 1
@@ -1221,32 +1230,34 @@ def _try_cast(self, element):
return element
def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.',
- **kwargs):
+ quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[:, slicer]
- values = np.array(values, dtype=object)
mask = isnull(values)
- values[mask] = na_rep
-
+ formatter = None
if float_format and decimal != '.':
formatter = lambda v : (float_format % v).replace('.',decimal,1)
elif decimal != '.':
formatter = lambda v : ('%g' % v).replace('.',decimal,1)
elif float_format:
formatter = lambda v : float_format % v
+
+ if formatter is None and not quoting:
+ values = values.astype(str)
else:
- formatter = None
+ values = np.array(values, dtype='object')
+ values[mask] = na_rep
if formatter:
imask = (~mask).ravel()
values.flat[imask] = np.array(
[formatter(val) for val in values.ravel()[imask]])
- return values.tolist()
+ return values
def should_store(self, value):
# when inserting a column should not coerce integers to floats
@@ -1366,7 +1377,7 @@ def _try_coerce_result(self, result):
def should_store(self, value):
return issubclass(value.dtype.type, np.timedelta64)
- def to_native_types(self, slicer=None, na_rep=None, **kwargs):
+ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
@@ -1387,7 +1398,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
for val in values.ravel()[imask]],
dtype=object)
- return rvalues.tolist()
+ return rvalues
def get_values(self, dtype=None):
@@ -1723,7 +1734,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
return self.make_block_same_class(new_values, new_mgr_locs)
- def putmask(self, mask, new, align=True, inplace=False):
+ def putmask(self, mask, new, align=True, inplace=False,
+ axis=0, transpose=False):
""" putmask the data to the block; it is possible that we may create a
new dtype of block
@@ -1763,18 +1775,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
ndim=self.ndim,
placement=self.mgr_locs)
- def to_native_types(self, slicer=None, na_rep='', **kwargs):
+ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
# Categorical is always one dimension
values = values[slicer]
- values = np.array(values, dtype=object)
mask = isnull(values)
+ values = np.array(values, dtype='object')
values[mask] = na_rep
- # Blocks.to_native_type returns list of lists, but we are always only a list
- return [values.tolist()]
+
+ # we are expected to return a 2-d ndarray
+ return values.reshape(1,len(values))
class DatetimeBlock(Block):
__slots__ = ()
@@ -1864,29 +1877,21 @@ def fillna(self, value, limit=None,
fastpath=True, placement=self.mgr_locs)]
def to_native_types(self, slicer=None, na_rep=None, date_format=None,
- **kwargs):
+ quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[:, slicer]
- mask = isnull(values)
- rvalues = np.empty(values.shape, dtype=object)
- if na_rep is None:
- na_rep = 'NaT'
- rvalues[mask] = na_rep
- imask = (~mask).ravel()
-
- if date_format is None:
- date_formatter = lambda x: Timestamp(x)._repr_base
- else:
- date_formatter = lambda x: Timestamp(x).strftime(date_format)
+ from pandas.core.format import _get_format_datetime64_from_values
+ format = _get_format_datetime64_from_values(values, date_format)
- rvalues.flat[imask] = np.array([date_formatter(val) for val in
- values.ravel()[imask]], dtype=object)
-
- return rvalues.tolist()
+ result = tslib.format_array_from_datetime(values.view('i8').ravel(),
+ tz=None,
+ format=format,
+ na_rep=na_rep).reshape(values.shape)
+ return result
def should_store(self, value):
return issubclass(value.dtype.type, np.datetime64)
@@ -2422,12 +2427,18 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs):
else:
kwargs['filter'] = filter_locs
- if f == 'where' and kwargs.get('align', True):
+ if f == 'where':
align_copy = True
- align_keys = ['other', 'cond']
- elif f == 'putmask' and kwargs.get('align', True):
+ if kwargs.get('align', True):
+ align_keys = ['other', 'cond']
+ else:
+ align_keys = ['cond']
+ elif f == 'putmask':
align_copy = False
- align_keys = ['new', 'mask']
+ if kwargs.get('align', True):
+ align_keys = ['new', 'mask']
+ else:
+ align_keys = ['mask']
elif f == 'eval':
align_copy = False
align_keys = ['other']
@@ -3134,7 +3145,6 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
pandas-indexer with -1's only.
"""
-
if indexer is None:
if new_axis is self.axes[axis] and not copy:
return self
@@ -3146,10 +3156,9 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
self._consolidate_inplace()
- # trying to reindex on an axis with duplicates
- if (not allow_dups and not self.axes[axis].is_unique
- and len(indexer)):
- raise ValueError("cannot reindex from a duplicate axis")
+ # some axes don't allow reindexing with dups
+ if not allow_dups:
+ self.axes[axis]._can_reindex(indexer)
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
diff --git a/pandas/core/series.py b/pandas/core/series.py
index f9c56db018639..4ad5e06693221 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
def _constructor(self):
return Series
+ @property
+ def _constructor_expanddim(self):
+ from pandas.core.frame import DataFrame
+ return DataFrame
+
# types
@property
def _can_hold_na(self):
@@ -1047,11 +1052,10 @@ def to_frame(self, name=None):
-------
data_frame : DataFrame
"""
- from pandas.core.frame import DataFrame
if name is None:
- df = DataFrame(self)
+ df = self._constructor_expanddim(self)
else:
- df = DataFrame({name: self})
+ df = self._constructor_expanddim({name: self})
return df
@@ -2517,6 +2521,21 @@ def _make_cat_accessor(self):
cat = base.AccessorProperty(CategoricalAccessor, _make_cat_accessor)
+ def _dir_deletions(self):
+ return self._accessors
+
+ def _dir_additions(self):
+ rv = set()
+ # these accessors are mutually exclusive, so break loop when one exists
+ for accessor in self._accessors:
+ try:
+ getattr(self, accessor)
+ rv.add(accessor)
+ break
+ except AttributeError:
+ pass
+ return rv
+
Series._setup_axes(['index'], info_axis=0, stat_axis=0,
aliases={'rows': 0})
Series._add_numeric_operations()
@@ -2590,8 +2609,9 @@ def _try_cast(arr, take_fast_path):
# GH #846
if isinstance(data, (np.ndarray, Index, Series)):
- subarr = np.array(data, copy=False)
+
if dtype is not None:
+ subarr = np.array(data, copy=False)
# possibility of nan -> garbage
if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype):
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 799872d036c4f..819c49f4fb0dd 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -2273,6 +2273,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self):
self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
nrows=10, chunksize=5)
+ def test_single_char_leading_whitespace(self):
+ # GH 9710
+ data = """\
+MyColumn
+ a
+ b
+ a
+ b\n"""
+
+ expected = DataFrame({'MyColumn' : list('abab')})
+
+ result = self.read_csv(StringIO(data), skipinitialspace=True)
+ tm.assert_frame_equal(result, expected)
+
class TestPythonParser(ParserTests, tm.TestCase):
def test_negative_skipfooter_raises(self):
@@ -3313,6 +3327,25 @@ def test_buffer_overflow(self):
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
+ def test_single_char_leading_whitespace(self):
+ # GH 9710
+ data = """\
+MyColumn
+ a
+ b
+ a
+ b\n"""
+
+ expected = DataFrame({'MyColumn' : list('abab')})
+
+ result = self.read_csv(StringIO(data), delim_whitespace=True,
+ skipinitialspace=True)
+ tm.assert_frame_equal(result, expected)
+
+ result = self.read_csv(StringIO(data), lineterminator='\n',
+ skipinitialspace=True)
+ tm.assert_frame_equal(result, expected)
+
class TestCParserLowMemory(ParserTests, tm.TestCase):
def read_csv(self, *args, **kwds):
@@ -3734,6 +3767,25 @@ def test_buffer_overflow(self):
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
+ def test_single_char_leading_whitespace(self):
+ # GH 9710
+ data = """\
+MyColumn
+ a
+ b
+ a
+ b\n"""
+
+ expected = DataFrame({'MyColumn' : list('abab')})
+
+ result = self.read_csv(StringIO(data), delim_whitespace=True,
+ skipinitialspace=True)
+ tm.assert_frame_equal(result, expected)
+
+ result = self.read_csv(StringIO(data), lineterminator='\n',
+ skipinitialspace=True)
+ tm.assert_frame_equal(result, expected)
+
class TestMiscellaneous(tm.TestCase):
# for tests that don't fit into any of the other classes, e.g. those that
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 5ab2ee4327177..0d53b19425c2f 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
@cython.boundscheck(False)
@cython.wraparound(False)
-def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer):
+def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
cdef int N, j, i, ncols
cdef list rows
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index 1850aab50b55a..e7b5db9c5e361 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -849,10 +849,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
;
else { // backtrack
/* We have to use i + 1 because buf has been incremented but not i */
- while (i + 1 > self->datapos && *buf != '\n') {
+ do {
--buf;
--i;
- }
+ } while (i + 1 > self->datapos && *buf != '\n');
+
if (i + 1 > self->datapos) // reached a newline rather than the beginning
{
++buf; // move pointer to first char after newline
@@ -1073,7 +1074,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
// Next character in file
c = *buf++;
- TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
+ TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
i, c, self->file_lines + 1, self->line_fields[self->lines],
self->state));
@@ -1166,10 +1167,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
;
else { // backtrack
/* We have to use i + 1 because buf has been incremented but not i */
- while (i + 1 > self->datapos && *buf != self->lineterminator) {
+ do {
--buf;
--i;
- }
+ } while (i + 1 > self->datapos && *buf != self->lineterminator);
+
if (i + 1 > self->datapos) // reached a newline rather than the beginning
{
++buf; // move pointer to first char after newline
@@ -1336,7 +1338,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
// Next character in file
c = *buf++;
- TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
+ TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n",
i, c, self->file_lines + 1, self->line_fields[self->lines],
self->state));
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index af48774492b11..6a6564347d35f 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -11,7 +11,7 @@
import numpy as np
import pandas as pd
-from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp
+from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex
from pandas.core.config import option_context
import pandas.core.common as com
@@ -93,6 +93,24 @@ def test_constructor_unsortable(self):
else:
Categorical.from_array(arr, ordered=True)
+ def test_is_equal_dtype(self):
+
+ # test dtype comparisons between cats
+
+ c1 = Categorical(list('aabca'),categories=list('abc'),ordered=False)
+ c2 = Categorical(list('aabca'),categories=list('cab'),ordered=False)
+ c3 = Categorical(list('aabca'),categories=list('cab'),ordered=True)
+ self.assertTrue(c1.is_dtype_equal(c1))
+ self.assertTrue(c2.is_dtype_equal(c2))
+ self.assertTrue(c3.is_dtype_equal(c3))
+ self.assertFalse(c1.is_dtype_equal(c2))
+ self.assertFalse(c1.is_dtype_equal(c3))
+ self.assertFalse(c1.is_dtype_equal(Index(list('aabca'))))
+ self.assertFalse(c1.is_dtype_equal(c1.astype(object)))
+ self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1)))
+ self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,categories=list('cab'))))
+ self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,ordered=True)))
+
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"])
@@ -224,6 +242,18 @@ def f():
c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1,2], categories=[1,2,3])
+ # this is a legitimate constructor
+ with tm.assert_produces_warning(None):
+ c = Categorical(np.array([],dtype='int64'),categories=[3,2,1],ordered=True)
+
+ def test_constructor_with_index(self):
+
+ ci = CategoricalIndex(list('aabbca'),categories=list('cab'))
+ self.assertTrue(ci.values.equals(Categorical(ci)))
+
+ ci = CategoricalIndex(list('aabbca'),categories=list('cab'))
+ self.assertTrue(ci.values.equals(Categorical(ci.astype(object),categories=ci.categories)))
+
def test_constructor_with_generator(self):
# This was raising an Error in isnull(single_val).any() because isnull returned a scalar
# for a generator
@@ -727,6 +757,19 @@ def f():
cat.add_categories(["d"])
self.assertRaises(ValueError, f)
+ # GH 9927
+ cat = Categorical(list("abc"), ordered=True)
+ expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
+ # test with Series, np.array, index, list
+ res = cat.add_categories(Series(["d", "e"]))
+ self.assert_categorical_equal(res, expected)
+ res = cat.add_categories(np.array(["d", "e"]))
+ self.assert_categorical_equal(res, expected)
+ res = cat.add_categories(Index(["d", "e"]))
+ self.assert_categorical_equal(res, expected)
+ res = cat.add_categories(["d", "e"])
+ self.assert_categorical_equal(res, expected)
+
def test_remove_categories(self):
cat = Categorical(["a","b","c","a"], ordered=True)
old = cat.copy()
@@ -2562,6 +2605,8 @@ def f():
dfx['grade'].cat.categories
self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories)
+ def test_concat_preserve(self):
+
# GH 8641
# series concat not preserving category dtype
s = Series(list('abc'),dtype='category')
@@ -2579,6 +2624,28 @@ def f():
expected = Series(list('abcabc'),index=[0,1,2,0,1,2]).astype('category')
tm.assert_series_equal(result, expected)
+ a = Series(np.arange(6,dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) })
+ result = pd.concat([df2,df2])
+ expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) })
+ tm.assert_frame_equal(result, expected)
+
+ def test_categorical_index_preserver(self):
+
+ a = Series(np.arange(6,dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }).set_index('B')
+ result = pd.concat([df2,df2])
+ expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }).set_index('B')
+ tm.assert_frame_equal(result, expected)
+
+ # wrong categories
+ df3 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('abc')) }).set_index('B')
+ self.assertRaises(TypeError, lambda : pd.concat([df2,df3]))
+
def test_append(self):
cat = pd.Categorical(["a","b"], categories=["a","b"])
vals = [1,2]
@@ -2714,6 +2781,14 @@ def cmp(a,b):
self.assertRaises(TypeError, lambda : invalid(s))
+ def test_astype_categorical(self):
+
+ cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ tm.assert_categorical_equal(cat,cat.astype('category'))
+ tm.assert_almost_equal(np.array(cat),cat.astype('object'))
+
+ self.assertRaises(ValueError, lambda : cat.astype(float))
+
def test_to_records(self):
# GH8626
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
index e3455d2449b55..b557594e8e7ef 100644
--- a/pandas/tests/test_format.py
+++ b/pandas/tests/test_format.py
@@ -3010,12 +3010,12 @@ def test_format(self):
def test_output_significant_digits(self):
# Issue #9764
-
+
# In case default display precision changes:
with pd.option_context('display.precision', 7):
# DataFrame example from issue #9764
d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]})
-
+
expected_output={
(0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
(1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index c7c35e63d3d91..555cb9efa5eee 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -31,9 +31,9 @@
import pandas.core.common as com
import pandas.core.format as fmt
import pandas.core.datetools as datetools
-from pandas import (DataFrame, Index, Series, notnull, isnull,
+from pandas import (DataFrame, Index, Series, Panel, notnull, isnull,
MultiIndex, DatetimeIndex, Timestamp, date_range,
- read_csv, timedelta_range, Timedelta,
+ read_csv, timedelta_range, Timedelta, CategoricalIndex,
option_context)
import pandas as pd
from pandas.parser import CParserError
@@ -784,6 +784,16 @@ def test_setitem_None(self):
assert_series_equal(self.frame[None], self.frame['A'])
repr(self.frame)
+ def test_setitem_empty(self):
+ # GH 9596
+ df = pd.DataFrame({'a': ['1', '2', '3'],
+ 'b': ['11', '22', '33'],
+ 'c': ['111', '222', '333']})
+
+ result = df.copy()
+ result.loc[result.b.isnull(), 'a'] = result.a
+ assert_frame_equal(result, df)
+
def test_delitem_corner(self):
f = self.frame.copy()
del f['D']
@@ -2376,6 +2386,32 @@ def test_set_index_pass_arrays(self):
expected = df.set_index(['A', 'B'], drop=False)
assert_frame_equal(result, expected, check_names=False) # TODO should set_index check_names ?
+ def test_construction_with_categorical_index(self):
+
+ ci = tm.makeCategoricalIndex(10)
+
+ # with Categorical
+ df = DataFrame({'A' : np.random.randn(10),
+ 'B' : ci.values })
+ idf = df.set_index('B')
+ str(idf)
+ tm.assert_index_equal(idf.index,ci)
+
+ # from a CategoricalIndex
+ df = DataFrame({'A' : np.random.randn(10),
+ 'B' : ci })
+ idf = df.set_index('B')
+ str(idf)
+ tm.assert_index_equal(idf.index,ci)
+
+ idf = df.set_index('B').reset_index().set_index('B')
+ str(idf)
+ tm.assert_index_equal(idf.index,ci)
+
+ new_df = idf.reset_index()
+ new_df.index = df.B
+ tm.assert_index_equal(new_df.index,ci)
+
def test_set_index_cast_datetimeindex(self):
df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
for i in range(1000)],
@@ -9838,6 +9874,110 @@ def test_where_complex(self):
df[df.abs() >= 5] = np.nan
assert_frame_equal(df,expected)
+ def test_where_axis(self):
+ # GH 9736
+ df = DataFrame(np.random.randn(2, 2))
+ mask = DataFrame([[False, False], [False, False]])
+ s = Series([0, 1])
+
+ expected = DataFrame([[0, 0], [1, 1]], dtype='float64')
+ result = df.where(mask, s, axis='index')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0, 1], [0, 1]], dtype='float64')
+ result = df.where(mask, s, axis='columns')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # Upcast needed
+ df = DataFrame([[1, 2], [3, 4]], dtype='int64')
+ mask = DataFrame([[False, False], [False, False]])
+ s = Series([0, np.nan])
+
+ expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64')
+ result = df.where(mask, s, axis='index')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0, np.nan], [0, np.nan]], dtype='float64')
+ result = df.where(mask, s, axis='columns')
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame({0 : np.array([0, 0], dtype='int64'),
+ 1 : np.array([np.nan, np.nan], dtype='float64')})
+ result = df.copy()
+ result.where(mask, s, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # Multiple dtypes (=> multiple Blocks)
+ df = pd.concat([DataFrame(np.random.randn(10, 2)),
+ DataFrame(np.random.randint(0, 10, size=(10, 2)))],
+ ignore_index=True, axis=1)
+ mask = DataFrame(False, columns=df.columns, index=df.index)
+ s1 = Series(1, index=df.columns)
+ s2 = Series(2, index=df.index)
+
+ result = df.where(mask, s1, axis='columns')
+ expected = DataFrame(1.0, columns=df.columns, index=df.index)
+ expected[2] = expected[2].astype(int)
+ expected[3] = expected[3].astype(int)
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s1, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ result = df.where(mask, s2, axis='index')
+ expected = DataFrame(2.0, columns=df.columns, index=df.index)
+ expected[2] = expected[2].astype(int)
+ expected[3] = expected[3].astype(int)
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s2, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # DataFrame vs DataFrame
+ d1 = df.copy().drop(1, axis=0)
+ expected = df.copy()
+ expected.loc[1, :] = np.nan
+
+ result = df.where(mask, d1)
+ assert_frame_equal(result, expected)
+ result = df.where(mask, d1, axis='index')
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d1, inplace=True)
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d1, inplace=True, axis='index')
+ assert_frame_equal(result, expected)
+
+ d2 = df.copy().drop(1, axis=1)
+ expected = df.copy()
+ expected.loc[:, 1] = np.nan
+
+ result = df.where(mask, d2)
+ assert_frame_equal(result, expected)
+ result = df.where(mask, d2, axis='columns')
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d2, inplace=True)
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d2, inplace=True, axis='columns')
+ assert_frame_equal(result, expected)
+
def test_mask(self):
df = DataFrame(np.random.randn(5, 3))
cond = df > 0
@@ -10734,6 +10874,19 @@ def test_sort_index(self):
with assertRaisesRegexp(ValueError, msg):
frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5)
+ def test_sort_index_categorical_index(self):
+
+ df = DataFrame({'A' : np.arange(6,dtype='int64'),
+ 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B')
+
+ result = df.sort_index()
+ expected = df.iloc[[4,0,1,5,2,3]]
+ assert_frame_equal(result, expected)
+
+ result = df.sort_index(ascending=False)
+ expected = df.iloc[[3,2,5,1,0,4]]
+ assert_frame_equal(result, expected)
+
def test_sort_nan(self):
# GH3917
nan = np.nan
@@ -14204,6 +14357,27 @@ def _constructor(self):
# GH9776
self.assertEqual(df.iloc[0:1, :].testattr, 'XXX')
+ def test_to_panel_expanddim(self):
+ # GH 9762
+
+ class SubclassedFrame(DataFrame):
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedPanel
+
+ class SubclassedPanel(Panel):
+ pass
+
+ index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
+ df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index)
+ result = df.to_panel()
+ self.assertTrue(isinstance(result, SubclassedPanel))
+ expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
+ items=['X', 'Y'], major_axis=[0],
+ minor_axis=[0, 1, 2],
+ dtype='int64')
+ tm.assert_panel_equal(result, expected)
+
def skip_if_no_ne(engine='numexpr'):
if engine == 'numexpr':
diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
index 7ec57c0304530..33c88b0e3b4b7 100644
--- a/pandas/tests/test_graphics.py
+++ b/pandas/tests/test_graphics.py
@@ -1534,6 +1534,19 @@ def test_subplots_ts_share_axes(self):
for ax in axes[[0, 1, 2], [2]].ravel():
self._check_visible(ax.get_yticklabels(), visible=False)
+ def test_subplots_sharex_axes_existing_axes(self):
+ # GH 9158
+ d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]}
+ df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14'))
+
+ axes = df[['A', 'B']].plot(subplots=True)
+ df['C'].plot(ax=axes[0], secondary_y=True)
+
+ self._check_visible(axes[0].get_xticklabels(), visible=False)
+ self._check_visible(axes[1].get_xticklabels(), visible=True)
+ for ax in axes.ravel():
+ self._check_visible(ax.get_yticklabels(), visible=True)
+
def test_negative_log(self):
df = - DataFrame(rand(6, 4),
index=list(string.ascii_letters[:6]),
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 87536b9bf0ff8..7af53c88f0f72 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -8,7 +8,7 @@
from numpy import nan
from pandas import date_range,bdate_range, Timestamp
-from pandas.core.index import Index, MultiIndex, Int64Index
+from pandas.core.index import Index, MultiIndex, Int64Index, CategoricalIndex
from pandas.core.api import Categorical, DataFrame
from pandas.core.groupby import (SpecificationError, DataError,
_nargsort, _lexsort_indexer)
@@ -3378,12 +3378,11 @@ def test_groupby_datetime_categorical(self):
cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True)
data = DataFrame(np.random.randn(100, 4))
-
result = data.groupby(cats).mean()
expected = data.groupby(np.asarray(cats)).mean()
expected = expected.reindex(levels)
- expected.index.name = 'myfactor'
+ expected.index = CategoricalIndex(expected.index,categories=expected.index,name='myfactor',ordered=True)
assert_frame_equal(result, expected)
self.assertEqual(result.index.name, cats.name)
@@ -3398,6 +3397,26 @@ def test_groupby_datetime_categorical(self):
expected.index.names = ['myfactor', None]
assert_frame_equal(desc_result, expected)
+ def test_groupby_categorical_index(self):
+
+ levels = ['foo', 'bar', 'baz', 'qux']
+ codes = np.random.randint(0, 4, size=20)
+ cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True)
+ df = DataFrame(np.repeat(np.arange(20),4).reshape(-1,4), columns=list('abcd'))
+ df['cats'] = cats
+
+ # with a cat index
+ result = df.set_index('cats').groupby(level=0).sum()
+ expected = df[list('abcd')].groupby(cats.codes).sum()
+ expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats')
+ assert_frame_equal(result, expected)
+
+ # with a cat column, should produce a cat index
+ result = df.groupby('cats').sum()
+ expected = df[list('abcd')].groupby(cats.codes).sum()
+ expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats')
+ assert_frame_equal(result, expected)
+
def test_groupby_groups_datetimeindex(self):
# #1430
from pandas.tseries.api import DatetimeIndex
@@ -3526,6 +3545,8 @@ def test_groupby_categorical_no_compress(self):
result = data.groupby(cats).mean()
exp = data.groupby(codes).mean()
+
+ exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered)
assert_series_equal(result, exp)
codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
@@ -3533,6 +3554,7 @@ def test_groupby_categorical_no_compress(self):
result = data.groupby(cats).mean()
exp = data.groupby(codes).mean().reindex(cats.categories)
+ exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered)
assert_series_equal(result, exp)
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
@@ -5061,6 +5083,17 @@ def test_groupby_categorical_two_columns(self):
"C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx)
tm.assert_frame_equal(res, exp)
+ def test_groupby_apply_all_none(self):
+ # Tests to make sure no errors if apply function returns all None
+ # values. Issue 9684.
+ test_df = DataFrame({'groups': [0,0,1,1], 'random_vars': [8,7,4,5]})
+
+ def test_func(x):
+ pass
+ result = test_df.groupby('groups').apply(test_func)
+ expected = DataFrame()
+ tm.assert_frame_equal(result, expected)
+
def assert_fp_equal(a, b):
assert (np.abs(a - b) < 1e-12).all()
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 336340dd95991..3c9dbd2e48cb6 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -12,14 +12,10 @@
import numpy as np
from numpy.testing import assert_array_equal
-from pandas import period_range, date_range
-
-from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex,
- InvalidIndexError, NumericIndex)
-from pandas.tseries.index import DatetimeIndex
-from pandas.tseries.tdi import TimedeltaIndex
-from pandas.tseries.period import PeriodIndex
-from pandas.core.series import Series
+from pandas import (period_range, date_range, Categorical, Series,
+ Index, Float64Index, Int64Index, MultiIndex,
+ CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex)
+from pandas.core.index import InvalidIndexError, NumericIndex
from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp,
assert_copy)
from pandas import compat
@@ -41,6 +37,11 @@ class Base(object):
_holder = None
_compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes']
+ def setup_indices(self):
+ # setup the test indices in the self.indices dict
+ for name, ind in self.indices.items():
+ setattr(self, name, ind)
+
def verify_pickle(self,index):
unpickled = self.round_trip_pickle(index)
self.assertTrue(index.equals(unpickled))
@@ -98,6 +99,7 @@ def f():
def test_reindex_base(self):
idx = self.create_index()
expected = np.arange(idx.size)
+
actual = idx.get_indexer(idx)
assert_array_equal(expected, actual)
@@ -118,29 +120,6 @@ def test_ndarray_compat_properties(self):
idx.nbytes
idx.values.nbytes
-
-class TestIndex(Base, tm.TestCase):
- _holder = Index
- _multiprocess_can_split_ = True
-
- def setUp(self):
- self.indices = dict(
- unicodeIndex = tm.makeUnicodeIndex(100),
- strIndex = tm.makeStringIndex(100),
- dateIndex = tm.makeDateIndex(100),
- intIndex = tm.makeIntIndex(100),
- floatIndex = tm.makeFloatIndex(100),
- boolIndex = Index([True,False]),
- empty = Index([]),
- tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'],
- [1, 2, 3]))
- )
- for name, ind in self.indices.items():
- setattr(self, name, ind)
-
- def create_index(self):
- return Index(list('abcde'))
-
def test_wrong_number_names(self):
def testit(ind):
ind.names = ["apple", "banana", "carrot"]
@@ -150,14 +129,18 @@ def testit(ind):
def test_set_name_methods(self):
new_name = "This is the new name for this index"
- indices = (self.dateIndex, self.intIndex, self.unicodeIndex,
- self.empty)
- for ind in indices:
+ for ind in self.indices.values():
+
+ # don't test a MultiIndex here (as it's tested separately)
+ if isinstance(ind, MultiIndex):
+ continue
+
original_name = ind.name
new_ind = ind.set_names([new_name])
self.assertEqual(new_ind.name, new_name)
self.assertEqual(ind.name, original_name)
res = ind.rename(new_name, inplace=True)
+
# should return None
self.assertIsNone(res)
self.assertEqual(ind.name, new_name)
@@ -167,46 +150,128 @@ def test_set_name_methods(self):
# ind.set_names("a")
with assertRaisesRegexp(ValueError, "Level must be None"):
ind.set_names("a", level=0)
- # rename in place just leaves tuples and other containers alone
- name = ('A', 'B')
- ind = self.intIndex
- ind.rename(name, inplace=True)
- self.assertEqual(ind.name, name)
- self.assertEqual(ind.names, [name])
- def test_hash_error(self):
- with tm.assertRaisesRegexp(TypeError,
- "unhashable type: %r" %
- type(self.strIndex).__name__):
- hash(self.strIndex)
+ # rename in place just leaves tuples and other containers alone
+ name = ('A', 'B')
+ ind.rename(name, inplace=True)
+ self.assertEqual(ind.name, name)
+ self.assertEqual(ind.names, [name])
- def test_new_axis(self):
- new_index = self.dateIndex[None, :]
- self.assertEqual(new_index.ndim, 2)
- tm.assert_isinstance(new_index, np.ndarray)
+ def test_hash_error(self):
+ for ind in self.indices.values():
+ with tm.assertRaisesRegexp(TypeError,
+ "unhashable type: %r" %
+ type(ind).__name__):
+ hash(ind)
def test_copy_and_deepcopy(self):
from copy import copy, deepcopy
- for func in (copy, deepcopy):
- idx_copy = func(self.strIndex)
- self.assertIsNot(idx_copy, self.strIndex)
- self.assertTrue(idx_copy.equals(self.strIndex))
+ for ind in self.indices.values():
- new_copy = self.strIndex.copy(deep=True, name="banana")
- self.assertEqual(new_copy.name, "banana")
- new_copy2 = self.intIndex.copy(dtype=int)
- self.assertEqual(new_copy2.dtype.kind, 'i')
+ # don't test a MultiIndex here (as it's tested separately)
+ if isinstance(ind, MultiIndex):
+ continue
+
+ for func in (copy, deepcopy):
+ idx_copy = func(ind)
+ self.assertIsNot(idx_copy, ind)
+ self.assertTrue(idx_copy.equals(ind))
+
+ new_copy = ind.copy(deep=True, name="banana")
+ self.assertEqual(new_copy.name, "banana")
def test_duplicates(self):
- idx = Index([0, 0, 0])
- self.assertFalse(idx.is_unique)
+ for ind in self.indices.values():
+
+ if not len(ind):
+ continue
+ idx = self._holder([ind[0]]*5)
+ self.assertFalse(idx.is_unique)
+ self.assertTrue(idx.has_duplicates)
def test_sort(self):
- self.assertRaises(TypeError, self.strIndex.sort)
+ for ind in self.indices.values():
+ self.assertRaises(TypeError, ind.sort)
def test_mutability(self):
- self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo')
+ for ind in self.indices.values():
+ if not len(ind):
+ continue
+ self.assertRaises(TypeError, ind.__setitem__, 0, ind[0])
+
+ def test_view(self):
+ for ind in self.indices.values():
+ i_view = ind.view()
+ self.assertEqual(i_view.name, ind.name)
+
+ def test_compat(self):
+ for ind in self.indices.values():
+ self.assertEqual(ind.tolist(),list(ind))
+
+ def test_argsort(self):
+ for k, ind in self.indices.items():
+
+ # tested separately
+ if k in ['catIndex']:
+ continue
+
+ result = ind.argsort()
+ expected = np.array(ind).argsort()
+ self.assert_numpy_array_equal(result, expected)
+
+ def test_pickle(self):
+ for ind in self.indices.values():
+ self.verify_pickle(ind)
+ ind.name = 'foo'
+ self.verify_pickle(ind)
+
+ def test_take(self):
+ indexer = [4, 3, 0, 2]
+ for k, ind in self.indices.items():
+
+ # separate
+ if k in ['boolIndex','tuples','empty']:
+ continue
+
+ result = ind.take(indexer)
+ expected = ind[indexer]
+ self.assertTrue(result.equals(expected))
+
+class TestIndex(Base, tm.TestCase):
+ _holder = Index
+ _multiprocess_can_split_ = True
+
+ def setUp(self):
+ self.indices = dict(
+ unicodeIndex = tm.makeUnicodeIndex(100),
+ strIndex = tm.makeStringIndex(100),
+ dateIndex = tm.makeDateIndex(100),
+ periodIndex = tm.makePeriodIndex(100),
+ tdIndex = tm.makeTimedeltaIndex(100),
+ intIndex = tm.makeIntIndex(100),
+ floatIndex = tm.makeFloatIndex(100),
+ boolIndex = Index([True,False]),
+ catIndex = tm.makeCategoricalIndex(100),
+ empty = Index([]),
+ tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'],
+ [1, 2, 3]))
+ )
+ self.setup_indices()
+
+ def create_index(self):
+ return Index(list('abcde'))
+
+ def test_new_axis(self):
+ new_index = self.dateIndex[None, :]
+ self.assertEqual(new_index.ndim, 2)
+ tm.assert_isinstance(new_index, np.ndarray)
+
+ def test_copy_and_deepcopy(self):
+ super(TestIndex, self).test_copy_and_deepcopy()
+
+ new_copy2 = self.intIndex.copy(dtype=int)
+ self.assertEqual(new_copy2.dtype.kind, 'i')
def test_constructor(self):
# regular instance creation
@@ -297,18 +362,22 @@ def test_constructor_simple_new(self):
result = idx._simple_new(idx, 'obj')
self.assertTrue(result.equals(idx))
- def test_copy(self):
- i = Index([], name='Foo')
- i_copy = i.copy()
- self.assertEqual(i_copy.name, 'Foo')
+ def test_view_with_args(self):
- def test_view(self):
- i = Index([], name='Foo')
- i_view = i.view()
- self.assertEqual(i_view.name, 'Foo')
+ restricted = ['unicodeIndex','strIndex','catIndex','boolIndex','empty']
+
+ for i in restricted:
+ ind = self.indices[i]
+
+ # with arguments
+ self.assertRaises(TypeError, lambda : ind.view('i8'))
- # with arguments
- self.assertRaises(TypeError, lambda : i.view('i8'))
+ # these are ok
+ for i in list(set(self.indices.keys())-set(restricted)):
+ ind = self.indices[i]
+
+ # with arguments
+ ind.view('i8')
def test_legacy_pickle_identity(self):
@@ -330,9 +399,6 @@ def test_astype(self):
casted = self.intIndex.astype('i8')
self.assertEqual(casted.name, 'foobar')
- def test_compat(self):
- self.strIndex.tolist()
-
def test_equals(self):
# same
self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c'])))
@@ -459,11 +525,6 @@ def test_nanosecond_index_access(self):
self.assertEqual(first_value, x[Timestamp(np.datetime64('2013-01-01 00:00:00.000000050+0000', 'ns'))])
- def test_argsort(self):
- result = self.strIndex.argsort()
- expected = np.array(self.strIndex).argsort()
- self.assert_numpy_array_equal(result, expected)
-
def test_comparators(self):
index = self.dateIndex
element = index[len(index) // 2]
@@ -760,22 +821,17 @@ def test_symmetric_diff(self):
with tm.assertRaises(TypeError):
Index(idx1,dtype='object') - 1
- def test_pickle(self):
-
- self.verify_pickle(self.strIndex)
- self.strIndex.name = 'foo'
- self.verify_pickle(self.strIndex)
- self.verify_pickle(self.dateIndex)
-
def test_is_numeric(self):
self.assertFalse(self.dateIndex.is_numeric())
self.assertFalse(self.strIndex.is_numeric())
self.assertTrue(self.intIndex.is_numeric())
self.assertTrue(self.floatIndex.is_numeric())
+ self.assertFalse(self.catIndex.is_numeric())
def test_is_object(self):
self.assertTrue(self.strIndex.is_object())
self.assertTrue(self.boolIndex.is_object())
+ self.assertFalse(self.catIndex.is_object())
self.assertFalse(self.intIndex.is_object())
self.assertFalse(self.dateIndex.is_object())
self.assertFalse(self.floatIndex.is_object())
@@ -839,12 +895,6 @@ def test_format_none(self):
idx.format()
self.assertIsNone(idx[3])
- def test_take(self):
- indexer = [4, 3, 0, 2]
- result = self.dateIndex.take(indexer)
- expected = self.dateIndex[indexer]
- self.assertTrue(result.equals(expected))
-
def test_logical_compat(self):
idx = self.create_index()
self.assertEqual(idx.all(), idx.values.all())
@@ -857,6 +907,7 @@ def _check_method_works(self, method):
method(self.strIndex)
method(self.intIndex)
method(self.tuples)
+ method(self.catIndex)
def test_get_indexer(self):
idx1 = Index([1, 2, 3, 4, 5])
@@ -1232,6 +1283,14 @@ def test_str_attribute(self):
expected = Series(range(2), index=['a1', 'a2'])
tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
+ def test_tab_completion(self):
+ # GH 9910
+ idx = Index(list('abcd'))
+ self.assertTrue('str' in dir(idx))
+
+ idx = Index(range(4))
+ self.assertTrue('str' not in dir(idx))
+
def test_indexing_doesnt_change_class(self):
idx = Index([1, 2, 3, 'a', 'b', 'c'])
@@ -1338,6 +1397,352 @@ def test_equals_op(self):
index_b == index_a,
)
+class TestCategoricalIndex(Base, tm.TestCase):
+ _holder = CategoricalIndex
+
+ def setUp(self):
+ self.indices = dict(catIndex = tm.makeCategoricalIndex(100))
+ self.setup_indices()
+
+ def create_index(self, categories=None, ordered=False):
+ if categories is None:
+ categories = list('cab')
+ return CategoricalIndex(list('aabbca'), categories=categories, ordered=ordered)
+
+ def test_construction(self):
+
+ ci = self.create_index(categories=list('abcd'))
+ categories = ci.categories
+
+ result = Index(ci)
+ tm.assert_index_equal(result,ci,exact=True)
+ self.assertFalse(result.ordered)
+
+ result = Index(ci.values)
+ tm.assert_index_equal(result,ci,exact=True)
+ self.assertFalse(result.ordered)
+
+ # empty
+ result = CategoricalIndex(categories=categories)
+ self.assertTrue(result.categories.equals(Index(categories)))
+ self.assert_numpy_array_equal(result.codes,np.array([],dtype='int8'))
+ self.assertFalse(result.ordered)
+
+ # passing categories
+ result = CategoricalIndex(list('aabbca'),categories=categories)
+ self.assertTrue(result.categories.equals(Index(categories)))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8'))
+
+ c = pd.Categorical(list('aabbca'))
+ result = CategoricalIndex(c)
+ self.assertTrue(result.categories.equals(Index(list('abc'))))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8'))
+ self.assertFalse(result.ordered)
+
+ result = CategoricalIndex(c,categories=categories)
+ self.assertTrue(result.categories.equals(Index(categories)))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8'))
+ self.assertFalse(result.ordered)
+
+ ci = CategoricalIndex(c,categories=list('abcd'))
+ result = CategoricalIndex(ci)
+ self.assertTrue(result.categories.equals(Index(categories)))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8'))
+ self.assertFalse(result.ordered)
+
+ result = CategoricalIndex(ci, categories=list('ab'))
+ self.assertTrue(result.categories.equals(Index(list('ab'))))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8'))
+ self.assertFalse(result.ordered)
+
+ result = CategoricalIndex(ci, categories=list('ab'), ordered=True)
+ self.assertTrue(result.categories.equals(Index(list('ab'))))
+ self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8'))
+ self.assertTrue(result.ordered)
+
+ # turn me to an Index
+ result = Index(np.array(ci))
+ self.assertIsInstance(result, Index)
+ self.assertNotIsInstance(result, CategoricalIndex)
+
+ def test_construction_with_dtype(self):
+
+ # specify dtype
+ ci = self.create_index(categories=list('abc'))
+
+ result = Index(np.array(ci), dtype='category')
+ tm.assert_index_equal(result,ci,exact=True)
+
+ result = Index(np.array(ci).tolist(), dtype='category')
+ tm.assert_index_equal(result,ci,exact=True)
+
+ # these are generally only equal when the categories are reordered
+ ci = self.create_index()
+
+ result = Index(np.array(ci), dtype='category').reorder_categories(ci.categories)
+ tm.assert_index_equal(result,ci,exact=True)
+
+ # make sure indexes are handled
+ expected = CategoricalIndex([0,1,2], categories=[0,1,2], ordered=True)
+ idx = Index(range(3))
+ result = CategoricalIndex(idx, categories=idx, ordered=True)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ def test_method_delegation(self):
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.set_categories(list('cab'))
+ tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cab')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.rename_categories(list('efg'))
+ tm.assert_index_equal(result, CategoricalIndex(list('ffggef'), categories=list('efg')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.add_categories(['d'])
+ tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabd')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.remove_categories(['c'])
+ tm.assert_index_equal(result, CategoricalIndex(list('aabb') + [np.nan] + ['a'], categories=list('ab')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.as_unordered()
+ tm.assert_index_equal(result, ci)
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.as_ordered()
+ tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabdef'), ordered=True))
+
+ # invalid
+ self.assertRaises(ValueError, lambda : ci.set_categories(list('cab'), inplace=True))
+
+ def test_contains(self):
+
+ ci = self.create_index(categories=list('cabdef'))
+
+ self.assertTrue('a' in ci)
+ self.assertTrue('z' not in ci)
+ self.assertTrue('e' not in ci)
+ self.assertTrue(np.nan not in ci)
+
+ # assert codes NOT in index
+ self.assertFalse(0 in ci)
+ self.assertFalse(1 in ci)
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef') + [np.nan])
+ self.assertFalse(np.nan in ci)
+
+ ci = CategoricalIndex(list('aabbca') + [np.nan], categories=list('cabdef') + [np.nan])
+ self.assertTrue(np.nan in ci)
+
+ def test_min_max(self):
+
+ ci = self.create_index(ordered=False)
+ self.assertRaises(TypeError, lambda : ci.min())
+ self.assertRaises(TypeError, lambda : ci.max())
+
+ ci = self.create_index(ordered=True)
+
+ self.assertEqual(ci.min(),'c')
+ self.assertEqual(ci.max(),'b')
+
+ def test_append(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ # append cats with the same categories
+ result = ci[:3].append(ci[3:])
+ tm.assert_index_equal(result,ci,exact=True)
+
+ foos = [ci[:1], ci[1:3], ci[3:]]
+ result = foos[0].append(foos[1:])
+ tm.assert_index_equal(result,ci,exact=True)
+
+ # empty
+ result = ci.append([])
+ tm.assert_index_equal(result,ci,exact=True)
+
+ # appending with different categories or reordered is not ok
+ self.assertRaises(TypeError, lambda : ci.append(ci.values.set_categories(list('abcd'))))
+ self.assertRaises(TypeError, lambda : ci.append(ci.values.reorder_categories(list('abc'))))
+
+ # with objects
+ result = ci.append(['c','a'])
+ expected = CategoricalIndex(list('aabbcaca'), categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ # invalid objects
+ self.assertRaises(TypeError, lambda : ci.append(['a','d']))
+
+ def test_insert(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ #test 0th element
+ result = ci.insert(0, 'a')
+ expected = CategoricalIndex(list('aaabbca'),categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ #test Nth element that follows Python list behavior
+ result = ci.insert(-1, 'a')
+ expected = CategoricalIndex(list('aabbcaa'),categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ #test empty
+ result = CategoricalIndex(categories=categories).insert(0, 'a')
+ expected = CategoricalIndex(['a'],categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ # invalid
+ self.assertRaises(TypeError, lambda : ci.insert(0,'d'))
+
+ def test_delete(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ result = ci.delete(0)
+ expected = CategoricalIndex(list('abbca'),categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ result = ci.delete(-1)
+ expected = CategoricalIndex(list('aabbc'),categories=categories)
+ tm.assert_index_equal(result,expected,exact=True)
+
+ with tm.assertRaises((IndexError, ValueError)):
+ # either, depending on numpy version
+ result = ci.delete(10)
+
+ def test_astype(self):
+
+ ci = self.create_index()
+ result = ci.astype('category')
+ tm.assert_index_equal(result,ci,exact=True)
+
+ result = ci.astype(object)
+ self.assertTrue(result.equals(Index(np.array(ci))))
+
+ # this IS equal, but not the same class
+ self.assertTrue(result.equals(ci))
+ self.assertIsInstance(result, Index)
+ self.assertNotIsInstance(result, CategoricalIndex)
+
+ def test_reindex_base(self):
+
+ # determined by cat ordering
+ idx = self.create_index()
+ expected = np.array([4,0,1,5,2,3])
+
+ actual = idx.get_indexer(idx)
+ assert_array_equal(expected, actual)
+
+ with tm.assertRaisesRegexp(ValueError, 'Invalid fill method'):
+ idx.get_indexer(idx, method='invalid')
+
+ def test_reindexing(self):
+
+ ci = self.create_index()
+ oidx = Index(np.array(ci))
+
+ for n in [1,2,5,len(ci)]:
+ finder = oidx[np.random.randint(0,len(ci),size=n)]
+ expected = oidx.get_indexer_non_unique(finder)[0]
+
+ actual = ci.get_indexer(finder)
+ assert_array_equal(expected, actual)
+
+ def test_duplicates(self):
+
+ idx = CategoricalIndex([0, 0, 0])
+ self.assertFalse(idx.is_unique)
+ self.assertTrue(idx.has_duplicates)
+
+ def test_get_indexer(self):
+
+ idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc'))
+ idx2 = CategoricalIndex(list('abf'))
+
+ for indexer in [idx2, list('abf'), Index(list('abf'))]:
+ r1 = idx1.get_indexer(idx2)
+ assert_almost_equal(r1, [0, 1, 2, -1])
+
+ self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='pad'))
+ self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='backfill'))
+ self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='nearest'))
+
+ def test_repr(self):
+
+ ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ str(ci)
+ tm.assert_index_equal(eval(repr(ci)),ci,exact=True)
+
+ # formatting
+ if compat.PY3:
+ str(ci)
+ else:
+ compat.text_type(ci)
+
+ # long format
+ ci = CategoricalIndex(np.random.randint(0,5,size=100))
+ result = str(ci)
+ tm.assert_index_equal(eval(repr(ci)),ci,exact=True)
+
+ def test_isin(self):
+
+ ci = CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan])
+ self.assert_numpy_array_equal(ci.isin(['c']),np.array([False,False,False,True,False,False]))
+ self.assert_numpy_array_equal(ci.isin(['c','a','b']),np.array([True]*5 + [False]))
+ self.assert_numpy_array_equal(ci.isin(['c','a','b',np.nan]),np.array([True]*6))
+
+ # mismatched categorical -> coerced to ndarray so doesn't matter
+ self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('abcdefghi'))),np.array([True]*6))
+ self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('defghi'))),np.array([False]*5 + [True]))
+
+ def test_identical(self):
+
+ ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True)
+ self.assertTrue(ci1.identical(ci1))
+ self.assertTrue(ci1.identical(ci1.copy()))
+ self.assertFalse(ci1.identical(ci2))
+
+ def test_equals(self):
+
+ ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True)
+
+ self.assertTrue(ci1.equals(ci1))
+ self.assertFalse(ci1.equals(ci2))
+ self.assertTrue(ci1.equals(ci1.astype(object)))
+ self.assertTrue(ci1.astype(object).equals(ci1))
+
+ self.assertTrue((ci1 == ci1).all())
+ self.assertFalse((ci1 != ci1).all())
+ self.assertFalse((ci1 > ci1).all())
+ self.assertFalse((ci1 < ci1).all())
+ self.assertTrue((ci1 <= ci1).all())
+ self.assertTrue((ci1 >= ci1).all())
+
+ self.assertFalse((ci1 == 1).all())
+ self.assertTrue((ci1 == Index(['a','b'])).all())
+ self.assertTrue((ci1 == ci1.values).all())
+
+ # invalid comparisons
+ self.assertRaises(TypeError, lambda : ci1 == Index(['a','b','c']))
+ self.assertRaises(TypeError, lambda : ci1 == ci2)
+ self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, ordered=False))
+ self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, categories=list('abc')))
+
+ # tests
+ # make sure that we are testing for category inclusion properly
+ self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b']).equals(list('aabca')))
+ self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b',np.nan]).equals(list('aabca')))
+
+ self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca')))
+ self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan]))
class Numeric(Base):
@@ -1417,18 +1822,13 @@ class TestFloat64Index(Numeric, tm.TestCase):
_multiprocess_can_split_ = True
def setUp(self):
- self.mixed = Float64Index([1.5, 2, 3, 4, 5])
- self.float = Float64Index(np.arange(5) * 2.5)
+ self.indices = dict(mixed = Float64Index([1.5, 2, 3, 4, 5]),
+ float = Float64Index(np.arange(5) * 2.5))
+ self.setup_indices()
def create_index(self):
return Float64Index(np.arange(5,dtype='float64'))
- def test_hash_error(self):
- with tm.assertRaisesRegexp(TypeError,
- "unhashable type: %r" %
- type(self.float).__name__):
- hash(self.float)
-
def test_repr_roundtrip(self):
for ind in (self.mixed, self.float):
tm.assert_index_equal(eval(repr(ind)), ind)
@@ -1594,7 +1994,8 @@ class TestInt64Index(Numeric, tm.TestCase):
_multiprocess_can_split_ = True
def setUp(self):
- self.index = Int64Index(np.arange(0, 20, 2))
+ self.indices = dict(index = Int64Index(np.arange(0, 20, 2)))
+ self.setup_indices()
def create_index(self):
return Int64Index(np.arange(5,dtype='int64'))
@@ -1641,18 +2042,14 @@ def test_constructor_corner(self):
with tm.assertRaisesRegexp(TypeError, 'casting'):
Int64Index(arr_with_floats)
- def test_hash_error(self):
- with tm.assertRaisesRegexp(TypeError,
- "unhashable type: %r" %
- type(self.index).__name__):
- hash(self.index)
-
def test_copy(self):
i = Int64Index([], name='Foo')
i_copy = i.copy()
self.assertEqual(i_copy.name, 'Foo')
def test_view(self):
+ super(TestInt64Index, self).test_view()
+
i = Int64Index([], name='Foo')
i_view = i.view()
self.assertEqual(i_view.name, 'Foo')
@@ -2053,6 +2450,7 @@ def test_slice_keep_name(self):
class DatetimeLike(Base):
def test_view(self):
+ super(DatetimeLike, self).test_view()
i = self.create_index()
@@ -2068,6 +2466,10 @@ class TestDatetimeIndex(DatetimeLike, tm.TestCase):
_holder = DatetimeIndex
_multiprocess_can_split_ = True
+ def setUp(self):
+ self.indices = dict(index = tm.makeDateIndex(10))
+ self.setup_indices()
+
def create_index(self):
return date_range('20130101',periods=5)
@@ -2186,6 +2588,10 @@ class TestPeriodIndex(DatetimeLike, tm.TestCase):
_holder = PeriodIndex
_multiprocess_can_split_ = True
+ def setUp(self):
+ self.indices = dict(index = tm.makePeriodIndex(10))
+ self.setup_indices()
+
def create_index(self):
return period_range('20130101',periods=5,freq='D')
@@ -2220,6 +2626,10 @@ class TestTimedeltaIndex(DatetimeLike, tm.TestCase):
_holder = TimedeltaIndex
_multiprocess_can_split_ = True
+ def setUp(self):
+ self.indices = dict(index = tm.makeTimedeltaIndex(10))
+ self.setup_indices()
+
def create_index(self):
return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1)
@@ -2294,9 +2704,10 @@ def setUp(self):
major_labels = np.array([0, 0, 1, 2, 3, 3])
minor_labels = np.array([0, 1, 0, 1, 0, 1])
self.index_names = ['first', 'second']
- self.index = MultiIndex(levels=[major_axis, minor_axis],
- labels=[major_labels, minor_labels],
- names=self.index_names, verify_integrity=False)
+ self.indices = dict(index = MultiIndex(levels=[major_axis, minor_axis],
+ labels=[major_labels, minor_labels],
+ names=self.index_names, verify_integrity=False))
+ self.setup_indices()
def create_index(self):
return self.index
@@ -2332,13 +2743,7 @@ def test_labels_dtypes(self):
self.assertTrue((i.labels[0]>=0).all())
self.assertTrue((i.labels[1]>=0).all())
- def test_hash_error(self):
- with tm.assertRaisesRegexp(TypeError,
- "unhashable type: %r" %
- type(self.index).__name__):
- hash(self.index)
-
- def test_set_names_and_rename(self):
+ def test_set_name_methods(self):
# so long as these are synonyms, we don't need to test set_names
self.assertEqual(self.index.rename, self.index.set_names)
new_names = [name + "SUFFIX" for name in self.index_names]
@@ -3838,7 +4243,7 @@ def test_reindex_level(self):
assertRaisesRegexp(TypeError, "Fill method not supported",
idx.reindex, idx, method='bfill', level='first')
- def test_has_duplicates(self):
+ def test_duplicates(self):
self.assertFalse(self.index.has_duplicates)
self.assertTrue(self.index.append(self.index).has_duplicates)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 5f109212add06..19ed799853ed4 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -1438,6 +1438,13 @@ def test_iloc_setitem_series(self):
result = s.iloc[:4]
assert_series_equal(result, expected)
+ s= Series([-1]*6)
+ s.iloc[0::2]= [0,2,4]
+ s.iloc[1::2]= [1,3,5]
+ result = s
+ expected= Series([0,1,2,3,4,5])
+ assert_series_equal(result, expected)
+
def test_iloc_setitem_list_of_lists(self):
# GH 7551
@@ -2366,6 +2373,7 @@ def test_dups_fancy_indexing(self):
rows = ['C','B','E']
expected = DataFrame({'test' : [11,9,np.nan], 'test1': [7.,6,np.nan], 'other': ['d','c',np.nan]},index=rows)
+
result = df.ix[rows]
assert_frame_equal(result, expected)
@@ -4422,6 +4430,212 @@ def test_indexing_assignment_dict_already_exists(self):
tm.assert_frame_equal(df, expected)
+
+class TestCategoricalIndex(tm.TestCase):
+
+ def setUp(self):
+
+ self.df = DataFrame({'A' : np.arange(6,dtype='int64'),
+ 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B')
+ self.df2 = DataFrame({'A' : np.arange(6,dtype='int64'),
+ 'B' : Series(list('aabbca')).astype('category',categories=list('cabe')) }).set_index('B')
+ self.df3 = DataFrame({'A' : np.arange(6,dtype='int64'),
+ 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=True) }).set_index('B')
+ self.df4 = DataFrame({'A' : np.arange(6,dtype='int64'),
+ 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=False) }).set_index('B')
+
+
+ def test_loc_scalar(self):
+
+ result = self.df.loc['a']
+ expected = DataFrame({'A' : [0,1,5],
+ 'B' : Series(list('aaa')).astype('category',categories=list('cab')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+
+ df = self.df.copy()
+ df.loc['a'] = 20
+ expected = DataFrame({'A' : [20,20,2,3,4,20],
+ 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B')
+ assert_frame_equal(df, expected)
+
+ # value not in the categories
+ self.assertRaises(KeyError, lambda : df.loc['d'])
+
+ def f():
+ df.loc['d'] = 10
+ self.assertRaises(TypeError, f)
+
+ def f():
+ df.loc['d','A'] = 10
+ self.assertRaises(TypeError, f)
+
+ def f():
+ df.loc['d','C'] = 10
+ self.assertRaises(TypeError, f)
+
+ def test_loc_listlike(self):
+
+ # list of labels
+ result = self.df.loc[['c','a']]
+ expected = self.df.iloc[[4,0,1,5]]
+ assert_frame_equal(result, expected)
+
+ result = self.df2.loc[['a','b','e']]
+ expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
+ 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ # element in the categories but not in the values
+ self.assertRaises(KeyError, lambda : self.df2.loc['e'])
+
+ # assign is ok
+ df = self.df2.copy()
+ df.loc['e'] = 20
+ result = df.loc[['a','b','e']]
+ expected = DataFrame({'A' : [0,1,5,2,3,20],
+ 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ df = self.df2.copy()
+ result = df.loc[['a','b','e']]
+ expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
+ 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+
+ # not all labels in the categories
+ self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']])
+
+ def test_reindexing(self):
+
+ # reindexing
+ # convert to a regular index
+ result = self.df2.reindex(['a','b','e'])
+ expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
+ 'B' : Series(list('aaabbe')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['a','b'])
+ expected = DataFrame({'A' : [0,1,5,2,3],
+ 'B' : Series(list('aaabb')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['e'])
+ expected = DataFrame({'A' : [np.nan],
+ 'B' : Series(['e']) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['d'])
+ expected = DataFrame({'A' : [np.nan],
+ 'B' : Series(['d']) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ # since we are actually reindexing with a Categorical
+ # then return a Categorical
+ cats = list('cabe')
+
+ result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats))
+ expected = DataFrame({'A' : [0,1,5,np.nan],
+ 'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(pd.Categorical(['a'],categories=cats))
+ expected = DataFrame({'A' : [0,1,5],
+ 'B' : Series(list('aaa')).astype('category',categories=cats) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['a','b','e'])
+ expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
+ 'B' : Series(list('aaabbe')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['a','b'])
+ expected = DataFrame({'A' : [0,1,5,2,3],
+ 'B' : Series(list('aaabb')) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(['e'])
+ expected = DataFrame({'A' : [np.nan],
+ 'B' : Series(['e']) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ # give back the type of categorical that we received
+ result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True))
+ expected = DataFrame({'A' : [0,1,5,np.nan],
+ 'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d']))
+ expected = DataFrame({'A' : [0,1,5,np.nan],
+ 'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B')
+ assert_frame_equal(result, expected)
+
+ # passed duplicate indexers are not allowed
+ self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a']))
+
+ # args NotImplemented ATM
+ self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],method='ffill'))
+ self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],level=1))
+ self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],limit=2))
+
+ def test_loc_slice(self):
+
+ # slicing
+ # not implemented ATM
+ # GH9748
+
+ self.assertRaises(TypeError, lambda : self.df.loc[1:5])
+
+ #result = df.loc[1:5]
+ #expected = df.iloc[[1,2,3,4]]
+ #assert_frame_equal(result, expected)
+
+ def test_boolean_selection(self):
+
+ df3 = self.df3
+ df4 = self.df4
+
+ result = df3[df3.index == 'a']
+ expected = df3.iloc[[]]
+ assert_frame_equal(result,expected)
+
+ result = df4[df4.index == 'a']
+ expected = df4.iloc[[]]
+ assert_frame_equal(result,expected)
+
+ result = df3[df3.index == 1]
+ expected = df3.iloc[[0,1,3]]
+ assert_frame_equal(result,expected)
+
+ result = df4[df4.index == 1]
+ expected = df4.iloc[[0,1,3]]
+ assert_frame_equal(result,expected)
+
+ # since we have an ordered categorical
+
+ # CategoricalIndex([1, 1, 2, 1, 3, 2],
+ # categories=[3, 2, 1],
+ # ordered=True,
+ # name=u'B')
+ result = df3[df3.index < 2]
+ expected = df3.iloc[[4]]
+ assert_frame_equal(result,expected)
+
+ result = df3[df3.index > 1]
+ expected = df3.iloc[[]]
+ assert_frame_equal(result,expected)
+
+ # unordered
+ # cannot be compared
+
+ # CategoricalIndex([1, 1, 2, 1, 3, 2],
+ # categories=[3, 2, 1],
+ # ordered=False,
+ # name=u'B')
+ self.assertRaises(TypeError, lambda : df4[df4.index < 2])
+ self.assertRaises(TypeError, lambda : df4[df4.index > 1])
+
class TestSeriesNoneCoercion(tm.TestCase):
EXPECTED_RESULTS = [
# For numeric series, we should coerce to NaN.
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index c3b43f3ec70c0..f1a9e23796804 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -242,6 +242,26 @@ def test_dt_accessor_api(self):
s.dt
self.assertFalse(hasattr(s, 'dt'))
+ def test_tab_completion(self):
+ # GH 9910
+ s = Series(list('abcd'))
+ # Series of str values should have .str but not .dt/.cat in __dir__
+ self.assertTrue('str' in dir(s))
+ self.assertTrue('dt' not in dir(s))
+ self.assertTrue('cat' not in dir(s))
+
+ # similarly for .dt
+ s = Series(date_range('1/1/2015', periods=5))
+ self.assertTrue('dt' in dir(s))
+ self.assertTrue('str' not in dir(s))
+ self.assertTrue('cat' not in dir(s))
+
+ # similarly for .cat
+ s = Series(list('abbcd'), dtype="category")
+ self.assertTrue('cat' in dir(s))
+ self.assertTrue('str' not in dir(s))
+ self.assertTrue('dt' not in dir(s))
+
def test_binop_maybe_preserve_name(self):
# names match, preserve
@@ -6851,6 +6871,22 @@ def test_searchsorted_sorter(self):
e = np.array([0, 2])
tm.assert_array_equal(r, e)
+ def test_to_frame_expanddim(self):
+ # GH 9762
+
+ class SubclassedSeries(Series):
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedFrame
+
+ class SubclassedFrame(DataFrame):
+ pass
+
+ s = SubclassedSeries([1, 2, 3], name='X')
+ result = s.to_frame()
+ self.assertTrue(isinstance(result, SubclassedFrame))
+ expected = SubclassedFrame({'X': [1, 2, 3]})
+ assert_frame_equal(result, expected)
class TestSeriesNonUnique(tm.TestCase):
diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py
index bb8bd3df96b71..38f058358b37f 100644
--- a/pandas/tests/test_util.py
+++ b/pandas/tests/test_util.py
@@ -3,6 +3,7 @@
import nose
+import sys
import pandas.util
from pandas.util.decorators import deprecate_kwarg
import pandas.util.testing as tm
@@ -80,6 +81,9 @@ def test_warning(self):
self.assertNotAlmostEquals(1, 2)
def test_locale(self):
+ if sys.platform == 'win32':
+ raise nose.SkipTest("skipping on win platforms as locale not available")
+
#GH9744
locales = pandas.util.testing.get_locales()
self.assertTrue(len(locales) >= 1)
diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 513f165af4686..268bd306585ad 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -1040,7 +1040,10 @@ def _adorn_subplots(self):
if len(self.axes) > 0:
all_axes = self._get_axes()
nrows, ncols = self._get_axes_layout()
- _handle_shared_axes(all_axes, len(all_axes), len(all_axes), nrows, ncols, self.sharex, self.sharey)
+ _handle_shared_axes(axarr=all_axes, nplots=len(all_axes),
+ naxes=nrows * ncols, nrows=nrows,
+ ncols=ncols, sharex=self.sharex,
+ sharey=self.sharey)
for ax in to_adorn:
if self.yticks is not None:
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 2b37c64940170..f15de87dbd81c 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -61,13 +61,13 @@ def groupby(self, f):
return _algos.groupby_object(objs, f)
def _format_with_header(self, header, **kwargs):
- return header + self._format_native_types(**kwargs)
+ return header + list(self._format_native_types(**kwargs))
def __contains__(self, key):
try:
res = self.get_loc(key)
return np.isscalar(res) or type(res) == slice or np.any(res)
- except (KeyError, TypeError):
+ except (KeyError, TypeError, ValueError):
return False
@property
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 7dac36a9ae5cc..7b0ff578b0d90 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -673,12 +673,13 @@ def _add_delta(self, delta):
def _format_native_types(self, na_rep=u('NaT'),
date_format=None, **kwargs):
- data = self.asobject
- from pandas.core.format import Datetime64Formatter
- return Datetime64Formatter(values=data,
- nat_rep=na_rep,
- date_format=date_format,
- justify='all').get_result()
+ from pandas.core.format import _get_format_datetime64_from_values
+ format = _get_format_datetime64_from_values(self, date_format)
+
+ return tslib.format_array_from_datetime(self.asi8,
+ tz=self.tz,
+ format=format,
+ na_rep=na_rep)
def to_datetime(self, dayfirst=False):
return self.copy()
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index b1f0ba1f127fa..a4b754f5a6bbd 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -387,7 +387,7 @@ def to_datetime(self, dayfirst=False):
qyear = _field_accessor('qyear', 1)
days_in_month = _field_accessor('days_in_month', 11, "The number of days in the month")
daysinmonth = days_in_month
-
+
def _get_object_array(self):
freq = self.freq
return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False)
@@ -687,7 +687,7 @@ def _format_native_types(self, na_rep=u('NaT'), **kwargs):
imask = ~mask
values[imask] = np.array([u('%s') % dt for dt in values[imask]])
- return values.tolist()
+ return values
def __array_finalize__(self, obj):
if not self.ndim: # pragma: no cover
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py
index c42802bdb31ad..6bdff5aab3cfd 100644
--- a/pandas/tseries/tests/test_base.py
+++ b/pandas/tseries/tests/test_base.py
@@ -745,6 +745,13 @@ def test_nonunique_contains(self):
['00:01:00', '00:01:00', '00:00:01'])):
tm.assertIn(idx[0], idx)
+ def test_unknown_attribute(self):
+ #GH 9680
+ tdi = pd.timedelta_range(start=0,periods=10,freq='1s')
+ ts = pd.Series(np.random.normal(size=10),index=tdi)
+ self.assertNotIn('foo',ts.__dict__.keys())
+ self.assertRaises(AttributeError,lambda : ts.foo)
+
class TestPeriodIndexOps(Ops):
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index c338bbeae79c7..2ae311e044a75 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -875,23 +875,23 @@ def test_resmaple_dst_anchor(self):
# 5172
dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
df = DataFrame([5], index=dti)
- assert_frame_equal(df.resample(rule='D', how='sum'),
+ assert_frame_equal(df.resample(rule='D', how='sum'),
DataFrame([5], index=df.index.normalize()))
df.resample(rule='MS', how='sum')
assert_frame_equal(df.resample(rule='MS', how='sum'),
- DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
+ DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
tz='US/Eastern')))
dti = date_range('2013-09-30', '2013-11-02', freq='30Min', tz='Europe/Paris')
values = range(dti.size)
df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype='int64')
how = {"a": "min", "b": "max", "c": "count"}
-
+
assert_frame_equal(df.resample("W-MON", how=how)[["a", "b", "c"]],
DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
"b": [47, 383, 719, 1055, 1393, 1586],
"c": [48, 336, 336, 336, 338, 193]},
- index=date_range('9/30/2013', '11/4/2013',
+ index=date_range('9/30/2013', '11/4/2013',
freq='W-MON', tz='Europe/Paris')),
'W-MON Frequency')
@@ -899,7 +899,7 @@ def test_resmaple_dst_anchor(self):
DataFrame({"a": [0, 48, 720, 1394],
"b": [47, 719, 1393, 1586],
"c": [48, 672, 674, 193]},
- index=date_range('9/30/2013', '11/11/2013',
+ index=date_range('9/30/2013', '11/11/2013',
freq='2W-MON', tz='Europe/Paris')),
'2W-MON Frequency')
@@ -907,7 +907,7 @@ def test_resmaple_dst_anchor(self):
DataFrame({"a": [0, 48, 1538],
"b": [47, 1537, 1586],
"c": [48, 1490, 49]},
- index=date_range('9/1/2013', '11/1/2013',
+ index=date_range('9/1/2013', '11/1/2013',
freq='MS', tz='Europe/Paris')),
'MS Frequency')
@@ -915,7 +915,7 @@ def test_resmaple_dst_anchor(self):
DataFrame({"a": [0, 1538],
"b": [1537, 1586],
"c": [1538, 49]},
- index=date_range('9/1/2013', '11/1/2013',
+ index=date_range('9/1/2013', '11/1/2013',
freq='2MS', tz='Europe/Paris')),
'2MS Frequency')
@@ -1553,6 +1553,8 @@ def test_aggregate_with_nat(self):
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_series_equal(expected, dt_result)
+ # GH 9925
+ self.assertEqual(dt_result.index.name, 'key')
# if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' doesn't work yet
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 7580fa5489e15..40dbbd7584c7a 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -1398,6 +1398,69 @@ def parse_datetime_string(date_string, **kwargs):
dt = parse_date(date_string, **kwargs)
return dt
+def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None):
+ """
+ return a np object array of the string formatted values
+
+ Parameters
+ ----------
+ values : a 1-d i8 array
+ tz : the timezone (or None)
+ format : optional, default is None
+ a strftime capable string
+ na_rep : optional, default is None
+ a nat format
+
+ """
+ cdef:
+ int64_t val, ns, N = len(values)
+ ndarray[object] result = np.empty(N, dtype=object)
+ object ts, res
+ pandas_datetimestruct dts
+
+ if na_rep is None:
+ na_rep = 'NaT'
+
+ for i in range(N):
+ val = values[i]
+
+ if val == iNaT:
+ result[i] = na_rep
+ else:
+ if format is None and tz is None:
+
+ pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts)
+ res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year,
+ dts.month,
+ dts.day,
+ dts.hour,
+ dts.min,
+ dts.sec)
+
+ ns = dts.ps / 1000
+
+ if ns != 0:
+ res += '.%.9d' % (ns + 1000 * dts.us)
+ elif dts.us != 0:
+ res += '.%.6d' % dts.us
+
+ result[i] = res
+
+ else:
+ ts = Timestamp(val, tz=tz)
+ if format is None:
+ result[i] = str(ts)
+ else:
+
+ # invalid format string
+ # requires dates > 1900
+ try:
+ result[i] = ts.strftime(format)
+ except ValueError:
+ result[i] = str(ts)
+
+ return result
+
def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
format=None, utc=None, coerce=False, unit=None):
cdef:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index b4baedada46e1..ea7354a9334ff 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -25,11 +25,6 @@
import pandas as pd
from pandas.core.common import is_sequence, array_equivalent, is_list_like
-import pandas.core.index as index
-import pandas.core.series as series
-import pandas.core.frame as frame
-import pandas.core.panel as panel
-import pandas.core.panel4d as panel4d
import pandas.compat as compat
from pandas.compat import(
filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
@@ -38,24 +33,12 @@
from pandas.computation import expressions as expr
-from pandas import bdate_range
-from pandas.tseries.index import DatetimeIndex
-from pandas.tseries.tdi import TimedeltaIndex
-from pandas.tseries.period import PeriodIndex
+from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
+ Index, MultiIndex, Series, DataFrame, Panel, Panel4D)
from pandas.util.decorators import deprecate
-
from pandas import _testing
-
-
from pandas.io.common import urlopen
-Index = index.Index
-MultiIndex = index.MultiIndex
-Series = series.Series
-DataFrame = frame.DataFrame
-Panel = panel.Panel
-Panel4D = panel4d.Panel4D
-
N = 30
K = 4
_RAISE_NETWORK_ERROR_DEFAULT = False
@@ -550,16 +533,14 @@ def assert_equal(a, b, msg=""):
assert a == b, "%s: %r != %r" % (msg.format(a,b), a, b)
-def assert_index_equal(left, right):
+def assert_index_equal(left, right, exact=False):
assert_isinstance(left, Index, '[index] ')
assert_isinstance(right, Index, '[index] ')
- if not left.equals(right):
+ if not left.equals(right) or (exact and type(left) != type(right)):
raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype,
left,
right,
right.dtype))
-
-
def assert_attr_equal(attr, left, right):
"""checks attributes are equal. Both objects must have attribute."""
left_attr = getattr(left, attr)
@@ -627,6 +608,7 @@ def assertNotIsInstance(obj, cls, msg=''):
def assert_categorical_equal(res, exp):
+
if not array_equivalent(res.categories, exp.categories):
raise AssertionError(
'categories not equivalent: {0} vs {1}.'.format(res.categories,
@@ -827,6 +809,11 @@ def makeStringIndex(k=10):
def makeUnicodeIndex(k=10):
return Index(randu_array(nchars=10, size=k))
+def makeCategoricalIndex(k=10, n=3):
+ """ make a length k index of n categories """
+ x = rands_array(nchars=4, size=n)
+ return CategoricalIndex(np.random.choice(x,k))
+
def makeBoolIndex(k=10):
if k == 1:
return Index([True])
diff --git a/setup.py b/setup.py
index 8066a6e0cae4f..f0090aff31430 100755
--- a/setup.py
+++ b/setup.py
@@ -602,7 +602,7 @@ def pxd(name):
],
package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
'tests/data/legacy_pickle/*/*.pickle',
- 'tests/data/*.csv',
+ 'tests/data/*.csv*',
'tests/data/*.dta',
'tests/data/*.txt',
'tests/data/*.xls',