diff --git a/.travis.yml b/.travis.yml index bc87853b26d6e..0d143d7f7133b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,6 +22,7 @@ matrix: - LOCALE_OVERRIDE="it_IT.UTF-8" - BUILD_TYPE=conda - JOB_NAME: "26_nslow_nnet" + - INSTALL_TEST=true - python: 2.7 env: - NOSE_ARGS="slow and not network and not disabled" @@ -183,6 +184,7 @@ script: # nothing here, or failed tests won't fail travis after_script: + - ci/install_test.sh - if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi - source activate pandas && ci/print_versions.py - ci/print_skipped.py /tmp/nosetests.xml diff --git a/README.md b/README.md index cea7e8c6bfd72..c76fbe7df9e6b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # pandas: powerful Python data analysis toolkit -![Travis-CI Build Status](https://travis-ci.org/pydata/pandas.svg) +[![Build Status](https://travis-ci.org/pydata/pandas.svg?branch=master)](https://travis-ci.org/pydata/pandas) ## What is it diff --git a/ci/install_test.sh b/ci/install_test.sh new file mode 100755 index 0000000000000..e01ad7b94a349 --- /dev/null +++ b/ci/install_test.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +echo "inside $0" + +if [ "$INSTALL_TEST" ]; then + source activate pandas + echo "Starting installation test." + conda uninstall cython || exit 1 + python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 + pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 + nosetests --exe -A "$NOSE_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml +else + echo "Skipping installation test." +fi +RET="$?" + +exit "$RET" diff --git a/doc/_templates/api_redirect.html b/doc/_templates/api_redirect.html new file mode 100644 index 0000000000000..24bdd8363830f --- /dev/null +++ b/doc/_templates/api_redirect.html @@ -0,0 +1,15 @@ +{% set pgn = pagename.split('.') -%} +{% if pgn[-2][0].isupper() -%} + {% set redirect = ["pandas", pgn[-2], pgn[-1], 'html']|join('.') -%} +{% else -%} + {% set redirect = ["pandas", pgn[-1], 'html']|join('.') -%} +{% endif -%}
+<html>
+  <head>
+    <meta http-equiv="refresh" content="0; url={{ redirect }}"/>
+    <title>This API page has moved</title>
+  </head>
+  <body>
+    <p>This API page has moved <a href="{{ redirect }}">here</a>.</p>
+  </body>
+</html>
\ No newline at end of file diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 1749409c863df..688935c6b104d 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -594,6 +594,95 @@ faster than fancy indexing. timeit ser.ix[indexer] timeit ser.take(indexer) +.. _indexing.categoricalindex: + +CategoricalIndex +---------------- + +.. versionadded:: 0.16.1 + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to a regular object-based ``Index``. + +.. ipython:: python + + df = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('cab')) + }) + df + df.dtypes + df.B.cat.categories + +Setting the index will create a ``CategoricalIndex`` + +.. ipython:: python + + df2 = df.set_index('B') + df2.index + +Indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython:: python + + df2.loc['a'] + +These PRESERVE the ``CategoricalIndex`` + +.. ipython:: python + + df2.loc['a'].index + +Sorting will order by the order of the categories + +.. ipython:: python + + df2.sort_index() + +Groupby operations on the index will preserve the index nature as well + +.. ipython:: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +Reindexing operations will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old ``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. ipython:: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +.. warning:: + + Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories + or a ``TypeError`` will be raised. + + .. code-block:: python + + In [10]: df3 = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('abc')) + }).set_index('B') + + In [11]: df3.index + Out[11]: + CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], + categories=[u'a', u'b', u'c'], + ordered=False) + + In [12]: pd.concat([df2, df3]) + TypeError: categories must match existing categories when appending
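+
+A minimal sketch of one way around this (illustrative only; it relies on the
+``set_categories`` method listed for ``CategoricalIndex`` in the API reference):
+give both frames the same categories, in the same order, before appending.
+
+.. code-block:: python
+
+   # hypothetical fix-up: align df3's categories with df2's, then concat
+   df3.index = df3.index.set_categories(list('cab'))
+   pd.concat([df2, df3])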
.. _indexing.float64index: Float64Index ------------ @@ -706,4 +795,3 @@ Of course if you need integer based selection, then use ``iloc`` .. ipython:: python dfir.iloc[0:5] - diff --git a/doc/source/api.rst b/doc/source/api.rst index af9f8c84388bd..87e9b20f97e69 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -553,6 +553,7 @@ strings and apply several methods to it. These can be accessed like Series.str.swapcase Series.str.title Series.str.upper + Series.str.wrap Series.str.zfill Series.str.isalnum Series.str.isalpha @@ -1291,6 +1292,34 @@ Selecting Index.slice_indexer Index.slice_locs +.. _api.categoricalindex: + +CategoricalIndex +---------------- + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex + +Categorical Components +~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex.codes + CategoricalIndex.categories + CategoricalIndex.ordered + CategoricalIndex.rename_categories + CategoricalIndex.reorder_categories + CategoricalIndex.add_categories + CategoricalIndex.remove_categories + CategoricalIndex.remove_unused_categories + CategoricalIndex.set_categories + CategoricalIndex.as_ordered + CategoricalIndex.as_unordered + .. _api.datetimeindex: DatetimeIndex @@ -1521,230 +1550,3 @@ Working with options get_option set_option option_context - - -.. - HACK - see github issue #4539. To ensure old links remain valid, include - here the autosummaries with previous currentmodules as a comment and add - them to a hidden toctree (to avoid warnings): - -.. toctree:: - :hidden: - - generated/pandas.core.common.isnull - generated/pandas.core.common.notnull - generated/pandas.core.reshape.get_dummies - generated/pandas.io.clipboard.read_clipboard - generated/pandas.io.excel.ExcelFile.parse - generated/pandas.io.excel.read_excel - generated/pandas.io.html.read_html - generated/pandas.io.json.read_json - generated/pandas.io.parsers.read_csv - generated/pandas.io.parsers.read_fwf - generated/pandas.io.parsers.read_table - generated/pandas.io.pickle.read_pickle - generated/pandas.io.pytables.HDFStore.append - generated/pandas.io.pytables.HDFStore.get - generated/pandas.io.pytables.HDFStore.put - generated/pandas.io.pytables.HDFStore.select - generated/pandas.io.pytables.read_hdf - generated/pandas.io.sql.read_sql - generated/pandas.io.sql.read_frame - generated/pandas.io.sql.write_frame - generated/pandas.io.stata.read_stata - generated/pandas.stats.moments.ewma - generated/pandas.stats.moments.ewmcorr - generated/pandas.stats.moments.ewmcov - generated/pandas.stats.moments.ewmstd - generated/pandas.stats.moments.ewmvar - generated/pandas.stats.moments.expanding_apply - generated/pandas.stats.moments.expanding_corr - generated/pandas.stats.moments.expanding_count - generated/pandas.stats.moments.expanding_cov - generated/pandas.stats.moments.expanding_kurt - generated/pandas.stats.moments.expanding_mean - generated/pandas.stats.moments.expanding_median - generated/pandas.stats.moments.expanding_quantile - generated/pandas.stats.moments.expanding_skew - generated/pandas.stats.moments.expanding_std - generated/pandas.stats.moments.expanding_sum - generated/pandas.stats.moments.expanding_var - generated/pandas.stats.moments.rolling_apply - generated/pandas.stats.moments.rolling_corr - generated/pandas.stats.moments.rolling_count - generated/pandas.stats.moments.rolling_cov - generated/pandas.stats.moments.rolling_kurt - generated/pandas.stats.moments.rolling_mean - generated/pandas.stats.moments.rolling_median - generated/pandas.stats.moments.rolling_quantile - generated/pandas.stats.moments.rolling_skew - generated/pandas.stats.moments.rolling_std - generated/pandas.stats.moments.rolling_sum - generated/pandas.stats.moments.rolling_var - generated/pandas.tools.merge.concat - generated/pandas.tools.merge.merge - generated/pandas.tools.pivot.pivot_table - generated/pandas.tseries.tools.to_datetime - -.. - .. currentmodule:: pandas.io.pickle - - .. autosummary:: - :toctree: generated/ - - read_pickle - - .. currentmodule:: pandas.io.parsers - - .. autosummary:: - :toctree: generated/ - - read_table - read_csv - read_fwf - - .. 
currentmodule:: pandas.io.clipboard - - .. autosummary:: - :toctree: generated/ - - read_clipboard - - .. currentmodule:: pandas.io.excel - - .. autosummary:: - :toctree: generated/ - - read_excel - ExcelFile.parse - - .. currentmodule:: pandas.io.json - - .. autosummary:: - :toctree: generated/ - - read_json - - .. currentmodule:: pandas.io.html - - .. autosummary:: - :toctree: generated/ - - read_html - - .. currentmodule:: pandas.io.pytables - - .. autosummary:: - :toctree: generated/ - - read_hdf - HDFStore.put - HDFStore.append - HDFStore.get - HDFStore.select - - .. currentmodule:: pandas.io.sql - - .. autosummary:: - :toctree: generated/ - - read_sql - read_frame - write_frame - - .. currentmodule:: pandas.io.stata - - .. autosummary:: - :toctree: generated/ - - read_stata - StataReader.data - StataReader.data_label - StataReader.value_labels - StataReader.variable_labels - StataWriter.write_file - - .. currentmodule:: pandas.tools.pivot - - .. autosummary:: - :toctree: generated/ - - pivot_table - - .. currentmodule:: pandas.tools.merge - - .. autosummary:: - :toctree: generated/ - - merge - concat - - .. currentmodule:: pandas.core.reshape - - .. autosummary:: - :toctree: generated/ - - get_dummies - - .. currentmodule:: pandas.core.common - - .. autosummary:: - :toctree: generated/ - - isnull - notnull - - .. currentmodule:: pandas.tseries.tools - - .. autosummary:: - :toctree: generated/ - - to_datetime - - - .. currentmodule:: pandas.stats.moments - - .. autosummary:: - :toctree: generated/ - - rolling_count - rolling_sum - rolling_mean - rolling_median - rolling_var - rolling_std - rolling_corr - rolling_cov - rolling_skew - rolling_kurt - rolling_apply - rolling_quantile - - - .. currentmodule:: pandas.stats.moments - - .. autosummary:: - :toctree: generated/ - - expanding_count - expanding_sum - expanding_mean - expanding_median - expanding_var - expanding_std - expanding_corr - expanding_cov - expanding_skew - expanding_kurt - expanding_apply - expanding_quantile - - - .. autosummary:: - :toctree: generated/ - - ewma - ewmstd - ewmvar - ewmcorr - ewmcov diff --git a/doc/source/conf.py b/doc/source/conf.py index fcb9c3fdd0016..08fc8483762ab 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -211,7 +211,30 @@ # Additional templates that should be rendered to pages, maps page names to # template names. 
-# html_additional_pages = {} + +# Add redirect for previously existing API pages (which are now included in +# the API pages as top-level functions) based on a template (GH9911) +moved_api_pages = [ + 'pandas.core.common.isnull', 'pandas.core.common.notnull', 'pandas.core.reshape.get_dummies', + 'pandas.tools.merge.concat', 'pandas.tools.merge.merge', 'pandas.tools.pivot.pivot_table', + 'pandas.tseries.tools.to_datetime', 'pandas.io.clipboard.read_clipboard', 'pandas.io.excel.ExcelFile.parse', + 'pandas.io.excel.read_excel', 'pandas.io.html.read_html', 'pandas.io.json.read_json', + 'pandas.io.parsers.read_csv', 'pandas.io.parsers.read_fwf', 'pandas.io.parsers.read_table', + 'pandas.io.pickle.read_pickle', 'pandas.io.pytables.HDFStore.append', 'pandas.io.pytables.HDFStore.get', + 'pandas.io.pytables.HDFStore.put', 'pandas.io.pytables.HDFStore.select', 'pandas.io.pytables.read_hdf', + 'pandas.io.sql.read_sql', 'pandas.io.sql.read_frame', 'pandas.io.sql.write_frame', + 'pandas.io.stata.read_stata', 'pandas.stats.moments.ewma', 'pandas.stats.moments.ewmcorr', + 'pandas.stats.moments.ewmcov', 'pandas.stats.moments.ewmstd', 'pandas.stats.moments.ewmvar', + 'pandas.stats.moments.expanding_apply', 'pandas.stats.moments.expanding_corr', 'pandas.stats.moments.expanding_count', + 'pandas.stats.moments.expanding_cov', 'pandas.stats.moments.expanding_kurt', 'pandas.stats.moments.expanding_mean', + 'pandas.stats.moments.expanding_median', 'pandas.stats.moments.expanding_quantile', 'pandas.stats.moments.expanding_skew', + 'pandas.stats.moments.expanding_std', 'pandas.stats.moments.expanding_sum', 'pandas.stats.moments.expanding_var', + 'pandas.stats.moments.rolling_apply', 'pandas.stats.moments.rolling_corr', 'pandas.stats.moments.rolling_count', + 'pandas.stats.moments.rolling_cov', 'pandas.stats.moments.rolling_kurt', 'pandas.stats.moments.rolling_mean', + 'pandas.stats.moments.rolling_median', 'pandas.stats.moments.rolling_quantile', 'pandas.stats.moments.rolling_skew', + 'pandas.stats.moments.rolling_std', 'pandas.stats.moments.rolling_sum', 'pandas.stats.moments.rolling_var'] + +html_additional_pages = {'generated/' + page: 'api_redirect.html' for page in moved_api_pages} # If false, no module index is generated. html_use_modindex = True
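A worked example (editorial aside, not part of the patch) of how the ``api_redirect.html`` template added above resolves the redirect target for a moved page:

.. code-block:: python

   # plain functions drop the intermediate module path:
   pgn = 'pandas.core.common.isnull'.split('.')
   pgn[-2][0].isupper()   # False -> redirects to 'pandas.isnull.html'

   # methods keep their class name, recognised by its uppercase first letter:
   pgn = 'pandas.io.excel.ExcelFile.parse'.split('.')
   pgn[-2][0].isupper()   # True -> redirects to 'pandas.ExcelFile.parse.html'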
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 0e6386955a653..f69f926296020 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1006,6 +1006,9 @@ The :ref:`HDFStores ` docs `Merging on-disk tables with millions of rows `__ +`Avoiding inconsistencies when writing to a store from multiple processes/threads +`__ + De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from csv file and creating a store by chunks, with date parsing as well. `See here diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index e6b735173110b..d007446a5b922 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -66,7 +66,7 @@ Here's the function in pure python: s += f(a + i * dx) return s * dx -We achieve our result by by using ``apply`` (row-wise): +We achieve our result by using ``apply`` (row-wise): .. ipython:: python @@ -86,7 +86,7 @@ hence we'll concentrate our efforts cythonizing these two functions. .. note:: In python 2 replacing the ``range`` with its generator counterpart (``xrange``) - would mean the ``range`` line would vanish. In python 3 range is already a generator. + would mean the ``range`` line would vanish. In python 3 ``range`` is already a generator. .. _enhancingperf.plain: @@ -248,7 +248,7 @@ efforts here. More advanced techniques ~~~~~~~~~~~~~~~~~~~~~~~~ -There is still scope for improvement, here's an example of using some more +There is still hope for improvement. Here's an example of using some more advanced cython techniques: .. ipython:: @@ -373,7 +373,7 @@ This Python syntax is **not** allowed: :func:`~pandas.eval` Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`pandas.eval` works well with expressions containing large arrays +:func:`pandas.eval` works well with expressions containing large arrays. First let's create a few decent-sized arrays to play with: diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 467ec02b55f20..20762e3fc039f 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -369,3 +369,4 @@ just a thin layer around the ``QTableView``. mw = MainWidget() mw.show() app.exec_() + diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 9418ca5265f1a..bc1189a8961d6 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -95,3 +95,155 @@ constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +.. _: + +Subclassing pandas Data Structures +---------------------------------- + +.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. + + 1. Monkey-patching: See :ref:`Adding Features to your pandas Installation `. + + 2. Use *composition*. See `here `_. + +This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention: + +1. Override constructor properties. +2. Define original properties. + +.. note:: You can find a nice example in the `geopandas `_ project. + +Override Constructor Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each data structure has constructor properties to specify data constructors. By overriding these properties, you can retain defined classes through ``pandas`` data manipulations. + +There are 3 constructors to be defined: + +- ``_constructor``: Used when a manipulation result has the same dimensions as the original. +- ``_constructor_sliced``: Used when a manipulation result has one lower dimension than the original, such as slicing a single ``DataFrame`` column. +- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. + +The following table shows how ``pandas`` data structures define constructor properties by default. + +=========================== ======================= =================== ======================= +Property Attributes ``Series`` ``DataFrame`` ``Panel`` +=========================== ======================= =================== ======================= +``_constructor`` ``Series`` ``DataFrame`` ``Panel`` +``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` +``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` +=========================== ======================= =================== ======================= + +The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame``, overriding the constructor properties. + +.. 
code-block:: python + + class SubclassedSeries(Series): + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + +.. code-block:: python + + >>> s = SubclassedSeries([1, 2, 3]) + >>> type(s) + <class '__main__.SubclassedSeries'> + + >>> to_framed = s.to_frame() + >>> type(to_framed) + <class '__main__.SubclassedDataFrame'> + + >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> type(df) + <class '__main__.SubclassedDataFrame'> + + >>> sliced1 = df[['A', 'B']] + >>> sliced1 + A B + 0 1 4 + 1 2 5 + 2 3 6 + >>> type(sliced1) + <class '__main__.SubclassedDataFrame'> + + >>> sliced2 = df['A'] + >>> sliced2 + 0 1 + 1 2 + 2 3 + Name: A, dtype: int64 + >>> type(sliced2) + <class '__main__.SubclassedSeries'> + +Define Original Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: + +1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. +2. Define ``_metadata`` for normal properties which will be passed to manipulation results. + +Below is an example defining 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property. + +.. code-block:: python + + class SubclassedDataFrame2(DataFrame): + + # temporary properties + _internal_names = DataFrame._internal_names + ['internal_cache'] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ['added_property'] + + @property + def _constructor(self): + return SubclassedDataFrame2 + +.. code-block:: python + + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.internal_cache = 'cached' + >>> df.added_property = 'property' + + >>> df.internal_cache + cached + >>> df.added_property + property + + # properties defined in _internal_names are reset after manipulation + >>> df[['A', 'B']].internal_cache + AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' + + # properties defined in _metadata are retained + >>> df[['A', 'B']].added_property + property +
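+As a further illustrative check (editorial example, not in the committed docs), everyday manipulations keep returning the subclasses defined above:
+
+.. code-block:: python
+
+   >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+   >>> type(df.head(2))    # _constructor preserves the frame subclass
+   <class '__main__.SubclassedDataFrame'>
+   >>> type(df['A'] + 1)   # _constructor_sliced preserves the series subclass
+   <class '__main__.SubclassedSeries'>
+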
diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index ac9b6c9aecc4a..65fcf600cdfd2 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -49,7 +49,7 @@ Yahoo! Finance import datetime start = datetime.datetime(2010, 1, 1) end = datetime.datetime(2013, 1, 27) - f=web.DataReader("F", 'yahoo', start, end) + f = web.DataReader("F", 'yahoo', start, end) f.ix['2010-01-04'] .. _remote_data.yahoo_options: @@ -58,10 +58,10 @@ Yahoo! Finance Options ---------------------- ***Experimental*** -The Options class allows the download of options data from Yahoo! Finance. +The ``Options`` class allows the download of options data from Yahoo! Finance. The ``get_all_data`` method downloads and caches option data for all expiry months -and provides a formatted ``DataFrame`` with a hierarchical index, so its easy to get +and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get to the specific option you want. .. ipython:: python @@ -71,10 +71,10 @@ data = aapl.get_all_data() data.iloc[0:5, 0:5] - #Show the $100 strike puts at all expiry dates: + # Show the $100 strike puts at all expiry dates: data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] - #Show the volume traded of $100 strike puts at all expiry dates: + # Show the volume traded of $100 strike puts at all expiry dates: data.loc[(100, slice(None), 'put'),'Vol'].head() If you don't want to download all the data, more specific requests can be made. @@ -121,7 +121,7 @@ Google Finance import datetime start = datetime.datetime(2010, 1, 1) end = datetime.datetime(2013, 1, 27) - f=web.DataReader("F", 'google', start, end) + f = web.DataReader("F", 'google', start, end) f.ix['2010-01-04'] .. _remote_data.fred: @@ -152,7 +152,7 @@ Dataset names are listed at `Fama/French Data Library .. ipython:: python import pandas.io.data as web - ip=web.DataReader("5_Industry_Portfolios", "famafrench") + ip = web.DataReader("5_Industry_Portfolios", "famafrench") ip[4].ix[192607] .. _remote_data.wb: @@ -302,9 +302,8 @@ Problematic Country Codes & Indicators :func:`wb.download()` is more flexible. To achieve this, the warning and exception logic changed. -The world bank converts some country codes, -in their response, which makes error checking by pandas difficult. -Retired indicators still persist in the search. +The World Bank converts some country codes in their response, which makes error +checking by pandas difficult. Retired indicators still persist in the search. Given the new flexibility of 0.15.1, improved error handling by the user may be necessary for fringe cases. @@ -377,13 +376,13 @@ The following will fetch users and pageviews (metrics) data per day of the week, filters = "pagePath=~aboutus;ga:country==France", ) -The only mandatory arguments are ``metrics,`` ``dimensions`` and ``start_date``. We can only strongly recommend you to always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics. +The only mandatory arguments are ``metrics``, ``dimensions`` and ``start_date``. We strongly recommend that you always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics. The ``index_col`` argument indicates which dimension(s) has to be taken as index. -The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page has URL has to contain ``aboutus`` AND the visitors country has to be France. +The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page URL has to contain ``aboutus`` AND the visitor's country has to be France. 
-Detailed informations in the followings: +Detailed information can be found in the following: * `pandas & google analytics, by yhat `__ * `Google Analytics integration in pandas, by Chang She `__ diff --git a/doc/source/text.rst b/doc/source/text.rst index f417f56f51fbc..dea40fb48748d 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -266,7 +266,7 @@ Method Summary :meth:`~Series.str.upper`,Equivalent to ``str.upper`` :meth:`~Series.str.find`,Equivalent to ``str.find`` :meth:`~Series.str.rfind`,Equivalent to ``str.rfind`` - :meth:`~Series.str.capicalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` @@ -276,4 +276,4 @@ Method Summary :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isnumeric`,Equivalent to ``str.isdecimal`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 786a46d343be1..8215414e425fe 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -29,13 +29,13 @@ Time Deltas Starting in v0.15.0, we introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes. -Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds. +Timedeltas are differences in times, expressed in different units, e.g. days, hours, minutes, seconds. They can be both positive and negative. Parsing ------- -You can construct a ``Timedelta`` scalar thru various arguments: +You can construct a ``Timedelta`` scalar through various arguments: .. ipython:: python @@ -46,7 +46,7 @@ You can construct a ``Timedelta`` scalar through various arguments: Timedelta('-1 days 2 min 3us') # like datetime.timedelta - # note: these MUST be specified as keyword argments + # note: these MUST be specified as keyword arguments Timedelta(days=1,seconds=1) # integers with a unit @@ -100,7 +100,7 @@ It will construct Series if the input is a Series, a scalar if the input is scal Operations ---------- -You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series thru +You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series through subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. ipython:: python @@ -290,7 +290,7 @@ TimedeltaIndex .. versionadded:: 0.15.0 -To generate an index with time delta, you can use either the TimedeltaIndex or +To generate an index with time delta, you can use either the ``TimedeltaIndex`` or the ``timedelta_range`` constructor. Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 9d4cba2e5ee8c..43fa6ea759b33 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -267,7 +267,7 @@ You can pass other keywords supported by matplotlib ``hist``. For example, horiz plt.close('all') See the :meth:`hist ` method and the -`matplotlib hist documenation `__ for more. +`matplotlib hist documentation `__ for more. 
The existing interface ``DataFrame.hist`` to plot histogram still can be used. diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 659aa6786b366..b42c22364ef16 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -7,6 +7,10 @@ This is a minor bug-fix release from 0.16.0 and includes a large number of bug fixes along with several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. +Highlights include: + +- Support for a ``CategoricalIndex``, a category-based index, see :ref:`here ` + .. contents:: What's new in v0.16.1 :local: :backlinks: none @@ -20,17 +24,18 @@ Enhancements - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as standard ``str`` (:issue:`9766`) - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) - The `.str` accessor is now available for both `Series` and `Index`. + The ``.str`` accessor is now available for both ``Series`` and ``Index``. .. ipython:: python idx = Index([' jack', 'jill ', ' jesse ', 'frank']) idx.str.strip() - One special case for the `.str` accessor on `Index` is that if a string method returns `bool`, the `.str` accessor - will return a `np.array` instead of a boolean `Index` (:issue:`8875`). This enables the following expression + One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor + will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression to work naturally: + .. ipython:: python idx = Index(['a1', 'a2', 'b1', 'b2']) @@ -40,7 +45,8 @@ Enhancements s[s.index.str.startswith('a')] - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) + +- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any label does not exist in the target data. (:issue:`6736`) .. ipython:: python @@ -55,6 +61,79 @@ Enhancements - Allow Panel.shift with ``axis='items'`` (:issue:`9890`) - Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) +- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) + +- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) + +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` + +.. _whatsnew_0161.enhancements.categoricalindex: + +CategoricalIndex +^^^^^^^^^^^^^^^^ + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to a regular object-based ``Index``. + +.. 
ipython:: python + + df = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('cab')) + }) + df + df.dtypes + df.B.cat.categories + +setting the index will create a ``CategoricalIndex`` + +.. ipython:: python + + df2 = df.set_index('B') + df2.index + +indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython:: python + + df2.loc['a'] + +and preserves the ``CategoricalIndex`` + +.. ipython:: python + + df2.loc['a'].index + +sorting will order by the order of the categories + +.. ipython:: python + + df2.sort_index() + +groupby operations on the index will preserve the index nature as well + +.. ipython:: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +reindexing operations will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old ``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. ipython:: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +See the :ref:`documentation ` for more. (:issue:`7629`) .. _whatsnew_0161.api: @@ -87,7 +166,8 @@ API changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - +- Improved csv write performance with mixed dtypes, including datetimes, by up to 5x (:issue:`9940`) +- Improved csv write performance generally by 2x (:issue:`9940`) @@ -99,15 +179,19 @@ Bug Fixes - Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated. - Bug in json serialization when frame has length zero.(:issue:`9805`) -- Bug in `read_csv` where missing trailing delimiters would cause segfault. (:issue:`5664`) +- Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`) - Bug in retaining index name on appending (:issue:`9862`) - Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) - Fixed bug in ``StataWriter`` resulting in changes to input ``DataFrame`` upon save (:issue:`9795`). - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) - Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`) + - Bug in ``read_sql_table`` error when reading postgres table with timezone (:issue:`7139`) - Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`) - Bug where ``TimedeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`) + +- Bug in ``groupby.apply()`` that would raise if a passed user-defined function returned only ``None`` for all inputs. (:issue:`9685`) + - Bug in plotting continuously using ``secondary_y`` may not show legend properly. 
(:issue:`9610`, :issue:`9779`) - Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`) - Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`) @@ -118,24 +202,22 @@ Bug Fixes - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`) - Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`) + - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`) - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) -- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9875`) +- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`) - Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) - Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`) -- Bug in unequal comparisons between a ``Series`` of dtype `"category"` and a scalar (e.g. ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``, which wouldn't use the order of the categories but use the lexicographical order. (:issue:`9848`) - - - - - - +- Bug in ``GroupBy.size`` not attaching the index name properly when grouped by ``TimeGrouper`` (:issue:`9925`) +- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) +- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) +- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`) @@ -143,7 +225,7 @@ Bug Fixes - Bug in unequal comparisons between categorical data and a scalar, which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``. This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`) - Bug in DataFrame ``__setitem__`` when right hand side is a dictionary (:issue:`9874`) - Bug in ``where`` when dtype is ``datetime64/timedelta64``, but dtype of other is not (:issue:`9804`) -- Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9875`) +- Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9856`) - Bug in which ``groupby.transform`` incorrectly enforced output dtypes to match input dtypes. (:issue:`9807`) - Bug in bar plot with ``log=True`` raises ``TypeError`` if all values are less than 1 (:issue:`9905`) @@ -161,3 +243,10 @@ Bug Fixes - Changed caching in ``AbstractHolidayCalendar`` to be at the instance level rather than at the class level as the latter can result in unexpected behaviour. 
(:issue:`9552`) - Fixed latex output for multi-indexed dataframes (:issue:`9778`) +- Bug causing an exception when setting an empty range using ``DataFrame.loc`` (:issue:`9596`) + + +- Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`) + +- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`) + diff --git a/pandas/core/api.py b/pandas/core/api.py index a8b10342593ce..fde9bc77c4bd9 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/base.py b/pandas/core/base.py index a25651a73f507..9c27f3c7a2cc3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -86,16 +86,22 @@ def __unicode__(self): # Should be overwritten by base classes return object.__repr__(self) - def _local_dir(self): - """ provide addtional __dir__ for this object """ - return [] + def _dir_additions(self): + """ add additional __dir__ for this object """ + return set() + + def _dir_deletions(self): + """ delete unwanted __dir__ for this object """ + return set() def __dir__(self): """ Provide method name lookup and completion Only provide 'public' methods """ - return list(sorted(list(set(dir(type(self)) + self._local_dir())))) + rv = set(dir(type(self))) + rv = (rv - self._dir_deletions()) | self._dir_additions() + return sorted(rv) def _reset_cache(self, key=None): """ @@ -121,7 +127,7 @@ def _delegate_method(self, name, *args, **kwargs): raise TypeError("You cannot call method {name}".format(name=name)) @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ): + def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): """ add accessors to cls from the delegate class @@ -131,6 +137,8 @@ delegate : the class to get methods/properties & doc-strings accessors : string list of accessors to add typ : 'property' or 'method' + overwrite : boolean, default False + overwrite the method/property in the target class if it exists """ @@ -164,7 +172,7 @@ def f(self, *args, **kwargs): f = _create_delegator_method(name) # don't overwrite existing methods/properties - if not hasattr(cls, name): + if overwrite or not hasattr(cls, name): setattr(cls,name,f) @@ -516,6 +524,16 @@ def _make_str_accessor(self): str = AccessorProperty(StringMethods, _make_str_accessor) + def _dir_additions(self): + return set() + + def _dir_deletions(self): + try: + getattr(self, 'str') + except AttributeError: + return set(['str']) + return set() + _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed
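An illustrative aside (not part of the patch): combined with the ``_dir_deletions`` hook defined for the ``.str`` accessor above, tab-completion now only advertises accessors that actually apply:

.. code-block:: python

   >>> import pandas as pd
   >>> 'str' in dir(pd.Series(['a', 'b']))   # string data: .str is listed
   True
   >>> 'str' in dir(pd.Series([1, 2]))       # numeric data: .str is hidden
   False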
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0d66a89b0a585..caf706fcbcbbd 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -9,12 +9,11 @@ from pandas.core.algorithms import factorize from pandas.core.base import PandasObject, PandasDelegate -from pandas.core.index import Index, _ensure_index -from pandas.tseries.period import PeriodIndex import pandas.core.common as com from pandas.util.decorators import cache_readonly -from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull, +from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, + isnull, notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype, is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like, is_sequence, is_null_slice, is_bool, @@ -22,7 +21,6 @@ _coerce_indexer_dtype, _values_from_object, take_1d) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option -from pandas.core import format as fmt def _cat_compare_op(op): def f(self, other): @@ -86,7 +84,7 @@ def f(self, other): def maybe_to_categorical(array): """ coerce to a categorical if a series is given """ - if isinstance(array, ABCSeries): + if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array.values return array @@ -236,15 +234,17 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # sanitize input if is_categorical_dtype(values): - # we are either a Series or a Categorical - cat = values - if isinstance(values, ABCSeries): - cat = values.values + # we are either a Series or a CategoricalIndex + if isinstance(values, (ABCSeries, ABCCategoricalIndex)): + values = values.values + + if ordered is None: + ordered = values.ordered if categories is None: - categories = cat.categories + categories = values.categories values = values.__array__() - elif isinstance(values, Index): + elif isinstance(values, ABCIndexClass): pass else: @@ -295,11 +295,11 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F warn("Values and categories have different dtypes. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - if is_integer_dtype(values) and (codes == -1).all(): + if len(values) and is_integer_dtype(values) and (codes == -1).all(): warn("None of the categories were found in values. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - self.set_ordered(ordered, inplace=True) + self.set_ordered(ordered or False, inplace=True) self.categories = categories self.name = name self._codes = _coerce_indexer_dtype(codes, categories) @@ -309,11 +309,27 @@ def copy(self): return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) + def astype(self, dtype): + """ coerce this type to another dtype """ + if is_categorical_dtype(dtype): + return self + return np.array(self, dtype=dtype) + @cache_readonly def ndim(self): """Number of dimensions of the Categorical """ return self._codes.ndim + @cache_readonly + def size(self): + """ return the len of myself """ + return len(self) + + @cache_readonly + def itemsize(self): + """ return the size of a single category """ + return self.categories.itemsize + def reshape(self, new_shape, **kwargs): """ compat with .reshape """ return self @@ -395,7 +411,8 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _get_labels(self): - """ Get the category labels (deprecated). + """ + Get the category labels (deprecated). Deprecated, use .codes! 
""" @@ -409,8 +426,10 @@ def _get_labels(self): @classmethod def _validate_categories(cls, categories): - """" Validates that we have good categories """ - if not isinstance(categories, Index): + """ + Validates that we have good categories + """ + if not isinstance(categories, ABCIndexClass): dtype = None if not hasattr(categories, "dtype"): categories = _convert_to_list_like(categories) @@ -421,6 +440,8 @@ def _validate_categories(cls, categories): with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" + + from pandas import Index categories = Index(categories, dtype=dtype) if not categories.is_unique: raise ValueError('Categorical categories must be unique') @@ -687,7 +708,7 @@ def add_categories(self, new_categories, inplace=False): if len(already_included) != 0: msg = "new categories must not include old categories: %s" % str(already_included) raise ValueError(msg) - new_categories = list(self._categories) + (new_categories) + new_categories = list(self._categories) + list(new_categories) new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() cat._categories = new_categories @@ -761,6 +782,8 @@ def remove_unused_categories(self, inplace=False): cat = self if inplace else self.copy() _used = sorted(np.unique(cat._codes)) new_categories = cat.categories.take(_ensure_platform_int(_used)) + + from pandas.core.index import _ensure_index new_categories = _ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) cat._categories = new_categories @@ -790,7 +813,8 @@ def shape(self): return tuple([len(self._codes)]) def __array__(self, dtype=None): - """ The numpy array interface. + """ + The numpy array interface. Returns ------- @@ -799,7 +823,7 @@ def __array__(self, dtype=None): dtype as categorical.categories.dtype """ ret = take_1d(self.categories.values, self._codes) - if dtype and dtype != self.categories.dtype: + if dtype and not is_dtype_equal(dtype,self.categories.dtype): return np.asarray(ret, dtype) return ret @@ -997,7 +1021,7 @@ def get_values(self): """ # if we are a period index, return a string repr - if isinstance(self.categories, PeriodIndex): + if isinstance(self.categories, ABCPeriodIndex): return take_1d(np.array(self.categories.to_native_types(), dtype=object), self._codes) @@ -1243,7 +1267,8 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(np.array(self)) - def _tidy_repr(self, max_vals=10): + def _tidy_repr(self, max_vals=10, footer=True): + """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 head = self[:num]._get_repr(length=False, name=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, @@ -1251,23 +1276,31 @@ def _tidy_repr(self, max_vals=10): footer=False) result = '%s, ..., %s' % (head[:-1], tail[1:]) - result = '%s\n%s' % (result, self._repr_footer()) + if footer: + result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) - def _repr_categories_info(self): - """ Returns a string representation of the footer.""" - + def _repr_categories(self): + """ return the base repr for the categories """ max_categories = (10 if get_option("display.max_categories") == 0 else get_option("display.max_categories")) + from pandas.core import format as fmt category_strs = fmt.format_array(self.categories.get_values(), None) if len(category_strs) > max_categories: num = max_categories // 2 head = category_strs[:num] tail = 
category_strs[-(max_categories - num):] category_strs = head + ["..."] + tail + # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] + return category_strs + + def _repr_categories_info(self): + """ Returns a string representation of the footer.""" + + category_strs = self._repr_categories() levheader = "Categories (%d, %s): " % (len(self.categories), self.categories.dtype) width, height = get_terminal_size() @@ -1299,8 +1332,11 @@ def _repr_footer(self): len(self), self._repr_categories_info()) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): - formatter = fmt.CategoricalFormatter(self, name=name, - length=length, na_rep=na_rep, + from pandas.core import format as fmt + formatter = fmt.CategoricalFormatter(self, + name=name, + length=length, + na_rep=na_rep, footer=footer) result = formatter.to_string() return compat.text_type(result) @@ -1315,9 +1351,9 @@ def __unicode__(self): name=True) else: result = '[], %s' % self._get_repr(name=True, - length=False, - footer=True, - ).replace("\n",", ") + length=False, + footer=True, + ).replace("\n",", ") return result @@ -1358,6 +1394,8 @@ def __setitem__(self, key, value): "categories") rvalue = value if is_list_like(value) else [value] + + from pandas import Index to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set something to np.nan @@ -1516,11 +1554,27 @@ def equals(self, other): ------- are_equal : boolean """ - if not isinstance(other, Categorical): - return False # TODO: should this also test if name is equal? - return (self.categories.equals(other.categories) and self.ordered == other.ordered and - np.array_equal(self._codes, other._codes)) + return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes) + + def is_dtype_equal(self, other): + """ + Returns True if categoricals are the same dtype + same categories, and same ordered + + Parameters + ---------- + other : Categorical + + Returns + ------- + are_equal : boolean + """ + + try: + return self.categories.equals(other.categories) and self.ordered == other.ordered + except (AttributeError, TypeError): + return False def describe(self): """ Describes this Categorical @@ -1604,18 +1658,20 @@ def _delegate_method(self, name, *args, **kwargs): ##### utility routines ##### def _get_codes_for_values(values, categories): - """" + """ utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != categories.dtype: + if not is_dtype_equal(values.dtype,categories.dtype): values = _ensure_object(values) categories = _ensure_object(categories) + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(categories)) - t.map_locations(_values_from_object(categories)) - return _coerce_indexer_dtype(t.lookup(values), categories) + (_, _), cats = _get_data_algo(categories, _hashtables) + t = hash_klass(len(cats)) + t.map_locations(cats) + return _coerce_indexer_dtype(t.lookup(vals), cats) def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): diff --git a/pandas/core/common.py b/pandas/core/common.py index ffe12d0c1546c..3d23aeff942dc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -83,6 +83,16 @@ def _check(cls, inst): ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)) ABCTimedeltaIndex = 
create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) +ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", + "int64index", + "float64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex")) + ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) @@ -2455,11 +2465,27 @@ def _get_dtype_type(arr_or_dtype): return np.dtype(arr_or_dtype).type elif isinstance(arr_or_dtype, CategoricalDtype): return CategoricalDtypeType + elif isinstance(arr_or_dtype, compat.string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type except AttributeError: raise ValueError('%r is not a dtype' % arr_or_dtype) +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + source = _get_dtype_type(source) + target = _get_dtype_type(target) + + try: + return source == target + except TypeError: + + # invalid comparison + # object == category will hit this + return False def is_any_int_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) diff --git a/pandas/core/format.py b/pandas/core/format.py index 06e1fab27cd6d..6e632e6ea741b 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -14,15 +14,14 @@ from pandas.core.config import get_option, set_option import pandas.core.common as com import pandas.lib as lib -from pandas.tslib import iNaT, Timestamp, Timedelta - +from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex import numpy as np import itertools import csv -from pandas.tseries.period import PeriodIndex, DatetimeIndex - docstring_to_string = """ Parameters ---------- @@ -1259,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes @@ -1270,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) # save it self.cols = cols @@ -1371,8 +1372,10 @@ def strftime_with_nulls(x): values = self.obj.copy() values.index = data_index values.columns = values.columns.to_native_types( - na_rep=na_rep, float_format=float_format, - date_format=date_format) + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting) values = values[cols] series = {} @@ -1543,18 +1546,22 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i, end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, + d = 
b.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, + ix = data_index.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) @@ -2030,16 +2037,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): self.date_format = date_format def _format_strings(self): - formatter = (self.formatter or - _get_format_datetime64_from_values(self.values, - nat_rep=self.nat_rep, - date_format=self.date_format)) - fmt_values = [formatter(x) for x in self.values] + # we may have a tz, if so, then need to process element-by-element + # when DatetimeBlockWithTimezones is a reality this could be fixed + values = self.values + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + if values.tz is None: + fmt_values = format_array_from_datetime(values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep).reshape(values.shape) + fmt_values = fmt_values.tolist() + + else: + + values = values.asobject + is_dates_only = _is_dates_only(values) + formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) + fmt_values = [ formatter(x) for x in self.values ] return fmt_values +def _is_dates_only(values): + # return a boolean if we are only dates (and don't have a timezone) + values = DatetimeIndex(values) + if values.tz is not None: + return False + + values_int = values.asi8 + consider_values = values_int != iNaT + one_day_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + if even_days: + return True + return False + def _format_datetime64(x, tz=None, nat_rep='NaT'): if x is None or lib.checknull(x): return nat_rep @@ -2062,22 +2096,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): else: return x._date_repr - -def _is_dates_only(values): - # return a boolean if we are only dates (and don't have a timezone) - from pandas import DatetimeIndex - values = DatetimeIndex(values) - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 - if even_days: - return True - return False - def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): if is_dates_only: @@ -2088,13 +2106,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) -def _get_format_datetime64_from_values(values, - nat_rep='NaT', - date_format=None): +def _get_format_datetime64_from_values(values, date_format): + """ given values and a date_format, return a string format """ is_dates_only = _is_dates_only(values) - return _get_format_datetime64(is_dates_only=is_dates_only, - nat_rep=nat_rep, - date_format=date_format) + if is_dates_only: + return date_format or "%Y-%m-%d" + return None class Timedelta64Formatter(GenericArrayFormatter): diff --git 
a/pandas/core/frame.py b/pandas/core/frame.py index 4f7bc11cbf03c..272c401c18761 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -191,6 +191,11 @@ def _constructor(self): _constructor_sliced = Series + @property + def _constructor_expanddim(self): + from pandas.core.panel import Panel + return Panel + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: @@ -1061,8 +1066,6 @@ def to_panel(self): ------- panel : Panel """ - from pandas.core.panel import Panel - # only support this kind for now if (not isinstance(self.index, MultiIndex) or # pragma: no cover len(self.index.levels) != 2): @@ -1100,7 +1103,7 @@ def to_panel(self): shape=shape, ref_items=selfsorted.columns) - return Panel(new_mgr) + return self._constructor_expanddim(new_mgr) to_wide = deprecate('to_wide', to_panel) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8bd85a008f077..e58bdbfa346a4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -146,15 +146,19 @@ def __unicode__(self): prepr = '[%s]' % ','.join(map(com.pprint_thing, self)) return '%s(%s)' % (self.__class__.__name__, prepr) - def _local_dir(self): + def _dir_additions(self): """ add the string-like attributes from the info_axis """ - return [c for c in self._info_axis - if isinstance(c, string_types) and isidentifier(c)] + return set([c for c in self._info_axis + if isinstance(c, string_types) and isidentifier(c)]) @property def _constructor_sliced(self): raise AbstractMethodError(self) + @property + def _constructor_expanddim(self): + raise NotImplementedError + #---------------------------------------------------------------------- # Axis @@ -3337,10 +3341,18 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, matches = (new_other == np.array(other)) if matches is False or not matches.all(): - other = np.array(other) + + # coerce other to a common dtype if we can + if com.needs_i8_conversion(self.dtype): + try: + other = np.array(other, dtype=self.dtype) + except: + other = np.array(other) + else: + other = np.asarray(other) + other = np.asarray(other, dtype=np.common_type(other, new_other)) - # we can't use our existing dtype - # because of incompatibilities + # we need to use the new dtype try_quick = False else: other = new_other @@ -3397,19 +3409,31 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, else: other = self._constructor(other, **self._construct_axes_dict()) + if axis is None: + axis = 0 + + if self.ndim == getattr(other, 'ndim', 0): + align = True + else: + align = (self._get_axis_number(axis) == 1) + + block_axis = self._get_block_manager_axis(axis) + if inplace: # we may have different type blocks come out of putmask, so # reconstruct the block manager self._check_inplace_setting(other) - new_data = self._data.putmask(mask=cond, new=other, align=axis is None, - inplace=True) + new_data = self._data.putmask(mask=cond, new=other, align=align, + inplace=True, axis=block_axis, + transpose=self._AXIS_REVERSED) self._update_inplace(new_data) else: - new_data = self._data.where(other=other, cond=cond, align=axis is None, + new_data = self._data.where(other=other, cond=cond, align=align, raise_on_error=raise_on_error, - try_cast=try_cast) + try_cast=try_cast, axis=block_axis, + transpose=self._AXIS_REVERSED) return self._constructor(new_data).__finalize__(self) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6b2c9639ac71f..38619229f1086 100644 --- a/pandas/core/groupby.py +++ 
b/pandas/core/groupby.py @@ -14,7 +14,7 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes +from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index, _union_indexes from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -498,8 +498,8 @@ def _set_result_index_ordered(self, result): result.index = self.obj.index return result - def _local_dir(self): - return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) + def _dir_additions(self): + return self.obj._dir_additions() | self._apply_whitelist def __getattr__(self, attr): if attr in self._internal_names_set: @@ -1780,12 +1780,14 @@ def size(self): Compute group sizes """ - base = Series(np.zeros(len(self.result_index), dtype=np.int64), - index=self.result_index) + index = self.result_index + base = Series(np.zeros(len(index), dtype=np.int64), index=index) indices = self.indices for k, v in compat.iteritems(indices): indices[k] = len(v) bin_counts = Series(indices, dtype=np.int64) + # make bin_counts.index to have same name to preserve it + bin_counts.index.name = index.name result = base.add(bin_counts, fill_value=0) # addition with fill_value changes dtype to float64 result = result.astype(np.int64) @@ -1926,7 +1928,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = com._asarray_tuplesafe(self.grouper) # a passed Categorical - elif isinstance(self.grouper, Categorical): + elif is_categorical_dtype(self.grouper): # must have an ordered categorical if self.sort: @@ -1940,8 +1942,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # fix bug #GH8868 sort=False being ignored in categorical groupby else: self.grouper = self.grouper.reorder_categories(self.grouper.unique()) + + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes self._labels = self.grouper.codes - self._group_index = self.grouper.categories + + c = self.grouper.categories + self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)), + categories=c, + ordered=self.grouper.ordered)) if self.name is None: self.name = self.grouper.name @@ -2129,8 +2138,8 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if isinstance(gpr, Categorical) and len(gpr) != len(obj): - raise ValueError("Categorical grouper must have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != len(obj): + raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)") ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort, in_axis=in_axis) @@ -2813,7 +2822,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if com._count_not_none(*values) != len(values): - v = next(v for v in values if v is not None) + try: + v = next(v for v in values if v is not None) + except StopIteration: + # If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. 
+ return DataFrame() if v is None: return DataFrame() elif isinstance(v, NDFrame): @@ -3250,7 +3264,7 @@ def _reindex_output(self, result): return result elif len(groupings) == 1: return result - elif not any([isinstance(ping.grouper, Categorical) + elif not any([isinstance(ping.grouper, (Categorical, CategoricalIndex)) for ping in groupings]): return result diff --git a/pandas/core/index.py b/pandas/core/index.py index 0a3adbd19ae92..8b650fea9b440 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2,6 +2,7 @@ import datetime import warnings import operator + from functools import partial from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat @@ -13,13 +14,13 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array -from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs +from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate) -from pandas.core.common import isnull, array_equivalent import pandas.core.common as com -from pandas.core.common import (_values_from_object, is_float, is_integer, - ABCSeries, _ensure_object, _ensure_int64, is_bool_indexer, +from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, + _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, + ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -44,26 +45,6 @@ def _try_get_item(x): except AttributeError: return x -def _indexOp(opname): - """ - Wrapper function for index comparison operations, to avoid - code duplication. 
- """ - def wrapper(self, other): - func = getattr(self.values, opname) - result = func(np.asarray(other)) - - # technically we could support bool dtyped Index - # for now just return the indexing array directly - if is_bool_dtype(result): - return result - try: - return Index(result) - except: # pragma: no cover - return result - return wrapper - - class InvalidIndexError(Exception): pass @@ -162,6 +143,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) else: subarr = com._asarray_tuplesafe(data, dtype=object) @@ -170,6 +153,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if copy: subarr = subarr.copy() + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -258,7 +243,7 @@ def __len__(self): """ return len(self._data) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self._data.view(np.ndarray) @@ -282,9 +267,6 @@ def get_values(self): """ return the underlying data as an ndarray """ return self.values - def _array_values(self): - return self._data - # ops compat def tolist(self): """ @@ -410,8 +392,7 @@ def __unicode__(self): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) + prepr = default_pprint(self) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) def to_series(self, **kwargs): @@ -429,9 +410,10 @@ def to_series(self, **kwargs): def _to_embed(self, keep_tz=False): """ + *this is an internal non-public method* + return an array repr of this object, potentially casting to object - This is for internal compat """ return self.values @@ -623,7 +605,10 @@ def is_numeric(self): return self.inferred_type in ['integer', 'floating'] def is_object(self): - return self.dtype == np.object_ + return is_object_dtype(self.dtype) + + def is_categorical(self): + return self.inferred_type in ['categorical'] def is_mixed(self): return 'mixed' in self.inferred_type @@ -772,14 +757,11 @@ def is_int(v): return indexer - def _convert_list_indexer(self, key, kind=None): - """ convert a list indexer. these should be locations """ - return key - - def _convert_list_indexer_for_mixed(self, keyarr, kind=None): - """ passed a key that is tuplesafe that is integer based - and we have a mixed index (e.g. number/labels). figure out - the indexer. return None if we can't help + def _convert_list_indexer(self, keyarr, kind=None): + """ + passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. 
return None if we can't help """ if (kind is None or kind in ['iloc','ix']) and (is_integer_dtype(keyarr) and not self.is_floating()): if self.inferred_type != 'integer': @@ -954,17 +936,13 @@ def __getitem__(self, key): else: return result - def append(self, other): + def _ensure_compat_append(self, other): """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices + prepare the append Returns ------- - appended : Index + list of to_concat, name of result Index """ name = self.name to_concat = [self] @@ -984,7 +962,21 @@ def append(self, other): to_concat = self._ensure_compat_concat(to_concat) to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] + return to_concat, name + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + to_concat, name = self._ensure_compat_append(other) return Index(np.concatenate(to_concat), name=name) @staticmethod @@ -1046,10 +1038,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): from pandas.core.format import format_array - if values.dtype == np.object_: + if is_categorical_dtype(values.dtype): + values = np.array(values) + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) - if values.dtype == np.object_: + if is_object_dtype(values.dtype): result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n')) for x in values] @@ -1071,12 +1065,16 @@ def to_native_types(self, slicer=None, **kwargs): values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep='', **kwargs): + def _format_native_types(self, na_rep='', quoting=None, **kwargs): """ actually format my specific types """ mask = isnull(self) - values = np.array(self, dtype=object, copy=True) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + values[mask] = na_rep - return values.tolist() + return values def equals(self, other): """ @@ -1088,9 +1086,6 @@ def equals(self, other): if not isinstance(other, Index): return False - if type(other) != Index: - return other.equals(self) - return array_equivalent(_values_from_object(self), _values_from_object(other)) def identical(self, other): @@ -1197,13 +1192,6 @@ def __sub__(self, other): "use .difference()",FutureWarning) return self.difference(other) - __eq__ = _indexOp('__eq__') - __ne__ = _indexOp('__ne__') - __lt__ = _indexOp('__lt__') - __gt__ = _indexOp('__gt__') - __le__ = _indexOp('__le__') - __ge__ = _indexOp('__ge__') - def __and__(self, other): return self.intersection(other) @@ -1236,7 +1224,7 @@ def union(self, other): self._assert_can_do_setop(other) - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) @@ -1310,7 +1298,7 @@ def intersection(self, other): if self.equals(other): return self - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) @@ -1469,7 +1457,7 @@ def get_value(self, series, key): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -1544,7 +1532,7 @@ def get_indexer(self, target, method=None, limit=None): if pself is 
not self or ptarget is not target: return pself.get_indexer(ptarget, method=method, limit=limit) - if self.dtype != target.dtype: + if not is_dtype_equal(self.dtype,target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit) @@ -1643,7 +1631,8 @@ def get_indexer_for(self, target, **kwargs): """ guaranteed return of an indexer even when non-unique """ if self.is_unique: return self.get_indexer(target, **kwargs) - return self.get_indexer_non_unique(target, **kwargs)[0] + indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer def _possibly_promote(self, other): # A hack, but it works @@ -1651,7 +1640,7 @@ def _possibly_promote(self, other): if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): return DatetimeIndex(self), other elif self.inferred_type == 'boolean': - if self.dtype != 'object': + if not is_object_dtype(self.dtype): return self.astype('object'), other.astype('object') return self, other @@ -1703,12 +1692,35 @@ def isin(self, values, level=None): value_set = set(values) if level is not None: self._validate_index_level(level) - return lib.ismember(self._array_values(), value_set) + return lib.ismember(np.array(self), value_set) + + def _can_reindex(self, indexer): + """ + *this is an internal non-public method* + + Check if we are allowing reindexing with this particular indexer + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None): """ Create index with target's values (move/add/delete values as necessary) + Parameters + ---------- + target : an iterable + Returns ------- new_index : pd.Index @@ -1729,6 +1741,7 @@ def reindex(self, target, method=None, level=None, limit=None): target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) else: target = _ensure_index(target) + if level is not None: if method is not None: raise TypeError('Fill method not supported if level passed') @@ -1753,9 +1766,72 @@ def reindex(self, target, method=None, level=None, limit=None): return target, indexer + def _reindex_non_unique(self, target): + """ + *this is an internal non-public method* + + Create a new index with target's values (move/add/delete values as necessary) + use with non-unique Index and a possibly non-unique target + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + target = _ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None + + if len(missing): + l = np.arange(len(indexer)) + + missing = com._ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = com._ensure_int64(l[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = com._ensure_int64(l[check]) + + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # a unique indexer + if target.is_unique: + + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = 
np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + + # need to retake to have the same size as the indexer + indexer = indexer.values + indexer[~check] = 0 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + return self._shallow_copy(new_labels), indexer, new_indexer + def join(self, other, how='left', level=None, return_indexers=False): """ - Internal API method. Compute join_index and indexers to conform data + *this is an internal non-public method* + + Compute join_index and indexers to conform data structures to the new index. Parameters @@ -1814,7 +1890,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, @@ -2365,6 +2441,34 @@ def _evaluate_with_timedelta_like(self, other, op, opstr): def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def _evaluate_compare(self, other): + func = getattr(self.values, op) + result = func(np.asarray(other)) + + # technically we could support bool dtyped Index + # for now just return the indexing array directly + if is_bool_dtype(result): + return result + try: + return Index(result) + except TypeError: + return result + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable """ @@ -2419,7 +2523,7 @@ def _evaluate_numeric_binop(self, other): elif isinstance(other, (Timestamp, np.datetime64)): return self._evaluate_with_datetime_like(other, op, opstr) else: - if not (com.is_float(other) or com.is_integer(other)): + if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with scalar values") # if we are a reversed non-communative op @@ -2483,7 +2587,7 @@ def _make_logical_function(name, desc, f): @Appender(_doc) def logical_func(self, *args, **kwargs): result = f(self.values) - if isinstance(result, (np.ndarray, com.ABCSeries, Index)) \ + if isinstance(result, (np.ndarray, ABCSeries, Index)) \ and result.ndim == 0: # return NumPy type return result.dtype.type(result.item()) @@ -2515,6 +2619,539 @@ def invalid_op(self, other=None): Index._add_numeric_methods_disabled() Index._add_logical_methods() +Index._add_comparison_methods() + +class CategoricalIndex(Index, PandasDelegate): + """ + + Immutable Index implementing an ordered, sliceable set. CategoricalIndex + represents a sparsely populated Index with an underlying Categorical. 
+ + Parameters + ---------- + data : array-like or Categorical (1-dimensional) + categories : optional, array-like + categories for the CategoricalIndex + ordered : boolean + designating if the categories are ordered + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + """ + + _typ = 'categoricalindex' + _engine_type = _index.Int64Engine + _attributes = ['name','categories','ordered'] + + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): + + if fastpath: + return cls._simple_new(data, name=name) + + if isinstance(data, ABCCategorical): + data = cls._create_categorical(cls, data, categories, ordered) + elif isinstance(data, CategoricalIndex): + data = data._data + data = cls._create_categorical(cls, data, categories, ordered) + else: + + # don't allow scalars + # if data is None, then categories must be provided + if lib.isscalar(data): + if data is not None or categories is None: + cls._scalar_data_error(data) + data = [] + data = cls._create_categorical(cls, data, categories, ordered) + + if copy: + data = data.copy() + + return cls._simple_new(data, name=name)
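To make the constructor contract above concrete, here is a minimal usage sketch (hypothetical data, not part of this diff; assumes pandas >= 0.16.1, where CategoricalIndex ships):

    import pandas as pd

    # values are stored as integer codes into `categories`; a value not
    # listed in `categories` would be coded as -1 (missing)
    ci = pd.CategoricalIndex(['b', 'a', 'b', 'c'], categories=['a', 'b', 'c'])
    print(ci.codes)       # expected: [1, 0, 1, 2]
    print(ci.categories)  # expected: Index(['a', 'b', 'c'], dtype='object')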
+ + def _create_from_codes(self, codes, categories=None, ordered=None, name=None): + """ + *this is an internal non-public method* + + create the correct categorical from codes + + Parameters + ---------- + codes : new codes + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + name : optional name attribute, defaults to existing + + Returns + ------- + CategoricalIndex + """ + + from pandas.core.categorical import Categorical + if categories is None: + categories = self.categories + if ordered is None: + ordered = self.ordered + if name is None: + name = self.name + cat = Categorical.from_codes(codes, categories=categories, ordered=ordered) + return CategoricalIndex(cat, name=name) + + @staticmethod + def _create_categorical(self, data, categories=None, ordered=None): + """ + *this is an internal non-public method* + + create the correct categorical from data and the properties + + Parameters + ---------- + data : data for new Categorical + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + + Returns + ------- + Categorical + """ + + if not isinstance(data, ABCCategorical): + from pandas.core.categorical import Categorical + data = Categorical(data, categories=categories, ordered=ordered) + else: + if categories is not None: + data = data.set_categories(categories) + if ordered is not None: + data = data.set_ordered(ordered) + return data + + @classmethod + def _simple_new(cls, values, name=None, categories=None, ordered=None, **kwargs): + result = object.__new__(cls) + + values = cls._create_categorical(cls, values, categories, ordered) + result._data = values + result.name = name + for k, v in compat.iteritems(kwargs): + setattr(result,k,v) + + result._reset_identity() + return result + + def _is_dtype_compat(self, other): + """ + *this is an internal non-public method* + + provide a comparison between the dtype of self and other (coercing if needed) + + Raises + ------ + TypeError if the dtypes are not compatible + """ + + if is_categorical_dtype(other): + if isinstance(other, CategoricalIndex): + other = other.values + if not other.is_dtype_equal(self): + raise TypeError("categories must match existing categories when appending") + else: + values = other + other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered)) + if not other.isin(values).all(): + raise TypeError("cannot append a non-category item to a CategoricalIndex") + + return other + + def equals(self, other): + """ + Determines if two CategoricalIndex objects contain the same elements. + """ + if self.is_(other): + return True + + try: + other = self._is_dtype_compat(other) + return array_equivalent(self._data, other) + except (TypeError, ValueError): + pass + + return False + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + + # currently doesn't use the display.max_categories, or display.max_seq_len + # for head/tail printing + values = default_pprint(self.values.get_values()) + cats = default_pprint(self.categories.get_values()) + space = ' ' * (len(self.__class__.__name__) + 1) + name = self.name + if name is not None: + name = default_pprint(name) + + result = u("{klass}({values},\n{space}categories={categories},\n{space}ordered={ordered},\n{space}name={name})").format( + klass=self.__class__.__name__, + values=values, + categories=cats, + ordered=self.ordered, + name=name, + space=space) + + return result + + @property + def inferred_type(self): + return 'categorical' + + @property + def values(self): + """ return the underlying data, which is a Categorical """ + return self._data + + @property + def codes(self): + return self._data.codes + + @property + def categories(self): + return self._data.categories + + @property + def ordered(self): + return self._data.ordered + + def __contains__(self, key): + hash(key) + return key in self.values + + def __array__(self, dtype=None): + """ the array interface, return my values """ + return np.array(self._data, dtype=dtype) + + def argsort(self, *args, **kwargs): + return self.values.argsort(*args, **kwargs) + + @cache_readonly + def _engine(self): + + # we are going to look things up with the codes themselves + return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + + @cache_readonly + def is_unique(self): + return not self.duplicated().any() + + @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + def duplicated(self, take_last=False): + from pandas.hashtable import duplicated_int64 + return duplicated_int64(self.codes.astype('i8'), take_last)
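A small sketch of how the `is_unique`/`duplicated` pair just defined behaves; both operate on the integer codes rather than on the rendered values (hypothetical data, not part of this diff):

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'], categories=['a', 'b'])
    print(ci.is_unique)     # expected: False, since 'a' occurs twice
    print(ci.duplicated())  # expected: [False, False, True]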
+ + Returns + ------- + loc : int if unique index, possibly slice or mask if not + """ + codes = self.categories.get_loc(key) + if (codes == -1): + raise KeyError(key) + indexer, _ = self._engine.get_indexer_non_unique(np.array([codes])) + if (indexer==-1).any(): + raise KeyError(key) + + return indexer + + def _can_reindex(self, indexer): + """ always allow reindexing """ + pass + + def reindex(self, target, method=None, level=None, limit=None): + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + if method is not None: + raise NotImplementedError("argument method is not implemented for CategoricalIndex.reindex") + if level is not None: + raise NotImplementedError("argument level is not implemented for CategoricalIndex.reindex") + if limit is not None: + raise NotImplementedError("argument limit is not implemented for CategoricalIndex.reindex") + + target = _ensure_index(target) + + if not is_categorical_dtype(target) and not target.is_unique: + raise ValueError("cannot reindex with a non-unique indexer") + + indexer, missing = self.get_indexer_non_unique(np.array(target)) + new_target = self.take(indexer) + + + # filling in missing if needed + if len(missing): + cats = self.categories.get_indexer(target) + if (cats==-1).any(): + + # coerce to a regular index here! + result = Index(np.array(self),name=self.name) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) + + else: + + codes = new_target.codes.copy() + codes[indexer==-1] = cats[missing] + new_target = self._create_from_codes(codes) + + # we always want to return an Index type here + # to be consistent with .reindex for other index types (e.g. they don't coerce + # based on the actual values, only on the dtype) + # unless we had an initial Categorical to begin with + # in which case we are going to conform to the passed Categorical + new_target = np.asarray(new_target) + if is_categorical_dtype(target): + new_target = target._shallow_copy(new_target, name=self.name) + else: + new_target = Index(new_target, name=self.name) + + return new_target, indexer + + def _reindex_non_unique(self, target): + """ reindex from a non-unique index, which CategoricalIndexes almost always are """ + new_target, indexer = self.reindex(target) + new_indexer = None + + check = indexer==-1 + if check.any(): + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[check] = -1 + + return new_target, indexer, new_indexer
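The return-type contract of the `reindex` implementation above can be summarized in a short sketch (hypothetical data, not part of this diff): a plain list target always comes back as a regular Index, while a passed Categorical target conforms the result to the categories of that Categorical:

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'a', 'b'], categories=['a', 'b'])

    new_index, indexer = ci.reindex(['a', 'c'])
    print(type(new_index))  # expected: a plain Index, since a list was passed

    target = pd.Categorical(['a', 'c'], categories=['a', 'b', 'c'])
    new_index, indexer = ci.reindex(target)
    print(type(new_index))  # expected: CategoricalIndex, conformed to the
                            # categories of the passed Categorical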
+ + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should then be used as an input to ndarray.take to align the + current data to the new index. The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : an iterable of values + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method; use it at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[~mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = com._clean_reindex_fill_method(method) + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + if method == 'pad' or method == 'backfill': + raise NotImplementedError("method='pad' and method='backfill' not implemented yet " + 'for CategoricalIndex') + elif method == 'nearest': + raise NotImplementedError("method='nearest' not implemented yet " + 'for CategoricalIndex') + else: + + codes = self.categories.get_indexer(target) + indexer, _ = self._engine.get_indexer_non_unique(codes) + + return com._ensure_platform_int(indexer) + + def get_indexer_non_unique(self, target): + """ this is the same for a CategoricalIndex for get_indexer; the API returns the missing values as well """ + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + codes = self.categories.get_indexer(target) + return self._engine.get_indexer_non_unique(codes) + + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list indexer. + Return our indexer or raise if any of the values are not in the categories + """ + codes = self.categories.get_indexer(keyarr) + if (codes==-1).any(): + raise KeyError("a list-indexer must only include values that are in the categories") + + return None + + def take(self, indexer, axis=0): + """ + return a new CategoricalIndex of the values selected by the indexer + + See also + -------- + numpy.ndarray.take + """ + + indexer = com._ensure_platform_int(indexer) + taken = self.codes.take(indexer) + return self._create_from_codes(taken) + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + return self._create_from_codes(np.delete(self.codes, loc)) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + TypeError if the item is not in the categories + + """ + code = self.categories.get_indexer([item]) + if (code == -1): + raise TypeError("cannot insert an item into a CategoricalIndex that is not already an existing category") + + codes = self.codes + codes = np.concatenate( + (codes[:loc], code, codes[loc:])) + return self._create_from_codes(codes) + + def append(self, other): + """ + Append a collection of CategoricalIndex options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + + Raises + ------ + TypeError if other is not in the categories + """ + to_concat, name = self._ensure_compat_append(other) + to_concat = [ self._is_dtype_compat(c) for c in to_concat ] + codes = np.concatenate([ c.codes for c in to_concat ]) + return self._create_from_codes(codes, name=name)
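Since `append` above funnels every operand through `_is_dtype_compat`, mismatched categories raise rather than silently coerce, and `insert` refuses values that are not existing categories. A short sketch of these guard rails (hypothetical data, not part of this diff):

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b'], categories=['a', 'b'])
    other = pd.CategoricalIndex(['a'], categories=['a', 'z'])

    try:
        ci.append(other)   # differing categories
    except TypeError as e:
        print(e)  # expected: categories must match existing categories when appending

    try:
        ci.insert(0, 'z')  # 'z' is not an existing category
    except TypeError as e:
        print(e)  # expected: cannot insert an item ... not already an existing category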
+ + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def _evaluate_compare(self, other): + + # if we have a Categorical type, then must have the same categories + if isinstance(other, CategoricalIndex): + other = other.values + elif isinstance(other, Index): + other = self._create_categorical(self, other.values, categories=self.categories, ordered=self.ordered) + + if isinstance(other, ABCCategorical): + if not (self.values.is_dtype_equal(other) and len(self.values) == len(other)): + raise TypeError("categorical index comparisons must have the same categories and ordered attributes") + + return getattr(self.values, op)(other) + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the .values """ + method = getattr(self.values, name) + if 'inplace' in kwargs: + raise ValueError("cannot use inplace with CategoricalIndex") + res = method(*args, **kwargs) + if lib.isscalar(res): + return res + return CategoricalIndex(res, name=self.name) + + @classmethod + def _add_accessors(cls): + """ add in Categorical accessor methods """ + + from pandas.core.categorical import Categorical + CategoricalIndex._add_delegate_accessors(delegate=Categorical, + accessors=["rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + "min", + "max"], + typ='method', + overwrite=True) + + +CategoricalIndex._add_numeric_methods_disabled() +CategoricalIndex._add_logical_methods_disabled() +CategoricalIndex._add_comparison_methods() +CategoricalIndex._add_accessors() class NumericIndex(Index): @@ -2787,7 +3424,7 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if self.dtype != other.dtype or self.shape != other.shape: + if not is_dtype_equal(self.dtype,other.dtype) or self.shape != other.shape: return False left, right = self.values, other.values return ((left == right) | (self._isnan & other._isnan)).all() @@ -2853,7 +3490,7 @@ def isin(self, values, level=None): value_set = set(values) if level is not None: self._validate_index_level(level) - return 
lib.ismember_nans(self._array_values(), value_set, + return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) @@ -3193,7 +3830,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, verify_integrity=False, _set_identity=_set_identity) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self.values @@ -3205,10 +3842,6 @@ def view(self, cls=None): _shallow_copy = view - def _array_values(self): - # hack for various methods - return self.values - @cache_readonly def dtype(self): return np.dtype('O') @@ -3298,7 +3931,7 @@ def _reference_duplicate_name(self, name): return np.sum(name == np.asarray(self.names)) > 1 def _format_native_types(self, **kwargs): - return self.tolist() + return self.values @property def _constructor(self): @@ -3355,7 +3988,7 @@ def values(self): taken = com.take_1d(lev._box_values(lev.values), lab, fill_value=_get_na_value(lev.dtype.type)) else: - taken = com.take_1d(lev.values, lab) + taken = com.take_1d(np.asarray(lev.values), lab) values.append(taken) self._tuples = lib.fast_zip(values) @@ -3420,7 +4053,7 @@ def _try_mi(k): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -4091,7 +4724,7 @@ def get_indexer(self, target, method=None, limit=None): if isinstance(target, MultiIndex): target_index = target._tuple_index - if target_index.dtype != object: + if not is_object_dtype(target_index.dtype): return np.ones(len(target_index)) * -1 if not self.is_unique: @@ -4650,9 +5283,9 @@ def equals(self, other): return False for i in range(self.nlevels): - svalues = com.take_nd(self.levels[i].values, self.labels[i], + svalues = com.take_nd(np.asarray(self.levels[i].values), self.labels[i], allow_fill=False) - ovalues = com.take_nd(other.levels[i].values, other.labels[i], + ovalues = com.take_nd(np.asarray(other.levels[i].values), other.labels[i], allow_fill=False) if not array_equivalent(svalues, ovalues): return False @@ -4768,7 +5401,7 @@ def _assert_can_do_setop(self, other): pass def astype(self, dtype): - if np.dtype(dtype) != np.object_: + if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) return self._shallow_copy() @@ -4848,7 +5481,7 @@ def _wrap_joined_index(self, joined, other): @Appender(Index.isin.__doc__) def isin(self, values, level=None): if level is None: - return lib.ismember(self._array_values(), set(values)) + return lib.ismember(np.array(self), set(values)) else: num = self._get_level_number(level) levs = self.levels[num] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8154eb1bb6c8b..7c373b0a2b01d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -253,7 +253,7 @@ def _setitem_with_indexer(self, indexer, value): # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - labels = safe_append_to_index(index, key) + labels = index.insert(len(index),key) self.obj._data = self.obj.reindex_axis(labels, i)._data self.obj._maybe_update_cacher(clear=True) self.obj.is_copy=None @@ -274,10 +274,7 @@ def _setitem_with_indexer(self, indexer, value): # and set inplace if self.ndim == 1: index = self.obj.index - if len(index) == 0: - new_index = Index([indexer]) - else: - new_index = safe_append_to_index(index, indexer) + new_index = index.insert(len(index),indexer) # this preserves dtype of the 
value new_values = Series([value]).values @@ -928,24 +925,6 @@ def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) - def _reindex(keys, level=None): - - try: - result = self.obj.reindex_axis(keys, axis=axis, level=level) - except AttributeError: - # Series - if axis != 0: - raise AssertionError('axis must be 0') - return self.obj.reindex(keys, level=level) - - # this is an error as we are trying to find - # keys in a multi-index that don't exist - if isinstance(labels, MultiIndex) and level is not None: - if hasattr(result,'ndim') and not np.prod(result.shape) and len(keys): - raise KeyError("cannot index a multi-index axis with these keys") - - return result - if is_bool_indexer(key): key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -958,8 +937,9 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - # handle a mixed integer scenario - indexer = labels._convert_list_indexer_for_mixed(keyarr, kind=self.name) + # have the index handle the indexer and possibly return + # an indexer or raise + indexer = labels._convert_list_indexer(keyarr, kind=self.name) if indexer is not None: return self.obj.take(indexer, axis=axis) @@ -970,65 +950,48 @@ def _reindex(keys, level=None): else: level = None - keyarr_is_unique = Index(keyarr).is_unique + # existing labels are unique and the indexer is unique + if labels.is_unique and Index(keyarr).is_unique: + + try: + result = self.obj.reindex_axis(keyarr, axis=axis, level=level) + + # this is an error as we are trying to find + # keys in a multi-index that don't exist + if isinstance(labels, MultiIndex) and level is not None: + if hasattr(result,'ndim') and not np.prod(result.shape) and len(keyarr): + raise KeyError("cannot index a multi-index axis with these keys") + + return result - # existing labels are unique and indexer is unique - if labels.is_unique and keyarr_is_unique: - return _reindex(keyarr, level=level) + except AttributeError: + # Series + if axis != 0: + raise AssertionError('axis must be 0') + return self.obj.reindex(keyarr, level=level) + + # existing labels are non-unique else: - indexer, missing = labels.get_indexer_non_unique(keyarr) - check = indexer != -1 - result = self.obj.take(indexer[check], axis=axis, - convert=False) - - # need to merge the result labels and the missing labels - if len(missing): - l = np.arange(len(indexer)) - - missing = com._ensure_platform_int(missing) - missing_labels = keyarr.take(missing) - missing_indexer = com._ensure_int64(l[~check]) - cur_labels = result._get_axis(axis).values - cur_indexer = com._ensure_int64(l[check]) - - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels - - # reindex with the specified axis - ndim = self.obj.ndim - if axis + 1 > ndim: - raise AssertionError("invalid indexing error with " - "non-unique index") - - # a unique indexer - if keyarr_is_unique: - - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange( - len(result._get_axis(axis)) - ) - new_indexer[missing_indexer] = -1 - # we have a non_unique selector, need to use the original - # indexer here - else: + # reindex with the specified axis + if axis + 1 > self.obj.ndim: + raise AssertionError("invalid indexing error with " + "non-unique index") - # need to retake to have the same size as the indexer - rindexer = indexer.values - rindexer[~check] = 0 - result 
= self.obj.take(rindexer, axis=axis, - convert=False) + new_target, indexer, new_indexer = labels._reindex_non_unique(keyarr) - # reset the new indexer to account for the new size - new_indexer = np.arange(len(result)) - new_indexer[~check] = -1 + if new_indexer is not None: + result = self.obj.take(indexer[indexer!=-1], axis=axis, + convert=False) result = result._reindex_with_indexers({ - axis: [new_labels, new_indexer] - }, copy=True, allow_dups=True) + axis: [new_target, new_indexer] + }, copy=True, allow_dups=True) + + else: + result = self.obj.take(indexer, axis=axis, + convert=False) return result @@ -1105,8 +1068,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): else: objarr = _asarray_tuplesafe(obj) - # If have integer labels, defer to label-based indexing - indexer = labels._convert_list_indexer_for_mixed(objarr, kind=self.name) + # The index may want to handle a list indexer differently + # by returning an indexer or raising + indexer = labels._convert_list_indexer(objarr, kind=self.name) if indexer is not None: return indexer @@ -1627,8 +1591,8 @@ def length_of_indexer(indexer, target=None): if step is None: step = 1 elif step < 0: - step = abs(step) - return (stop - start) / step + step = -step + return (stop - start + step-1) // step elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)): return len(indexer) elif not is_list_like_indexer(indexer): @@ -1719,19 +1683,6 @@ def get_indexer(_i, _idx): return tuple([get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)]) -def safe_append_to_index(index, key): - """ a safe append to an index, if incorrect type, then catch and recreate - """ - try: - return index.insert(len(index), key) - except: - - # raise here as this is basically an unsafe operation and we want - # it to be obvious that you are doing something wrong - raise ValueError("unsafe appending to index of type {0} with a key " - "{1}".format(index.__class__.__name__, key)) - - def maybe_convert_indices(indices, n): """ if we have negative indices, translate to positive here; if we have indices that are out-of-bounds, raise an IndexError
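The `length_of_indexer` change above swaps float division for ceiling integer division so that a partially-filled final step is still counted; a quick worked check (standalone sketch, not part of the diff):

    # for slice(0, 5, 2) the selected positions are 0, 2, 4 -> length 3
    start, stop, step = 0, 5, 2

    old_len = (stop - start) / step              # 2.5 under true division -- wrong
    new_len = (stop - start + step - 1) // step  # 3 -- ceiling division

    assert new_len == len(range(start, stop, step))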
diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4d0f8394fbd2a..276b42cde68bc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None): def _try_fill(self, value): return value - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) + + if not self.is_object and not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + values[mask] = na_rep - return values.tolist() + return values # block actions #### def copy(self, deep=True): @@ -582,7 +587,7 @@ def _is_empty_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return all([ isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer ]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False # empty indexers @@ -627,7 +632,8 @@ def _is_empty_indexer(indexer): return [self] - def putmask(self, mask, new, align=True, inplace=False): + def putmask(self, mask, new, align=True, inplace=False, + axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -639,37 +645,55 @@ def putmask(self, mask, new, align=True, inplace=False): new : a ndarray/object align : boolean, perform alignment on other/cond, default is True inplace : perform inplace modification, default is False + axis : int + transpose : boolean + Set to True if self is stored with axes reversed Returns ------- - a new block(s), the result of the putmask + a list of new blocks, the result of the putmask """ new_values = self.values if inplace else self.values.copy() - # may need to align the new if hasattr(new, 'reindex_axis'): - new = new.values.T + new = new.values - # may need to align the mask if hasattr(mask, 'reindex_axis'): - mask = mask.values.T + mask = mask.values # if we are passed a scalar None, convert it here if not is_list_like(new) and isnull(new) and not self.is_object: new = self.fill_value if self._can_hold_element(new): + if transpose: + new_values = new_values.T + new = self._try_cast(new) - # pseudo-broadcast - if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1: - new = np.repeat(new, self.shape[-1]).reshape(self.shape) + # If the default repeat behavior in np.putmask would go in the wrong + # direction, then explicitly repeat and reshape new instead + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim and axis == 1: + new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) np.putmask(new_values, mask, new) # maybe upcast me elif mask.any(): + if transpose: + mask = mask.T + if isinstance(new, np.ndarray): + new = new.T + axis = new_values.ndim - axis - 1 + + # Pseudo-broadcast + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim: + new_shape = list(new.shape) + new_shape.insert(axis, 1) + new = new.reshape(tuple(new_shape)) # need to go column by column new_blocks = [] @@ -680,14 +704,15 @@ def putmask(self, mask, new, align=True, inplace=False): # need a new block if m.any(): - - n = new[i] if isinstance( - new, np.ndarray) else np.array(new) + if isinstance(new, np.ndarray): + n = np.squeeze(new[i % new.shape[0]]) + else: + n = np.array(new) # type of the new block dtype, _ = com._maybe_promote(n.dtype) - # we need to exiplicty astype here to make a copy + # we need to explicitly astype here to make a copy n = n.astype(dtype) nv = _putmask_smart(v, m, n) @@ -713,8 +738,10 @@ def putmask(self, mask, new, align=True, inplace=False): if inplace: return [self] - return [make_block(new_values, - placement=self.mgr_locs, fastpath=True)] + if transpose: + new_values = new_values.T + + return [make_block(new_values, placement=self.mgr_locs, fastpath=True)]
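The repeat-and-reshape branch above exists because `np.putmask` recycles `new` in flat C order; a standalone numpy sketch (hypothetical shapes, not part of the diff) of why the explicit `np.repeat` keeps a per-row value aligned across a whole row:

    import numpy as np

    values = np.zeros((2, 3))
    mask = np.ones((2, 3), dtype=bool)
    new = np.array([1., 2.])  # one replacement value per row

    # np.putmask(values, mask, new) alone would cycle 1, 2, 1, 2, ... in C
    # order; repeating each value row-width times lines it up per row instead
    np.putmask(values, mask, np.repeat(new, values.shape[-1]))
    # values is now [[1., 1., 1.], [2., 2., 2.]]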
def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, @@ -998,7 +1025,7 @@ def handle_error(): fastpath=True, placement=self.mgr_locs)] def where(self, other, cond, align=True, raise_on_error=True, - try_cast=False): + try_cast=False, axis=0, transpose=False): """ evaluate the block; return result block(s) from the result @@ -1009,6 +1036,9 @@ def where(self, other, cond, align=True, raise_on_error=True, align : boolean, perform alignment on other/cond raise_on_error : if True, raise when I can't perform the function, False by default (and just return the data that we had coming in) + axis : int + transpose : boolean + Set to True if self is stored with axes reversed Returns ------- @@ -1016,43 +1046,23 @@ def where(self, other, cond, align=True, raise_on_error=True, """ values = self.values + if transpose: + values = values.T - # see if we can align other if hasattr(other, 'reindex_axis'): other = other.values - # make sure that we can broadcast - is_transposed = False - if hasattr(other, 'ndim') and hasattr(values, 'ndim'): - if values.ndim != other.ndim or values.shape == other.shape[::-1]: - - # if its symmetric are ok, no reshaping needed (GH 7506) - if (values.shape[0] == np.array(values.shape)).all(): - pass - - # pseodo broadcast (its a 2d vs 1d say and where needs it in a - # specific direction) - elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and - values.shape[0] != other.shape[0]): - other = _block_shape(other).T - else: - values = values.T - is_transposed = True - - # see if we can align cond - if not hasattr(cond, 'shape'): - raise ValueError( - "where must have a condition that is ndarray like") - if hasattr(cond, 'reindex_axis'): cond = cond.values - # may need to undo transpose of values - if hasattr(values, 'ndim'): - if values.ndim != cond.ndim or values.shape == cond.shape[::-1]: + # If the default broadcasting would go in the wrong direction, then + # explicitly reshape other instead + if getattr(other, 'ndim', 0) >= 1: + if values.ndim - 1 == other.ndim and axis == 1: + other = other.reshape(tuple(other.shape + (1,))) - values = values.T - is_transposed = not is_transposed + if not hasattr(cond, 'shape'): + raise ValueError("where must have a condition that is ndarray like") other = _maybe_convert_string_to_object(other) @@ -1085,15 +1095,14 @@ def func(c, v, o): raise TypeError('Could not compare [%s] with block values' % repr(other)) - if is_transposed: + if transpose: result = result.T # try to cast if requested if try_cast: result = self._try_cast_result(result) - return make_block(result, - ndim=self.ndim, placement=self.mgr_locs) + return make_block(result, ndim=self.ndim, placement=self.mgr_locs) # might need to separate out blocks axis = cond.ndim - 1 @@ -1221,32 +1230,34 @@ def _try_cast(self, element): return element def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.', - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) - values[mask] = na_rep - + formatter = None if float_format and decimal != '.': formatter = lambda v : (float_format % v).replace('.',decimal,1) elif decimal != '.': formatter = lambda v : ('%g' % v).replace('.',decimal,1) elif float_format: formatter = lambda v : float_format % v + + if formatter is None and not quoting: + values = values.astype(str) else: - formatter = None + values = np.array(values, dtype='object') + values[mask] = na_rep if formatter: imask = (~mask).ravel() values.flat[imask] = np.array( [formatter(val) for val in values.ravel()[imask]]) - return values.tolist() + return values def should_store(self, value): # when inserting a column should not coerce integers to floats @@ -1366,7 +1377,7 @@ def _try_coerce_result(self, result): def should_store(self, value): return issubclass(value.dtype.type, np.timedelta64) - def to_native_types(self, slicer=None, na_rep=None, **kwargs): + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -1387,7 +1398,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') for val in values.ravel()[imask]], dtype=object) - return 
rvalues.tolist() + return rvalues def get_values(self, dtype=None): @@ -1723,7 +1734,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def putmask(self, mask, new, align=True, inplace=False): + def putmask(self, mask, new, align=True, inplace=False, + axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1763,18 +1775,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, ndim=self.ndim, placement=self.mgr_locs) - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: # Categorical is always one dimension values = values[slicer] - values = np.array(values, dtype=object) mask = isnull(values) + values = np.array(values, dtype='object') values[mask] = na_rep - # Blocks.to_native_type returns list of lists, but we are always only a list - return [values.tolist()] + + # we are expected to return a 2-d ndarray + return values.reshape(1,len(values)) class DatetimeBlock(Block): __slots__ = () @@ -1864,29 +1877,21 @@ def fillna(self, value, limit=None, fastpath=True, placement=self.mgr_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - mask = isnull(values) - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() - - if date_format is None: - date_formatter = lambda x: Timestamp(x)._repr_base - else: - date_formatter = lambda x: Timestamp(x).strftime(date_format) + from pandas.core.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(values, date_format) - rvalues.flat[imask] = np.array([date_formatter(val) for val in - values.ravel()[imask]], dtype=object) - - return rvalues.tolist() + result = tslib.format_array_from_datetime(values.view('i8').ravel(), + tz=None, + format=format, + na_rep=na_rep).reshape(values.shape) + return result def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) @@ -2422,12 +2427,18 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): else: kwargs['filter'] = filter_locs - if f == 'where' and kwargs.get('align', True): + if f == 'where': align_copy = True - align_keys = ['other', 'cond'] - elif f == 'putmask' and kwargs.get('align', True): + if kwargs.get('align', True): + align_keys = ['other', 'cond'] + else: + align_keys = ['cond'] + elif f == 'putmask': align_copy = False - align_keys = ['new', 'mask'] + if kwargs.get('align', True): + align_keys = ['new', 'mask'] + else: + align_keys = ['mask'] elif f == 'eval': align_copy = False align_keys = ['other'] @@ -3134,7 +3145,6 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, pandas-indexer with -1's only. 
""" - if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -3146,10 +3156,9 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, self._consolidate_inplace() - # trying to reindex on an axis with duplicates - if (not allow_dups and not self.axes[axis].is_unique - and len(indexer)): - raise ValueError("cannot reindex from a duplicate axis") + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") diff --git a/pandas/core/series.py b/pandas/core/series.py index f9c56db018639..4ad5e06693221 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, def _constructor(self): return Series + @property + def _constructor_expanddim(self): + from pandas.core.frame import DataFrame + return DataFrame + # types @property def _can_hold_na(self): @@ -1047,11 +1052,10 @@ def to_frame(self, name=None): ------- data_frame : DataFrame """ - from pandas.core.frame import DataFrame if name is None: - df = DataFrame(self) + df = self._constructor_expanddim(self) else: - df = DataFrame({name: self}) + df = self._constructor_expanddim({name: self}) return df @@ -2517,6 +2521,21 @@ def _make_cat_accessor(self): cat = base.AccessorProperty(CategoricalAccessor, _make_cat_accessor) + def _dir_deletions(self): + return self._accessors + + def _dir_additions(self): + rv = set() + # these accessors are mutually exclusive, so break loop when one exists + for accessor in self._accessors: + try: + getattr(self, accessor) + rv.add(accessor) + break + except AttributeError: + pass + return rv + Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) Series._add_numeric_operations() @@ -2590,8 +2609,9 @@ def _try_cast(arr, take_fast_path): # GH #846 if isinstance(data, (np.ndarray, Index, Series)): - subarr = np.array(data, copy=False) + if dtype is not None: + subarr = np.array(data, copy=False) # possibility of nan -> garbage if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 799872d036c4f..819c49f4fb0dd 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2273,6 +2273,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self): self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), nrows=10, chunksize=5) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), skipinitialspace=True) + tm.assert_frame_equal(result, expected) + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): @@ -3313,6 +3327,25 @@ def test_buffer_overflow(self): except Exception as cperr: self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), lineterminator='\n', + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + class TestCParserLowMemory(ParserTests, 
tm.TestCase): def read_csv(self, *args, **kwds): @@ -3734,6 +3767,25 @@ def test_buffer_overflow(self): except Exception as cperr: self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), lineterminator='\n', + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + class TestMiscellaneous(tm.TestCase): # for tests that don't fit into any of the other classes, e.g. those that diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5ab2ee4327177..0d53b19425c2f 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): +def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): cdef int N, j, i, ncols cdef list rows diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1850aab50b55a..e7b5db9c5e361 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -849,10 +849,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit) ; else { // backtrack /* We have to use i + 1 because buf has been incremented but not i */ - while (i + 1 > self->datapos && *buf != '\n') { + do { --buf; --i; - } + } while (i + 1 > self->datapos && *buf != '\n'); + if (i + 1 > self->datapos) // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline @@ -1073,7 +1074,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) // Next character in file c = *buf++; - TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); @@ -1166,10 +1167,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) ; else { // backtrack /* We have to use i + 1 because buf has been incremented but not i */ - while (i + 1 > self->datapos && *buf != self->lineterminator) { + do { --buf; --i; - } + } while (i + 1 > self->datapos && *buf != self->lineterminator); + if (i + 1 > self->datapos) // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline @@ -1336,7 +1338,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // Next character in file c = *buf++; - TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index af48774492b11..6a6564347d35f 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp +from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex from pandas.core.config import option_context import pandas.core.common 
as com @@ -93,6 +93,24 @@ def test_constructor_unsortable(self): else: Categorical.from_array(arr, ordered=True) + def test_is_equal_dtype(self): + + # test dtype comparisons between cats + + c1 = Categorical(list('aabca'),categories=list('abc'),ordered=False) + c2 = Categorical(list('aabca'),categories=list('cab'),ordered=False) + c3 = Categorical(list('aabca'),categories=list('cab'),ordered=True) + self.assertTrue(c1.is_dtype_equal(c1)) + self.assertTrue(c2.is_dtype_equal(c2)) + self.assertTrue(c3.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(c2)) + self.assertFalse(c1.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(Index(list('aabca')))) + self.assertFalse(c1.is_dtype_equal(c1.astype(object))) + self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,categories=list('cab')))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,ordered=True))) + def test_constructor(self): exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) @@ -224,6 +242,18 @@ def f(): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) cat = Categorical([1,2], categories=[1,2,3]) + # this is a legitimate constructor + with tm.assert_produces_warning(None): + c = Categorical(np.array([],dtype='int64'),categories=[3,2,1],ordered=True) + + def test_constructor_with_index(self): + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci))) + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci.astype(object),categories=ci.categories))) + def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull returned a scalar # for a generator @@ -727,6 +757,19 @@ def f(): cat.add_categories(["d"]) self.assertRaises(ValueError, f) + # GH 9927 + cat = Categorical(list("abc"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) + # test with Series, np.array, index, list + res = cat.add_categories(Series(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(np.array(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(Index(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(["d", "e"]) + self.assert_categorical_equal(res, expected) + def test_remove_categories(self): cat = Categorical(["a","b","c","a"], ordered=True) old = cat.copy() @@ -2562,6 +2605,8 @@ def f(): dfx['grade'].cat.categories self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + def test_concat_preserve(self): + # GH 8641 # series concat not preserving category dtype s = Series(list('abc'),dtype='category') @@ -2579,6 +2624,28 @@ def f(): expected = Series(list('abcabc'),index=[0,1,2,0,1,2]).astype('category') tm.assert_series_equal(result, expected) + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }) + result = pd.concat([df2,df2]) + expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }) + tm.assert_frame_equal(result, expected) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }).set_index('B') + result = pd.concat([df2,df2]) + expected = 
DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }).set_index('B') + tm.assert_frame_equal(result, expected) + + # wrong categories + df3 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('abc')) }).set_index('B') + self.assertRaises(TypeError, lambda : pd.concat([df2,df3])) + def test_append(self): cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] @@ -2714,6 +2781,14 @@ def cmp(a,b): self.assertRaises(TypeError, lambda : invalid(s)) + def test_astype_categorical(self): + + cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + tm.assert_categorical_equal(cat,cat.astype('category')) + tm.assert_almost_equal(np.array(cat),cat.astype('object')) + + self.assertRaises(ValueError, lambda : cat.astype(float)) + def test_to_records(self): # GH8626 diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index e3455d2449b55..b557594e8e7ef 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -3010,12 +3010,12 @@ def test_format(self): def test_output_significant_digits(self): # Issue #9764 - + # In case default display precision changes: with pd.option_context('display.precision', 7): # DataFrame example from issue #9764 d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) - + expected_output={ (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c7c35e63d3d91..555cb9efa5eee 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -31,9 +31,9 @@ import pandas.core.common as com import pandas.core.format as fmt import pandas.core.datetools as datetools -from pandas import (DataFrame, Index, Series, notnull, isnull, +from pandas import (DataFrame, Index, Series, Panel, notnull, isnull, MultiIndex, DatetimeIndex, Timestamp, date_range, - read_csv, timedelta_range, Timedelta, + read_csv, timedelta_range, Timedelta, CategoricalIndex, option_context) import pandas as pd from pandas.parser import CParserError @@ -784,6 +784,16 @@ def test_setitem_None(self): assert_series_equal(self.frame[None], self.frame['A']) repr(self.frame) + def test_setitem_empty(self): + # GH 9596 + df = pd.DataFrame({'a': ['1', '2', '3'], + 'b': ['11', '22', '33'], + 'c': ['111', '222', '333']}) + + result = df.copy() + result.loc[result.b.isnull(), 'a'] = result.a + assert_frame_equal(result, df) + def test_delitem_corner(self): f = self.frame.copy() del f['D'] @@ -2376,6 +2386,32 @@ def test_set_index_pass_arrays(self): expected = df.set_index(['A', 'B'], drop=False) assert_frame_equal(result, expected, check_names=False) # TODO should set_index check_names ?
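# A minimal sketch, not part of the patch, of what the new test below
# verifies: as of 0.16.1, set_index() on a category-dtype column produces a
# CategoricalIndex (rather than a plain object Index) and survives a
# reset_index()/set_index() round trip. Assumes pandas >= 0.16.1; the names
# below are illustrative only.
import numpy as np
import pandas as pd

ci = pd.CategoricalIndex(list('aabbca'), categories=list('cab'), name='B')
df = pd.DataFrame({'A': np.arange(6), 'B': ci.values})  # ci.values is the Categorical
idf = df.set_index('B')
assert isinstance(idf.index, pd.CategoricalIndex)  # category dtype preserved
assert idf.reset_index().set_index('B').index.equals(idf.index)  # round-trips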
+ def test_construction_with_categorical_index(self): + + ci = tm.makeCategoricalIndex(10) + + # with Categorical + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci.values }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + # from a CategoricalIndex + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + idf = df.set_index('B').reset_index().set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + new_df = idf.reset_index() + new_df.index = df.B + tm.assert_index_equal(new_df.index,ci) + def test_set_index_cast_datetimeindex(self): df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], @@ -9838,6 +9874,110 @@ def test_where_complex(self): df[df.abs() >= 5] = np.nan assert_frame_equal(df,expected) + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype='float64') + result = df.where(mask, s, axis='index') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='index', inplace=True) + assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype='float64') + result = df.where(mask, s, axis='columns') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype='int64') + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64') + result = df.where(mask, s, axis='index') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='index', inplace=True) + assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]], dtype='float64') + result = df.where(mask, s, axis='columns') + assert_frame_equal(result, expected) + + expected = DataFrame({0 : np.array([0, 0], dtype='int64'), + 1 : np.array([np.nan, np.nan], dtype='float64')}) + result = df.copy() + result.where(mask, s, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + # Multiple dtypes (=> multiple Blocks) + df = pd.concat([DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)))], + ignore_index=True, axis=1) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis='columns') + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype(int) + expected[3] = expected[3].astype(int) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis='index') + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype(int) + expected[3] = expected[3].astype(int) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis='index', inplace=True) + assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + assert_frame_equal(result, expected) + result = df.where(mask, d1, 
axis='index') + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True, axis='index') + assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + assert_frame_equal(result, expected) + result = df.where(mask, d2, axis='columns') + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis='columns') + assert_frame_equal(result, expected) + def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0 @@ -10734,6 +10874,19 @@ def test_sort_index(self): with assertRaisesRegexp(ValueError, msg): frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_index_categorical_index(self): + + df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + + result = df.sort_index() + expected = df.iloc[[4,0,1,5,2,3]] + assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[3,2,5,1,0,4]] + assert_frame_equal(result, expected) + def test_sort_nan(self): # GH3917 nan = np.nan @@ -14204,6 +14357,27 @@ def _constructor(self): # GH9776 self.assertEqual(df.iloc[0:1, :].testattr, 'XXX') + def test_to_panel_expanddim(self): + # GH 9762 + + class SubclassedFrame(DataFrame): + @property + def _constructor_expanddim(self): + return SubclassedPanel + + class SubclassedPanel(Panel): + pass + + index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)]) + df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index) + result = df.to_panel() + self.assertTrue(isinstance(result, SubclassedPanel)) + expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]], + items=['X', 'Y'], major_axis=[0], + minor_axis=[0, 1, 2], + dtype='int64') + tm.assert_panel_equal(result, expected) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 7ec57c0304530..33c88b0e3b4b7 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1534,6 +1534,19 @@ def test_subplots_ts_share_axes(self): for ax in axes[[0, 1, 2], [2]].ravel(): self._check_visible(ax.get_yticklabels(), visible=False) + def test_subplots_sharex_axes_existing_axes(self): + # GH 9158 + d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]} + df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14')) + + axes = df[['A', 'B']].plot(subplots=True) + df['C'].plot(ax=axes[0], secondary_y=True) + + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + for ax in axes.ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + def test_negative_log(self): df = - DataFrame(rand(6, 4), index=list(string.ascii_letters[:6]), diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 87536b9bf0ff8..7af53c88f0f72 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,7 +8,7 @@ from numpy import nan from pandas import date_range,bdate_range, Timestamp -from pandas.core.index import Index, MultiIndex, Int64Index +from pandas.core.index import Index, MultiIndex, Int64Index, CategoricalIndex from 
pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) @@ -3378,12 +3378,11 @@ def test_groupby_datetime_categorical(self): cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() expected = expected.reindex(levels) - expected.index.name = 'myfactor' + expected.index = CategoricalIndex(expected.index,categories=expected.index,name='myfactor',ordered=True) assert_frame_equal(result, expected) self.assertEqual(result.index.name, cats.name) @@ -3398,6 +3397,26 @@ def test_groupby_datetime_categorical(self): expected.index.names = ['myfactor', None] assert_frame_equal(desc_result, expected) + def test_groupby_categorical_index(self): + + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) + df = DataFrame(np.repeat(np.arange(20),4).reshape(-1,4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex @@ -3526,6 +3545,8 @@ def test_groupby_categorical_no_compress(self): result = data.groupby(cats).mean() exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) @@ -3533,6 +3554,7 @@ def test_groupby_categorical_no_compress(self): result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], @@ -5061,6 +5083,17 @@ def test_groupby_categorical_two_columns(self): "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) tm.assert_frame_equal(res, exp) + def test_groupby_apply_all_none(self): + # Tests to make sure no errors if apply function returns all None + # values. Issue 9684. 
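# (Editor's note, a reading of the assertions below rather than part of the
# patch: groupby.apply concatenates whatever each group returns, so when
# every group returns None there is nothing to concatenate and an empty
# DataFrame comes back instead of an error.)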
+ test_df = DataFrame({'groups': [0,0,1,1], 'random_vars': [8,7,4,5]}) + + def test_func(x): + pass + result = test_df.groupby('groups').apply(test_func) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 336340dd95991..3c9dbd2e48cb6 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,14 +12,10 @@ import numpy as np from numpy.testing import assert_array_equal -from pandas import period_range, date_range - -from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, - InvalidIndexError, NumericIndex) -from pandas.tseries.index import DatetimeIndex -from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex -from pandas.core.series import Series +from pandas import (period_range, date_range, Categorical, Series, + Index, Float64Index, Int64Index, MultiIndex, + CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) +from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) from pandas import compat @@ -41,6 +37,11 @@ class Base(object): _holder = None _compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes'] + def setup_indices(self): + # set up the test indices in the self.indices dict + for name, ind in self.indices.items(): + setattr(self, name, ind) + def verify_pickle(self,index): unpickled = self.round_trip_pickle(index) self.assertTrue(index.equals(unpickled)) @@ -98,6 +99,7 @@ def f(): def test_reindex_base(self): idx = self.create_index() expected = np.arange(idx.size) + actual = idx.get_indexer(idx) assert_array_equal(expected, actual) @@ -118,29 +120,6 @@ def test_ndarray_compat_properties(self): idx.nbytes idx.values.nbytes - -class TestIndex(Base, tm.TestCase): - _holder = Index - _multiprocess_can_split_ = True - - def setUp(self): - self.indices = dict( - unicodeIndex = tm.makeUnicodeIndex(100), - strIndex = tm.makeStringIndex(100), - dateIndex = tm.makeDateIndex(100), - intIndex = tm.makeIntIndex(100), - floatIndex = tm.makeFloatIndex(100), - boolIndex = Index([True,False]), - empty = Index([]), - tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], - [1, 2, 3])) - ) - for name, ind in self.indices.items(): - setattr(self, name, ind) - - def create_index(self): - return Index(list('abcde')) - def test_wrong_number_names(self): def testit(ind): ind.names = ["apple", "banana", "carrot"] @@ -150,14 +129,18 @@ def testit(ind): def test_set_name_methods(self): new_name = "This is the new name for this index" - indices = (self.dateIndex, self.intIndex, self.unicodeIndex, - self.empty) - for ind in indices: + for ind in self.indices.values(): + + # don't test a MultiIndex here (as it's tested separately) + if isinstance(ind, MultiIndex): + continue + original_name = ind.name new_ind = ind.set_names([new_name]) self.assertEqual(new_ind.name, new_name) self.assertEqual(ind.name, original_name) res = ind.rename(new_name, inplace=True) + # should return None self.assertIsNone(res) self.assertEqual(ind.name, new_name) @@ -167,46 +150,128 @@ def test_set_name_methods(self): # ind.set_names("a") with assertRaisesRegexp(ValueError, "Level must be None"): ind.set_names("a", level=0) - # rename in place just leaves tuples and other containers alone - name = ('A', 'B') - ind = self.intIndex - ind.rename(name, inplace=True) - self.assertEqual(ind.name,
name) - self.assertEqual(ind.names, [name]) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.strIndex).__name__): - hash(self.strIndex) + # rename in place just leaves tuples and other containers alone + name = ('A', 'B') + ind.rename(name, inplace=True) + self.assertEqual(ind.name, name) + self.assertEqual(ind.names, [name]) - def test_new_axis(self): - new_index = self.dateIndex[None, :] - self.assertEqual(new_index.ndim, 2) - tm.assert_isinstance(new_index, np.ndarray) + def test_hash_error(self): + for ind in self.indices.values(): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(ind).__name__): + hash(ind) def test_copy_and_deepcopy(self): from copy import copy, deepcopy - for func in (copy, deepcopy): - idx_copy = func(self.strIndex) - self.assertIsNot(idx_copy, self.strIndex) - self.assertTrue(idx_copy.equals(self.strIndex)) + for ind in self.indices.values(): - new_copy = self.strIndex.copy(deep=True, name="banana") - self.assertEqual(new_copy.name, "banana") - new_copy2 = self.intIndex.copy(dtype=int) - self.assertEqual(new_copy2.dtype.kind, 'i') + # don't test a MultiIndex here (as it's tested separately) + if isinstance(ind, MultiIndex): + continue + + for func in (copy, deepcopy): + idx_copy = func(ind) + self.assertIsNot(idx_copy, ind) + self.assertTrue(idx_copy.equals(ind)) + + new_copy = ind.copy(deep=True, name="banana") + self.assertEqual(new_copy.name, "banana") def test_duplicates(self): - idx = Index([0, 0, 0]) - self.assertFalse(idx.is_unique) + for ind in self.indices.values(): + + if not len(ind): + continue + idx = self._holder([ind[0]]*5) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) def test_sort(self): - self.assertRaises(TypeError, self.strIndex.sort) + for ind in self.indices.values(): + self.assertRaises(TypeError, ind.sort) def test_mutability(self): - self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo') + for ind in self.indices.values(): + if not len(ind): + continue + self.assertRaises(TypeError, ind.__setitem__, 0, ind[0]) + + def test_view(self): + for ind in self.indices.values(): + i_view = ind.view() + self.assertEqual(i_view.name, ind.name) + + def test_compat(self): + for ind in self.indices.values(): + self.assertEqual(ind.tolist(),list(ind)) + + def test_argsort(self): + for k, ind in self.indices.items(): + + # tested separately + if k in ['catIndex']: + continue + + result = ind.argsort() + expected = np.array(ind).argsort() + self.assert_numpy_array_equal(result, expected) + + def test_pickle(self): + for ind in self.indices.values(): + self.verify_pickle(ind) + ind.name = 'foo' + self.verify_pickle(ind) + + def test_take(self): + indexer = [4, 3, 0, 2] + for k, ind in self.indices.items(): + + # tested separately + if k in ['boolIndex','tuples','empty']: + continue + + result = ind.take(indexer) + expected = ind[indexer] + self.assertTrue(result.equals(expected)) + +class TestIndex(Base, tm.TestCase): + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict( + unicodeIndex = tm.makeUnicodeIndex(100), + strIndex = tm.makeStringIndex(100), + dateIndex = tm.makeDateIndex(100), + periodIndex = tm.makePeriodIndex(100), + tdIndex = tm.makeTimedeltaIndex(100), + intIndex = tm.makeIntIndex(100), + floatIndex = tm.makeFloatIndex(100), + boolIndex = Index([True,False]), + catIndex = tm.makeCategoricalIndex(100), + empty = Index([]), + tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) )
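# setup_indices (defined on the shared Base class above) mirrors each entry
# of self.indices onto an attribute of the same name, so tests can keep
# referring to e.g. self.strIndex or self.catIndex directly.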
+ self.setup_indices() + + def create_index(self): + return Index(list('abcde')) + + def test_new_axis(self): + new_index = self.dateIndex[None, :] + self.assertEqual(new_index.ndim, 2) + tm.assert_isinstance(new_index, np.ndarray) + + def test_copy_and_deepcopy(self): + super(TestIndex, self).test_copy_and_deepcopy() + + new_copy2 = self.intIndex.copy(dtype=int) + self.assertEqual(new_copy2.dtype.kind, 'i') def test_constructor(self): # regular instance creation @@ -297,18 +362,22 @@ def test_constructor_simple_new(self): result = idx._simple_new(idx, 'obj') self.assertTrue(result.equals(idx)) - def test_copy(self): - i = Index([], name='Foo') - i_copy = i.copy() - self.assertEqual(i_copy.name, 'Foo') + def test_view_with_args(self): - def test_view(self): - i = Index([], name='Foo') - i_view = i.view() - self.assertEqual(i_view.name, 'Foo') + restricted = ['unicodeIndex','strIndex','catIndex','boolIndex','empty'] + + for i in restricted: + ind = self.indices[i] + + # with arguments + self.assertRaises(TypeError, lambda : ind.view('i8')) - # with arguments - self.assertRaises(TypeError, lambda : i.view('i8')) + # these are ok + for i in list(set(self.indices.keys())-set(restricted)): + ind = self.indices[i] + + # with arguments + ind.view('i8') def test_legacy_pickle_identity(self): @@ -330,9 +399,6 @@ def test_astype(self): casted = self.intIndex.astype('i8') self.assertEqual(casted.name, 'foobar') - def test_compat(self): - self.strIndex.tolist() - def test_equals(self): # same self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) @@ -459,11 +525,6 @@ def test_nanosecond_index_access(self): self.assertEqual(first_value, x[Timestamp(np.datetime64('2013-01-01 00:00:00.000000050+0000', 'ns'))]) - def test_argsort(self): - result = self.strIndex.argsort() - expected = np.array(self.strIndex).argsort() - self.assert_numpy_array_equal(result, expected) - def test_comparators(self): index = self.dateIndex element = index[len(index) // 2] @@ -760,22 +821,17 @@ def test_symmetric_diff(self): with tm.assertRaises(TypeError): Index(idx1,dtype='object') - 1 - def test_pickle(self): - - self.verify_pickle(self.strIndex) - self.strIndex.name = 'foo' - self.verify_pickle(self.strIndex) - self.verify_pickle(self.dateIndex) - def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) self.assertFalse(self.strIndex.is_numeric()) self.assertTrue(self.intIndex.is_numeric()) self.assertTrue(self.floatIndex.is_numeric()) + self.assertFalse(self.catIndex.is_numeric()) def test_is_object(self): self.assertTrue(self.strIndex.is_object()) self.assertTrue(self.boolIndex.is_object()) + self.assertFalse(self.catIndex.is_object()) self.assertFalse(self.intIndex.is_object()) self.assertFalse(self.dateIndex.is_object()) self.assertFalse(self.floatIndex.is_object()) @@ -839,12 +895,6 @@ def test_format_none(self): idx.format() self.assertIsNone(idx[3]) - def test_take(self): - indexer = [4, 3, 0, 2] - result = self.dateIndex.take(indexer) - expected = self.dateIndex[indexer] - self.assertTrue(result.equals(expected)) - def test_logical_compat(self): idx = self.create_index() self.assertEqual(idx.all(), idx.values.all()) @@ -857,6 +907,7 @@ def _check_method_works(self, method): method(self.strIndex) method(self.intIndex) method(self.tuples) + method(self.catIndex) def test_get_indexer(self): idx1 = Index([1, 2, 3, 4, 5]) @@ -1232,6 +1283,14 @@ def test_str_attribute(self): expected = Series(range(2), index=['a1', 'a2']) tm.assert_series_equal(s[s.index.str.startswith('a')], expected) + 
def test_tab_completion(self): + # GH 9910 + idx = Index(list('abcd')) + self.assertTrue('str' in dir(idx)) + + idx = Index(range(4)) + self.assertTrue('str' not in dir(idx)) + def test_indexing_doesnt_change_class(self): idx = Index([1, 2, 3, 'a', 'b', 'c']) @@ -1338,6 +1397,352 @@ def test_equals_op(self): index_b == index_a, ) +class TestCategoricalIndex(Base, tm.TestCase): + _holder = CategoricalIndex + + def setUp(self): + self.indices = dict(catIndex = tm.makeCategoricalIndex(100)) + self.setup_indices() + + def create_index(self, categories=None, ordered=False): + if categories is None: + categories = list('cab') + return CategoricalIndex(list('aabbca'), categories=categories, ordered=ordered) + + def test_construction(self): + + ci = self.create_index(categories=list('abcd')) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + result = Index(ci.values) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + # empty + result = CategoricalIndex(categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([],dtype='int8')) + self.assertFalse(result.ordered) + + # passing categories + result = CategoricalIndex(list('aabbca'),categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + + c = pd.Categorical(list('aabbca')) + result = CategoricalIndex(c) + self.assertTrue(result.categories.equals(Index(list('abc')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(c,categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + ci = CategoricalIndex(c,categories=list('abcd')) + result = CategoricalIndex(ci) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab')) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab'), ordered=True) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertTrue(result.ordered) + + # turn me to an Index + result = Index(np.array(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = self.create_index(categories=list('abc')) + + result = Index(np.array(ci), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + result = Index(np.array(ci).tolist(), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + # these are generally only equal when the categories are reordered + ci = self.create_index() + + result = Index(np.array(ci), dtype='category').reorder_categories(ci.categories) + tm.assert_index_equal(result,ci,exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0,1,2], categories=[0,1,2], 
ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_method_delegation(self): + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.set_categories(list('cab')) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.rename_categories(list('efg')) + tm.assert_index_equal(result, CategoricalIndex(list('ffggef'), categories=list('efg'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.add_categories(['d']) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabd'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.remove_categories(['c']) + tm.assert_index_equal(result, CategoricalIndex(list('aabb') + [np.nan] + ['a'], categories=list('ab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_unordered() + tm.assert_index_equal(result, ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_ordered() + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabdef'), ordered=True)) + + # invalid + self.assertRaises(ValueError, lambda : ci.set_categories(list('cab'), inplace=True)) + + def test_contains(self): + + ci = self.create_index(categories=list('cabdef')) + + self.assertTrue('a' in ci) + self.assertTrue('z' not in ci) + self.assertTrue('e' not in ci) + self.assertTrue(np.nan not in ci) + + # assert codes NOT in index + self.assertFalse(0 in ci) + self.assertFalse(1 in ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef') + [np.nan]) + self.assertFalse(np.nan in ci) + + ci = CategoricalIndex(list('aabbca') + [np.nan], categories=list('cabdef') + [np.nan]) + self.assertTrue(np.nan in ci) + + def test_min_max(self): + + ci = self.create_index(ordered=False) + self.assertRaises(TypeError, lambda : ci.min()) + self.assertRaises(TypeError, lambda : ci.max()) + + ci = self.create_index(ordered=True) + + self.assertEqual(ci.min(),'c') + self.assertEqual(ci.max(),'b') + + def test_append(self): + + ci = self.create_index() + categories = ci.categories + + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result,ci,exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result,ci,exact=True) + + # empty + result = ci.append([]) + tm.assert_index_equal(result,ci,exact=True) + + # appending with different or reordered categories is not ok + self.assertRaises(TypeError, lambda : ci.append(ci.values.set_categories(list('abcd')))) + self.assertRaises(TypeError, lambda : ci.append(ci.values.reorder_categories(list('abc')))) + + # with objects + result = ci.append(['c','a']) + expected = CategoricalIndex(list('aabbcaca'), categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid objects + self.assertRaises(TypeError, lambda : ci.append(['a','d'])) + + def test_insert(self): + + ci = self.create_index() + categories = ci.categories + + # test 0th element + result = ci.insert(0, 'a') + expected = CategoricalIndex(list('aaabbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # test Nth element that follows Python list behavior + result = ci.insert(-1, 'a') + expected =
CategoricalIndex(list('aabbcaa'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # test empty + result = CategoricalIndex(categories=categories).insert(0, 'a') + expected = CategoricalIndex(['a'],categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid + self.assertRaises(TypeError, lambda : ci.insert(0,'d')) + + def test_delete(self): + + ci = self.create_index() + categories = ci.categories + + result = ci.delete(0) + expected = CategoricalIndex(list('abbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + result = ci.delete(-1) + expected = CategoricalIndex(list('aabbc'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + with tm.assertRaises((IndexError, ValueError)): + # either, depending on the numpy version + result = ci.delete(10) + + def test_astype(self): + + ci = self.create_index() + result = ci.astype('category') + tm.assert_index_equal(result,ci,exact=True) + + result = ci.astype(object) + self.assertTrue(result.equals(Index(np.array(ci)))) + + # this IS equal, but not the same class + self.assertTrue(result.equals(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_reindex_base(self): + + # determined by cat ordering + idx = self.create_index() + expected = np.array([4,0,1,5,2,3]) + + actual = idx.get_indexer(idx) + assert_array_equal(expected, actual) + + with tm.assertRaisesRegexp(ValueError, 'Invalid fill method'): + idx.get_indexer(idx, method='invalid') + + def test_reindexing(self): + + ci = self.create_index() + oidx = Index(np.array(ci)) + + for n in [1,2,5,len(ci)]: + finder = oidx[np.random.randint(0,len(ci),size=n)] + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + assert_array_equal(expected, actual) + + def test_duplicates(self): + + idx = CategoricalIndex([0, 0, 0]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) + + def test_get_indexer(self): + + idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc')) + idx2 = CategoricalIndex(list('abf')) + + for indexer in [idx2, list('abf'), Index(list('abf'))]: + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [0, 1, 2, -1]) + + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='pad')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='backfill')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='nearest')) + + def test_repr(self): + + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + str(ci) + tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + + # formatting + if compat.PY3: + str(ci) + else: + compat.text_type(ci) + + # long format + ci = CategoricalIndex(np.random.randint(0,5,size=100)) + result = str(ci) + tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + + def test_isin(self): + + ci = CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]) + self.assert_numpy_array_equal(ci.isin(['c']),np.array([False,False,False,True,False,False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b']),np.array([True]*5 + [False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b',np.nan]),np.array([True]*6)) + + # mismatched categorical -> coerced to ndarray so doesn't matter + self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('abcdefghi'))),np.array([True]*6)) +
self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('defghi'))),np.array([False]*5 + [True])) + + def test_identical(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) + self.assertTrue(ci1.identical(ci1)) + self.assertTrue(ci1.identical(ci1.copy())) + self.assertFalse(ci1.identical(ci2)) + + def test_equals(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) + + self.assertTrue(ci1.equals(ci1)) + self.assertFalse(ci1.equals(ci2)) + self.assertTrue(ci1.equals(ci1.astype(object))) + self.assertTrue(ci1.astype(object).equals(ci1)) + + self.assertTrue((ci1 == ci1).all()) + self.assertFalse((ci1 != ci1).all()) + self.assertFalse((ci1 > ci1).all()) + self.assertFalse((ci1 < ci1).all()) + self.assertTrue((ci1 <= ci1).all()) + self.assertTrue((ci1 >= ci1).all()) + + self.assertFalse((ci1 == 1).all()) + self.assertTrue((ci1 == Index(['a','b'])).all()) + self.assertTrue((ci1 == ci1.values).all()) + + # invalid comparisons + self.assertRaises(TypeError, lambda : ci1 == Index(['a','b','c'])) + self.assertRaises(TypeError, lambda : ci1 == ci2) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, ordered=False)) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, categories=list('abc'))) + + # tests + # make sure that we are testing for category inclusion properly + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b']).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b',np.nan]).equals(list('aabca'))) + + self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan])) class Numeric(Base): @@ -1417,18 +1822,13 @@ class TestFloat64Index(Numeric, tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.mixed = Float64Index([1.5, 2, 3, 4, 5]) - self.float = Float64Index(np.arange(5) * 2.5) + self.indices = dict(mixed = Float64Index([1.5, 2, 3, 4, 5]), + float = Float64Index(np.arange(5) * 2.5)) + self.setup_indices() def create_index(self): return Float64Index(np.arange(5,dtype='float64')) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.float).__name__): - hash(self.float) - def test_repr_roundtrip(self): for ind in (self.mixed, self.float): tm.assert_index_equal(eval(repr(ind)), ind) @@ -1594,7 +1994,8 @@ class TestInt64Index(Numeric, tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.index = Int64Index(np.arange(0, 20, 2)) + self.indices = dict(index = Int64Index(np.arange(0, 20, 2))) + self.setup_indices() def create_index(self): return Int64Index(np.arange(5,dtype='int64')) @@ -1641,18 +2042,14 @@ def test_constructor_corner(self): with tm.assertRaisesRegexp(TypeError, 'casting'): Int64Index(arr_with_floats) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - def test_copy(self): i = Int64Index([], name='Foo') i_copy = i.copy() self.assertEqual(i_copy.name, 'Foo') def test_view(self): + super(TestInt64Index, self).test_view() + i = Int64Index([], name='Foo') i_view = i.view() self.assertEqual(i_view.name, 'Foo') @@ -2053,6 
+2450,7 @@ def test_slice_keep_name(self): class DatetimeLike(Base): def test_view(self): + super(DatetimeLike, self).test_view() i = self.create_index() @@ -2068,6 +2466,10 @@ class TestDatetimeIndex(DatetimeLike, tm.TestCase): _holder = DatetimeIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeDateIndex(10)) + self.setup_indices() + def create_index(self): return date_range('20130101',periods=5) @@ -2186,6 +2588,10 @@ class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makePeriodIndex(10)) + self.setup_indices() + def create_index(self): return period_range('20130101',periods=5,freq='D') @@ -2220,6 +2626,10 @@ class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeTimedeltaIndex(10)) + self.setup_indices() + def create_index(self): return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1) @@ -2294,9 +2704,10 @@ def setUp(self): major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) self.index_names = ['first', 'second'] - self.index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], - names=self.index_names, verify_integrity=False) + self.indices = dict(index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=self.index_names, verify_integrity=False)) + self.setup_indices() def create_index(self): return self.index @@ -2332,13 +2743,7 @@ def test_labels_dtypes(self): self.assertTrue((i.labels[0]>=0).all()) self.assertTrue((i.labels[1]>=0).all()) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - - def test_set_names_and_rename(self): + def test_set_name_methods(self): # so long as these are synonyms, we don't need to test set_names self.assertEqual(self.index.rename, self.index.set_names) new_names = [name + "SUFFIX" for name in self.index_names] @@ -3838,7 +4243,7 @@ def test_reindex_level(self): assertRaisesRegexp(TypeError, "Fill method not supported", idx.reindex, idx, method='bfill', level='first') - def test_has_duplicates(self): + def test_duplicates(self): self.assertFalse(self.index.has_duplicates) self.assertTrue(self.index.append(self.index).has_duplicates) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 5f109212add06..19ed799853ed4 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1438,6 +1438,13 @@ def test_iloc_setitem_series(self): result = s.iloc[:4] assert_series_equal(result, expected) + s= Series([-1]*6) + s.iloc[0::2]= [0,2,4] + s.iloc[1::2]= [1,3,5] + result = s + expected= Series([0,1,2,3,4,5]) + assert_series_equal(result, expected) + def test_iloc_setitem_list_of_lists(self): # GH 7551 @@ -2366,6 +2373,7 @@ def test_dups_fancy_indexing(self): rows = ['C','B','E'] expected = DataFrame({'test' : [11,9,np.nan], 'test1': [7.,6,np.nan], 'other': ['d','c',np.nan]},index=rows) + result = df.ix[rows] assert_frame_equal(result, expected) @@ -4422,6 +4430,212 @@ def test_indexing_assignment_dict_already_exists(self): tm.assert_frame_equal(df, expected) + +class TestCategoricalIndex(tm.TestCase): + + def setUp(self): + + self.df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) 
}).set_index('B') + self.df2 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cabe')) }).set_index('B') + self.df3 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=True) }).set_index('B') + self.df4 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=False) }).set_index('B') + + + def test_loc_scalar(self): + + result = self.df.loc['a'] + expected = DataFrame({'A' : [0,1,5], + 'B' : Series(list('aaa')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(result, expected) + + + df = self.df.copy() + df.loc['a'] = 20 + expected = DataFrame({'A' : [20,20,2,3,4,20], + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(df, expected) + + # value not in the categories + self.assertRaises(KeyError, lambda : df.loc['d']) + + def f(): + df.loc['d'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','A'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','C'] = 10 + self.assertRaises(TypeError, f) + + def test_loc_listlike(self): + + # list of labels + result = self.df.loc[['c','a']] + expected = self.df.iloc[[4,0,1,5]] + assert_frame_equal(result, expected) + + result = self.df2.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + # element in the categories but not in the values + self.assertRaises(KeyError, lambda : self.df2.loc['e']) + + # assign is ok + df = self.df2.copy() + df.loc['e'] = 20 + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,20], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + df = self.df2.copy() + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + + # not all labels in the categories + self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']]) + + def test_reindexing(self): + + # reindexing + # convert to a regular index + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : [0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['d']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['d']) }).set_index('B') + assert_frame_equal(result, expected) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list('cabe') + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a'],categories=cats)) + expected = DataFrame({'A' : [0,1,5], + 'B' : 
Series(list('aaa')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : [0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + # give back the type of categorical that we received + result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d'])) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B') + assert_frame_equal(result, expected) + + # passed duplicate indexers are not allowed + self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a'])) + + # args NotImplemented ATM + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],method='ffill')) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],level=1)) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],limit=2)) + + def test_loc_slice(self): + + # slicing + # not implemented ATM + # GH9748 + + self.assertRaises(TypeError, lambda : self.df.loc[1:5]) + + #result = df.loc[1:5] + #expected = df.iloc[[1,2,3,4]] + #assert_frame_equal(result, expected) + + def test_boolean_selection(self): + + df3 = self.df3 + df4 = self.df4 + + result = df3[df3.index == 'a'] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 'a'] + expected = df4.iloc[[]] + assert_frame_equal(result,expected) + + result = df3[df3.index == 1] + expected = df3.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 1] + expected = df4.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + # since we have an ordered categorical + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=True, + # name=u'B') + result = df3[df3.index < 2] + expected = df3.iloc[[4]] + assert_frame_equal(result,expected) + + result = df3[df3.index > 1] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + # unordered + # cannot be compared + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=False, + # name=u'B') + self.assertRaises(TypeError, lambda : df4[df4.index < 2]) + self.assertRaises(TypeError, lambda : df4[df4.index > 1]) + class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ # For numeric series, we should coerce to NaN. 
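A minimal sketch, not part of the patch, of the ordered-vs-unordered semantics the boolean-selection tests above rely on: inequality comparisons against a CategoricalIndex are only defined when the categorical is ordered, and they follow the category order (here 3 < 2 < 1), not numeric order. Assumes pandas >= 0.16.1:

    import numpy as np
    import pandas as pd

    # categories=[3, 2, 1] with ordered=True means 3 sorts before 2 before 1
    idx = pd.CategoricalIndex([1, 1, 2, 1, 3, 2],
                              categories=[3, 2, 1], ordered=True, name='B')
    df = pd.DataFrame({'A': np.arange(6)}, index=idx)
    df[df.index < 2]  # selects only the row labeled 3
    # the same comparison on an unordered CategoricalIndex raises TypeError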
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index c3b43f3ec70c0..f1a9e23796804 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -242,6 +242,26 @@ def test_dt_accessor_api(self):
             s.dt
         self.assertFalse(hasattr(s, 'dt'))
 
+    def test_tab_completion(self):
+        # GH 9910
+        s = Series(list('abcd'))
+        # Series of str values should have .str but not .dt/.cat in __dir__
+        self.assertTrue('str' in dir(s))
+        self.assertTrue('dt' not in dir(s))
+        self.assertTrue('cat' not in dir(s))
+
+        # similarly for .dt
+        s = Series(date_range('1/1/2015', periods=5))
+        self.assertTrue('dt' in dir(s))
+        self.assertTrue('str' not in dir(s))
+        self.assertTrue('cat' not in dir(s))
+
+        # similarly for .cat
+        s = Series(list('abbcd'), dtype="category")
+        self.assertTrue('cat' in dir(s))
+        self.assertTrue('str' not in dir(s))
+        self.assertTrue('dt' not in dir(s))
+
     def test_binop_maybe_preserve_name(self):
 
         # names match, preserve
@@ -6851,6 +6871,22 @@ def test_searchsorted_sorter(self):
         e = np.array([0, 2])
         tm.assert_array_equal(r, e)
 
+    def test_to_frame_expanddim(self):
+        # GH 9762
+
+        class SubclassedSeries(Series):
+            @property
+            def _constructor_expanddim(self):
+                return SubclassedFrame
+
+        class SubclassedFrame(DataFrame):
+            pass
+
+        s = SubclassedSeries([1, 2, 3], name='X')
+        result = s.to_frame()
+        self.assertTrue(isinstance(result, SubclassedFrame))
+        expected = SubclassedFrame({'X': [1, 2, 3]})
+        assert_frame_equal(result, expected)
 
 class TestSeriesNonUnique(tm.TestCase):
 
diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py
index bb8bd3df96b71..38f058358b37f 100644
--- a/pandas/tests/test_util.py
+++ b/pandas/tests/test_util.py
@@ -3,6 +3,7 @@
 
 import nose
 
+import sys
 import pandas.util
 from pandas.util.decorators import deprecate_kwarg
 import pandas.util.testing as tm
@@ -80,6 +81,9 @@ def test_warning(self):
             self.assertNotAlmostEquals(1, 2)
 
     def test_locale(self):
+        if sys.platform == 'win32':
+            raise nose.SkipTest("skipping on win platforms as locale not available")
+
         #GH9744
         locales = pandas.util.testing.get_locales()
         self.assertTrue(len(locales) >= 1)
diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 513f165af4686..268bd306585ad 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -1040,7 +1040,10 @@ def _adorn_subplots(self):
         if len(self.axes) > 0:
             all_axes = self._get_axes()
             nrows, ncols = self._get_axes_layout()
-            _handle_shared_axes(all_axes, len(all_axes), len(all_axes), nrows, ncols, self.sharex, self.sharey)
+            _handle_shared_axes(axarr=all_axes, nplots=len(all_axes),
+                                naxes=nrows * ncols, nrows=nrows,
+                                ncols=ncols, sharex=self.sharex,
+                                sharey=self.sharey)
 
         for ax in to_adorn:
             if self.yticks is not None:
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 2b37c64940170..f15de87dbd81c 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -61,13 +61,13 @@ def groupby(self, f):
         return _algos.groupby_object(objs, f)
 
     def _format_with_header(self, header, **kwargs):
-        return header + self._format_native_types(**kwargs)
+        return header + list(self._format_native_types(**kwargs))
 
     def __contains__(self, key):
         try:
             res = self.get_loc(key)
             return np.isscalar(res) or type(res) == slice or np.any(res)
-        except (KeyError, TypeError):
+        except (KeyError, TypeError, ValueError):
             return False
 
     @property
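With ``ValueError`` added to the exceptions caught in ``__contains__``, a membership test against a datetime-like index now degrades to ``False`` instead of letting a lookup failure escape. A sketch of the intended effect (the unparseable key is a hypothetical example):

.. code-block:: python

    import pandas as pd

    idx = pd.date_range('2015-01-01', periods=3)

    '2015-01-02' in idx   # True: get_loc resolves the label
    'not-a-date' in idx   # False: a ValueError raised during lookup is now caught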
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 7dac36a9ae5cc..7b0ff578b0d90 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -673,12 +673,13 @@ def _add_delta(self, delta):
 
     def _format_native_types(self, na_rep=u('NaT'),
                              date_format=None, **kwargs):
-        data = self.asobject
-        from pandas.core.format import Datetime64Formatter
-        return Datetime64Formatter(values=data,
-                                   nat_rep=na_rep,
-                                   date_format=date_format,
-                                   justify='all').get_result()
+        from pandas.core.format import _get_format_datetime64_from_values
+        format = _get_format_datetime64_from_values(self, date_format)
+
+        return tslib.format_array_from_datetime(self.asi8,
+                                                tz=self.tz,
+                                                format=format,
+                                                na_rep=na_rep)
 
     def to_datetime(self, dayfirst=False):
         return self.copy()
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index b1f0ba1f127fa..a4b754f5a6bbd 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -387,7 +387,7 @@ def to_datetime(self, dayfirst=False):
     qyear = _field_accessor('qyear', 1)
     days_in_month = _field_accessor('days_in_month', 11, "The number of days in the month")
     daysinmonth = days_in_month
-    
+
     def _get_object_array(self):
         freq = self.freq
         return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False)
@@ -687,7 +687,7 @@ def _format_native_types(self, na_rep=u('NaT'), **kwargs):
 
         imask = ~mask
         values[imask] = np.array([u('%s') % dt for dt in values[imask]])
-        return values.tolist()
+        return values
 
     def __array_finalize__(self, obj):
         if not self.ndim:  # pragma: no cover
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py
index c42802bdb31ad..6bdff5aab3cfd 100644
--- a/pandas/tseries/tests/test_base.py
+++ b/pandas/tseries/tests/test_base.py
@@ -745,6 +745,13 @@ def test_nonunique_contains(self):
                          ['00:01:00', '00:01:00', '00:00:01'])):
             tm.assertIn(idx[0], idx)
 
+    def test_unknown_attribute(self):
+        #GH 9680
+        tdi = pd.timedelta_range(start=0,periods=10,freq='1s')
+        ts = pd.Series(np.random.normal(size=10),index=tdi)
+        self.assertNotIn('foo',ts.__dict__.keys())
+        self.assertRaises(AttributeError,lambda : ts.foo)
+
 
 class TestPeriodIndexOps(Ops):
 
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index c338bbeae79c7..2ae311e044a75 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -875,23 +875,23 @@ def test_resmaple_dst_anchor(self):
         # 5172
         dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
         df = DataFrame([5], index=dti)
-        assert_frame_equal(df.resample(rule='D', how='sum'), 
+        assert_frame_equal(df.resample(rule='D', how='sum'),
                            DataFrame([5], index=df.index.normalize()))
         df.resample(rule='MS', how='sum')
         assert_frame_equal(df.resample(rule='MS', how='sum'),
-                           DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], 
+                           DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
                                                               tz='US/Eastern')))
 
         dti = date_range('2013-09-30', '2013-11-02', freq='30Min', tz='Europe/Paris')
         values = range(dti.size)
         df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype='int64')
         how = {"a": "min", "b": "max", "c": "count"}
-        
+
         assert_frame_equal(df.resample("W-MON", how=how)[["a", "b", "c"]],
                            DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
                                       "b": [47, 383, 719, 1055, 1393, 1586],
                                       "c": [48, 336, 336, 336, 338, 193]},
-                                     index=date_range('9/30/2013', '11/4/2013', 
+                                     index=date_range('9/30/2013', '11/4/2013',
                                                       freq='W-MON', tz='Europe/Paris')),
                            'W-MON Frequency')
 
@@ -899,7 +899,7 @@ def test_resmaple_dst_anchor(self):
                            DataFrame({"a": [0, 48, 720, 1394],
                                       "b": [47, 719, 1393, 1586],
                                       "c": [48, 672, 674, 193]},
-                                     index=date_range('9/30/2013', '11/11/2013', 
+                                     index=date_range('9/30/2013', '11/11/2013',
                                                       freq='2W-MON', tz='Europe/Paris')),
                            '2W-MON Frequency')
 
@@ -907,7 +907,7 @@ def test_resmaple_dst_anchor(self):
                            DataFrame({"a": [0, 48, 1538],
                                       "b": [47, 1537, 1586],
                                       "c": [48, 1490, 49]},
-                                     index=date_range('9/1/2013', '11/1/2013', 
+                                     index=date_range('9/1/2013', '11/1/2013',
                                                       freq='MS', tz='Europe/Paris')),
                            'MS Frequency')
 
@@ -915,7 +915,7 @@ def test_resmaple_dst_anchor(self):
                            DataFrame({"a": [0, 1538],
                                       "b": [1537, 1586],
                                       "c": [1538, 49]},
-                                     index=date_range('9/1/2013', '11/1/2013', 
+                                     index=date_range('9/1/2013', '11/1/2013',
                                                       freq='2MS', tz='Europe/Paris')),
                            '2MS Frequency')
 
@@ -1553,6 +1553,8 @@ def test_aggregate_with_nat(self):
             expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
             dt_result = getattr(dt_grouped, func)()
             assert_series_equal(expected, dt_result)
+            # GH 9925
+            self.assertEqual(dt_result.index.name, 'key')
 
         # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' doesn't work yet
 
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 7580fa5489e15..40dbbd7584c7a 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -1398,6 +1398,69 @@ def parse_datetime_string(date_string, **kwargs):
         dt = parse_date(date_string, **kwargs)
         return dt
 
+def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None):
+    """
+    return an object ndarray of the string-formatted values
+
+    Parameters
+    ----------
+    values : a 1-d i8 array
+    tz : the timezone (or None)
+    format : optional, default is None
+          a strftime capable string
+    na_rep : optional, default is None
+          a NaT representation
+
+    """
+    cdef:
+        int64_t val, ns, N = len(values)
+        ndarray[object] result = np.empty(N, dtype=object)
+        object ts, res
+        pandas_datetimestruct dts
+
+    if na_rep is None:
+        na_rep = 'NaT'
+
+    for i in range(N):
+        val = values[i]
+
+        if val == iNaT:
+            result[i] = na_rep
+        else:
+            if format is None and tz is None:
+
+                pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts)
+                res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year,
+                                                       dts.month,
+                                                       dts.day,
+                                                       dts.hour,
+                                                       dts.min,
+                                                       dts.sec)
+
+                ns = dts.ps / 1000
+
+                if ns != 0:
+                    res += '.%.9d' % (ns + 1000 * dts.us)
+                elif dts.us != 0:
+                    res += '.%.6d' % dts.us
+
+                result[i] = res
+
+            else:
+                ts = Timestamp(val, tz=tz)
+                if format is None:
+                    result[i] = str(ts)
+                else:
+
+                    # invalid format string
+                    # requires dates > 1900
+                    try:
+                        result[i] = ts.strftime(format)
+                    except ValueError:
+                        result[i] = str(ts)
+
+    return result
+
 def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                       format=None, utc=None, coerce=False, unit=None):
     cdef:
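The default branch of ``format_array_from_datetime`` builds the timestamp string straight from the ``i8`` value and appends a fractional part only when sub-second fields are set: nine digits when nanoseconds are present, six when only microseconds are. The same truncation rule, restated as a small pure-Python sketch (illustrative only, not the Cython implementation):

.. code-block:: python

    from datetime import datetime

    def format_with_fraction(dt, ns=0):
        # seconds-resolution base, e.g. '2015-04-18 09:30:00'
        res = dt.strftime('%Y-%m-%d %H:%M:%S')
        if ns:                  # nanoseconds present -> 9 fractional digits
            res += '.%09d' % (ns + 1000 * dt.microsecond)
        elif dt.microsecond:    # microseconds only -> 6 fractional digits
            res += '.%06d' % dt.microsecond
        return res

    format_with_fraction(datetime(2015, 4, 18, 9, 30, 0, 123456))
    # '2015-04-18 09:30:00.123456'
    format_with_fraction(datetime(2015, 4, 18, 9, 30), ns=42)
    # '2015-04-18 09:30:00.000000042'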
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index b4baedada46e1..ea7354a9334ff 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -25,11 +25,6 @@
 import pandas as pd
 from pandas.core.common import is_sequence, array_equivalent, is_list_like
-import pandas.core.index as index
-import pandas.core.series as series
-import pandas.core.frame as frame
-import pandas.core.panel as panel
-import pandas.core.panel4d as panel4d
 import pandas.compat as compat
 from pandas.compat import(
     filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
@@ -38,24 +33,12 @@
 
 from pandas.computation import expressions as expr
 
-from pandas import bdate_range
-from pandas.tseries.index import DatetimeIndex
-from pandas.tseries.tdi import TimedeltaIndex
-from pandas.tseries.period import PeriodIndex
+from pandas import (bdate_range, CategoricalIndex, DatetimeIndex,
+                    TimedeltaIndex, PeriodIndex,
+                    Index, MultiIndex, Series, DataFrame, Panel, Panel4D)
 from pandas.util.decorators import deprecate
-
 from pandas import _testing
-
-
 from pandas.io.common import urlopen
 
-Index = index.Index
-MultiIndex = index.MultiIndex
-Series = series.Series
-DataFrame = frame.DataFrame
-Panel = panel.Panel
-Panel4D = panel4d.Panel4D
-
 N = 30
 K = 4
 _RAISE_NETWORK_ERROR_DEFAULT = False
@@ -550,16 +533,14 @@ def assert_equal(a, b, msg=""):
     assert a == b, "%s: %r != %r" % (msg.format(a,b), a, b)
 
-def assert_index_equal(left, right):
+def assert_index_equal(left, right, exact=False):
     assert_isinstance(left, Index, '[index] ')
     assert_isinstance(right, Index, '[index] ')
-    if not left.equals(right):
+    if not left.equals(right) or (exact and type(left) != type(right)):
         raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype, left, right, right.dtype))
-
-
 def assert_attr_equal(attr, left, right):
     """checks attributes are equal. Both objects must have attribute."""
     left_attr = getattr(left, attr)
@@ -627,6 +608,7 @@ def assertNotIsInstance(obj, cls, msg=''):
 
 def assert_categorical_equal(res, exp):
+
     if not array_equivalent(res.categories, exp.categories):
         raise AssertionError(
             'categories not equivalent: {0} vs {1}.'.format(res.categories,
@@ -827,6 +809,11 @@ def makeStringIndex(k=10):
 
 def makeUnicodeIndex(k=10):
     return Index(randu_array(nchars=10, size=k))
 
+def makeCategoricalIndex(k=10, n=3):
+    """ make a length k index of n categories """
+    x = rands_array(nchars=4, size=n)
+    return CategoricalIndex(np.random.choice(x,k))
+
 def makeBoolIndex(k=10):
     if k == 1:
         return Index([True])
diff --git a/setup.py b/setup.py
index 8066a6e0cae4f..f0090aff31430 100755
--- a/setup.py
+++ b/setup.py
@@ -602,7 +602,7 @@ def pxd(name):
       ],
       package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
                                   'tests/data/legacy_pickle/*/*.pickle',
-                                  'tests/data/*.csv',
+                                  'tests/data/*.csv*',
                                   'tests/data/*.dta',
                                   'tests/data/*.txt',
                                   'tests/data/*.xls',
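For reference, a short sketch of how the new testing helpers combine (a hypothetical session; with ``exact=True``, ``assert_index_equal`` requires matching index types in addition to equal values):

.. code-block:: python

    import pandas.util.testing as tm

    # ten labels drawn (with repeats) from three 4-character categories
    ci = tm.makeCategoricalIndex(k=10, n=3)

    # passes: equal values and, with exact=True, the same index type
    tm.assert_index_equal(ci, ci, exact=True)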