From b83ebc2ee229e692594ed53aaea70740bd228003 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 10 Jul 2014 22:34:08 +0100 Subject: [PATCH] ENH: Add uint and bool support in to_stata Added support for uint (uint8, uint16 and uint32, but not uint64) and bool datatypes in to_stata. Added an explanation of supported data types in io.rst. closes #7097 and closes #7365 --- doc/source/io.rst | 25 +++++++++++++++++++++++++ doc/source/v0.15.0.txt | 2 +- pandas/io/stata.py | 35 +++++++++++++++++++++++++++++++---- pandas/io/tests/test_stata.py | 23 +++++++++++++++++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index fa6ab646a47c8..d82af333330db 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3504,6 +3504,31 @@ into a .dta file. The format version of this file is always 115 (Stata 12). df = DataFrame(randn(10, 2), columns=list('AB')) df.to_stata('stata.dta') +*Stata* data files have limited data type support; only strings with 244 or +fewer characters, ``int8``, ``int16``, ``int32`` and ``float64`` can be stored +in ``.dta`` files. Additionally, *Stata* reserves certain values to represent +missing data, and when a value is encountered outside of the +permitted range, the data type is upcast to the next larger size. For +example, ``int8`` values are restricted to lie between -127 and 100, and so +variables with values above 100 will trigger a conversion to ``int16``. ``nan`` +values in floating point data types are stored as the basic missing data type +(``.`` in *Stata*). It is not possible to indicate missing data values for +integer data types. + +The *Stata* writer gracefully handles other data types including ``int64``, +``bool``, ``uint8``, ``uint16``, ``uint32`` and ``float32`` by upcasting to +the smallest supported type that can represent the data. 
For example, data +with a type of ``uint8`` will be cast to ``int8`` if all values are at most +100 (the upper bound for non-missing ``int8`` data in *Stata*), or, if values are +outside of this range, the data is cast to ``int16``. + + +.. warning:: + + Conversion from ``int64`` to ``float64`` may result in a loss of precision + if ``int64`` values are larger than 2**53. + + .. _io.stata_reader: Reading from STATA format diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index f305d088e996f..3ccd8c14fbff8 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -114,7 +114,7 @@ Known Issues Enhancements ~~~~~~~~~~~~ - +- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ed6b540b890a2..48a5f5ee6c994 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -206,6 +206,7 @@ class InvalidColumnName(Warning): underscores, no Stata reserved words) """ + def _cast_to_stata_types(data): """Checks the dtypes of the columns of a pandas DataFrame for compatibility with the data types and ranges supported by Stata, and @@ -218,18 +219,44 @@ def _cast_to_stata_types(data): Notes ----- - Numeric columns must be one of int8, int16, int32, float32 or float64, with - some additional value restrictions on the integer data types. int8 and - int16 columns are checked for violations of the value restrictions and + Numeric columns in Stata must be one of int8, int16, int32, float32 or + float64, with some additional value restrictions. int8 and int16 columns + are checked for violations of the value restrictions and upcast if needed. int64 data is not usable in Stata, and so it is downcast to int32 whenever the value are in the int32 range, and sidecast to float64 when larger than this range. If the int64 values are outside of the range of those perfectly representable as float64 values, a warning is raised. + + bool columns are cast to int8. 
uint columns are converted to int of the same + size if there is no loss in precision, otherwise they are upcast to a larger + type. uint64 is currently not supported since it is converted to object in + a DataFrame. """ ws = '' + # original, if small, if large + conversion_data = ((np.bool, np.int8, np.int8), + (np.uint8, np.int8, np.int16), + (np.uint16, np.int16, np.int32), + (np.uint32, np.int32, np.int64)) + for col in data: dtype = data[col].dtype + # Cast from unsupported types to supported types + for c_data in conversion_data: + if dtype == c_data[0]: + if data[col].max() <= np.iinfo(c_data[1]).max: + dtype = c_data[1] + else: + dtype = c_data[2] + if c_data[2] == np.float64: # Warn if necessary + if data[col].max() >= 2 * 53: + ws = precision_loss_doc % ('uint64', 'float64') + + data[col] = data[col].astype(dtype) + + + # Check values and upcast if necessary if dtype == np.int8: if data[col].max() > 100 or data[col].min() < -127: data[col] = data[col].astype(np.int16) @@ -241,7 +268,7 @@ def _cast_to_stata_types(data): data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: ws = precision_loss_doc % ('int64', 'float64') if ws: diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1a2673342df45..435226bc4313f 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -527,6 +527,29 @@ def test_write_missing_strings(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), expected) + def test_bool_uint(self): + s0 = Series([0, 1, True], dtype=np.bool) + s1 = Series([0, 1, 100], dtype=np.uint8) + s2 = Series([0, 1, 255], dtype=np.uint8) + s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16) + s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16) + s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32) + s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32) 
+ + original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3, + 's4': s4, 's5': s5, 's6': s6}) + original.index.name = 'index' + expected = original.copy() + expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32, + np.int32, np.float64) + for c, t in zip(expected.columns, expected_types): + expected[c] = expected[c].astype(t) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index('index') + tm.assert_frame_equal(written_and_read_again, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],