pandas-dev · bashtage · Jul 10, 2014 · jreback · Jul 15, 2014
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3504,6 +3504,31 @@ into a .dta file. The format version of this file is always 115 (Stata 12).
    df = DataFrame(randn(10, 2), columns=list('AB'))
    df.to_stata('stata.dta')
 
+*Stata* data files have limited data type support; only strings with 244 or
+fewer characters, ``int8``, ``int16``, ``int32`` and ``float64`` can be stored
+in ``.dta`` files.  Additionally, *Stata* reserves certain values to represent
+missing data, and when a value is encountered outside of the
+permitted range, the data type is upcast to the next larger size.  For
+example, ``int8`` values are restricted to lie between -127 and 100, and so
+variables with values above 100 will trigger a conversion to ``int16``. ``nan``
+values in floating point data types are stored as the basic missing data type
+(``.`` in *Stata*).  It is not possible to indicate missing data values for
+integer data types.
+
+The *Stata* writer gracefully handles other data types including ``int64``,
+``bool``, ``uint8``, ``uint16``, ``uint32`` and ``float32`` by upcasting to
+the smallest supported type that can represent the data.  For example, data
+with a type of ``uint8`` will be cast to ``int8`` if no value is greater than
+100 (the upper bound for non-missing ``int8`` data in *Stata*), or, if values are
+outside of this range, the data is cast to ``int16``.
+
+
+.. warning::
+
+   Conversion from ``int64`` to ``float64`` may result in a loss of precision
+   if ``int64`` values are larger than 2**53.
+
+
 .. _io.stata_reader:
 
 Reading from STATA format

diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -114,7 +114,7 @@ Known Issues
 
 Enhancements
 ~~~~~~~~~~~~
-
+- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
 
 
 

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -206,6 +206,7 @@ class InvalidColumnName(Warning):
 underscores, no Stata reserved words)
 """
 
+
 def _cast_to_stata_types(data):
     """Checks the dtypes of the columns of a pandas DataFrame for
     compatibility with the data types and ranges supported by Stata, and
@@ -218,18 +219,44 @@ def _cast_to_stata_types(data):
 
     Notes
     -----
-    Numeric columns must be one of int8, int16, int32, float32 or float64, with
-    some additional value restrictions on the integer data types.  int8 and
-    int16 columns are checked for violations of the value restrictions and
+    Numeric columns in Stata must be one of int8, int16, int32, float32 or
+    float64, with some additional value restrictions.  int8 and int16 columns
+    are checked for violations of the value restrictions and
     upcast if needed.  int64 data is not usable in Stata, and so it is
     downcast to int32 whenever the value are in the int32 range, and
     sidecast to float64 when larger than this range.  If the int64 values
     are outside of the range of those perfectly representable as float64 values,
     a warning is raised.
+
+    bool columns are cast to int8.  uint columns are converted to int of the
+    same size if there is no loss in precision, otherwise are upcast to a
+    larger type.  uint64 is currently not supported since it is converted to
+    object in a DataFrame.
     """
     ws = ''
+    #                  original, if small, if large
+    conversion_data = ((np.bool, np.int8, np.int8),
+                       (np.uint8, np.int8, np.int16),
+                       (np.uint16, np.int16, np.int32),
+                       (np.uint32, np.int32, np.int64))
+
     for col in data:
         dtype = data[col].dtype
+        # Cast from unsupported types to supported types
+        for c_data in conversion_data:
+            if dtype == c_data[0]:
+                if data[col].max() <= np.iinfo(c_data[1]).max:
+                    dtype = c_data[1]
+                else:
+                    dtype = c_data[2]
+                if c_data[2] == np.float64:  # Warn if necessary
+                        if data[col].max() >= 2 * 53:
+                            ws = precision_loss_doc % ('uint64', 'float64')
+
+                data[col] = data[col].astype(dtype)
+
+
+        # Check values and upcast if necessary
         if dtype == np.int8:
             if data[col].max() > 100 or data[col].min() < -127:
                 data[col] = data[col].astype(np.int16)
@@ -241,7 +268,7 @@ def _cast_to_stata_types(data):
                 data[col] = data[col].astype(np.int32)
             else:
                 data[col] = data[col].astype(np.float64)
-                if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53:
+                if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
                     ws = precision_loss_doc % ('int64', 'float64')
 
     if ws:

diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -527,6 +527,29 @@ def test_write_missing_strings(self):
             tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                   expected)
 
+    def test_bool_uint(self):
+        s0 = Series([0, 1, True], dtype=np.bool)
+        s1 = Series([0, 1, 100], dtype=np.uint8)
+        s2 = Series([0, 1, 255], dtype=np.uint8)
+        s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
+        s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
+        s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
+        s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)
+
+        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
+                              's4': s4, 's5': s5, 's6': s6})
+        original.index.name = 'index'
+        expected = original.copy()
+        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
+                          np.int32, np.float64)
+        for c, t in zip(expected.columns, expected_types):
+            expected[c] = expected[c].astype(t)
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path)
+            written_and_read_again = self.read_dta(path)
+            written_and_read_again = written_and_read_again.set_index('index')
+            tm.assert_frame_equal(written_and_read_again, expected)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Original file line number	Diff line number	Diff line change
Expand Up		@@ -114,7 +114,7 @@ Known Issues

		Enhancements
		~~~~~~~~~~~~

		- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)



Expand Down