Merge pull request #4783 from jtratner/wrap-compressed-fileobjects-in-py3

jtratner · jtratner · commit d702de0930f1 · 2013-09-13T18:59:59.000-07:00
BUG: Fix input bytes conversion in Py3 to return str
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -393,6 +393,9 @@ Bug Fixes
   - Fixed bug with reading compressed files with ``read_fwf`` in Python 3.
     (:issue:`3963`)
   - Fixed an issue with a duplicate index and assignment with a dtype change (:issue:`4686`)
+  - Fixed bug where compressed files were read in as ``bytes`` rather than
+    ``str`` in Python 3. Also simplifies bytes-producing file handling in
+    Python 3 (:issue:`3963`, :issue:`4785`).
 
 pandas 0.12.0
 -------------
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -36,6 +36,7 @@
 import types
 
 PY3 = (sys.version_info[0] >= 3)
+PY3_2 = sys.version_info[:2] == (3, 2)
 
 try:
     import __builtin__ as builtins
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -5,6 +5,7 @@
 import re
 import codecs
 import csv
+import sys
 
 from numpy.lib.format import read_array, write_array
 import numpy as np
@@ -1858,27 +1859,42 @@ def next(self):
 
 
 def _get_handle(path, mode, encoding=None, compression=None):
+    """Gets file handle for given path and mode.
+    NOTE: Under Python 3.2, getting a compressed file handle means reading in the entire file,
+    decompressing it and decoding it to ``str`` all at once and then wrapping it in a StringIO.
+    """
     if compression is not None:
-        if encoding is not None:
-            raise ValueError('encoding + compression not yet supported')
+        if encoding is not None and not compat.PY3:
+            msg = 'encoding + compression not yet supported in Python 2'
+            raise ValueError(msg)
 
         if compression == 'gzip':
             import gzip
-            return gzip.GzipFile(path, 'rb')
+            f = gzip.GzipFile(path, 'rb')
         elif compression == 'bz2':
             import bz2
-            return bz2.BZ2File(path, 'rb')
+
+            f = bz2.BZ2File(path, 'rb')
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
-
-    if compat.PY3:  # pragma: no cover
-        if encoding:
-            f = open(path, mode, encoding=encoding)
-        else:
-            f = open(path, mode, errors='replace')
+        if compat.PY3_2:
+            # gzip and bz2 don't work with TextIOWrapper in 3.2
+            encoding = encoding or get_option('display.encoding')
+            f = StringIO(f.read().decode(encoding))
+        elif compat.PY3:
+            from io import TextIOWrapper
+            f = TextIOWrapper(f, encoding=encoding)
+        return f
     else:
-        f = open(path, mode)
+        if compat.PY3:
+            if encoding:
+                f = open(path, mode, encoding=encoding)
+            else:
+                f = open(path, mode, errors='replace')
+        else:
+            f = open(path, mode)
+
     return f
 
 if compat.PY3:  # pragma: no cover
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1175,13 +1175,36 @@ def count_empty_vals(vals):
     return sum([1 for v in vals if v == '' or v is None])
 
 
-def _wrap_compressed(f, compression):
+def _wrap_compressed(f, compression, encoding=None):
+    """Wrap a compressed file object so that reads return decompressed data.
+    ``encoding`` (default: the 'display.encoding' option) decodes bytes on Py3.
+    NOTE: for every file on Python 3.2, and for bz2 files on all versions, the
+    whole file is read and decompressed at once, then wrapped in a StringIO.
+    """
     compression = compression.lower()
+    encoding = encoding or get_option('display.encoding')
     if compression == 'gzip':
         import gzip
-        return gzip.GzipFile(fileobj=f)
+
+        f = gzip.GzipFile(fileobj=f)
+        if compat.PY3_2:
+            # 3.2's gzip doesn't support read1
+            f = StringIO(f.read().decode(encoding))
+        elif compat.PY3:
+            from io import TextIOWrapper
+
+            f = TextIOWrapper(f, encoding=encoding)
+        return f
     elif compression == 'bz2':
-        raise ValueError('Python cannot read bz2 data from file handle')
+        import bz2
+
+        # bz2 module can't take file objects, so have to run through decompress
+        # manually
+        data = bz2.decompress(f.read())
+        if compat.PY3:
+            data = data.decode(encoding)
+        f = StringIO(data)
+        return f
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
@@ -1235,7 +1258,12 @@ def __init__(self, f, **kwds):
             f = com._get_handle(f, 'r', encoding=self.encoding,
                                 compression=self.compression)
         elif self.compression:
-            f = _wrap_compressed(f, self.compression)
+            f = _wrap_compressed(f, self.compression, self.encoding)
+        # in Python 3, convert BytesIO or fileobjects passed with an encoding
+        elif compat.PY3 and isinstance(f, compat.BytesIO):
+            from io import TextIOWrapper
+
+            f = TextIOWrapper(f, encoding=self.encoding)
 
         if hasattr(f, 'readline'):
             self._make_reader(f)
@@ -1321,14 +1349,9 @@ class MyDialect(csv.Dialect):
             def _read():
                 line = next(f)
                 pat = re.compile(sep)
-                if (compat.PY3 and isinstance(line, bytes)):
-                    yield pat.split(line.decode('utf-8').strip())
-                    for line in f:
-                        yield pat.split(line.decode('utf-8').strip())
-                else:
+                yield pat.split(line.strip())
+                for line in f:
                     yield pat.split(line.strip())
-                    for line in f:
-                        yield pat.split(line.strip())
             reader = _read()
 
         self.data = reader
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # pylint: disable=E1101
 
 from datetime import datetime
@@ -2043,8 +2044,8 @@ def test_fwf_compression(self):
         expected = read_fwf(StringIO(data), widths=widths, names=names)
         if compat.PY3:
             data = bytes(data, encoding='utf-8')
-        for comp_name, compresser in [('gzip', gzip.GzipFile),
-                                      ('bz2', bz2.BZ2File)]:
+        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
+        for comp_name, compresser in comps:
             with tm.ensure_clean() as path:
                 tmp = compresser(path, mode='wb')
                 tmp.write(data)
@@ -2053,6 +2054,18 @@ def test_fwf_compression(self):
                                   compression=comp_name)
                 tm.assert_frame_equal(result, expected)
 
+    def test_BytesIO_input(self):
+        if not compat.PY3:
+            raise nose.SkipTest("Bytes-related test - only needs to work on Python 3")
+        result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2,2])
+        expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"])
+        tm.assert_frame_equal(result, expected)
+        data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
+        result = pd.read_table(data, sep="::", engine='python',
+                               encoding='cp1255')
+        expected = pd.DataFrame([[562, 123]], columns=["שלום","1234"])
+        tm.assert_frame_equal(result, expected)
+
     def test_verbose_import(self):
         text = """a,b,c,d
 one,1,2,3