Skip to content

Commit d702de0

Browse files
committed
Merge pull request #4783 from jtratner/wrap-compressed-fileobjects-in-py3
BUG: Fix input bytes conversion in Py3 to return str
2 parents 8b69209 + a623e3e commit d702de0

File tree

5 files changed

+80
-24
lines changed

5 files changed

+80
-24
lines changed

doc/source/release.rst

+3
Original file line number | Diff line number | Diff line change
@@ -393,6 +393,9 @@ Bug Fixes
393393
- Fixed bug with reading compressed files with ``read_fwf`` in Python 3.
394394
(:issue:`3963`)
395395
- Fixed an issue with a duplicate index and assignment with a dtype change (:issue:`4686`)
396+
- Fixed bug where compressed files were read in as ``bytes`` rather than ``str``
397+
in Python 3. Simplifies bytes-producing file-handling in Python 3
398+
(:issue:`3963`, :issue:`4785`).
396399

397400
pandas 0.12.0
398401
-------------

pandas/compat/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
import types
3737

3838
PY3 = (sys.version_info[0] >= 3)
39+
PY3_2 = sys.version_info[:2] == (3, 2)
3940

4041
try:
4142
import __builtin__ as builtins

pandas/core/common.py

+27-11
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import re
66
import codecs
77
import csv
8+
import sys
89

910
from numpy.lib.format import read_array, write_array
1011
import numpy as np
@@ -1858,27 +1859,42 @@ def next(self):
18581859

18591860

18601861
def _get_handle(path, mode, encoding=None, compression=None):
1862+
"""Gets file handle for given path and mode.
1863+
NOTE: Under Python 3.2, getting a compressed file handle means reading in the entire file,
1864+
decompressing it and decoding it to ``str`` all at once and then wrapping it in a StringIO.
1865+
"""
18611866
if compression is not None:
1862-
if encoding is not None:
1863-
raise ValueError('encoding + compression not yet supported')
1867+
if encoding is not None and not compat.PY3:
1868+
msg = 'encoding + compression not yet supported in Python 2'
1869+
raise ValueError(msg)
18641870

18651871
if compression == 'gzip':
18661872
import gzip
1867-
return gzip.GzipFile(path, 'rb')
1873+
f = gzip.GzipFile(path, 'rb')
18681874
elif compression == 'bz2':
18691875
import bz2
1870-
return bz2.BZ2File(path, 'rb')
1876+
1877+
f = bz2.BZ2File(path, 'rb')
18711878
else:
18721879
raise ValueError('Unrecognized compression type: %s' %
18731880
compression)
1874-
1875-
if compat.PY3: # pragma: no cover
1876-
if encoding:
1877-
f = open(path, mode, encoding=encoding)
1878-
else:
1879-
f = open(path, mode, errors='replace')
1881+
if compat.PY3_2:
1882+
# gzip and bz2 don't work with TextIOWrapper in 3.2
1883+
encoding = encoding or get_option('display.encoding')
1884+
f = StringIO(f.read().decode(encoding))
1885+
elif compat.PY3:
1886+
from io import TextIOWrapper
1887+
f = TextIOWrapper(f, encoding=encoding)
1888+
return f
18801889
else:
1881-
f = open(path, mode)
1890+
if compat.PY3:
1891+
if encoding:
1892+
f = open(path, mode, encoding=encoding)
1893+
else:
1894+
f = open(path, mode, errors='replace')
1895+
else:
1896+
f = open(path, mode)
1897+
18821898
return f
18831899

18841900
if compat.PY3: # pragma: no cover

pandas/io/parsers.py

+34-11
Original file line number | Diff line number | Diff line change
@@ -1175,13 +1175,36 @@ def count_empty_vals(vals):
11751175
return sum([1 for v in vals if v == '' or v is None])
11761176

11771177

1178-
def _wrap_compressed(f, compression):
1178+
def _wrap_compressed(f, compression, encoding=None):
1179+
"""wraps compressed fileobject in a decompressing fileobject
1180+
NOTE: For all files in Python 3.2 and for bzip'd files under all Python
1181+
versions, this means reading in the entire file and then re-wrapping it in
1182+
StringIO.
1183+
"""
11791184
compression = compression.lower()
1185+
encoding = encoding or get_option('display.encoding')
11801186
if compression == 'gzip':
11811187
import gzip
1182-
return gzip.GzipFile(fileobj=f)
1188+
1189+
f = gzip.GzipFile(fileobj=f)
1190+
if compat.PY3_2:
1191+
# 3.2's gzip doesn't support read1
1192+
f = StringIO(f.read().decode(encoding))
1193+
elif compat.PY3:
1194+
from io import TextIOWrapper
1195+
1196+
f = TextIOWrapper(f)
1197+
return f
11831198
elif compression == 'bz2':
1184-
raise ValueError('Python cannot read bz2 data from file handle')
1199+
import bz2
1200+
1201+
# bz2 module can't take file objects, so have to run through decompress
1202+
# manually
1203+
data = bz2.decompress(f.read())
1204+
if compat.PY3:
1205+
data = data.decode(encoding)
1206+
f = StringIO(data)
1207+
return f
11851208
else:
11861209
raise ValueError('do not recognize compression method %s'
11871210
% compression)
@@ -1235,7 +1258,12 @@ def __init__(self, f, **kwds):
12351258
f = com._get_handle(f, 'r', encoding=self.encoding,
12361259
compression=self.compression)
12371260
elif self.compression:
1238-
f = _wrap_compressed(f, self.compression)
1261+
f = _wrap_compressed(f, self.compression, self.encoding)
1262+
# in Python 3, convert BytesIO or fileobjects passed with an encoding
1263+
elif compat.PY3 and isinstance(f, compat.BytesIO):
1264+
from io import TextIOWrapper
1265+
1266+
f = TextIOWrapper(f, encoding=self.encoding)
12391267

12401268
if hasattr(f, 'readline'):
12411269
self._make_reader(f)
@@ -1321,14 +1349,9 @@ class MyDialect(csv.Dialect):
13211349
def _read():
13221350
line = next(f)
13231351
pat = re.compile(sep)
1324-
if (compat.PY3 and isinstance(line, bytes)):
1325-
yield pat.split(line.decode('utf-8').strip())
1326-
for line in f:
1327-
yield pat.split(line.decode('utf-8').strip())
1328-
else:
1352+
yield pat.split(line.strip())
1353+
for line in f:
13291354
yield pat.split(line.strip())
1330-
for line in f:
1331-
yield pat.split(line.strip())
13321355
reader = _read()
13331356

13341357
self.data = reader

pandas/io/tests/test_parsers.py

+15-2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf-8 -*-
12
# pylint: disable=E1101
23

34
from datetime import datetime
@@ -2043,8 +2044,8 @@ def test_fwf_compression(self):
20432044
expected = read_fwf(StringIO(data), widths=widths, names=names)
20442045
if compat.PY3:
20452046
data = bytes(data, encoding='utf-8')
2046-
for comp_name, compresser in [('gzip', gzip.GzipFile),
2047-
('bz2', bz2.BZ2File)]:
2047+
comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
2048+
for comp_name, compresser in comps:
20482049
with tm.ensure_clean() as path:
20492050
tmp = compresser(path, mode='wb')
20502051
tmp.write(data)
@@ -2053,6 +2054,18 @@ def test_fwf_compression(self):
20532054
compression=comp_name)
20542055
tm.assert_frame_equal(result, expected)
20552056

2057+
def test_BytesIO_input(self):
2058+
if not compat.PY3:
2059+
raise nose.SkipTest("Bytes-related test - only needs to work on Python 3")
2060+
result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2,2])
2061+
expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"])
2062+
tm.assert_frame_equal(result, expected)
2063+
data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
2064+
result = pd.read_table(data, sep="::", engine='python',
2065+
encoding='cp1255')
2066+
expected = pd.DataFrame([[562, 123]], columns=["שלום","1234"])
2067+
tm.assert_frame_equal(result, expected)
2068+
20562069
def test_verbose_import(self):
20572070
text = """a,b,c,d
20582071
one,1,2,3

0 commit comments

Comments
 (0)