Skip to content

Commit a006df3

Browse files
committed
fix: fall back to 'charset' of 'content_type' in 'download_as_text'
An explicit 'encoding' argument overrides the fallback. The fallback uses the 'charset' param of 'content_type' rather than 'content_encoding', because 'content_encoding' is not a Unicode -> bytes (text) encoding. Closes #319.
1 parent 547740c commit a006df3

File tree

2 files changed

+120
-59
lines changed

2 files changed

+120
-59
lines changed

google/cloud/storage/blob.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"""
2727

2828
import base64
29+
import cgi
2930
import copy
3031
import hashlib
3132
from io import BytesIO
@@ -1345,7 +1346,7 @@ def download_as_text(
13451346
start=None,
13461347
end=None,
13471348
raw_download=False,
1348-
encoding="utf-8",
1349+
encoding=None,
13491350
if_generation_match=None,
13501351
if_generation_not_match=None,
13511352
if_metageneration_match=None,
@@ -1374,8 +1375,8 @@ def download_as_text(
13741375
13751376
:type encoding: str
13761377
:param encoding: (Optional) The data of the blob will be decoded by
1377-
encoding method. Defaults to UTF-8. Apply only
1378-
if the value of ``blob.content_encoding`` is None.
1378+
encoding method. Defaults to the ``charset`` param
1379+
of :attr:`content_type`, or else to "utf-8".
13791380
13801381
:type if_generation_match: long
13811382
:param if_generation_match: (Optional) Make the operation conditional on whether
@@ -1422,11 +1423,16 @@ def download_as_text(
14221423
timeout=timeout,
14231424
)
14241425

1425-
if self.content_encoding:
1426-
return data.decode(self.content_encoding)
1427-
else:
1426+
if encoding is not None:
14281427
return data.decode(encoding)
14291428

1429+
if self.content_type is not None:
1430+
_, params = cgi.parse_header(self.content_type)
1431+
if "charset" in params:
1432+
return data.decode(params["charset"])
1433+
1434+
return data.decode("utf-8")
1435+
14301436
def _get_content_type(self, content_type, filename=None):
14311437
"""Determine the content type from the current object.
14321438

tests/unit/test_blob.py

Lines changed: 108 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1622,66 +1622,88 @@ def test_download_as_bytes_w_raw(self):
16221622
def test_download_as_byte_w_custom_timeout(self):
16231623
self._download_as_bytes_helper(raw_download=False, timeout=9.58)
16241624

1625-
def _download_as_text_helper(self, raw_download, encoding=None, timeout=None):
1625+
def _download_as_text_helper(
1626+
self,
1627+
raw_download,
1628+
client=None,
1629+
start=None,
1630+
end=None,
1631+
if_generation_match=None,
1632+
if_generation_not_match=None,
1633+
if_metageneration_match=None,
1634+
if_metageneration_not_match=None,
1635+
timeout=None,
1636+
encoding=None,
1637+
charset=None,
1638+
no_charset=False,
1639+
expected_value=u"DEADBEEF",
1640+
payload=None,
1641+
):
1642+
if payload is None:
1643+
if encoding is not None:
1644+
payload = expected_value.encode(encoding)
1645+
elif charset is not None:
1646+
payload = expected_value.encode(charset)
1647+
else:
1648+
payload = expected_value.encode()
1649+
16261650
blob_name = "blob-name"
1627-
client = mock.Mock(spec=["_http"])
1628-
bucket = _Bucket(client)
1629-
media_link = "http://example.com/media/"
1630-
properties = {"mediaLink": media_link}
1631-
if encoding:
1632-
properties["contentEncoding"] = encoding
1651+
bucket = _Bucket()
1652+
1653+
properties = {}
1654+
if charset is not None:
1655+
properties["contentType"] = "text/plain; charset={}".format(charset)
1656+
elif no_charset:
1657+
properties = {"contentType": "text/plain"}
1658+
16331659
blob = self._make_one(blob_name, bucket=bucket, properties=properties)
1634-
blob._do_download = mock.Mock()
1660+
blob.download_as_bytes = mock.Mock(return_value=payload)
16351661

1636-
if timeout is None:
1637-
expected_timeout = self._get_default_timeout()
1638-
fetched = blob.download_as_text(raw_download=raw_download)
1639-
else:
1640-
expected_timeout = timeout
1641-
fetched = blob.download_as_text(raw_download=raw_download, timeout=timeout)
1662+
kwargs = {"raw_download": raw_download}
16421663

1643-
self.assertEqual(fetched, "")
1664+
if client is not None:
1665+
kwargs["client"] = client
16441666

1645-
headers = {"accept-encoding": "gzip"}
1646-
blob._do_download.assert_called_once_with(
1647-
client._http,
1648-
mock.ANY,
1649-
media_link,
1650-
headers,
1651-
None,
1652-
None,
1653-
raw_download,
1654-
timeout=expected_timeout,
1655-
checksum="md5",
1656-
)
1657-
stream = blob._do_download.mock_calls[0].args[1]
1658-
self.assertIsInstance(stream, io.BytesIO)
1667+
if start is not None:
1668+
kwargs["start"] = start
16591669

1660-
def test_download_as_text_w_generation_match(self):
1661-
GENERATION_NUMBER = 6
1662-
MEDIA_LINK = "http://example.com/media/"
1670+
if end is not None:
1671+
kwargs["end"] = end
16631672

1664-
client = mock.Mock(spec=["_http"])
1665-
blob = self._make_one(
1666-
"blob-name", bucket=_Bucket(client), properties={"mediaLink": MEDIA_LINK}
1667-
)
1668-
blob.download_to_file = mock.Mock()
1673+
if encoding is not None:
1674+
kwargs["encoding"] = encoding
16691675

1670-
fetched = blob.download_as_text(if_generation_match=GENERATION_NUMBER)
1671-
self.assertEqual(fetched, "")
1676+
if if_generation_match is not None:
1677+
kwargs["if_generation_match"] = if_generation_match
16721678

1673-
blob.download_to_file.assert_called_once_with(
1674-
mock.ANY,
1675-
client=None,
1676-
start=None,
1677-
end=None,
1678-
raw_download=False,
1679-
if_generation_match=GENERATION_NUMBER,
1680-
if_generation_not_match=None,
1681-
if_metageneration_match=None,
1682-
if_metageneration_not_match=None,
1683-
timeout=self._get_default_timeout(),
1684-
checksum="md5",
1679+
if if_generation_not_match is not None:
1680+
kwargs["if_generation_not_match"] = if_generation_not_match
1681+
1682+
if if_metageneration_match is not None:
1683+
kwargs["if_metageneration_match"] = if_metageneration_match
1684+
1685+
if if_metageneration_not_match is not None:
1686+
kwargs["if_metageneration_not_match"] = if_metageneration_not_match
1687+
1688+
if timeout is None:
1689+
expected_timeout = self._get_default_timeout()
1690+
else:
1691+
kwargs["timeout"] = expected_timeout = timeout
1692+
1693+
fetched = blob.download_as_text(**kwargs)
1694+
1695+
self.assertEqual(fetched, expected_value)
1696+
1697+
blob.download_as_bytes.assert_called_once_with(
1698+
client=client,
1699+
start=start,
1700+
end=end,
1701+
raw_download=raw_download,
1702+
timeout=expected_timeout,
1703+
if_generation_match=if_generation_match,
1704+
if_generation_not_match=if_generation_not_match,
1705+
if_metageneration_match=if_metageneration_match,
1706+
if_metageneration_not_match=if_metageneration_not_match,
16851707
)
16861708

16871709
def test_download_as_text_wo_raw(self):
@@ -1693,8 +1715,41 @@ def test_download_as_text_w_raw(self):
16931715
def test_download_as_text_w_custom_timeout(self):
16941716
self._download_as_text_helper(raw_download=False, timeout=9.58)
16951717

1696-
def test_download_as_text_w_encoding(self):
1697-
self._download_as_text_helper(raw_download=False, encoding="utf-8")
1718+
def test_download_as_text_w_if_generation_match(self):
1719+
self._download_as_text_helper(raw_download=False, if_generation_match=6)
1720+
1721+
def test_download_as_text_w_if_generation_not_match(self):
1722+
self._download_as_text_helper(raw_download=False, if_generation_not_match=6)
1723+
1724+
def test_download_as_text_w_if_metageneration_match(self):
1725+
self._download_as_text_helper(raw_download=False, if_metageneration_match=6)
1726+
1727+
def test_download_as_text_w_if_metageneration_not_match(self):
1728+
self._download_as_text_helper(raw_download=False, if_metageneration_not_match=6)
1729+
1730+
def test_download_as_text_w_non_ascii_w_explicit_encoding(self):
1731+
expected_value = u"\x0AFe"
1732+
encoding = "utf-16"
1733+
charset = "latin1"
1734+
payload = expected_value.encode(encoding)
1735+
self._download_as_text_helper(
1736+
raw_download=False,
1737+
expected_value=expected_value,
1738+
payload=payload,
1739+
encoding=encoding,
1740+
charset=charset,
1741+
)
1742+
1743+
def test_download_as_text_w_non_ascii_wo_explicit_encoding_w_charset(self):
1744+
expected_value = u"\x0AFe"
1745+
charset = "utf-16"
1746+
payload = expected_value.encode(charset)
1747+
self._download_as_text_helper(
1748+
raw_download=False,
1749+
expected_value=expected_value,
1750+
payload=payload,
1751+
charset=charset,
1752+
)
16981753

16991754
@mock.patch("warnings.warn")
17001755
def test_download_as_string(self, mock_warn):

0 commit comments

Comments
 (0)