
Commit d0fe113

Drop chardet (#1269)
* Internal refactoring to swap auth/redirects ordering
* Drop chardet for charset detection
* Drop chardet in favour of simpler charset autodetection
* Revert unintentionally included changes
* Update test case
* Refactor to prefer different decoding style
* Update text decoding docs/docstrings
* Resolve typo
* Update docs/quickstart.md

Co-authored-by: Florimond Manca <[email protected]>
1 parent 2d6c30d commit d0fe113

File tree: 7 files changed, +128 -112 lines changed


docs/quickstart.md

Lines changed: 15 additions & 2 deletions
@@ -65,14 +65,27 @@ HTTPX will automatically handle decoding the response content into Unicode text.
 '<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
 ```

-You can inspect what encoding has been used to decode the response.
+You can inspect what encoding will be used to decode the response.

 ```pycon
 >>> r.encoding
 'UTF-8'
 ```

-If you need to override the standard behavior and explicitly set the encoding to
+In some cases the response may not contain an explicit encoding, in which case HTTPX
+will attempt to automatically determine an encoding to use. This defaults to
+UTF-8, but also includes robust fallback behaviour for handling ascii,
+iso-8859-1 and windows 1252 encodings.
+
+```pycon
+>>> r.encoding
+None
+>>> r.text
+'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
+```
+
+
+If you need to override the standard behaviour and explicitly set the encoding to
 use, then you can do that too.

 ```pycon
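
A rough illustration of the documented behaviour above, not part of the diff: the snippet below constructs `httpx.Response` objects directly, the same way the test suite in this commit does, to show both the `encoding is None` case and an explicit override. The sample strings are made up for the example.

```python
import httpx

# No charset declared: HTTPX probes UTF-8 first and falls back to Windows 1252.
response = httpx.Response(200, content="Accented: Österreich".encode("iso-8859-1"))
print(response.encoding)  # None -- nothing was specified by a Content-Type header
print(response.text)      # "Accented: Österreich", recovered via the cp1252 fallback

# Explicitly overriding the encoding before reading .text still works.
response = httpx.Response(200, content="おはようございます。".encode("shift-jis"))
response.encoding = "shift-jis"
print(response.text)      # "おはようございます。"
```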

httpx/_decoders.py

Lines changed: 41 additions & 53 deletions
@@ -7,8 +7,6 @@
 import typing
 import zlib

-import chardet
-
 try:
     import brotli
 except ImportError:  # pragma: nocover
@@ -163,62 +161,52 @@ class TextDecoder:
     """

     def __init__(self, encoding: typing.Optional[str] = None):
-        self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
-            None if encoding is None else codecs.getincrementaldecoder(encoding)()
-        )
-        self.detector = chardet.universaldetector.UniversalDetector()
-
-        # This buffer is only needed if 'decoder' is 'None'
-        # we want to trigger errors if data is getting added to
-        # our internal buffer for some silly reason while
-        # a decoder is discovered.
-        self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
+        self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
+        if encoding is not None:
+            self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")

     def decode(self, data: bytes) -> str:
-        try:
-            if self.decoder is not None:
-                text = self.decoder.decode(data)
-            else:
-                assert self.buffer is not None
-                text = ""
-                self.detector.feed(data)
-                self.buffer += data
-
-                # Should be more than enough data to process, we don't
-                # want to buffer too long as chardet will wait until
-                # detector.close() is used to give back common
-                # encodings like 'utf-8'.
-                if len(self.buffer) >= 4096:
-                    self.decoder = codecs.getincrementaldecoder(
-                        self._detector_result()
-                    )()
-                    text = self.decoder.decode(bytes(self.buffer), False)
-                    self.buffer = None
-
-            return text
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        """
+        If an encoding is explicitly specified, then we use that.
+        Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.

-    def flush(self) -> str:
-        try:
-            if self.decoder is None:
-                # Empty string case as chardet is guaranteed to not have a guess.
-                assert self.buffer is not None
-                if len(self.buffer) == 0:
-                    return ""
-                return bytes(self.buffer).decode(self._detector_result())
-
-            return self.decoder.decode(b"", True)
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
+        superset of the non-control characters in iso-8859-1, so we essentially
+        end up supporting any of ascii, utf-8, iso-8859-1, cp1252.

-    def _detector_result(self) -> str:
-        self.detector.close()
-        result = self.detector.result["encoding"]
-        if not result:  # pragma: nocover
-            raise ValueError("Unable to determine encoding of content")
+        Given that UTF-8 is now by *far* the most widely used encoding, this
+        should be a pretty robust strategy for cases where a charset has
+        not been explicitly included.

-        return result
+        Useful stats on the prevalence of different charsets in the wild...
+
+        * https://w3techs.com/technologies/overview/character_encoding
+        * https://w3techs.com/technologies/history_overview/character_encoding
+
+        The HTML5 spec also has some useful guidelines, suggesting defaults of
+        either UTF-8 or Windows 1252 in most cases...
+
+        * https://dev.w3.org/html5/spec-LC/Overview.html
+        """
+        if self.decoder is None:
+            # If this is the first decode pass then we need to determine which
+            # encoding to use by attempting UTF-8 and raising any decode errors.
+            attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
+            try:
+                attempt_utf_8.decode(data)
+            except UnicodeDecodeError:
+                # Could not decode as UTF-8. Use Windows 1252.
+                self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
+            else:
+                # Can decode as UTF-8. Use UTF-8 with lenient error settings.
+                self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+
+        return self.decoder.decode(data)
+
+    def flush(self) -> str:
+        if self.decoder is None:
+            return ""
+        return self.decoder.decode(b"", True)


 class LineDecoder:
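
For readers skimming the diff, here is a standalone sketch of the same detection strategy the new `TextDecoder` uses, reduced to a single function over a complete byte string. The real class works incrementally and chooses its codec on the first chunk it sees; the function name below is ours, not part of the commit.

```python
from typing import Optional


def sniff_and_decode(data: bytes, encoding: Optional[str] = None) -> str:
    """Sketch: use an explicit encoding if given, else probe UTF-8, else cp1252."""
    if encoding is not None:
        return data.decode(encoding, errors="strict")
    try:
        # UTF-8 is a strict superset of ASCII, so plain ASCII text lands here too.
        data.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        # Not valid UTF-8: treat it as Windows 1252, which also covers the
        # printable range of ISO-8859-1.
        return data.decode("cp1252", errors="replace")
    return data.decode("utf-8", errors="replace")


print(sniff_and_decode("Snowman: ☃".encode("utf-8")))  # UTF-8 path
print(sniff_and_decode("Euro: €".encode("cp1252")))     # fallback path
```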

httpx/_models.py

Lines changed: 16 additions & 28 deletions
@@ -10,7 +10,6 @@
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, quote, unquote, urlencode

-import chardet
 import rfc3986
 import rfc3986.exceptions

@@ -755,19 +754,22 @@ def text(self) -> str:
         if not content:
             self._text = ""
         else:
-            encoding = self.encoding
-            self._text = content.decode(encoding, errors="replace")
+            decoder = TextDecoder(encoding=self.encoding)
+            self._text = "".join([decoder.decode(self.content), decoder.flush()])
         return self._text

     @property
-    def encoding(self) -> str:
+    def encoding(self) -> typing.Optional[str]:
+        """
+        Return the encoding, which may have been set explicitly, or may have
+        been specified by the Content-Type header.
+        """
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-                if encoding is None or not is_known_encoding(encoding):
-                    encoding = "utf-8"
-            self._encoding = encoding
+                self._encoding = None
+            else:
+                self._encoding = encoding
         return self._encoding

     @encoding.setter
@@ -783,25 +785,11 @@ def charset_encoding(self) -> typing.Optional[str]:
         if content_type is None:
             return None

-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
-
-        return None
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None

-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]
+        return params["charset"].strip("'\"")

     def _get_content_decoder(self) -> ContentDecoder:
         """
@@ -936,7 +924,7 @@ def iter_text(self) -> typing.Iterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             for chunk in self.iter_bytes():
                 yield decoder.decode(chunk)
@@ -1020,7 +1008,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             async for chunk in self.aiter_bytes():
                 yield decoder.decode(chunk)
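
The slimmed-down `charset_encoding` property above now only reports what the Content-Type header actually declares. Here is a quick sketch of that parsing, using the same `cgi.parse_header` call as the diff; the helper name and sample headers are ours, not part of the commit. (`cgi.parse_header` is what the code used at the time; the `cgi` module has since been deprecated in newer Python releases.)

```python
import cgi
from typing import Optional


def charset_from_content_type(content_type: str) -> Optional[str]:
    # Mirrors the revised property: return the declared charset, or None.
    _, params = cgi.parse_header(content_type)
    if "charset" not in params:
        return None
    return params["charset"].strip("'\"")


print(charset_from_content_type("text/html; charset=UTF-8"))     # UTF-8
print(charset_from_content_type("text/plain"))                    # None
print(charset_from_content_type("text/html; charset='latin-1'"))  # latin-1
```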

setup.py

Lines changed: 0 additions & 1 deletion
@@ -57,7 +57,6 @@ def get_packages(package):
     install_requires=[
         "certifi",
         "sniffio",
-        "chardet==3.*",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore==0.10.*",
     ],

tests/client/test_client.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def test_get(server):
     assert response.content == b"Hello, world!"
     assert response.text == "Hello, world!"
     assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.request.url == url
     assert response.headers
     assert response.is_redirect is False

tests/models/test_responses.py

Lines changed: 53 additions & 17 deletions
@@ -81,36 +81,36 @@ def test_response_content_type_encoding():

 def test_response_autodetect_encoding():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
     """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


 def test_response_fallback_to_autodetect():
     """
     Fallback to autodetection if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
     """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
     """
     content = b"Hello, world!"
     headers = {"Content-Type": "text/plain"}
@@ -120,20 +120,56 @@ def test_response_default_text_encoding():
         headers=headers,
     )
     assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.text == "Hello, world!"


-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
     """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
     """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
-        content=b"",
+        content=content,
+        headers=headers,
     )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None


 def test_response_non_text_encoding():
@@ -147,7 +183,7 @@ def test_response_non_text_encoding():
         headers=headers,
     )
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding is None


 def test_response_set_explicit_encoding():
@@ -184,7 +220,7 @@ def test_read():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = response.read()
@@ -203,7 +239,7 @@ async def test_aread():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = await response.aread()

tests/test_decoders.py

Lines changed: 2 additions & 10 deletions
@@ -177,16 +177,8 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
+        ((b"Euro character: \x88!", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
     ],
 )
 @pytest.mark.asyncio
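
The parametrized cases above feed the decoder chunk-by-chunk with no explicit encoding. A rough sketch of how such a round-trip check could look; the test name and body are ours, not the repository's, and `TextDecoder` is imported from a private module purely for illustration.

```python
import pytest

from httpx._decoders import TextDecoder  # private module, used only for illustration


@pytest.mark.parametrize(
    "data, encoding",
    [
        ((b"Hello,", b" world!"), "ascii"),
        ((b"Euro character: \x88!", b""), "cp1252"),
        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
    ],
)
def test_autodetected_round_trip(data, encoding):
    # No explicit encoding: the decoder probes UTF-8 on the first chunk,
    # falling back to cp1252 (a superset of printable ISO-8859-1) on failure.
    decoder = TextDecoder()
    text = "".join(decoder.decode(chunk) for chunk in data) + decoder.flush()
    assert text == b"".join(data).decode(encoding)
```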
