
Commit d0fe113

Drop chardet (#1269)
* Internal refactoring to swap auth/redirects ordering
* Drop chardet for charset detection
* Drop chardet in favour of simpler charset autodetection
* Revert unintentionally included changes
* Update test case
* Refactor to prefer different decoding style
* Update text decoding docs/docstrings
* Resolve typo
* Update docs/quickstart.md

Co-authored-by: Florimond Manca <[email protected]>
1 parent 2d6c30d commit d0fe113

File tree: 7 files changed, +128 -112 lines changed


docs/quickstart.md

Lines changed: 15 additions & 2 deletions
@@ -65,14 +65,27 @@ HTTPX will automatically handle decoding the response content into Unicode text.
 '<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
 ```

-You can inspect what encoding has been used to decode the response.
+You can inspect what encoding will be used to decode the response.

 ```pycon
 >>> r.encoding
 'UTF-8'
 ```

-If you need to override the standard behavior and explicitly set the encoding to
+In some cases the response may not contain an explicit encoding, in which case HTTPX
+will attempt to automatically determine an encoding to use. This defaults to
+UTF-8, but also includes robust fallback behaviour for handling ascii,
+iso-8859-1 and windows 1252 encodings.
+
+```pycon
+>>> r.encoding
+None
+>>> r.text
+'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
+```
+
+
+If you need to override the standard behaviour and explicitly set the encoding to
 use, then you can do that too.

 ```pycon
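
A rough illustration of the documented behaviour above, not part of the diff: the snippet below constructs `httpx.Response` objects directly, the same way the test suite in this commit does, to show both the `encoding is None` case and an explicit override. The sample strings are made up for the example.

```python
import httpx

# No charset declared: HTTPX probes UTF-8 first and falls back to Windows 1252.
response = httpx.Response(200, content="Accented: Österreich".encode("iso-8859-1"))
print(response.encoding)  # None -- nothing was specified by a Content-Type header
print(response.text)      # "Accented: Österreich", recovered via the cp1252 fallback

# Explicitly overriding the encoding before reading .text still works.
response = httpx.Response(200, content="おはようございます。".encode("shift-jis"))
response.encoding = "shift-jis"
print(response.text)      # "おはようございます。"
```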

httpx/_decoders.py

Lines changed: 41 additions & 53 deletions
@@ -7,8 +7,6 @@
 import typing
 import zlib

-import chardet
-
 try:
     import brotli
 except ImportError:  # pragma: nocover
@@ -163,62 +161,52 @@ class TextDecoder:
     """

     def __init__(self, encoding: typing.Optional[str] = None):
-        self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
-            None if encoding is None else codecs.getincrementaldecoder(encoding)()
-        )
-        self.detector = chardet.universaldetector.UniversalDetector()
-
-        # This buffer is only needed if 'decoder' is 'None'
-        # we want to trigger errors if data is getting added to
-        # our internal buffer for some silly reason while
-        # a decoder is discovered.
-        self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
+        self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
+        if encoding is not None:
+            self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")

     def decode(self, data: bytes) -> str:
-        try:
-            if self.decoder is not None:
-                text = self.decoder.decode(data)
-            else:
-                assert self.buffer is not None
-                text = ""
-                self.detector.feed(data)
-                self.buffer += data
-
-                # Should be more than enough data to process, we don't
-                # want to buffer too long as chardet will wait until
-                # detector.close() is used to give back common
-                # encodings like 'utf-8'.
-                if len(self.buffer) >= 4096:
-                    self.decoder = codecs.getincrementaldecoder(
-                        self._detector_result()
-                    )()
-                    text = self.decoder.decode(bytes(self.buffer), False)
-                    self.buffer = None
-
-            return text
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        """
+        If an encoding is explicitly specified, then we use that.
+        Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.

-    def flush(self) -> str:
-        try:
-            if self.decoder is None:
-                # Empty string case as chardet is guaranteed to not have a guess.
-                assert self.buffer is not None
-                if len(self.buffer) == 0:
-                    return ""
-                return bytes(self.buffer).decode(self._detector_result())
-
-            return self.decoder.decode(b"", True)
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
+        superset of the non-control characters in iso-8859-1, so we essentially
+        end up supporting any of ascii, utf-8, iso-8859-1, cp1252.

-    def _detector_result(self) -> str:
-        self.detector.close()
-        result = self.detector.result["encoding"]
-        if not result:  # pragma: nocover
-            raise ValueError("Unable to determine encoding of content")
+        Given that UTF-8 is now by *far* the most widely used encoding, this
+        should be a pretty robust strategy for cases where a charset has
+        not been explicitly included.

-        return result
+        Useful stats on the prevalence of different charsets in the wild...
+
+        * https://w3techs.com/technologies/overview/character_encoding
+        * https://w3techs.com/technologies/history_overview/character_encoding
+
+        The HTML5 spec also has some useful guidelines, suggesting defaults of
+        either UTF-8 or Windows 1252 in most cases...
+
+        * https://dev.w3.org/html5/spec-LC/Overview.html
+        """
+        if self.decoder is None:
+            # If this is the first decode pass then we need to determine which
+            # encoding to use by attempting UTF-8 and raising any decode errors.
+            attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
+            try:
+                attempt_utf_8.decode(data)
+            except UnicodeDecodeError:
+                # Could not decode as UTF-8. Use Windows 1252.
+                self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
+            else:
+                # Can decode as UTF-8. Use UTF-8 with lenient error settings.
+                self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+
+        return self.decoder.decode(data)
+
+    def flush(self) -> str:
+        if self.decoder is None:
+            return ""
+        return self.decoder.decode(b"", True)


 class LineDecoder:
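
For readers skimming the diff, here is a standalone sketch of the same detection strategy the new `TextDecoder` uses, reduced to a single function over a complete byte string. The real class works incrementally and chooses its codec on the first chunk it sees; the function name below is ours, not part of the commit.

```python
from typing import Optional


def sniff_and_decode(data: bytes, encoding: Optional[str] = None) -> str:
    """Sketch: use an explicit encoding if given, else probe UTF-8, else cp1252."""
    if encoding is not None:
        return data.decode(encoding, errors="strict")
    try:
        # UTF-8 is a strict superset of ASCII, so plain ASCII text lands here too.
        data.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        # Not valid UTF-8: treat it as Windows 1252, which also covers the
        # printable range of ISO-8859-1.
        return data.decode("cp1252", errors="replace")
    return data.decode("utf-8", errors="replace")


print(sniff_and_decode("Snowman: ☃".encode("utf-8")))  # UTF-8 path
print(sniff_and_decode("Euro: €".encode("cp1252")))     # fallback path
```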

httpx/_models.py

Lines changed: 16 additions & 28 deletions
@@ -10,7 +10,6 @@
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, quote, unquote, urlencode

-import chardet
 import rfc3986
 import rfc3986.exceptions

@@ -755,19 +754,22 @@ def text(self) -> str:
         if not content:
             self._text = ""
         else:
-            encoding = self.encoding
-            self._text = content.decode(encoding, errors="replace")
+            decoder = TextDecoder(encoding=self.encoding)
+            self._text = "".join([decoder.decode(self.content), decoder.flush()])
         return self._text

     @property
-    def encoding(self) -> str:
+    def encoding(self) -> typing.Optional[str]:
+        """
+        Return the encoding, which may have been set explicitly, or may have
+        been specified by the Content-Type header.
+        """
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-                if encoding is None or not is_known_encoding(encoding):
-                    encoding = "utf-8"
-            self._encoding = encoding
+                self._encoding = None
+            else:
+                self._encoding = encoding
         return self._encoding

     @encoding.setter
@@ -783,25 +785,11 @@ def charset_encoding(self) -> typing.Optional[str]:
         if content_type is None:
             return None

-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
-
-        return None
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None

-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]
+        return params["charset"].strip("'\"")

     def _get_content_decoder(self) -> ContentDecoder:
         """
@@ -936,7 +924,7 @@ def iter_text(self) -> typing.Iterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             for chunk in self.iter_bytes():
                 yield decoder.decode(chunk)
@@ -1020,7 +1008,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             async for chunk in self.aiter_bytes():
                 yield decoder.decode(chunk)
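
The slimmed-down `charset_encoding` property above now only reports what the Content-Type header actually declares. Here is a quick sketch of that parsing, using the same `cgi.parse_header` call as the diff; the helper name and sample headers are ours, not part of the commit. (`cgi.parse_header` is what the code used at the time; the `cgi` module has since been deprecated in newer Python releases.)

```python
import cgi
from typing import Optional


def charset_from_content_type(content_type: str) -> Optional[str]:
    # Mirrors the revised property: return the declared charset, or None.
    _, params = cgi.parse_header(content_type)
    if "charset" not in params:
        return None
    return params["charset"].strip("'\"")


print(charset_from_content_type("text/html; charset=UTF-8"))     # UTF-8
print(charset_from_content_type("text/plain"))                    # None
print(charset_from_content_type("text/html; charset='latin-1'"))  # latin-1
```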

setup.py

Lines changed: 0 additions & 1 deletion
@@ -57,7 +57,6 @@ def get_packages(package):
     install_requires=[
         "certifi",
         "sniffio",
-        "chardet==3.*",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore==0.10.*",
     ],

tests/client/test_client.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def test_get(server):
     assert response.content == b"Hello, world!"
     assert response.text == "Hello, world!"
     assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.request.url == url
     assert response.headers
     assert response.is_redirect is False

tests/models/test_responses.py

Lines changed: 53 additions & 17 deletions
@@ -81,36 +81,36 @@ def test_response_content_type_encoding():

 def test_response_autodetect_encoding():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
     """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


 def test_response_fallback_to_autodetect():
     """
     Fallback to autodetection if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
     """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
     """
     content = b"Hello, world!"
     headers = {"Content-Type": "text/plain"}
@@ -120,20 +120,56 @@ def test_response_default_text_encoding():
         headers=headers,
     )
     assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.text == "Hello, world!"


-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
     """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
     """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
-        content=b"",
+        content=content,
+        headers=headers,
     )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None


 def test_response_non_text_encoding():
@@ -147,7 +183,7 @@ def test_response_non_text_encoding():
         headers=headers,
     )
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding is None


 def test_response_set_explicit_encoding():
@@ -184,7 +220,7 @@ def test_read():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = response.read()
@@ -203,7 +239,7 @@ async def test_aread():

     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed

     content = await response.aread()

tests/test_decoders.py

Lines changed: 2 additions & 10 deletions
@@ -177,16 +177,8 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
+        ((b"Euro character: \x88!", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
     ],
 )
 @pytest.mark.asyncio
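
The parametrized cases above feed the decoder chunk-by-chunk with no explicit encoding. A rough sketch of how such a round-trip check could look; the test name and body are ours, not the repository's, and `TextDecoder` is imported from a private module purely for illustration.

```python
import pytest

from httpx._decoders import TextDecoder  # private module, used only for illustration


@pytest.mark.parametrize(
    "data, encoding",
    [
        ((b"Hello,", b" world!"), "ascii"),
        ((b"Euro character: \x88!", b""), "cp1252"),
        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
    ],
)
def test_autodetected_round_trip(data, encoding):
    # No explicit encoding: the decoder probes UTF-8 on the first chunk,
    # falling back to cp1252 (a superset of printable ISO-8859-1) on failure.
    decoder = TextDecoder()
    text = "".join(decoder.decode(chunk) for chunk in data) + decoder.flush()
    assert text == b"".join(data).decode(encoding)
```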
