Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove chardet/charset-normalizer. #7589

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES/7561.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Replace automatic character set detection with a `fallback_charset_resolver` parameter
in `ClientSession` to allow user-supplied character set detection functions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Jesus Cea
Jian Zeng
Jinkyu Yi
Joel Watts
John Parton
Jon Nabozny
Jonas Krüger Svensson
Jonas Obrist
Expand Down
26 changes: 26 additions & 0 deletions aiohttp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
from .tracing import Trace, TraceConfig
from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]

__all__ = (
# client_exceptions
"ClientConnectionError",
Expand Down Expand Up @@ -159,6 +164,22 @@ class ClientTimeout:
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)

_RetType = TypeVar("_RetType")
_CharsetResolver = Callable[[ClientResponse, bytes], str]


def _default_fallback_charset_resolver(response: ClientResponse, body: bytes) -> str:

ret: str = chardet.detect(body)["encoding"] or "utf-8"

if ret != "utf-8":
warnings.warn(
"Automatic charset detection will be removed in 3.9, see: "
"https://docs.aiohttp.org/en/stable/client_advanced.html#character-set-detection", # noqa: E501
DeprecationWarning,
stacklevel=3,
)

return ret


class ClientSession:
Expand Down Expand Up @@ -220,6 +241,9 @@ def __init__(
requote_redirect_url: bool = True,
trace_configs: Optional[List[TraceConfig]] = None,
read_bufsize: int = 2**16,
fallback_charset_resolver: _CharsetResolver = (
_default_fallback_charset_resolver
),
) -> None:
if loop is None:
if connector is not None:
Expand Down Expand Up @@ -313,6 +337,8 @@ def __init__(
for trace_config in self._trace_configs:
trace_config.freeze()

self._resolve_charset = fallback_charset_resolver

def __init_subclass__(cls: Type["ClientSession"]) -> None:
warnings.warn(
"Inheritance class {} from ClientSession "
Expand Down
55 changes: 28 additions & 27 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import codecs
import contextlib
import functools
import io
import re
Expand All @@ -12,6 +13,7 @@
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -66,11 +68,6 @@
ssl = None # type: ignore[assignment]
SSLContext = object # type: ignore[misc,assignment]

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]


__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")

Expand Down Expand Up @@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
_raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers

_connection = None # current connection
_source_traceback = None
# setted up by ClientRequest after ClientResponse object creation
_source_traceback: Optional[traceback.StackSummary] = None
# set up by ClientRequest after ClientResponse object creation
# post-init stage allows to not change ctor signature
_closed = True # to allow __del__ for non-initialized properly response
_released = False
Expand Down Expand Up @@ -760,6 +757,15 @@ def __init__(
self._loop = loop
# store a reference to session #1985
self._session: Optional[ClientSession] = session
# Save reference to _resolve_charset, so that get_encoding() will still
# work after the response has finished reading the body.
if session is None:
# TODO: Fix session=None in tests (see ClientRequest.__init__).
self._resolve_charset: Callable[
["ClientResponse", bytes], str
] = lambda *_: "utf-8"
else:
self._resolve_charset = session._resolve_charset
if loop.get_debug():
self._source_traceback = traceback.extract_stack(sys._getframe(1))

Expand Down Expand Up @@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:

encoding = mimetype.parameters.get("charset")
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
encoding = "utf-8"
elif self._body is None:
raise RuntimeError(
"Cannot guess the encoding of " "a not yet read body"
)
else:
encoding = chardet.detect(self._body)["encoding"]
if not encoding:
encoding = "utf-8"
with contextlib.suppress(LookupError):
return codecs.lookup(encoding).name

if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
return "utf-8"

if self._body is None:
raise RuntimeError(
"Cannot compute fallback encoding of a not yet read body"
)

return encoding
return self._resolve_charset(self, self._body)

async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
"""Read response payload and decode."""
Expand Down
30 changes: 30 additions & 0 deletions docs/client_advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,33 @@ are changed so that aiohttp itself can wait on the underlying
connection to close. Please follow issue `#1925
<https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
on this.


Character Set Detection
-----------------------

If you encounter an 'Automatic charset detection will be removed' warning
when using :meth:`ClientResponse.text()` this may be because the response
does not include the charset needed to decode the body.

If you know the correct encoding for a request, you can simply specify
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).

Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
can be used to reintroduce charset guessing functionality. When a charset is not found
in the Content-Type header, this function will be called to get the charset encoding. For
example, this can be used with the ``chardetng_py`` library.::

from chardetng_py import detect

def charset_resolver(resp: ClientResponse, body: bytes) -> str:
tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
return detect(body, allow_utf8=True, tld=tld)

ClientSession(fallback_charset_resolver=charset_resolver)

Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::

from charset_normalizer import detect

ClientSession(fallback_charset_resolver=lamba r, b: detect(b)["encoding"] or "utf-8")
51 changes: 23 additions & 28 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
read_bufsize=2**16, \
requote_redirect_url=False, \
trust_env=False, \
trace_configs=None)
trace_configs=None, \
fallback_charset_resolver=_chardet_resolver)

The class for creating client sessions and making requests.

Expand Down Expand Up @@ -200,6 +201,18 @@ The client session supports the context manager protocol for self closing.
disabling. See :ref:`aiohttp-client-tracing-reference` for
more information.

:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
A :term:`callable` that accepts a :class:`ClientResponse` and the
:class:`bytes` contents, and returns a :class:`str` which will be used as
the encoding parameter to :meth:`bytes.decode()`.

This function will be called when the charset is not known (e.g. not specified in the
Content-Type header). The default function in 3.8.6 calls ``chardetng``
or ``charset-normaliser``. In 3.9+ this be replaced with a function that
simply defaults to ``utf-8``.

.. versionadded:: 3.8.6

.. attribute:: closed

``True`` if the session has been closed, ``False`` otherwise.
Expand Down Expand Up @@ -1400,12 +1413,8 @@ Response object
Read response's body and return decoded :class:`str` using
specified *encoding* parameter.

If *encoding* is ``None`` content encoding is autocalculated
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
header is not provided by server.

:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
*cchardet* is not available.
If *encoding* is ``None`` content encoding is determined from the
Content-Type header, or using the ``fallback_charset_resolver`` function.

Close underlying connection if data reading gets an error,
release connection otherwise.
Expand All @@ -1414,10 +1423,7 @@ Response object
``None`` for encoding autodetection
(default).

:return str: decoded *BODY*

:raise LookupError: if the encoding detected by cchardet is
unknown by Python (e.g. VISCII).

.. note::

Expand All @@ -1430,18 +1436,15 @@ Response object

await resp.text('ISO-8859-1')

.. comethod:: json(*, encoding=None, loads=json.loads, \
.. method:: json(*, encoding=None, loads=json.loads, \
content_type='application/json')
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved
:async:

Read response's body as *JSON*, return :class:`dict` using
specified *encoding* and *loader*. If data is not still available
a ``read`` call will be done,
a ``read`` call will be done.

If *encoding* is ``None`` content encoding is autocalculated
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
*cchardet* is not available.

if response's `content-type` does not match `content_type` parameter
If response's `content-type` does not match `content_type` parameter
:exc:`aiohttp.ContentTypeError` get raised.
To disable content type check pass ``None`` value.

Expand Down Expand Up @@ -1473,17 +1476,9 @@ Response object

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info is not exists or there
are no appropriate codecs for encoding then :term:`cchardet` /
:term:`charset-normalizer` is used.

Beware that it is not always safe to use the result of this function to
decode a response. Some encodings detected by cchardet are not known by
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.

:raise RuntimeError: if called before the body has been read,
for :term:`cchardet` usage
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
If no charset is present or the charset is not understood by Python, the
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.

.. versionadded:: 3.0

Expand Down
8 changes: 0 additions & 8 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,6 @@ Dependencies
- *charset-normalizer*
- *multidict*
- *yarl*
- *Optional* :term:`cchardet` as faster replacement for
:term:`charset-normalizer`.

Install it explicitly via:

.. code-block:: bash

$ pip install cchardet

- *Optional* :term:`aiodns` for fast DNS resolving. The
library is highly recommended.
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ filterwarnings =
# can be dropped with the next release of `certify`, specifically
# `certify > 2022.06.15`.
ignore:path is deprecated. Use files.. instead. Refer to https.//importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.:DeprecationWarning:certifi.core
ignore:Automatic charset detection will be removed in 3.9:DeprecationWarning
junit_suite_name = aiohttp_test_suite
norecursedirs = dist docs build .tox .eggs
minversion = 3.8.2
Expand Down
Loading
Loading