Skip to content

Commit

Permalink
encodings: decode utf-8 with errors='replace' when confident
Browse files Browse the repository at this point in the history
"Confident" means "metadata of the document explicitly indicates that
the encoding is UTF-8". This prevents feedparser from falling back to
other encodings when there are only tiny errors.
  • Loading branch information
Rongronggg9 committed Dec 26, 2023
1 parent 0af72dc commit dd2d6bf
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 1 deletion.
1 change: 1 addition & 0 deletions CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ bug report!
* `Aaron Swartz <http://www.aaronsw.com/>`_
* `Jakub Wilk <http://jwilk.net/>`_
* `Nestor Rodriguez <https://github.com/n3s7or>`_
* `Rong Zhang <https://github.com/Rongronggg9>`_
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Fixed
-----

* If the metadata of a feed explicitly indicates that the encoding is UTF-8,
decode it with ``errors="replace"`` when strict decoding fails. This prevents
feeds from being decoded with the wrong encoding when they are mostly UTF-8 but
contain a few invalid bytes.
2 changes: 2 additions & 0 deletions feedparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .api import parse
from .datetimes import registerDateHandler
from .exceptions import (
CharacterEncodingErrorsReplace,
CharacterEncodingOverride,
CharacterEncodingUnknown,
FeedparserError,
Expand Down Expand Up @@ -64,6 +65,7 @@
"registerDateHandler",
"FeedParserDict",
"FeedparserError",
"CharacterEncodingErrorsReplace",
"CharacterEncodingOverride",
"CharacterEncodingUnknown",
"NonXMLContentType",
Expand Down
34 changes: 33 additions & 1 deletion feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def lazy_chardet_encoding(data):


from .exceptions import (
CharacterEncodingErrorsReplace,
CharacterEncodingOverride,
CharacterEncodingUnknown,
FeedparserError,
Expand Down Expand Up @@ -218,6 +219,21 @@ def convert_to_utf8(
http_content_type = http_headers.get("content-type") or ""
http_content_type, http_encoding = parse_content_type(http_content_type)

# Some UTF-8 documents contain a few invalid bytes, which makes strict
# decoding fail and fall back to lazy_chardet_encoding or iso-8859-2.
# In such a case, lazy_chardet_encoding may not be able to detect the
# encoding correctly, and iso-8859-2 is clearly a wrong guess.

# Therefore, we use the flag to allow decoding UTF-8 documents with
# errors='replace'.

# Considering the fact that UTF-8 is the most popular encoding,
# the flag can be safely set if any metadata of the document explicitly
# indicates that the encoding is UTF-8.

# 1st pass: adhere to HTTP encoding (Content-Type)
utf_8_confident = http_encoding == "utf-8"

acceptable_content_type = 0
application_content_types = (
"application/xml",
Expand All @@ -232,6 +248,11 @@ def convert_to_utf8(
and http_content_type.endswith("+xml")
):
acceptable_content_type = 1
# 2nd pass: adhere to the declared XML encoding
# (but not in the inconsistent case)
utf_8_confident = utf_8_confident or (
xml_encoding == "utf-8" and not http_encoding
)
rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
elif http_content_type in text_content_types or (
http_content_type.startswith("text/") and http_content_type.endswith("+xml")
Expand Down Expand Up @@ -298,7 +319,18 @@ def convert_to_utf8(
try:
text = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
continue
if proposed_encoding != "utf-8" or not utf_8_confident:
continue
# try utf-8 with errors='replace' if we are confident
try:
text = data.decode("utf-8", errors="replace")
error = CharacterEncodingErrorsReplace(
"document explicitly declared its encoding as utf-8, "
"but has encoding errors, "
"which has been replaced with � (U+FFFD)"
)
except (UnicodeDecodeError, LookupError):
continue

known_encoding = True
if not json:
Expand Down
5 changes: 5 additions & 0 deletions feedparser/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

__all__ = [
"FeedparserError",
"CharacterEncodingErrorsReplace",
"CharacterEncodingOverride",
"CharacterEncodingUnknown",
"NonXMLContentType",
Expand All @@ -39,6 +40,10 @@ class FeedparserError(Exception):
pass


class CharacterEncodingErrorsReplace(FeedparserError):
    """Signal that a UTF-8 document was decoded with ``errors="replace"``.

    Created by the encodings module when a document whose metadata
    explicitly declares UTF-8 fails strict decoding and is decoded again
    with ``errors="replace"``, so that invalid byte sequences become
    U+FFFD instead of triggering a fallback to another encoding.

    NOTE(review): presumably surfaced to callers through the bozo
    mechanism -- confirm against the caller.
    """

    pass


class CharacterEncodingOverride(FeedparserError):
pass

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Header: Content-type: application/rss+xml
Description: Replace errors instead of falling back to other encodings when application/*xml w/ encoding="utf-8" header
Expect: bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
-->

<rss version="2.0">
<channel>
<item>
<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆©©ð’ð’“𝒔 ]]></description>
</item>
</channel>
</rss>
14 changes: 14 additions & 0 deletions tests/encoding/bozo_http_charset_utf-8_errors_replace.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/rss+xml; charset="utf-8"
Description: Replace errors instead of falling back to other encodings when charset="utf-8"
Expect: bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
-->

<rss version="2.0">
<channel>
<item>
<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆©©ð’ð’“𝒔 ]]></description>
</item>
</channel>
</rss>

0 comments on commit dd2d6bf

Please sign in to comment.