Commit 388e79e

fix: use different encodings (#248)
Fixes #187

Signed-off-by: Mike Fiedler <[email protected]>
1 parent: d062c87

2 files changed: +210 -3 lines changed


inspector/main.py

Lines changed: 129 additions & 3 deletions
@@ -16,6 +16,132 @@
 from .utilities import pypi_report_form, requests_session


+def _is_likely_text(decoded_str):
+    """Check if decoded string looks like valid text (not corrupted)."""
+    if not decoded_str:
+        return True
+
+    # Too many control characters suggests wrong encoding
+    control_chars = sum(1 for c in decoded_str if ord(c) < 32 and c not in "\t\n\r")
+    return control_chars / len(decoded_str) <= 0.3
+
+
+def _is_likely_misencoded_asian_text(decoded_str, encoding):
+    """
+    Detect when Western encodings decode Asian text as Latin Extended garbage.
+
+    When cp1252/latin-1 decode multi-byte Asian text, they produce strings
+    with many Latin Extended/Supplement characters and few/no spaces.
+    """
+    if encoding not in ("cp1252", "latin-1") or len(decoded_str) <= 3:
+        return False
+
+    # Count Latin Extended-A/B (Ā-ʯ) and Latin-1 Supplement (À-ÿ)
+    high_latin = sum(1 for c in decoded_str if 0x0080 <= ord(c) <= 0x024F)
+    spaces = decoded_str.count(" ")
+
+    # If >50% high Latin chars and <10% spaces, likely misencoded
+    return high_latin / len(decoded_str) > 0.5 and spaces < len(decoded_str) * 0.1
+
+
+def _is_likely_misencoded_cross_asian(decoded_str, encoding):
+    """
+    Detect when Asian encodings misinterpret other Asian encodings.
+
+    Patterns:
+    - shift_jis decoding GB2312 produces excessive half-width katakana
+    - Asian encodings decoding Western text produce ASCII+CJK mix (unlikely)
+    """
+    if len(decoded_str) <= 3:
+        return False
+
+    # Pattern 1: Excessive half-width katakana (shift_jis misinterpreting GB2312)
+    # Half-width katakana range: U+FF61-FF9F
+    if encoding == "shift_jis":
+        half_width_katakana = sum(1 for c in decoded_str if 0xFF61 <= ord(c) <= 0xFF9F)
+        # If >30% is half-width katakana, likely wrong encoding
+        # (Real Japanese text uses mostly full-width kana and kanji)
+        if half_width_katakana / len(decoded_str) > 0.3:
+            return True
+
+    # Pattern 2: ASCII mixed with CJK (Asian encoding misinterpreting Western)
+    # CJK Unified Ideographs: U+4E00-U+9FFF
+    if encoding in ("big5", "gbk", "gb2312", "shift_jis", "euc-kr"):
+        ascii_chars = sum(1 for c in decoded_str if ord(c) < 128)
+        cjk_chars = sum(1 for c in decoded_str if 0x4E00 <= ord(c) <= 0x9FFF)
+
+        # If we have ASCII letters and scattered CJK chars, likely misencoded
+        # Real CJK text is mostly CJK with occasional ASCII punctuation
+        if ascii_chars > 0 and cjk_chars > 0:
+            # Check if there are ASCII letters (not just punctuation)
+            ascii_letters = sum(1 for c in decoded_str if c.isalpha() and ord(c) < 128)
+            # If we have ASCII letters AND CJK, and CJK is <50%, likely wrong
+            if ascii_letters >= 2 and cjk_chars / len(decoded_str) < 0.5:
+                return True
+
+    return False
+
+
+def decode_with_fallback(content_bytes):
+    """
+    Decode bytes to string, trying multiple encodings.
+
+    Strategy:
+    1. Try UTF-8 (most common)
+    2. Try common encodings with sanity checks
+    3. Fall back to latin-1 (decodes anything, but may produce garbage)
+
+    Returns decoded string or None if all attempts fail (only if truly binary).
+    """
+    # Try UTF-8 first (most common)
+    try:
+        decoded = content_bytes.decode("utf-8")
+        # Apply same heuristics as other encodings
+        if _is_likely_text(decoded):
+            return decoded
+    except (UnicodeDecodeError, AttributeError):
+        pass
+
+    # Try encodings from most to least restrictive. Even with improved heuristics,
+    # putting GBK/GB2312 early breaks too many other encodings. The order below
+    # maximizes correct detections while minimizing misdetections.
+    common_encodings = [
+        "shift_jis",   # Japanese (restrictive multi-byte)
+        "euc-kr",      # Korean (restrictive multi-byte)
+        "big5",        # Chinese Traditional (restrictive multi-byte)
+        "gbk",         # Chinese Simplified
+        "gb2312",      # Chinese Simplified, older
+        "cp1251",      # Cyrillic
+        "iso-8859-2",  # Central/Eastern European
+        "cp1252",      # Windows Western European (very permissive)
+        "latin-1",     # ISO-8859-1 fallback (never fails)
+    ]
+
+    for encoding in common_encodings:
+        try:
+            decoded = content_bytes.decode(encoding)
+
+            # Skip if decoded text looks corrupted
+            if not _is_likely_text(decoded):
+                continue
+
+            # Skip if Western encoding produced Asian-text-as-garbage pattern
+            if _is_likely_misencoded_asian_text(decoded, encoding):
+                continue
+
+            # Skip if Asian encoding misinterpreted other Asian/Western text
+            if _is_likely_misencoded_cross_asian(decoded, encoding):
+                continue
+
+            return decoded
+
+        except (UnicodeDecodeError, LookupError):
+            continue
+
+    # If we get here, all encodings failed sanity checks (truly binary data)
+    return None
+
+
 def traces_sampler(sampling_context):
     """
     Filter out noisy transactions.
@@ -251,10 +377,10 @@ def file(project_name, version, first, second, rest, distname, filepath):
         )

     if isinstance(contents, bytes):
-        try:
-            contents = contents.decode()
-        except UnicodeDecodeError:
+        decoded_contents = decode_with_fallback(contents)
+        if decoded_contents is None:
             return "Binary files are not supported."
+        contents = decoded_contents

     return render_template(
         "code.html", code=contents, name=file_extension, **common_params

tests/test_main.py

Lines changed: 81 additions & 0 deletions
@@ -1,8 +1,89 @@
 import pretend
+import pytest

 import inspector.main


+@pytest.mark.parametrize(
+    "text,encoding",
+    [
+        # UTF-8 (most common)
+        ("Hello, World!", "utf-8"),
+        # Windows CP1252 with trademark symbol
+        ("Windows™ text", "cp1252"),
+        # Shift_JIS - Japanese
+        ("こんにちは世界", "shift_jis"),
+        # EUC-KR - Korean
+        ("안녕하세요", "euc-kr"),
+        # Big5 - Traditional Chinese
+        ("繁體中文", "big5"),
+        # CP1251 - Russian/Cyrillic
+        ("Привет мир", "cp1251"),
+    ],
+)
+def test_decode_with_fallback_various_encodings(text, encoding):
+    """Test decoding bytes with various text encodings that work correctly.
+
+    These 6 encodings decode correctly with the current ordering and heuristics.
+    """
+    content = text.encode(encoding)
+    result = inspector.main.decode_with_fallback(content)
+    assert result == text
+
+
+@pytest.mark.parametrize(
+    "text,encoding,decoded_by",
+    [
+        ("你好世界", "gbk", "big5 or euc-kr"),
+        ("中文测试", "gb2312", "shift_jis (rejected) then euc-kr"),
+        ("Héllo Wörld", "iso-8859-1", "big5 (rejected) then cp1251"),
+        ("Cześć świat", "iso-8859-2", "big5 (rejected) then cp1251"),
+    ],
+)
+def test_decode_with_fallback_misdetected_encodings(text, encoding, decoded_by):
+    """Test encodings that still get misdetected despite improved heuristics.
+
+    These encodings are misdetected by earlier encodings in the `common_encodings` list.
+    Improved heuristics help but can't solve all cases without breaking others.
+
+    Tried cross-Asian heuristics that reject some misdetections (e.g., shift_jis
+    with excessive half-width katakana, Asian encodings with ASCII+CJK mix),
+    but ordering remains a fundamental trade-off:
+    no order works perfectly for all encodings.
+    """
+    content = text.encode(encoding)
+    result = inspector.main.decode_with_fallback(content)
+    # Should decode to something (not None), but won't match original
+    assert result is not None
+    assert isinstance(result, str)
+    assert len(result) > 0
+    # Verify it's actually different (misdetected)
+    assert result != text
+
+
+@pytest.mark.parametrize(
+    "description,binary_data",
+    [
+        (
+            "Random binary with null bytes",
+            bytes([0xFF, 0xFE, 0x00, 0x00, 0x01, 0x02, 0x03]),
+        ),
+        ("Null bytes only", bytes([0x00] * 10)),
+        ("Low control characters", bytes([0x01, 0x02, 0x03, 0x04, 0x05])),
+        ("JPEG header", bytes([0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10])),
+    ],
+)
+def test_decode_with_fallback_binary(description, binary_data):
+    """Test that binary data with many control characters returns None.
+
+    Binary data should be rejected by our heuristics even though some
+    encodings (like UTF-8 for ASCII control chars, or cp1251 for high bytes)
+    can technically decode them.
+    """
+    result = inspector.main.decode_with_fallback(binary_data)
+    assert result is None
+
+
 def test_versions(monkeypatch):
     stub_json = {"releases": {"0.5.1e": None}}
     stub_response = pretend.stub(
