Commit 388e79e

fix: use different encodings (#248)
Fixes #187

Signed-off-by: Mike Fiedler <[email protected]>
1 parent: d062c87

2 files changed: +210 -3 lines changed


inspector/main.py

Lines changed: 129 additions & 3 deletions
@@ -16,6 +16,132 @@
 from .utilities import pypi_report_form, requests_session


+def _is_likely_text(decoded_str):
+    """Check if decoded string looks like valid text (not corrupted)."""
+    if not decoded_str:
+        return True
+
+    # Too many control characters suggests wrong encoding
+    control_chars = sum(1 for c in decoded_str if ord(c) < 32 and c not in "\t\n\r")
+    return control_chars / len(decoded_str) <= 0.3
+
+
+def _is_likely_misencoded_asian_text(decoded_str, encoding):
+    """
+    Detect when Western encodings decode Asian text as Latin Extended garbage.
+
+    When cp1252/latin-1 decode multi-byte Asian text, they produce strings
+    with many Latin Extended/Supplement characters and few/no spaces.
+    """
+    if encoding not in ("cp1252", "latin-1") or len(decoded_str) <= 3:
+        return False
+
+    # Count Latin Extended-A/B (Ā-ʯ) and Latin-1 Supplement (À-ÿ)
+    high_latin = sum(1 for c in decoded_str if 0x0080 <= ord(c) <= 0x024F)
+    spaces = decoded_str.count(" ")
+
+    # If >50% high Latin chars and <10% spaces, likely misencoded
+    return high_latin / len(decoded_str) > 0.5 and spaces < len(decoded_str) * 0.1
+
+
+def _is_likely_misencoded_cross_asian(decoded_str, encoding):
+    """
+    Detect when Asian encodings misinterpret other Asian encodings.
+
+    Patterns:
+    - shift_jis decoding GB2312 produces excessive half-width katakana
+    - Asian encodings decoding Western text produce ASCII+CJK mix (unlikely)
+    """
+    if len(decoded_str) <= 3:
+        return False
+
+    # Pattern 1: Excessive half-width katakana (shift_jis misinterpreting GB2312)
+    # Half-width katakana range: U+FF61-FF9F
+    if encoding == "shift_jis":
+        half_width_katakana = sum(1 for c in decoded_str if 0xFF61 <= ord(c) <= 0xFF9F)
+        # If >30% is half-width katakana, likely wrong encoding
+        # (Real Japanese text uses mostly full-width kana and kanji)
+        if half_width_katakana / len(decoded_str) > 0.3:
+            return True
+
+    # Pattern 2: ASCII mixed with CJK (Asian encoding misinterpreting Western)
+    # CJK Unified Ideographs: U+4E00-U+9FFF
+    if encoding in ("big5", "gbk", "gb2312", "shift_jis", "euc-kr"):
+        ascii_chars = sum(1 for c in decoded_str if ord(c) < 128)
+        cjk_chars = sum(1 for c in decoded_str if 0x4E00 <= ord(c) <= 0x9FFF)
+
+        # If we have ASCII letters and scattered CJK chars, likely misencoded
+        # Real CJK text is mostly CJK with occasional ASCII punctuation
+        if ascii_chars > 0 and cjk_chars > 0:
+            # Check if there are ASCII letters (not just punctuation)
+            ascii_letters = sum(1 for c in decoded_str if c.isalpha() and ord(c) < 128)
+            # If we have ASCII letters AND CJK, and CJK is <50%, likely wrong
+            if ascii_letters >= 2 and cjk_chars / len(decoded_str) < 0.5:
+                return True
+
+    return False
+
+
+def decode_with_fallback(content_bytes):
+    """
+    Decode bytes to string, trying multiple encodings.
+
+    Strategy:
+    1. Try UTF-8 (most common)
+    2. Try common encodings with sanity checks
+    3. Fall back to latin-1 (decodes anything, but may produce garbage)
+
+    Returns decoded string or None if all attempts fail (only if truly binary).
+    """
+    # Try UTF-8 first (most common)
+    try:
+        decoded = content_bytes.decode("utf-8")
+        # Apply same heuristics as other encodings
+        if _is_likely_text(decoded):
+            return decoded
+    except (UnicodeDecodeError, AttributeError):
+        pass
+
+    # Try encodings from most to least restrictive. Even with improved heuristics,
+    # putting GBK/GB2312 early breaks too many other encodings. The order below
+    # maximizes correct detections while minimizing misdetections.
+    common_encodings = [
+        "shift_jis",   # Japanese (restrictive multi-byte)
+        "euc-kr",      # Korean (restrictive multi-byte)
+        "big5",        # Chinese Traditional (restrictive multi-byte)
+        "gbk",         # Chinese Simplified
+        "gb2312",      # Chinese Simplified, older
+        "cp1251",      # Cyrillic
+        "iso-8859-2",  # Central/Eastern European
+        "cp1252",      # Windows Western European (very permissive)
+        "latin-1",     # ISO-8859-1 fallback (never fails)
+    ]
+
+    for encoding in common_encodings:
+        try:
+            decoded = content_bytes.decode(encoding)
+
+            # Skip if decoded text looks corrupted
+            if not _is_likely_text(decoded):
+                continue
+
+            # Skip if Western encoding produced Asian-text-as-garbage pattern
+            if _is_likely_misencoded_asian_text(decoded, encoding):
+                continue
+
+            # Skip if Asian encoding misinterpreted other Asian/Western text
+            if _is_likely_misencoded_cross_asian(decoded, encoding):
+                continue
+
+            return decoded
+
+        except (UnicodeDecodeError, LookupError):
+            continue
+
+    # If we get here, all encodings failed sanity checks (truly binary data)
+    return None
+
+
 def traces_sampler(sampling_context):
     """
     Filter out noisy transactions.
@@ -251,10 +377,10 @@ def file(project_name, version, first, second, rest, distname, filepath):
         )

     if isinstance(contents, bytes):
-        try:
-            contents = contents.decode()
-        except UnicodeDecodeError:
+        decoded_contents = decode_with_fallback(contents)
+        if decoded_contents is None:
             return "Binary files are not supported."
+        contents = decoded_contents

     return render_template(
         "code.html", code=contents, name=file_extension, **common_params

tests/test_main.py

Lines changed: 81 additions & 0 deletions
@@ -1,8 +1,89 @@
 import pretend
+import pytest

 import inspector.main


+@pytest.mark.parametrize(
+    "text,encoding",
+    [
+        # UTF-8 (most common)
+        ("Hello, World!", "utf-8"),
+        # Windows CP1252 with trademark symbol
+        ("Windows™ text", "cp1252"),
+        # Shift_JIS - Japanese
+        ("こんにちは世界", "shift_jis"),
+        # EUC-KR - Korean
+        ("안녕하세요", "euc-kr"),
+        # Big5 - Traditional Chinese
+        ("繁體中文", "big5"),
+        # CP1251 - Russian/Cyrillic
+        ("Привет мир", "cp1251"),
+    ],
+)
+def test_decode_with_fallback_various_encodings(text, encoding):
+    """Test decoding bytes with various text encodings that work correctly.
+
+    These 6 encodings decode correctly with the current ordering and heuristics.
+    """
+    content = text.encode(encoding)
+    result = inspector.main.decode_with_fallback(content)
+    assert result == text
+
+
+@pytest.mark.parametrize(
+    "text,encoding,decoded_by",
+    [
+        ("你好世界", "gbk", "big5 or euc-kr"),
+        ("中文测试", "gb2312", "shift_jis (rejected) then euc-kr"),
+        ("Héllo Wörld", "iso-8859-1", "big5 (rejected) then cp1251"),
+        ("Cześć świat", "iso-8859-2", "big5 (rejected) then cp1251"),
+    ],
+)
+def test_decode_with_fallback_misdetected_encodings(text, encoding, decoded_by):
+    """Test encodings that still get misdetected despite improved heuristics.
+
+    These encodings are misdetected by earlier encodings in the `common_encodings` list.
+    Improved heuristics help but can't solve all cases without breaking others.
+
+    Tried cross-Asian heuristics that reject some misdetections (e.g., shift_jis
+    with excessive half-width katakana, Asian encodings with ASCII+CJK mix),
+    but ordering remains a fundamental trade-off:
+    no order works perfectly for all encodings.
+    """
+    content = text.encode(encoding)
+    result = inspector.main.decode_with_fallback(content)
+    # Should decode to something (not None), but won't match original
+    assert result is not None
+    assert isinstance(result, str)
+    assert len(result) > 0
+    # Verify it's actually different (misdetected)
+    assert result != text
+
+
+@pytest.mark.parametrize(
+    "description,binary_data",
+    [
+        (
+            "Random binary with null bytes",
+            bytes([0xFF, 0xFE, 0x00, 0x00, 0x01, 0x02, 0x03]),
+        ),
+        ("Null bytes only", bytes([0x00] * 10)),
+        ("Low control characters", bytes([0x01, 0x02, 0x03, 0x04, 0x05])),
+        ("JPEG header", bytes([0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10])),
+    ],
+)
+def test_decode_with_fallback_binary(description, binary_data):
+    """Test that binary data with many control characters returns None.
+
+    Binary data should be rejected by our heuristics even though some
+    encodings (like UTF-8 for ASCII control chars, or cp1251 for high bytes)
+    can technically decode them.
+    """
+    result = inspector.main.decode_with_fallback(binary_data)
+    assert result is None
+
+
 def test_versions(monkeypatch):
     stub_json = {"releases": {"0.5.1e": None}}
     stub_response = pretend.stub(
