 from .utilities import pypi_report_form, requests_session


+def _is_likely_text(decoded_str):
+    """Check if decoded string looks like valid text (not corrupted)."""
+    if not decoded_str:
+        return True
+
+    # Too many control characters suggests wrong encoding
+    control_chars = sum(1 for c in decoded_str if ord(c) < 32 and c not in "\t\n\r")
+    return control_chars / len(decoded_str) <= 0.3
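+
+# Illustrative examples (hypothetical inputs, not part of the original patch),
+# assuming the 30% control-character threshold above:
+#   _is_likely_text("hello world")       -> True  (no control characters)
+#   _is_likely_text("\x00\x01\x02\x03")  -> False (100% control characters)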
+
+
+def _is_likely_misencoded_asian_text(decoded_str, encoding):
+    """
+    Detect when Western encodings decode Asian text as Latin Extended garbage.
+
+    When cp1252/latin-1 decode multi-byte Asian text, they produce strings
+    with many Latin Extended/Supplement characters and few or no spaces.
+    """
+    if encoding not in ("cp1252", "latin-1") or len(decoded_str) <= 3:
+        return False
+
+    # Count Latin-1 Supplement and Latin Extended-A/B characters (U+0080-U+024F)
+    high_latin = sum(1 for c in decoded_str if 0x0080 <= ord(c) <= 0x024F)
+    spaces = decoded_str.count(" ")
+
+    # If >50% high Latin chars and <10% spaces, likely misencoded
+    return high_latin / len(decoded_str) > 0.5 and spaces < len(decoded_str) * 0.1
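+
+# Illustrative behaviour (hypothetical inputs, not part of the original patch):
+# cp1252 reads UTF-8 "日本語" (b"\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e") as
+# "æ—¥æœ¬èªž" -- 8 of 9 characters fall in U+0080-U+024F with no spaces:
+#   _is_likely_misencoded_asian_text("æ—¥æœ¬èªž", "cp1252")    -> True
+#   _is_likely_misencoded_asian_text("café au lait", "cp1252")  -> False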
+
+
+def _is_likely_misencoded_cross_asian(decoded_str, encoding):
+    """
+    Detect when Asian encodings misinterpret other Asian encodings.
+
+    Patterns:
+    - shift_jis decoding GB2312 produces excessive half-width katakana
+    - Asian encodings decoding Western text produce an ASCII+CJK mix
+      (unlikely in real text)
+    """
+    if len(decoded_str) <= 3:
+        return False
+
+    # Pattern 1: Excessive half-width katakana (shift_jis misinterpreting GB2312)
+    # Half-width katakana range: U+FF61-FF9F
+    if encoding == "shift_jis":
+        half_width_katakana = sum(1 for c in decoded_str if 0xFF61 <= ord(c) <= 0xFF9F)
+        # If >30% is half-width katakana, likely wrong encoding
+        # (real Japanese text uses mostly full-width kana and kanji)
+        if half_width_katakana / len(decoded_str) > 0.3:
+            return True
+
+    # Pattern 2: ASCII mixed with CJK (Asian encoding misinterpreting Western)
+    # CJK Unified Ideographs: U+4E00-U+9FFF
+    if encoding in ("big5", "gbk", "gb2312", "shift_jis", "euc-kr"):
+        ascii_chars = sum(1 for c in decoded_str if ord(c) < 128)
+        cjk_chars = sum(1 for c in decoded_str if 0x4E00 <= ord(c) <= 0x9FFF)
+
+        # ASCII letters plus scattered CJK characters suggest a misdecode;
+        # real CJK text is mostly CJK with occasional ASCII punctuation
+        if ascii_chars > 0 and cjk_chars > 0:
+            # Check if there are ASCII letters (not just punctuation)
+            ascii_letters = sum(1 for c in decoded_str if c.isalpha() and ord(c) < 128)
+            # If we have ASCII letters AND CJK, and CJK is <50%, likely wrong
+            if ascii_letters >= 2 and cjk_chars / len(decoded_str) < 0.5:
+                return True
+
+    return False
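+
+# Illustrative behaviour (hypothetical inputs, not part of the original patch):
+# shift_jis reads GB2312 "中文" (b"\xd6\xd0\xce\xc4") as four half-width
+# katakana characters, so pattern 1 fires:
+#   _is_likely_misencoded_cross_asian("ﾖﾐﾎﾄ", "shift_jis")    -> True
+# Genuine CJK text with no ASCII letters is left alone:
+#   _is_likely_misencoded_cross_asian("这是中文文本", "gbk")   -> False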
+
+
+def decode_with_fallback(content_bytes):
+    """
+    Decode bytes to string, trying multiple encodings.
+
+    Strategy:
+    1. Try UTF-8 (most common)
+    2. Try common encodings with sanity checks
+    3. Fall back to latin-1 (decodes anything, but may produce garbage)
+
+    Returns the decoded string, or None if all attempts fail (which only
+    happens for truly binary data).
+    """
+    # Try UTF-8 first (most common)
+    try:
+        decoded = content_bytes.decode("utf-8")
+        # Apply the same heuristics as for the other encodings
+        if _is_likely_text(decoded):
+            return decoded
+    except (UnicodeDecodeError, AttributeError):
+        pass
+
+    # Try encodings from most to least restrictive. Even with improved heuristics,
+    # putting GBK/GB2312 early breaks too many other encodings. The order below
+    # maximizes correct detections while minimizing misdetections.
+    common_encodings = [
+        "shift_jis",   # Japanese (restrictive multi-byte)
+        "euc-kr",      # Korean (restrictive multi-byte)
+        "big5",        # Chinese Traditional (restrictive multi-byte)
+        "gbk",         # Chinese Simplified
+        "gb2312",      # Chinese Simplified, older
+        "cp1251",      # Cyrillic
+        "iso-8859-2",  # Central/Eastern European
+        "cp1252",      # Windows Western European (very permissive)
+        "latin-1",     # ISO-8859-1 fallback (never fails to decode)
+    ]
+
+    for encoding in common_encodings:
+        try:
+            decoded = content_bytes.decode(encoding)
+
+            # Skip if decoded text looks corrupted
+            if not _is_likely_text(decoded):
+                continue
+
+            # Skip if a Western encoding produced the Asian-text-as-garbage pattern
+            if _is_likely_misencoded_asian_text(decoded, encoding):
+                continue
+
+            # Skip if an Asian encoding misinterpreted other Asian/Western text
+            if _is_likely_misencoded_cross_asian(decoded, encoding):
+                continue
+
+            return decoded
+
+        except (UnicodeDecodeError, LookupError):
+            continue
+
+    # If we get here, every encoding failed the sanity checks (truly binary data)
+    return None
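+
+# Illustrative end-to-end behaviour (hypothetical inputs, not part of the
+# original patch):
+#   decode_with_fallback("日本語".encode("utf-8"))      -> "日本語"  (UTF-8 wins)
+#   decode_with_fallback("日本語".encode("shift_jis"))  -> "日本語"  (fallback loop)
+#   decode_with_fallback("한국어".encode("euc-kr"))     -> "한국어"  (shift_jis fails first)
+#   decode_with_fallback(b"\x00\x01\x02\x03")           -> None     (binary data)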
+
+
 def traces_sampler(sampling_context):
     """
     Filter out noisy transactions.
@@ -251,10 +377,10 @@ def file(project_name, version, first, second, rest, distname, filepath): |
     )

     if isinstance(contents, bytes):
-        try:
-            contents = contents.decode()
-        except UnicodeDecodeError:
+        decoded_contents = decode_with_fallback(contents)
+        if decoded_contents is None:
             return "Binary files are not supported."
+        contents = decoded_contents

     return render_template(
         "code.html", code=contents, name=file_extension, **common_params