Merge pull request #63 from pescheckit/hotfix_fixed-issue-with-whitespaces

pescheck-bram · web-flow · commit 3985e059c26d · 2025-10-08T12:57:48.000+02:00
Fixed issue with whitespaces
diff --git a/docs/usage.md b/docs/usage.md
@@ -426,6 +426,86 @@ gpt-po-translator --folder ./locales --lang de --no-ai-comment
 
 ---
 
+## Whitespace Handling in Translations
+
+### Overview
+
+The tool automatically preserves leading and trailing whitespace from `msgid` entries in translations. While **best practice** is to handle whitespace in your UI framework rather than in translation strings, the tool ensures that any existing whitespace patterns are maintained exactly.
+
+### How Whitespace Preservation Works
+
+The tool uses a three-step process to reliably preserve whitespace:
+
+1. **Detection and Warning**
+   When processing PO files, the tool scans for entries with leading or trailing whitespace and logs a warning:
+   ```
+   WARNING: Found 3 entries with leading/trailing whitespace in messages.po
+   Whitespace will be preserved in translations, but ideally should be handled in your UI framework.
+   ```
+
+2. **Before Sending to AI** (Bulk Mode)
+   To prevent the AI from being confused by or accidentally modifying whitespace, the tool:
+   - Strips all leading/trailing whitespace from texts
+   - Stores the original whitespace pattern
+   - Sends only the clean text content to the AI
+
+   For example, if `msgid` is `" Incorrect"` (with leading space), the AI receives only `"Incorrect"`.
+
+3. **After Receiving Translation**
+   Once the AI returns the translation, the tool:
+   - Extracts the original whitespace pattern from the source `msgid`
+   - Applies that exact pattern to the translated `msgstr`
+   - Ensures the output matches the input whitespace structure
+
+   So `" Incorrect"` → AI translates `"Incorrect"` → Result: `" Incorreto"` (leading space preserved)
+
+### Examples
+
+| Original msgid | AI Receives | AI Returns | Final msgstr |
+|----------------|-------------|------------|--------------|
+| `" Hello"` | `"Hello"` | `"Bonjour"` | `" Bonjour"` |
+| `"World "` | `"World"` | `"Monde"` | `"Monde "` |
+| `"  Hi  "` | `"Hi"` | `"Salut"` | `"  Salut  "` |
+| `"\tTab"` | `"Tab"` | `"Onglet"` | `"\tOnglet"` |
+
+### Why This Approach Is Reliable
+
+This implementation is **bulletproof** because:
+- **The AI never sees the problematic whitespace**, so it can't strip or modify it
+- **Whitespace is managed entirely in code**, not reliant on AI behavior
+- **Works consistently across all providers** (OpenAI, Anthropic, Azure, DeepSeek)
+- **Handles edge cases**: empty strings, whitespace-only strings, mixed whitespace types (spaces, tabs, newlines)
+
+### Single vs. Bulk Mode
+
+- **Single Mode**: Each text is stripped before sending to AI, then whitespace is restored after receiving the translation
+- **Bulk Mode**: Entire batches are stripped before sending to AI (JSON array of clean texts), then whitespace is restored to each translation individually
+
+Both modes use the same preservation logic, ensuring consistent behavior.
+
+### Best Practices
+
+1. **Avoid whitespace in msgid when possible**
+   Whitespace in translation strings can cause formatting issues. Instead, handle spacing in your UI layer:
+   ```python
+   # Bad - whitespace in msgid
+   msgid " Settings"
+
+   # Good - whitespace in code
+   print(f"  {_('Settings')}")
+   ```
+
+2. **If whitespace is unavoidable**
+   The tool will preserve it automatically. Use verbose mode to see which entries contain whitespace:
+   ```bash
+   gpt-po-translator --folder ./locales --lang fr -vv
+   ```
+
+3. **Review whitespace warnings**
+   When the tool warns about whitespace entries, consider refactoring your code to move the whitespace out of the translation strings.
+
+---
+
 ## Behind the Scenes: API Calls and Post-Processing
 
 - **Provider-Specific API Calls:**  
diff --git a/python_gpt_po/services/translation_service.py b/python_gpt_po/services/translation_service.py
@@ -206,6 +206,9 @@ def perform_translation_without_validation(
             target_language: str,
             detail_language: Optional[str] = None) -> str:
         """Performs translation without validation for single words or short phrases."""
+        # Strip text before sending to AI (whitespace will be restored in validate_translation)
+        text_stripped = text.strip()
+
         # Use the detailed language name if provided, otherwise use the short code
         target_lang_text = detail_language if detail_language else target_language
 
@@ -216,7 +219,7 @@ def perform_translation_without_validation(
         )
 
         return self.validate_translation(text, self.perform_translation(
-            prompt + text, target_language, is_bulk=False, detail_language=detail_language
+            prompt + text_stripped, target_language, is_bulk=False, detail_language=detail_language
         ), target_language)
 
     @staticmethod
@@ -231,6 +234,7 @@ def get_translation_prompt(target_language: str, is_bulk: bool, detail_language:
                 "Provide only the translations in a JSON array format, maintaining the original order. "
                 "Each translation should be concise and direct, without explanations or additional context. "
                 "Keep special characters, placeholders, and formatting intact. "
+                "Do NOT add or remove any leading/trailing whitespace - translate only the text content. "
                 "If a term should not be translated (like 'URL' or technical terms), keep it as is. "
                 "Example format: [\"Translation 1\", \"Translation 2\", ...]\n\n"
                 "Texts to translate:\n"
@@ -253,15 +257,21 @@ def perform_translation(
         """Performs the actual translation using the selected provider's API."""
         logging.debug("Translating to '%s' via %s API", target_language, self.config.provider.value)
         prompt = self.get_translation_prompt(target_language, is_bulk, detail_language)
-        content = prompt + (json.dumps(texts) if is_bulk else texts)
+
+        # For bulk mode, strip whitespace before sending to AI
+        if is_bulk:
+            stripped_texts = [text.strip() for text in texts]
+            content = prompt + json.dumps(stripped_texts)
+        else:
+            content = prompt + texts
 
         try:
             # Get the response text from the provider
             response_text = self._get_provider_response(content)
 
             # Process the response according to bulk mode
             if is_bulk:
-                return self._process_bulk_response(response_text, texts, target_language)
+                return self._process_bulk_response(response_text, texts, target_language, stripped_texts)
             return self.validate_translation(texts, response_text, target_language)
 
         except Exception as e:
@@ -280,67 +290,112 @@ def _get_provider_response(self, content: str) -> str:
             return ""
         return provider_instance.translate(self.config.provider_clients, self.config.model, content)
 
-    def _process_bulk_response(self, response_text: str, original_texts: List[str], target_language: str) -> List[str]:
-        """Process a bulk translation response."""
+    @staticmethod
+    def _fix_json_quotes(json_text: str) -> str:
+        """Fix non-standard quotes in JSON response.
+
+        Args:
+            json_text: JSON text with potentially non-standard quotes
+
+        Returns:
+            JSON text with normalized quotes
+        """
+        quote_fixes = [
+            ('"', '"'),   # Left double quotation mark
+            ('"', '"'),   # Right double quotation mark
+            ('„', '"'),   # Double low-9 quotation mark (Lithuanian, German)
+            ('"', '"'),   # Left double quotation mark (alternative)
+            (''', "'"),   # Left single quotation mark
+            (''', "'"),   # Right single quotation mark
+            ('‚', "'"),   # Single low-9 quotation mark
+            ('«', '"'),   # Left-pointing double angle quotation mark
+            ('»', '"'),   # Right-pointing double angle quotation mark
+            ('‹', "'"),   # Left-pointing single angle quotation mark
+            ('›', "'"),   # Right-pointing single angle quotation mark
+        ]
+
+        fixed_text = json_text
+        for old_quote, new_quote in quote_fixes:
+            fixed_text = fixed_text.replace(old_quote, new_quote)
+
+        # Apply regex fix to handle quotes inside strings
+        fixed_text = re.sub(
+            r'"([^"\\]*(\\.[^"\\]*)*)"',
+            lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
+            fixed_text
+        )
+        return fixed_text
+
+    def _extract_translations_from_malformed_json(
+            self,
+            json_text: str,
+            expected_count: int) -> List[str]:
+        """Extract translations from malformed JSON as a fallback.
+
+        Args:
+            json_text: Malformed JSON text
+            expected_count: Expected number of translations
+
+        Returns:
+            List of extracted translations
+
+        Raises:
+            ValueError: If extraction fails or count mismatch
+        """
+        if '[' not in json_text or ']' not in json_text:
+            raise ValueError("No array structure found in malformed JSON")
+
+        # Extract content between first [ and last ]
+        start_idx = json_text.find('[')
+        end_idx = json_text.rfind(']') + 1
+        array_content = json_text[start_idx:end_idx]
+
+        # Try to extract quoted strings
+        matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
+        if not matches or len(matches) != expected_count:
+            raise ValueError(
+                f"Could not extract expected number of translations "
+                f"(expected {expected_count}, got {len(matches) if matches else 0})"
+            )
+
+        # Unescape the extracted strings
+        return [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
+
+    def _process_bulk_response(
+            self,
+            response_text: str,
+            original_texts: List[str],
+            target_language: str,
+            _stripped_texts: Optional[List[str]] = None) -> List[str]:
+        """Process a bulk translation response.
+
+        Args:
+            response_text: The raw response from the AI provider
+            original_texts: The original texts WITH whitespace
+            target_language: Target language code
+            _stripped_texts: The stripped texts sent to AI (unused, for future use)
+        """
+        # Note: _stripped_texts parameter kept for future validation features
+        # Current validation happens per-entry using original_texts
         try:
-            # Clean the response text for formatting issues
             clean_response = self._clean_json_response(response_text)
             logging.debug("Cleaned JSON response: %s...", clean_response[:100])
 
             # First attempt: try parsing as-is
             try:
                 translated_texts = json.loads(clean_response)
             except json.JSONDecodeError:
-                # Second attempt: fix various quote types that break JSON
-                # First, normalize all quote types to standard quotes
-                # Handle different languages' quotation marks
-                quote_fixes = [
-                    ('"', '"'),   # Left double quotation mark
-                    ('"', '"'),   # Right double quotation mark
-                    ('„', '"'),   # Double low-9 quotation mark (Lithuanian, German)
-                    ('"', '"'),   # Left double quotation mark (alternative)
-                    (''', "'"),   # Left single quotation mark
-                    (''', "'"),   # Right single quotation mark
-                    ('‚', "'"),   # Single low-9 quotation mark
-                    ('«', '"'),   # Left-pointing double angle quotation mark
-                    ('»', '"'),   # Right-pointing double angle quotation mark
-                    ('‹', "'"),   # Left-pointing single angle quotation mark
-                    ('›', "'"),   # Right-pointing single angle quotation mark
-                ]
-
-                fixed_response = clean_response
-                for old_quote, new_quote in quote_fixes:
-                    fixed_response = fixed_response.replace(old_quote, new_quote)
-
-                # Apply fix to all JSON strings (but not the JSON structure quotes)
+                # Second attempt: fix non-standard quotes
+                fixed_response = self._fix_json_quotes(clean_response)
                 try:
-                    # More sophisticated regex to handle quotes inside strings
-                    fixed_response = re.sub(
-                        r'"([^"\\]*(\\.[^"\\]*)*)"',
-                        lambda m: f'"{m.group(1).replace(chr(92) + chr(34), chr(34))}"',
-                        fixed_response)
                     translated_texts = json.loads(fixed_response)
-                except json.JSONDecodeError as e:
-                    # Final attempt: try to extract array elements manually
-                    # This is a fallback for severely malformed JSON
-                    logging.warning("API returned malformed JSON, attempting to extract translations manually")
-
-                    # Try to find array-like structure and extract elements
-                    if '[' in fixed_response and ']' in fixed_response:
-                        # Extract content between first [ and last ]
-                        start_idx = fixed_response.find('[')
-                        end_idx = fixed_response.rfind(']') + 1
-                        array_content = fixed_response[start_idx:end_idx]
-
-                        # Try to extract quoted strings
-                        matches = re.findall(r'"([^"]*(?:\\.[^"]*)*)"', array_content)
-                        if matches and len(matches) == len(original_texts):
-                            # Unescape the extracted strings
-                            translated_texts = [match.replace('\\"', '"').replace("\\'", "'") for match in matches]
-                        else:
-                            raise ValueError("Could not extract expected number of translations") from e
-                    else:
-                        raise
+                except json.JSONDecodeError:
+                    # Final attempt: extract from malformed JSON
+                    logging.warning("API returned malformed JSON, extracting translations manually")
+                    translated_texts = self._extract_translations_from_malformed_json(
+                        fixed_response,
+                        len(original_texts)
+                    )
 
             # Validate the format
             if not isinstance(translated_texts, list) or len(translated_texts) != len(original_texts):
@@ -386,9 +441,19 @@ def _clean_json_response(self, response_text: str) -> str:
 
     def validate_translation(self, original: str, translated: str, target_language: str) -> str:
         """Validates the translation and retries if necessary."""
+        # Extract leading/trailing whitespace from original
+        original_stripped = original.strip()
+        if not original_stripped:
+            # If original is all whitespace, preserve it as-is
+            return original
+
+        leading_ws = original[:len(original) - len(original.lstrip())]
+        trailing_ws = original[len(original.rstrip()):]
+
+        # Strip the translation for validation
         translated = translated.strip()
 
-        if len(translated.split()) > 2 * len(original.split()) + 1:
+        if len(translated.split()) > 2 * len(original_stripped.split()) + 1:
             logging.debug("Translation too verbose (%d words), retrying", len(translated.split()))
             return self.retry_long_translation(original, target_language)
 
@@ -397,10 +462,16 @@ def validate_translation(self, original: str, translated: str, target_language:
             logging.debug("Translation contains explanation, retrying")
             return self.retry_long_translation(original, target_language)
 
-        return translated
+        # Restore original whitespace
+        return leading_ws + translated + trailing_ws
 
     def retry_long_translation(self, text: str, target_language: str) -> str:
         """Retries translation for long or explanatory responses."""
+        # Extract leading/trailing whitespace from original
+        leading_ws = text[:len(text) - len(text.lstrip())]
+        trailing_ws = text[len(text.rstrip()):]
+        text_stripped = text.strip()
+
         prompt = (
             f"Translate this text concisely from English to {target_language}. "
             "Provide only the direct translation without any explanation or additional context. "
@@ -410,15 +481,16 @@ def retry_long_translation(self, text: str, target_language: str) -> str:
         )
 
         try:
-            content = prompt + text
-            retried_translation = self._get_provider_response(content)
+            content = prompt + text_stripped
+            retried_translation = self._get_provider_response(content).strip()
 
-            if len(retried_translation.split()) > 2 * len(text.split()) + 1:
+            if len(retried_translation.split()) > 2 * len(text_stripped.split()) + 1:
                 logging.debug("Retry still too verbose, skipping")
                 return ""  # Return empty string instead of English text
 
             logging.debug("Retry successful")
-            return retried_translation
+            # Restore original whitespace
+            return leading_ws + retried_translation + trailing_ws
 
         except Exception as e:
             logging.debug("Retry failed: %s", str(e)[:100])
@@ -732,6 +804,22 @@ def _prepare_translation_request(self, po_file, po_file_path, file_lang, detail_
         texts = [entry.msgid for entry in entries]
         detail_lang = detail_languages.get(file_lang) if detail_languages else None
 
+        # Check for and warn about whitespace in msgid
+        whitespace_entries = [
+            text for text in texts
+            if text and (text != text.strip())
+        ]
+        if whitespace_entries:
+            logging.warning(
+                "Found %d entries with leading/trailing whitespace in %s. "
+                "Whitespace will be preserved in translations, but ideally should be handled in your UI framework.",
+                len(whitespace_entries),
+                po_file_path
+            )
+            if logging.getLogger().isEnabledFor(logging.DEBUG):
+                for text in whitespace_entries[:3]:  # Show first 3 examples
+                    logging.debug("  Example: %s", repr(text))
+
         return TranslationRequest(
             po_file=po_file,
             entries=entries,
diff --git a/python_gpt_po/tests/test_whitespace_preservation.py b/python_gpt_po/tests/test_whitespace_preservation.py