From 504c0cb3f1ef109dfb443dac39c298f9d7105a43 Mon Sep 17 00:00:00 2001
From: keyul2 <minizhiren@outlook.com>
Date: Tue, 14 Oct 2025 19:15:44 -0500
Subject: [PATCH 1/2] Fix: add lemma immunity in WordCoherencyRule to prevent
 false positives for inflected forms

---
 .../rules/en/WordCoherencyRule.java           | 74 ++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)
diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
index 347125072b3c..b063ec9f9a4f 100644
--- a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
+++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
@@ -19,12 +19,21 @@
 package org.languagetool.rules.en;
 
 import java.io.IOException;
+import java.util.Objects;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.ResourceBundle;
 import java.util.Set;
 
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedToken;
+import org.languagetool.AnalyzedTokenReadings;
 import org.languagetool.rules.AbstractWordCoherencyRule;
 import org.languagetool.rules.Example;
+import org.languagetool.rules.RuleMatch;
 import org.languagetool.rules.WordCoherencyDataLoader;
 
 /**
@@ -34,10 +43,71 @@ public class WordCoherencyRule extends AbstractWordCoherencyRule {
 
   private static final Map<String, Set<String>> wordMap = new WordCoherencyDataLoader().loadWords("/en/coherency.txt");
 
+  @Override
+  public RuleMatch[] match(List<AnalyzedSentence> sentences) {
+    List<RuleMatch> ruleMatches = new ArrayList<>();
+    Map<String, String> shouldNotAppearWord = new HashMap<>(); // e.g. doggie -> doggy
+    int pos = 0;
+
+    for (AnalyzedSentence sentence : sentences) {
+      AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
+      for (AnalyzedTokenReadings atr : tokens) {
+        String surface = atr.getToken();
+        String surfaceLc = surface.toLowerCase(Locale.ROOT);
+
+        // Check whether this token is one of the variant candidates
+        Set<String> variants = getWordMap().get(surfaceLc);
+        if (variants == null || variants.isEmpty()) {
+          continue;
+        }
+
+        // ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ======
+        Set<String> lemmasLc = atr.getReadings().stream()
+            .map(AnalyzedToken::getLemma)
+            .filter(Objects::nonNull)
+            .map(s -> s.toLowerCase(Locale.ROOT))
+            .collect(java.util.stream.Collectors.toSet());
+        if (!java.util.Collections.disjoint(lemmasLc, variants)) {
+           // The lemma itself is one of the coherent variants → inflected form → skip reporting
+          continue;
+        }
+        // ========================================================================
+
+        int fromPos = pos + atr.getStartPos();
+        int toPos = pos + atr.getEndPos();
+
+        // If an alternative spelling has already been encountered, this word is the opposite variant → create a match
+        if (shouldNotAppearWord.containsKey(surfaceLc)) {
+          String other = shouldNotAppearWord.get(surfaceLc);
+          String msg = getMessage(surface, other);
+          RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg);
+
+          String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos());
+          String replacement = createReplacement(marked, surfaceLc, other, atr);
+          if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) {
+            replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement);
+          }
+          if (!marked.equalsIgnoreCase(replacement)) {
+            rm.setSuggestedReplacement(replacement);
+            ruleMatches.add(rm);
+          }
+          rm.setShortMessage(getShortMessage());
+        } else {
+          // Record the variant spelling so that later occurrences of the opposite form can be detected
+          for (String v : variants) {
+            shouldNotAppearWord.put(v, surfaceLc);
+          }
+        }
+      }
+      pos += sentence.getCorrectedTextLength();
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
   public WordCoherencyRule(ResourceBundle messages) throws IOException {
     super(messages);
     addExamplePair(Example.wrong("He likes archaeology. Really? She likes <marker>archeology</marker>, too."),
-                   Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
+        Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
   }
 
   @Override
@@ -49,7 +119,7 @@ protected Map<String, Set<String>> getWordMap() {
   protected String getMessage(String word1, String word2) {
     return "Do not mix variants of the same word ('" + word1 + "' and '" + word2 + "') within a single text.";
   }
-  
+
   @Override
   public String getId() {
     return "EN_WORD_COHERENCY";

From 77ea1d5b6a9e58dd7c002df5fcfe4b1bff9e5beb Mon Sep 17 00:00:00 2001
From: keyul2 <minizhiren@outlook.com>
Date: Tue, 14 Oct 2025 22:08:02 -0500
Subject: [PATCH 2/2] Refine WordCoherencyRule: safer fallback, cleanup,
 improved inflection handling

---
 .../rules/en/WordCoherencyRule.java           | 208 +++++++++++++-----
 1 file changed, 149 insertions(+), 59 deletions(-)

diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
index b063ec9f9a4f..8824b11872ab 100644
--- a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
+++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java
@@ -1,26 +1,9 @@
-/* LanguageTool, a natural language style checker 
- * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
- * 
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
- * USA
- */
 package org.languagetool.rules.en;
 
 import java.io.IOException;
 import java.util.Objects;
 import java.util.ArrayList;
+import java.util.LinkedHashSet;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -36,80 +19,111 @@
 import org.languagetool.rules.RuleMatch;
 import org.languagetool.rules.WordCoherencyDataLoader;
 
-/**
- * English version of {@link AbstractWordCoherencyRule}
- */
+/** English version of {@link AbstractWordCoherencyRule}. */
 public class WordCoherencyRule extends AbstractWordCoherencyRule {
+  private static final boolean DEBUG = Boolean.getBoolean("lt.debug.coherency"); // debug switch, read once from the JVM system property lt.debug.coherency
+
+  private static void dbg(String fmt, Object... args) { // trace helper: writes one formatted line to stderr, and only when DEBUG is set
+    if (DEBUG)
+      System.err.println(String.format(fmt, args)); // fmt/args follow String.format conventions
+  }
 
   private static final Map<String, Set<String>> wordMap = new WordCoherencyDataLoader().loadWords("/en/coherency.txt");
 
+  public WordCoherencyRule(ResourceBundle messages) throws IOException { // passes messages to the superclass and registers one wrong/fixed example pair
+    super(messages);
+    addExamplePair(
+        Example.wrong("He likes archaeology. Really? She likes <marker>archeology</marker>, too."),
+        Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
+    if (DEBUG) // NOTE(review): redundant guard, dbg() already checks DEBUG itself
+      dbg("Loaded wordMap with %d entries", wordMap.size());
+  }
+
   @Override
   public RuleMatch[] match(List<AnalyzedSentence> sentences) {
     List<RuleMatch> ruleMatches = new ArrayList<>();
-    Map<String, String> shouldNotAppearWord = new HashMap<>(); // e.g. doggie -> doggy
+    Map<String, String> shouldNotAppearWord = new HashMap<>(); // variant spelling -> key that registered it earlier in the text
     int pos = 0;
 
     for (AnalyzedSentence sentence : sentences) {
+      dbg("=== NEW SENTENCE === %s", sentence.getText());
       AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
+
       for (AnalyzedTokenReadings atr : tokens) {
         String surface = atr.getToken();
         String surfaceLc = surface.toLowerCase(Locale.ROOT);
+        dbg("Token='%s'", surface);
 
-        // Check whether this token is one of the variant candidates
-        Set<String> variants = getWordMap().get(surfaceLc);
-        if (variants == null || variants.isEmpty()) {
-          continue;
-        }
-
-        // ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ======
+        // Collect the lower-cased lemmas of all readings of this token; readings without a lemma are ignored.
         Set<String> lemmasLc = atr.getReadings().stream()
             .map(AnalyzedToken::getLemma)
             .filter(Objects::nonNull)
             .map(s -> s.toLowerCase(Locale.ROOT))
             .collect(java.util.stream.Collectors.toSet());
-        if (!java.util.Collections.disjoint(lemmasLc, variants)) {
-           // The lemma itself is one of the coherent variants → inflected form → skip reporting
-          continue;
-        }
-        // ========================================================================
+        dbg("  Lemmas=%s", lemmasLc);
 
-        int fromPos = pos + atr.getStartPos();
-        int toPos = pos + atr.getEndPos();
+        List<String> keysToCheck = candidateKeys(surfaceLc, lemmasLc);
+        dbg("  keysToCheck=%s", keysToCheck);
 
-        // If an alternative spelling has already been encountered, this word is the opposite variant → create a match
-        if (shouldNotAppearWord.containsKey(surfaceLc)) {
-          String other = shouldNotAppearWord.get(surfaceLc);
-          String msg = getMessage(surface, other);
-          RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg);
+        for (String key : keysToCheck) {
+          Set<String> variants = wordMap.get(key);
+          if (variants == null || variants.isEmpty())
+            continue;
+          dbg("  key='%s' → variants=%s", key, variants);
 
-          String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos());
-          String replacement = createReplacement(marked, surfaceLc, other, atr);
-          if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) {
-            replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement);
+          if (shouldSkipInflection(surfaceLc, key, lemmasLc)) {
+            dbg("  skipped inflection for key='%s'", key);
+            continue;
           }
-          if (!marked.equalsIgnoreCase(replacement)) {
-            rm.setSuggestedReplacement(replacement);
-            ruleMatches.add(rm);
+
+          int fromPos = pos + atr.getStartPos();
+          int toPos = pos + atr.getEndPos();
+
+          // Look for an earlier registration: first under the key itself, then under any of the key's variants.
+          String matchKey = null;
+          if (shouldNotAppearWord.containsKey(key)) {
+            matchKey = key;
+          } else {
+            for (String v : variants) { // NOTE(review): also hits when the same spelling repeats (its variants were registered by the first occurrence); 'other' then equals key
+              if (shouldNotAppearWord.containsKey(v)) {
+                matchKey = v;
+                break;
+              }
+            }
           }
-          rm.setShortMessage(getShortMessage());
-        } else {
-          // Record the variant spelling so that later occurrences of the opposite form can be detected
-          for (String v : variants) {
-            shouldNotAppearWord.put(v, surfaceLc);
+
+          if (matchKey != null) {
+            String other = shouldNotAppearWord.get(matchKey);
+            dbg("  MATCH: '%s' vs '%s' (span %d-%d)", surface, other, fromPos, toPos);
+            String msg = getMessage(surface, other);
+            RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg);
+
+            String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos());
+            String replacement = createReplacement(marked, key, other, atr);
+            if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) {
+              replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement);
+            }
+            if (!marked.equalsIgnoreCase(replacement)) { // report only when the suggestion differs from the marked text (ignoring case)
+              rm.setSuggestedReplacement(replacement);
+              ruleMatches.add(rm);
+            }
+            rm.setShortMessage(getShortMessage());
+            break; // stop checking further keys for this token, whether or not a match was added
+          } else {
+            // Nothing registered yet: map every variant of this key back to the key so later occurrences can be detected.
+            for (String v : variants) {
+              shouldNotAppearWord.put(v, key);
+            }
+            dbg("  register opposites: %s → %s", variants, key);
           }
         }
       }
       pos += sentence.getCorrectedTextLength();
     }
+    dbg("Total matches: %d", ruleMatches.size());
     return toRuleMatchArray(ruleMatches);
   }
 
-  public WordCoherencyRule(ResourceBundle messages) throws IOException {
-    super(messages);
-    addExamplePair(Example.wrong("He likes archaeology. Really? She likes <marker>archeology</marker>, too."),
-        Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
-  }
-
   @Override
   protected Map<String, Set<String>> getWordMap() {
     return wordMap;
@@ -130,4 +144,80 @@ public String getDescription() {
     return "Coherent spelling of words with two admitted variants.";
   }
 
+  private static List<String> candidateKeys(String surfaceLc, Set<String> lemmasLc) { // builds the de-duplicated wordMap lookup keys for one token
+    LinkedHashSet<String> keys = new LinkedHashSet<>();
+    if (!lemmasLc.isEmpty())
+      keys.addAll(lemmasLc); // lemmas are used instead of the surface form, not in addition to it
+    else
+      keys.add(surfaceLc); // no lemma available: use the lower-cased surface form
+
+    boolean anyHit = keys.stream().anyMatch(k -> { // true if at least one key has a non-empty wordMap entry
+      Set<String> vs = wordMap.get(k);
+      return vs != null && !vs.isEmpty();
+    });
+    if (!anyHit) { // only when no key is known: try the "-ed"-stripped fallbacks
+      for (String cand : computePastTenseFallbacks(surfaceLc)) {
+        Set<String> vs = wordMap.get(cand);
+        if (vs != null && !vs.isEmpty())
+          keys.add(cand); // keep a fallback only if wordMap actually knows it
+      }
+    }
+    return new ArrayList<>(keys);
+  }
+
+  /** Fallback keys for a form ending in "ed": the form minus "ed" and, if that ends in '-', also without the hyphen. Empty list for null or any other form. */
+  private static List<String> computePastTenseFallbacks(String surfaceLc) {
+    List<String> extra = new ArrayList<>();
+    if (surfaceLc == null) return extra;
+
+    int len = surfaceLc.length();
+    // only forms of at least 3 characters that end in "ed" yield candidates
+    if (len >= 3 && surfaceLc.endsWith("ed")) {
+      String base = surfaceLc.substring(0, len - 2);   // non-empty: len >= 3, so len-2 >= 1
+      if (!base.isEmpty()) {
+        extra.add(base);                                // e.g., "reelected" -> "reelect", "re-elected" -> "re-elect"
+        // suffix attached with a hyphen ("...-ed"): base is then "...-", so also offer it without the hyphen
+        if (base.charAt(base.length() - 1) == '-') {
+          String baseNoHyphen = base.substring(0, base.length() - 1);  // in range: base has at least one char
+          if (!baseNoHyphen.isEmpty()) {
+            extra.add(baseNoHyphen);                    // e.g., "co-ed": base "co-" -> "co"
+          }
+        }
+      }
+    }
+    return extra;
+  }
+
+
+  private static boolean shouldSkipInflection(String surfaceLc, String key, Set<String> lemmasLc) { // true when key is one of the token's lemmas and the surface is a whitelisted inflection of it
+    return !lemmasLc.isEmpty() && lemmasLc.contains(key) && isNounOrAdjInflectionOf(surfaceLc, key); // the emptiness check is implied by contains(); kept as written
+  }
+
+  /** Returns true only if surfaceLc is lemmaLc with y -> ies/ier/iest, or lemmaLc + "er"/"est"; false for identical strings and every other form. */
+  private static boolean isNounOrAdjInflectionOf(String surfaceLc, String lemmaLc) {
+    if (surfaceLc.equals(lemmaLc))
+      return false;
+
+    // DO NOT: generic +s/+es — this hides verb 3sg (e.g., oxidises)
+    // Keep only cases that are unlikely to be verb forms:
+
+    // y-ending lemma: y -> ies / ier / iest (e.g., lemma "doggy" -> doggies/doggier/doggiest);
+    // a lemma that does not end in "y" (e.g. "doggie") never takes this branch
+    if (lemmaLc.endsWith("y")) {
+      String stem = lemmaLc.substring(0, lemmaLc.length() - 1);
+      if (surfaceLc.equals(stem + "ies"))
+        return true; // plural of -y nouns; NOTE(review): also the shape of verb 3sg (carry -> carries), confirm this is intended
+      if (surfaceLc.equals(stem + "ier"))
+        return true; // comparative
+      if (surfaceLc.equals(stem + "iest"))
+        return true; // superlative
+    }
+
+    // Plain-suffix comparative/superlative: lemma + "er"/"est" (tall -> taller/tallest). NOTE(review): e-final lemmas (safe -> safer) are NOT matched.
+    if (surfaceLc.equals(lemmaLc + "er") || surfaceLc.equals(lemmaLc + "est"))
+      return true;
+
+    // No whitelist for -ed/-ing/-s to avoid hiding real inconsistencies.
+    return false;
+  }
 }