From 504c0cb3f1ef109dfb443dac39c298f9d7105a43 Mon Sep 17 00:00:00 2001 From: keyul2 Date: Tue, 14 Oct 2025 19:15:44 -0500 Subject: [PATCH 1/2] Fix: add lemma immunity in WordCoherencyRule to prevent false positives for inflected forms --- .../rules/en/WordCoherencyRule.java | 74 ++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java index 347125072b3c..b063ec9f9a4f 100644 --- a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java +++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java @@ -19,12 +19,21 @@ package org.languagetool.rules.en; import java.io.IOException; +import java.util.Objects; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.ResourceBundle; import java.util.Set; +import org.languagetool.AnalyzedSentence; +import org.languagetool.AnalyzedToken; +import org.languagetool.AnalyzedTokenReadings; import org.languagetool.rules.AbstractWordCoherencyRule; import org.languagetool.rules.Example; +import org.languagetool.rules.RuleMatch; import org.languagetool.rules.WordCoherencyDataLoader; /** @@ -34,10 +43,71 @@ public class WordCoherencyRule extends AbstractWordCoherencyRule { private static final Map> wordMap = new WordCoherencyDataLoader().loadWords("/en/coherency.txt"); + @Override + public RuleMatch[] match(List sentences) { + List ruleMatches = new ArrayList<>(); + Map shouldNotAppearWord = new HashMap<>(); // e.g. doggie -> doggy + int pos = 0; + + for (AnalyzedSentence sentence : sentences) { + AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); + for (AnalyzedTokenReadings atr : tokens) { + String surface = atr.getToken(); + String surfaceLc = surface.toLowerCase(Locale.ROOT); + + // Check whether this token is one of the variant candidates + Set variants = getWordMap().get(surfaceLc); + if (variants == null || variants.isEmpty()) { + continue; + } + + // ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ====== + Set lemmasLc = atr.getReadings().stream() + .map(AnalyzedToken::getLemma) + .filter(Objects::nonNull) + .map(s -> s.toLowerCase(Locale.ROOT)) + .collect(java.util.stream.Collectors.toSet()); + if (!java.util.Collections.disjoint(lemmasLc, variants)) { + // The lemma itself is one of the coherent variants → inflected form → skip reporting + continue; + } + // ======================================================================== + + int fromPos = pos + atr.getStartPos(); + int toPos = pos + atr.getEndPos(); + + // If an alternative spelling has already been encountered, this word is the opposite variant → create a match + if (shouldNotAppearWord.containsKey(surfaceLc)) { + String other = shouldNotAppearWord.get(surfaceLc); + String msg = getMessage(surface, other); + RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg); + + String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos()); + String replacement = createReplacement(marked, surfaceLc, other, atr); + if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) { + replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement); + } + if (!marked.equalsIgnoreCase(replacement)) { + rm.setSuggestedReplacement(replacement); + ruleMatches.add(rm); + } + rm.setShortMessage(getShortMessage()); + } else { + // Record the variant spelling so that later occurrences of the opposite form can be detected + for (String v : variants) { + shouldNotAppearWord.put(v, surfaceLc); + } + } + } + pos += sentence.getCorrectedTextLength(); + } + return toRuleMatchArray(ruleMatches); + } + public WordCoherencyRule(ResourceBundle messages) throws IOException { super(messages); addExamplePair(Example.wrong("He likes archaeology. Really? She likes archeology, too."), - Example.fixed("He likes archaeology. Really? She likes archaeology, too.")); + Example.fixed("He likes archaeology. Really? She likes archaeology, too.")); } @Override @@ -49,7 +119,7 @@ protected Map> getWordMap() { protected String getMessage(String word1, String word2) { return "Do not mix variants of the same word ('" + word1 + "' and '" + word2 + "') within a single text."; } - + @Override public String getId() { return "EN_WORD_COHERENCY"; From 77ea1d5b6a9e58dd7c002df5fcfe4b1bff9e5beb Mon Sep 17 00:00:00 2001 From: keyul2 Date: Tue, 14 Oct 2025 22:08:02 -0500 Subject: [PATCH 2/2] Refine WordCoherencyRule: safer fallback, cleanup, improved inflection handling --- .../rules/en/WordCoherencyRule.java | 208 +++++++++++++----- 1 file changed, 149 insertions(+), 59 deletions(-) diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java index b063ec9f9a4f..8824b11872ab 100644 --- a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java +++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java @@ -1,26 +1,9 @@ -/* LanguageTool, a natural language style checker - * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - * USA - */ package org.languagetool.rules.en; import java.io.IOException; import java.util.Objects; import java.util.ArrayList; +import java.util.LinkedHashSet; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -36,80 +19,111 @@ import org.languagetool.rules.RuleMatch; import org.languagetool.rules.WordCoherencyDataLoader; -/** - * English version of {@link AbstractWordCoherencyRule} - */ +/** English version of {@link AbstractWordCoherencyRule}. */ public class WordCoherencyRule extends AbstractWordCoherencyRule { + private static final boolean DEBUG = Boolean.getBoolean("lt.debug.coherency"); + + private static void dbg(String fmt, Object... args) { + if (DEBUG) + System.err.println(String.format(fmt, args)); + } private static final Map> wordMap = new WordCoherencyDataLoader().loadWords("/en/coherency.txt"); + public WordCoherencyRule(ResourceBundle messages) throws IOException { + super(messages); + addExamplePair( + Example.wrong("He likes archaeology. Really? She likes archeology, too."), + Example.fixed("He likes archaeology. Really? She likes archaeology, too.")); + if (DEBUG) + dbg("Loaded wordMap with %d entries", wordMap.size()); + } + @Override public RuleMatch[] match(List sentences) { List ruleMatches = new ArrayList<>(); - Map shouldNotAppearWord = new HashMap<>(); // e.g. doggie -> doggy + Map shouldNotAppearWord = new HashMap<>(); int pos = 0; for (AnalyzedSentence sentence : sentences) { + dbg("=== NEW SENTENCE === %s", sentence.getText()); AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); + for (AnalyzedTokenReadings atr : tokens) { String surface = atr.getToken(); String surfaceLc = surface.toLowerCase(Locale.ROOT); + dbg("Token='%s'", surface); - // Check whether this token is one of the variant candidates - Set variants = getWordMap().get(surfaceLc); - if (variants == null || variants.isEmpty()) { - continue; - } - - // ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ====== + // collect lemmas Set lemmasLc = atr.getReadings().stream() .map(AnalyzedToken::getLemma) .filter(Objects::nonNull) .map(s -> s.toLowerCase(Locale.ROOT)) .collect(java.util.stream.Collectors.toSet()); - if (!java.util.Collections.disjoint(lemmasLc, variants)) { - // The lemma itself is one of the coherent variants → inflected form → skip reporting - continue; - } - // ======================================================================== + dbg(" Lemmas=%s", lemmasLc); - int fromPos = pos + atr.getStartPos(); - int toPos = pos + atr.getEndPos(); + List keysToCheck = candidateKeys(surfaceLc, lemmasLc); + dbg(" keysToCheck=%s", keysToCheck); - // If an alternative spelling has already been encountered, this word is the opposite variant → create a match - if (shouldNotAppearWord.containsKey(surfaceLc)) { - String other = shouldNotAppearWord.get(surfaceLc); - String msg = getMessage(surface, other); - RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg); + for (String key : keysToCheck) { + Set variants = wordMap.get(key); + if (variants == null || variants.isEmpty()) + continue; + dbg(" key='%s' → variants=%s", key, variants); - String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos()); - String replacement = createReplacement(marked, surfaceLc, other, atr); - if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) { - replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement); + if (shouldSkipInflection(surfaceLc, key, lemmasLc)) { + dbg(" skipped inflection for key='%s'", key); + continue; } - if (!marked.equalsIgnoreCase(replacement)) { - rm.setSuggestedReplacement(replacement); - ruleMatches.add(rm); + + int fromPos = pos + atr.getStartPos(); + int toPos = pos + atr.getEndPos(); + + // try match + String matchKey = null; + if (shouldNotAppearWord.containsKey(key)) { + matchKey = key; + } else { + for (String v : variants) { + if (shouldNotAppearWord.containsKey(v)) { + matchKey = v; + break; + } + } } - rm.setShortMessage(getShortMessage()); - } else { - // Record the variant spelling so that later occurrences of the opposite form can be detected - for (String v : variants) { - shouldNotAppearWord.put(v, surfaceLc); + + if (matchKey != null) { + String other = shouldNotAppearWord.get(matchKey); + dbg(" MATCH: '%s' vs '%s' (span %d-%d)", surface, other, fromPos, toPos); + String msg = getMessage(surface, other); + RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg); + + String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos()); + String replacement = createReplacement(marked, key, other, atr); + if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) { + replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement); + } + if (!marked.equalsIgnoreCase(replacement)) { + rm.setSuggestedReplacement(replacement); + ruleMatches.add(rm); + } + rm.setShortMessage(getShortMessage()); + break; + } else { + // register new opposites + for (String v : variants) { + shouldNotAppearWord.put(v, key); + } + dbg(" register opposites: %s → %s", variants, key); } } } pos += sentence.getCorrectedTextLength(); } + dbg("Total matches: %d", ruleMatches.size()); return toRuleMatchArray(ruleMatches); } - public WordCoherencyRule(ResourceBundle messages) throws IOException { - super(messages); - addExamplePair(Example.wrong("He likes archaeology. Really? She likes archeology, too."), - Example.fixed("He likes archaeology. Really? She likes archaeology, too.")); - } - @Override protected Map> getWordMap() { return wordMap; @@ -130,4 +144,80 @@ public String getDescription() { return "Coherent spelling of words with two admitted variants."; } + private static List candidateKeys(String surfaceLc, Set lemmasLc) { + LinkedHashSet keys = new LinkedHashSet<>(); + if (!lemmasLc.isEmpty()) + keys.addAll(lemmasLc); + else + keys.add(surfaceLc); + + boolean anyHit = keys.stream().anyMatch(k -> { + Set vs = wordMap.get(k); + return vs != null && !vs.isEmpty(); + }); + if (!anyHit) { + for (String cand : computePastTenseFallbacks(surfaceLc)) { + Set vs = wordMap.get(cand); + if (vs != null && !vs.isEmpty()) + keys.add(cand); + } + } + return new ArrayList<>(keys); + } + + /** Very small, safe fallback: trim trailing "ed"; if that leaves a trailing hyphen, trim it too. */ + private static List computePastTenseFallbacks(String surfaceLc) { + List extra = new ArrayList<>(); + if (surfaceLc == null) return extra; + + int len = surfaceLc.length(); + // plain "...ed" + if (len >= 3 && surfaceLc.endsWith("ed")) { + String base = surfaceLc.substring(0, len - 2); // safe: len-2 >= 1 + if (!base.isEmpty()) { + extra.add(base); // e.g., "reelected" -> "reelect" + // hyphenated end: "...-ed" -> "...-" + if (base.charAt(base.length() - 1) == '-') { + String baseNoHyphen = base.substring(0, base.length() - 1); // safe + if (!baseNoHyphen.isEmpty()) { + extra.add(baseNoHyphen); // e.g., "re-elected" -> "re-elect" + } + } + } + } + return extra; + } + + + private static boolean shouldSkipInflection(String surfaceLc, String key, Set lemmasLc) { + return !lemmasLc.isEmpty() && lemmasLc.contains(key) && isNounOrAdjInflectionOf(surfaceLc, key); + } + + /** Whitelist only noun/adjective inflections that won't mask verb forms. */ + private static boolean isNounOrAdjInflectionOf(String surfaceLc, String lemmaLc) { + if (surfaceLc.equals(lemmaLc)) + return false; + + // DO NOT: generic +s/+es — this hides verb 3sg (e.g., oxidises) + // Keep only cases that are unlikely to be verb forms: + + // y-ending: y -> ies / ier / iest (e.g., doggy/doggie -> + // doggies/doggier/doggiest) + if (lemmaLc.endsWith("y")) { + String stem = lemmaLc.substring(0, lemmaLc.length() - 1); + if (surfaceLc.equals(stem + "ies")) + return true; // plural of -y nouns/adjs + if (surfaceLc.equals(stem + "ier")) + return true; // comparative + if (surfaceLc.equals(stem + "iest")) + return true; // superlative + } + + // Generic comparative/superlative (non-y cases): safer/safest etc. + if (surfaceLc.equals(lemmaLc + "er") || surfaceLc.equals(lemmaLc + "est")) + return true; + + // No whitelist for -ed/-ing/-s to avoid hiding real inconsistencies. + return false; + } }