1919package org .languagetool .rules .en ;
2020
2121import java .io .IOException ;
22+ import java .util .Objects ;
23+ import java .util .ArrayList ;
24+ import java .util .HashMap ;
25+ import java .util .List ;
26+ import java .util .Locale ;
2227import java .util .Map ;
2328import java .util .ResourceBundle ;
2429import java .util .Set ;
2530
31+ import org .languagetool .AnalyzedSentence ;
32+ import org .languagetool .AnalyzedToken ;
33+ import org .languagetool .AnalyzedTokenReadings ;
2634import org .languagetool .rules .AbstractWordCoherencyRule ;
2735import org .languagetool .rules .Example ;
36+ import org .languagetool .rules .RuleMatch ;
2837import org .languagetool .rules .WordCoherencyDataLoader ;
2938
3039/**
@@ -34,22 +43,118 @@ public class WordCoherencyRule extends AbstractWordCoherencyRule {
3443
3544 private static final Map <String , Set <String >> wordMap = new WordCoherencyDataLoader ().loadWords ("/en/coherency.txt" );
3645
46+ /**
47+ * Detects occurrences of mixed spelling variants of the same word within a text and produces matches.
48+ *
49+ * Scans each sentence token-by-token for known variant forms (loaded from the rule's word map).
50+ * If two different variants of the same lemma appear in the same text, produces a RuleMatch for the later occurrence
51+ * with a short message and, when applicable, a suggested replacement that preserves initial capitalization.
52+ *
53+ * @param sentences the analyzed sentences of the document to check
54+ * @return an array of RuleMatch objects for each detected mixed-variant occurrence; each match may include a single suggested replacement
55+ */
56+ @ Override
57+ public RuleMatch [] match (List <AnalyzedSentence > sentences ) {
58+ List <RuleMatch > ruleMatches = new ArrayList <>();
59+ Map <String , String > shouldNotAppearWord = new HashMap <>(); // e.g. doggie -> doggy
60+ int pos = 0 ;
61+
62+ for (AnalyzedSentence sentence : sentences ) {
63+ AnalyzedTokenReadings [] tokens = sentence .getTokensWithoutWhitespace ();
64+ for (AnalyzedTokenReadings atr : tokens ) {
65+ String surface = atr .getToken ();
66+ String surfaceLc = surface .toLowerCase (Locale .ROOT );
67+
68+ // Check whether this token is one of the variant candidates
69+ Set <String > variants = getWordMap ().get (surfaceLc );
70+ if (variants == null || variants .isEmpty ()) {
71+ continue ;
72+ }
73+
74+ // ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ======
75+ Set <String > lemmasLc = atr .getReadings ().stream ()
76+ .map (AnalyzedToken ::getLemma )
77+ .filter (Objects ::nonNull )
78+ .map (s -> s .toLowerCase (Locale .ROOT ))
79+ .collect (java .util .stream .Collectors .toSet ());
80+ if (!java .util .Collections .disjoint (lemmasLc , variants )) {
81+ // The lemma itself is one of the coherent variants → inflected form → skip reporting
82+ continue ;
83+ }
84+ // ========================================================================
85+
86+ int fromPos = pos + atr .getStartPos ();
87+ int toPos = pos + atr .getEndPos ();
88+
89+ // If an alternative spelling has already been encountered, this word is the opposite variant → create a match
90+ if (shouldNotAppearWord .containsKey (surfaceLc )) {
91+ String other = shouldNotAppearWord .get (surfaceLc );
92+ String msg = getMessage (surface , other );
93+ RuleMatch rm = new RuleMatch (this , sentence , fromPos , toPos , msg );
94+
95+ String marked = sentence .getText ().substring (atr .getStartPos (), atr .getEndPos ());
96+ String replacement = createReplacement (marked , surfaceLc , other , atr );
97+ if (org .languagetool .tools .StringTools .startsWithUppercase (surface )) {
98+ replacement = org .languagetool .tools .StringTools .uppercaseFirstChar (replacement );
99+ }
100+ if (!marked .equalsIgnoreCase (replacement )) {
101+ rm .setSuggestedReplacement (replacement );
102+ ruleMatches .add (rm );
103+ }
104+ rm .setShortMessage (getShortMessage ());
105+ } else {
106+ // Record the variant spelling so that later occurrences of the opposite form can be detected
107+ for (String v : variants ) {
108+ shouldNotAppearWord .put (v , surfaceLc );
109+ }
110+ }
111+ }
112+ pos += sentence .getCorrectedTextLength ();
113+ }
114+ return toRuleMatchArray (ruleMatches );
115+ }
116+
/**
 * Creates the English word-coherency rule and registers its example pair.
 *
 * <p>The example marks "archeology" as wrong once "archaeology" has been used earlier in the
 * same text, and shows "archaeology" as the corrected form.
 *
 * @param messages resource bundle passed on to the superclass constructor
 * @throws IOException declared by this signature. NOTE(review): presumably propagated from the
 *         superclass constructor -- the variant table itself is loaded in the static initializer
 *         of {@code wordMap}, not in this constructor; confirm against the superclass.
 */
public WordCoherencyRule(ResourceBundle messages) throws IOException {
  super(messages);
  addExamplePair(
      Example.wrong("He likes archaeology. Really? She likes <marker>archeology</marker>, too."),
      Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
}
42130
131+ /**
132+ * Provides the mapping of normalized (lowercase) word variants to their admitted variants used for coherency checks.
133+ *
134+ * @return the map whose keys are lowercase variant forms and whose values are sets of admitted variant strings
135+ */
43136 @ Override
44137 protected Map <String , Set <String >> getWordMap () {
45138 return wordMap ;
46139 }
47140
141+ /**
142+ * Produce the user-facing message advising against mixing two spelling variants in the same text.
143+ *
144+ * @param word1 the first spelling variant to mention in the message
145+ * @param word2 the second spelling variant to mention in the message
146+ * @return a message telling the user not to mix the two variants, containing `word1` and `word2`
147+ */
48148 @ Override
49149 protected String getMessage (String word1 , String word2 ) {
50150 return "Do not mix variants of the same word ('" + word1 + "' and '" + word2 + "') within a single text." ;
51151 }
52-
152+
153+ /**
154+ * Provides the unique identifier for this word coherency rule.
155+ *
156+ * @return the rule identifier "EN_WORD_COHERENCY"
157+ */
53158 @ Override
54159 public String getId () {
55160 return "EN_WORD_COHERENCY" ;
@@ -60,4 +165,4 @@ public String getDescription() {
60165 return "Coherent spelling of words with two admitted variants." ;
61166 }
62167
63- }
168+ }
0 commit comments