Skip to content

Commit 91cefbf

Browse files
📝 Add docstrings to fix-wordcoherencyrule
Docstring generation was requested by @minizhiren.
* #11568 (comment)

The following files were modified:
* `languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java`
1 parent 53ab426 commit 91cefbf

File tree

1 file changed

+108
-3
lines changed

1 file changed

+108
-3
lines changed

languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/WordCoherencyRule.java

Lines changed: 108 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,21 @@
1919
package org.languagetool.rules.en;
2020

2121
import java.io.IOException;
22+
import java.util.Objects;
23+
import java.util.ArrayList;
24+
import java.util.HashMap;
25+
import java.util.List;
26+
import java.util.Locale;
2227
import java.util.Map;
2328
import java.util.ResourceBundle;
2429
import java.util.Set;
2530

31+
import org.languagetool.AnalyzedSentence;
32+
import org.languagetool.AnalyzedToken;
33+
import org.languagetool.AnalyzedTokenReadings;
2634
import org.languagetool.rules.AbstractWordCoherencyRule;
2735
import org.languagetool.rules.Example;
36+
import org.languagetool.rules.RuleMatch;
2837
import org.languagetool.rules.WordCoherencyDataLoader;
2938

3039
/**
@@ -34,22 +43,118 @@ public class WordCoherencyRule extends AbstractWordCoherencyRule {
3443

3544
// Variant table shared by every instance of this rule. It is built once, when the class is
// initialized, by WordCoherencyDataLoader.loadWords("/en/coherency.txt") and is handed out
// unchanged by getWordMap().
// NOTE(review): "/en/coherency.txt" is presumably a classpath resource, and the exact key/value
// normalization is decided by the loader - confirm there before relying on it.
private static final Map<String, Set<String>> wordMap = new WordCoherencyDataLoader().loadWords("/en/coherency.txt");
3645

46+
/**
47+
* Detects occurrences of mixed spelling variants of the same word within a text and produces matches.
48+
*
49+
* Scans each sentence token-by-token for known variant forms (loaded from the rule's word map).
50+
* If two different variants of the same lemma appear in the same text, produces a RuleMatch for the later occurrence
51+
* with a short message and, when applicable, a suggested replacement that preserves initial capitalization.
52+
*
53+
* @param sentences the analyzed sentences of the document to check
54+
* @return an array of RuleMatch objects for each detected mixed-variant occurrence; each match may include a single suggested replacement
55+
*/
56+
@Override
57+
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
58+
List<RuleMatch> ruleMatches = new ArrayList<>();
59+
Map<String, String> shouldNotAppearWord = new HashMap<>(); // e.g. doggie -> doggy
60+
int pos = 0;
61+
62+
for (AnalyzedSentence sentence : sentences) {
63+
AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
64+
for (AnalyzedTokenReadings atr : tokens) {
65+
String surface = atr.getToken();
66+
String surfaceLc = surface.toLowerCase(Locale.ROOT);
67+
68+
// Check whether this token is one of the variant candidates
69+
Set<String> variants = getWordMap().get(surfaceLc);
70+
if (variants == null || variants.isEmpty()) {
71+
continue;
72+
}
73+
74+
// ====== Key: lemma immunity to prevent false positives (e.g. doggies/doggier/doggiest) ======
75+
Set<String> lemmasLc = atr.getReadings().stream()
76+
.map(AnalyzedToken::getLemma)
77+
.filter(Objects::nonNull)
78+
.map(s -> s.toLowerCase(Locale.ROOT))
79+
.collect(java.util.stream.Collectors.toSet());
80+
if (!java.util.Collections.disjoint(lemmasLc, variants)) {
81+
// The lemma itself is one of the coherent variants → inflected form → skip reporting
82+
continue;
83+
}
84+
// ========================================================================
85+
86+
int fromPos = pos + atr.getStartPos();
87+
int toPos = pos + atr.getEndPos();
88+
89+
// If an alternative spelling has already been encountered, this word is the opposite variant → create a match
90+
if (shouldNotAppearWord.containsKey(surfaceLc)) {
91+
String other = shouldNotAppearWord.get(surfaceLc);
92+
String msg = getMessage(surface, other);
93+
RuleMatch rm = new RuleMatch(this, sentence, fromPos, toPos, msg);
94+
95+
String marked = sentence.getText().substring(atr.getStartPos(), atr.getEndPos());
96+
String replacement = createReplacement(marked, surfaceLc, other, atr);
97+
if (org.languagetool.tools.StringTools.startsWithUppercase(surface)) {
98+
replacement = org.languagetool.tools.StringTools.uppercaseFirstChar(replacement);
99+
}
100+
if (!marked.equalsIgnoreCase(replacement)) {
101+
rm.setSuggestedReplacement(replacement);
102+
ruleMatches.add(rm);
103+
}
104+
rm.setShortMessage(getShortMessage());
105+
} else {
106+
// Record the variant spelling so that later occurrences of the opposite form can be detected
107+
for (String v : variants) {
108+
shouldNotAppearWord.put(v, surfaceLc);
109+
}
110+
}
111+
}
112+
pos += sentence.getCorrectedTextLength();
113+
}
114+
return toRuleMatchArray(ruleMatches);
115+
}
116+
117+
/**
 * Creates the rule and registers one wrong/fixed example pair that illustrates mixing the
 * variants "archeology" and "archaeology" in one text.
 *
 * @param messages the resource bundle handed to the superclass constructor
 * @throws IOException declared on this constructor; nothing in its own body raises it, so it
 *         presumably comes from the superclass constructor (confirm against
 *         AbstractWordCoherencyRule). The coherency word list itself is loaded by this class's
 *         static field initializer, not here.
 */
public WordCoherencyRule(ResourceBundle messages) throws IOException {
  super(messages);
  addExamplePair(
      Example.wrong("He likes archaeology. Really? She likes <marker>archeology</marker>, too."),
      Example.fixed("He likes archaeology. Really? She likes <marker>archaeology</marker>, too."));
}
42130

131+
/**
132+
* Provides the mapping of normalized (lowercase) word variants to their admitted variants used for coherency checks.
133+
*
134+
* @return the map whose keys are lowercase variant forms and whose values are sets of admitted variant strings
135+
*/
43136
@Override
44137
protected Map<String, Set<String>> getWordMap() {
45138
return wordMap;
46139
}
47140

141+
/**
142+
* Produce the user-facing message advising against mixing two spelling variants in the same text.
143+
*
144+
* @param word1 the first spelling variant to mention in the message
145+
* @param word2 the second spelling variant to mention in the message
146+
* @return a message telling the user not to mix the two variants, containing `word1` and `word2`
147+
*/
48148
@Override
49149
protected String getMessage(String word1, String word2) {
50150
return "Do not mix variants of the same word ('" + word1 + "' and '" + word2 + "') within a single text.";
51151
}
52-
152+
153+
/**
154+
* Provides the unique identifier for this word coherency rule.
155+
*
156+
* @return the rule identifier "EN_WORD_COHERENCY"
157+
*/
53158
@Override
54159
public String getId() {
55160
return "EN_WORD_COHERENCY";
@@ -60,4 +165,4 @@ public String getDescription() {
60165
return "Coherent spelling of words with two admitted variants.";
61166
}
62167

63-
}
168+
}

0 commit comments

Comments
 (0)