Skip to content

Commit 1d0e61a

Browse files
authored
Merge pull request #11655 from languagetool-org/custom-rules-jlanguagetool
extend JLanguageTool to allow customising used rules easier
2 parents 0e99722 + 96da4cf commit 1d0e61a

File tree

4 files changed

+106
-52
lines changed

4 files changed

+106
-52
lines changed

languagetool-core/src/main/java/org/languagetool/JLanguageTool.java

Lines changed: 66 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ public JLanguageTool(Language language, List<Language> altLanguages, Language mo
298298
GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging) {
299299
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, true, false);
300300
}
301-
301+
302+
302303
/**
303304
* Create a JLanguageTool and setup the built-in rules for the
304305
* given language and false friend rules for the text language / mother tongue pair.
@@ -317,25 +318,51 @@ public JLanguageTool(Language language, List<Language> altLanguages, Language mo
317318
* @since 6.6
318319
*/
319320
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel) {
321+
// delegate with customRules == null, i.e. the built-in rules for the language are loaded and activated
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, inputLogging, withLanguageModel, null);
322+
}
323+
324+
/**
325+
* Create a JLanguageTool and setup the built-in rules for the
326+
* given language and false friend rules for the text language / mother tongue pair.
327+
*
328+
* @param language the language of the text to be checked
329+
* @param altLanguages The languages that are accepted as alternative languages - currently this means
330+
* words are accepted if they are in an alternative language and not similar to
331+
* a word from {@code language}. If there's a similar word in {@code language},
332+
* there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
333+
* @param motherTongue the user's mother tongue, used for false friend rules, or <code>null</code>.
334+
* The mother tongue may also be used as a source language for checking bilingual texts.
335+
* @param cache a cache to speed up checking if the same sentences get checked more than once,
336+
* e.g. when LT is running as a server and texts are re-checked due to changes
337+
* @param inputLogging allow inclusion of input in logs on exceptions
338+
* @param withLanguageModel if true, updateOptionalLanguageModelRules(null) is not called during construction; ignored when {@code customRules} is not null
339+
* @param customRules rules to use for the JLanguageTool instance instead of the built-in ones (in that case no default pattern rules or false friend rules are activated either), or null to use the built-in rules
340+
* @since 6.8
341+
*/
342+
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel, List<Rule> customRules) {
320343
this.language = Objects.requireNonNull(language, "language cannot be null");
321344
this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
322345
this.motherTongue = motherTongue;
323346
this.userConfig = Objects.requireNonNullElseGet(userConfig, UserConfig::new);
324347
this.globalConfig = globalConfig;
325-
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
326-
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
327348
this.cleanOverlappingMatches = true;
328-
try {
329-
activateDefaultPatternRules();
330-
if (!language.hasNGramFalseFriendRule(motherTongue)) {
331-
// use the old false friends, which always match, not depending on context
332-
activateDefaultFalseFriendRules();
333-
}
334-
if (!withLanguageModel) {
335-
updateOptionalLanguageModelRules(null); // start out with rules without language model
349+
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
350+
if (customRules != null) {
351+
builtinRules = new ArrayList<>(customRules);
352+
} else {
353+
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
354+
try {
355+
activateDefaultPatternRules();
356+
if (!language.hasNGramFalseFriendRule(motherTongue)) {
357+
// use the old false friends, which always match, not depending on context
358+
activateDefaultFalseFriendRules();
359+
}
360+
if (!withLanguageModel) {
361+
updateOptionalLanguageModelRules(null); // start out with rules without language model
362+
}
363+
} catch (Exception e) {
364+
throw new RuntimeException("Could not activate rules", e);
336365
}
337-
} catch (Exception e) {
338-
throw new RuntimeException("Could not activate rules", e);
339366
}
340367
this.cache = cache;
341368
descProvider = new ShortDescriptionProvider();
@@ -775,6 +802,20 @@ public void disableRules(List<String> ruleIds) {
775802
ruleSetCache.clear();
776803
}
777804

805+
/**
806+
* Replaces all rules of this instance with the given rules.
807+
* Clears the existing built-in rules and user rules, adds the given rules as user rules, and clears the cached rule set.
808+
*
809+
* @param rules the rules to be set as the new user rules (presumably must not be null - TODO confirm)
810+
* @since 6.8
811+
*/
812+
public void setRules(List<Rule> rules) {
813+
builtinRules.clear();
814+
userRules.clear();
815+
userRules.addAll(rules);
816+
ruleSetCache.clear();
817+
}
818+
778819
/**
779820
* Disable the given rule category so the check methods like {@link #check(String)} won't use it.
780821
*
@@ -1038,9 +1079,15 @@ protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandl
10381079
}
10391080

10401081
/**
 * Checks the given text with the rules that are active for the given level and tone tags.
 * Resolves the rule set via {@code getActiveRulesForLevelAndToneTags(level, toneTags)} and
 * passes it, together with all other arguments unchanged, to {@code checkInternalWithCustomRules}.
 */
protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
1082+
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
1083+
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
1084+
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
1085+
return checkInternalWithCustomRules(rules, annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
1086+
}
1087+
1088+
public CheckResults checkInternalWithCustomRules(RuleSet rules, AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
10411089
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
10421090
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
1043-
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
10441091
if (printStream != null) {
10451092
printIfVerbose(rules.allRules().size() + " rules activated for language " + language);
10461093
}
@@ -1304,7 +1351,7 @@ private RemoteRuleResult fetchResults(long deadlineStartNanos, Mode mode, Level
13041351
if (matches == null) {
13051352
continue;
13061353
}
1307-
if (cache != null && result.isSuccess()) {
1354+
if (cache != null && result.isSuccess() && result.adjustOffsets()) {
13081355
// store in cache
13091356
InputSentence cacheKey = new InputSentence(
13101357
sentence, language, motherTongue, disabledRules, disabledRuleCategories,
@@ -1318,8 +1365,10 @@ private RemoteRuleResult fetchResults(long deadlineStartNanos, Mode mode, Level
13181365
// clone matches before adjusting offsets
13191366
// match objects could be relevant to multiple (duplicate) sentences at different offsets
13201367
List<RuleMatch> adjustedMatches = matches.stream().map(RuleMatch::new).collect(Collectors.toList());
1321-
for (RuleMatch match : adjustedMatches) {
1322-
adjustOffset(annotatedText, offset, match);
1368+
if (result.adjustOffsets()) {
1369+
for (RuleMatch match : adjustedMatches) {
1370+
adjustOffset(annotatedText, offset, match);
1371+
}
13231372
}
13241373
remoteMatches.addAll(adjustedMatches);
13251374
}

languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ public FutureTask<RemoteRuleResult> run(List<AnalyzedSentence> sentences, @Nulla
210210
filteredMatches.addAll(filteredSentenceMatches);
211211
}
212212
}
213-
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
213+
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
214214
}
215215

216216
List<RuleMatch> filteredMatches = new ArrayList<>();
@@ -221,7 +221,7 @@ public FutureTask<RemoteRuleResult> run(List<AnalyzedSentence> sentences, @Nulla
221221
filteredMatches.addAll(filteredSentenceMatches);
222222
}
223223
}
224-
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
224+
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
225225
return result;
226226
});
227227
}

languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleResult.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,19 @@
2929
public class RemoteRuleResult {
3030
private final boolean remote; // was remote needed/involved? rules may filter input sentences and only call remote on some; for metrics
3131
private final boolean success; // successful -> for caching, so that we can cache: remote not needed for this sentence
32+
private final boolean adjustOffsets; // whether rule matches are relative to each sentence and need to be adjusted further
33+
// or already use the positions from the analyzed sentence and don't need to be adjusted
3234
private final List<RuleMatch> matches;
3335
private final Set<AnalyzedSentence> processedSentences;
3436
// which sentences were processed? to distinguish between no matches because not processed (e.g. cached)
3537
// and no errors/corrections found
3638

3739
private final Map<AnalyzedSentence, List<RuleMatch>> sentenceMatches = new HashMap<>();
3840

39-
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
41+
public RemoteRuleResult(boolean remote, boolean success, boolean adjustOffsets, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
4042
this.remote = remote;
4143
this.success = success;
44+
this.adjustOffsets = adjustOffsets;
4245
this.matches = matches;
4346
this.processedSentences = Collections.unmodifiableSet(new HashSet<>(processedSentences));
4447

@@ -54,6 +57,10 @@ public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches
5457
}
5558
}
5659

60+
/**
 * Creates a result with {@code adjustOffsets} set to {@code true}; delegates to the
 * constructor that takes the {@code adjustOffsets} flag explicitly.
 */
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
61+
this(remote, success, true, matches, processedSentences);
62+
}
63+
5764
/**
 * @return the {@code remote} flag, i.e. whether a remote call was needed/involved for this result
 */
public boolean isRemote() {
5865
return remote;
5966
}
@@ -62,6 +69,10 @@ public boolean isSuccess() {
6269
return success;
6370
}
6471

72+
/**
 * @return the {@code adjustOffsets} flag: true if the rule matches are relative to each sentence and
 *         still need to be adjusted, false if they already use the positions from the analyzed sentence
 */
public boolean adjustOffsets() {
73+
return adjustOffsets;
74+
}
75+
6576
public List<RuleMatch> getMatches() {
6677
return matches;
6778
}

languagetool-server/src/main/java/org/languagetool/server/TextChecker.java

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@
4444
import java.io.FileInputStream;
4545
import java.io.IOException;
4646
import java.net.HttpURLConnection;
47+
import java.nio.ByteBuffer;
48+
import java.nio.charset.StandardCharsets;
4749
import java.nio.file.Files;
4850
import java.nio.file.Path;
4951
import java.nio.file.Paths;
52+
import java.security.MessageDigest;
53+
import java.security.NoSuchAlgorithmException;
5054
import java.util.*;
5155
import java.util.concurrent.*;
5256
import java.util.regex.Pattern;
@@ -262,6 +266,27 @@ protected static Language parseLanguage(String code) throws BadRequestException
262266
}
263267
}
264268

269+
/**
270+
* Hash a string deterministically into a 64-bit signed long; use textSessionIdParam if set, fall back to client IP.
271+
*/
272+
protected static Long computeTextSessionID(String textSessionIdParam, String ip) {
273+
String input = textSessionIdParam != null ? textSessionIdParam : ip;
274+
if (input == null) {
275+
return null;
276+
}
277+
try {
278+
MessageDigest md = MessageDigest.getInstance("SHA-256");
279+
byte[] bytes = md.digest(input.getBytes(StandardCharsets.UTF_8));
280+
281+
ByteBuffer buffer = ByteBuffer.wrap(bytes);
282+
Long textSessionId = buffer.getLong();
283+
return textSessionId;
284+
} catch (NoSuchAlgorithmException e) {
285+
// Should not happen for SHA-256, wrap in a runtime exception
286+
throw new RuntimeException("SHA-256 not supported", e);
287+
}
288+
}
289+
265290
private void prewarmPipelinePool() {
266291
// setting + number of pipelines
267292
// typical addon settings at the moment (2018-11-05)
@@ -433,38 +458,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
433458

434459
boolean filterDictionaryMatches = "true".equals(params.getOrDefault("filterDictionaryMatches", "true"));
435460

436-
Long textSessionId = null;
437-
try {
438-
if (params.containsKey("textSessionId")) {
439-
String textSessionIdStr = params.get("textSessionId");
440-
if (textSessionIdStr.startsWith("user:")) {
441-
int sepPos = textSessionIdStr.indexOf(':');
442-
String sessionId = textSessionIdStr.substring(sepPos + 1);
443-
textSessionId = Long.valueOf(sessionId);
444-
} else if (textSessionIdStr.contains(":")) { // transitioning to new format used in chrome addon
445-
// format: "{random number in 0..99999}:{unix time}"
446-
long random, timestamp;
447-
int sepPos = textSessionIdStr.indexOf(':');
448-
random = Long.parseLong(textSessionIdStr.substring(0, sepPos));
449-
timestamp = Long.parseLong(textSessionIdStr.substring(sepPos + 1));
450-
// use random number to choose a slice in possible range of values
451-
// then choose position in slice by timestamp
452-
long maxRandom = 100000;
453-
long randomSegmentSize = (Long.MAX_VALUE - maxRandom) / maxRandom;
454-
long segmentOffset = random * randomSegmentSize;
455-
if (timestamp > randomSegmentSize) {
456-
log.warn(String.format("Could not transform textSessionId '%s'", textSessionIdStr));
457-
}
458-
textSessionId = segmentOffset + timestamp;
459-
} else {
460-
textSessionId = Long.valueOf(textSessionIdStr);
461-
}
462-
}
463-
} catch (NumberFormatException ex) {
464-
log.info("Could not parse textSessionId '" + params.get("textSessionId") + "' as long: " + ex.getMessage() +
465-
", user agent: " + params.get("useragent") + ", version: " + params.get("v") +
466-
", HTTP user agent: " + getHttpUserAgent(httpExchange) + ", referrer: " + getHttpReferrer(httpExchange));
467-
}
461+
Long textSessionId = computeTextSessionID(params.get("textSessionId"), remoteAddress);
468462

469463
List<String> abTest = AB_TEST_SERVICE.getActiveAbTestForClient(params, config);
470464

0 commit comments

Comments
 (0)