Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 66 additions & 17 deletions languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,8 @@ public JLanguageTool(Language language, List<Language> altLanguages, Language mo
GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging) {
this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, true, false);
}



/**
* Create a JLanguageTool and set up the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
Expand All @@ -317,25 +318,51 @@ public JLanguageTool(Language language, List<Language> altLanguages, Language mo
* @since 6.6
*/
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache,
                     GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel) {
  // Delegates to the nine-argument constructor with no custom rule list (null).
  this(language, altLanguages, motherTongue, cache, globalConfig, userConfig, inputLogging, withLanguageModel, null);
}

/**
* Create a JLanguageTool and set up the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param altLanguages The languages that are accepted as alternative languages - currently this means
* words are accepted if they are in an alternative language and not similar to
* a word from {@code language}. If there's a similar word in {@code language},
* there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
* @param motherTongue the user's mother tongue, used for false friend rules, or <code>null</code>.
* The mother tongue may also be used as a source language for checking bilingual texts.
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes
* @param inputLogging allow inclusion of input in logs on exceptions
* @param withLanguageModel will not call updateOptionalLanguageModelRules(null) if this is true
* @param customRules rules to use for the JLanguageTool instance instead of initializing with the built-in ones, or null to use built-in rules
* @since 6.6
*/
public JLanguageTool(Language language, List<Language> altLanguages, Language motherTongue, ResultCache cache, GlobalConfig globalConfig, UserConfig userConfig, boolean inputLogging, boolean withLanguageModel, List<Rule> customRules) {
this.language = Objects.requireNonNull(language, "language cannot be null");
this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
this.motherTongue = motherTongue;
this.userConfig = Objects.requireNonNullElseGet(userConfig, UserConfig::new);
this.globalConfig = globalConfig;
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
this.cleanOverlappingMatches = true;
try {
activateDefaultPatternRules();
if (!language.hasNGramFalseFriendRule(motherTongue)) {
// use the old false friends, which always match, not depending on context
activateDefaultFalseFriendRules();
}
if (!withLanguageModel) {
updateOptionalLanguageModelRules(null); // start out with rules without language model
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
if (customRules != null) {
builtinRules = new ArrayList<>(customRules);
} else {
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
try {
activateDefaultPatternRules();
if (!language.hasNGramFalseFriendRule(motherTongue)) {
// use the old false friends, which always match, not depending on context
activateDefaultFalseFriendRules();
}
if (!withLanguageModel) {
updateOptionalLanguageModelRules(null); // start out with rules without language model
}
} catch (Exception e) {
throw new RuntimeException("Could not activate rules", e);
}
} catch (Exception e) {
throw new RuntimeException("Could not activate rules", e);
}
this.cache = cache;
descProvider = new ShortDescriptionProvider();
Expand Down Expand Up @@ -775,6 +802,20 @@ public void disableRules(List<String> ruleIds) {
ruleSetCache.clear();
}

/**
 * Replaces all rules of this instance with the given ones.
 * The built-in rules and the previously set user rules are both removed, the given rules are stored
 * as the new user rules, and the cached rule set is cleared.
 *
 * @param rules the rules to use from now on; must not be {@code null}
 * @throws NullPointerException if {@code rules} is {@code null}; no existing rule is removed in that case
 * @since 6.8
 */
public void setRules(List<Rule> rules) {
  // Validate and snapshot the argument before touching any state: a null argument, or a list that
  // is backed by one of the internal rule lists, must not leave this instance half-cleared.
  List<Rule> newRules = new ArrayList<>(Objects.requireNonNull(rules, "rules cannot be null"));
  builtinRules.clear();
  userRules.clear();
  userRules.addAll(newRules);
  ruleSetCache.clear();
}

/**
* Disable the given rule category so the check methods like {@link #check(String)} won't use it.
*
Expand Down Expand Up @@ -1038,9 +1079,15 @@ protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandl
}

protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
                                     Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
                                     @Nullable Long textSessionID, List<String> sentences,
                                     List<AnalyzedSentence> analyzedSentences) throws IOException {
  // Resolve the rules that are active for this level and these tone tags, then run the check with that set.
  return checkInternalWithCustomRules(
    getActiveRulesForLevelAndToneTags(level, toneTags),
    annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
}

public CheckResults checkInternalWithCustomRules(RuleSet rules, AnnotatedText annotatedText, ParagraphHandling paraMode, RuleMatchListener listener,
Mode mode, Level level, @NotNull Set<ToneTag> toneTags,
@Nullable Long textSessionID, List<String> sentences, List<AnalyzedSentence> analyzedSentences) throws IOException {
RuleSet rules = getActiveRulesForLevelAndToneTags(level, toneTags);
if (printStream != null) {
printIfVerbose(rules.allRules().size() + " rules activated for language " + language);
}
Expand Down Expand Up @@ -1304,7 +1351,7 @@ private RemoteRuleResult fetchResults(long deadlineStartNanos, Mode mode, Level
if (matches == null) {
continue;
}
if (cache != null && result.isSuccess()) {
if (cache != null && result.isSuccess() && result.adjustOffsets()) {
// store in cache
InputSentence cacheKey = new InputSentence(
sentence, language, motherTongue, disabledRules, disabledRuleCategories,
Expand All @@ -1318,8 +1365,10 @@ private RemoteRuleResult fetchResults(long deadlineStartNanos, Mode mode, Level
// clone matches before adjusting offsets
// match objects could be relevant to multiple (duplicate) sentences at different offsets
List<RuleMatch> adjustedMatches = matches.stream().map(RuleMatch::new).collect(Collectors.toList());
for (RuleMatch match : adjustedMatches) {
adjustOffset(annotatedText, offset, match);
if (result.adjustOffsets()) {
for (RuleMatch match : adjustedMatches) {
adjustOffset(annotatedText, offset, match);
}
}
remoteMatches.addAll(adjustedMatches);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public FutureTask<RemoteRuleResult> run(List<AnalyzedSentence> sentences, @Nulla
filteredMatches.addAll(filteredSentenceMatches);
}
}
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
}

List<RuleMatch> filteredMatches = new ArrayList<>();
Expand All @@ -221,7 +221,7 @@ public FutureTask<RemoteRuleResult> run(List<AnalyzedSentence> sentences, @Nulla
filteredMatches.addAll(filteredSentenceMatches);
}
}
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), filteredMatches, sentences);
result = new RemoteRuleResult(result.isRemote(), result.isSuccess(), result.adjustOffsets(), filteredMatches, sentences);
return result;
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,19 @@
public class RemoteRuleResult {
private final boolean remote; // was remote needed/involved? rules may filter input sentences and only call remote on some; for metrics
private final boolean success; // successful -> for caching, so that we can cache: remote not needed for this sentence
private final boolean adjustOffsets; // whether rule matches are relative to each sentence and need to be adjusted further
// or already use the positions from the analyzed sentence and don't need to be adjusted
private final List<RuleMatch> matches;
private final Set<AnalyzedSentence> processedSentences;
// which sentences were processed? to distinguish between no matches because not processed (e.g. cached)
// and no errors/corrections found

private final Map<AnalyzedSentence, List<RuleMatch>> sentenceMatches = new HashMap<>();

public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
public RemoteRuleResult(boolean remote, boolean success, boolean adjustOffsets, List<RuleMatch> matches, List<AnalyzedSentence> processedSentences) {
this.remote = remote;
this.success = success;
this.adjustOffsets = adjustOffsets;
this.matches = matches;
this.processedSentences = Collections.unmodifiableSet(new HashSet<>(processedSentences));

Expand All @@ -54,6 +57,10 @@ public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches
}
}

/**
 * Creates a result whose {@code adjustOffsets} flag is {@code true}.
 * All other arguments are passed unchanged to the five-argument constructor.
 */
public RemoteRuleResult(boolean remote, boolean success, List<RuleMatch> matches,
                        List<AnalyzedSentence> processedSentences) {
  this(remote, success, true, matches, processedSentences);
}

/**
 * @return the {@code remote} flag this result was created with
 */
public boolean isRemote() {
  return this.remote;
}
Expand All @@ -62,6 +69,10 @@ public boolean isSuccess() {
return success;
}

/**
 * @return the {@code adjustOffsets} flag this result was created with
 */
public boolean adjustOffsets() {
  return this.adjustOffsets;
}

/**
 * @return the match list held by this result (the stored reference itself, not a copy)
 */
public List<RuleMatch> getMatches() {
  return this.matches;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,13 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -262,6 +266,27 @@ protected static Language parseLanguage(String code) throws BadRequestException
}
}

/**
 * Derives a deterministic 64-bit session id from a string.
 * The {@code textSessionIdParam} is used if it is set (neither {@code null} nor empty); otherwise
 * the client IP is used. The chosen string is encoded as UTF-8, hashed with SHA-256, and the first
 * eight bytes of the digest are read as a signed {@code long}.
 *
 * @param textSessionIdParam the session id sent by the client, may be {@code null} or empty
 * @param ip the client IP used as fallback, may be {@code null} or empty
 * @return the derived id, or {@code null} if neither argument provides a non-empty value
 * @throws RuntimeException if SHA-256 is not available
 */
protected static Long computeTextSessionID(String textSessionIdParam, String ip) {
  // An empty parameter carries no information: hashing it would give every such client the
  // same constant id, so it is treated like a missing parameter and the IP is used instead.
  boolean hasParam = textSessionIdParam != null && !textSessionIdParam.isEmpty();
  String input = hasParam ? textSessionIdParam : ip;
  if (input == null || input.isEmpty()) {
    return null;
  }
  try {
    byte[] digest = MessageDigest.getInstance("SHA-256").digest(input.getBytes(StandardCharsets.UTF_8));
    // ByteBuffer reads big-endian by default, so this takes the first 8 digest bytes as the id.
    return ByteBuffer.wrap(digest).getLong();
  } catch (NoSuchAlgorithmException e) {
    // Should not happen for SHA-256, wrap in a runtime exception
    throw new RuntimeException("SHA-256 not supported", e);
  }
}

private void prewarmPipelinePool() {
// setting + number of pipelines
// typical addon settings at the moment (2018-11-05)
Expand Down Expand Up @@ -433,38 +458,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {

boolean filterDictionaryMatches = "true".equals(params.getOrDefault("filterDictionaryMatches", "true"));

Long textSessionId = null;
try {
if (params.containsKey("textSessionId")) {
String textSessionIdStr = params.get("textSessionId");
if (textSessionIdStr.startsWith("user:")) {
int sepPos = textSessionIdStr.indexOf(':');
String sessionId = textSessionIdStr.substring(sepPos + 1);
textSessionId = Long.valueOf(sessionId);
} else if (textSessionIdStr.contains(":")) { // transitioning to new format used in chrome addon
// format: "{random number in 0..99999}:{unix time}"
long random, timestamp;
int sepPos = textSessionIdStr.indexOf(':');
random = Long.parseLong(textSessionIdStr.substring(0, sepPos));
timestamp = Long.parseLong(textSessionIdStr.substring(sepPos + 1));
// use random number to choose a slice in possible range of values
// then choose position in slice by timestamp
long maxRandom = 100000;
long randomSegmentSize = (Long.MAX_VALUE - maxRandom) / maxRandom;
long segmentOffset = random * randomSegmentSize;
if (timestamp > randomSegmentSize) {
log.warn(String.format("Could not transform textSessionId '%s'", textSessionIdStr));
}
textSessionId = segmentOffset + timestamp;
} else {
textSessionId = Long.valueOf(textSessionIdStr);
}
}
} catch (NumberFormatException ex) {
log.info("Could not parse textSessionId '" + params.get("textSessionId") + "' as long: " + ex.getMessage() +
", user agent: " + params.get("useragent") + ", version: " + params.get("v") +
", HTTP user agent: " + getHttpUserAgent(httpExchange) + ", referrer: " + getHttpReferrer(httpExchange));
}
Long textSessionId = computeTextSessionID(params.get("textSessionId"), remoteAddress);

List<String> abTest = AB_TEST_SERVICE.getActiveAbTestForClient(params, config);

Expand Down