diff --git a/README.md b/README.md index 1ca68bf..8181edb 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,34 @@ RE2("б").replace("абв", bufReplacer); This feature works for string and buffer inputs. If a buffer was used as an input, its output will be returned as a buffer too, otherwise a string will be returned. +### `RE2.Set` + +When the same string must be tested against many patterns, [`RE2::Set`](https://github.com/google/re2/wiki/SetSyntax) builds a single automaton for all of them. It frequently beats running a large list of individual regular expressions one by one. + +* `new RE2.Set(patterns[, flagsOrOptions][, options])` + * `patterns` is any iterable of strings, `Buffer`s, `RegExp`, or `RE2` instances; flags (if provided) apply to the whole set. + * `flagsOrOptions` can be a string/`Buffer` with flags (`i`, `m`, `s`, `u`, `g`, `y`, `d`) or an options object. + * `options.anchor` can be `'unanchored'` (default), `'start'`, or `'both'`. +* `set.match(str)` returns an array of indexes of matching patterns. +* `set.test(str)` returns `true` if any pattern matches. 
+* Read-only properties: + * `set.size`, `set.flags`, `set.anchor` + * `set.source` (all patterns joined with `|`), `set.sources` (individual pattern sources) + +Example: + +```js +const routes = new RE2.Set([ + '^/users/\\d+$', + '^/posts/\\d+$' +], 'i', {anchor: 'start'}); + +routes.test('/posts/42'); // true +routes.match('/users/7'); // [0] +routes.sources; // ['^\\/users\\/\\d+$', '^\\/posts\\/\\d+$'] +routes.toString(); // '/^\\/users\\/\\d+$|^\\/posts\\/\\d+$/iu' +``` + ### Calculate length Two functions to calculate string sizes between diff --git a/binding.gyp b/binding.gyp index a7be55e..20c2e14 100644 --- a/binding.gyp +++ b/binding.gyp @@ -5,6 +5,7 @@ "sources": [ "lib/addon.cc", "lib/accessors.cc", + "lib/pattern.cc", "lib/util.cc", "lib/new.cc", "lib/exec.cc", @@ -14,6 +15,7 @@ "lib/search.cc", "lib/split.cc", "lib/to_string.cc", + "lib/set.cc", "vendor/re2/re2/bitmap256.cc", "vendor/re2/re2/bitstate.cc", "vendor/re2/re2/compile.cc", diff --git a/lib/addon.cc b/lib/addon.cc index cc072a9..8bfb548 100644 --- a/lib/addon.cc +++ b/lib/addon.cc @@ -1,4 +1,5 @@ #include "./wrapped_re2.h" +#include "./wrapped_re2_set.h" static NAN_METHOD(GetUtf8Length) { @@ -75,6 +76,9 @@ v8::Local WrappedRE2::Init() Nan::SetAccessor(instanceTemplate, Nan::New("internalSource").ToLocalChecked(), GetInternalSource); auto ctr = Nan::GetFunction(tpl).ToLocalChecked(); + auto setCtr = WrappedRE2Set::Init(); + + Nan::Set(ctr, Nan::New("Set").ToLocalChecked(), setCtr); // properties diff --git a/lib/new.cc b/lib/new.cc index ea08d12..813018c 100644 --- a/lib/new.cc +++ b/lib/new.cc @@ -1,5 +1,6 @@ #include "./wrapped_re2.h" #include "./util.h" +#include "./pattern.h" #include #include @@ -7,252 +8,6 @@ #include #include -static char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; - -inline bool isUpperCaseAlpha(char ch) -{ - return 'A' <= ch && ch <= 'Z'; -} - -inline bool isHexadecimal(char ch) -{ - return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 
'Z') || ('a' <= ch && ch <= 'z'); -} - -static std::map unicodeClasses = { - {"Uppercase_Letter", "Lu"}, - {"Lowercase_Letter", "Ll"}, - {"Titlecase_Letter", "Lt"}, - {"Cased_Letter", "LC"}, - {"Modifier_Letter", "Lm"}, - {"Other_Letter", "Lo"}, - {"Letter", "L"}, - {"Nonspacing_Mark", "Mn"}, - {"Spacing_Mark", "Mc"}, - {"Enclosing_Mark", "Me"}, - {"Mark", "M"}, - {"Decimal_Number", "Nd"}, - {"Letter_Number", "Nl"}, - {"Other_Number", "No"}, - {"Number", "N"}, - {"Connector_Punctuation", "Pc"}, - {"Dash_Punctuation", "Pd"}, - {"Open_Punctuation", "Ps"}, - {"Close_Punctuation", "Pe"}, - {"Initial_Punctuation", "Pi"}, - {"Final_Punctuation", "Pf"}, - {"Other_Punctuation", "Po"}, - {"Punctuation", "P"}, - {"Math_Symbol", "Sm"}, - {"Currency_Symbol", "Sc"}, - {"Modifier_Symbol", "Sk"}, - {"Other_Symbol", "So"}, - {"Symbol", "S"}, - {"Space_Separator", "Zs"}, - {"Line_Separator", "Zl"}, - {"Paragraph_Separator", "Zp"}, - {"Separator", "Z"}, - {"Control", "Cc"}, - {"Format", "Cf"}, - {"Surrogate", "Cs"}, - {"Private_Use", "Co"}, - {"Unassigned", "Cn"}, - {"Other", "C"}, -}; - -static bool translateRegExp(const char *data, size_t size, bool multiline, std::vector &buffer) -{ - std::string result; - bool changed = false; - - if (!size) - { - result = "(?:)"; - changed = true; - } - else if (multiline) - { - result = "(?m)"; - changed = true; - } - - for (size_t i = 0; i < size;) - { - char ch = data[i]; - if (ch == '\\') - { - if (i + 1 < size) - { - ch = data[i + 1]; - switch (ch) - { - case '\\': - result += "\\\\"; - i += 2; - continue; - case 'c': - if (i + 2 < size) - { - ch = data[i + 2]; - if (isUpperCaseAlpha(ch)) - { - result += "\\x"; - result += hex[((ch - '@') / 16) & 15]; - result += hex[(ch - '@') & 15]; - i += 3; - changed = true; - continue; - } - } - result += "\\c"; - i += 2; - continue; - case 'u': - if (i + 2 < size) - { - ch = data[i + 2]; - if (isHexadecimal(ch)) - { - result += "\\x{"; - result += ch; - i += 3; - for (size_t j = 0; j < 3 && i < size; 
++i, ++j) - { - ch = data[i]; - if (!isHexadecimal(ch)) - { - break; - } - result += ch; - } - result += '}'; - changed = true; - continue; - } - else if (ch == '{') - { - result += "\\x"; - i += 2; - changed = true; - continue; - } - } - result += "\\u"; - i += 2; - continue; - case 'p': - case 'P': - if (i + 2 < size) { - if (data[i + 2] == '{') { - size_t j = i + 3; - while (j < size && data[j] != '}') ++j; - if (j < size) { - result += "\\"; - result += data[i + 1]; - std::string name(data + i + 3, j - i - 3); - if (unicodeClasses.find(name) != unicodeClasses.end()) { - name = unicodeClasses[name]; - } else if (name.size() > 7 && !strncmp(name.c_str(), "Script=", 7)) { - name = name.substr(7); - } else if (name.size() > 3 && !strncmp(name.c_str(), "sc=", 3)) { - name = name.substr(3); - } - if (name.size() == 1) { - result += name; - } else { - result += "{"; - result += name; - result += "}"; - } - i = j + 1; - changed = true; - continue; - } - } - } - result += "\\"; - result += data[i + 1]; - i += 2; - continue; - default: - result += "\\"; - size_t sym_size = getUtf8CharSize(ch); - result.append(data + i + 1, sym_size); - i += sym_size + 1; - continue; - } - } - } - else if (ch == '/') - { - result += "\\/"; - i += 1; - changed = true; - continue; - } - else if (ch == '(' && i + 2 < size && data[i + 1] == '?' 
&& data[i + 2] == '<') - { - if (i + 3 >= size || (data[i + 3] != '=' && data[i + 3] != '!')) - { - result += "(?P<"; - i += 3; - changed = true; - continue; - } - } - size_t sym_size = getUtf8CharSize(ch); - result.append(data + i, sym_size); - i += sym_size; - } - - if (!changed) - { - return false; - } - - buffer.resize(0); - buffer.insert(buffer.end(), result.data(), result.data() + result.size()); - buffer.push_back('\0'); - - return true; -} - -static std::string escapeRegExp(const char *data, size_t size) -{ - std::string result; - - if (!size) - { - result = "(?:)"; - } - - size_t prevBackSlashes = 0; - for (size_t i = 0; i < size;) - { - char ch = data[i]; - if (ch == '\\') - { - ++prevBackSlashes; - } - else if (ch == '/' && !(prevBackSlashes & 1)) - { - result += "\\/"; - i += 1; - prevBackSlashes = 0; - continue; - } - else - { - prevBackSlashes = 0; - } - size_t sym_size = getUtf8CharSize(ch); - result.append(data + i, sym_size); - i += sym_size; - } - - return result; -} - bool WrappedRE2::alreadyWarnedAboutUnicode = false; static const char *deprecationMessage = "BMP patterns aren't supported by node-re2. An implicit \"u\" flag is assumed by the RE2 constructor. In a future major version, calling the RE2 constructor without the \"u\" flag may become forbidden, or cause a different behavior. 
Please see https://github.com/uhop/node-re2/issues/21 for more information."; diff --git a/lib/pattern.cc b/lib/pattern.cc new file mode 100644 index 0000000..29cb2fb --- /dev/null +++ b/lib/pattern.cc @@ -0,0 +1,252 @@ +#include "./pattern.h" +#include "./wrapped_re2.h" + +#include +#include +#include + +static char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + +inline bool isUpperCaseAlpha(char ch) +{ + return 'A' <= ch && ch <= 'Z'; +} + +inline bool isHexadecimal(char ch) +{ + return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F') || ('a' <= ch && ch <= 'f'); // hex digits only: 0-9, A-F, a-f +} + +static std::map unicodeClasses = { + {"Uppercase_Letter", "Lu"}, + {"Lowercase_Letter", "Ll"}, + {"Titlecase_Letter", "Lt"}, + {"Cased_Letter", "LC"}, + {"Modifier_Letter", "Lm"}, + {"Other_Letter", "Lo"}, + {"Letter", "L"}, + {"Nonspacing_Mark", "Mn"}, + {"Spacing_Mark", "Mc"}, + {"Enclosing_Mark", "Me"}, + {"Mark", "M"}, + {"Decimal_Number", "Nd"}, + {"Letter_Number", "Nl"}, + {"Other_Number", "No"}, + {"Number", "N"}, + {"Connector_Punctuation", "Pc"}, + {"Dash_Punctuation", "Pd"}, + {"Open_Punctuation", "Ps"}, + {"Close_Punctuation", "Pe"}, + {"Initial_Punctuation", "Pi"}, + {"Final_Punctuation", "Pf"}, + {"Other_Punctuation", "Po"}, + {"Punctuation", "P"}, + {"Math_Symbol", "Sm"}, + {"Currency_Symbol", "Sc"}, + {"Modifier_Symbol", "Sk"}, + {"Other_Symbol", "So"}, + {"Symbol", "S"}, + {"Space_Separator", "Zs"}, + {"Line_Separator", "Zl"}, + {"Paragraph_Separator", "Zp"}, + {"Separator", "Z"}, + {"Control", "Cc"}, + {"Format", "Cf"}, + {"Surrogate", "Cs"}, + {"Private_Use", "Co"}, + {"Unassigned", "Cn"}, + {"Other", "C"}, +}; + +bool translateRegExp(const char *data, size_t size, bool multiline, std::vector &buffer) +{ + std::string result; + bool changed = false; + + if (!size) + { + result = "(?:)"; + changed = true; + } + else if (multiline) + { + result = "(?m)"; + changed = true; + } + + for (size_t i = 0; i < size;) + { + char ch = data[i]; 
+ if (ch == '\\') + { + if (i + 1 < size) + { + ch = data[i + 1]; + switch (ch) + { + case '\\': + result += "\\\\"; + i += 2; + continue; + case 'c': + if (i + 2 < size) + { + ch = data[i + 2]; + if (isUpperCaseAlpha(ch)) + { + result += "\\x"; + result += hex[((ch - '@') / 16) & 15]; + result += hex[(ch - '@') & 15]; + i += 3; + changed = true; + continue; + } + } + result += "\\c"; + i += 2; + continue; + case 'u': + if (i + 2 < size) + { + ch = data[i + 2]; + if (isHexadecimal(ch)) + { + result += "\\x{"; + result += ch; + i += 3; + for (size_t j = 0; j < 3 && i < size; ++i, ++j) + { + ch = data[i]; + if (!isHexadecimal(ch)) + { + break; + } + result += ch; + } + result += '}'; + changed = true; + continue; + } + else if (ch == '{') + { + result += "\\x"; + i += 2; + changed = true; + continue; + } + } + result += "\\u"; + i += 2; + continue; + case 'p': + case 'P': + if (i + 2 < size) { + if (data[i + 2] == '{') { + size_t j = i + 3; + while (j < size && data[j] != '}') ++j; + if (j < size) { + result += "\\"; + result += data[i + 1]; + std::string name(data + i + 3, j - i - 3); + if (unicodeClasses.find(name) != unicodeClasses.end()) { + name = unicodeClasses[name]; + } else if (name.size() > 7 && !strncmp(name.c_str(), "Script=", 7)) { + name = name.substr(7); + } else if (name.size() > 3 && !strncmp(name.c_str(), "sc=", 3)) { + name = name.substr(3); + } + if (name.size() == 1) { + result += name; + } else { + result += "{"; + result += name; + result += "}"; + } + i = j + 1; + changed = true; + continue; + } + } + } + result += "\\"; + result += data[i + 1]; + i += 2; + continue; + default: + result += "\\"; + size_t sym_size = getUtf8CharSize(ch); + result.append(data + i + 1, sym_size); + i += sym_size + 1; + continue; + } + } + } + else if (ch == '/') + { + result += "\\/"; + i += 1; + changed = true; + continue; + } + else if (ch == '(' && i + 2 < size && data[i + 1] == '?' 
&& data[i + 2] == '<') + { + if (i + 3 >= size || (data[i + 3] != '=' && data[i + 3] != '!')) + { + result += "(?P<"; + i += 3; + changed = true; + continue; + } + } + size_t sym_size = getUtf8CharSize(ch); + result.append(data + i, sym_size); + i += sym_size; + } + + if (!changed) + { + return false; + } + + buffer.resize(0); + buffer.insert(buffer.end(), result.data(), result.data() + result.size()); + buffer.push_back('\0'); + + return true; +} + +std::string escapeRegExp(const char *data, size_t size) +{ + std::string result; + + if (!size) + { + result = "(?:)"; + } + + size_t prevBackSlashes = 0; + for (size_t i = 0; i < size;) + { + char ch = data[i]; + if (ch == '\\') + { + ++prevBackSlashes; + } + else if (ch == '/' && !(prevBackSlashes & 1)) + { + result += "\\/"; + i += 1; + prevBackSlashes = 0; + continue; + } + else + { + prevBackSlashes = 0; + } + size_t sym_size = getUtf8CharSize(ch); + result.append(data + i, sym_size); + i += sym_size; + } + + return result; +} diff --git a/lib/pattern.h b/lib/pattern.h new file mode 100644 index 0000000..45c7faf --- /dev/null +++ b/lib/pattern.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +// Shared helpers for translating JavaScript-style regular expressions +// into RE2-compatible patterns. 
+bool translateRegExp(const char *data, size_t size, bool multiline, std::vector &buffer); +std::string escapeRegExp(const char *data, size_t size); + diff --git a/lib/set.cc b/lib/set.cc new file mode 100644 index 0000000..46de927 --- /dev/null +++ b/lib/set.cc @@ -0,0 +1,784 @@ +#include "./wrapped_re2_set.h" +#include "./pattern.h" +#include "./util.h" +#include "./wrapped_re2.h" + +#include +#include +#include +#include + +Nan::Persistent WrappedRE2Set::constructor; + +struct SetFlags +{ + bool global = false; + bool ignoreCase = false; + bool multiline = false; + bool dotAll = false; + bool unicode = false; + bool sticky = false; + bool hasIndices = false; +}; + +static bool parseFlags(const v8::Local &arg, SetFlags &flags) +{ + const char *data = nullptr; + size_t size = 0; + std::vector buffer; + + if (arg->IsString()) + { + auto isolate = v8::Isolate::GetCurrent(); + auto t = arg->ToString(Nan::GetCurrentContext()); + if (t.IsEmpty()) + { + return false; + } + auto s = t.ToLocalChecked(); + size = s->Utf8Length(isolate); + buffer.resize(size + 1); + s->WriteUtf8(isolate, &buffer[0], buffer.size()); + buffer[size] = '\0'; + data = &buffer[0]; + } + else if (node::Buffer::HasInstance(arg)) + { + size = node::Buffer::Length(arg); + data = node::Buffer::Data(arg); + } + else + { + return false; + } + + for (size_t i = 0; i < size; ++i) + { + switch (data[i]) + { + case 'd': + flags.hasIndices = true; + break; + case 'g': + flags.global = true; + break; + case 'i': + flags.ignoreCase = true; + break; + case 'm': + flags.multiline = true; + break; + case 's': + flags.dotAll = true; + break; + case 'u': + flags.unicode = true; + break; + case 'y': + flags.sticky = true; + break; + default: + return false; + } + } + + return true; +} + +static bool sameEffectiveOptions(const SetFlags &a, const SetFlags &b) +{ + return a.ignoreCase == b.ignoreCase && a.multiline == b.multiline && a.dotAll == b.dotAll && a.unicode == b.unicode; +} + +static std::string 
flagsToString(const SetFlags &flags) +{ + std::string result; + if (flags.hasIndices) + { + result += 'd'; + } + if (flags.global) + { + result += 'g'; + } + if (flags.ignoreCase) + { + result += 'i'; + } + if (flags.multiline) + { + result += 'm'; + } + if (flags.dotAll) + { + result += 's'; + } + result += 'u'; + if (flags.sticky) + { + result += 'y'; + } + return result; +} + +static bool collectIterable(const v8::Local &input, std::vector> &items) +{ + auto context = Nan::GetCurrentContext(); + auto isolate = v8::Isolate::GetCurrent(); + + if (input->IsArray()) + { + auto array = v8::Local::Cast(input); + auto length = array->Length(); + items.reserve(length); + for (uint32_t i = 0; i < length; ++i) + { + auto maybe = Nan::Get(array, i); + if (maybe.IsEmpty()) + { + return false; + } + items.push_back(maybe.ToLocalChecked()); + } + return true; + } + + auto maybeObject = input->ToObject(context); + if (maybeObject.IsEmpty()) + { + return false; + } + auto object = maybeObject.ToLocalChecked(); + + auto maybeIteratorFn = object->Get(context, v8::Symbol::GetIterator(isolate)); + if (maybeIteratorFn.IsEmpty()) + { + return false; + } + auto iteratorFn = maybeIteratorFn.ToLocalChecked(); + if (!iteratorFn->IsFunction()) + { + return false; + } + + auto maybeIterator = iteratorFn.As()->Call(context, object, 0, nullptr); + if (maybeIterator.IsEmpty()) + { + return false; + } + auto iterator = maybeIterator.ToLocalChecked(); + if (!iterator->IsObject()) + { + return false; + } + + auto nextKey = Nan::New("next").ToLocalChecked(); + auto valueKey = Nan::New("value").ToLocalChecked(); + auto doneKey = Nan::New("done").ToLocalChecked(); + + for (;;) + { + auto maybeNext = Nan::Get(iterator.As(), nextKey); + if (maybeNext.IsEmpty()) + { + return false; + } + auto next = maybeNext.ToLocalChecked(); + if (!next->IsFunction()) + { + return false; + } + auto maybeResult = next.As()->Call(context, iterator, 0, nullptr); + if (maybeResult.IsEmpty()) + { + return false; + } + 
auto result = maybeResult.ToLocalChecked(); + if (!result->IsObject()) + { + return false; + } + auto resultObj = result->ToObject(context).ToLocalChecked(); + auto maybeDone = Nan::Get(resultObj, doneKey); + if (maybeDone.IsEmpty()) + { + return false; + } + if (maybeDone.ToLocalChecked()->BooleanValue(isolate)) + { + break; + } + auto maybeValue = Nan::Get(resultObj, valueKey); + if (maybeValue.IsEmpty()) + { + return false; + } + items.push_back(maybeValue.ToLocalChecked()); + } + + return true; +} + +static bool parseAnchor(const v8::Local &arg, re2::RE2::Anchor &anchor, std::string &anchorName) +{ + if (arg.IsEmpty() || arg->IsUndefined() || arg->IsNull()) + { + anchor = re2::RE2::UNANCHORED; + anchorName = "unanchored"; + return true; + } + + v8::Local value = arg; + if (arg->IsObject() && !arg->IsString()) + { + auto context = Nan::GetCurrentContext(); + auto object = arg->ToObject(context).ToLocalChecked(); + auto maybeAnchor = Nan::Get(object, Nan::New("anchor").ToLocalChecked()); + if (maybeAnchor.IsEmpty()) + { + return false; + } + value = maybeAnchor.ToLocalChecked(); + if (value->IsUndefined() || value->IsNull()) + { + anchor = re2::RE2::UNANCHORED; + anchorName = "unanchored"; + return true; + } + } + + if (!value->IsString()) + { + return false; + } + + Nan::Utf8String val(value); + std::string text(*val, val.length()); + + if (text == "unanchored") + { + anchor = re2::RE2::UNANCHORED; + anchorName = text; + return true; + } + if (text == "start") + { + anchor = re2::RE2::ANCHOR_START; + anchorName = text; + return true; + } + if (text == "both") + { + anchor = re2::RE2::ANCHOR_BOTH; + anchorName = text; + return true; + } + + return false; +} + +static bool fillInput(const v8::Local &arg, StrVal &str, v8::Local &keepAlive) +{ + if (node::Buffer::HasInstance(arg)) + { + auto size = node::Buffer::Length(arg); + str.reset(arg, size, size, 0, true); + return true; + } + + auto context = Nan::GetCurrentContext(); + auto isolate = 
v8::Isolate::GetCurrent(); + auto t = arg->ToString(context); + if (t.IsEmpty()) + { + return false; + } + auto s = t.ToLocalChecked(); + auto utf8Length = s->Utf8Length(isolate); + auto buffer = node::Buffer::New(isolate, s).ToLocalChecked(); + keepAlive = buffer; + str.reset(buffer, node::Buffer::Length(buffer), utf8Length, 0); + return true; +} + +static std::string anchorToString(re2::RE2::Anchor anchor) +{ + switch (anchor) + { + case re2::RE2::ANCHOR_BOTH: + return "both"; + case re2::RE2::ANCHOR_START: + return "start"; + default: + return "unanchored"; + } +} + +static std::string makeCombinedSource(const std::vector &sources) +{ + if (sources.empty()) + { + return "(?:)"; + } + + std::string combined; + for (size_t i = 0, n = sources.size(); i < n; ++i) + { + if (i) + { + combined += '|'; + } + combined += sources[i]; + } + return combined; +} + +static const char setDeprecationMessage[] = "BMP patterns aren't supported by node-re2. An implicit \"u\" flag is assumed by RE2.Set. In a future major version, calling RE2.Set without the \"u\" flag may become forbidden, or cause a different behavior. 
Please see https://github.com/uhop/node-re2/issues/21 for more information."; + +NAN_METHOD(WrappedRE2Set::New) +{ + auto context = Nan::GetCurrentContext(); + auto isolate = context->GetIsolate(); + + if (!info.IsConstructCall()) + { + std::vector> parameters(info.Length()); + for (size_t i = 0, n = info.Length(); i < n; ++i) + { + parameters[i] = info[i]; + } + auto maybeNew = Nan::NewInstance(Nan::GetFunction(Nan::New(constructor)).ToLocalChecked(), parameters.size(), parameters.data()); // data() is valid even when no arguments were passed + if (!maybeNew.IsEmpty()) + { + info.GetReturnValue().Set(maybeNew.ToLocalChecked()); + } + return; + } + + if (!info.Length()) + { + return Nan::ThrowTypeError("Expected an iterable of patterns as the 1st argument."); + } + + SetFlags flags; + bool haveFlags = false; + bool flagsFromArg = false; + + v8::Local flagsArg; + v8::Local optionsArg; + if (info.Length() > 1) + { + if (info[1]->IsObject() && !info[1]->IsString() && !node::Buffer::HasInstance(info[1])) + { + optionsArg = info[1]; + } + else + { + flagsArg = info[1]; + if (info.Length() > 2) + { + optionsArg = info[2]; + } + } + } + + if (!flagsArg.IsEmpty()) + { + if (!parseFlags(flagsArg, flags)) + { + return Nan::ThrowTypeError("Invalid flags for RE2.Set."); + } + haveFlags = true; + flagsFromArg = true; + } + + re2::RE2::Anchor anchor = re2::RE2::UNANCHORED; + std::string anchorName = "unanchored"; + if (!optionsArg.IsEmpty()) + { + if (!parseAnchor(optionsArg, anchor, anchorName)) + { + return Nan::ThrowTypeError("Invalid anchor option for RE2.Set."); + } + } + (void)anchorName; + + std::vector> patterns; + if (!collectIterable(info[0], patterns)) + { + return Nan::ThrowTypeError("Expected an iterable of patterns as the 1st argument."); + } + + auto mergeFlags = [&](const SetFlags &candidate) { + if (flagsFromArg) + { + return true; + } + if (!haveFlags) + { + flags = candidate; + haveFlags = true; + return true; + } + return sameEffectiveOptions(flags, candidate); + }; + + for (auto &value : patterns) + { + SetFlags 
patternFlags; + bool hasFlagsForPattern = false; + + if (value->IsRegExp()) + { + const auto *re = v8::RegExp::Cast(*value); + v8::RegExp::Flags reFlags = re->GetFlags(); + patternFlags.global = bool(reFlags & v8::RegExp::kGlobal); + patternFlags.ignoreCase = bool(reFlags & v8::RegExp::kIgnoreCase); + patternFlags.multiline = bool(reFlags & v8::RegExp::kMultiline); + patternFlags.dotAll = bool(reFlags & v8::RegExp::kDotAll); + patternFlags.unicode = bool(reFlags & v8::RegExp::kUnicode); + patternFlags.sticky = bool(reFlags & v8::RegExp::kSticky); + patternFlags.hasIndices = bool(reFlags & v8::RegExp::kHasIndices); + hasFlagsForPattern = true; + } + else if (value->IsObject()) + { + auto maybeObj = value->ToObject(context); + if (!maybeObj.IsEmpty()) + { + auto obj = maybeObj.ToLocalChecked(); + if (WrappedRE2::HasInstance(obj)) + { + auto re2 = Nan::ObjectWrap::Unwrap(obj); + patternFlags.global = re2->global; + patternFlags.ignoreCase = re2->ignoreCase; + patternFlags.multiline = re2->multiline; + patternFlags.dotAll = re2->dotAll; + patternFlags.unicode = true; + patternFlags.sticky = re2->sticky; + patternFlags.hasIndices = re2->hasIndices; + hasFlagsForPattern = true; + } + } + } + + if (hasFlagsForPattern && !mergeFlags(patternFlags)) + { + return Nan::ThrowTypeError("All patterns in RE2.Set must use the same flags."); + } + } + + if (!flags.unicode) + { + switch (WrappedRE2::unicodeWarningLevel) + { + case WrappedRE2::THROW: + return Nan::ThrowSyntaxError(setDeprecationMessage); + case WrappedRE2::WARN: + printDeprecationWarning(setDeprecationMessage); + break; + case WrappedRE2::WARN_ONCE: + if (!WrappedRE2::alreadyWarnedAboutUnicode) + { + printDeprecationWarning(setDeprecationMessage); + WrappedRE2::alreadyWarnedAboutUnicode = true; + } + break; + default: + break; + } + } + + re2::RE2::Options options; + options.set_case_sensitive(!flags.ignoreCase); + options.set_one_line(!flags.multiline); + options.set_dot_nl(flags.dotAll); + 
options.set_log_errors(false); + + std::unique_ptr set(new WrappedRE2Set(options, anchor, flagsToString(flags))); + std::vector buffer; + + for (auto &value : patterns) + { + const char *data = nullptr; + size_t size = 0; + std::string source; + + if (node::Buffer::HasInstance(value)) + { + size = node::Buffer::Length(value); + data = node::Buffer::Data(value); + source = escapeRegExp(data, size); + } + else if (value->IsRegExp()) + { + const auto *re = v8::RegExp::Cast(*value); + auto t = re->GetSource()->ToString(context); + if (t.IsEmpty()) + { + return; + } + auto s = t.ToLocalChecked(); + size = s->Utf8Length(isolate); + buffer.resize(size + 1); + s->WriteUtf8(isolate, &buffer[0], buffer.size()); + buffer[size] = '\0'; + data = &buffer[0]; + source = escapeRegExp(data, size); + } + else if (value->IsString()) + { + auto t = value->ToString(context); + if (t.IsEmpty()) + { + return; + } + auto s = t.ToLocalChecked(); + size = s->Utf8Length(isolate); + buffer.resize(size + 1); + s->WriteUtf8(isolate, &buffer[0], buffer.size()); + buffer[size] = '\0'; + data = &buffer[0]; + source = escapeRegExp(data, size); + } + else if (value->IsObject()) + { + auto maybeObj = value->ToObject(context); + if (maybeObj.IsEmpty()) + { + return; + } + auto obj = maybeObj.ToLocalChecked(); + if (!WrappedRE2::HasInstance(obj)) + { + return Nan::ThrowTypeError("Expected a string, Buffer, RegExp, or RE2 instance in the pattern list."); + } + + auto re2 = Nan::ObjectWrap::Unwrap(obj); + source = re2->source; + data = source.data(); + size = source.size(); + } + else + { + return Nan::ThrowTypeError("Expected a string, Buffer, RegExp, or RE2 instance in the pattern list."); + } + + if (translateRegExp(data, size, flags.multiline, buffer)) + { + data = &buffer[0]; + size = buffer.size() - 1; + } + + std::string error; + if (set->set.Add(re2::StringPiece(data, size), &error) < 0) + { + if (error.empty()) + { + error = "Invalid pattern in RE2.Set."; + } + return 
Nan::ThrowSyntaxError(error.c_str()); + } + set->sources.push_back(source); + } + + if (!set->set.Compile()) + { + return Nan::ThrowError("RE2.Set could not be compiled."); + } + + set->combinedSource = makeCombinedSource(set->sources); + set->Wrap(info.This()); + set.release(); + + info.GetReturnValue().Set(info.This()); +} + +NAN_METHOD(WrappedRE2Set::Test) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(false); + return; + } + + StrVal str; + v8::Local keepAlive; + if (!fillInput(info[0], str, keepAlive)) + { + return; + } + + re2::RE2::Set::ErrorInfo errorInfo{re2::RE2::Set::kNoError}; + bool matched = re2set->set.Match(str, nullptr, &errorInfo); + if (!matched && errorInfo.kind != re2::RE2::Set::kNoError) + { + const char *message = "RE2.Set matching failed."; + switch (errorInfo.kind) + { + case re2::RE2::Set::kOutOfMemory: + message = "RE2.Set matching failed: out of memory."; + break; + case re2::RE2::Set::kInconsistent: + message = "RE2.Set matching failed: inconsistent result."; + break; + case re2::RE2::Set::kNotCompiled: + message = "RE2.Set matching failed: set is not compiled."; + break; + default: + break; + } + return Nan::ThrowError(message); + } + + info.GetReturnValue().Set(matched); +} + +NAN_METHOD(WrappedRE2Set::Match) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(Nan::New(0)); + return; + } + + StrVal str; + v8::Local keepAlive; + if (!fillInput(info[0], str, keepAlive)) + { + return; + } + + std::vector matches; + re2::RE2::Set::ErrorInfo errorInfo{re2::RE2::Set::kNoError}; + bool matched = re2set->set.Match(str, &matches, &errorInfo); + if (!matched && errorInfo.kind != re2::RE2::Set::kNoError) + { + const char *message = "RE2.Set matching failed."; + switch (errorInfo.kind) + { + case re2::RE2::Set::kOutOfMemory: + message = "RE2.Set matching failed: out of memory."; + break; + case re2::RE2::Set::kInconsistent: + message = 
"RE2.Set matching failed: inconsistent result."; + break; + case re2::RE2::Set::kNotCompiled: + message = "RE2.Set matching failed: set is not compiled."; + break; + default: + break; + } + return Nan::ThrowError(message); + } + + std::sort(matches.begin(), matches.end()); + auto result = Nan::New(matches.size()); + for (size_t i = 0, n = matches.size(); i < n; ++i) + { + Nan::Set(result, i, Nan::New(matches[i])); + } + + info.GetReturnValue().Set(result); +} + +NAN_METHOD(WrappedRE2Set::ToString) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().SetEmptyString(); + return; + } + + std::string result = "/"; + result += re2set->combinedSource; + result += "/"; + result += re2set->flags; + info.GetReturnValue().Set(Nan::New(result).ToLocalChecked()); +} + +NAN_GETTER(WrappedRE2Set::GetFlags) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(Nan::New("u").ToLocalChecked()); + return; + } + info.GetReturnValue().Set(Nan::New(re2set->flags).ToLocalChecked()); +} + +NAN_GETTER(WrappedRE2Set::GetSources) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(Nan::New(0)); + return; + } + auto result = Nan::New(re2set->sources.size()); + for (size_t i = 0, n = re2set->sources.size(); i < n; ++i) + { + Nan::Set(result, i, Nan::New(re2set->sources[i]).ToLocalChecked()); + } + info.GetReturnValue().Set(result); +} + +NAN_GETTER(WrappedRE2Set::GetSource) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(Nan::New("(?:)").ToLocalChecked()); + return; + } + info.GetReturnValue().Set(Nan::New(re2set->combinedSource).ToLocalChecked()); +} + +NAN_GETTER(WrappedRE2Set::GetSize) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(0); + return; + } + info.GetReturnValue().Set(static_cast(re2set->sources.size())); +} + 
+NAN_GETTER(WrappedRE2Set::GetAnchor) +{ + auto re2set = Nan::ObjectWrap::Unwrap(info.This()); + if (!re2set) + { + info.GetReturnValue().Set(Nan::New("unanchored").ToLocalChecked()); + return; + } + info.GetReturnValue().Set(Nan::New(anchorToString(re2set->anchor)).ToLocalChecked()); +} + +v8::Local WrappedRE2Set::Init() +{ + Nan::EscapableHandleScope scope; + + auto tpl = Nan::New(New); + tpl->SetClassName(Nan::New("RE2Set").ToLocalChecked()); + auto instanceTemplate = tpl->InstanceTemplate(); + instanceTemplate->SetInternalFieldCount(1); + + Nan::SetPrototypeMethod(tpl, "test", Test); + Nan::SetPrototypeMethod(tpl, "match", Match); + Nan::SetPrototypeMethod(tpl, "toString", ToString); + + Nan::SetAccessor(instanceTemplate, Nan::New("flags").ToLocalChecked(), GetFlags); + Nan::SetAccessor(instanceTemplate, Nan::New("sources").ToLocalChecked(), GetSources); + Nan::SetAccessor(instanceTemplate, Nan::New("source").ToLocalChecked(), GetSource); + Nan::SetAccessor(instanceTemplate, Nan::New("size").ToLocalChecked(), GetSize); + Nan::SetAccessor(instanceTemplate, Nan::New("anchor").ToLocalChecked(), GetAnchor); + + constructor.Reset(tpl); + return scope.Escape(Nan::GetFunction(tpl).ToLocalChecked()); +} diff --git a/lib/wrapped_re2_set.h b/lib/wrapped_re2_set.h new file mode 100644 index 0000000..171d142 --- /dev/null +++ b/lib/wrapped_re2_set.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include + +#include +#include + +class WrappedRE2Set : public Nan::ObjectWrap +{ +public: + static v8::Local Init(); + static inline bool HasInstance(v8::Local object) + { + auto isolate = v8::Isolate::GetCurrent(); + return !constructor.IsEmpty() && constructor.Get(isolate)->HasInstance(object); + } + +private: + WrappedRE2Set(const re2::RE2::Options &options, re2::RE2::Anchor anchor, const std::string &flags) : set(options, anchor), flags(flags), anchor(anchor) {} + + static NAN_METHOD(New); + static NAN_METHOD(Test); + static NAN_METHOD(Match); + static 
NAN_METHOD(ToString); + + static NAN_GETTER(GetFlags); + static NAN_GETTER(GetSources); + static NAN_GETTER(GetSource); + static NAN_GETTER(GetSize); + static NAN_GETTER(GetAnchor); + + static Nan::Persistent constructor; + + re2::RE2::Set set; + std::vector sources; + std::string combinedSource; + std::string flags; + re2::RE2::Anchor anchor; +}; + diff --git a/package-lock.json b/package-lock.json index 9200677..e7ba25f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -817,7 +817,6 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, diff --git a/re2.d.ts b/re2.d.ts index 1f8043d..4ac2d0b 100644 --- a/re2.d.ts +++ b/re2.d.ts @@ -35,6 +35,28 @@ declare module 're2' { split(str: K, limit?: number): K[]; } + interface RE2SetOptions { + anchor?: 'unanchored' | 'start' | 'both'; + } + + interface RE2Set { + readonly size: number; + readonly source: string; + readonly sources: string[]; + readonly flags: string; + readonly anchor: 'unanchored' | 'start' | 'both'; + + match(str: string | Buffer): number[]; + test(str: string | Buffer): boolean; + toString(): string; + } + + interface RE2SetConstructor { + new(patterns: Iterable, flagsOrOptions?: string | Buffer | RE2SetOptions, options?: RE2SetOptions): RE2Set; + (patterns: Iterable, flagsOrOptions?: string | Buffer | RE2SetOptions, options?: RE2SetOptions): RE2Set; + readonly prototype: RE2Set; + } + interface RE2Constructor extends RegExpConstructor { new(pattern: Buffer | RegExp | RE2 | string): RE2; new(pattern: Buffer | string, flags?: string | Buffer): RE2; @@ -45,6 +67,8 @@ declare module 're2' { unicodeWarningLevel: 'nothing' | 'warnOnce' | 'warn' | 'throw'; getUtf8Length(value: string): number; getUtf16Length(value: Buffer): number; + + Set: RE2SetConstructor; } var RE2: RE2Constructor; diff --git 
a/tests/test_set.js b/tests/test_set.js
new file mode 100644
index 0000000..e2d7236
--- /dev/null
+++ b/tests/test_set.js
@@ -0,0 +1,122 @@
+'use strict';
+
+var unit = require('heya-unit');
+var RE2 = require('../re2');
+
+// Tests for RE2.Set: construction, read-only properties, matching, anchoring and input validation.
+unit.add(module, [
+  // The constructor exposes size, flags, anchor, sources, source and toString().
+  function test_setBasics(t) {
+    var set = new RE2.Set(['foo', 'bar'], 'im');
+
+    eval(t.TEST('set instanceof Object'));
+    eval(t.TEST("typeof set.match === 'function'"));
+    eval(t.TEST('set.size === 2'));
+    eval(t.TEST("set.flags === 'imu'"));
+    eval(t.TEST("set.anchor === 'unanchored'"));
+    eval(t.TEST("Array.isArray(set.sources)"));
+    eval(t.TEST("set.sources[0] === 'foo'"));
+    eval(t.TEST("set.source === 'foo|bar'"));
+    eval(t.TEST("set.toString() === '/foo|bar/imu'"));
+  },
+  // match() returns the indexes of the matching patterns; test() reports whether any matched.
+  function test_setMatching(t) {
+    var set = new RE2.Set(['foo', 'bar'], 'i');
+
+    var result = set.match('xxFOOxxbar');
+    eval(t.TEST('result.length === 2'));
+    eval(t.TEST('result[0] === 0'));
+    eval(t.TEST('result[1] === 1'));
+    eval(t.TEST('set.test("nothing here") === false'));
+    eval(t.TEST('set.match("nothing here").length === 0'));
+  },
+  // anchor 'start' rejects a match that begins later in the input; 'both' also rejects trailing input.
+  function test_setAnchors(t) {
+    var start = new RE2.Set(['abc'], {anchor: 'start'});
+    var both = new RE2.Set(['abc'], {anchor: 'both'});
+
+    eval(t.TEST('start.test("zabc") === false'));
+    eval(t.TEST('start.test("abc") === true'));
+    eval(t.TEST('both.test("abc") === true'));
+    eval(t.TEST('both.test("abc1") === false'));
+  },
+  // Patterns can be supplied by any iterable, here a generator.
+  function test_setIterable(t) {
+    function *gen() {
+      yield 'cat';
+      yield 'dog';
+    }
+
+    var set = new RE2.Set(gen());
+    eval(t.TEST('set.size === 2'));
+    var result = set.match('hotdog');
+    eval(t.TEST('result.length === 1'));
+    eval(t.TEST('result[0] === 1'));
+  },
+  // Flags passed to the constructor also apply to patterns given as RegExp objects.
+  function test_setFlagsOverride(t) {
+    var set = new RE2.Set([/abc/], 'i');
+    eval(t.TEST('set.test("ABC") === true'));
+    eval(t.TEST('set.flags === "iu"'));
+  },
+  // A null pattern must make the constructor throw a TypeError.
+  function test_setInvalid(t) {
+    try {
+      var set = new RE2.Set([null]);
+      t.test(false); // must not be reached: the constructor is expected to throw
+    } catch (e) {
+      eval(t.TEST('e instanceof TypeError'));
+    }
+  },
+  // Checks that RE2.Set agrees with scanning lists of individual RE2 and RegExp objects.
+  function test_setPerformance(t) {
+    var patternCount = 200;
+    var iterations = 4000;
+
+    var patterns = [];
+    for (var i = 0; i < patternCount; ++i) {
+      patterns.push('token' + i + '(?:[a-z]+)?');
+    }
+
+    var inputs = [];
+    for (var j = 0; j < iterations; ++j) {
+      inputs.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' token' + (j % patternCount) + ' tail');
+    }
+
+    var set = new RE2.Set(patterns);
+    var re2List = patterns.map(function (p) { return new RE2(p); });
+    var jsList = patterns.map(function (p) { return new RegExp(p); });
+
+    // Calls fn on every input; returns the elapsed milliseconds and the sum of fn's results.
+    function measure(fn) {
+      var start = process.hrtime.bigint();
+      var matches = 0;
+      for (var i = 0; i < inputs.length; ++i) {
+        matches += fn(inputs[i]);
+      }
+      var duration = Number(process.hrtime.bigint() - start) / 1e6;
+      return {time: duration, matches: matches};
+    }
+
+    var setResult = measure(function (str) { return set.test(str) ? 1 : 0; });
+    var re2Result = measure(function (str) {
+      for (var i = 0; i < re2List.length; ++i) {
+        if (re2List[i].test(str)) return 1;
+      }
+      return 0;
+    });
+    var jsResult = measure(function (str) {
+      for (var i = 0; i < jsList.length; ++i) {
+        if (jsList[i].test(str)) return 1;
+      }
+      return 0;
+    });
+
+    eval(t.TEST('setResult.matches === re2Result.matches'));
+    eval(t.TEST('setResult.matches === jsResult.matches'));
+    // Every generated input contains its own 'token<k>', so all of them must match.
+    // Timings are still recorded by measure() but are not asserted: comparing wall-clock times
+    // is flaky, and the list scans stop at an early prefix hit ('token1' also matches 'token10').
+    eval(t.TEST('setResult.matches === inputs.length'));
+  }
+]);
diff --git a/tests/tests.js b/tests/tests.js
index 8a472f4..08c1a10 100644
--- a/tests/tests.js
+++ b/tests/tests.js
@@ -12,6 +12,7 @@ require('./test_matchAll');
 require('./test_replace');
 require('./test_search');
 require('./test_split');
+require('./test_set');
 require('./test_invalid');
 require('./test_symbols');
 require('./test_prototype');
diff --git a/ts-tests/test-types.ts b/ts-tests/test-types.ts
index a1f4647..1c9ce58 100644
--- a/ts-tests/test-types.ts
+++ b/ts-tests/test-types.ts
@@ -24,5 +24,16 @@ function test_matchTypes() {
   assertType(result.groups['verb'])
 }
 
+// Type checks for RE2.Set: constructor arguments, match()/test() results and read-only properties.
+function test_setTypes() {
+  const set = new RE2.Set(['alpha', Buffer.from('beta')], 'i', {anchor: 'start'})
+  assertType<number[]>(set.match('alphabet'))
+  assertType<boolean>(set.test(Buffer.from('alphabet')))
+  assertType<'unanchored' | 'start' | 'both'>(set.anchor)
+  assertType<string[]>(set.sources)
+  assertType<string>(set.flags)
+}
+
 test_execTypes()
 test_matchTypes()
+test_setTypes()