Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,34 @@ RE2("б").replace("абв", bufReplacer);
This feature works for string and buffer inputs. If a buffer was used as an input, its output will be returned as
a buffer too, otherwise a string will be returned.

### `RE2.Set`

When the same string must be tested against many patterns, [`RE2::Set`](https://github.com/google/re2/wiki/SetSyntax) builds a single automaton for all of them. It frequently beats running a large list of individual regular expressions one by one.

* `new RE2.Set(patterns[, flagsOrOptions][, options])`
  * `patterns` is any iterable of strings, `Buffer`s, `RegExp`, or `RE2` instances; flags (if provided) apply to the whole set.
  * `flagsOrOptions` can be a string/`Buffer` with flags (`i`, `m`, `s`, `u`, `g`, `y`, `d`) or an options object.
  * `options.anchor` can be `'unanchored'` (default), `'start'`, or `'both'`.
* `set.match(str)` returns an array of indexes of matching patterns.
* `set.test(str)` returns `true` if any pattern matches.
* Read-only properties:
  * `set.size`, `set.flags`, `set.anchor`
  * `set.source` (all patterns joined with `|`), `set.sources` (individual pattern sources)

Example:

```js
const routes = new RE2.Set([
'^/users/\\d+$',
'^/posts/\\d+$'
], 'i', {anchor: 'start'});

routes.test('/posts/42'); // true
routes.match('/users/7'); // [0]
routes.sources; // ['^/users/\\d+$', '^/posts/\\d+$']
routes.toString(); // '/^/users/\\d+$|^/posts/\\d+$/iu'
```

### Calculate length

Two functions to calculate string sizes between
Expand Down
2 changes: 2 additions & 0 deletions binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"sources": [
"lib/addon.cc",
"lib/accessors.cc",
"lib/pattern.cc",
"lib/util.cc",
"lib/new.cc",
"lib/exec.cc",
Expand All @@ -14,6 +15,7 @@
"lib/search.cc",
"lib/split.cc",
"lib/to_string.cc",
"lib/set.cc",
"vendor/re2/re2/bitmap256.cc",
"vendor/re2/re2/bitstate.cc",
"vendor/re2/re2/compile.cc",
Expand Down
4 changes: 4 additions & 0 deletions lib/addon.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "./wrapped_re2.h"
#include "./wrapped_re2_set.h"

static NAN_METHOD(GetUtf8Length)
{
Expand Down Expand Up @@ -75,6 +76,9 @@ v8::Local<v8::Function> WrappedRE2::Init()
Nan::SetAccessor(instanceTemplate, Nan::New("internalSource").ToLocalChecked(), GetInternalSource);

auto ctr = Nan::GetFunction(tpl).ToLocalChecked();
auto setCtr = WrappedRE2Set::Init();

Nan::Set(ctr, Nan::New("Set").ToLocalChecked(), setCtr);

// properties

Expand Down
247 changes: 1 addition & 246 deletions lib/new.cc
Original file line number Diff line number Diff line change
@@ -1,258 +1,13 @@
#include "./wrapped_re2.h"
#include "./util.h"
#include "./pattern.h"

#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

// Uppercase hexadecimal digit characters indexed by nibble value (0-15).
// Used when emitting two-digit \xHH escapes during pattern translation.
static char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

// Reports whether `ch` is one of the ASCII capital letters 'A' through 'Z'.
inline bool isUpperCaseAlpha(char ch)
{
    if (ch < 'A')
    {
        return false;
    }
    return ch <= 'Z';
}

// Reports whether `ch` is an ASCII hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'.
// Letters beyond F/f are rejected so that non-hex text following "\u" is not
// mistaken for part of a code point.
inline bool isHexadecimal(char ch)
{
    return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F') || ('a' <= ch && ch <= 'f');
}

// Maps the long Unicode General_Category names that may appear inside
// \p{...} / \P{...} to the short aliases written into the translated pattern.
// Left non-const because translateRegExp reads it through operator[].
static std::map<std::string, std::string> unicodeClasses = {
// Letters
{"Uppercase_Letter", "Lu"},
{"Lowercase_Letter", "Ll"},
{"Titlecase_Letter", "Lt"},
{"Cased_Letter", "LC"},
{"Modifier_Letter", "Lm"},
{"Other_Letter", "Lo"},
{"Letter", "L"},
// Marks
{"Nonspacing_Mark", "Mn"},
{"Spacing_Mark", "Mc"},
{"Enclosing_Mark", "Me"},
{"Mark", "M"},
// Numbers
{"Decimal_Number", "Nd"},
{"Letter_Number", "Nl"},
{"Other_Number", "No"},
{"Number", "N"},
// Punctuation
{"Connector_Punctuation", "Pc"},
{"Dash_Punctuation", "Pd"},
{"Open_Punctuation", "Ps"},
{"Close_Punctuation", "Pe"},
{"Initial_Punctuation", "Pi"},
{"Final_Punctuation", "Pf"},
{"Other_Punctuation", "Po"},
{"Punctuation", "P"},
// Symbols
{"Math_Symbol", "Sm"},
{"Currency_Symbol", "Sc"},
{"Modifier_Symbol", "Sk"},
{"Other_Symbol", "So"},
{"Symbol", "S"},
// Separators
{"Space_Separator", "Zs"},
{"Line_Separator", "Zl"},
{"Paragraph_Separator", "Zp"},
{"Separator", "Z"},
// Other
{"Control", "Cc"},
{"Format", "Cf"},
{"Surrogate", "Cs"},
{"Private_Use", "Co"},
{"Unassigned", "Cn"},
{"Other", "C"},
};

// Rewrites a JavaScript-style pattern into the form handed to the regex engine.
//
// data / size  pattern bytes and their length (no NUL terminator required).
// multiline    when true and the pattern is non-empty, "(?m)" is prepended.
// buffer       output; overwritten with the rewritten pattern plus a trailing
//              '\0' only when a rewrite was necessary.
//
// Returns true if anything had to be changed (buffer is then valid), false if
// the input can be used as-is (buffer is left untouched).
static bool translateRegExp(const char *data, size_t size, bool multiline, std::vector<char> &buffer)
{
std::string result;
bool changed = false;

// An empty pattern becomes an empty non-capturing group; otherwise the
// multiline flag is expressed as an inline "(?m)" prefix.
if (!size)
{
result = "(?:)";
changed = true;
}
else if (multiline)
{
result = "(?m)";
changed = true;
}

// Note: `i` is advanced inside the body, by a different amount per construct.
for (size_t i = 0; i < size;)
{
char ch = data[i];
if (ch == '\\')
{
// A backslash that is the very last byte skips this branch and is copied
// verbatim by the generic code at the bottom of the loop.
if (i + 1 < size)
{
ch = data[i + 1];
switch (ch)
{
case '\\':
// Escaped backslash: copied through unchanged.
result += "\\\\";
i += 2;
continue;
case 'c':
// Control escape \cX with X in 'A'..'Z' becomes \xHH where the value is
// X - '@'. Any other \c sequence is passed through as "\c".
if (i + 2 < size)
{
ch = data[i + 2];
if (isUpperCaseAlpha(ch))
{
result += "\\x";
result += hex[((ch - '@') / 16) & 15];
result += hex[(ch - '@') & 15];
i += 3;
changed = true;
continue;
}
}
result += "\\c";
i += 2;
continue;
case 'u':
if (i + 2 < size)
{
ch = data[i + 2];
if (isHexadecimal(ch))
{
// \uH... : the first accepted digit plus up to three more are wrapped
// as \x{...}; scanning stops at the first character isHexadecimal rejects.
result += "\\x{";
result += ch;
i += 3;
for (size_t j = 0; j < 3 && i < size; ++i, ++j)
{
ch = data[i];
if (!isHexadecimal(ch))
{
break;
}
result += ch;
}
result += '}';
changed = true;
continue;
}
else if (ch == '{')
{
// \u{...} : only "\u" is replaced by "\x"; the braces and their content
// are copied by the following loop iterations.
result += "\\x";
i += 2;
changed = true;
continue;
}
}
result += "\\u";
i += 2;
continue;
case 'p':
case 'P':
// Unicode property escape \p{Name} / \P{Name}. Requires a closing '}';
// otherwise the backslash and letter are copied through unchanged.
if (i + 2 < size) {
if (data[i + 2] == '{') {
size_t j = i + 3;
while (j < size && data[j] != '}') ++j;
if (j < size) {
result += "\\";
result += data[i + 1];
std::string name(data + i + 3, j - i - 3);
// Long category names are replaced by their short alias; a leading
// "Script=" or "sc=" is stripped, leaving the bare script name.
if (unicodeClasses.find(name) != unicodeClasses.end()) {
name = unicodeClasses[name];
} else if (name.size() > 7 && !strncmp(name.c_str(), "Script=", 7)) {
name = name.substr(7);
} else if (name.size() > 3 && !strncmp(name.c_str(), "sc=", 3)) {
name = name.substr(3);
}
// One-letter names are written without braces (e.g. \pL).
if (name.size() == 1) {
result += name;
} else {
result += "{";
result += name;
result += "}";
}
i = j + 1;
changed = true;
continue;
}
}
}
result += "\\";
result += data[i + 1];
i += 2;
continue;
default:
// Any other escape: copy the backslash and the whole following character.
// NOTE(review): getUtf8CharSize is defined elsewhere; presumably it returns
// the byte length of the UTF-8 sequence starting with `ch` — confirm.
result += "\\";
size_t sym_size = getUtf8CharSize(ch);
result.append(data + i + 1, sym_size);
i += sym_size + 1;
continue;
}
}
}
else if (ch == '/')
{
// A slash not preceded by a backslash is escaped.
result += "\\/";
i += 1;
changed = true;
continue;
}
else if (ch == '(' && i + 2 < size && data[i + 1] == '?' && data[i + 2] == '<')
{
// "(?<" not followed by '=' or '!' is rewritten to "(?P<"; "(?<=" and "(?<!"
// fall through and are copied unchanged.
if (i + 3 >= size || (data[i + 3] != '=' && data[i + 3] != '!'))
{
result += "(?P<";
i += 3;
changed = true;
continue;
}
}
// Generic path: copy one character (sym_size bytes) verbatim.
size_t sym_size = getUtf8CharSize(ch);
result.append(data + i, sym_size);
i += sym_size;
}

// Nothing was rewritten: the caller keeps using the original bytes.
if (!changed)
{
return false;
}

// Publish the rewritten pattern as a NUL-terminated byte sequence.
buffer.resize(0);
buffer.insert(buffer.end(), result.data(), result.data() + result.size());
buffer.push_back('\0');

return true;
}

// Produces a copy of the pattern in which every '/' that is not already
// escaped is written as "\/". A slash counts as escaped when it is
// immediately preceded by an odd number of backslashes. An empty pattern
// yields "(?:)".
static std::string escapeRegExp(const char *data, size_t size)
{
    std::string escaped;
    if (size == 0)
    {
        escaped = "(?:)";
    }

    size_t backslashRun = 0; // length of the backslash run directly before the cursor
    size_t pos = 0;
    while (pos < size)
    {
        const char current = data[pos];
        if (current == '\\')
        {
            ++backslashRun;
        }
        else
        {
            const bool alreadyEscaped = (backslashRun & 1) != 0;
            backslashRun = 0;
            if (current == '/' && !alreadyEscaped)
            {
                escaped += "\\/";
                ++pos;
                continue;
            }
        }

        // Copy the whole character starting at the cursor verbatim.
        const size_t width = getUtf8CharSize(current);
        escaped.append(data + pos, width);
        pos += width;
    }

    return escaped;
}

// Out-of-class definition of the WrappedRE2 static flag; it starts out false.
// NOTE(review): presumably flipped once the message below has been reported,
// so the warning is shown only once — the code using it is outside this view.
bool WrappedRE2::alreadyWarnedAboutUnicode = false;

// Text of the deprecation warning about constructing RE2 without the "u" flag.
static const char *deprecationMessage = "BMP patterns aren't supported by node-re2. An implicit \"u\" flag is assumed by the RE2 constructor. In a future major version, calling the RE2 constructor without the \"u\" flag may become forbidden, or cause a different behavior. Please see https://github.com/uhop/node-re2/issues/21 for more information.";
Expand Down
Loading