Skip to content

Commit ca8ec58

Browse files
authored
Use const pointers for const tensors (#450)
* Use const pointers for const tensors

  Signed-off-by: Raasz, Pawel <[email protected]>

* Fix bug after refactoring

  Signed-off-by: Raasz, Pawel <[email protected]>

---------

Signed-off-by: Raasz, Pawel <[email protected]>
1 parent cc2fccf commit ca8ec58

File tree

5 files changed

+36
-49
lines changed

5 files changed

+36
-49
lines changed

src/bytes_to_chars.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -289,10 +289,7 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
289289
auto chars = inputs[4].data<const uint8_t>();
290290

291291
const bool has_skips = inputs.size() == 6;
292-
bool * skips;
293-
if (has_skips) {
294-
skips = inputs[5].data<bool>();
295-
};
292+
auto skips = has_skips ? inputs[5].data<bool>() : nullptr;
296293

297294
// Set output shapes
298295
outputs[0] = inputs[0];
@@ -340,4 +337,3 @@ bool BytesToChars::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
340337
outputs[4].set_shape({char_pointer});
341338
return true;
342339
}
343-

src/regex_split.cpp

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@ const std::map<std::string, RegexSplit::SplitMode> split_modes_map = {
2525

2626
void RegexSplit::compile_pattern_if_necessary(std::string split_pattern) const {
2727
m_split_mode = split_modes_map.at(m_behaviour);
28-
28+
2929
if (m_search_pattern_pcre2) {
3030
return;
3131
}
32-
32+
3333
if (m_behaviour == "contiguous" && split_pattern[split_pattern.length() - 1] != '+') {
3434
std::stringstream tmp_stream;
3535
tmp_stream << "(" << split_pattern << ")+";
@@ -149,7 +149,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
149149
std::lock_guard<std::mutex> lock(m_mutex);
150150
compile_pattern_if_necessary(split_pattern);
151151
}
152-
152+
153153
auto get_next_match = [this](const std::string& str, size_t curr_start) -> std::optional<std::pair<size_t, size_t>>{
154154
auto match = this->m_search_pattern_pcre2->match(str, curr_start);
155155
if (match.first != SIZE_MAX && match.first != match.second) {
@@ -186,15 +186,15 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
186186
auto ends = inputs[3].data<const int32_t>();
187187
auto chars = inputs[4].data<const uint8_t>();
188188
const size_t num_rows = inputs[0].get_size();
189-
bool * skips;
190-
bool init_skips = false;
189+
const bool *skips;
190+
Tensor skips_t, new_skips_t;
191+
191192
if (has_skips) {
192193
skips = inputs[5].data<bool>();
193194
outputs[5].set_shape(Shape{max_shape});
194195
} else {
195-
skips = new bool[num_rows];
196-
init_skips = true;
197-
std::fill(skips, skips + num_rows, false);
196+
skips_t = Tensor(element::boolean, Shape{num_rows});
197+
skips = std::fill_n(skips_t.data<bool>(), num_rows, false) - num_rows;
198198
};
199199

200200
outputs[0].set_shape(inputs[0].get_shape());
@@ -213,7 +213,8 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
213213
if (has_skips) {
214214
new_skips = outputs[5].data<bool>();
215215
} else {
216-
new_skips = new bool[max_shape];
216+
new_skips_t = Tensor(element::boolean, Shape{max_shape});
217+
new_skips = new_skips_t.data<bool>();
217218
};
218219
int32_t ragged_offset = 0;
219220

@@ -234,7 +235,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
234235
} else {
235236
size_t start = 0;
236237
uint32_t num_splits = 0;
237-
238+
238239
size_t last_begin = -1;
239240
auto add_split = [&](int begin, int end, bool invert) {
240241
switch (m_split_mode) {
@@ -274,14 +275,14 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
274275
end = str.length();
275276
};
276277
new_ends[ragged_offset++] = begins[ragged_col] + end;
277-
278+
278279
++num_splits;
279280
};
280281

281282
std::optional<std::pair<size_t, size_t>> match;
282283
while ((match = get_next_match(str, start)) != std::nullopt) {
283284
auto [curr_start, curr_end] = *match;
284-
285+
285286
if (curr_start != start) {
286287
if (has_skips) {
287288
new_skips[ragged_offset] = false;
@@ -314,10 +315,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
314315
if (has_skips) {
315316
outputs[5].set_shape({size_t(ragged_offset)});
316317
};
317-
if (init_skips) {
318-
delete[] skips;
319-
delete[] new_skips;
320-
};
321318

322319
return true;
323320
}

src/special_tokens_split.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,16 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
7474
const size_t batch_size = inputs[0].get_size();
7575
const size_t num_chars = inputs[4].get_size();
7676

77-
bool * skips;
78-
bool init_skips = false;
77+
Tensor skips_alternative;
78+
const bool *skips;
7979
if (has_skips) {
8080
skips = inputs[5].data<bool>();
8181
outputs[5].set_shape(Shape{num_chars});
8282
} else {
8383
outputs[5].set_shape(Shape{num_chars});
84-
skips = new bool[batch_size];
85-
init_skips = true;
86-
std::fill(skips, skips + batch_size, false);
84+
skips_alternative = Tensor(element::boolean, Shape{batch_size});
85+
skips = std::fill_n(skips_alternative.data<bool>(), batch_size, false) -
86+
batch_size;
8787
};
8888

8989
outputs[0].set_shape(inputs[0].get_shape());
@@ -145,8 +145,5 @@ bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVec
145145
outputs[3].set_shape({size_t(ragged_offset)});
146146
outputs[5].set_shape({size_t(ragged_offset)});
147147

148-
if (init_skips) {
149-
delete[] skips;
150-
};
151148
return true;
152149
}

src/utf8_validate.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,26 +18,26 @@ void UTF8Validate::validate_and_infer_types() {
1818
bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
1919
auto begins = inputs[0].data<int32_t>();
2020
auto ends = inputs[1].data<int32_t>();
21-
uint8_t* bytes = inputs[2].data<uint8_t>();
21+
auto bytes = inputs[2].data<uint8_t>();
2222
auto begins_shape = inputs[0].get_shape();
2323
auto chars_shape = inputs[2].get_shape();
24-
24+
2525
const unsigned char replacement_symbol[] = {0xEF, 0xBF, 0xBD}; // UTF-8 encoding for "�"
2626
outputs[0].set_shape(begins_shape);
2727
outputs[1].set_shape(begins_shape);
28-
28+
2929
// One byte can be replaced by 3 bytes at most,
3030
// therefore need to allocate more space
3131
size_t last_axis = chars_shape.size() - 1;
3232
chars_shape[last_axis] = chars_shape[last_axis] * 3;
3333
outputs[2].set_shape(chars_shape);
34-
34+
3535
auto out_begins = outputs[0].data<int32_t>();
3636
auto out_ends = outputs[1].data<int32_t>();
3737
auto out_bytes = outputs[2].data<uint8_t>();
3838

3939
// UTF-8 code points should not intersect:
40-
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
40+
// if 2 byte object has code point < 0x80 then it's not valid 2 byte utf-8,
4141
// even if it has a valid bit mask.
4242
const uint32_t code_point_starts[4] = {0x0, 0x80, 0x800, 0x10000};
4343
uint32_t utf_code_point;
@@ -49,7 +49,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
4949
// Flag indicating whether UTF8 symbol is complete: true means it's complete, false means we expect continuation.
5050
// bool new_symbol_flag = true;
5151
bytes_to_consume = 0;
52-
52+
5353
out_begins[i] = out_idx;
5454
for (size_t j = begins[i]; j < ends[i]; j += 1) {
5555
// Beginning of the symbol.
@@ -70,7 +70,7 @@ bool UTF8Validate::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
7070
utf_code_point = (0b1111 & bytes[j]) << 6 * bytes_to_consume;
7171
continue;
7272
} else if (!bytes_to_consume && bytes[j] >> 3 == 0b11110) {
73-
num_bytes = 4;
73+
num_bytes = 4;
7474
bytes_to_consume = 3;
7575
utf_code_point = (0b111 & bytes[j]) << 6 * bytes_to_consume;
7676
continue;

src/utils.cpp

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ void check_string_input(const Node* node, size_t input_index) {
3636
void check_string_scalar_input(const Node* node, size_t input_index) {
3737
auto shape = node->get_input_partial_shape(input_index);
3838
auto element_type = node->get_input_element_type(input_index);
39-
40-
#if false && USE_STRING_TENSORS
39+
40+
#if false && USE_STRING_TENSORS
4141
// This block is not used when we convert ops to decomposed representation (and we really do)
4242
OPENVINO_ASSERT(
4343
(element_type == element::dynamic || element_type == element::string) &&
@@ -117,7 +117,7 @@ void unpack_strings_to_tensors (const std::string* strings, const Shape shape, o
117117
}
118118

119119
void override_parameter (std::shared_ptr<ov::Node> node, element::Type type, const PartialShape& shape) {
120-
if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
120+
if (auto parameter = std::dynamic_pointer_cast<Parameter>(node)) {
121121
// TODO: Apply this change conditionally based on real Parameter value
122122
if (getenv_bool("OPENVINO_TOKENIZERS_PRINT_DEBUG_INFO", false)) {
123123
std::cerr << "Overriding Parameter element_type to " << type << " and shape " << shape << "\n";
@@ -170,10 +170,7 @@ bool evaluate_normalization_helper (ov::TensorVector& outputs, const ov::TensorV
170170
auto ends = inputs[1].data<const int32_t>();
171171
auto chars = inputs[2].data<const uint8_t>();
172172

173-
bool * skips;
174-
if (has_skips) {
175-
skips = inputs[3].data<bool>();
176-
};
173+
auto skips = has_skips ? inputs[3].data<bool>() : nullptr;
177174

178175
// Set output shapes
179176
outputs[0].set_shape(inputs[0].get_shape());
@@ -276,7 +273,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
276273
}
277274
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(m_compiled, NULL);
278275
PCRE2_SIZE subject_length = orig_str.size();
279-
276+
280277
// Check if the string matches the pattern
281278
int num_matches = pcre2_match(
282279
m_compiled,
@@ -290,7 +287,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
290287
pcre2_match_data_free(match_data);
291288
return orig_str;
292289
}
293-
290+
294291
// Allocate dynamically since the length depends on the lengths of the input and replace strings.
295292
// Allocated memory will be freed at the exit from function.
296293
size_t buffer_length = sizeof(PCRE2_UCHAR) * 4 * (subject_length + num_matches * replace_pattern.size());
@@ -302,7 +299,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
302299
pcre2_match_data_free(match_data);
303300
return orig_str;
304301
}
305-
302+
306303
int rc = pcre2_substitute(
307304
m_compiled,
308305
(PCRE2_SPTR) orig_str.c_str(), orig_str.size(),
@@ -332,7 +329,7 @@ std::string PCRE2Wrapper::substitute(const std::string& orig_str,
332329
}
333330
auto res = std::string(reinterpret_cast<char*>(buffer), buffer_length);
334331
std::free(buffer);
335-
pcre2_match_data_free(match_data);
332+
pcre2_match_data_free(match_data);
336333
return res;
337334
}
338335

@@ -353,7 +350,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
353350
);
354351

355352
if (match_result < 0) {
356-
pcre2_match_data_free(match_data);
353+
pcre2_match_data_free(match_data);
357354
return {SIZE_MAX, SIZE_MAX};
358355
}
359356

@@ -363,7 +360,7 @@ std::pair<size_t, size_t> PCRE2Wrapper::match(const std::string& str, size_t cur
363360
std::pair<size_t, size_t> res = {ovector[0], ovector[1]};
364361

365362
// Free only after copying results from match_data to res;
366-
pcre2_match_data_free(match_data);
363+
pcre2_match_data_free(match_data);
367364
return res;
368365
}
369366

0 commit comments

Comments
 (0)