Skip to content

Commit d05a2db

Browse files
wip
Signed-off-by: Christian Parpart <[email protected]>
1 parent 2207a5a commit d05a2db

File tree

3 files changed

+164
-18
lines changed

3 files changed

+164
-18
lines changed

src/libunicode/grapheme_line_segmenter.cpp

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,27 +84,25 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
8484
if (_buffer.empty())
8585
return result_type { .text = _buffer.substr(0, 0), .width = 0 };
8686

87-
char const* start = _next;
88-
char const* const resultStart = _utf8.expectedLength ? start - _utf8.currentLength : start;
87+
if (_next == _buffer.data() + _buffer.size())
88+
return result_type { .text = _buffer.substr(0, 0), .width = 0 };
89+
90+
// Points to the beginning of a grapheme cluster.
91+
char const* const resultStart = _utf8.expectedLength ? _next - _utf8.currentLength : _next;
8992

9093
// Number of bytes used in the current line.
9194
size_t totalByteCountProcessed = 0;
9295

9396
// Number of width used in the current line.
9497
unsigned totalWidthProcessed = 0;
9598

96-
auto const makeResult = [&]() -> result_type {
97-
return result_type { .text = std::string_view(resultStart, totalByteCountProcessed),
98-
.width = totalWidthProcessed };
99-
};
100-
10199
enum class State
102100
{
103101
ASCII,
104102
ComplexUnicode,
105103
};
106104

107-
while (maxWidth > 0 && !_buffer.empty())
105+
while (maxWidth > 0 && _next != _buffer.data() + _buffer.size())
108106
{
109107
State const state =
110108
(_utf8.expectedLength != 0 || is_complex(_buffer.front())) ? State::ComplexUnicode : State::ASCII;
@@ -114,7 +112,8 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
114112
case State::ASCII: {
115113
auto const count = process_ascii(maxWidth);
116114
if (count == 0)
117-
return makeResult();
115+
return result_type { .text = std::string_view { resultStart, totalByteCountProcessed },
116+
.width = totalWidthProcessed };
118117
_events.on_ascii(_buffer.substr(0, count));
119118
maxWidth -= count;
120119
totalWidthProcessed += count;
@@ -127,7 +126,8 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
127126
if (sub.graphemeClusterCount == 0)
128127
{
129128
_next += sub.byteCount;
130-
return makeResult();
129+
return result_type { .text = std::string_view { resultStart, totalByteCountProcessed },
130+
.width = totalWidthProcessed };
131131
}
132132
maxWidth -= sub.graphemeClusterCount;
133133
totalWidthProcessed += sub.graphemeClusterCount;
@@ -138,7 +138,8 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
138138
}
139139
}
140140

141-
return makeResult();
141+
return result_type { .text = std::string_view { resultStart, totalByteCountProcessed },
142+
.width = totalWidthProcessed };
142143
}
143144

144145
unsigned grapheme_line_segmenter::process_ascii(unsigned maxWidth) const noexcept
@@ -231,10 +232,12 @@ auto grapheme_line_segmenter::process_complex_unicode(unsigned maxWidth) noexcep
231232
{
232233
auto const prevCodepoint = _lastCodepointHint;
233234
auto const nextCodepoint = std::get<Success>(result).value;
234-
auto const nextWidth = std::max(_currentClusterWidth, static_cast<unsigned>(unicode::width(nextCodepoint)));
235+
auto const nextWidth =
236+
std::max(_currentClusterWidth, static_cast<unsigned>(unicode::width(nextCodepoint)));
235237
_lastCodepointHint = nextCodepoint;
236238
if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint))
237239
{
240+
printf("breakable: 0x%04X -> 0x%04X\n", prevCodepoint, nextCodepoint);
238241
// Flush out current grapheme cluster's East Asian Width.
239242
consumedWidth += _currentClusterWidth;
240243
maxWidth -= _currentClusterWidth;
@@ -259,6 +262,7 @@ auto grapheme_line_segmenter::process_complex_unicode(unsigned maxWidth) noexcep
259262
}
260263
else
261264
{
265+
printf("NOT breakable: 0x%04X -> 0x%04X\n", prevCodepoint, nextCodepoint);
262266
lastClusterEnd = input;
263267
// Increase width on VS16 but do not decrease on VS15.
264268
if (nextCodepoint == 0xFE0F) // VS16

src/libunicode/grapheme_line_segmenter_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
#include <fmt/format.h>
1919

20-
#include <catch2/catch.hpp>
20+
#include <catch2/catch_test_macros.hpp>
2121

2222
#include <string_view>
2323
#include <variant>

src/tools/unicode-query.cpp

Lines changed: 147 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
#include <libunicode/codepoint_properties_loader.h>
1616
#include <libunicode/convert.h>
1717
#include <libunicode/grapheme_segmenter.h>
18+
#include <libunicode/run_segmenter.h>
1819
#include <libunicode/ucd.h>
1920
#include <libunicode/ucd_enums.h>
21+
#include <libunicode/ucd_fmt.h>
2022
#include <libunicode/ucd_ostream.h>
2123
#include <libunicode/utf8_grapheme_segmenter.h>
2224

@@ -33,27 +35,62 @@ using namespace std;
3335
namespace
3436
{
3537

36-
std::string quotedAndEscaped(std::string const& text)
38+
/// Replaces every ASCII control byte (0x00..0x1F) in @p text with a `\xNN` escape.
///
/// All other bytes are copied verbatim. Bytes are classified by their *unsigned* value:
/// with a signed `char`, the bytes of multi-byte UTF-8 sequences (>= 0x80) are negative
/// and would otherwise compare less than 0x20 and be escaped as well.
///
/// @param text  Arbitrary bytes, typically UTF-8 encoded.
/// @return      Copy of @p text with control bytes written as two zero-padded hex digits.
std::string escapeControlCodes(std::string const& text)
{
    auto result = std::stringstream {};
    // Both flags are sticky; the zero fill only takes effect where setw() is applied below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        auto const byte = static_cast<unsigned char>(ch);
        if (byte < 0x20)
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
        else
            result << ch;
    }
    return result.str();
}
50+
51+
/// Escapes every byte of @p text that is not printable (per std::isprint), as well as
/// the double quote character, as a `\xNN` sequence with two zero-padded hex digits.
///
/// Printable bytes other than `"` are copied verbatim; note that this includes the
/// backslash itself.
///
/// @param text  Arbitrary bytes, typically UTF-8 encoded.
/// @return      The escaped copy of @p text (without surrounding quotes).
std::string escaped(std::string const& text)
{
    auto result = std::stringstream {};
    // Both flags are sticky; the zero fill only takes effect where setw() is applied below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // std::isprint requires a value representable as unsigned char (or EOF);
        // passing a negative char is undefined behaviour.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte) && ch != '"')
            result << ch;
        else
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
    }
    return result.str();
}
}
5063

64+
std::string quotedAndEscaped(std::string const& text)
65+
{
66+
return '"' + escaped(text) + '"';
67+
}
68+
5169
/// Writes the command line synopsis to stdout.
///
/// @param exitCode  Value handed back unchanged, so callers can `return printUsage(...)`.
/// @return          @p exitCode.
int printUsage(int exitCode)
{
    // Adjacent literals are concatenated by the compiler into a single string.
    std::cout << "unicode-query [properties] U+XXXX [...]\n"
                 " gc [-e] [--] \"Text string\"\n"
                 " runs [-e] [--] \"Text string\"\n";
    return exitCode;
}
5676

77+
/// Passes @p text through only when stdout is a terminal, otherwise yields an empty view.
///
/// The terminal check (isatty on STDOUT_FILENO) is evaluated once, on the first call;
/// on _WIN32 builds the result is always an empty view.
std::string_view seq(std::string_view const& text)
{
    static bool const stdoutIsTerminal = []() -> bool {
#if !defined(_WIN32)
        return isatty(STDOUT_FILENO) != 0;
#else
        return false;
#endif
    }();

    return stdoutIsTerminal ? text : std::string_view {};
}
92+
93+
// {{{ properties
5794
optional<char32_t> parseChar(std::string_view text)
5895
{
5996
if (text.size() >= 3 && text[0] == 'U' && text[1] == '+')
@@ -116,7 +153,7 @@ void showCodepointProperties(char32_t codepoint)
116153
cout << "Emoji Segmentation Category : " << properties.emoji_segmentation_category << '\n';
117154
cout << "Grapheme Cluster Break : " << properties.grapheme_cluster_break << '\n';
118155
cout << "\n";
119-
// clang-format off
156+
// clang-format on
120157
}
121158

122159
int showCodepointProperties(int argc, char const* argv[])
@@ -134,7 +171,100 @@ int showCodepointProperties(int argc, char const* argv[])
134171
}
135172
return EXIT_SUCCESS;
136173
}
174+
// }}}
175+
176+
// {{{ grapheme clusters
177+
int showGraphemeClusters(int argc, char const* argv[])
178+
{
179+
int i = 0;
180+
bool escapeText = false;
181+
for (; i < argc; ++i)
182+
{
183+
auto const arg = string_view(argv[i]);
184+
if (arg == "-e")
185+
escapeText = true;
186+
else if (arg == "--")
187+
{
188+
++i;
189+
break;
190+
}
191+
else if (arg.starts_with('-'))
192+
return printUsage(EXIT_FAILURE);
193+
else
194+
break;
195+
}
196+
for (; i < argc; ++i)
197+
{
198+
auto const text = string_view(argv[i]);
199+
auto const gcs = unicode::utf8_grapheme_segmenter(text);
200+
for (auto const& gc: gcs)
201+
{
202+
auto const text32 = std::u32string_view(gc);
203+
auto const text8 = unicode::convert_to<char>(text32);
204+
std::cout << (escapeText ? escaped(text8) : escapeControlCodes(text8)) << "\n";
205+
}
206+
}
207+
return EXIT_SUCCESS;
208+
}
209+
// }}}
137210

211+
// {{{ runs
212+
int showRuns(istream& in, bool escapeRunText)
213+
{
214+
string bytes((istreambuf_iterator<char>(in)), istreambuf_iterator<char>());
215+
u32string const codepoints = unicode::convert_to<char32_t>(string_view(bytes));
216+
217+
unicode::run_segmenter rs(codepoints);
218+
unicode::run_segmenter::range run;
219+
220+
while (rs.consume(unicode::out(run)))
221+
{
222+
auto const script = get<unicode::Script>(run.properties);
223+
auto const presentationStyle = get<unicode::PresentationStyle>(run.properties);
224+
225+
auto const text32 = u32string_view(codepoints.data() + run.start, run.end - run.start);
226+
auto const text8 = unicode::convert_to<char>(text32);
227+
auto const textEscaped = escapeRunText ? escaped(text8) : escapeControlCodes(text8);
228+
229+
cout << run.start << "-" << run.end - 1 << " (" << run.end - run.start << "): " << script << " "
230+
<< presentationStyle << "\n"
231+
<< '"' << seq("\033[32m") << textEscaped << seq("\033[m")
232+
<< "\"\n\n";
233+
}
234+
235+
return EXIT_SUCCESS;
236+
}
237+
238+
int showRuns(int argc, char const* argv[])
239+
{
240+
// [-e]
241+
int i = 0;
242+
bool escaped = false;
243+
for (; i < argc; ++i)
244+
{
245+
auto const arg = string_view(argv[i]);
246+
if (arg == "-e")
247+
escaped = true;
248+
else if (arg == "--")
249+
{
250+
++i;
251+
break;
252+
}
253+
else if (arg.starts_with('-'))
254+
return printUsage(EXIT_FAILURE);
255+
else
256+
break;
257+
}
258+
259+
for (; i < argc; ++i)
260+
{
261+
auto in = std::istringstream(argv[i]);
262+
showRuns(in, escaped);
263+
}
264+
265+
return EXIT_SUCCESS;
266+
}
267+
// }}}
138268
} // namespace
139269

140270
// Example usage:
@@ -154,6 +284,18 @@ int main(int argc, char const* argv[])
154284
if (string_view(argv[argIndex]) == "help")
155285
return printUsage(EXIT_SUCCESS);
156286

287+
if (string_view(argv[argIndex]) == "runs")
288+
{
289+
++argIndex;
290+
return showRuns(argc - argIndex, argv + argIndex);
291+
}
292+
293+
if (string_view(argv[argIndex]) == "gc")
294+
{
295+
++argIndex;
296+
return showGraphemeClusters(argc - argIndex, argv + argIndex);
297+
}
298+
157299
if (string_view(argv[argIndex]) == "properties")
158300
++argIndex;
159301

0 commit comments

Comments
 (0)