1515#include < libunicode/codepoint_properties_loader.h>
1616#include < libunicode/convert.h>
1717#include < libunicode/grapheme_segmenter.h>
18+ #include < libunicode/run_segmenter.h>
1819#include < libunicode/ucd.h>
1920#include < libunicode/ucd_enums.h>
21+ #include < libunicode/ucd_fmt.h>
2022#include < libunicode/ucd_ostream.h>
2123#include < libunicode/utf8_grapheme_segmenter.h>
2224
@@ -33,27 +35,62 @@ using namespace std;
3335namespace
3436{
3537
36- std::string quotedAndEscaped (std::string const & text)
// Returns a copy of `text` in which every ASCII control byte (0x00..0x1F)
// is replaced by a `\xNN` escape (two zero-padded lowercase hex digits).
// All other bytes, including the bytes of multi-byte UTF-8 sequences, are
// copied through unchanged.
std::string escapeControlCodes(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; only integers are affected by std::hex,
    // and the fill character only matters where a width is set below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // Compare as unsigned: where `char` is signed, bytes >= 0x80 are
        // negative and would otherwise be mistaken for control codes.
        auto const byte = static_cast<unsigned char>(ch);
        if (byte < 0x20)
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
        else
            result << ch;
    }
    return result.str();
}
50+
// Returns a copy of `text` in which every byte that is not printable
// according to std::isprint, as well as the double-quote character, is
// replaced by a `\xNN` escape (two zero-padded lowercase hex digits).
// Printable bytes other than `"` are copied through unchanged.
std::string escaped(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; the fill character only matters where a
    // width is set below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // std::isprint requires a value representable as unsigned char;
        // passing a negative `char` is undefined behaviour.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte) && ch != '"')
            result << ch;
        else
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
    }
    return result.str();
}
5063
64+ std::string quotedAndEscaped (std::string const & text)
65+ {
66+ return ' "' + escaped (text) + ' "' ;
67+ }
68+
// Prints the command line synopsis and returns `exitCode` unchanged, so that
// callers can write `return printUsage(...)`.
//
// The text goes to stdout when `exitCode` is EXIT_SUCCESS (explicitly
// requested help) and to stderr otherwise, so that a usage error does not
// pollute output that is being piped or captured.
int printUsage(int exitCode)
{
    auto& out = exitCode == EXIT_SUCCESS ? std::cout : std::cerr;
    out << "unicode-query [properties] U+XXXX [...]\n"
        << "              gc [-e] [--] \"Text string\"\n"
        << "              runs [-e] [--] \"Text string\"\n"
        << "              help\n";
    return exitCode;
}
5676
// Passes `text` through when stdout is a terminal and returns an empty view
// otherwise. The terminal check is performed once, on first call; on Windows
// builds the result is always an empty view.
std::string_view seq(std::string_view const& text)
{
#if !defined(_WIN32)
    static bool const stdoutIsTerminal = isatty(STDOUT_FILENO);
#else
    static bool const stdoutIsTerminal = false;
#endif
    return stdoutIsTerminal ? text : std::string_view {};
}
92+
93+ // {{{ properties
5794optional<char32_t > parseChar (std::string_view text)
5895{
5996 if (text.size () >= 3 && text[0 ] == ' U' && text[1 ] == ' +' )
@@ -116,7 +153,7 @@ void showCodepointProperties(char32_t codepoint)
116153 cout << " Emoji Segmentation Category : " << properties.emoji_segmentation_category << ' \n ' ;
117154 cout << " Grapheme Cluster Break : " << properties.grapheme_cluster_break << ' \n ' ;
118155 cout << " \n " ;
119- // clang-format off
156+ // clang-format on
120157}
121158
122159int showCodepointProperties (int argc, char const * argv[])
@@ -134,7 +171,100 @@ int showCodepointProperties(int argc, char const* argv[])
134171 }
135172 return EXIT_SUCCESS;
136173}
174+ // }}}
175+
176+ // {{{ grapheme clusters
177+ int showGraphemeClusters (int argc, char const * argv[])
178+ {
179+ int i = 0 ;
180+ bool escapeText = false ;
181+ for (; i < argc; ++i)
182+ {
183+ auto const arg = string_view (argv[i]);
184+ if (arg == " -e" )
185+ escapeText = true ;
186+ else if (arg == " --" )
187+ {
188+ ++i;
189+ break ;
190+ }
191+ else if (arg.starts_with (' -' ))
192+ return printUsage (EXIT_FAILURE);
193+ else
194+ break ;
195+ }
196+ for (; i < argc; ++i)
197+ {
198+ auto const text = string_view (argv[i]);
199+ auto const gcs = unicode::utf8_grapheme_segmenter (text);
200+ for (auto const & gc: gcs)
201+ {
202+ auto const text32 = std::u32string_view (gc);
203+ auto const text8 = unicode::convert_to<char >(text32);
204+ std::cout << (escapeText ? escaped (text8) : escapeControlCodes (text8)) << " \n " ;
205+ }
206+ }
207+ return EXIT_SUCCESS;
208+ }
209+ // }}}
137210
211+ // {{{ runs
212+ int showRuns (istream& in, bool escapeRunText)
213+ {
214+ string bytes ((istreambuf_iterator<char >(in)), istreambuf_iterator<char >());
215+ u32string const codepoints = unicode::convert_to<char32_t >(string_view (bytes));
216+
217+ unicode::run_segmenter rs (codepoints);
218+ unicode::run_segmenter::range run;
219+
220+ while (rs.consume (unicode::out (run)))
221+ {
222+ auto const script = get<unicode::Script>(run.properties );
223+ auto const presentationStyle = get<unicode::PresentationStyle>(run.properties );
224+
225+ auto const text32 = u32string_view (codepoints.data () + run.start , run.end - run.start );
226+ auto const text8 = unicode::convert_to<char >(text32);
227+ auto const textEscaped = escapeRunText ? escaped (text8) : escapeControlCodes (text8);
228+
229+ cout << run.start << " -" << run.end - 1 << " (" << run.end - run.start << " ): " << script << " "
230+ << presentationStyle << " \n "
231+ << ' "' << seq (" \033 [32m" ) << textEscaped << seq (" \033 [m" )
232+ << " \"\n\n " ;
233+ }
234+
235+ return EXIT_SUCCESS;
236+ }
237+
238+ int showRuns (int argc, char const * argv[])
239+ {
240+ // [-e]
241+ int i = 0 ;
242+ bool escaped = false ;
243+ for (; i < argc; ++i)
244+ {
245+ auto const arg = string_view (argv[i]);
246+ if (arg == " -e" )
247+ escaped = true ;
248+ else if (arg == " --" )
249+ {
250+ ++i;
251+ break ;
252+ }
253+ else if (arg.starts_with (' -' ))
254+ return printUsage (EXIT_FAILURE);
255+ else
256+ break ;
257+ }
258+
259+ for (; i < argc; ++i)
260+ {
261+ auto in = std::istringstream (argv[i]);
262+ showRuns (in, escaped);
263+ }
264+
265+ return EXIT_SUCCESS;
266+ }
267+ // }}}
138268} // namespace
139269
140270// Example usage:
@@ -154,6 +284,18 @@ int main(int argc, char const* argv[])
154284 if (string_view (argv[argIndex]) == " help" )
155285 return printUsage (EXIT_SUCCESS);
156286
287+ if (string_view (argv[argIndex]) == " runs" )
288+ {
289+ ++argIndex;
290+ return showRuns (argc - argIndex, argv + argIndex);
291+ }
292+
293+ if (string_view (argv[argIndex]) == " gc" )
294+ {
295+ ++argIndex;
296+ return showGraphemeClusters (argc - argIndex, argv + argIndex);
297+ }
298+
157299 if (string_view (argv[argIndex]) == " properties" )
158300 ++argIndex;
159301
0 commit comments