Skip to content

Commit 5896e54

Browse files
committed
Pretty printing unicode data
1 parent 28d2149 commit 5896e54

File tree

8 files changed

+2436
-2120
lines changed

8 files changed

+2436
-2120
lines changed

lib/inc/sys_string/impl/unicode/mappings.h

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,29 +20,22 @@
2020

2121
namespace sysstr::util::unicode
2222
{
23-
template<class Derived>
24-
class lookup
23+
class is_whitespace
2524
{
25+
private:
26+
static const char16_t chars[26];
2627
public:
28+
static constexpr char32_t max_char = U'\u3000';
29+
2730
static bool test(char32_t c) noexcept
2831
{
29-
if (c > Derived::max_char)
32+
if (c > max_char)
3033
return false;
31-
for(auto p = Derived::chars; *p; ++p)
34+
for(auto p = chars; *p; ++p)
3235
if (*p == c)
3336
return true;
3437
return false;
3538
}
36-
};
37-
38-
39-
class is_whitespace : public lookup<is_whitespace>
40-
{
41-
friend lookup<is_whitespace>;
42-
private:
43-
static const char16_t chars[26];
44-
public:
45-
static constexpr char32_t max_char = U'\u3000';
4639

4740
static constexpr size_t data_size = sizeof(chars);
4841
};

lib/src/unicode_mappings.cpp

Lines changed: 2352 additions & 2002 deletions
Large diffs are not rendered by default.

unicode/scripts/case_builder.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from textwrap import dedent
44
from typing import Optional
55

6-
from common import char_name, chars_len_in_utf16, indent_insert
6+
from common import chars_len_in_utf16, format_utf16_string, indent_insert
77
from table_builder import table_builder
88

99

@@ -186,21 +186,10 @@ class props
186186
'''
187187
return dedent(ret)
188188

189-
def __print_cased_data(self):
190-
ret = '\nu"'
191-
char_count = 0
192-
for char in self.__mapped_data:
193-
ret += char_name(char)
194-
char_count += 1
195-
if char_count > 0 and char_count % 16 == 0:
196-
ret += '"\nu"'
197-
ret += '"'
198-
return ret
199-
200189
def print_impl(self):
201190
ret = f'''
202191
const char16_t case_mapper::cased_data[{self.__cased_data_len}] =
203-
{indent_insert(self.__print_cased_data(), 12)};
192+
{indent_insert(format_utf16_string(self.__mapped_data), 12)};
204193
205194
{indent_insert(self.__builder.print_impl("case_mapper::lookup"), 8)}
206195
'''

unicode/scripts/common.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,20 @@
88
#
99

1010
import textwrap
11+
from math import log10
1112

1213
def char_name(code):
    r"""Return the universal-character-name escape for a code point.

    Code points below 0x10000 are written as ``\uXXXX`` (four uppercase hex
    digits); all others are written as ``\UXXXXXXXX`` (eight uppercase hex
    digits).

    Args:
        code: Integer code point to format.

    Returns:
        The escape sequence as a string, including the leading backslash.
    """
    if code < 0x10000:
        return f"\\u{code:04X}"
    return f"\\U{code:08X}"
18+
19+
def split_utf16(code):
    """Split a code point into its UTF-16 code units.

    Args:
        code: Integer code point.

    Returns:
        A 1-tuple ``(code,)`` when ``code`` is below 0x10000, otherwise a
        2-tuple ``(high, low)`` holding the surrogate pair.
    """
    if code < 0x10000:
        return (code,)
    # Surrogates encode the 20-bit offset above the BMP: the top 10 bits go
    # into the high surrogate, the bottom 10 bits into the low surrogate.
    offset = code - 0x10000
    return (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
1725

1826
def parse_char_range(text):
1927
char_range = [int(x, 16) for x in text.split('..')]
@@ -58,6 +66,39 @@ def chars_len_in_utf16(chars):
5866
ret += char_len_in_utf16(char)
5967
return ret
6068

69+
def format_utf16_string(chars):
    """Format code points as adjacent ``u"..."`` literals, one per line.

    Each code point is rendered with ``char_name``. A new literal is started
    once the escapes accumulated on the current line reach 100 characters.

    Args:
        chars: Iterable of integer code points.

    Returns:
        A string that starts with a newline, followed by one ``u"..."``
        literal per line. An empty input yields a single empty literal.
    """
    lines = []
    current = []
    line_len = 0
    for char in chars:
        # Break *before* appending, so a line that fills up on the very last
        # character does not leave a dangling empty u"" literal behind.
        if line_len >= 100:
            lines.append(''.join(current))
            current = []
            line_len = 0
        escaped = char_name(char)
        current.append(escaped)
        line_len += len(escaped)
    lines.append(''.join(current))
    return '\n' + '\n'.join(f'u"{line}"' for line in lines)
81+
82+
def format_array(values, ishex=False, bits=0, max_width=120):
    """Format integers as comma-separated, fixed-width, wrapped rows.

    Args:
        values: Iterable of integers to format.
        ishex: When true, entries are written as ``0x`` followed by the
            format spec returned by ``hex_format_for_bits(bits)``; otherwise
            as right-aligned decimals.
        bits: Bit width of the values; determines the entry width.
        max_width: Target maximum row width in characters. At least one
            entry is always placed on a row, even if it is wider than this.

    Returns:
        A string in which every row is preceded by a newline and entries
        within a row are separated by ``', '``. Empty input yields ``''``.
    """
    if ishex:
        fmt = '0x{:' + hex_format_for_bits(bits) + '}'
    else:
        # Decimal width of the largest value representable in `bits` bits,
        # computed with integers to stay clear of float rounding.
        digits = len(str((1 << bits) - 1))
        fmt = '{:>' + str(digits) + '}'

    entry_len = len(fmt.format(0)) + len(', ')
    # Clamp to 1 so that a narrow max_width cannot cause a division by zero.
    per_line = max(1, max_width // entry_len)

    cells = [fmt.format(value) for value in values]
    if not cells:
        return ''
    rows = [', '.join(cells[start:start + per_line])
            for start in range(0, len(cells), per_line)]
    # Rows are joined with ',\n' so no trailing blank precedes a line break.
    return '\n' + ',\n'.join(rows)
101+
61102
def indent_insert(text: str, count: int) -> str:
    """Indent every line of ``text`` for insertion into an indented template.

    All lines, including blank ones, are prefixed with ``count`` spaces.
    Leading whitespace of the whole result is then stripped, so the first
    non-blank line carries no indent of its own.

    Args:
        text: Possibly multi-line text to indent.
        count: Number of spaces to prefix each line with.

    Returns:
        The indented text with its leading whitespace removed.
    """
    prefix = ' ' * count
    # The always-true predicate makes textwrap.indent prefix blank lines too.
    indented = textwrap.indent(text, prefix, lambda line: True)
    return indented.lstrip()

unicode/scripts/genmappings.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,6 @@ def main():
318318
319319
namespace sysstr::util::unicode
320320
{{
321-
{indent_insert(lookup_builder.print_common_header(), 4)}
322-
323321
{indent_insert(whitespaces.print_header('is_whitespace'), 4)}
324322
325323
{indent_insert(case_info_builder.print_header(), 4)}

unicode/scripts/lookup_builder.py

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#
88

99
from textwrap import dedent
10-
from common import char_name, indent_insert
10+
from common import char_name, format_utf16_string, indent_insert
1111

1212
class lookup_builder:
1313

@@ -27,46 +27,24 @@ def generate(self):
2727
self.length += 1
2828
return self.length * 2
2929

30-
def __make_content(self):
31-
ret = 'u"'
32-
char_count = 0
33-
for char in self.chars:
34-
ret += char_name(char)
35-
char_count += 1
36-
if char_count > 0 and char_count % 16 == 0:
37-
ret += '"\nu"'
38-
ret += '"'
39-
return ret
40-
41-
@staticmethod
42-
def print_common_header():
43-
ret = '''
44-
template<class Derived>
45-
class lookup
46-
{
47-
public:
48-
static bool test(char32_t c) noexcept
49-
{
50-
if (c > Derived::max_char)
51-
return false;
52-
for(auto p = Derived::chars; *p; ++p)
53-
if (*p == c)
54-
return true;
55-
return false;
56-
}
57-
};
58-
'''
59-
return dedent(ret)
60-
6130
def print_header(self, name):
6231
ret = f'''
63-
class {name} : public lookup<{name}>
32+
class {name}
6433
{{
65-
friend lookup<{name}>;
6634
private:
6735
static const char16_t chars[{self.length}];
6836
public:
6937
static constexpr char32_t max_char = U'{char_name(self.max_char)}';
38+
39+
static bool test(char32_t c) noexcept
40+
{{
41+
if (c > max_char)
42+
return false;
43+
for(auto p = chars; *p; ++p)
44+
if (*p == c)
45+
return true;
46+
return false;
47+
}}
7048
7149
static constexpr size_t data_size = sizeof(chars);
7250
}};
@@ -76,6 +54,6 @@ class {name} : public lookup<{name}>
7654
def print_impl(self, name):
7755
ret = f'''
7856
const char16_t {name}::chars[] =
79-
{indent_insert(self.__make_content(), 12)};
57+
{indent_insert(format_utf16_string(self.chars), 12)};
8058
'''
8159
return dedent(ret)

unicode/scripts/norm_builder.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
from dataclasses import dataclass
44
from textwrap import dedent
55

6-
from common import indent_insert
7-
#from trie_builder import trie_builder
6+
from common import format_array, indent_insert
87
from table_builder import table_builder
98

109

@@ -356,34 +355,14 @@ class normalizer
356355
return dedent(ret)
357356

358357

359-
def __print_compositions(self):
360-
ret = '\n'
361-
val_count = 0
362-
for val in self.__compositions:
363-
ret += f'0x{val:08X}, '
364-
val_count += 1
365-
if val_count > 0 and val_count % 16 == 0:
366-
ret += '\n'
367-
return ret
368-
369-
def __print_values(self):
370-
ret = '\n'
371-
val_count = 0
372-
for val in self.__values:
373-
ret += f'0x{val:08X}, '
374-
val_count += 1
375-
if val_count > 0 and val_count % 16 == 0:
376-
ret += '\n'
377-
return ret
378-
379358
def print_impl(self):
380359
ret = f'''
381360
const uint32_t normalizer::compositions[] = {{
382-
{indent_insert(self.__print_compositions(), 12)}
361+
{indent_insert(format_array(self.__compositions, ishex=True, bits=32), 12)}
383362
}};
384363
385364
const uint32_t normalizer::values[] = {{
386-
{indent_insert(self.__print_values(), 12)}
365+
{indent_insert(format_array(self.__values, ishex=True, bits=32), 12)}
387366
}};
388367
389368
{indent_insert(self.__decomp_builder.print_impl("normalizer::lookup"), 8)}

unicode/scripts/table_builder.py

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
#
88

99
from textwrap import dedent
10-
from common import bytes_for_bits, type_for_bits, indent_insert
10+
import textwrap
11+
from common import bytes_for_bits, format_array, type_for_bits, indent_insert
1112

1213

1314
class table_builder:
@@ -133,30 +134,17 @@ def generate(self):
133134

134135
return total_size
135136

136-
def __make_nested(self, values):
137+
def __make_nested(self, values, bits=0):
137138
ret = ''
138139
for block_idx, block in enumerate(values):
139140
if block_idx > 0:
140141
ret += ','
141-
ret += '\n'
142-
ret += '{'
143-
for idx, val in enumerate(block):
144-
if idx > 0:
145-
ret += ', '
146-
#ret += f'0x{val:01X}'
147-
ret += f'{val}'
148-
ret += '}'
149-
return ret
150-
151-
def __make_simple(self, values):
152-
ret = '{'
153-
for idx, value in enumerate(values):
154-
if idx > 0:
155-
ret += ', '
156-
if idx % 32 == 0:
157-
ret += '\n'
158-
ret += f'{value}'
159-
ret += '}'
142+
ret += '\n{{'
143+
arr = format_array(block, bits=bits, max_width=200)
144+
if arr.find('\n') != -1:
145+
arr = '\n' + textwrap.indent(arr, ' ') + '\n'
146+
ret += arr
147+
ret += '}}'
160148
return ret
161149

162150
def print_header(self, name, values_enum_content):
@@ -227,22 +215,22 @@ class {name}
227215

228216
def print_impl(self, name):
229217
ret = f'''
230-
const std::array<{type_for_bits(self.__bits_for_ascii)}, {len(self.__ascii)}> {name}::ascii({{
231-
{indent_insert(self.__make_simple(self.__ascii), 12)}
232-
}});
218+
const std::array<{type_for_bits(self.__bits_for_ascii)}, {len(self.__ascii)}> {name}::ascii({{{{
219+
{indent_insert(format_array(self.__ascii, ishex=True, bits=self.__bits_for_ascii), 12)}
220+
}}}});
233221
'''
234222
for idx, values in enumerate(self.__values):
235223
if isinstance(values[0], int):
236224
ret += f'''
237-
const std::array<{type_for_bits(self.__bits_for_values[idx])}, {len(values)}> {name}::stage{idx + 1}({{
238-
{indent_insert(self.__make_simple(values), 12)}
239-
}});
225+
const std::array<{type_for_bits(self.__bits_for_values[idx])}, {len(values)}> {name}::stage{idx + 1}({{{{
226+
{indent_insert(format_array(values, ishex=idx!=0, bits=self.__bits_for_values[idx]), 12)}
227+
}}}});
240228
'''
241229
else:
242230
ret += f'''
243-
const std::array<std::array<{type_for_bits(self.__bits_for_values[idx])}, {len(values[0])}>, {len(values)}> {name}::stage{idx + 1}({{
244-
{indent_insert(self.__make_nested(values), 12)}
245-
}});
231+
const std::array<std::array<{type_for_bits(self.__bits_for_values[idx])}, {len(values[0])}>, {len(values)}> {name}::stage{idx + 1}({{{{
232+
{indent_insert(self.__make_nested(values, bits=self.__bits_for_values[idx]), 12)}
233+
}}}});
246234
'''
247235

248236

0 commit comments

Comments
 (0)