Skip to content

Commit 732204c

Browse files
committed
First pass on NFC
1 parent 41c8803 commit 732204c

File tree

5 files changed

+534
-330
lines changed

5 files changed

+534
-330
lines changed

lib/inc/sys_string/impl/unicode/algorithms.h

Lines changed: 120 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,32 @@ namespace sysstr
110110
m_data.heap.clear();
111111
m_size = 0;
112112
}
113+
114+
void reserve(size_t size)
115+
{
116+
if (size < StackLimit)
117+
return;
118+
119+
if (m_size == 0)
120+
{
121+
new (&m_data.heap) std::vector<T>();
122+
m_data.heap.reserve(size);
123+
m_first = m_data.heap.data();
124+
}
125+
else if (size <= StackLimit)
126+
{
127+
std::vector<T> heap;
128+
heap.reserve(size);
129+
heap.insert(heap.begin(), m_first, m_first + m_size);
130+
new (&m_data.heap) std::vector<T>(std::move(heap));
131+
m_first = m_data.heap.data();
132+
}
133+
else
134+
{
135+
m_data.heap.reserve(size);
136+
m_first = m_data.heap.data();
137+
}
138+
}
113139

114140
void erase_front(size_t size)
115141
{
@@ -913,18 +939,107 @@ namespace sysstr
913939
template<utf_encoding OutEnc>
914940
class nfc
915941
{
942+
private:
943+
using nfc_qc_status = util::unicode::normalizer::nfc_qc_status;
916944
public:
917945
template<std::ranges::forward_range Range, std::output_iterator<utf_char_of<OutEnc>> OutIt>
918946
requires(utf_encoding_of<std::ranges::range_value_t<Range>> == utf32)
919947
inline auto operator()(const Range & range, OutIt dest) -> OutIt
920948
{
921-
std::vector<char32_t> buf;
922-
if constexpr (std::ranges::sized_range<Range>)
923-
buf.reserve(std::ranges::size(range));
924-
nfd<utf32>()(range, std::back_inserter(buf));
925-
return convert(buf, dest);
949+
using namespace util;
950+
using namespace util::unicode;
951+
952+
auto first = std::ranges::begin(range);
953+
auto last = std::ranges::end(range);
954+
955+
if (first == last)
956+
return dest;
957+
958+
stack_or_heap_buffer<char32_t, 32> buffer;
959+
960+
auto status = get_nfc_qc_status(*first);
961+
962+
for ( ; ; )
963+
{
964+
auto conv_range = find_conversion_range(status, first, last);
965+
for ( ; first != conv_range.begin(); ++first)
966+
dest = write_unsafe<OutEnc>(*first, dest);
967+
if (conv_range.empty())
968+
{
969+
assert(first == last);
970+
break;
971+
}
972+
973+
if constexpr (std::ranges::sized_range<decltype(conv_range)>)
974+
buffer.reserve(conv_range.size());
975+
nfd<utf32>()(conv_range, std::back_inserter(buffer));
976+
dest = convert(buffer, dest);
977+
first = conv_range.end();
978+
if (first == last)
979+
break;
980+
status = nfc_qc_status::stable;
981+
buffer.clear();
982+
}
983+
984+
return dest;
926985
}
986+
927987
private:
988+
template<std::forward_iterator It, std::sentinel_for<It> EndIt>
989+
requires(std::is_same_v<std::iter_value_t<It>, char32_t>)
990+
inline auto find_conversion_range(nfc_qc_status first_status,
991+
It first, EndIt last) -> std::ranges::subrange<It>
992+
{
993+
using namespace util;
994+
using namespace util::unicode;
995+
996+
auto status = first_status;
997+
It start = first;
998+
for ( ; ; )
999+
{
1000+
if (status == nfc_qc_status::bad)
1001+
{
1002+
for (++first; first != last; ++first)
1003+
{
1004+
status = get_nfc_qc_status(*first);
1005+
if (status == nfc_qc_status::stable)
1006+
break;
1007+
}
1008+
return {start, first};
1009+
}
1010+
if (status == nfc_qc_status::stable)
1011+
{
1012+
start = first;
1013+
if (++first == last)
1014+
return {first, first};
1015+
}
1016+
else
1017+
{
1018+
if (++first == last)
1019+
return {start, first};
1020+
}
1021+
1022+
status = get_nfc_qc_status(*first);
1023+
}
1024+
return {first, first}; // == {last, last}
1025+
}
1026+
1027+
/**
 * Classifies a code point for the NFC fast path.
 *
 * Code points inside [SBase, SBase + SCount) are reported as `good`; those
 * inside the L, V or T blocks are reported as `bad`. Everything else is
 * delegated to normalizer::get_nfc_qc_status().
 *
 * NOTE(review): the S/L/V/T constants presumably are the Hangul syllable and
 * jamo block parameters of the Unicode Hangul algorithm - confirm where they
 * are declared.
 *
 * @param c the code point to classify
 * @return the status of `c`
 */
static auto get_nfc_qc_status(char32_t c) -> nfc_qc_status
{
    using namespace util;
    using namespace util::unicode;

    // True when c lies in the half-open block [base, base + count).
    const auto within = [c](auto base, auto count) {
        return c >= base && c < base + count;
    };

    if (within(SBase, SCount))
        return nfc_qc_status::good;

    const bool in_jamo_block = within(LBase, LCount) ||
                               within(VBase, VCount) ||
                               within(TBase, TCount);
    if (in_jamo_block)
        return nfc_qc_status::bad;

    return normalizer::get_nfc_qc_status(c);
}
1042+
9281043
template<std::ranges::forward_range Range, std::output_iterator<utf_char_of<OutEnc>> OutIt>
9291044
requires(utf_encoding_of<std::ranges::range_value_t<Range>> == utf32)
9301045
inline auto convert(const Range & range, OutIt dest) -> OutIt

lib/inc/sys_string/impl/unicode/mappings.h

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -222,11 +222,11 @@ namespace sysstr::util::unicode
222222
friend trie_lookup<4,lookup>;
223223
private:
224224
using entry_type = std::array<uint16_t, 16>;
225-
using value_type = uint32_t;
225+
using value_type = uint16_t;
226226

227-
static const std::array<entry_type, 885> entries;
227+
static const std::array<entry_type, 891> entries;
228228

229-
static const std::array<value_type, 501> values;
229+
static const std::array<value_type, 496> values;
230230

231231
public:
232232

@@ -253,17 +253,17 @@ namespace sysstr::util::unicode
253253
*dest = src;
254254
return ++dest;
255255
}
256-
if (res > 0x0FFF)
256+
if (res & 0x1000)
257257
{
258-
uint32_t shifted_ccc = res << 9;
258+
uint32_t shifted_ccc = uint32_t(res) << 21;
259259
uint32_t val = uint32_t(src) | shifted_ccc;
260260
*dest = val;
261261
return ++dest;
262262
}
263263

264-
size_t value_offset = ((size_t(src) - res) & 0x0FFF) - 1;
264+
size_t value_offset = ((size_t(src) - res) & 0x0FFF);
265265
uint32_t value = values[value_offset];
266-
if (value & (1 << 30))
266+
if (value & (1 << 31))
267267
{
268268
uint32_t shifted_ccc = (value & 0x0FF000) << 9;
269269
uint32_t val = uint32_t(src) | shifted_ccc;
@@ -276,8 +276,7 @@ namespace sysstr::util::unicode
276276
uint16_t decomp_start = value & 0xFFF;
277277
value >>= 12;
278278
uint16_t decomp_idx = value & 0x1F;
279-
value >>= 5;
280-
int final = value;
279+
int final = ((value & (1 << 5)) != 0);
281280

282281
auto * comps = compositions + decomp_start;
283282

@@ -308,21 +307,21 @@ namespace sysstr::util::unicode
308307
if (res == 0)
309308
return nullptr;
310309

311-
if (res > 0x0FFF)
310+
if (res & 0x1000)
312311
return nullptr;
313312

314-
size_t value_offset = ((size_t(src) - res) & 0x0FFF) - 1;
313+
size_t value_offset = ((size_t(src) - res) & 0x0FFF);
315314
uint32_t value = values[value_offset];
316315

317316
uint16_t comp_idx = value & 0xFFF;
318317
if (comp_idx == 0xFFF)
319318
return nullptr;
320319

321320
auto ret = compositions + comp_idx;
322-
bool is_last = (ret[0] >> 29);
321+
bool is_last = (ret[0] & (uint32_t(1) << 29));
323322
if (is_last)
324323
return nullptr;
325-
if (ret[0] >> 21)
324+
if (ret[0] & (uint32_t(0xFF) << 21))
326325
return nullptr;
327326

328327
++ret;
@@ -331,18 +330,22 @@ namespace sysstr::util::unicode
331330

332331
static auto get_comb_class(char32_t c) -> uint8_t
333332
{
333+
if (c < 128)
334+
return 0;
335+
334336
auto res = lookup::get(c);
335337
if (res == 0)
336338
return 0;
337339

338-
if (res > 0x0FFF)
339-
return uint8_t(res >> 12);
340+
if (res & 0x1000)
341+
return uint8_t(res);
340342

341-
size_t value_offset = ((size_t(c) - res) & 0x0FFF) - 1;
343+
size_t value_offset = ((size_t(c) - res) & 0x0FFF);
342344
uint32_t value = values[value_offset];
343345

344-
if (value & 0x2000000)
345-
return (value >> 12) & 0xFF;
346+
// possible but an extra if...
347+
//if (value & (uint32_t(1) << 30))
348+
// return (value >> 12) & 0xFF;
346349

347350
uint16_t comp_idx = value & 0xFFF;
348351
if (comp_idx == 0xFFF)
@@ -351,6 +354,43 @@ namespace sysstr::util::unicode
351354
auto * comps = compositions + comp_idx;
352355
return (comps[0] >> 21) & 0xFF;
353356
}
357+
358+
/**
 * Classification of a code point used by the NFC fast path.
 *
 * The numeric values are part of the contract: get_nfc_qc_status() produces
 * its result by adding boolean flags and converting the sum to this type, so
 * the enumerators must stay 0, 1 and 2 in this order.
 */
enum class nfc_qc_status
{
    bad = 0,    // neither flag set; starts a conversion range in the NFC scan
    good = 1,   // exactly one flag set
    stable = 2  // both flags set; resets the start of the candidate range
};
364+
365+
/**
 * Looks up the NFC fast-path status of a code point from the lookup tables.
 *
 * - Code points below 128 and code points with a zero trie entry are `stable`.
 * - A trie entry with bit 0x1000 set carries a value in its low byte (treated
 *   as the combining class, per the local naming): zero gives `good`,
 *   non-zero gives `bad`.
 * - Otherwise the entry selects a slot in `values`. Bit 30 being clear is
 *   treated as "NFC quick check = yes". When the low 12 bits equal 0xFFF the
 *   result is `stable` for "yes" and `good` otherwise. In all other cases
 *   bits 21..28 of the first `compositions` word are tested for zero, and the
 *   result is the number of the two flags that hold (0 = bad, 1 = good,
 *   2 = stable).
 *
 * @param c the code point to classify
 * @return the status of `c`
 */
static auto get_nfc_qc_status(char32_t c) -> nfc_qc_status
{
    if (c < 128)
        return nfc_qc_status::stable;

    auto trie_entry = lookup::get(c);
    if (trie_entry == 0)
        return nfc_qc_status::stable;

    if (trie_entry & 0x1000)
    {
        // Entry stores the class directly in its low byte.
        const bool ccc_is_zero = (uint8_t(trie_entry) == 0);
        return ccc_is_zero ? nfc_qc_status::good : nfc_qc_status::bad;
    }

    const size_t slot = ((size_t(c) - trie_entry) & 0x0FFF);
    const uint32_t packed = values[slot];

    const bool qc_yes = ((packed & (1 << 30)) == 0);

    const uint16_t comp_offset = packed & 0xFFF;
    if (comp_offset == 0xFFF)
    {
        // No compositions record for this code point.
        return qc_yes ? nfc_qc_status::stable : nfc_qc_status::good;
    }

    const auto * record = compositions + comp_offset;
    const bool ccc_is_zero = ((record[0] & (uint32_t(0xFF) << 21)) == 0);

    // Sum of the two flags maps onto bad (0), good (1), stable (2).
    const int flags_set = int(qc_yes) + int(ccc_is_zero);
    return static_cast<nfc_qc_status>(flags_set);
}
354394
};
355395

356396

0 commit comments

Comments
 (0)