Skip to content

Commit e176253

Browse files
committed
Backporting Unicode data improvements and normalization
1 parent 8a5696f commit e176253

26 files changed

+89014
-1485
lines changed

CMakeLists.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,43 @@ string(STRIP ${SYSSTR_VERSION} SYSSTR_VERSION)
1616

1717
project(sys_string VERSION ${SYSSTR_VERSION} LANGUAGES CXX)
1818

19+
find_package (Python3 COMPONENTS Interpreter Development)
20+
1921

2022
add_subdirectory(lib)
2123

24+
# Regenerate the Unicode mapping sources/headers, but only when find_package(Python3)
# located an interpreter; otherwise the checked-in generated files are used as-is.
if(Python3_Interpreter_FOUND)

    # Inputs whose modification should trigger regeneration.
    file(GLOB UNICODE_DATA ${CMAKE_CURRENT_LIST_DIR}/unicode/data/*.txt)
    file(GLOB UNICODE_SCRIPTS ${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/*.py)

    # Files written by the generator script (passed to it in this order).
    set(UNICODE_GENERATED_FILES
        ${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp
        ${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h
        ${CMAKE_CURRENT_LIST_DIR}/test/test_normalization_data.h
    )

    # NOTE(review): DEPENDS tracks unicode/data/*.txt and unicode/scripts/*.py, while the
    # command below runs lib/scripts/genmappings.py against lib/res. Confirm which
    # location is current; if the script and data moved to unicode/, the command
    # paths need to follow or regeneration will use stale inputs.
    add_custom_command(
        COMMENT "Generating Unicode mappings"
        OUTPUT ${UNICODE_GENERATED_FILES}
        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/lib/scripts/genmappings.py
                ${CMAKE_CURRENT_LIST_DIR}/lib/res
                ${UNICODE_GENERATED_FILES}
        DEPENDS
            ${UNICODE_DATA}
            ${UNICODE_SCRIPTS}
    )

    # A named target is required so the library can depend on the generated outputs.
    add_custom_target(generate_unicode_mappings
        DEPENDS
            ${UNICODE_GENERATED_FILES}
    )

    add_dependencies(sys_string generate_unicode_mappings)

endif()
54+
55+
2256
if (PROJECT_IS_TOP_LEVEL)
2357

2458
include(lib/cmake/install.cmake)

lib/CMakeLists.txt

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ if (NOT SYSSTR_VERSION)
1717
endif()
1818
project(sys_string VERSION ${SYSSTR_VERSION})
1919

20-
find_package (Python3 COMPONENTS Interpreter)
21-
2220
set(SRCDIR ${CMAKE_CURRENT_LIST_DIR})
2321
set(LIBNAME sys_string)
2422

@@ -94,6 +92,7 @@ set(UNICODE_FILES
9492
${SRCDIR}/inc/sys_string/impl/unicode/utf_util.h
9593
${SRCDIR}/inc/sys_string/impl/unicode/algorithms.h
9694
${SRCDIR}/inc/sys_string/impl/unicode/mappings.h
95+
${SRCDIR}/inc/sys_string/impl/unicode/mappings_common.h
9796
)
9897
set(SOURCES
9998
${SRCDIR}/src/unicode_mappings.cpp
@@ -129,22 +128,6 @@ PRIVATE
129128
add_library(${LIBNAME}::${LIBNAME} ALIAS ${LIBNAME})
130129

131130

132-
if(${Python3_Interpreter_FOUND})
133-
134-
file(GLOB UNICODE_DATA ${SRCDIR}/res/*.txt)
135-
file(GLOB UNICODE_SCRIPTS ${SRCDIR}/scripts/*.py)
136-
137-
add_custom_command(
138-
COMMENT "Generating Unicoode mappings"
139-
OUTPUT ${SRCDIR}/src/unicode_mappings.cpp
140-
COMMAND ${Python3_EXECUTABLE} ${SRCDIR}/scripts/genmappings.py ${SRCDIR}/res ${SRCDIR}/src/unicode_mappings.cpp
141-
DEPENDS
142-
${UNICODE_DATA}
143-
${UNICODE_SCRIPTS}
144-
)
145-
146-
endif()
147-
148131
if (PROJECT_IS_TOP_LEVEL)
149132

150133
include(cmake/install.cmake)

lib/inc/sys_string/impl/misc.h

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,43 @@ namespace sysstr::util
7878

7979
return std::equal(test.begin(), test.end(), access.begin());
8080
}
81+
82+
template<class Storage>
83+
struct transform_sink
84+
{
85+
const typename sys_string_t<Storage>::utf32_view & access;
86+
decltype(std::declval<sys_string_builder_t<Storage>>().chars()) chars;
87+
88+
89+
SYS_STRING_FORCE_INLINE
90+
void copy(typename sys_string_t<Storage>::utf32_view::iterator it)
91+
{
92+
auto cur = it.storage_cursor();
93+
auto size = it.storage_size();
94+
95+
for (decltype(size) i = 0; i < size; ++i, ++cur)
96+
this->chars.push_back(*cur);
97+
}
98+
99+
SYS_STRING_FORCE_INLINE
100+
void copy(typename sys_string_t<Storage>::utf32_view::iterator first, typename sys_string_t<Storage>::utf32_view::iterator last)
101+
{
102+
if (chars.empty())
103+
{
104+
if (first == access.begin() && last == access.end())
105+
return;
106+
}
107+
auto fcur = first.storage_cursor();
108+
auto lcur = last.storage_cursor();
109+
110+
for ( ; fcur != lcur; ++fcur)
111+
this->chars.push_back(*fcur);
112+
}
113+
SYS_STRING_FORCE_INLINE
114+
void write(char32_t c)
115+
{ write_unsafe<utf_encoding_of<typename sys_string_t<Storage>::storage_type>>(c, std::back_inserter(chars)); }
116+
117+
};
81118
}
82119

83120
namespace sysstr
@@ -140,6 +177,28 @@ auto sysstr::sys_string_t<Storage>::to_upper() const -> sys_string_t<Storage>
140177
return builder.build();
141178
}
142179

180+
template<class Storage>
181+
inline
182+
auto sysstr::sys_string_t<Storage>::normalize(normalization norm) const -> sys_string_t
183+
{
184+
185+
sys_string_builder_t<Storage> builder;
186+
sys_string_t<Storage>::utf32_view access(*this);
187+
switch (norm)
188+
{
189+
break; case normalization::nfd:
190+
normalize::nfd<utf_encoding_of<storage_type>>()(access.begin(), access.end(), std::back_inserter(builder.chars()));
191+
break; case normalization::nfc:
192+
{
193+
util::transform_sink<Storage> sink{access, builder.chars()};
194+
normalize::nfc<utf_encoding_of<storage_type>>().call_with_sink(access.begin(), access.end(), sink);
195+
if (sink.chars.empty())
196+
return *this;
197+
}
198+
break; default: return *this;
199+
}
200+
return builder.build();
201+
}
143202

144203
template<class Storage>
145204
template<class Pred>

0 commit comments

Comments
 (0)