Skip to content

Commit e1edb6f

Browse files
committed
Merge branch 'dev'
2 parents 1dcab97 + cf87a43 commit e1edb6f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+10887
-882
lines changed

.github/workflows/test.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ on:
1414
- 'tools/**'
1515

1616
env:
17-
BUILD_TYPE: Release
17+
BUILD_TYPE: MinSizeRel
1818
NDK_VER: 27.2.12479018
1919
NDK_ARCH: x86_64
2020
NDK_API: 29
@@ -35,7 +35,7 @@ jobs:
3535
- {os: ubuntu-latest, compiler: gcc, version: 13 }
3636
- {os: ubuntu-24.04, compiler: gcc, version: 14 }
3737

38-
- {os: ubuntu-latest, compiler: clang, version: 16 }
38+
- {os: ubuntu-22.04, compiler: clang, version: 16 }
3939
- {os: ubuntu-latest, compiler: clang, version: 17 }
4040
- {os: ubuntu-latest, compiler: clang, version: 18 }
4141

@@ -51,9 +51,10 @@ jobs:
5151
wget https://apt.llvm.org/llvm.sh
5252
chmod u+x llvm.sh
5353
sudo ./llvm.sh ${{ matrix.version }}
54-
sudo apt-get install -y clang-tools-${{ matrix.version }}
54+
sudo apt-get install -y clang-tools-${{ matrix.version }} libc++-${{ matrix.version }}-dev libc++abi-${{ matrix.version }}-dev
5555
echo "CC=clang-${{ matrix.version }}" >> $GITHUB_ENV
5656
echo "CXX=clang++-${{ matrix.version }}" >> $GITHUB_ENV
57+
echo "CXXFLAGS=-stdlib=libc++" >> $GITHUB_ENV
5758
fi
5859
5960
if [[ '${{ matrix.compiler }}' == 'gcc' ]]; then

CHANGELOG.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55

66
## Unreleased
77

8+
### Added
9+
10+
- `grapheme_view` and `graphemes` adapter, which provide the ability to iterate over grapheme clusters in `sys_string` and any UTF range.
11+
12+
### Changed
13+
14+
- Unicode data has been optimized for better size/speed balance
15+
- `sys_string_t::hash_type` has been changed from `unsigned` to `size_t` on some platforms.
16+
17+
### Fixed
18+
19+
- Invalid character access in unicode mappings.
20+
- Crash when `sys_string_builder` is re-used after `build()` on Apple and Python platforms.
21+
- `utf_ref_view` and `utf_owning_view` now actually work with forward and input underlying ranges
22+
- MSVC warnings when using `std::hash<sys_string>`
23+
824
## [3.0] - 2024-12-02
925

1026
This is a major release with some breaking changes
@@ -57,6 +73,12 @@ This is a major release with some breaking changes
5773
behavior applies to `wchar_t` on platform where it is UTF-16 or UTF-32.
5874
- `operator<<` no longer pollutes global namespace
5975

76+
## [2.15] - 2025-01-07
77+
78+
### Fixed
79+
80+
- Invalid character access in unicode mappings.
81+
6082
## [2.14] - 2024-05-02
6183

6284
### Fixed
@@ -207,4 +229,5 @@ This is a major release with some breaking changes
207229
[2.12]: https://github.com/gershnik/sys_string/releases/v2.12
208230
[2.13]: https://github.com/gershnik/sys_string/releases/v2.13
209231
[2.14]: https://github.com/gershnik/sys_string/releases/v2.14
232+
[2.15]: https://github.com/gershnik/sys_string/releases/v2.15
210233
[3.0]: https://github.com/gershnik/sys_string/releases/v3.0

CMakeLists.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,41 @@ string(STRIP ${SYSSTR_VERSION} SYSSTR_VERSION)
1616

1717
project(sys_string VERSION ${SYSSTR_VERSION} LANGUAGES CXX)
1818

19+
find_package (Python3 COMPONENTS Interpreter Development)
1920

2021
add_subdirectory(lib)
2122

23+
if(${Python3_Interpreter_FOUND})
24+
25+
file(GLOB UNICODE_DATA ${CMAKE_CURRENT_LIST_DIR}/unicode/data/*.txt)
26+
file(GLOB UNICODE_SCRIPTS ${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/*.py)
27+
28+
set(UNICODE_GENERATED_FILES
29+
${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp
30+
${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h
31+
${CMAKE_CURRENT_LIST_DIR}/test/test_grapheme_data.h
32+
)
33+
34+
add_custom_command(
35+
COMMENT "Generating Unicoode mappings"
36+
OUTPUT ${UNICODE_GENERATED_FILES}
37+
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/genmappings.py
38+
${CMAKE_CURRENT_LIST_DIR}/unicode/data
39+
${UNICODE_GENERATED_FILES}
40+
DEPENDS
41+
${UNICODE_DATA}
42+
${UNICODE_SCRIPTS}
43+
)
44+
45+
add_custom_target(generate_unicode_mappings
46+
DEPENDS
47+
${UNICODE_GENERATED_FILES}
48+
)
49+
50+
add_dependencies(sys_string generate_unicode_mappings)
51+
52+
endif()
53+
2254
if (PROJECT_IS_TOP_LEVEL)
2355

2456
include(lib/cmake/install.cmake)

README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
[![License](https://img.shields.io/badge/license-BSD-brightgreen.svg)](https://opensource.org/licenses/BSD-3-Clause)
66
[![Tests](https://github.com/gershnik/sys_string/actions/workflows/test.yml/badge.svg)](https://github.com/gershnik/sys_string/actions/workflows/test.yml)
77

8-
This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string type**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**.
9-
The library exposes bidirectional UTF-8/UTF-16/UTF-32 views of `sys_string_t` as well as of any C++ input ranges of chracters.
10-
of characters.
8+
This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string types**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**.
9+
The library exposes bidirectional UTF-8/UTF-16/UTF-32 and grapheme cluster views of `sys_string_t` as well as of other C++ ranges of characters.
1110

1211
## What does it mean?
1312

@@ -38,11 +37,16 @@ of characters.
3837

3938
* **Concatenation does not allocate temporaries.** You can safely do things like `result = s1 + s2 + s3`. It will result in **one** memory allocation and 3 calls to `memcpy` to copy each of `s1`, `s2` and `s3` content into the final result. Not 2 allocations and 5 copies like in other languages or with `std::string`.
4039

41-
* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any random access containers (C array, `std::array`, `std::vector`, `std::string`) of characters. Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.
40+
* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find the last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any C++ input range (C array, `std::array`, `std::vector`, `std::string` or even `std::ranges::istream_view`) of characters (`char`, `char8_t`, `char16_t`, `char32_t` and `wchar_t` on platforms where it is Unicode). Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.
41+
42+
* **Bidirectional grapheme cluster views**. Similarly you can also further view any of the UTF-8/UTF-16/UTF-32 views of `sys_string_t` as a sequence of
43+
[grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) and iterate over them forward or __backward__ equally efficiently.
44+
Consider the task of erasing the last user perceived "character" from a string. To do so correctly you need to erase the last _grapheme cluster_. Doing it and doing it fast is very non-trivial. This functionality also works on any C++ range of characters (but requires a _forward_ range).
4245

4346
## Why bother? Doesn't `std::string` work well?
4447

4548
An `std::string` storing UTF-8 (or `std::wstring` storing UTF-16 on Windows) works very well for some scenarios but fails miserably for others. `sys_string` class is an attempt to create something that works well in situations `std::string` would be a bad choice.
49+
4650
Specifically, `std::basic_string` is an STL container of a character type that owns its memory and controls it via a user-supplied allocator. These design choices make it very fast for direct character access but create the following problems:
4751

4852
* They rule out (efficient) reference-counted implementations, which means that when you copy an `std::string` instance it must copy its content. Some of the penalty of that is alleviated by modern [small string optimization](https://akrzemi1.wordpress.com/2014/04/14/common-optimizations/) but this is, at best, a band-aid. There are workarounds, of course, such as using `std::shared_ptr<std::string>` "when it matters" but they result in even more complexity for something that is quite fundamental to any data processing.

doc/Usage.md

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
- [Iterating over string content](#iterating-over-string-content)
1515
- [Storage iteration](#storage-iteration)
1616
- [UTF iteration](#utf-iteration)
17-
- [Helper: UTF iteration over any C++ character range](#helper-utf-iteration-over-any-c-character-range)
17+
- [UTF iteration over any C++ character range](#utf-iteration-over-any-c-character-range)
18+
- [Grapheme iteration](#grapheme-iteration)
1819
- [Substrings](#substrings)
1920
- [Accessing C strings](#accessing-c-strings)
2021
- [Accessing storage as C array](#accessing-storage-as-c-array)
@@ -307,7 +308,7 @@ utf32_access::iterator first = access.reverse(access.rend());
307308
308309
```
309310

310-
### Helper: UTF iteration over any C++ character range
311+
### UTF iteration over any C++ character range
311312

312313
Since the internal facility to perform UTF iteration is quite generic, this library exposes it to allow you to perform UTF iteration over any C++ input range of compatible characters (`char`, `char8_t`, `char16_t`, `char32_t`, and possibly `wchar_t` on platforms where it is encoded in UTF-16 or UTF-32). At the time of this writing there is work underway to add something similar to the C++ standard library but, even if it is eventually approved, it will be a long time before it becomes available.
313314

@@ -335,6 +336,50 @@ If your standard library supports user-supplied [range adapter closures](https:/
335336
as_utf8(u"😀😜") | std::views::take(1) | ...
336337
```
337338
339+
### Grapheme iteration
340+
341+
Sometimes even UTF-32 iteration is not what you need. Many user-perceived "characters" are actually composed of multiple
342+
UTF-32 codepoints. The Unicode standard defines a [grapheme cluster](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) as
343+
what corresponds to a user's notion of a character. A single grapheme cluster, or grapheme for short, can contain one or more Unicode
344+
codepoints.
345+
346+
This library allows you to easily iterate over grapheme clusters in `sys_string_t` content as well as in any C++
347+
[forward_range](https://en.cppreference.com/w/cpp/ranges/forward_range) of compatible character type.
348+
349+
To iterate over graphemes you need to construct an instance of `grapheme_view` directly or use `graphemes` view adapter. In either case
350+
you need to supply a **view** of characters to iterate over. The view can be a reference to `sys_string_t::char_access`, `sys_string_t::utfX_access`
351+
or any other compatible forward view.
352+
353+
The "values" returned from `grapheme_view` are `std::ranges::subrange`s of the underlying view, each one spanning a single grapheme.
354+
355+
To put it all in context here is how you can iterate over all graphemes in a `sys_string`.
356+
357+
```cpp
358+
sys_string str = S("क्त्य"); //6 Unicode codepoints but one grapheme!
359+
sys_string::char_access access(str);
360+
for (auto grapheme_range: graphemes(access)) {
361+
//grapheme_range is a subrange of sys_string::char_access::iterator
362+
sys_string grapheme(grapheme_range);
363+
}
364+
```
365+
366+
A `grapheme_view` is reversible, that is it can be iterated in both directions. Here is how to accomplish a common task -
367+
safely remove the last "character" from a string (see [Substrings](#substrings) below for details on how to obtain parts of a string):
368+
369+
```cpp
370+
sys_string str = S("abक्त्य");
371+
sys_string::char_access access(str);
372+
auto gr_view = graphemes(access);
373+
if (auto rit = gr_view.rbegin(); rit != gr_view.rend()) {
374+
auto grapheme = *rit;
375+
str = sys_string(access.begin(), grapheme.begin());
376+
}
377+
assert(str == S("ab"));
378+
```
379+
380+
You can easily extend this to removing any number of trailing characters.
381+
382+
338383
## Substrings
339384
340385
You can obtain a substring of a `sys_string` in two ways:

lib/CMakeLists.txt

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ if (NOT SYSSTR_VERSION)
1717
endif()
1818
project(sys_string VERSION ${SYSSTR_VERSION})
1919

20-
find_package (Python3 COMPONENTS Interpreter)
21-
2220
set(SRCDIR ${CMAKE_CURRENT_LIST_DIR})
2321
set(LIBNAME sys_string)
2422

@@ -64,6 +62,7 @@ PUBLIC
6462
set(MAIN_CODE
6563
${SRCDIR}/inc/sys_string/config.h
6664
${SRCDIR}/inc/sys_string/utf_view.h
65+
${SRCDIR}/inc/sys_string/grapheme_view.h
6766
${SRCDIR}/inc/sys_string/sys_string.h
6867
${SRCDIR}/inc/sys_string/impl/compare.h
6968
${SRCDIR}/inc/sys_string/impl/hash.h
@@ -129,23 +128,6 @@ PRIVATE
129128

130129
add_library(${LIBNAME}::${LIBNAME} ALIAS ${LIBNAME})
131130

132-
133-
if(${Python3_Interpreter_FOUND})
134-
135-
file(GLOB UNICODE_DATA ${SRCDIR}/res/*.txt)
136-
file(GLOB UNICODE_SCRIPTS ${SRCDIR}/scripts/*.py)
137-
138-
add_custom_command(
139-
COMMENT "Generating Unicoode mappings"
140-
OUTPUT ${SRCDIR}/src/unicode_mappings.cpp
141-
COMMAND ${Python3_EXECUTABLE} ${SRCDIR}/scripts/genmappings.py ${SRCDIR}/res ${SRCDIR}/src/unicode_mappings.cpp
142-
DEPENDS
143-
${UNICODE_DATA}
144-
${UNICODE_SCRIPTS}
145-
)
146-
147-
endif()
148-
149131
if (PROJECT_IS_TOP_LEVEL)
150132

151133
include(cmake/install.cmake)

lib/inc/sys_string/config.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@
8484
#error Please define how to force inline for your compiler
8585
#endif
8686

87+
#if defined(_MSC_VER)
88+
#define SYS_STRING_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
89+
#else
90+
#define SYS_STRING_NO_UNIQUE_ADDRESS [[no_unique_address]]
91+
#endif
92+
8793
//GCC up to 11.3 has a weird constexpr bug in some places
8894
#if __GNUC__ > 11 || (__GNUC__ == 11 && __GNUC_MINOR__ > 2)
8995
#define BUGGY_CONSTEXPR constexpr
@@ -105,7 +111,7 @@
105111

106112
//See https://github.com/llvm/llvm-project/issues/77773 for the sad story of how feature test
107113
//macros are useless with libc++
108-
#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && __has_include(<format>))
114+
#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 170000 && __has_include(<format>))
109115

110116
#define SYS_STRING_SUPPORTS_STD_FORMAT 1
111117

0 commit comments

Comments
 (0)