Skip to content

Commit 1c56d10

Browse files
authored
[libc] Added isValidState to CharacterConverter class to ensure a provided mbstate is valid (llvm#145564)
Added isValidState to CharacterConverter class to ensure a provided mbstate is valid
1 parent 20f56d1 commit 1c56d10

File tree

3 files changed

+58
-3
lines changed

3 files changed

+58
-3
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
2525
// Mask for the information bits only, not metadata (i.e. the bits that remain
// once the byte header is excluded)
2626
constexpr uint32_t MASK_ENCODED_BITS =
2727
mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
28+
// Upper bound of the UTF-32 value for a UTF-8 sequence of each length; the
// entry at index (n - 1) is the limit for an n-byte sequence.
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {
    0x7f,     // 1 byte
    0x7ff,    // 2 bytes
    0xffff,   // 3 bytes
    0x10ffff, // 4 bytes
};
// Longest UTF-8 sequence length (in bytes) covered by the table above.
constexpr int MAX_UTF8_LENGTH = 4;
2831

2932
// Binds the converter to the caller-provided mbstate. Only the pointer is
// stored; the state object itself is not copied.
CharacterConverter::CharacterConverter(mbstate *mbstate) : state(mbstate) {}
3033

@@ -40,6 +43,17 @@ bool CharacterConverter::isFull() {
4043

4144
// Returns true when the wrapped state records zero stored bytes.
bool CharacterConverter::isEmpty() {
  const bool has_stored_bytes = state->bytes_stored != 0;
  return !has_stored_bytes;
}
4245

46+
bool CharacterConverter::isValidState() {
47+
if (state->total_bytes > MAX_UTF8_LENGTH)
48+
return false;
49+
50+
const char32_t max_utf32_value =
51+
state->total_bytes == 0 ? 0
52+
: MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
53+
return state->bytes_stored <= state->total_bytes &&
54+
state->partial <= max_utf32_value;
55+
}
56+
4357
int CharacterConverter::push(char8_t utf8_byte) {
4458
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
4559
// Checking the first byte if first push
@@ -90,9 +104,7 @@ int CharacterConverter::push(char32_t utf32) {
90104
state->partial = utf32;
91105

92106
// determine number of utf-8 bytes needed to represent this utf32 value
93-
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
94-
constexpr int NUM_RANGES = 4;
95-
for (uint8_t i = 0; i < NUM_RANGES; i++) {
107+
for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) {
96108
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
97109
state->total_bytes = i + 1;
98110
state->bytes_stored = i + 1;

libc/src/__support/wchar/character_converter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class CharacterConverter {
2828
void clear();
2929
bool isFull();
3030
bool isEmpty();
31+
bool isValidState();
3132

3233
int push(char8_t utf8_byte);
3334
int push(char32_t utf32);

libc/test/src/__support/wchar/utf32_to_8_test.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,45 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
186186
int err = cr.push(utf32);
187187
ASSERT_EQ(err, -1);
188188
}
189+
190+
// Exercises CharacterConverter::isValidState() on hand-built mbstate values.
// The brace initializers below are read as {partial, bytes_stored,
// total_bytes}; that order is inferred from the expectations asserted here,
// so confirm it against the mbstate definition.
TEST(LlvmLibcCharacterConverterUTF32To8Test, InvalidState) {
  // A default-constructed state is expected to be valid.
  // NOTE(review): this relies on mbstate's default initialization giving
  // well-defined field values; confirm against the mbstate definition.
  LIBC_NAMESPACE::internal::mbstate s1;
  LIBC_NAMESPACE::internal::CharacterConverter c1(&s1);
  ASSERT_TRUE(c1.isValidState());

  // More bytes stored than the sequence is declared to contain.
  LIBC_NAMESPACE::internal::mbstate s2{0, 2, 0};
  LIBC_NAMESPACE::internal::CharacterConverter c2(&s2);
  ASSERT_FALSE(c2.isValidState());

  // 1-byte sequence: 0x7f is the largest accepted value, 0x80 is rejected.
  LIBC_NAMESPACE::internal::mbstate s3{0x7f, 1, 1};
  LIBC_NAMESPACE::internal::CharacterConverter c3(&s3);
  ASSERT_TRUE(c3.isValidState());
  LIBC_NAMESPACE::internal::mbstate s4{0x80, 1, 1};
  LIBC_NAMESPACE::internal::CharacterConverter c4(&s4);
  ASSERT_FALSE(c4.isValidState());

  // 2-byte sequence: 0x7ff is the largest accepted value, 0x800 is rejected.
  LIBC_NAMESPACE::internal::mbstate s5{0x7ff, 1, 2};
  LIBC_NAMESPACE::internal::CharacterConverter c5(&s5);
  ASSERT_TRUE(c5.isValidState());
  LIBC_NAMESPACE::internal::mbstate s6{0x800, 1, 2};
  LIBC_NAMESPACE::internal::CharacterConverter c6(&s6);
  ASSERT_FALSE(c6.isValidState());

  // 3-byte sequence: 0xffff is the largest accepted value, 0x10000 is
  // rejected.
  LIBC_NAMESPACE::internal::mbstate s7{0xffff, 1, 3};
  LIBC_NAMESPACE::internal::CharacterConverter c7(&s7);
  ASSERT_TRUE(c7.isValidState());
  LIBC_NAMESPACE::internal::mbstate s8{0x10000, 1, 3};
  LIBC_NAMESPACE::internal::CharacterConverter c8(&s8);
  ASSERT_FALSE(c8.isValidState());

  // 4-byte sequence: 0x10ffff is the largest accepted value, 0x110000 is
  // rejected. The rejected case uses total_bytes == 4 so that it is the
  // 4-byte limit being checked, not a smaller length's limit.
  LIBC_NAMESPACE::internal::mbstate s9{0x10ffff, 1, 4};
  LIBC_NAMESPACE::internal::CharacterConverter c9(&s9);
  ASSERT_TRUE(c9.isValidState());
  LIBC_NAMESPACE::internal::mbstate s10{0x110000, 1, 4};
  LIBC_NAMESPACE::internal::CharacterConverter c10(&s10);
  ASSERT_FALSE(c10.isValidState());

  // total_bytes larger than the maximum UTF-8 sequence length.
  LIBC_NAMESPACE::internal::mbstate s11{0, 0, 5};
  LIBC_NAMESPACE::internal::CharacterConverter c11(&s11);
  ASSERT_FALSE(c11.isValidState());
}

0 commit comments

Comments
 (0)