Skip to content

Commit 5da0090

Browse files
Give U+115F HANGUL CHOSEONG FILLER width 2
1 parent a6b5a52 commit 5da0090

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

scripts/unicode.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,8 @@ def load_zero_widths() -> "list[bool]":
210210
for cp in range(low, high + 1):
211211
zw_map[cp] = False
212212

213-
# `Default_Ignorable_Code_Point`s also have 0 width
213+
# `Default_Ignorable_Code_Point`s also have 0 width:
214+
# https://www.unicode.org/faq/unsup_char.html#3
214215
with fetch_open("DerivedCoreProperties.txt") as properties:
215216
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
216217
multiple = re.compile(
@@ -238,6 +239,8 @@ def load_zero_widths() -> "list[bool]":
238239
# into a single wide grapheme. So we treat vowel and trailing jamo as
239240
# 0-width, such that only the width of the leading jamo is counted
240241
# and the resulting grapheme has width 2.
242+
#
243+
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
241244
with fetch_open("HangulSyllableType.txt") as categories:
242245
single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
243246
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
@@ -255,6 +258,12 @@ def load_zero_widths() -> "list[bool]":
255258
for cp in range(low, high + 1):
256259
zw_map[cp] = True
257260

261+
# Special case: U+115F HANGUL CHOSEONG FILLER.
262+
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
263+
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
264+
# (which are considered 0-width on their own) to form a composed Hangul syllable with
265+
# width 2. Therefore, we treat it as having width 2.
266+
zw_map[0x115F] = False
258267
return zw_map
259268

260269

@@ -541,8 +550,8 @@ def main(module_filename: str):
541550
542551
We obey the following rules in decreasing order of importance:
543552
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
544-
- Hangul Jamo medial vowels & final consonants are zero-width.
545-
- All `Default_Ignorable_Code_Point`s are zero-width.
553+
- Hangul jamo medial vowels & final consonants are zero-width.
554+
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
546555
- All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
547556
except for `Prepended_Concatenation_Mark`s.
548557
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.

src/tables.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ pub mod charwidth {
340340
0x55, 0x55, 0x55, 0x50, 0x05, 0x54, 0x55, 0x55, 0x55, 0x01, 0x54, 0x55, 0x55, 0x45, 0x41,
341341
0x55, 0x51, 0x55, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA,
342342
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
343-
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x2A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
343+
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
344344
0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
345345
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05,
346346
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,

src/tests.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ fn test_default_ignorable() {
201201

202202
assert_eq!(UnicodeWidthChar::width('\u{E0000}'), Some(0));
203203

204-
assert_eq!(UnicodeWidthChar::width('\u{115F}'), Some(0));
205204
assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0));
206205
assert_eq!(UnicodeWidthChar::width('\u{3164}'), Some(0));
207206
assert_eq!(UnicodeWidthChar::width('\u{FFA0}'), Some(0));
@@ -215,6 +214,8 @@ fn test_jamo() {
215214

216215
assert_eq!(UnicodeWidthChar::width('\u{1100}'), Some(2));
217216
assert_eq!(UnicodeWidthChar::width('\u{A97C}'), Some(2));
217+
// Special case: U+115F HANGUL CHOSEONG FILLER
218+
assert_eq!(UnicodeWidthChar::width('\u{115F}'), Some(2));
218219
assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0));
219220
assert_eq!(UnicodeWidthChar::width('\u{D7C6}'), Some(0));
220221
assert_eq!(UnicodeWidthChar::width('\u{11A8}'), Some(0));

0 commit comments

Comments
 (0)