Skip to content

Commit b6a1fc8

Browse files
authored
Merge pull request #23 from sujayakar/hangul-fix
Fix corner case in Hangul composition
2 parents ba25f07 + 6c94f92 commit b6a1fc8

File tree

2 files changed

+48
-27
lines changed

2 files changed

+48
-27
lines changed

src/normalize.rs

Lines changed: 46 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
4242
}
4343

4444
// Perform decomposition for Hangul
45-
if is_hangul(c) {
45+
if is_hangul_syllable(c) {
4646
decompose_hangul(c, emit_char);
4747
return;
4848
}
@@ -77,26 +77,34 @@ const T_COUNT: u32 = 28;
7777
const N_COUNT: u32 = (V_COUNT * T_COUNT);
7878
const S_COUNT: u32 = (L_COUNT * N_COUNT);
7979

80-
pub(crate) fn is_hangul(c: char) -> bool {
80+
const S_LAST: u32 = S_BASE + S_COUNT - 1;
81+
const L_LAST: u32 = L_BASE + L_COUNT - 1;
82+
const V_LAST: u32 = V_BASE + V_COUNT - 1;
83+
const T_LAST: u32 = T_BASE + T_COUNT - 1;
84+
85+
// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
86+
// i.e. `T_BASE + 1 ... T_LAST`.
87+
const T_FIRST: u32 = T_BASE + 1;
88+
89+
pub(crate) fn is_hangul_syllable(c: char) -> bool {
8190
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
8291
}
8392

8493
// Decompose a precomposed Hangul syllable
8594
#[allow(unsafe_code)]
8695
#[inline(always)]
8796
fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
88-
let si = s as u32 - S_BASE;
89-
90-
let li = si / N_COUNT;
97+
let s_index = s as u32 - S_BASE;
98+
let l_index = s_index / N_COUNT;
9199
unsafe {
92-
emit_char(char::from_u32_unchecked(L_BASE + li));
100+
emit_char(char::from_u32_unchecked(L_BASE + l_index));
93101

94-
let vi = (si % N_COUNT) / T_COUNT;
95-
emit_char(char::from_u32_unchecked(V_BASE + vi));
102+
let v_index = (s_index % N_COUNT) / T_COUNT;
103+
emit_char(char::from_u32_unchecked(V_BASE + v_index));
96104

97-
let ti = si % T_COUNT;
98-
if ti > 0 {
99-
emit_char(char::from_u32_unchecked(T_BASE + ti));
105+
let t_index = s_index % T_COUNT;
106+
if t_index > 0 {
107+
emit_char(char::from_u32_unchecked(T_BASE + t_index));
100108
}
101109
}
102110
}
@@ -112,20 +120,33 @@ pub(crate) fn hangul_decomposition_length(s: char) -> usize {
112120
#[allow(unsafe_code)]
113121
#[inline(always)]
114122
fn compose_hangul(a: char, b: char) -> Option<char> {
115-
let l = a as u32;
116-
let v = b as u32;
117-
// Compose an LPart and a VPart
118-
if L_BASE <= l && l < (L_BASE + L_COUNT) // l should be an L choseong jamo
119-
&& V_BASE <= v && v < (V_BASE + V_COUNT) { // v should be a V jungseong jamo
120-
let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
121-
return unsafe { Some(char::from_u32_unchecked(r)) };
123+
let (a, b) = (a as u32, b as u32);
124+
match (a, b) {
125+
// Compose a leading consonant and a vowel together into an LV_Syllable
126+
(L_BASE ... L_LAST, V_BASE ... V_LAST) => {
127+
let l_index = a - L_BASE;
128+
let v_index = b - V_BASE;
129+
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
130+
let s = S_BASE + lv_index;
131+
Some(unsafe {char::from_u32_unchecked(s)})
132+
},
133+
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
134+
(S_BASE ... S_LAST, T_FIRST ... T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
135+
Some(unsafe {char::from_u32_unchecked(a + (b - T_BASE))})
136+
},
137+
_ => None,
122138
}
123-
// Compose an LVPart and a TPart
124-
if S_BASE <= l && l <= (S_BASE+S_COUNT-T_COUNT) // l should be a syllable block
125-
&& T_BASE <= v && v < (T_BASE+T_COUNT) // v should be a T jongseong jamo
126-
&& (l - S_BASE) % T_COUNT == 0 { // l should be an LV syllable block (not LVT)
127-
let r = l + (v - T_BASE);
128-
return unsafe { Some(char::from_u32_unchecked(r)) };
139+
}
140+
141+
#[cfg(test)]
142+
mod tests {
143+
use super::compose_hangul;
144+
145+
// Regression test from a bugfix where we were composing an LV_Syllable with
146+
// T_BASE directly. (We should only compose an LV_Syllable with a character
147+
// in the range `T_BASE + 1 ... T_LAST`.)
148+
#[test]
149+
fn test_hangul_composition() {
150+
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
129151
}
130-
None
131152
}

src/stream_safe.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
use normalize::{
22
hangul_decomposition_length,
3-
is_hangul,
3+
is_hangul_syllable,
44
};
55
use tables;
66

@@ -71,7 +71,7 @@ fn classify_nonstarters(c: char) -> Decomposition {
7171
}
7272
}
7373
// Next, special case Hangul, since it's not handled by our tables.
74-
if is_hangul(c) {
74+
if is_hangul_syllable(c) {
7575
return Decomposition {
7676
leading_nonstarters: 0,
7777
trailing_nonstarters: 0,

0 commit comments

Comments
 (0)