Skip to content

Commit bd3770c

Browse files
committed
Fix corner case in Hangul composition
1 parent ba25f07 commit bd3770c

File tree

2 files changed

+44
-27
lines changed

2 files changed

+44
-27
lines changed

src/normalize.rs

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
4242
}
4343

4444
// Perform decomposition for precomposed Hangul syllables
45-
if is_hangul(c) {
45+
if is_hangul_syllable(c) {
4646
decompose_hangul(c, emit_char);
4747
return;
4848
}
@@ -77,26 +77,30 @@ const T_COUNT: u32 = 28;
7777
const N_COUNT: u32 = (V_COUNT * T_COUNT);
7878
const S_COUNT: u32 = (L_COUNT * N_COUNT);
7979

80-
pub(crate) fn is_hangul(c: char) -> bool {
80+
const S_END: u32 = S_BASE + S_COUNT - 1;
81+
const L_END: u32 = L_BASE + L_COUNT - 1;
82+
const V_END: u32 = V_BASE + V_COUNT - 1;
83+
const T_END: u32 = T_BASE + T_COUNT - 1;
84+
85+
pub(crate) fn is_hangul_syllable(c: char) -> bool {
8186
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
8287
}
8388

8489
// Decompose a precomposed Hangul syllable
8590
#[allow(unsafe_code)]
8691
#[inline(always)]
8792
fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
88-
let si = s as u32 - S_BASE;
89-
90-
let li = si / N_COUNT;
93+
let s_index = s as u32 - S_BASE;
94+
let l_index = s_index / N_COUNT;
9195
unsafe {
92-
emit_char(char::from_u32_unchecked(L_BASE + li));
96+
emit_char(char::from_u32_unchecked(L_BASE + l_index));
9397

94-
let vi = (si % N_COUNT) / T_COUNT;
95-
emit_char(char::from_u32_unchecked(V_BASE + vi));
98+
let v_index = (s_index % N_COUNT) / T_COUNT;
99+
emit_char(char::from_u32_unchecked(V_BASE + v_index));
96100

97-
let ti = si % T_COUNT;
98-
if ti > 0 {
99-
emit_char(char::from_u32_unchecked(T_BASE + ti));
101+
let t_index = s_index % T_COUNT;
102+
if t_index > 0 {
103+
emit_char(char::from_u32_unchecked(T_BASE + t_index));
100104
}
101105
}
102106
}
@@ -112,20 +116,33 @@ pub(crate) fn hangul_decomposition_length(s: char) -> usize {
112116
#[allow(unsafe_code)]
113117
#[inline(always)]
114118
fn compose_hangul(a: char, b: char) -> Option<char> {
115-
let l = a as u32;
116-
let v = b as u32;
117-
// Compose an LPart and a VPart
118-
if L_BASE <= l && l < (L_BASE + L_COUNT) // l should be an L choseong jamo
119-
&& V_BASE <= v && v < (V_BASE + V_COUNT) { // v should be a V jungseong jamo
120-
let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
121-
return unsafe { Some(char::from_u32_unchecked(r)) };
119+
let (a, b) = (a as u32, b as u32);
120+
match (a, b) {
121+
// Compose a leading consonant and a vowel together into an LV_Syllable
122+
(L_BASE ... L_END, V_BASE ... V_END) => {
123+
let l_index = a - L_BASE;
124+
let v_index = b - V_BASE;
125+
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
126+
let s = S_BASE + lv_index;
127+
Some(unsafe {char::from_u32_unchecked(s)})
128+
},
129+
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
130+
(S_BASE ... S_END, T_BASE ... T_END) if (a - S_BASE) % T_COUNT == 0 && (b - T_BASE) > 0 => {
131+
Some(unsafe {char::from_u32_unchecked(a + (b - T_BASE))})
132+
},
133+
_ => None,
122134
}
123-
// Compose an LVPart and a TPart
124-
if S_BASE <= l && l <= (S_BASE+S_COUNT-T_COUNT) // l should be a syllable block
125-
&& T_BASE <= v && v < (T_BASE+T_COUNT) // v should be a T jongseong jamo
126-
&& (l - S_BASE) % T_COUNT == 0 { // l should be an LV syllable block (not LVT)
127-
let r = l + (v - T_BASE);
128-
return unsafe { Some(char::from_u32_unchecked(r)) };
135+
}
136+
137+
#[cfg(test)]
138+
mod tests {
139+
use super::compose_hangul;
140+
141+
// Regression test for a bug where an LV_Syllable was composed with T_BASE
142+
// itself. T_BASE stands for "no trailing consonant" (trailing index 0), so an
143+
// LV_Syllable may only compose with a character in `T_BASE + 1 ... T_END`.
144+
#[test]
145+
fn test_hangul_composition() {
146+
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
129147
}
130-
None
131148
}

src/stream_safe.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use normalize::{
22
hangul_decomposition_length,
3-
is_hangul,
3+
is_hangul_syllable,
44
};
55
use tables;
66

@@ -71,7 +71,7 @@ fn classify_nonstarters(c: char) -> Decomposition {
7171
}
7272
}
7373
// Next, special case Hangul, since it's not handled by our tables.
74-
if is_hangul(c) {
74+
if is_hangul_syllable(c) {
7575
return Decomposition {
7676
leading_nonstarters: 0,
7777
trailing_nonstarters: 0,

0 commit comments

Comments
 (0)