Skip to content

Commit b56f3a1

Browse files
committed
Add mixed-script detection
1 parent 03c709c commit b56f3a1

File tree

3 files changed

+124
-0
lines changed

3 files changed

+124
-0
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
1616
exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
19+
unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
1920
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2021
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2122
compiler_builtins = { version = "0.1", optional = true }

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ extern crate test;
5858
use tables::identifier_status as is;
5959
pub use tables::UNICODE_VERSION;
6060

61+
pub mod mixed_script;
62+
6163
mod tables;
6264

6365
#[cfg(test)]

src/mixed_script.rs

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
2+
3+
use unicode_script::{Script, ScriptExtension};
4+
5+
/// An Augmented script set, as defined by UTS 39
6+
///
7+
/// https://www.unicode.org/reports/tr39/#def-augmented-script-set
8+
pub struct AugmentedScriptSet {
9+
/// The base ScriptExtension value
10+
pub base: ScriptExtension,
11+
/// Han With Bopomofo
12+
pub hanb: bool,
13+
/// Japanese
14+
pub jpan: bool,
15+
/// Korean
16+
pub kore: bool,
17+
}
18+
19+
impl From<ScriptExtension> for AugmentedScriptSet {
20+
fn from(ext: ScriptExtension) -> Self {
21+
let mut hanb = false;
22+
let mut jpan = false;
23+
let mut kore = false;
24+
25+
if ext.contains_script(Script::Han) {
26+
hanb = true;
27+
jpan = true;
28+
kore = true;
29+
} else {
30+
if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
31+
jpan = true;
32+
}
33+
34+
if ext.contains_script(Script::Hangul) {
35+
kore = true;
36+
}
37+
38+
if ext.contains_script(Script::Bopomofo) {
39+
hanb = true;
40+
}
41+
}
42+
Self {
43+
base: ext,
44+
hanb, jpan, kore
45+
}
46+
}
47+
}
48+
49+
impl From<char> for AugmentedScriptSet {
50+
fn from(c: char) -> Self {
51+
AugmentedScriptSet::for_char(c)
52+
}
53+
}
54+
55+
impl From<&'_ str> for AugmentedScriptSet {
56+
fn from(s: &'_ str) -> Self {
57+
AugmentedScriptSet::for_str(s)
58+
}
59+
}
60+
61+
impl Default for AugmentedScriptSet {
62+
fn default() -> Self {
63+
AugmentedScriptSet {
64+
base: ScriptExtension::Single(Script::Common),
65+
hanb: true,
66+
jpan: true,
67+
kore: true,
68+
}
69+
}
70+
}
71+
72+
impl AugmentedScriptSet {
73+
/// Intersect this set with another
74+
pub fn intersect(mut self, other: Self) -> Self {
75+
self.base = self.base.intersect(other.base);
76+
self.hanb = self.hanb && other.hanb;
77+
self.jpan = self.jpan && other.jpan;
78+
self.kore = self.kore && other.kore;
79+
self
80+
}
81+
82+
/// Check if the set is empty
83+
pub fn is_empty(&self) -> bool {
84+
self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore
85+
}
86+
87+
/// Construct an AugmentedScriptSet for a given character
88+
pub fn for_char(c: char) -> Self {
89+
ScriptExtension::from(c).into()
90+
}
91+
92+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
93+
pub fn for_str(s: &str) -> Self {
94+
let mut set = AugmentedScriptSet::default();
95+
for ch in s.chars() {
96+
set = set.intersect(ch.into())
97+
}
98+
set
99+
}
100+
}
101+
102+
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
103+
pub trait MixedScript {
104+
/// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
105+
///
106+
/// Note that a single-script string may still contain multiple Script properties!
107+
fn is_single_script(self) -> bool;
108+
109+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
110+
fn resolve_script_set(self) -> AugmentedScriptSet;
111+
}
112+
113+
impl MixedScript for &'_ str {
114+
fn is_single_script(self) -> bool {
115+
!AugmentedScriptSet::for_str(self).is_empty()
116+
}
117+
118+
fn resolve_script_set(self) -> AugmentedScriptSet {
119+
self.into()
120+
}
121+
}

0 commit comments

Comments
 (0)