|
| 1 | +//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) |
| 2 | +
|
| 3 | +use unicode_script::{Script, ScriptExtension}; |
| 4 | + |
| 5 | +/// An Augmented script set, as defined by UTS 39 |
| 6 | +/// |
| 7 | +/// https://www.unicode.org/reports/tr39/#def-augmented-script-set |
| 8 | +pub struct AugmentedScriptSet { |
| 9 | + /// The base ScriptExtension value |
| 10 | + pub base: ScriptExtension, |
| 11 | + /// Han With Bopomofo |
| 12 | + pub hanb: bool, |
| 13 | + /// Japanese |
| 14 | + pub jpan: bool, |
| 15 | + /// Korean |
| 16 | + pub kore: bool, |
| 17 | +} |
| 18 | + |
| 19 | +impl From<ScriptExtension> for AugmentedScriptSet { |
| 20 | + fn from(ext: ScriptExtension) -> Self { |
| 21 | + let mut hanb = false; |
| 22 | + let mut jpan = false; |
| 23 | + let mut kore = false; |
| 24 | + |
| 25 | + if ext == ScriptExtension::Single(Script::Common) || |
| 26 | + ext == ScriptExtension::Single(Script::Inherited) || |
| 27 | + ext.contains_script(Script::Han) { |
| 28 | + hanb = true; |
| 29 | + jpan = true; |
| 30 | + kore = true; |
| 31 | + } else { |
| 32 | + if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) { |
| 33 | + jpan = true; |
| 34 | + } |
| 35 | + |
| 36 | + if ext.contains_script(Script::Hangul) { |
| 37 | + kore = true; |
| 38 | + } |
| 39 | + |
| 40 | + if ext.contains_script(Script::Bopomofo) { |
| 41 | + hanb = true; |
| 42 | + } |
| 43 | + } |
| 44 | + Self { |
| 45 | + base: ext, |
| 46 | + hanb, jpan, kore |
| 47 | + } |
| 48 | + } |
| 49 | +} |
| 50 | + |
| 51 | +impl From<char> for AugmentedScriptSet { |
| 52 | + fn from(c: char) -> Self { |
| 53 | + AugmentedScriptSet::for_char(c) |
| 54 | + } |
| 55 | +} |
| 56 | + |
| 57 | +impl From<&'_ str> for AugmentedScriptSet { |
| 58 | + fn from(s: &'_ str) -> Self { |
| 59 | + AugmentedScriptSet::for_str(s) |
| 60 | + } |
| 61 | +} |
| 62 | + |
| 63 | +impl Default for AugmentedScriptSet { |
| 64 | + fn default() -> Self { |
| 65 | + AugmentedScriptSet { |
| 66 | + base: ScriptExtension::Single(Script::Common), |
| 67 | + hanb: true, |
| 68 | + jpan: true, |
| 69 | + kore: true, |
| 70 | + } |
| 71 | + } |
| 72 | +} |
| 73 | + |
| 74 | +impl AugmentedScriptSet { |
| 75 | + /// Intersect this set with another |
| 76 | + pub fn intersect(mut self, other: Self) -> Self { |
| 77 | + self.base = self.base.intersect(other.base); |
| 78 | + self.hanb = self.hanb && other.hanb; |
| 79 | + self.jpan = self.jpan && other.jpan; |
| 80 | + self.kore = self.kore && other.kore; |
| 81 | + self |
| 82 | + } |
| 83 | + |
| 84 | + /// Check if the set is empty |
| 85 | + pub fn is_empty(&self) -> bool { |
| 86 | + self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore |
| 87 | + } |
| 88 | + |
| 89 | + /// Check if the set is "All" (Common or Inherited) |
| 90 | + pub fn is_all(&self) -> bool { |
| 91 | + self.base == ScriptExtension::Single(Script::Common) || |
| 92 | + self.base == ScriptExtension::Single(Script::Inherited) |
| 93 | + } |
| 94 | + |
| 95 | + /// Construct an AugmentedScriptSet for a given character |
| 96 | + pub fn for_char(c: char) -> Self { |
| 97 | + ScriptExtension::from(c).into() |
| 98 | + } |
| 99 | + |
| 100 | + /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string |
| 101 | + pub fn for_str(s: &str) -> Self { |
| 102 | + let mut set = AugmentedScriptSet::default(); |
| 103 | + for ch in s.chars() { |
| 104 | + set = set.intersect(ch.into()) |
| 105 | + } |
| 106 | + set |
| 107 | + } |
| 108 | +} |
| 109 | + |
| 110 | +/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) |
| 111 | +pub trait MixedScript { |
| 112 | + /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script) |
| 113 | + /// |
| 114 | + /// Note that a single-script string may still contain multiple Script properties! |
| 115 | + fn is_single_script(self) -> bool; |
| 116 | + |
| 117 | + /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string |
| 118 | + fn resolve_script_set(self) -> AugmentedScriptSet; |
| 119 | +} |
| 120 | + |
| 121 | +impl MixedScript for &'_ str { |
| 122 | + fn is_single_script(self) -> bool { |
| 123 | + !AugmentedScriptSet::for_str(self).is_empty() |
| 124 | + } |
| 125 | + |
| 126 | + fn resolve_script_set(self) -> AugmentedScriptSet { |
| 127 | + self.into() |
| 128 | + } |
| 129 | +} |
0 commit comments