Skip to content

Commit f35d6b6

Browse files
authored
Merge pull request #6 from unicode-rs/mixed
Add mixed-script detection
2 parents 03c709c + 633ee3e commit f35d6b6

File tree

5 files changed

+166
-21
lines changed

5 files changed

+166
-21
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
1616
exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
19+
unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
1920
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2021
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2122
compiler_builtins = { version = "0.1", optional = true }

src/general_security_profile.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
2+
//! for identifiers
3+
4+
use crate::tables::identifier_status as is;
5+
6+
/// Methods for determining characters not restricted from use for identifiers.
7+
pub trait GeneralSecurityProfile {
8+
/// Returns whether the value is not restricted from use for identifiers: for a `char`, the character itself; for a `&str`, every character in the string.
9+
fn identifier_allowed(self) -> bool;
10+
}
11+
12+
impl GeneralSecurityProfile for char {
13+
#[inline]
14+
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
15+
}
16+
17+
impl GeneralSecurityProfile for &'_ str {
18+
#[inline]
19+
fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
20+
}

src/lib.rs

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
//! ```rust
1616
//! extern crate unicode_security;
1717
//!
18-
//! use unicode_security::IdentifierStatusChar;
18+
//! use unicode_security::GeneralSecurityProfile;
1919
//!
2020
//! fn main() {
2121
//! let ch = 'µ'; // U+00B5 MICRO SIGN
@@ -55,21 +55,16 @@ extern crate std;
5555
#[cfg(feature = "bench")]
5656
extern crate test;
5757

58-
use tables::identifier_status as is;
5958
pub use tables::UNICODE_VERSION;
6059

61-
mod tables;
60+
pub mod mixed_script;
61+
pub mod general_security_profile;
6262

63-
#[cfg(test)]
64-
mod tests;
63+
pub use mixed_script::MixedScript;
64+
pub use general_security_profile::GeneralSecurityProfile;
6565

66-
/// Methods for determining characters not restricted from use for identifiers.
67-
pub trait UnicodeIdentifierStatus {
68-
/// Returns whether the character is not restricted from use for identifiers.
69-
fn identifier_allowed(self) -> bool;
70-
}
66+
#[rustfmt::skip]
67+
pub(crate) mod tables;
7168

72-
impl UnicodeIdentifierStatus for char {
73-
#[inline]
74-
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
75-
}
69+
#[cfg(test)]
70+
mod tests;

src/mixed_script.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
2+
3+
use unicode_script::{Script, ScriptExtension};
4+
5+
/// An augmented script set, as defined by UTS 39
6+
///
7+
/// <https://www.unicode.org/reports/tr39/#def-augmented-script-set>
8+
pub struct AugmentedScriptSet {
9+
/// The base ScriptExtension value
10+
pub base: ScriptExtension,
11+
/// Han With Bopomofo
12+
pub hanb: bool,
13+
/// Japanese
14+
pub jpan: bool,
15+
/// Korean
16+
pub kore: bool,
17+
}
18+
19+
impl From<ScriptExtension> for AugmentedScriptSet {
20+
fn from(ext: ScriptExtension) -> Self {
21+
let mut hanb = false;
22+
let mut jpan = false;
23+
let mut kore = false;
24+
25+
if ext == ScriptExtension::Single(Script::Common) ||
26+
ext == ScriptExtension::Single(Script::Inherited) ||
27+
ext.contains_script(Script::Han) {
28+
hanb = true;
29+
jpan = true;
30+
kore = true;
31+
} else {
32+
if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
33+
jpan = true;
34+
}
35+
36+
if ext.contains_script(Script::Hangul) {
37+
kore = true;
38+
}
39+
40+
if ext.contains_script(Script::Bopomofo) {
41+
hanb = true;
42+
}
43+
}
44+
Self {
45+
base: ext,
46+
hanb, jpan, kore
47+
}
48+
}
49+
}
50+
51+
impl From<char> for AugmentedScriptSet {
52+
fn from(c: char) -> Self {
53+
AugmentedScriptSet::for_char(c)
54+
}
55+
}
56+
57+
impl From<&'_ str> for AugmentedScriptSet {
58+
fn from(s: &'_ str) -> Self {
59+
AugmentedScriptSet::for_str(s)
60+
}
61+
}
62+
63+
impl Default for AugmentedScriptSet {
64+
fn default() -> Self {
65+
AugmentedScriptSet {
66+
base: ScriptExtension::Single(Script::Common),
67+
hanb: true,
68+
jpan: true,
69+
kore: true,
70+
}
71+
}
72+
}
73+
74+
impl AugmentedScriptSet {
75+
/// Intersect this set with another
76+
pub fn intersect(mut self, other: Self) -> Self {
77+
self.base = self.base.intersect(other.base);
78+
self.hanb = self.hanb && other.hanb;
79+
self.jpan = self.jpan && other.jpan;
80+
self.kore = self.kore && other.kore;
81+
self
82+
}
83+
84+
/// Check if the set is empty
85+
pub fn is_empty(&self) -> bool {
86+
self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore
87+
}
88+
89+
/// Check if the set is "All", i.e. its base is exactly the single script Common or the single script Inherited
90+
pub fn is_all(&self) -> bool {
91+
self.base == ScriptExtension::Single(Script::Common) ||
92+
self.base == ScriptExtension::Single(Script::Inherited)
93+
}
94+
95+
/// Construct an AugmentedScriptSet for a given character
96+
pub fn for_char(c: char) -> Self {
97+
ScriptExtension::from(c).into()
98+
}
99+
100+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
101+
pub fn for_str(s: &str) -> Self {
102+
let mut set = AugmentedScriptSet::default();
103+
for ch in s.chars() {
104+
set = set.intersect(ch.into())
105+
}
106+
set
107+
}
108+
}
109+
110+
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
111+
pub trait MixedScript {
112+
/// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
113+
///
114+
/// Note that a single-script string may still contain multiple Script properties!
115+
fn is_single_script(self) -> bool;
116+
117+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
118+
fn resolve_script_set(self) -> AugmentedScriptSet;
119+
}
120+
121+
impl MixedScript for &'_ str {
122+
fn is_single_script(self) -> bool {
123+
!AugmentedScriptSet::for_str(self).is_empty()
124+
}
125+
126+
fn resolve_script_set(self) -> AugmentedScriptSet {
127+
self.into()
128+
}
129+
}

src/tests.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,19 @@
1010

1111
#[test]
1212
fn test_char() {
13-
use super::IdentifierStatusChar;
14-
assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true);
13+
use crate::GeneralSecurityProfile;
14+
assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true);
1515
assert_eq!('A'.identifier_allowed(), true);
16-
assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true);
16+
assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true);
1717
assert_eq!('0'.identifier_allowed(), true);
18-
assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true);
18+
assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true);
1919
assert_eq!('_'.identifier_allowed(), true);
20-
assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false);
20+
assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false);
2121
assert_eq!('\x00'.identifier_allowed(), false);
2222
// U+00B5 MICRO SIGN
23-
assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false);
23+
assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false);
2424
assert_eq!('µ'.identifier_allowed(), false);
2525
// U+2160 ROMAN NUMERAL ONE
26-
assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false);
26+
assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false);
2727
assert_eq!('Ⅰ'.identifier_allowed(), false);
2828
}

0 commit comments

Comments
 (0)