Skip to content

Commit 0df76e8

Browse files
committed
Update scripts to share more code with other unicode crates
1 parent f35d6b6 commit 0df76e8

File tree

3 files changed

+82
-49
lines changed

3 files changed

+82
-49
lines changed

scripts/unicode.py

Lines changed: 56 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -47,37 +47,39 @@ def fetch(f):
4747
sys.stderr.write("cannot load %s\n" % f)
4848
exit(1)
4949

50-
# load identifier status data
51-
def load_identifier_status():
52-
f = "IdentifierStatus.txt"
50+
# Implementation from unicode-segmentation
51+
def load_properties(f, interestingprops = None):
5352
fetch(f)
54-
statuses = []
55-
re1 = re.compile("^([0-9A-F]+) +; +(\w+)")
56-
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; +(\w+)")
53+
props = {}
54+
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
55+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
5756

58-
for line in fileinput.input(f):
57+
for line in fileinput.input(os.path.basename(f)):
58+
prop = None
5959
d_lo = 0
6060
d_hi = 0
61-
cat = None
6261
m = re1.match(line)
6362
if m:
6463
d_lo = m.group(1)
6564
d_hi = m.group(1)
66-
cat = m.group(2)
65+
prop = m.group(2).strip()
6766
else:
6867
m = re2.match(line)
6968
if m:
7069
d_lo = m.group(1)
7170
d_hi = m.group(2)
72-
cat = m.group(3)
71+
prop = m.group(3).strip()
7372
else:
7473
continue
75-
if cat != "Allowed":
74+
if interestingprops and prop not in interestingprops:
7675
continue
7776
d_lo = int(d_lo, 16)
7877
d_hi = int(d_hi, 16)
79-
statuses.append((d_lo, d_hi))
80-
return statuses
78+
if prop not in props:
79+
props[prop] = []
80+
props[prop].append((d_lo, d_hi))
81+
82+
return props
8183

8284
def format_table_content(f, content, indent):
8385
line = " "*indent
@@ -115,41 +117,57 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
115117
format_table_content(f, data, 8)
116118
f.write("\n ];\n\n")
117119

118-
def emit_identifier_status_module(f, statuses_table):
119-
f.write("pub mod identifier_status {")
120+
def emit_identifier_module(f):
121+
f.write("pub mod identifier {")
120122
f.write("""
121-
use core::result::Result::{Ok, Err};
123+
#[inline]
124+
pub fn identifier_status_allowed(c: char) -> bool {
125+
// FIXME: do we want to special case ASCII here?
126+
match c as usize {
127+
_ => super::util::bsearch_range_table(c, identifier_status_table)
128+
}
129+
}
130+
""")
131+
132+
f.write(" // Identifier status table:\n")
133+
identifier_status_table = load_properties("IdentifierStatus.txt")
134+
emit_table(f, "identifier_status_table", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
135+
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
136+
f.write("}\n\n")
122137

138+
def emit_util_mod(f):
139+
f.write("""
140+
pub mod util {
141+
use core::result::Result::{Ok, Err};
123142
#[inline]
124-
fn bsearch_range_value_table(c: char, r: &'static [(char, char)]) -> bool {
143+
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
125144
use core::cmp::Ordering::{Equal, Less, Greater};
126-
match r.binary_search_by(|&(lo, hi)| {
145+
r.binary_search_by(|&(lo,hi)| {
146+
if lo <= c && c <= hi { Equal }
147+
else if hi < c { Less }
148+
else { Greater }
149+
}).is_ok()
150+
}
151+
152+
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
153+
use core::cmp::Ordering::{Equal, Less, Greater};
154+
match r.binary_search_by(|&(lo, hi, _)| {
127155
if lo <= c && c <= hi { Equal }
128156
else if hi < c { Less }
129157
else { Greater }
130158
}) {
131-
Ok(_) => true,
132-
Err(_) => false
159+
Ok(idx) => {
160+
let (_, _, cat) = r[idx];
161+
Some(cat)
162+
}
163+
Err(_) => None
133164
}
134165
}
135-
""")
136166
137-
f.write("""
138-
#[inline]
139-
pub fn identifier_status_allowed(c: char) -> bool {
140-
// FIXME: do we want to special case ASCII here?
141-
match c as usize {
142-
_ => bsearch_range_value_table(c, identifier_status_table)
143-
}
144-
}
167+
}
145168
146169
""")
147170

148-
f.write(" // identifier status table.\n")
149-
emit_table(f, "identifier_status_table", statuses_table, "&'static [(char, char)]", is_pub=False,
150-
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
151-
f.write("}\n\n")
152-
153171
if __name__ == "__main__":
154172
r = "tables.rs"
155173
if os.path.exists(r):
@@ -164,6 +182,7 @@ def emit_identifier_status_module(f, statuses_table):
164182
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
165183
166184
""" % UNICODE_VERSION)
167-
### identifier status module
168-
identifier_status_table = load_identifier_status()
169-
emit_identifier_status_module(rf, identifier_status_table)
185+
186+
emit_util_mod(rf)
187+
### identifier module
188+
emit_identifier_module(rf)

src/general_security_profile.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
22
//! for identifiers
33
4-
use crate::tables::identifier_status as is;
4+
use crate::tables::identifier;
55

66
/// Methods for determining characters not restricted from use for identifiers.
77
pub trait GeneralSecurityProfile {
@@ -11,10 +11,10 @@ pub trait GeneralSecurityProfile {
1111

1212
impl GeneralSecurityProfile for char {
1313
#[inline]
14-
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
14+
fn identifier_allowed(self) -> bool { identifier::identifier_status_allowed(self) }
1515
}
1616

1717
impl GeneralSecurityProfile for &'_ str {
1818
#[inline]
19-
fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
19+
fn identifier_allowed(self) -> bool { self.chars().all(identifier::identifier_status_allowed) }
2020
}

src/tables.rs

Lines changed: 23 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,31 +16,45 @@
1616
/// that this version of unicode-security is based on.
1717
pub const UNICODE_VERSION: (u64, u64, u64) = (12, 1, 0);
1818

19-
pub mod identifier_status {
20-
use core::result::Result::{Ok, Err};
2119

20+
pub mod util {
21+
use core::result::Result::{Ok, Err};
2222
#[inline]
23-
fn bsearch_range_value_table(c: char, r: &'static [(char, char)]) -> bool {
23+
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
24+
use core::cmp::Ordering::{Equal, Less, Greater};
25+
r.binary_search_by(|&(lo,hi)| {
26+
if lo <= c && c <= hi { Equal }
27+
else if hi < c { Less }
28+
else { Greater }
29+
}).is_ok()
30+
}
31+
32+
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
2433
use core::cmp::Ordering::{Equal, Less, Greater};
25-
match r.binary_search_by(|&(lo, hi)| {
34+
match r.binary_search_by(|&(lo, hi, _)| {
2635
if lo <= c && c <= hi { Equal }
2736
else if hi < c { Less }
2837
else { Greater }
2938
}) {
30-
Ok(_) => true,
31-
Err(_) => false
39+
Ok(idx) => {
40+
let (_, _, cat) = r[idx];
41+
Some(cat)
42+
}
43+
Err(_) => None
3244
}
3345
}
3446

47+
}
48+
49+
pub mod identifier {
3550
#[inline]
3651
pub fn identifier_status_allowed(c: char) -> bool {
3752
// FIXME: do we want to special case ASCII here?
3853
match c as usize {
39-
_ => bsearch_range_value_table(c, identifier_status_table)
54+
_ => super::util::bsearch_range_table(c, identifier_status_table)
4055
}
4156
}
42-
43-
// identifier status table.
57+
// Identifier status table:
4458
const identifier_status_table: &'static [(char, char)] = &[
4559
('\u{27}', '\u{27}'), ('\u{2d}', '\u{2e}'), ('\u{30}', '\u{3a}'), ('\u{41}', '\u{5a}'),
4660
('\u{5f}', '\u{5f}'), ('\u{61}', '\u{7a}'), ('\u{b7}', '\u{b7}'), ('\u{c0}', '\u{d6}'),

0 commit comments

Comments
 (0)