Skip to content

Commit 602aaf1

Browse files
bminixhofer and drahnr authored
Improve tagger: Return iterators over WordData, remove groups, parallelize deserialization (#70)
* use vec instead of hashmap internally in tagger
* parallelize tagger deserialization
* return iterators in tagger
* chore: improve benches a bit (#71)
* remove redundant vectors in chunker
* cleanup wordidmap
* update changelog

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>
1 parent 26e1983 commit 602aaf1

File tree

10 files changed

+417
-241
lines changed

10 files changed

+417
-241
lines changed

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,21 @@
1+
# 0.6.4
2+
3+
## Internal improvements
4+
5+
- Decrease time it takes to load the `Tokenizer` by ~ 40% (#70).
6+
- Tag lookup is backed by a vector instead of a hashmap now.
7+
8+
## Breaking changes
9+
10+
- The tagger now returns iterators over tags instead of allocating a vector.
11+
- Remove `get_group_members` function.
12+
13+
# 0.6.3
14+
15+
## Fixes
16+
17+
- Fix a bug where calling `Rule::suggest` in parallel across threads would cause a panic (#68, thanks @drahnr!)
18+
119
# 0.6.2
220

321
## Internal improvements

nlprule/benches/load.rs

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,31 @@
11
use criterion::{black_box, criterion_group, criterion_main, Criterion};
22
use nlprule::{Rules, Tokenizer};
3+
use std::time::Duration;
34

4-
fn criterion_benchmark(c: &mut Criterion) {
5+
fn parse_tokenizer(c: &mut Criterion) {
56
c.bench_function("load tokenizer", |b| {
67
b.iter(|| Tokenizer::new(black_box("../storage/en_tokenizer.bin")).unwrap())
78
});
9+
}
810

11+
fn parse_rules(c: &mut Criterion) {
912
c.bench_function("load rules", |b| {
1013
b.iter(|| Rules::new(black_box("../storage/en_rules.bin")).unwrap())
1114
});
1215
}
1316

14-
criterion_group!(benches, criterion_benchmark);
15-
criterion_main!(benches);
17+
fn no_warmup_criterion() -> Criterion {
18+
Criterion::default()
19+
.sample_size(20)
20+
.warm_up_time(Duration::from_nanos(1))
21+
}
22+
23+
criterion_group!(
24+
name = parse;
25+
config = no_warmup_criterion();
26+
targets =
27+
parse_rules,
28+
parse_tokenizer,
29+
);
30+
31+
criterion_main!(parse);

nlprule/src/compile/impls.rs

Lines changed: 33 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ use crate::{
2323
tokenizer::{
2424
chunk,
2525
multiword::{MultiwordTagger, MultiwordTaggerFields},
26-
tag::{Tagger, TaggerLangOptions},
26+
tag::{Tagger, TaggerLangOptions, WordIdMap},
2727
Tokenizer, TokenizerLangOptions,
2828
},
2929
types::*,
@@ -94,9 +94,6 @@ impl Tagger {
9494
common_words: &HashSet<String>,
9595
lang_options: TaggerLangOptions,
9696
) -> std::io::Result<Self> {
97-
let mut tags = DefaultHashMap::default();
98-
let mut groups = DefaultHashMap::default();
99-
10097
let mut tag_store = HashSet::new();
10198
let mut word_store = HashSet::new();
10299

@@ -148,24 +145,25 @@ impl Tagger {
148145
.map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
149146
.collect();
150147

148+
let mut tags: Vec<Option<Vec<(WordIdInt, PosIdInt)>>> = vec![None; word_store.len()];
149+
151150
for (word, inflection, tag) in lines.iter() {
152151
let word_id = word_store.get_by_left(word).unwrap();
153152
let lemma_id = word_store.get_by_left(inflection).unwrap();
154153
let pos_id = tag_store.get_by_left(tag).unwrap();
155154

156-
let group = groups.entry(*lemma_id).or_insert_with(Vec::new);
157-
if !group.contains(word_id) {
158-
group.push(*word_id);
155+
match &mut tags[word_id.value() as usize] {
156+
Some(vec) => {
157+
vec.push((*lemma_id, *pos_id));
158+
}
159+
None => {
160+
tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]);
161+
}
159162
}
160-
161-
tags.entry(*word_id)
162-
.or_insert_with(Vec::new)
163-
.push((*lemma_id, *pos_id));
164163
}
165164

166165
Ok(Tagger {
167-
tags,
168-
groups,
166+
tags: WordIdMap(tags),
169167
word_store,
170168
tag_store,
171169
lang_options,
@@ -453,24 +451,32 @@ pub(in crate::compile) struct ContextData {
453451
outcomes: Vec<usize>,
454452
}
455453

456-
impl From<ContextData> for chunk::Context {
457-
fn from(data: ContextData) -> Self {
458-
chunk::Context {
459-
parameters: data.parameters,
460-
outcomes: data.outcomes,
461-
}
462-
}
463-
}
464-
465454
impl From<ModelData> for chunk::Model {
466455
fn from(data: ModelData) -> Self {
456+
let mut outcomes: Vec<usize> = Vec::new();
457+
let mut parameters: Vec<f32> = Vec::new();
458+
459+
let pmap = data
460+
.pmap
461+
.into_iter()
462+
.map(|(key, value)| {
463+
assert_eq!(value.outcomes.len(), value.parameters.len());
464+
465+
let offset = outcomes.len();
466+
let length = value.outcomes.len();
467+
468+
outcomes.extend(value.outcomes);
469+
parameters.extend(value.parameters);
470+
471+
(chunk::hash::hash_str(&key), (offset, length))
472+
})
473+
.collect::<DefaultHashMap<_, _>>();
474+
467475
chunk::Model {
468476
outcome_labels: data.outcome_labels,
469-
pmap: data
470-
.pmap
471-
.into_iter()
472-
.map(|(key, value)| (chunk::hash::hash_str(&key), value.into()))
473-
.collect::<DefaultHashMap<_, _>>(),
477+
outcomes,
478+
parameters,
479+
pmap,
474480
}
475481
}
476482
}

nlprule/src/compile/parse_structure.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re
380380
|| m.postag_replace.is_some()
381381
|| m.text.is_some()
382382
{
383+
// this would need a fully functional PosReplacer to work
383384
return Err(Error::Unimplemented(
384385
"postag, postag_regex, postag_replace and text in `match` are not implemented.".into(),
385386
));

nlprule/src/filter/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,12 +27,11 @@ impl Filterable for NoDisambiguationEnglishPartialPosTagFilter {
2727
fn keep(&self, sentence: &MatchSentence, graph: &MatchGraph) -> bool {
2828
graph.by_id(self.id).tokens(sentence).all(|token| {
2929
if let Some(captures) = self.regexp.captures(&token.word().as_str()) {
30-
let tags = sentence
30+
let mut tags = sentence
3131
.tagger()
3232
.get_tags(&captures.get(1).unwrap().as_str());
3333

34-
tags.iter()
35-
.any(|x| self.postag_regexp.is_match(x.pos().as_str()))
34+
tags.any(|x| self.postag_regexp.is_match(x.pos().as_str()))
3635
} else {
3736
false
3837
}

nlprule/src/rule/grammar.rs

Lines changed: 3 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -51,35 +51,9 @@ pub struct PosReplacer {
5151
}
5252

5353
impl PosReplacer {
54-
fn apply(&self, text: &str, sentence: &MatchSentence) -> Option<String> {
55-
let mut candidates: Vec<_> = sentence
56-
.tagger()
57-
.get_tags(text)
58-
.iter()
59-
.map(|x| {
60-
let group_words = sentence.tagger().get_group_members(&x.lemma().as_str());
61-
let mut data = Vec::new();
62-
for word in group_words {
63-
if let Some(i) = sentence
64-
.tagger()
65-
.get_tags(word)
66-
.iter()
67-
.position(|x| self.matcher.is_match(x.pos()))
68-
{
69-
data.push((word.to_string(), i));
70-
}
71-
}
72-
data
73-
})
74-
.rev()
75-
.flatten()
76-
.collect();
77-
candidates.sort_by(|(_, a), (_, b)| a.cmp(b));
78-
if candidates.is_empty() {
79-
None
80-
} else {
81-
Some(candidates.remove(0).0)
82-
}
54+
fn apply(&self, _text: &str, _sentence: &MatchSentence) -> Option<String> {
55+
// TODO: needs to be implemented with correct ordering, currently rules which would need this are disabled
56+
unimplemented!()
8357
}
8458
}
8559

nlprule/src/tokenizer.rs

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -324,11 +324,13 @@ impl Tokenizer {
324324
IncompleteToken::new(
325325
Word::new(
326326
self.tagger.id_word(token_text.into()),
327-
self.tagger.get_tags_with_options(
328-
token_text,
329-
if is_sentence_start { Some(true) } else { None },
330-
None,
331-
),
327+
self.tagger
328+
.get_tags_with_options(
329+
token_text,
330+
if is_sentence_start { Some(true) } else { None },
331+
None,
332+
)
333+
.collect(),
332334
),
333335
Span::new(
334336
byte_start..byte_start + token_text.len(),

nlprule/src/tokenizer/chunk.rs

Lines changed: 50 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -18,12 +18,6 @@ fn softmax(vec: &mut Vec<f32>) {
1818
}
1919
}
2020

21-
#[derive(Debug, Serialize, Deserialize, Clone)]
22-
pub(crate) struct Context {
23-
pub(crate) parameters: Vec<f32>,
24-
pub(crate) outcomes: Vec<usize>,
25-
}
26-
2721
#[derive(Debug, Clone)]
2822
struct Sequence<'a> {
2923
outcomes: Vec<&'a str>,
@@ -122,77 +116,70 @@ pub(crate) mod hash {
122116
#[derive(Serialize, Deserialize)]
123117
struct ModelFields {
124118
outcome_labels: Vec<String>,
125-
// stores each hash and the length of the context of the hash
126-
// this is kind of close to gzip compression already and it is difficult
127-
// where to draw the line. The chunker model should have a custom
128-
// serialization implementation anyway for bf16 compression so this is OK here.
129-
cols: Vec<(u64, u8)>,
130-
// stores the context outcome labels
131-
rows: Vec<u8>,
132-
// stores the context parameter values
133-
values: Vec<bf16>,
119+
pmap: Vec<(u64, u8)>,
120+
outcomes: Vec<u8>,
121+
parameters: Vec<bf16>,
134122
}
135123

136124
impl From<Model> for ModelFields {
137125
fn from(model: Model) -> Self {
138-
let mut cols = Vec::new();
139-
let mut rows = Vec::new();
140-
let mut values = Vec::new();
126+
let mut pmap: Vec<_> = model.pmap.into_iter().collect();
127+
pmap.sort_by_key(|(_, (offset, _))| *offset);
141128

142-
for (key, context) in model.pmap.iter() {
143-
assert_eq!(context.outcomes.len(), context.parameters.len());
144-
assert!(context.outcomes.len() <= std::u8::MAX as usize);
145-
cols.push((*key, context.outcomes.len() as u8));
146-
147-
for (label, value) in context.outcomes.iter().zip(context.parameters.iter()) {
148-
assert!(*label <= std::u8::MAX as usize);
129+
let pmap = pmap
130+
.into_iter()
131+
.map(|(key, (_, length))| {
132+
assert!(length <= u8::MAX as usize);
133+
(key, length as u8)
134+
})
135+
.collect();
149136

150-
rows.push(*label as u8);
151-
values.push(bf16::from_f32(*value));
152-
}
153-
}
137+
let outcomes = model
138+
.outcomes
139+
.into_iter()
140+
.map(|outcome| {
141+
assert!(outcome <= u8::MAX as usize);
142+
outcome as u8
143+
})
144+
.collect();
145+
let parameters = model.parameters.into_iter().map(bf16::from_f32).collect();
154146

155147
ModelFields {
156148
outcome_labels: model.outcome_labels,
157-
cols,
158-
rows,
159-
values,
149+
pmap,
150+
outcomes,
151+
parameters,
160152
}
161153
}
162154
}
163155

164156
impl From<ModelFields> for Model {
165157
fn from(data: ModelFields) -> Self {
166158
let mut pmap = DefaultHashMap::new();
159+
let mut offset = 0;
160+
161+
for (key, length) in data.pmap {
162+
pmap.insert(key, (offset, length as usize));
167163

168-
let mut row_iter = data.rows.iter();
169-
let mut value_iter = data.values.iter();
170-
171-
for (key, n) in data.cols.iter() {
172-
let outcomes: Vec<_> = (0..*n as usize)
173-
.map(|_| *row_iter.next().expect("checked in From<Model> impl") as usize)
174-
.collect();
175-
let parameters: Vec<_> = (0..*n as usize)
176-
.map(|_| {
177-
value_iter
178-
.next()
179-
.expect("checked in From<Model> impl")
180-
.to_f32()
181-
})
182-
.collect();
183-
184-
pmap.insert(
185-
*key,
186-
Context {
187-
outcomes,
188-
parameters,
189-
},
190-
);
164+
offset += length as usize;
191165
}
192166

167+
let outcomes = data
168+
.outcomes
169+
.into_iter()
170+
.map(|outcome| outcome as usize)
171+
.collect();
172+
let parameters = data
173+
.parameters
174+
.into_iter()
175+
.map(|parameter| parameter.to_f32())
176+
.collect();
177+
193178
Model {
194179
outcome_labels: data.outcome_labels,
195180
pmap,
181+
outcomes,
182+
parameters,
196183
}
197184
}
198185
}
@@ -201,16 +188,21 @@ impl From<ModelFields> for Model {
201188
#[serde(from = "ModelFields", into = "ModelFields")]
202189
pub(crate) struct Model {
203190
pub(crate) outcome_labels: Vec<String>,
204-
pub(crate) pmap: DefaultHashMap<u64, Context>,
191+
pub(crate) outcomes: Vec<usize>,
192+
pub(crate) parameters: Vec<f32>,
193+
pub(crate) pmap: DefaultHashMap<u64, (usize, usize)>,
205194
}
206195

207196
impl Model {
208197
fn eval(&self, context: &[u64]) -> Vec<f32> {
209198
let mut prior =
210199
vec![(1. / (self.outcome_labels.len() as f32)).ln(); self.outcome_labels.len()];
211200

212-
for context in context.iter().filter_map(|x| self.pmap.get(&x)) {
213-
for (idx, param) in context.outcomes.iter().zip(context.parameters.iter()) {
201+
for (offset, length) in context.iter().filter_map(|x| self.pmap.get(&x)) {
202+
let outcomes = &self.outcomes[*offset..*offset + length];
203+
let parameters = &self.parameters[*offset..*offset + length];
204+
205+
for (idx, param) in outcomes.iter().zip(parameters.iter()) {
214206
prior[*idx] += param;
215207
}
216208
}

0 commit comments

Comments (0)