Skip to content

Commit 0f8c212

Browse files
committed
Construct an NFA directly from a match pattern
It mostly seems to work, if you ignore the fact that accepting states are found multiple times under some conditions.
1 parent c4a3173 commit 0f8c212

File tree

2 files changed

+114
-29
lines changed

2 files changed

+114
-29
lines changed

src/parser.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use self::{
2121

2222
pub use braille::dots_to_unicode;
2323
pub use braille::fallback;
24-
pub use match_rule::{Attribute, Pattern, Patterns};
24+
pub use match_rule::{Attribute, Pattern, PatternParser, Patterns};
2525

2626
mod braille;
2727
mod match_rule;

src/translator/nfa.rs

Lines changed: 113 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
//! the reachable states is an accepting state
88
use std::collections::{HashMap, HashSet, VecDeque};
99

10+
use crate::parser::Pattern;
11+
1012
use super::Translation;
1113

1214
/// Reference to a state in the [NFA] states vector
@@ -181,6 +183,16 @@ impl NFA {
181183
}
182184
}
183185

186+
fn add_character_class(&mut self, chars: &HashSet<char>) -> Fragment {
187+
let start = self.add_state(State { translation: None });
188+
let end = self.add_state(State { translation: None });
189+
for c in chars {
190+
self.transitions
191+
.insert((start, Transition::Character(*c)), end);
192+
}
193+
Fragment { start, end }
194+
}
195+
184196
fn add_fragment(&mut self, ast: &AST) -> Fragment {
185197
match ast {
186198
AST::Character(c) => self.add_char(*c, None),
@@ -229,6 +241,46 @@ impl NFA {
229241
nfa
230242
}
231243

244+
fn add_pattern_fragment(&mut self, pattern: &Pattern) -> Fragment {
245+
match pattern {
246+
Pattern::Empty => todo!(),
247+
Pattern::Characters(s) => self.add_string(s, None),
248+
Pattern::Boundary => todo!(),
249+
Pattern::Any => self.add_any(None),
250+
Pattern::Set(chars) => self.add_character_class(chars),
251+
Pattern::Attributes(hash_set) => todo!(),
252+
Pattern::Group(vec) => todo!(),
253+
Pattern::Negate(pattern) => todo!(),
254+
Pattern::Optional(pattern) => {
255+
let fragment = self.add_pattern_fragment(pattern);
256+
self.add_optional(&fragment)
257+
}
258+
Pattern::ZeroOrMore(pattern) => {
259+
let fragment = self.add_pattern_fragment(pattern);
260+
self.add_kleene(&fragment)
261+
}
262+
Pattern::OneOrMore(pattern) => {
263+
let one = self.add_pattern_fragment(pattern);
264+
let fragment = self.add_pattern_fragment(pattern);
265+
let kleene = self.add_kleene(&fragment);
266+
self.add_concatenation(&one, &kleene)
267+
}
268+
Pattern::Either(pattern, other) => {
269+
let r1 = self.add_pattern_fragment(pattern);
270+
let r2 = self.add_pattern_fragment(other);
271+
self.add_union(&r1, &r2)
272+
}
273+
}
274+
}
275+
276+
fn from_match_pattern(pattern: &Pattern, translation: &Translation) -> NFA {
277+
let mut nfa = NFA::new();
278+
let body = nfa.add_pattern_fragment(pattern);
279+
nfa.start = body.start;
280+
nfa.states[body.end].translation = Some(translation.clone());
281+
nfa
282+
}
283+
232284
/// Return all states that are reachable from a set of `states`
233285
/// via epsilon transitions
234286
fn epsilon_closure(&self, states: &HashSet<StateId>) -> HashSet<StateId> {
@@ -289,7 +341,7 @@ impl NFA {
289341
) -> Vec<Translation> {
290342
dbg!(&state);
291343
let mut matching_rules = Vec::new();
292-
let mut next_states = self.epsilon_closure(&HashSet::from([state]));
344+
let next_states = dbg!(self.epsilon_closure(&HashSet::from([state])));
293345

294346
// if any of the states in the epsilon closure (reachable via epsilon transition)
295347
// has a translation add it to the list of matching rules
@@ -320,36 +372,30 @@ impl NFA {
320372
));
321373
}
322374

323-
match input.chars().next() {
324-
Some(c) => {
325-
let reachable_via_character =
326-
self.move_state(&next_states, Transition::Character(c));
327-
let reachable_via_any = self.move_state(&next_states, Transition::Any);
328-
next_states = reachable_via_character
329-
.union(&reachable_via_any)
330-
.cloned()
331-
.collect();
332-
next_states = self.epsilon_closure(&next_states);
333-
for state in next_states {
334-
let bytes = c.len_utf8();
335-
matching_rules.extend(self.find_translations_from_state(
336-
state,
337-
&input[bytes..],
338-
match_length + 1,
339-
offset,
340-
));
341-
}
342-
matching_rules
375+
if let Some(c) = input.chars().next() {
376+
let reachable_via_character = self.move_state(&next_states, Transition::Character(c));
377+
let reachable_via_any = self.move_state(&next_states, Transition::Any);
378+
let mut next_states = reachable_via_character
379+
.union(&reachable_via_any)
380+
.cloned()
381+
.collect();
382+
next_states = self.epsilon_closure(&next_states);
383+
for state in next_states {
384+
let bytes = c.len_utf8();
385+
matching_rules.extend(self.find_translations_from_state(
386+
state,
387+
&input[bytes..],
388+
match_length + 1,
389+
offset,
390+
));
343391
}
344-
None => matching_rules,
345392
}
393+
394+
matching_rules
346395
}
347396

348397
pub fn find_translations(&self, input: &str) -> Vec<Translation> {
349-
let mut matching_rules = Vec::new();
350-
351-
matching_rules.extend(self.find_translations_from_state(self.start, input, 0, 0));
352-
matching_rules
398+
self.find_translations_from_state(self.start, input, 0, 0)
353399
}
354400
}
355401

@@ -401,6 +447,7 @@ pub fn nfa_dot(nfa: &NFA) -> String {
401447
#[cfg(test)]
402448
mod tests {
403449
use super::*;
450+
use crate::parser::PatternParser;
404451

405452
#[test]
406453
fn character() {
@@ -480,7 +527,7 @@ mod tests {
480527
#[test]
481528
fn find_kleene() {
482529
let ast = AST::ZeroOrMore(Box::new(AST::Character('a')));
483-
let nfa = dbg!(NFA::from(&ast));
530+
let nfa = NFA::from(&ast);
484531
assert!(!nfa.find_translations("").is_empty());
485532
assert!(!nfa.find_translations("a").is_empty());
486533
assert!(!nfa.find_translations("aa").is_empty());
@@ -495,7 +542,7 @@ mod tests {
495542
Box::new(AST::Character('a')),
496543
Box::new(AST::ZeroOrMore(Box::new(AST::Character('b')))),
497544
);
498-
let nfa = dbg!(NFA::from(&ast));
545+
let nfa = NFA::from(&ast);
499546
assert!(!nfa.find_translations("a").is_empty());
500547
assert!(!nfa.find_translations("aa").is_empty());
501548
assert!(!nfa.find_translations("ab").is_empty());
@@ -637,4 +684,42 @@ mod tests {
637684
assert!(nfa.find_translations("()").is_empty());
638685
assert!(nfa.find_translations("(helo)").is_empty());
639686
}
687+
688+
/// A plain character pattern yields its translation for the exact input
/// and nothing for an unrelated input.
#[test]
fn find_pattern() {
    let patterns = PatternParser::new("abc").pattern().unwrap();
    let pattern = patterns.first().unwrap();
    let blank = String::new();
    let translation = Translation::new(blank.clone(), blank, 0);
    // No `dbg!` around the constructor: dumping the whole automaton only
    // clutters the test output.
    let nfa = NFA::from_match_pattern(&pattern, &translation);
    assert_eq!(nfa.find_translations("abc"), vec![translation]);
    assert!(nfa.find_translations("def").is_empty());
}
698+
699+
/// Every member of the character class `[abc]` yields the translation on
/// its own; an input starting with a non-member yields nothing.
#[test]
fn find_character_class() {
    let parsed = PatternParser::new("[abc]").pattern().unwrap();
    let class = parsed.first().unwrap();
    let empty = String::new();
    let expected = Translation::new(empty.clone(), empty, 0);
    let nfa = NFA::from_match_pattern(&class, &expected);
    for member in ["a", "b", "c"].iter() {
        assert_eq!(nfa.find_translations(member), vec![expected.clone()]);
    }
    assert!(nfa.find_translations("def").is_empty());
}
711+
712+
/// Each single member of `[abc]+` is expected to yield the translation
/// exactly once; an input starting with a non-member yields nothing.
///
/// Currently ignored for the reason given in the attribute below.
#[test]
#[ignore = "finds the same translation multiple times"]
fn find_character_class_one_or_more() {
    let parsed = PatternParser::new("[abc]+").pattern().unwrap();
    let repeated_class = parsed.first().unwrap();
    let empty = String::new();
    let expected = Translation::new(empty.clone(), empty, 0);
    let nfa = NFA::from_match_pattern(&repeated_class, &expected);
    ["a", "b", "c"].iter().copied().for_each(|input| {
        assert_eq!(nfa.find_translations(input), vec![expected.clone()]);
    });
    assert!(nfa.find_translations("def").is_empty());
}
640725
}

0 commit comments

Comments
 (0)