Skip to content

Commit c4a3173

Browse files
committed
Add recursive traversal of the nfa
so that we can handle offsets into the search string, which is needed for lookbehind. The liblouis match pre-patterns are a form of lookbehind.
1 parent 4d6bd7d commit c4a3173

File tree

1 file changed

+208
-0
lines changed

1 file changed

+208
-0
lines changed

src/translator/nfa.rs

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,17 @@ enum Boundary {
3434
/// A transition between two [States](State) in the [NFA]
3535
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
3636
enum Transition {
37+
/// A transition that accepts a character
3738
Character(char),
39+
/// A transition that accepts any character
3840
Any,
3941
/// A transition labelled with the start of a [Boundary]
/// (NOTE(review): presumably only taken where such a boundary begins — confirm)
Start(Boundary),
4042
/// A transition labelled with the end of a [Boundary]
/// (NOTE(review): presumably only taken where such a boundary ends — confirm)
End(Boundary),
43+
/// An epsilon transition that accepts the empty string
4144
Epsilon,
45+
/// An Offset transition is essentially an epsilon transition that marks the end of a
46+
/// non-capturing group. It is used to mark the end of the pre pattern in match rules
47+
Offset,
4248
}
4349

4450
/// An NFA consisting of a set of states and transitions between them
@@ -273,6 +279,78 @@ impl NFA {
273279
.iter()
274280
.any(|s| self.states[*s].translation.is_some())
275281
}
282+
283+
fn find_translations_from_state(
284+
&self,
285+
state: StateId,
286+
input: &str,
287+
match_length: usize,
288+
offset: usize,
289+
) -> Vec<Translation> {
290+
dbg!(&state);
291+
let mut matching_rules = Vec::new();
292+
let mut next_states = self.epsilon_closure(&HashSet::from([state]));
293+
294+
// if any of the states in the epsilon closure (reachable via epsilon transition)
295+
// has a translation add it to the list of matching rules
296+
matching_rules.extend(
297+
next_states
298+
.iter()
299+
.flat_map(|state| &self.states[*state].translation)
300+
.map(|translation| {
301+
translation
302+
.clone()
303+
.with_offset(offset)
304+
// if there is an offset (typically in a match opcode), the weight needs
305+
// to be calculated at run-time. The weight is the actual length of match.
306+
.with_weight_if_offset(match_length, offset)
307+
}),
308+
);
309+
310+
// traverse all states that are reachable via an offset transition (essentially an
311+
// epsilon transition that marks the end of a non-capture group)
312+
let reachable_via_offset = self.move_state(&next_states, Transition::Offset);
313+
let next_states_with_offset = self.epsilon_closure(&reachable_via_offset);
314+
for state in next_states_with_offset {
315+
matching_rules.extend(self.find_translations_from_state(
316+
state,
317+
input,
318+
match_length + 1,
319+
offset + match_length,
320+
));
321+
}
322+
323+
match input.chars().next() {
324+
Some(c) => {
325+
let reachable_via_character =
326+
self.move_state(&next_states, Transition::Character(c));
327+
let reachable_via_any = self.move_state(&next_states, Transition::Any);
328+
next_states = reachable_via_character
329+
.union(&reachable_via_any)
330+
.cloned()
331+
.collect();
332+
next_states = self.epsilon_closure(&next_states);
333+
for state in next_states {
334+
let bytes = c.len_utf8();
335+
matching_rules.extend(self.find_translations_from_state(
336+
state,
337+
&input[bytes..],
338+
match_length + 1,
339+
offset,
340+
));
341+
}
342+
matching_rules
343+
}
344+
None => matching_rules,
345+
}
346+
}
347+
348+
pub fn find_translations(&self, input: &str) -> Vec<Translation> {
349+
let mut matching_rules = Vec::new();
350+
351+
matching_rules.extend(self.find_translations_from_state(self.start, input, 0, 0));
352+
matching_rules
353+
}
276354
}
277355

278356
/**
@@ -297,6 +375,9 @@ pub fn nfa_dot(nfa: &NFA) -> String {
297375
Transition::Any => {
298376
dot.push_str(&format!("\t{} -> {} [label=\"{}\"]\n", from, to, "Any"))
299377
}
378+
Transition::Offset => {
379+
dot.push_str(&format!("\t{} -> {} [label=\"{}\"]\n", from, to, "Offset"))
380+
}
300381
Transition::Start(boundary) => dot.push_str(&format!(
301382
"\t{} -> {} [label=\"{:?}\"]\n",
302383
from, to, boundary
@@ -329,6 +410,14 @@ mod tests {
329410
assert!(!nfa.accepts("b"));
330411
}
331412

413+
#[test]
fn find_character() {
    // a single character only yields translations for input starting with it
    let nfa = NFA::from(&AST::Character('a'));
    let found_for_a = nfa.find_translations("a");
    let found_for_b = nfa.find_translations("b");
    assert!(!found_for_a.is_empty());
    assert!(found_for_b.is_empty());
}
420+
332421
#[test]
333422
fn alteration() {
334423
let ast = AST::Either(Box::new(AST::Character('a')), Box::new(AST::Character('b')));
@@ -339,6 +428,16 @@ mod tests {
339428
assert!(!nfa.accepts("c"));
340429
}
341430

431+
#[test]
fn find_alteration() {
    let either = AST::Either(Box::new(AST::Character('a')), Box::new(AST::Character('b')));
    let nfa = NFA::from(&either);
    // input starting with either of the alternatives yields translations
    for matching in ["a", "b", "ab"] {
        assert!(!nfa.find_translations(matching).is_empty());
    }
    assert!(nfa.find_translations("c").is_empty());
}
440+
342441
#[test]
343442
fn concatenation() {
344443
let ast = AST::Concat(Box::new(AST::Character('a')), Box::new(AST::Character('b')));
@@ -351,6 +450,18 @@ mod tests {
351450
assert!(!nfa.accepts("abc"));
352451
}
353452

453+
#[test]
fn find_concatenation() {
    let sequence = AST::Concat(Box::new(AST::Character('a')), Box::new(AST::Character('b')));
    let nfa = NFA::from(&sequence);
    // the input has to start with the complete sequence
    for matching in ["ab", "abc"] {
        assert!(!nfa.find_translations(matching).is_empty());
    }
    for rejected in ["a", "b", "ba", "c"] {
        assert!(nfa.find_translations(rejected).is_empty());
    }
}
464+
354465
#[test]
355466
fn kleene() {
356467
let ast = AST::ZeroOrMore(Box::new(AST::Character('a')));
@@ -366,6 +477,38 @@ mod tests {
366477
assert!(!nfa.accepts("abc"));
367478
}
368479

480+
#[test]
fn find_kleene() {
    // `a*` also matches the empty prefix, so every input yields a translation
    let ast = AST::ZeroOrMore(Box::new(AST::Character('a')));
    let nfa = NFA::from(&ast);
    assert!(!nfa.find_translations("").is_empty());
    assert!(!nfa.find_translations("a").is_empty());
    assert!(!nfa.find_translations("aa").is_empty());
    assert!(!nfa.find_translations("aaaaa").is_empty());
    assert!(!nfa.find_translations("ab").is_empty());
    assert!(!nfa.find_translations("abc").is_empty());
    assert!(!nfa.find_translations("b").is_empty());
    assert!(!nfa.find_translations("ba").is_empty());
    assert!(!nfa.find_translations("c").is_empty());

    // `ab*` requires the input to start with an 'a'
    let ast = AST::Concat(
        Box::new(AST::Character('a')),
        Box::new(AST::ZeroOrMore(Box::new(AST::Character('b')))),
    );
    let nfa = NFA::from(&ast);
    assert!(!nfa.find_translations("a").is_empty());
    assert!(!nfa.find_translations("aa").is_empty());
    assert!(!nfa.find_translations("ab").is_empty());
    assert!(!nfa.find_translations("abbbb").is_empty());
    assert!(nfa.find_translations("").is_empty());
    assert!(nfa.find_translations("ccccc").is_empty());
    assert!(nfa.find_translations("cb").is_empty());
    assert!(nfa.find_translations("cba").is_empty());
    assert!(nfa.find_translations("b").is_empty());
    assert!(nfa.find_translations("ba").is_empty());
    assert!(nfa.find_translations("c").is_empty());
}
511+
369512
#[test]
370513
fn one_or_more() {
371514
let ast = AST::OneOrMore(Box::new(AST::Character('a')));
@@ -381,6 +524,21 @@ mod tests {
381524
assert!(!nfa.accepts("abc"));
382525
}
383526

527+
#[test]
fn find_one_or_more() {
    let nfa = NFA::from(&AST::OneOrMore(Box::new(AST::Character('a'))));
    // at least one leading 'a' is required
    for matching in ["a", "aa", "aaaaa", "ab", "abc"] {
        assert!(!nfa.find_translations(matching).is_empty());
    }
    for rejected in ["", "b", "ba", "c"] {
        assert!(nfa.find_translations(rejected).is_empty());
    }
}
541+
384542
#[test]
385543
fn any() {
386544
let ast = AST::Concat(
@@ -394,6 +552,19 @@ mod tests {
394552
assert!(nfa.accepts("abb"));
395553
}
396554

555+
#[test]
fn find_any() {
    // 'a', followed by any character, followed by 'b'
    let a_then_any = AST::Concat(Box::new(AST::Character('a')), Box::new(AST::Any));
    let ast = AST::Concat(Box::new(a_then_any), Box::new(AST::Character('b')));
    let nfa = NFA::from(&ast);
    let found = nfa.find_translations("abb");
    assert!(!found.is_empty());
}
567+
397568
#[test]
398569
fn optional() {
399570
let ast = AST::Concat(
@@ -411,6 +582,24 @@ mod tests {
411582
assert!(!nfa.accepts("bbb"));
412583
}
413584

585+
#[test]
fn find_optional() {
    // an optional "a<any>" group followed by a mandatory 'b'
    let a_then_any = AST::Concat(Box::new(AST::Character('a')), Box::new(AST::Any));
    let ast = AST::Concat(
        Box::new(AST::Optional(Box::new(a_then_any))),
        Box::new(AST::Character('b')),
    );
    let nfa = NFA::from(&ast);
    for matching in ["acb", "axb", "b", "bbb"] {
        assert!(!nfa.find_translations(matching).is_empty());
    }
    for rejected in ["c", ""] {
        assert!(nfa.find_translations(rejected).is_empty());
    }
}
602+
414603
#[test]
415604
fn string() {
416605
let ast = AST::Concat(
@@ -429,4 +618,23 @@ mod tests {
429618
assert!(!nfa.accepts("()"));
430619
assert!(!nfa.accepts("(helo)"));
431620
}
621+
622+
#[test]
fn find_string() {
    // one or more '(', the literal "hello", one or more ')'
    let opening = AST::OneOrMore(Box::new(AST::Character('(')));
    let closing = AST::OneOrMore(Box::new(AST::Character(')')));
    let opening_and_word = AST::Concat(
        Box::new(opening),
        Box::new(AST::String("hello".to_string())),
    );
    let ast = AST::Concat(Box::new(opening_and_word), Box::new(closing));
    let nfa = NFA::from(&ast);
    for matching in ["(hello)", "(((((hello)))"] {
        assert!(!nfa.find_translations(matching).is_empty());
    }
    for rejected in ["hello", "(hello", "hello)", "()", "(helo)"] {
        assert!(nfa.find_translations(rejected).is_empty());
    }
}
432640
}

0 commit comments

Comments
 (0)