aho_corasick/
nfa.rs

1use std::cmp;
2use std::collections::{BTreeSet, VecDeque};
3use std::fmt;
4use std::mem::size_of;
5use std::ops::{Index, IndexMut};
6
7use ahocorasick::MatchKind;
8use automaton::Automaton;
9use classes::{ByteClassBuilder, ByteClasses};
10use error::Result;
11use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12use state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13use Match;
14
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller (starting at 0).
pub type PatternID = usize;

/// The length of a pattern, in bytes.
pub type PatternLength = usize;
22/// An Aho-Corasick automaton, represented as an NFA.
23///
24/// This is the classical formulation of Aho-Corasick, which involves building
25/// up a prefix trie of a given set of patterns, and then wiring up failure
26/// transitions between states in order to guarantee linear time matching. The
27/// standard formulation is, technically, an NFA because of these failure
28/// transitions. That is, one can see them as enabling the automaton to be in
29/// multiple states at once. Indeed, during search, it is possible to check
30/// the transitions on multiple states for a single input byte.
31///
32/// This particular implementation not only supports the standard style of
33/// matching, but also provides a mode for choosing leftmost-first or
34/// leftmost-longest match semantics. When a leftmost mode is chosen, some
35/// failure transitions that would otherwise be added are elided. See
36/// the documentation of `MatchKind` for more details and examples on how the
37/// match semantics may differ.
38///
39/// If one wants a DFA, then it is necessary to first build an NFA and convert
40/// it into a DFA. Note, however, that because we've constrained ourselves to
41/// matching literal patterns, this does not need to use subset construction
42/// for determinization. Instead, the DFA has at most a number of states
43/// equivalent to the number of NFA states. The only real difference between
44/// them is that all failure transitions are followed and pre-computed. This
45/// uses much more memory, but also executes searches more quickly.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
89
90impl<S: StateID> NFA<S> {
91    /// Returns the equivalence classes of bytes found while constructing
92    /// this NFA.
93    ///
94    /// Note that the NFA doesn't actually make use of these equivalence
95    /// classes. Instead, these are useful for building the DFA when desired.
96    pub fn byte_classes(&self) -> &ByteClasses {
97        &self.byte_classes
98    }
99
100    /// Returns a prefilter, if one exists.
101    pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102        self.prefilter.as_ref()
103    }
104
105    /// Returns the total number of heap bytes used by this NFA's transition
106    /// table.
107    pub fn heap_bytes(&self) -> usize {
108        self.heap_bytes
109            + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
110    }
111
112    /// Return the length of the longest pattern in this automaton.
113    pub fn max_pattern_len(&self) -> usize {
114        self.max_pattern_len
115    }
116
117    /// Return the total number of patterns added to this automaton.
118    pub fn pattern_count(&self) -> usize {
119        self.pattern_count
120    }
121
122    /// Returns the total number of states in this NFA.
123    pub fn state_len(&self) -> usize {
124        self.states.len()
125    }
126
127    /// Returns the matches for the given state.
128    pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129        &self.states[id.to_usize()].matches
130    }
131
132    /// Returns an iterator over all transitions in the given state according
133    /// to the given equivalence classes, including transitions to `fail_id()`.
134    /// The number of transitions returned is always equivalent to the number
135    /// of equivalence classes.
136    pub fn iter_all_transitions<F: FnMut(u8, S)>(
137        &self,
138        byte_classes: &ByteClasses,
139        id: S,
140        f: F,
141    ) {
142        self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143    }
144
145    /// Returns the failure transition for the given state.
146    pub fn failure_transition(&self, id: S) -> S {
147        self.states[id.to_usize()].fail
148    }
149
150    /// Returns the next state for the given state and input byte.
151    ///
152    /// Note that this does not follow failure transitions. As such, the id
153    /// returned may be `fail_id`.
154    pub fn next_state(&self, current: S, input: u8) -> S {
155        self.states[current.to_usize()].next_state(input)
156    }
157
158    fn state(&self, id: S) -> &State<S> {
159        &self.states[id.to_usize()]
160    }
161
162    fn state_mut(&mut self, id: S) -> &mut State<S> {
163        &mut self.states[id.to_usize()]
164    }
165
166    fn start(&self) -> &State<S> {
167        self.state(self.start_id)
168    }
169
170    fn start_mut(&mut self) -> &mut State<S> {
171        let id = self.start_id;
172        self.state_mut(id)
173    }
174
175    fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> {
176        IterTransitionsMut::new(self, id)
177    }
178
179    fn copy_matches(&mut self, src: S, dst: S) {
180        let (src, dst) =
181            get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182        dst.matches.extend_from_slice(&src.matches);
183    }
184
185    fn copy_empty_matches(&mut self, dst: S) {
186        let start_id = self.start_id;
187        self.copy_matches(start_id, dst);
188    }
189
190    fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191        let trans = Transitions::Dense(Dense::new());
192        let id = usize_to_state_id(self.states.len())?;
193        self.states.push(State {
194            trans,
195            // Anchored automatons do not have any failure transitions.
196            fail: if self.anchored { dead_id() } else { self.start_id },
197            depth: depth,
198            matches: vec![],
199        });
200        Ok(id)
201    }
202
203    fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204        let trans = Transitions::Sparse(vec![]);
205        let id = usize_to_state_id(self.states.len())?;
206        self.states.push(State {
207            trans,
208            // Anchored automatons do not have any failure transitions.
209            fail: if self.anchored { dead_id() } else { self.start_id },
210            depth: depth,
211            matches: vec![],
212        });
213        Ok(id)
214    }
215}
216
217impl<S: StateID> Automaton for NFA<S> {
218    type ID = S;
219
220    fn match_kind(&self) -> &MatchKind {
221        &self.match_kind
222    }
223
224    fn anchored(&self) -> bool {
225        self.anchored
226    }
227
228    fn prefilter(&self) -> Option<&dyn Prefilter> {
229        self.prefilter.as_ref().map(|p| p.as_ref())
230    }
231
232    fn start_state(&self) -> S {
233        self.start_id
234    }
235
236    fn is_valid(&self, id: S) -> bool {
237        id.to_usize() < self.states.len()
238    }
239
240    fn is_match_state(&self, id: S) -> bool {
241        self.states[id.to_usize()].is_match()
242    }
243
244    fn get_match(
245        &self,
246        id: S,
247        match_index: usize,
248        end: usize,
249    ) -> Option<Match> {
250        let state = match self.states.get(id.to_usize()) {
251            None => return None,
252            Some(state) => state,
253        };
254        state.matches.get(match_index).map(|&(id, len)| Match {
255            pattern: id,
256            len,
257            end,
258        })
259    }
260
261    fn match_count(&self, id: S) -> usize {
262        self.states[id.to_usize()].matches.len()
263    }
264
265    unsafe fn next_state_unchecked(&self, mut current: S, input: u8) -> S {
266        // This terminates since:
267        //
268        // 1. `State.fail` never points to fail_id().
269        // 2. All `State.fail` values point to a state closer to `start`.
270        // 3. The start state has no transitions to fail_id().
271        loop {
272            let state = self.states.get_unchecked(current.to_usize());
273            let next = state.next_state(input);
274            if next != fail_id() {
275                return next;
276            }
277            current = state.fail;
278        }
279    }
280}
281
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state, keyed by input byte.
    trans: Transitions<S>,
    /// The state to fall back to when no transition exists for an input byte.
    fail: S,
    /// The patterns (id and length) that match upon reaching this state.
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    /// The distance of this state from the start state.
    depth: usize,
}
301
302impl<S: StateID> State<S> {
303    fn heap_bytes(&self) -> usize {
304        self.trans.heap_bytes()
305            + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
306    }
307
308    fn add_match(&mut self, i: PatternID, len: PatternLength) {
309        self.matches.push((i, len));
310    }
311
312    fn is_match(&self) -> bool {
313        !self.matches.is_empty()
314    }
315
316    fn get_longest_match_len(&self) -> Option<usize> {
317        // Why is this true? Because the first match in any matching state
318        // will always correspond to the match added to it during trie
319        // construction (since when we copy matches due to failure transitions,
320        // we always append them). Therefore, it follows that the first match
321        // must always be longest since any subsequent match must be from a
322        // failure transition, and a failure transition by construction points
323        // to a proper suffix. A proper suffix is, by definition, smaller.
324        self.matches.get(0).map(|&(_, len)| len)
325    }
326
327    fn next_state(&self, input: u8) -> S {
328        self.trans.next_state(input)
329    }
330
331    fn set_next_state(&mut self, input: u8, next: S) {
332        self.trans.set_next_state(input, next);
333    }
334}
335
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate unchecked index access. Namely,
/// since a dense representation always contains 256 elements, all values of
/// `u8` are valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);

impl<S> Dense<S>
where
    S: StateID,
{
    /// Creates a new dense transition table with all 256 entries set to
    /// `fail_id()`.
    fn new() -> Self {
        Dense(vec![fail_id(); 256])
    }

    /// Returns the number of entries, which is always 256.
    #[inline]
    fn len(&self) -> usize {
        self.0.len()
    }
}
357
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // SAFETY: This is safe because all dense transitions have
        // exactly 256 elements, so all u8 values are valid indices.
        // (`Dense::new` is the only constructor visible here and it always
        // allocates 256 entries.)
        unsafe { self.0.get_unchecked(i as usize) }
    }
}

impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // SAFETY: This is safe because all dense transitions have
        // exactly 256 elements, so all u8 values are valid indices.
        unsafe { self.0.get_unchecked_mut(i as usize) }
    }
}
377
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// A list of (byte, next state id) pairs, kept sorted by byte
    /// (see `set_next_state`).
    Sparse(Vec<(u8, S)>),
    /// A 256-entry table indexed directly by the input byte.
    Dense(Dense<S>),
}
394
impl<S: StateID> Transitions<S> {
    /// Returns the number of heap bytes used by this transition
    /// representation.
    fn heap_bytes(&self) -> usize {
        match *self {
            Transitions::Sparse(ref sparse) => {
                sparse.len() * size_of::<(u8, S)>()
            }
            Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
        }
    }

    /// Returns the next state for the given input byte, or `fail_id()` if
    /// there is no transition for that byte.
    fn next_state(&self, input: u8) -> S {
        match *self {
            Transitions::Sparse(ref sparse) => {
                // A linear scan, even though `sparse` is kept sorted by
                // `set_next_state`. NOTE(review): presumably linear search
                // beats binary search for the short lists used here —
                // benchmark before changing.
                for &(b, id) in sparse {
                    if b == input {
                        return id;
                    }
                }
                fail_id()
            }
            Transitions::Dense(ref dense) => dense[input],
        }
    }

    /// Sets the transition for the given input byte, overwriting any
    /// existing transition. The sparse representation is kept sorted by
    /// byte.
    fn set_next_state(&mut self, input: u8, next: S) {
        match *self {
            Transitions::Sparse(ref mut sparse) => {
                match sparse.binary_search_by_key(&input, |&(b, _)| b) {
                    Ok(i) => sparse[i] = (input, next),
                    Err(i) => sparse.insert(i, (input, next)),
                }
            }
            Transitions::Dense(ref mut dense) => {
                dense[input] = next;
            }
        }
    }

    /// Iterate over transitions in this state while skipping over transitions
    /// to `fail_id()`.
    fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
        match *self {
            Transitions::Sparse(ref sparse) => {
                // NOTE(review): no fail_id() filter here — presumably sparse
                // entries never map to fail_id() (the dense branch filters
                // explicitly). Confirm.
                for &(b, id) in sparse {
                    f(b, id);
                }
            }
            Transitions::Dense(ref dense) => {
                for b in AllBytesIter::new() {
                    let id = dense[b];
                    if id != fail_id() {
                        f(b, id);
                    }
                }
            }
        }
    }

    /// Iterate over all transitions in this state according to the given
    /// equivalence classes, including transitions to `fail_id()`.
    ///
    /// (`sparse_iter` is a module-level helper defined elsewhere in this
    /// file.)
    fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
        if classes.is_singleton() {
            match *self {
                Transitions::Sparse(ref sparse) => {
                    sparse_iter(sparse, f);
                }
                Transitions::Dense(ref dense) => {
                    for b in AllBytesIter::new() {
                        f(b, dense[b]);
                    }
                }
            }
        } else {
            // In this case, we only want to yield a single byte for each
            // equivalence class.
            match *self {
                Transitions::Sparse(ref sparse) => {
                    let mut last_class = None;
                    sparse_iter(sparse, |b, next| {
                        let class = classes.get(b);
                        if last_class != Some(class) {
                            last_class = Some(class);
                            f(b, next);
                        }
                    })
                }
                Transitions::Dense(ref dense) => {
                    for b in classes.representatives() {
                        f(b, dense[b]);
                    }
                }
            }
        }
    }
}
490
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID + 'a> {
    /// The NFA being iterated over (mutable access via `nfa()`).
    nfa: &'a mut NFA<S>,
    /// The id of the state whose transitions are yielded.
    state_id: S,
    /// Current position: an index into the sparse list, or the current byte
    /// value for the dense representation.
    cur: usize,
}

impl<'a, S: StateID> IterTransitionsMut<'a, S> {
    /// Creates a new transition iterator for the given state.
    fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
        IterTransitionsMut { nfa, state_id, cur: 0 }
    }

    /// Returns a mutable borrow of the underlying NFA.
    fn nfa(&mut self) -> &mut NFA<S> {
        self.nfa
    }
}
515
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    /// Yields the next non-fail transition as a (byte, next state id) pair.
    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `cur` always fits in a u8 here.
                    debug_assert!(self.cur < 256);

                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    // Skip transitions to the fail state.
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
546
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The depth cutoff for using a dense transition representation.
    /// NOTE(review): exact boundary handling (inclusive vs exclusive) lives
    /// in the compiler — confirm there.
    dense_depth: usize,
    /// The match semantics (standard, leftmost-first or leftmost-longest).
    match_kind: MatchKind,
    /// Whether to build a prefilter for skipping to candidate matches.
    prefilter: bool,
    /// Whether to anchor all matches to the start of input.
    anchored: bool,
    /// Whether to match patterns ASCII-case-insensitively.
    ascii_case_insensitive: bool,
}
556
557impl Default for Builder {
558    fn default() -> Builder {
559        Builder {
560            dense_depth: 2,
561            match_kind: MatchKind::default(),
562            prefilter: true,
563            anchored: false,
564            ascii_case_insensitive: false,
565        }
566    }
567}
568
impl Builder {
    /// Creates a new builder with a default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Compiles the given patterns into an NFA using this configuration.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Sets the match semantics (standard, leftmost-first or
    /// leftmost-longest) for the NFA.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Sets the depth cutoff below which states use a dense transition
    /// representation.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enables or disables prefilter construction.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enables or disables anchored matching.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enables or disables ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
607
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The builder configuration driving this compilation.
    builder: &'a Builder,
    /// An in-progress prefilter builder, fed each pattern as it is added.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// Byte equivalence classes accumulated while visiting pattern bytes.
    byte_classes: ByteClassBuilder,
}
618
619impl<'a, S: StateID> Compiler<'a, S> {
620    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
621        Ok(Compiler {
622            builder: builder,
623            prefilter: prefilter::Builder::new(builder.match_kind)
624                .ascii_case_insensitive(builder.ascii_case_insensitive),
625            nfa: NFA {
626                match_kind: builder.match_kind,
627                start_id: usize_to_state_id(2)?,
628                max_pattern_len: 0,
629                pattern_count: 0,
630                heap_bytes: 0,
631                prefilter: None,
632                anchored: builder.anchored,
633                byte_classes: ByteClasses::singletons(),
634                states: vec![],
635            },
636            byte_classes: ByteClassBuilder::new(),
637        })
638    }
639
    /// Compiles the given patterns into the final NFA.
    ///
    /// This drives overall construction: seed the sentinel states, build the
    /// prefix trie, wire up the start/dead state loops, add failure
    /// transitions (unanchored automatons only), and finish ancillary data
    /// (byte classes, prefilter, heap usage).
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        if !self.builder.anchored {
            if self.match_kind().is_leftmost() {
                self.fill_failure_transitions_leftmost();
            } else {
                self.fill_failure_transitions_standard();
            }
        }
        // NOTE(review): the start state loop appears deliberately closed
        // only after failure transitions are computed — confirm the ordering
        // constraint in `close_start_state_loop`.
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
666
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
            let pat = pat.as_ref();
            self.nfa.max_pattern_len =
                cmp::max(self.nfa.max_pattern_len, pat.len());
            self.nfa.pattern_count += 1;

            let mut prev = self.nfa.start_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.state(prev).is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byte_classes.set_range(b, b);

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.state(prev).next_state(b);
                if next != fail_id() {
                    prev = next;
                } else {
                    let next = self.add_state(depth + 1)?;
                    self.nfa.state_mut(prev).set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        // NOTE(review): if a transition for the opposite-case
                        // byte already exists (e.g., added by an earlier
                        // pattern), this overwrites it — confirm that's
                        // intended.
                        self.nfa.state_mut(prev).set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.state_mut(prev).add_match(pati, pat.len());
            // ... and hand it to the prefilter builder, if applicable.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }
        }
        Ok(())
    }
736
737    /// This routine creates failure transitions according to the standard
738    /// textbook formulation of the Aho-Corasick algorithm.
739    ///
740    /// Building failure transitions is the most interesting part of building
741    /// the Aho-Corasick automaton, because they are what allow searches to
742    /// be performed in linear time. Specifically, a failure transition is
743    /// a single transition associated with each state that points back to
744    /// the longest proper suffix of the pattern being searched. The failure
745    /// transition is followed whenever there exists no transition on the
746    /// current state for the current input byte. If there is no other proper
747    /// suffix, then the failure transition points back to the starting state.
748    ///
749    /// For example, let's say we built an Aho-Corasick automaton with the
750    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
751    ///
752    /// ```ignore
753    ///          a - S1 - b - S2 - c - S3 - d - S4*
754    ///         /
755    ///     S0 - c - S5 - e - S6 - f - S7*
756    /// ```
757    ///
758    /// At this point, it should be fairly straight-forward to see how this
759    /// trie can be used in a simplistic way. At any given position in the
760    /// text we're searching (called the "subject" string), all we need to do
761    /// is follow the transitions in the trie by consuming one transition for
762    /// each byte in the subject string. If we reach a match state, then we can
763    /// report that location as a match.
764    ///
765    /// The trick comes when searching a subject string like 'abcef'. We'll
766    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
768    /// S3 has no transition for 'e', so the search fails. We then would need
769    /// to restart the search at the next position in 'abcef', which
770    /// corresponds to 'b'. The match would fail, but the next search starting
771    /// at 'c' would finally succeed. The problem with this approach is that
772    /// we wind up searching the subject string potentially many times. In
773    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
774    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
775    /// like to achieve a `O(n + m)` worst case complexity.
776    ///
777    /// This is where failure transitions come in. Instead of dying at S3 in
778    /// the first search, the automaton can instruct the search to move to
779    /// another part of the automaton that corresponds to a suffix of what
780    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
781    /// and the automaton does indeed have a non-empty suffix, 'c', that could
782    /// potentially lead to another match. Thus, the actual Aho-Corasick
783    /// automaton for our patterns in this case looks like this:
784    ///
785    /// ```ignore
786    ///          a - S1 - b - S2 - c - S3 - d - S4*
787    ///         /                      /
788    ///        /       ----------------
789    ///       /       /
790    ///     S0 - c - S5 - e - S6 - f - S7*
791    /// ```
792    ///
793    /// That is, we have a failure transition from S3 to S5, which is followed
794    /// exactly in cases when we are in state S3 but see any byte other than
795    /// 'd' (that is, we've "failed" to find a match in this portion of our
796    /// trie). We know we can transition back to S5 because we've already seen
797    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
798    /// with the search starting at S5 and complete our match.
799    ///
800    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
802    /// need to follow. For example, look at the trie for the patterns
803    /// 'abcd', 'b', 'bcd' and 'cd':
804    ///
805    /// ```ignore
806    ///        - a - S1 - b - S2 - c - S3 - d - S4*
807    ///       /
808    ///     S0 - b - S5* - c - S6 - d - S7*
809    ///       \
810    ///        - c - S8 - d - S9*
811    /// ```
812    ///
813    /// The failure transitions for this trie are defined from S2 to S5,
814    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
815    /// corresponds to a match, since its failure transition to S5 is itself
816    /// a match state.
817    ///
    /// Perhaps the simplest way to think about adding these failure transitions
819    /// is recursively. That is, if you know the failure transitions for every
820    /// possible previous state that could be visited (e.g., when computing the
821    /// failure transition for S3, you already know the failure transitions
822    /// for S0, S1 and S2), then you can simply follow the failure transition
823    /// of the previous state and check whether the incoming transition is
824    /// defined after following the failure transition.
825    ///
826    /// For example, when determining the failure state for S3, by our
827    /// assumptions, we already know that there is a failure transition from
828    /// S2 (the previous state) to S5. So we follow that transition and check
829    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
830    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
831    /// transition existed, we could keep following the failure transitions
832    /// until we reach the start state, which is the failure transition for
833    /// every state that has no corresponding proper suffix.
834    ///
835    /// We don't actually use recursion to implement this, but instead, use a
836    /// breadth first search of the automaton. Our base case is the start
837    /// state, whose failure transition is just a transition to itself.
    fn fill_failure_transitions_standard(&mut self) {
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        for b in AllBytesIter::new() {
            let next = self.nfa.start().next_state(b);
            if next != self.nfa.start_id {
                if !seen.contains(next) {
                    queue.push_back(next);
                    seen.insert(next);
                }
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if !seen.contains(next) {
                    queue.push_back(next);
                    seen.insert(next);
                }

                // Walk the failure chain of the current state until we find
                // a state that has a transition defined on `b`. The start
                // state has a transition defined for every byte (via its
                // self-loop), so this walk always terminates.
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                // The failure state corresponds to a proper suffix of the
                // path to `next`, so `next` must also report the failure
                // state's matches (needed for overlapping match semantics).
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            it.nfa().copy_empty_matches(id);
        }
    }
882
883    /// This routine is just like fill_failure_transitions_standard, except
884    /// it adds failure transitions in a way that preserves leftmost match
885    /// semantics (for both leftmost-first and leftmost-longest).
886    ///
887    /// The algorithms are so similar that it would be possible to write it
888    /// generically. But doing so without overhead would require a bit of
889    /// ceremony, so we just copy it and add in the extra leftmost logic.
890    /// Moreover, the standard algorithm above is so simple that it feels like
891    /// a crime to disturb it.
892    ///
893    /// In effect, this proceeds just like the standard approach, but we
894    /// specifically add only a subset of all failure transitions. Namely, we
895    /// only add failure transitions that either do not occur after a match
896    /// or failure transitions that do occur after a match but preserve the
897    /// match. The comments in the implementation below should help.
898    ///
899    /// N.B. The only differences in the automaton between leftmost-first and
900    /// leftmost-longest are in trie construction. Otherwise, both have exactly
901    /// the same set of failure transitions. leftmost-longest adds everything
    /// to the trie, whereas leftmost-first skips any patterns for which there
903    /// exists a prefix of it that was added earlier.
904    ///
905    /// N.B. I came up with this algorithm on my own, and after scouring all of
906    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
907    /// I couldn't find any that implement leftmost semantics like this.
908    /// Perl of course needs leftmost-first semantics, but they implement it
909    /// with a seeming hack at *search* time instead of encoding it into the
910    /// automaton. There are also a couple Java libraries that support leftmost
911    /// longest semantics, but they do it by building a queue of matches at
912    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions_leftmost(&mut self) {
        /// Represents an item in our queue of states to process.
        ///
        /// Fundamentally, this queue serves the same purpose as the queue
        /// for filling failure transitions using the standard formulation.
        /// In the leftmost case, though, we need to track a bit more
        /// information. See comments below.
        #[derive(Clone, Copy, Debug)]
        struct QueuedState<S> {
            /// The id of the state to visit.
            id: S,
            /// The depth at which the first match was observed in the path
            /// to this state. Note that this corresponds to the depth at
            /// which the beginning of the match was detected. If no match
            /// has been seen, then this is None.
            match_at_depth: Option<usize>,
        }

        impl<S: StateID> QueuedState<S> {
            /// Create a queued state corresponding to the given NFA's start
            /// state.
            fn start(nfa: &NFA<S>) -> QueuedState<S> {
                let match_at_depth =
                    if nfa.start().is_match() { Some(0) } else { None };
                QueuedState { id: nfa.start_id, match_at_depth }
            }

            /// Return the next state to queue up. The given id must be a state
            /// corresponding to a single transition from this queued state.
            fn next_queued_state(
                &self,
                nfa: &NFA<S>,
                id: S,
            ) -> QueuedState<S> {
                let match_at_depth = self.next_match_at_depth(nfa, id);
                QueuedState { id, match_at_depth }
            }

            /// Return the earliest depth at which a match has occurred for
            /// the given state. The given state must correspond to a single
            /// transition from this queued state.
            fn next_match_at_depth(
                &self,
                nfa: &NFA<S>,
                next: S,
            ) -> Option<usize> {
                // This is a little tricky. If the previous state has already
                // seen a match or if `next` isn't a match state, then nothing
                // needs to change since a later state cannot find an earlier
                // match.
                match self.match_at_depth {
                    Some(x) => return Some(x),
                    None if nfa.state(next).is_match() => {}
                    None => return None,
                }
                // `next` is the first match state on this path: the match
                // begins `longest_match_len - 1` positions before `next`'s
                // depth.
                let depth = nfa.state(next).depth
                    - nfa.state(next).get_longest_match_len().unwrap()
                    + 1;
                Some(depth)
            }
        }

        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
        let mut seen = self.queued_set();
        let start = QueuedState::start(&self.nfa);
        for b in AllBytesIter::new() {
            let next_id = self.nfa.start().next_state(b);
            if next_id != start.id {
                let next = start.next_queued_state(&self.nfa, next_id);
                if !seen.contains(next.id) {
                    queue.push_back(next);
                    seen.insert(next.id);
                }
                // If a state immediately following the start state is a match
                // state, then we never want to follow its failure transition
                // since the failure transition necessarily leads back to the
                // start state, which we never want to do for leftmost matching
                // after a match has been found.
                //
                // N.B. This is a special case of the more general handling
                // found below.
                if self.nfa.state(next_id).is_match() {
                    self.nfa.state_mut(next_id).fail = dead_id();
                }
            }
        }
        while let Some(item) = queue.pop_front() {
            // Tracks whether this state has any outgoing transitions at all;
            // transitionless match states get a dead failure transition below.
            let mut any_trans = false;
            let mut it = self.nfa.iter_transitions_mut(item.id);
            while let Some((b, next_id)) = it.next() {
                any_trans = true;

                // Queue up the next state.
                let next = item.next_queued_state(it.nfa(), next_id);
                if !seen.contains(next.id) {
                    queue.push_back(next);
                    seen.insert(next.id);
                }

                // Find the failure state for next. Same as standard.
                let mut fail = it.nfa().state(item.id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);

                // This is the key difference from the standard formulation.
                // Namely, if we've seen a match, then we only want a failure
                // transition if the failure transition preserves the match
                // we've seen. In general, this is not true of all failure
                // transitions since they can point back to any suffix of what
                // we've seen so far. Instead, we only want to point back to
                // suffixes that contain any match we've seen.
                //
                // We achieve this by comparing the depth of the failure
                // transition with the number of states between this state
                // and the beginning of the earliest match detected. If the
                // depth of the failure state is smaller than this difference,
                // then it cannot contain the match. If it's bigger or equal
                // to the difference, then it necessarily includes the match
                // we've seen since all failure transitions correspond to a
                // suffix.
                //
                // If we've determined that we don't want the failure
                // transition, then we set this state's failure transition to
                // the dead state. In other words, when a search hits this
                // state, it will not continue and correctly stop. (N.B. A
                // dead state is different than a fail state. A dead state
                // MUST be preceded by a match and acts as a sentinel to search
                // routines to terminate.)
                //
                // Understanding this is tricky, and it took me several days
                // to think through this and get it right. If you want to grok
                // it, then I'd recommend: 1) switch the implementation to
                // always use the standard algorithm for filling in failure
                // transitions, 2) run the test suite and 3) examine the test
                // failures. Write out the automatons for them and try to work
                // backwards by figuring out which failure transitions should
                // be removed. You should arrive at the same rule used below.
                if let Some(match_depth) = next.match_at_depth {
                    let fail_depth = it.nfa().state(fail).depth;
                    let next_depth = it.nfa().state(next.id).depth;
                    if next_depth - match_depth + 1 > fail_depth {
                        it.nfa().state_mut(next.id).fail = dead_id();
                        continue;
                    }
                    assert_ne!(
                        start.id,
                        it.nfa().state(next.id).fail,
                        "states that are match states or follow match \
                         states should never have a failure transition \
                         back to the start state in leftmost searching",
                    );
                }
                it.nfa().state_mut(next.id).fail = fail;
                it.nfa().copy_matches(fail, next.id);
            }
            // If there are no transitions for this state and if it's a match
            // state, then we must set its failure transition to the dead
            // state since we never want it to restart the search.
            if !any_trans && it.nfa().state(item.id).is_match() {
                it.nfa().state_mut(item.id).fail = dead_id();
            }
            // We don't need to copy empty matches from the start state here
            // because that's only necessary for overlapping matches and
            // leftmost match kinds don't support overlapping matches.
        }
    }
1085
    /// Returns a set that tracks queued states.
1087    ///
1088    /// This is only necessary when ASCII case insensitivity is enabled, since
1089    /// it is the only way to visit the same state twice. Otherwise, this
    /// returns an inert set that never adds anything and always reports
1091    /// `false` for every member test.
1092    fn queued_set(&self) -> QueuedSet<S> {
1093        if self.builder.ascii_case_insensitive {
1094            QueuedSet::active()
1095        } else {
1096            QueuedSet::inert()
1097        }
1098    }
1099
1100    /// Set the failure transitions on the start state to loop back to the
1101    /// start state. This effectively permits the Aho-Corasick automaton to
1102    /// match at any position. This is also required for finding the next
1103    /// state to terminate, namely, finding the next state should never return
1104    /// a fail_id.
1105    ///
1106    /// This must be done after building the initial trie, since trie
1107    /// construction depends on transitions to `fail_id` to determine whether a
1108    /// state already exists or not.
1109    fn add_start_state_loop(&mut self) {
1110        let start_id = self.nfa.start_id;
1111        let start = self.nfa.start_mut();
1112        for b in AllBytesIter::new() {
1113            if start.next_state(b) == fail_id() {
1114                start.set_next_state(b, start_id);
1115            }
1116        }
1117    }
1118
1119    /// Remove the start state loop by rewriting any transitions on the start
1120    /// state back to the start state with transitions to the dead state.
1121    ///
1122    /// The loop is only closed when two conditions are met: the start state
1123    /// is a match state and the match kind is leftmost-first or
1124    /// leftmost-longest. (Alternatively, if this is an anchored automaton,
1125    /// then the start state is always closed, regardless of aforementioned
1126    /// conditions.)
1127    ///
1128    /// The reason for this is that under leftmost semantics, a start state
1129    /// that is also a match implies that we should never restart the search
1130    /// process. We allow normal transitions out of the start state, but if
1131    /// none exist, we transition to the dead state, which signals that
1132    /// searching should stop.
1133    fn close_start_state_loop(&mut self) {
1134        if self.builder.anchored
1135            || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
1136        {
1137            let start_id = self.nfa.start_id;
1138            let start = self.nfa.start_mut();
1139            for b in AllBytesIter::new() {
1140                if start.next_state(b) == start_id {
1141                    start.set_next_state(b, dead_id());
1142                }
1143            }
1144        }
1145    }
1146
1147    /// Sets all transitions on the dead state to point back to the dead state.
1148    /// Normally, missing transitions map back to the failure state, but the
1149    /// point of the dead state is to act as a sink that can never be escaped.
1150    fn add_dead_state_loop(&mut self) {
1151        let dead = self.nfa.state_mut(dead_id());
1152        for b in AllBytesIter::new() {
1153            dead.set_next_state(b, dead_id());
1154        }
1155    }
1156
1157    /// Computes the total amount of heap used by this NFA in bytes.
1158    fn calculate_size(&mut self) {
1159        let mut size = 0;
1160        for state in &self.nfa.states {
1161            size += state.heap_bytes();
1162        }
1163        self.nfa.heap_bytes = size;
1164    }
1165
1166    /// Add a new state to the underlying NFA with the given depth. The depth
1167    /// is used to determine how to represent the transitions.
1168    ///
1169    /// If adding the new state would overflow the chosen state ID
1170    /// representation, then this returns an error.
1171    fn add_state(&mut self, depth: usize) -> Result<S> {
1172        if depth < self.builder.dense_depth {
1173            self.nfa.add_dense_state(depth)
1174        } else {
1175            self.nfa.add_sparse_state(depth)
1176        }
1177    }
1178
1179    /// Returns the match kind configured on the underlying builder.
    fn match_kind(&self) -> MatchKind {
        // Plain accessor; the builder is the single source of truth for the
        // configured match semantics.
        self.builder.match_kind
    }
1183}
1184
1185/// A set of state identifiers used to avoid revisiting the same state multiple
1186/// times when filling in failure transitions.
1187///
1188/// This set has an "inert" and an "active" mode. When inert, the set never
1189/// stores anything and always returns `false` for every member test. This is
1190/// useful to avoid the performance and memory overhead of maintaining this
1191/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet<S> {
    /// `None` means the set is inert: inserts are no-ops and membership
    /// tests always report `false`. `Some` tracks membership for real.
    set: Option<BTreeSet<S>>,
}
1196
1197impl<S: StateID> QueuedSet<S> {
1198    /// Return an inert set that returns `false` for every state ID membership
1199    /// test.
1200    fn inert() -> QueuedSet<S> {
1201        QueuedSet { set: None }
1202    }
1203
1204    /// Return an active set that tracks state ID membership.
1205    fn active() -> QueuedSet<S> {
1206        QueuedSet { set: Some(BTreeSet::new()) }
1207    }
1208
1209    /// Inserts the given state ID into this set. (If the set is inert, then
1210    /// this is a no-op.)
1211    fn insert(&mut self, state_id: S) {
1212        if let Some(ref mut set) = self.set {
1213            set.insert(state_id);
1214        }
1215    }
1216
1217    /// Returns true if and only if the given state ID is in this set. If the
1218    /// set is inert, this always returns false.
1219    fn contains(&self, state_id: S) -> bool {
1220        match self.set {
1221            None => false,
1222            Some(ref set) => set.contains(&state_id),
1223        }
1224    }
1225}
1226
1227/// An iterator over every byte value.
1228///
1229/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
1230/// better in debug builds.
1231///
1232/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
1233/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
1234/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    /// Create an iterator that yields every `u8` value exactly once, in
    /// ascending order.
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The counter is a u16 so that the value 256 can represent the
        // exhausted state.
        if self.0 > 255 {
            return None;
        }
        let byte = self.0 as u8;
        self.0 += 1;
        Some(byte)
    }
}
1257
impl<S: StateID> fmt::Debug for NFA<S> {
    // Multi-line dump of the automaton: for each state, its transitions,
    // the patterns it matches, its failure transition and its depth.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "NFA(")?;
        writeln!(f, "match_kind: {:?}", self.match_kind)?;
        writeln!(f, "{}", "-".repeat(79))?;
        for (id, s) in self.states.iter().enumerate() {
            let mut trans = vec![];
            s.trans.iter(|byte, next| {
                // The start state has a bunch of uninteresting transitions
                // back into itself. It's questionable to hide them since they
                // are critical to understanding the automaton, but they are
                // very noisy without better formatting for contiguous ranges
                // to the same state.
                if id == self.start_id.to_usize() && next == self.start_id {
                    return;
                }
                // Similarly, the dead state has a bunch of uninteresting
                // transitions too.
                if id == dead_id() {
                    return;
                }
                trans.push(format!("{} => {}", escape(byte), next.to_usize()));
            });
            writeln!(f, "{:04}: {}", id, trans.join(", "))?;

            // Only pattern IDs are shown; pattern lengths are elided for
            // brevity.
            let matches: Vec<String> = s
                .matches
                .iter()
                .map(|&(pattern_id, _)| pattern_id.to_string())
                .collect();
            writeln!(f, "  matches: {}", matches.join(", "))?;
            writeln!(f, "     fail: {}", s.fail.to_usize())?;
            writeln!(f, "    depth: {}", s.depth)?;
        }
        writeln!(f, "{}", "-".repeat(79))?;
        writeln!(f, ")")?;
        Ok(())
    }
}
1297
1298/// Iterate over all possible byte transitions given a sparse set.
1299fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1300    let mut byte = 0u16;
1301    for &(b, id) in trans {
1302        while byte < (b as u16) {
1303            f(byte as u8, fail_id());
1304            byte += 1;
1305        }
1306        f(b, id);
1307        byte += 1;
1308    }
1309    for b in byte..256 {
1310        f(b as u8, fail_id());
1311    }
1312}
1313
1314/// Safely return two mutable borrows to two different locations in the given
1315/// slice.
1316///
1317/// This panics if i == j.
/// Safely return two mutable borrows to two different locations in the given
/// slice.
///
/// The returned pair is ordered as `(&mut xs[i], &mut xs[j])`.
///
/// # Panics
///
/// Panics if `i == j`.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    // Split at the larger index so both positions land in disjoint halves.
    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
    let (head, tail) = xs.split_at_mut(hi);
    if i < j {
        (&mut head[lo], &mut tail[0])
    } else {
        (&mut tail[0], &mut head[lo])
    }
}
1328
1329/// Return the given byte as its escaped string form.
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    // escape_default only ever yields ASCII bytes, so casting each byte to
    // a char is lossless.
    ascii::escape_default(b).map(|byte| byte as char).collect()
}
1335
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): this is a development scratch test. It only prints the
    // automaton for manual inspection and asserts nothing beyond a
    // successful build. Consider removing it (along with the commented-out
    // alternate inputs) or marking it #[ignore] before release.
    #[test]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg", "bcde", "bcdef"])
            .unwrap();
        println!("{:?}", nfa);
    }
}