aho_corasick/
automaton.rs

1use ahocorasick::MatchKind;
2use prefilter::{self, Candidate, Prefilter, PrefilterState};
3use state_id::{dead_id, fail_id, StateID};
4use Match;
5
6// NOTE: This trait essentially started as a copy of the same trait from from
7// regex-automata, with some wording changed since we use this trait for
8// NFAs in addition to DFAs in this crate. Additionally, we do not export
9// this trait. It's only used internally to reduce code duplication. The
10// regex-automata crate needs to expose it because its Regex type is generic
11// over implementations of this trait. In this crate, we encapsulate everything
12// behind the AhoCorasick type.
13//
14// This trait is a bit of a mess, but it's not quite clear how to fix it.
15// Basically, there are several competing concerns:
16//
17// * We need performance, so everything effectively needs to get monomorphized.
18// * There are several variations on searching Aho-Corasick automatons:
19//   overlapping, standard and leftmost. Overlapping and standard are somewhat
20//   combined together below, but there is no real way to combine standard with
21//   leftmost. Namely, leftmost requires continuing a search even after a match
22//   is found, in order to correctly disambiguate a match.
23// * On top of that, *sometimes* callers want to know which state the automaton
24//   is in after searching. This is principally useful for overlapping and
25//   stream searches. However, when callers don't care about this, we really
26//   do not want to be forced to compute it, since it sometimes requires extra
27//   work. Thus, there are effectively two copies of leftmost searching: one
28//   for tracking the state ID and one that doesn't. We should ideally do the
29//   same for standard searching, but my sanity stopped me.
30
31/// A trait describing the interface of an Aho-Corasick finite state machine.
32///
33/// Every automaton has exactly one fail state, one dead state and exactly one
34/// start state. Generally, these correspond to the first, second and third
35/// states, respectively. The failure state is always treated as a sentinel.
36/// That is, no correct Aho-Corasick automaton will ever transition into the
37/// fail state. The dead state, however, can be transitioned into, but only
38/// when leftmost-first or leftmost-longest match semantics are enabled and
39/// only when at least one match has been observed.
40///
41/// Every automaton also has one or more match states, such that
42/// `Automaton::is_match_state_unchecked(id)` returns `true` if and only if
43/// `id` corresponds to a match state.
44pub trait Automaton {
45    /// The representation used for state identifiers in this automaton.
46    ///
47    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
48    type ID: StateID;
49
50    /// The type of matching that should be done.
51    fn match_kind(&self) -> &MatchKind;
52
53    /// Returns true if and only if this automaton uses anchored searches.
54    fn anchored(&self) -> bool;
55
56    /// An optional prefilter for quickly skipping to the next candidate match.
57    /// A prefilter must report at least every match, although it may report
58    /// positions that do not correspond to a match. That is, it must not allow
59    /// false negatives, but can allow false positives.
60    ///
61    /// Currently, a prefilter only runs when the automaton is in the start
62    /// state. That is, the position reported by a prefilter should always
63    /// correspond to the start of a potential match.
64    fn prefilter(&self) -> Option<&dyn Prefilter>;
65
66    /// Return the identifier of this automaton's start state.
67    fn start_state(&self) -> Self::ID;
68
69    /// Returns true if and only if the given state identifier refers to a
70    /// valid state.
71    fn is_valid(&self, id: Self::ID) -> bool;
72
73    /// Returns true if and only if the given identifier corresponds to a match
74    /// state.
75    ///
76    /// The state ID given must be valid, or else implementors may panic.
77    fn is_match_state(&self, id: Self::ID) -> bool;
78
79    /// Returns true if and only if the given identifier corresponds to a state
80    /// that is either the dead state or a match state.
81    ///
82    /// Depending on the implementation of the automaton, this routine can
83    /// be used to save a branch in the core matching loop. Nevertheless,
84    /// `is_match_state(id) || id == dead_id()` is always a valid
85    /// implementation. Indeed, this is the default implementation.
86    ///
87    /// The state ID given must be valid, or else implementors may panic.
88    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
89        id == dead_id() || self.is_match_state(id)
90    }
91
92    /// If the given state is a match state, return the match corresponding
93    /// to the given match index. `end` must be the ending position of the
94    /// detected match. If no match exists or if `match_index` exceeds the
95    /// number of matches in this state, then `None` is returned.
96    ///
97    /// The state ID given must be valid, or else implementors may panic.
98    ///
99    /// If the given state ID is correct and if the `match_index` is less than
100    /// the number of matches for that state, then this is guaranteed to return
101    /// a match.
102    fn get_match(
103        &self,
104        id: Self::ID,
105        match_index: usize,
106        end: usize,
107    ) -> Option<Match>;
108
109    /// Returns the number of matches for the given state. If the given state
110    /// is not a match state, then this returns 0.
111    ///
112    /// The state ID given must be valid, or else implementors must panic.
113    fn match_count(&self, id: Self::ID) -> usize;
114
115    /// Given the current state that this automaton is in and the next input
116    /// byte, this method returns the identifier of the next state. The
117    /// identifier returned must always be valid and may never correspond to
118    /// the fail state. The returned identifier may, however, point to the
119    /// dead state.
120    ///
121    /// This is not safe so that implementors may look up the next state
122    /// without memory safety checks such as bounds checks. As such, callers
123    /// must ensure that the given identifier corresponds to a valid automaton
124    /// state. Implementors must, in turn, ensure that this routine is safe for
125    /// all valid state identifiers and for all possible `u8` values.
126    unsafe fn next_state_unchecked(
127        &self,
128        current: Self::ID,
129        input: u8,
130    ) -> Self::ID;
131
132    /// Like next_state_unchecked, but debug_asserts that the underlying
133    /// implementation never returns a `fail_id()` for the next state.
134    unsafe fn next_state_unchecked_no_fail(
135        &self,
136        current: Self::ID,
137        input: u8,
138    ) -> Self::ID {
139        let next = self.next_state_unchecked(current, input);
140        // We should never see a transition to the failure state.
141        debug_assert!(
142            next != fail_id(),
143            "automaton should never return fail_id for next state"
144        );
145        next
146    }
147
148    /// Execute a search using standard match semantics.
149    ///
150    /// This can be used even when the automaton was constructed with leftmost
151    /// match semantics when you want to find the earliest possible match. This
152    /// can also be used as part of an overlapping search implementation.
153    ///
154    /// N.B. This does not report a match if `state_id` is given as a matching
155    /// state. As such, this should not be used directly.
156    #[inline(always)]
157    fn standard_find_at(
158        &self,
159        prestate: &mut PrefilterState,
160        haystack: &[u8],
161        at: usize,
162        state_id: &mut Self::ID,
163    ) -> Option<Match> {
164        if let Some(pre) = self.prefilter() {
165            self.standard_find_at_imp(
166                prestate,
167                Some(pre),
168                haystack,
169                at,
170                state_id,
171            )
172        } else {
173            self.standard_find_at_imp(prestate, None, haystack, at, state_id)
174        }
175    }
176
177    // It's important for this to always be inlined. Namely, it's only caller
178    // is standard_find_at, and the inlining should remove the case analysis
179    // for prefilter scanning when there is no prefilter available.
180    #[inline(always)]
181    fn standard_find_at_imp(
182        &self,
183        prestate: &mut PrefilterState,
184        prefilter: Option<&dyn Prefilter>,
185        haystack: &[u8],
186        at: usize,
187        state_id: &mut Self::ID,
188    ) -> Option<Match> {
189        // This is necessary for guaranteeing a safe API, since we use the
190        // state ID below in a function that exhibits UB if called with an
191        // invalid state ID.
192        assert!(
193            self.is_valid(*state_id),
194            "{} is not a valid state ID",
195            state_id.to_usize()
196        );
197        unsafe {
198            let start = haystack.as_ptr();
199            let end = haystack[haystack.len()..].as_ptr();
200            let mut ptr = haystack[at..].as_ptr();
201            while ptr < end {
202                if let Some(pre) = prefilter {
203                    let at = ptr as usize - start as usize;
204                    if prestate.is_effective(at)
205                        && *state_id == self.start_state()
206                    {
207                        let c = prefilter::next(prestate, pre, haystack, at)
208                            .into_option();
209                        match c {
210                            None => return None,
211                            Some(i) => {
212                                ptr = start.offset(i as isize);
213                            }
214                        }
215                    }
216                }
217                // SAFETY: next_state is safe for all possible u8 values,
218                // so the only thing we're concerned about is the validity
219                // of `state_id`. `state_id` either comes from the caller
220                // (in which case, we assert above that it is valid), or it
221                // comes from the return value of next_state, which is also
222                // guaranteed to be valid.
223                *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
224                ptr = ptr.offset(1);
225                // This routine always quits immediately after seeing a
226                // match, and since dead states can only come after seeing
227                // a match, seeing a dead state here is impossible. (Unless
228                // we have an anchored automaton, in which case, dead states
229                // are used to stop a search.)
230                debug_assert!(
231                    *state_id != dead_id() || self.anchored(),
232                    "standard find should never see a dead state"
233                );
234
235                if self.is_match_or_dead_state(*state_id) {
236                    return if *state_id == dead_id() {
237                        None
238                    } else {
239                        let end = ptr as usize - start as usize;
240                        self.get_match(*state_id, 0, end)
241                    };
242                }
243            }
244            None
245        }
246    }
247
248    /// Execute a search using leftmost (either first or longest) match
249    /// semantics.
250    ///
251    /// The principle difference between searching with standard semantics and
252    /// searching with leftmost semantics is that leftmost searching will
253    /// continue searching even after a match has been found. Once a match
254    /// is found, the search does not stop until either the haystack has been
255    /// exhausted or a dead state is observed in the automaton. (Dead states
256    /// only exist in automatons constructed with leftmost semantics.) That is,
257    /// we rely on the construction of the automaton to tell us when to quit.
258    #[inline(never)]
259    fn leftmost_find_at(
260        &self,
261        prestate: &mut PrefilterState,
262        haystack: &[u8],
263        at: usize,
264        state_id: &mut Self::ID,
265    ) -> Option<Match> {
266        if let Some(pre) = self.prefilter() {
267            self.leftmost_find_at_imp(
268                prestate,
269                Some(pre),
270                haystack,
271                at,
272                state_id,
273            )
274        } else {
275            self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
276        }
277    }
278
279    // It's important for this to always be inlined. Namely, it's only caller
280    // is leftmost_find_at, and the inlining should remove the case analysis
281    // for prefilter scanning when there is no prefilter available.
282    #[inline(always)]
283    fn leftmost_find_at_imp(
284        &self,
285        prestate: &mut PrefilterState,
286        prefilter: Option<&dyn Prefilter>,
287        haystack: &[u8],
288        at: usize,
289        state_id: &mut Self::ID,
290    ) -> Option<Match> {
291        debug_assert!(self.match_kind().is_leftmost());
292        // This is necessary for guaranteeing a safe API, since we use the
293        // state ID below in a function that exhibits UB if called with an
294        // invalid state ID.
295        assert!(
296            self.is_valid(*state_id),
297            "{} is not a valid state ID",
298            state_id.to_usize()
299        );
300        if self.anchored() && at > 0 && *state_id == self.start_state() {
301            return None;
302        }
303        unsafe {
304            let start = haystack.as_ptr();
305            let end = haystack[haystack.len()..].as_ptr();
306            let mut ptr = haystack[at..].as_ptr();
307
308            let mut last_match = self.get_match(*state_id, 0, at);
309            while ptr < end {
310                if let Some(pre) = prefilter {
311                    let at = ptr as usize - start as usize;
312                    if prestate.is_effective(at)
313                        && *state_id == self.start_state()
314                    {
315                        let c = prefilter::next(prestate, pre, haystack, at)
316                            .into_option();
317                        match c {
318                            None => return None,
319                            Some(i) => {
320                                ptr = start.offset(i as isize);
321                            }
322                        }
323                    }
324                }
325                // SAFETY: next_state is safe for all possible u8 values,
326                // so the only thing we're concerned about is the validity
327                // of `state_id`. `state_id` either comes from the caller
328                // (in which case, we assert above that it is valid), or it
329                // comes from the return value of next_state, which is also
330                // guaranteed to be valid.
331                *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
332                ptr = ptr.offset(1);
333                if self.is_match_or_dead_state(*state_id) {
334                    if *state_id == dead_id() {
335                        // The only way to enter into a dead state is if a
336                        // match has been found, so we assert as much. This
337                        // is different from normal automata, where you might
338                        // enter a dead state if you know a subsequent match
339                        // will never be found (regardless of whether a match
340                        // has already been found). For Aho-Corasick, it is
341                        // built so that we can match at any position, so the
342                        // possibility of a match always exists.
343                        //
344                        // (Unless we have an anchored automaton, in which
345                        // case, dead states are used to stop a search.)
346                        debug_assert!(
347                            last_match.is_some() || self.anchored(),
348                            "failure state should only be seen after match"
349                        );
350                        return last_match;
351                    }
352                    let end = ptr as usize - start as usize;
353                    last_match = self.get_match(*state_id, 0, end);
354                }
355            }
356            last_match
357        }
358    }
359
360    /// This is like leftmost_find_at, but does not need to track a caller
361    /// provided state id. In other words, the only output of this routine is a
362    /// match, if one exists.
363    ///
364    /// It is regrettable that we need to effectively copy a chunk of
365    /// implementation twice, but when we don't need to track the state ID, we
366    /// can allow the prefilter to report matches immediately without having
367    /// to re-confirm them with the automaton. The re-confirmation step is
368    /// necessary in leftmost_find_at because tracing through the automaton is
369    /// the only way to correctly set the state ID. (Perhaps an alternative
370    /// would be to keep a map from pattern ID to matching state ID, but that
371    /// complicates the code and still doesn't permit us to defer to the
372    /// prefilter entirely when possible.)
373    ///
374    /// I did try a few things to avoid the code duplication here, but nothing
375    /// optimized as well as this approach. (In microbenchmarks, there was
376    /// about a 25% difference.)
377    #[inline(never)]
378    fn leftmost_find_at_no_state(
379        &self,
380        prestate: &mut PrefilterState,
381        haystack: &[u8],
382        at: usize,
383    ) -> Option<Match> {
384        if let Some(pre) = self.prefilter() {
385            self.leftmost_find_at_no_state_imp(
386                prestate,
387                Some(pre),
388                haystack,
389                at,
390            )
391        } else {
392            self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
393        }
394    }
395
396    // It's important for this to always be inlined. Namely, it's only caller
397    // is leftmost_find_at_no_state, and the inlining should remove the case
398    // analysis for prefilter scanning when there is no prefilter available.
399    #[inline(always)]
400    fn leftmost_find_at_no_state_imp(
401        &self,
402        prestate: &mut PrefilterState,
403        prefilter: Option<&dyn Prefilter>,
404        haystack: &[u8],
405        at: usize,
406    ) -> Option<Match> {
407        debug_assert!(self.match_kind().is_leftmost());
408        if self.anchored() && at > 0 {
409            return None;
410        }
411        // If our prefilter handles confirmation of matches 100% of the
412        // time, and since we don't need to track state IDs, we can avoid
413        // Aho-Corasick completely.
414        if let Some(pre) = prefilter {
415            // We should never have a prefilter during an anchored search.
416            debug_assert!(!self.anchored());
417            if !pre.reports_false_positives() {
418                return match pre.next_candidate(prestate, haystack, at) {
419                    Candidate::None => None,
420                    Candidate::Match(m) => Some(m),
421                    Candidate::PossibleStartOfMatch(_) => unreachable!(),
422                };
423            }
424        }
425        let mut state_id = self.start_state();
426        unsafe {
427            let start = haystack.as_ptr();
428            let end = haystack[haystack.len()..].as_ptr();
429            let mut ptr = haystack[at..].as_ptr();
430
431            let mut last_match = self.get_match(state_id, 0, at);
432            while ptr < end {
433                if let Some(pre) = prefilter {
434                    let at = ptr as usize - start as usize;
435                    if prestate.is_effective(at)
436                        && state_id == self.start_state()
437                    {
438                        match prefilter::next(prestate, pre, haystack, at) {
439                            Candidate::None => return None,
440                            // Since we aren't tracking a state ID, we can
441                            // quit early once we know we have a match.
442                            Candidate::Match(m) => return Some(m),
443                            Candidate::PossibleStartOfMatch(i) => {
444                                ptr = start.offset(i as isize);
445                            }
446                        }
447                    }
448                }
449                // SAFETY: next_state is safe for all possible u8 values,
450                // so the only thing we're concerned about is the validity
451                // of `state_id`. `state_id` either comes from the caller
452                // (in which case, we assert above that it is valid), or it
453                // comes from the return value of next_state, which is also
454                // guaranteed to be valid.
455                state_id = self.next_state_unchecked_no_fail(state_id, *ptr);
456                ptr = ptr.offset(1);
457                if self.is_match_or_dead_state(state_id) {
458                    if state_id == dead_id() {
459                        // The only way to enter into a dead state is if a
460                        // match has been found, so we assert as much. This
461                        // is different from normal automata, where you might
462                        // enter a dead state if you know a subsequent match
463                        // will never be found (regardless of whether a match
464                        // has already been found). For Aho-Corasick, it is
465                        // built so that we can match at any position, so the
466                        // possibility of a match always exists.
467                        //
468                        // (Unless we have an anchored automaton, in which
469                        // case, dead states are used to stop a search.)
470                        debug_assert!(
471                            last_match.is_some() || self.anchored(),
472                            "failure state should only be seen after match"
473                        );
474                        return last_match;
475                    }
476                    let end = ptr as usize - start as usize;
477                    last_match = self.get_match(state_id, 0, end);
478                }
479            }
480            last_match
481        }
482    }
483
484    /// Execute an overlapping search.
485    ///
486    /// When executing an overlapping match, the previous state ID in addition
487    /// to the previous match index should be given. If there are more matches
488    /// at the given state, then the match is reported and the given index is
489    /// incremented.
490    #[inline(always)]
491    fn overlapping_find_at(
492        &self,
493        prestate: &mut PrefilterState,
494        haystack: &[u8],
495        at: usize,
496        state_id: &mut Self::ID,
497        match_index: &mut usize,
498    ) -> Option<Match> {
499        if self.anchored() && at > 0 && *state_id == self.start_state() {
500            return None;
501        }
502
503        let match_count = self.match_count(*state_id);
504        if *match_index < match_count {
505            // This is guaranteed to return a match since
506            // match_index < match_count.
507            let result = self.get_match(*state_id, *match_index, at);
508            debug_assert!(result.is_some(), "must be a match");
509            *match_index += 1;
510            return result;
511        }
512
513        *match_index = 0;
514        match self.standard_find_at(prestate, haystack, at, state_id) {
515            None => None,
516            Some(m) => {
517                *match_index = 1;
518                Some(m)
519            }
520        }
521    }
522
523    /// Return the earliest match found. This returns as soon as we know that
524    /// we have a match. As such, this does not necessarily correspond to the
525    /// leftmost starting match, but rather, the leftmost position at which a
526    /// match ends.
527    #[inline(always)]
528    fn earliest_find_at(
529        &self,
530        prestate: &mut PrefilterState,
531        haystack: &[u8],
532        at: usize,
533        state_id: &mut Self::ID,
534    ) -> Option<Match> {
535        if *state_id == self.start_state() {
536            if self.anchored() && at > 0 {
537                return None;
538            }
539            if let Some(m) = self.get_match(*state_id, 0, at) {
540                return Some(m);
541            }
542        }
543        self.standard_find_at(prestate, haystack, at, state_id)
544    }
545
546    /// A convenience function for finding the next match according to the
547    /// match semantics of this automaton. For standard match semantics, this
548    /// finds the earliest match. Otherwise, the leftmost match is found.
549    #[inline(always)]
550    fn find_at(
551        &self,
552        prestate: &mut PrefilterState,
553        haystack: &[u8],
554        at: usize,
555        state_id: &mut Self::ID,
556    ) -> Option<Match> {
557        match *self.match_kind() {
558            MatchKind::Standard => {
559                self.earliest_find_at(prestate, haystack, at, state_id)
560            }
561            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
562                self.leftmost_find_at(prestate, haystack, at, state_id)
563            }
564            MatchKind::__Nonexhaustive => unreachable!(),
565        }
566    }
567
568    /// Like find_at, but does not track state identifiers. This permits some
569    /// optimizations when a prefilter that confirms its own matches is
570    /// present.
571    #[inline(always)]
572    fn find_at_no_state(
573        &self,
574        prestate: &mut PrefilterState,
575        haystack: &[u8],
576        at: usize,
577    ) -> Option<Match> {
578        match *self.match_kind() {
579            MatchKind::Standard => {
580                let mut state = self.start_state();
581                self.earliest_find_at(prestate, haystack, at, &mut state)
582            }
583            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
584                self.leftmost_find_at_no_state(prestate, haystack, at)
585            }
586            MatchKind::__Nonexhaustive => unreachable!(),
587        }
588    }
589}