aho_corasick/automaton.rs
1use ahocorasick::MatchKind;
2use prefilter::{self, Candidate, Prefilter, PrefilterState};
3use state_id::{dead_id, fail_id, StateID};
4use Match;
5
6// NOTE: This trait essentially started as a copy of the same trait from from
7// regex-automata, with some wording changed since we use this trait for
8// NFAs in addition to DFAs in this crate. Additionally, we do not export
9// this trait. It's only used internally to reduce code duplication. The
10// regex-automata crate needs to expose it because its Regex type is generic
11// over implementations of this trait. In this crate, we encapsulate everything
12// behind the AhoCorasick type.
13//
14// This trait is a bit of a mess, but it's not quite clear how to fix it.
15// Basically, there are several competing concerns:
16//
17// * We need performance, so everything effectively needs to get monomorphized.
18// * There are several variations on searching Aho-Corasick automatons:
19// overlapping, standard and leftmost. Overlapping and standard are somewhat
20// combined together below, but there is no real way to combine standard with
21// leftmost. Namely, leftmost requires continuing a search even after a match
22// is found, in order to correctly disambiguate a match.
23// * On top of that, *sometimes* callers want to know which state the automaton
24// is in after searching. This is principally useful for overlapping and
25// stream searches. However, when callers don't care about this, we really
26// do not want to be forced to compute it, since it sometimes requires extra
27// work. Thus, there are effectively two copies of leftmost searching: one
28// for tracking the state ID and one that doesn't. We should ideally do the
29// same for standard searching, but my sanity stopped me.
30
31/// A trait describing the interface of an Aho-Corasick finite state machine.
32///
33/// Every automaton has exactly one fail state, one dead state and exactly one
34/// start state. Generally, these correspond to the first, second and third
35/// states, respectively. The failure state is always treated as a sentinel.
36/// That is, no correct Aho-Corasick automaton will ever transition into the
37/// fail state. The dead state, however, can be transitioned into, but only
38/// when leftmost-first or leftmost-longest match semantics are enabled and
39/// only when at least one match has been observed.
40///
41/// Every automaton also has one or more match states, such that
42/// `Automaton::is_match_state_unchecked(id)` returns `true` if and only if
43/// `id` corresponds to a match state.
44pub trait Automaton {
45 /// The representation used for state identifiers in this automaton.
46 ///
47 /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
48 type ID: StateID;
49
50 /// The type of matching that should be done.
51 fn match_kind(&self) -> &MatchKind;
52
53 /// Returns true if and only if this automaton uses anchored searches.
54 fn anchored(&self) -> bool;
55
56 /// An optional prefilter for quickly skipping to the next candidate match.
57 /// A prefilter must report at least every match, although it may report
58 /// positions that do not correspond to a match. That is, it must not allow
59 /// false negatives, but can allow false positives.
60 ///
61 /// Currently, a prefilter only runs when the automaton is in the start
62 /// state. That is, the position reported by a prefilter should always
63 /// correspond to the start of a potential match.
64 fn prefilter(&self) -> Option<&dyn Prefilter>;
65
66 /// Return the identifier of this automaton's start state.
67 fn start_state(&self) -> Self::ID;
68
69 /// Returns true if and only if the given state identifier refers to a
70 /// valid state.
71 fn is_valid(&self, id: Self::ID) -> bool;
72
73 /// Returns true if and only if the given identifier corresponds to a match
74 /// state.
75 ///
76 /// The state ID given must be valid, or else implementors may panic.
77 fn is_match_state(&self, id: Self::ID) -> bool;
78
79 /// Returns true if and only if the given identifier corresponds to a state
80 /// that is either the dead state or a match state.
81 ///
82 /// Depending on the implementation of the automaton, this routine can
83 /// be used to save a branch in the core matching loop. Nevertheless,
84 /// `is_match_state(id) || id == dead_id()` is always a valid
85 /// implementation. Indeed, this is the default implementation.
86 ///
87 /// The state ID given must be valid, or else implementors may panic.
88 fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
89 id == dead_id() || self.is_match_state(id)
90 }
91
92 /// If the given state is a match state, return the match corresponding
93 /// to the given match index. `end` must be the ending position of the
94 /// detected match. If no match exists or if `match_index` exceeds the
95 /// number of matches in this state, then `None` is returned.
96 ///
97 /// The state ID given must be valid, or else implementors may panic.
98 ///
99 /// If the given state ID is correct and if the `match_index` is less than
100 /// the number of matches for that state, then this is guaranteed to return
101 /// a match.
102 fn get_match(
103 &self,
104 id: Self::ID,
105 match_index: usize,
106 end: usize,
107 ) -> Option<Match>;
108
109 /// Returns the number of matches for the given state. If the given state
110 /// is not a match state, then this returns 0.
111 ///
112 /// The state ID given must be valid, or else implementors must panic.
113 fn match_count(&self, id: Self::ID) -> usize;
114
115 /// Given the current state that this automaton is in and the next input
116 /// byte, this method returns the identifier of the next state. The
117 /// identifier returned must always be valid and may never correspond to
118 /// the fail state. The returned identifier may, however, point to the
119 /// dead state.
120 ///
121 /// This is not safe so that implementors may look up the next state
122 /// without memory safety checks such as bounds checks. As such, callers
123 /// must ensure that the given identifier corresponds to a valid automaton
124 /// state. Implementors must, in turn, ensure that this routine is safe for
125 /// all valid state identifiers and for all possible `u8` values.
126 unsafe fn next_state_unchecked(
127 &self,
128 current: Self::ID,
129 input: u8,
130 ) -> Self::ID;
131
132 /// Like next_state_unchecked, but debug_asserts that the underlying
133 /// implementation never returns a `fail_id()` for the next state.
134 unsafe fn next_state_unchecked_no_fail(
135 &self,
136 current: Self::ID,
137 input: u8,
138 ) -> Self::ID {
139 let next = self.next_state_unchecked(current, input);
140 // We should never see a transition to the failure state.
141 debug_assert!(
142 next != fail_id(),
143 "automaton should never return fail_id for next state"
144 );
145 next
146 }
147
148 /// Execute a search using standard match semantics.
149 ///
150 /// This can be used even when the automaton was constructed with leftmost
151 /// match semantics when you want to find the earliest possible match. This
152 /// can also be used as part of an overlapping search implementation.
153 ///
154 /// N.B. This does not report a match if `state_id` is given as a matching
155 /// state. As such, this should not be used directly.
156 #[inline(always)]
157 fn standard_find_at(
158 &self,
159 prestate: &mut PrefilterState,
160 haystack: &[u8],
161 at: usize,
162 state_id: &mut Self::ID,
163 ) -> Option<Match> {
164 if let Some(pre) = self.prefilter() {
165 self.standard_find_at_imp(
166 prestate,
167 Some(pre),
168 haystack,
169 at,
170 state_id,
171 )
172 } else {
173 self.standard_find_at_imp(prestate, None, haystack, at, state_id)
174 }
175 }
176
177 // It's important for this to always be inlined. Namely, it's only caller
178 // is standard_find_at, and the inlining should remove the case analysis
179 // for prefilter scanning when there is no prefilter available.
180 #[inline(always)]
181 fn standard_find_at_imp(
182 &self,
183 prestate: &mut PrefilterState,
184 prefilter: Option<&dyn Prefilter>,
185 haystack: &[u8],
186 at: usize,
187 state_id: &mut Self::ID,
188 ) -> Option<Match> {
189 // This is necessary for guaranteeing a safe API, since we use the
190 // state ID below in a function that exhibits UB if called with an
191 // invalid state ID.
192 assert!(
193 self.is_valid(*state_id),
194 "{} is not a valid state ID",
195 state_id.to_usize()
196 );
197 unsafe {
198 let start = haystack.as_ptr();
199 let end = haystack[haystack.len()..].as_ptr();
200 let mut ptr = haystack[at..].as_ptr();
201 while ptr < end {
202 if let Some(pre) = prefilter {
203 let at = ptr as usize - start as usize;
204 if prestate.is_effective(at)
205 && *state_id == self.start_state()
206 {
207 let c = prefilter::next(prestate, pre, haystack, at)
208 .into_option();
209 match c {
210 None => return None,
211 Some(i) => {
212 ptr = start.offset(i as isize);
213 }
214 }
215 }
216 }
217 // SAFETY: next_state is safe for all possible u8 values,
218 // so the only thing we're concerned about is the validity
219 // of `state_id`. `state_id` either comes from the caller
220 // (in which case, we assert above that it is valid), or it
221 // comes from the return value of next_state, which is also
222 // guaranteed to be valid.
223 *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
224 ptr = ptr.offset(1);
225 // This routine always quits immediately after seeing a
226 // match, and since dead states can only come after seeing
227 // a match, seeing a dead state here is impossible. (Unless
228 // we have an anchored automaton, in which case, dead states
229 // are used to stop a search.)
230 debug_assert!(
231 *state_id != dead_id() || self.anchored(),
232 "standard find should never see a dead state"
233 );
234
235 if self.is_match_or_dead_state(*state_id) {
236 return if *state_id == dead_id() {
237 None
238 } else {
239 let end = ptr as usize - start as usize;
240 self.get_match(*state_id, 0, end)
241 };
242 }
243 }
244 None
245 }
246 }
247
248 /// Execute a search using leftmost (either first or longest) match
249 /// semantics.
250 ///
251 /// The principle difference between searching with standard semantics and
252 /// searching with leftmost semantics is that leftmost searching will
253 /// continue searching even after a match has been found. Once a match
254 /// is found, the search does not stop until either the haystack has been
255 /// exhausted or a dead state is observed in the automaton. (Dead states
256 /// only exist in automatons constructed with leftmost semantics.) That is,
257 /// we rely on the construction of the automaton to tell us when to quit.
258 #[inline(never)]
259 fn leftmost_find_at(
260 &self,
261 prestate: &mut PrefilterState,
262 haystack: &[u8],
263 at: usize,
264 state_id: &mut Self::ID,
265 ) -> Option<Match> {
266 if let Some(pre) = self.prefilter() {
267 self.leftmost_find_at_imp(
268 prestate,
269 Some(pre),
270 haystack,
271 at,
272 state_id,
273 )
274 } else {
275 self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
276 }
277 }
278
279 // It's important for this to always be inlined. Namely, it's only caller
280 // is leftmost_find_at, and the inlining should remove the case analysis
281 // for prefilter scanning when there is no prefilter available.
282 #[inline(always)]
283 fn leftmost_find_at_imp(
284 &self,
285 prestate: &mut PrefilterState,
286 prefilter: Option<&dyn Prefilter>,
287 haystack: &[u8],
288 at: usize,
289 state_id: &mut Self::ID,
290 ) -> Option<Match> {
291 debug_assert!(self.match_kind().is_leftmost());
292 // This is necessary for guaranteeing a safe API, since we use the
293 // state ID below in a function that exhibits UB if called with an
294 // invalid state ID.
295 assert!(
296 self.is_valid(*state_id),
297 "{} is not a valid state ID",
298 state_id.to_usize()
299 );
300 if self.anchored() && at > 0 && *state_id == self.start_state() {
301 return None;
302 }
303 unsafe {
304 let start = haystack.as_ptr();
305 let end = haystack[haystack.len()..].as_ptr();
306 let mut ptr = haystack[at..].as_ptr();
307
308 let mut last_match = self.get_match(*state_id, 0, at);
309 while ptr < end {
310 if let Some(pre) = prefilter {
311 let at = ptr as usize - start as usize;
312 if prestate.is_effective(at)
313 && *state_id == self.start_state()
314 {
315 let c = prefilter::next(prestate, pre, haystack, at)
316 .into_option();
317 match c {
318 None => return None,
319 Some(i) => {
320 ptr = start.offset(i as isize);
321 }
322 }
323 }
324 }
325 // SAFETY: next_state is safe for all possible u8 values,
326 // so the only thing we're concerned about is the validity
327 // of `state_id`. `state_id` either comes from the caller
328 // (in which case, we assert above that it is valid), or it
329 // comes from the return value of next_state, which is also
330 // guaranteed to be valid.
331 *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
332 ptr = ptr.offset(1);
333 if self.is_match_or_dead_state(*state_id) {
334 if *state_id == dead_id() {
335 // The only way to enter into a dead state is if a
336 // match has been found, so we assert as much. This
337 // is different from normal automata, where you might
338 // enter a dead state if you know a subsequent match
339 // will never be found (regardless of whether a match
340 // has already been found). For Aho-Corasick, it is
341 // built so that we can match at any position, so the
342 // possibility of a match always exists.
343 //
344 // (Unless we have an anchored automaton, in which
345 // case, dead states are used to stop a search.)
346 debug_assert!(
347 last_match.is_some() || self.anchored(),
348 "failure state should only be seen after match"
349 );
350 return last_match;
351 }
352 let end = ptr as usize - start as usize;
353 last_match = self.get_match(*state_id, 0, end);
354 }
355 }
356 last_match
357 }
358 }
359
360 /// This is like leftmost_find_at, but does not need to track a caller
361 /// provided state id. In other words, the only output of this routine is a
362 /// match, if one exists.
363 ///
364 /// It is regrettable that we need to effectively copy a chunk of
365 /// implementation twice, but when we don't need to track the state ID, we
366 /// can allow the prefilter to report matches immediately without having
367 /// to re-confirm them with the automaton. The re-confirmation step is
368 /// necessary in leftmost_find_at because tracing through the automaton is
369 /// the only way to correctly set the state ID. (Perhaps an alternative
370 /// would be to keep a map from pattern ID to matching state ID, but that
371 /// complicates the code and still doesn't permit us to defer to the
372 /// prefilter entirely when possible.)
373 ///
374 /// I did try a few things to avoid the code duplication here, but nothing
375 /// optimized as well as this approach. (In microbenchmarks, there was
376 /// about a 25% difference.)
377 #[inline(never)]
378 fn leftmost_find_at_no_state(
379 &self,
380 prestate: &mut PrefilterState,
381 haystack: &[u8],
382 at: usize,
383 ) -> Option<Match> {
384 if let Some(pre) = self.prefilter() {
385 self.leftmost_find_at_no_state_imp(
386 prestate,
387 Some(pre),
388 haystack,
389 at,
390 )
391 } else {
392 self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
393 }
394 }
395
396 // It's important for this to always be inlined. Namely, it's only caller
397 // is leftmost_find_at_no_state, and the inlining should remove the case
398 // analysis for prefilter scanning when there is no prefilter available.
399 #[inline(always)]
400 fn leftmost_find_at_no_state_imp(
401 &self,
402 prestate: &mut PrefilterState,
403 prefilter: Option<&dyn Prefilter>,
404 haystack: &[u8],
405 at: usize,
406 ) -> Option<Match> {
407 debug_assert!(self.match_kind().is_leftmost());
408 if self.anchored() && at > 0 {
409 return None;
410 }
411 // If our prefilter handles confirmation of matches 100% of the
412 // time, and since we don't need to track state IDs, we can avoid
413 // Aho-Corasick completely.
414 if let Some(pre) = prefilter {
415 // We should never have a prefilter during an anchored search.
416 debug_assert!(!self.anchored());
417 if !pre.reports_false_positives() {
418 return match pre.next_candidate(prestate, haystack, at) {
419 Candidate::None => None,
420 Candidate::Match(m) => Some(m),
421 Candidate::PossibleStartOfMatch(_) => unreachable!(),
422 };
423 }
424 }
425 let mut state_id = self.start_state();
426 unsafe {
427 let start = haystack.as_ptr();
428 let end = haystack[haystack.len()..].as_ptr();
429 let mut ptr = haystack[at..].as_ptr();
430
431 let mut last_match = self.get_match(state_id, 0, at);
432 while ptr < end {
433 if let Some(pre) = prefilter {
434 let at = ptr as usize - start as usize;
435 if prestate.is_effective(at)
436 && state_id == self.start_state()
437 {
438 match prefilter::next(prestate, pre, haystack, at) {
439 Candidate::None => return None,
440 // Since we aren't tracking a state ID, we can
441 // quit early once we know we have a match.
442 Candidate::Match(m) => return Some(m),
443 Candidate::PossibleStartOfMatch(i) => {
444 ptr = start.offset(i as isize);
445 }
446 }
447 }
448 }
449 // SAFETY: next_state is safe for all possible u8 values,
450 // so the only thing we're concerned about is the validity
451 // of `state_id`. `state_id` either comes from the caller
452 // (in which case, we assert above that it is valid), or it
453 // comes from the return value of next_state, which is also
454 // guaranteed to be valid.
455 state_id = self.next_state_unchecked_no_fail(state_id, *ptr);
456 ptr = ptr.offset(1);
457 if self.is_match_or_dead_state(state_id) {
458 if state_id == dead_id() {
459 // The only way to enter into a dead state is if a
460 // match has been found, so we assert as much. This
461 // is different from normal automata, where you might
462 // enter a dead state if you know a subsequent match
463 // will never be found (regardless of whether a match
464 // has already been found). For Aho-Corasick, it is
465 // built so that we can match at any position, so the
466 // possibility of a match always exists.
467 //
468 // (Unless we have an anchored automaton, in which
469 // case, dead states are used to stop a search.)
470 debug_assert!(
471 last_match.is_some() || self.anchored(),
472 "failure state should only be seen after match"
473 );
474 return last_match;
475 }
476 let end = ptr as usize - start as usize;
477 last_match = self.get_match(state_id, 0, end);
478 }
479 }
480 last_match
481 }
482 }
483
484 /// Execute an overlapping search.
485 ///
486 /// When executing an overlapping match, the previous state ID in addition
487 /// to the previous match index should be given. If there are more matches
488 /// at the given state, then the match is reported and the given index is
489 /// incremented.
490 #[inline(always)]
491 fn overlapping_find_at(
492 &self,
493 prestate: &mut PrefilterState,
494 haystack: &[u8],
495 at: usize,
496 state_id: &mut Self::ID,
497 match_index: &mut usize,
498 ) -> Option<Match> {
499 if self.anchored() && at > 0 && *state_id == self.start_state() {
500 return None;
501 }
502
503 let match_count = self.match_count(*state_id);
504 if *match_index < match_count {
505 // This is guaranteed to return a match since
506 // match_index < match_count.
507 let result = self.get_match(*state_id, *match_index, at);
508 debug_assert!(result.is_some(), "must be a match");
509 *match_index += 1;
510 return result;
511 }
512
513 *match_index = 0;
514 match self.standard_find_at(prestate, haystack, at, state_id) {
515 None => None,
516 Some(m) => {
517 *match_index = 1;
518 Some(m)
519 }
520 }
521 }
522
523 /// Return the earliest match found. This returns as soon as we know that
524 /// we have a match. As such, this does not necessarily correspond to the
525 /// leftmost starting match, but rather, the leftmost position at which a
526 /// match ends.
527 #[inline(always)]
528 fn earliest_find_at(
529 &self,
530 prestate: &mut PrefilterState,
531 haystack: &[u8],
532 at: usize,
533 state_id: &mut Self::ID,
534 ) -> Option<Match> {
535 if *state_id == self.start_state() {
536 if self.anchored() && at > 0 {
537 return None;
538 }
539 if let Some(m) = self.get_match(*state_id, 0, at) {
540 return Some(m);
541 }
542 }
543 self.standard_find_at(prestate, haystack, at, state_id)
544 }
545
546 /// A convenience function for finding the next match according to the
547 /// match semantics of this automaton. For standard match semantics, this
548 /// finds the earliest match. Otherwise, the leftmost match is found.
549 #[inline(always)]
550 fn find_at(
551 &self,
552 prestate: &mut PrefilterState,
553 haystack: &[u8],
554 at: usize,
555 state_id: &mut Self::ID,
556 ) -> Option<Match> {
557 match *self.match_kind() {
558 MatchKind::Standard => {
559 self.earliest_find_at(prestate, haystack, at, state_id)
560 }
561 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
562 self.leftmost_find_at(prestate, haystack, at, state_id)
563 }
564 MatchKind::__Nonexhaustive => unreachable!(),
565 }
566 }
567
568 /// Like find_at, but does not track state identifiers. This permits some
569 /// optimizations when a prefilter that confirms its own matches is
570 /// present.
571 #[inline(always)]
572 fn find_at_no_state(
573 &self,
574 prestate: &mut PrefilterState,
575 haystack: &[u8],
576 at: usize,
577 ) -> Option<Match> {
578 match *self.match_kind() {
579 MatchKind::Standard => {
580 let mut state = self.start_state();
581 self.earliest_find_at(prestate, haystack, at, &mut state)
582 }
583 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
584 self.leftmost_find_at_no_state(prestate, haystack, at)
585 }
586 MatchKind::__Nonexhaustive => unreachable!(),
587 }
588 }
589}