aho_corasick/prefilter.rs
use std::cmp;
use std::fmt;
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::u8;

use memchr::{memchr, memchr2, memchr3};

use ahocorasick::MatchKind;
use packed;
use Match;

/// A candidate is the result of running a prefilter on a haystack at a
/// particular position. The result is either no match, a confirmed match or
/// a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
    None,
    Match(Match),
    PossibleStartOfMatch(usize),
}

impl Candidate {
    /// Convert this candidate into an option. This is useful when callers
    /// do not distinguish between true positives and false positives (i.e.,
    /// the caller must always confirm the match in order to update some other
    /// state).
    pub fn into_option(self) -> Option<usize> {
        match self {
            Candidate::None => None,
            Candidate::Match(ref m) => Some(m.start()),
            Candidate::PossibleStartOfMatch(start) => Some(start),
        }
    }
}
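
// An illustrative sketch added for exposition (not part of the original
// source): `into_option` erases the distinction between a confirmed match
// and a possible match, collapsing both to a starting offset.
#[cfg(test)]
#[test]
fn candidate_into_option_sketch() {
    assert_eq!(Candidate::None.into_option(), None);
    assert_eq!(Candidate::PossibleStartOfMatch(5).into_option(), Some(5));
}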

/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
pub trait Prefilter:
    Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
{
    /// Returns the next possible match candidate. This may yield false
    /// positives, so callers must confirm a match starting at the position
    /// returned. This, however, must never produce false negatives. That is,
    /// this must, at minimum, return the starting position of the next match
    /// in the given haystack after or at the given position.
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate;

    /// A method for cloning a prefilter, to work around the fact that Clone
    /// is not object-safe.
    fn clone_prefilter(&self) -> Box<dyn Prefilter>;

    /// Returns the approximate total amount of heap used by this prefilter, in
    /// units of bytes.
    fn heap_bytes(&self) -> usize;

    /// Returns true if and only if this prefilter may return false
    /// positives. This is useful for completely avoiding the automaton
    /// when the prefilter can quickly confirm its own matches.
    ///
    /// By default, this returns true, which is conservative; it is always
    /// correct to return `true`. Returning `false` here and reporting a false
    /// positive will result in incorrect searches.
    fn reports_false_positives(&self) -> bool {
        true
    }
}

impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
    #[inline]
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        (**self).next_candidate(state, haystack, at)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        (**self).clone_prefilter()
    }

    fn heap_bytes(&self) -> usize {
        (**self).heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        (**self).reports_false_positives()
    }
}

/// A convenience object for representing any type that implements Prefilter
/// and is cloneable.
#[derive(Debug)]
pub struct PrefilterObj(Box<dyn Prefilter>);

impl Clone for PrefilterObj {
    fn clone(&self) -> Self {
        PrefilterObj(self.0.clone_prefilter())
    }
}

impl PrefilterObj {
    /// Create a new prefilter object.
    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
        PrefilterObj(Box::new(t))
    }

    /// Return the underlying prefilter trait object.
    pub fn as_ref(&self) -> &dyn Prefilter {
        &*self.0
    }
}
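
// An illustrative sketch added for exposition (not part of the original
// source): a concrete prefilter can be type-erased behind `PrefilterObj` and
// then driven through the trait object returned by `as_ref`.
#[cfg(test)]
#[test]
fn prefilter_obj_sketch() {
    let obj = PrefilterObj::new(StartBytesOne { byte1: b'z' });
    let mut state = PrefilterState::new(1);
    // `z` first occurs at offset 2, so that is the candidate reported.
    match obj.as_ref().next_candidate(&mut state, b"aaza", 0) {
        Candidate::PossibleStartOfMatch(i) => assert_eq!(i, 2),
        cand => panic!("unexpected candidate: {:?}", cand),
    }
}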

/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator via, e.g., `find_iter`, is treated as a single search.)
#[derive(Clone, Debug)]
pub struct PrefilterState {
    /// The number of skips that have been executed.
    skips: usize,
    /// The total number of bytes that have been skipped.
    skipped: usize,
    /// The maximum length of a match. This is used to help determine how many
    /// bytes on average should be skipped in order for a prefilter to be
    /// effective.
    max_match_len: usize,
    /// Once this heuristic has been deemed permanently ineffective, it will be
    /// inert throughout the rest of its lifetime. This serves as a cheap way
    /// to check inertness.
    inert: bool,
    /// The last (absolute) position at which a prefilter scanned to.
    /// Prefilters can use this position to determine whether to re-scan or
    /// not.
    ///
    /// Unlike other things that impact effectiveness, this is a fleeting
    /// condition. That is, a prefilter can be considered ineffective if it is
    /// at a position before `last_scan_at`, but can become effective again
    /// once the search moves past `last_scan_at`.
    ///
    /// The utility of this is to both avoid additional overhead from calling
    /// the prefilter and to avoid quadratic behavior. This ensures that a
    /// prefilter will scan any particular byte at most once. (Note that some
    /// prefilters, like the start-byte prefilter, do not need to use this
    /// field at all, since it only looks for starting bytes.)
    last_scan_at: usize,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: usize = 40;

    /// The minimum number of bytes that each skip must average, expressed as
    /// a multiple of the maximum possible match length.
    ///
    /// That is, once MIN_SKIPS skips have occurred, if the average number of
    /// bytes skipped ever falls below MIN_AVG_FACTOR * max-match-length, then
    /// the prefilter is rendered inert.
    const MIN_AVG_FACTOR: usize = 2;

    /// Create a fresh prefilter state.
    pub fn new(max_match_len: usize) -> PrefilterState {
        PrefilterState {
            skips: 0,
            skipped: 0,
            max_match_len,
            inert: false,
            last_scan_at: 0,
        }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    #[inline]
    fn update_skipped_bytes(&mut self, skipped: usize) {
        self.skips += 1;
        self.skipped += skipped;
    }

    /// Updates the position at which the last scan stopped. This may be
    /// greater than the position of the last candidate reported. For example,
    /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
    /// will report a candidate at position `0`, but the end of its last scan
    /// will be at position `3`.
    ///
    /// This position factors into the effectiveness of this prefilter. If the
    /// current position is less than the last position at which a scan ended,
    /// then the prefilter should not be re-run until the search moves past
    /// that position.
    #[inline]
    fn update_at(&mut self, at: usize) {
        if at > self.last_scan_at {
            self.last_scan_at = at;
        }
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    ///
    /// The given position should correspond to the current starting position
    /// of the search.
    #[inline]
    pub fn is_effective(&mut self, at: usize) -> bool {
        if self.inert {
            return false;
        }
        if at < self.last_scan_at {
            return false;
        }
        if self.skips < PrefilterState::MIN_SKIPS {
            return true;
        }

        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
        if self.skipped >= min_avg * self.skips {
            return true;
        }

        // We're inert.
        self.inert = true;
        false
    }
}
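
// A worked sketch of the effectiveness heuristic above, added for exposition
// (not part of the original source). With `max_match_len = 10`, each skip
// must average at least MIN_AVG_FACTOR * 10 = 20 bytes once MIN_SKIPS skips
// have been recorded; 40 skips of a single byte each render the state inert,
// and it stays inert at every later position.
#[cfg(test)]
#[test]
fn prefilter_state_inert_sketch() {
    let mut state = PrefilterState::new(10);
    assert!(state.is_effective(0));
    for _ in 0..PrefilterState::MIN_SKIPS {
        state.update_skipped_bytes(1);
    }
    assert!(!state.is_effective(0));
    assert!(!state.is_effective(1_000));
}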

/// A builder for constructing the best possible prefilter. When constructed,
/// this builder will heuristically select the best prefilter it can build,
/// if any, and discard the rest.
#[derive(Debug)]
pub struct Builder {
    count: usize,
    ascii_case_insensitive: bool,
    start_bytes: StartBytesBuilder,
    rare_bytes: RareBytesBuilder,
    packed: Option<packed::Builder>,
}

impl Builder {
    /// Create a new builder for constructing the best possible prefilter.
    pub fn new(kind: MatchKind) -> Builder {
        let pbuilder = kind
            .as_packed()
            .map(|kind| packed::Config::new().match_kind(kind).builder());
        Builder {
            count: 0,
            ascii_case_insensitive: false,
            start_bytes: StartBytesBuilder::new(),
            rare_bytes: RareBytesBuilder::new(),
            packed: pbuilder,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
        self.ascii_case_insensitive = yes;
        self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
        self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
        self
    }

    /// Return a prefilter suitable for quickly finding potential matches.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    pub fn build(&self) -> Option<PrefilterObj> {
        match (self.start_bytes.build(), self.rare_bytes.build()) {
            // If we could build both start and rare prefilters, then there are
            // a few cases in which we'd want to use the start-byte prefilter
            // over the rare-byte prefilter, since the former has lower
            // overhead.
            (prestart @ Some(_), prerare @ Some(_)) => {
                // If the start-byte prefilter can scan for a smaller number
                // of bytes than the rare-byte prefilter, then it's probably
                // faster.
                let has_fewer_bytes =
                    self.start_bytes.count < self.rare_bytes.count;
                // Otherwise, if the combined frequency rank of the detected
                // bytes in the start-byte prefilter is "close" to the combined
                // frequency rank of the rare-byte prefilter, then we pick
                // the start-byte prefilter even if the rare-byte prefilter
                // heuristically searches for rare bytes. This is because the
                // rare-byte prefilter has higher constant costs, so we tend to
                // prefer the start-byte prefilter when we can.
                let has_rarer_bytes =
                    self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
                if has_fewer_bytes || has_rarer_bytes {
                    prestart
                } else {
                    prerare
                }
            }
            (prestart @ Some(_), None) => prestart,
            (None, prerare @ Some(_)) => prerare,
            (None, None) if self.ascii_case_insensitive => None,
            (None, None) => self
                .packed
                .as_ref()
                .and_then(|b| b.build())
                .map(|s| PrefilterObj::new(Packed(s))),
        }
    }

    /// Add a literal string to this prefilter builder.
    pub fn add(&mut self, bytes: &[u8]) {
        self.count += 1;
        self.start_bytes.add(bytes);
        self.rare_bytes.add(bytes);
        if let Some(ref mut pbuilder) = self.packed {
            pbuilder.add(bytes);
        }
    }
}
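
// A usage sketch added for exposition (not part of the original source):
// with two ASCII patterns whose first bytes are distinct, at least the
// start-byte prefilter is buildable, so `build` yields a prefilter object.
#[cfg(test)]
#[test]
fn builder_build_sketch() {
    let mut b = Builder::new(MatchKind::LeftmostFirst);
    b.add(b"foo");
    b.add(b"bar");
    assert!(b.build().is_some());
}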

/// A type that wraps a packed searcher and implements the `Prefilter`
/// interface.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);

impl Prefilter for Packed {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        self.0.heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        false
    }
}

/// A builder for constructing a rare byte prefilter.
///
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
/// occur in the patterns, and then quickly scan for occurrences of those rare
/// bytes.
#[derive(Clone, Debug)]
struct RareBytesBuilder {
    /// Whether this prefilter should account for ASCII case insensitivity or
    /// not.
    ascii_case_insensitive: bool,
    /// A set of byte offsets associated with detected rare bytes. An entry is
    /// only set if a rare byte is detected in a pattern.
    byte_offsets: RareByteOffsets,
    /// Whether this is available as a prefilter or not. This can be set to
    /// false during construction if a condition is seen that invalidates the
    /// use of the rare-byte prefilter.
    available: bool,
    /// The number of bytes set to an active value in `byte_offsets`.
    count: usize,
    /// The sum of frequency ranks for the rare bytes detected. This is
    /// intended to give a heuristic notion of how rare the bytes are.
    rank_sum: u16,
}

/// A set of rare byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
    /// When an item in this set has an offset of u8::MAX (255), then it is
    /// considered unset.
    set: [RareByteOffset; 256],
}

impl RareByteOffsets {
    /// Create a new empty set of rare byte offsets.
    pub fn empty() -> RareByteOffsets {
        RareByteOffsets { set: [RareByteOffset::default(); 256] }
    }

    /// Add the given offset for the given byte to this set. If there is no
    /// previous value set, then this records the offset and returns true.
    /// Otherwise, the stored offset is overwritten only when the given offset
    /// is greater than the existing one, and false is returned.
    ///
    /// The given offset must be active, otherwise this panics.
    pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool {
        assert!(off.is_active());

        let existing = &mut self.set[byte as usize];
        if !existing.is_active() {
            *existing = off;
            true
        } else {
            if existing.max < off.max {
                *existing = off;
            }
            false
        }
    }
}
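
// A short sketch of `apply`, added for exposition (not part of the original
// source): the first offset recorded for a byte is reported as new, while a
// later, larger offset overwrites the stored maximum but is not counted as
// new.
#[cfg(test)]
#[test]
fn rare_byte_offsets_apply_sketch() {
    let mut offsets = RareByteOffsets::empty();
    assert!(offsets.apply(b'z', RareByteOffset::new(2)));
    assert!(!offsets.apply(b'z', RareByteOffset::new(7)));
    assert_eq!(offsets.set[b'z' as usize].max, 7);
}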

impl fmt::Debug for RareByteOffsets {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut offsets = vec![];
        for off in self.set.iter() {
            if off.is_active() {
                offsets.push(off);
            }
        }
        f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
    }
}

/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
    /// The maximum offset at which a particular byte occurs from the start
    /// of any pattern. This is used as a shift amount. That is, when an
    /// occurrence of this byte is found, the candidate position reported by
    /// the prefilter is `position_of_byte - max`, such that the automaton
    /// will begin its search at a position that is guaranteed to observe a
    /// match.
    ///
    /// To avoid accidentally quadratic behavior, a prefilter is considered
    /// ineffective when it is asked to start scanning from a position that it
    /// has already scanned past.
    ///
    /// N.B. The maximum value for this is 254. A value of 255 indicates that
    /// this is unused. If a rare byte is found at an offset of 255 or greater,
    /// then the rare-byte prefilter is disabled for simplicity.
    max: u8,
}

impl Default for RareByteOffset {
    fn default() -> RareByteOffset {
        RareByteOffset { max: u8::MAX }
    }
}

impl RareByteOffset {
    /// Create a new rare byte offset. If the given offset is too big, then
    /// an inactive `RareByteOffset` is returned.
    fn new(max: usize) -> RareByteOffset {
        if max > (u8::MAX - 1) as usize {
            RareByteOffset::default()
        } else {
            RareByteOffset { max: max as u8 }
        }
    }

    /// Returns true if and only if this offset is active. If it's inactive,
    /// then it should not be used.
    fn is_active(&self) -> bool {
        self.max < u8::MAX
    }
}
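
// A sketch of the offset cap, added for exposition (not part of the original
// source): positions up to 254 produce an active offset, while anything
// larger yields the inactive sentinel (`max == u8::MAX`).
#[cfg(test)]
#[test]
fn rare_byte_offset_new_sketch() {
    assert!(RareByteOffset::new(0).is_active());
    assert!(RareByteOffset::new(254).is_active());
    assert!(!RareByteOffset::new(255).is_active());
}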

impl RareBytesBuilder {
    /// Create a new builder for constructing a rare byte prefilter.
    fn new() -> RareBytesBuilder {
        RareBytesBuilder {
            ascii_case_insensitive: false,
            byte_offsets: RareByteOffsets::empty(),
            available: true,
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the rare bytes prefilter.
    ///
    /// If there are more than 3 distinct rare bytes found, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    fn build(&self) -> Option<PrefilterObj> {
        if !self.available || self.count > 3 {
            return None;
        }
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if self.byte_offsets.set[b].is_active() {
                bytes[len] = b as u8;
                len += 1;
            }
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(RareBytesOne {
                byte1: bytes[0],
                offset: self.byte_offsets.set[bytes[0] as usize],
            })),
            2 => Some(PrefilterObj::new(RareBytesTwo {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(RareBytesThree {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            _ => unreachable!(),
        }
    }

    /// Add a byte string to this builder.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    fn add(&mut self, bytes: &[u8]) {
        // If we've already blown our budget, then don't waste time looking
        // for more rare bytes.
        if self.count > 3 {
            self.available = false;
            return;
        }
        let mut rarest = match bytes.get(0) {
            None => return,
            Some(&b) => (b, 0, freq_rank(b)),
        };
        // The idea here is to look for the rarest byte in each pattern, and
        // add that to our set. As a special exception, if we see a byte that
        // we've already added, then we immediately stop and choose that byte,
        // even if there's another rare byte in the pattern. This helps us
        // apply the rare byte optimization in more cases by attempting to pick
        // bytes that are in common between patterns. So for example, if we
        // were searching for `Sherlock` and `lockjaw`, then this would pick
        // `k` for both patterns, resulting in the use of `memchr` instead of
        // `memchr2` for `k` and `j`.
        for (pos, &b) in bytes.iter().enumerate() {
            if self.byte_offsets.set[b as usize].is_active() {
                self.add_rare_byte(b, pos);
                return;
            }
            let rank = freq_rank(b);
            if rank < rarest.2 {
                rarest = (b, pos, rank);
            }
        }
        self.add_rare_byte(rarest.0, rarest.1);
    }

    fn add_rare_byte(&mut self, byte: u8, pos: usize) {
        self.add_one_byte(byte, pos);
        if self.ascii_case_insensitive {
            self.add_one_byte(opposite_ascii_case(byte), pos);
        }
    }

    fn add_one_byte(&mut self, byte: u8, pos: usize) {
        let off = RareByteOffset::new(pos);
        if !off.is_active() {
            self.available = false;
            return;
        }
        if self.byte_offsets.apply(byte, off) {
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}
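
// An illustrative sketch of the "prefer a byte we've already chosen" rule
// described in `add` above, added for exposition (not part of the original
// source): the second pattern reuses `x` rather than introducing a second
// rare byte, and the recorded offset grows to the larger position.
#[cfg(test)]
#[test]
fn rare_bytes_builder_shared_byte_sketch() {
    let mut b = RareBytesBuilder::new();
    b.add(b"x");
    assert_eq!(b.count, 1);
    b.add(b"yx");
    assert_eq!(b.count, 1);
    assert_eq!(b.byte_offsets.set[b'x' as usize].max, 1);
}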

/// A prefilter for scanning for a single "rare" byte.
#[derive(Clone, Debug)]
struct RareBytesOne {
    byte1: u8,
    offset: RareByteOffset,
}

impl Prefilter for RareBytesOne {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr(self.byte1, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.last_scan_at = pos;
                cmp::max(at, pos.saturating_sub(self.offset.max as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
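
// A worked sketch of the shift described for `RareByteOffset::max`, added for
// exposition (not part of the original source): `z` is found at position 3,
// and with a maximum in-pattern offset of 2 the candidate start is reported
// at 3 - 2 = 1, while the state records that the scan reached position 3.
#[cfg(test)]
#[test]
fn rare_bytes_one_sketch() {
    let pre = RareBytesOne { byte1: b'z', offset: RareByteOffset { max: 2 } };
    let mut state = PrefilterState::new(3);
    match pre.next_candidate(&mut state, b"abczde", 0) {
        Candidate::PossibleStartOfMatch(i) => assert_eq!(i, 1),
        cand => panic!("unexpected candidate: {:?}", cand),
    }
    assert_eq!(state.last_scan_at, 3);
}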

/// A prefilter for scanning for two "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesTwo {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
}

impl Prefilter for RareBytesTwo {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr2(self.byte1, self.byte2, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.update_at(pos);
                let offset = self.offsets.set[haystack[pos] as usize].max;
                cmp::max(at, pos.saturating_sub(offset as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A prefilter for scanning for three "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesThree {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for RareBytesThree {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.update_at(pos);
                let offset = self.offsets.set[haystack[pos] as usize].max;
                cmp::max(at, pos.saturating_sub(offset as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A builder for constructing a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes effect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
struct StartBytesBuilder {
    /// Whether this prefilter should account for ASCII case insensitivity or
    /// not.
    ascii_case_insensitive: bool,
    /// The set of starting bytes observed.
    byteset: Vec<bool>,
    /// The number of bytes set to true in `byteset`.
    count: usize,
    /// The sum of frequency ranks for the starting bytes detected. This is
    /// intended to give a heuristic notion of how rare the bytes are.
    rank_sum: u16,
}

impl StartBytesBuilder {
    /// Create a new builder for constructing a start byte prefilter.
    fn new() -> StartBytesBuilder {
        StartBytesBuilder {
            ascii_case_insensitive: false,
            byteset: vec![false; 256],
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the starting bytes prefilter.
    ///
    /// If there are more than 3 distinct starting bytes, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    fn build(&self) -> Option<PrefilterObj> {
        if self.count > 3 {
            return None;
        }
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if !self.byteset[b] {
                continue;
            }
            // We don't handle non-ASCII bytes for now. Getting non-ASCII
            // bytes right is trickier, since we generally don't want to put
            // a non-ASCII leading UTF-8 code unit into a prefilter, as such
            // bytes tend to be very common (many distinct codepoints share
            // the same leading byte). Instead, it would be better to use a
            // continuation byte, but this requires more sophisticated analysis
            // of the automaton and a richer prefilter API.
            if b > 0x7F {
                return None;
            }
            bytes[len] = b as u8;
            len += 1;
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
            2 => Some(PrefilterObj::new(StartBytesTwo {
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(StartBytesThree {
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            _ => unreachable!(),
        }
    }

    /// Add a byte string to this builder.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    fn add(&mut self, bytes: &[u8]) {
        if self.count > 3 {
            return;
        }
        if let Some(&byte) = bytes.get(0) {
            self.add_one_byte(byte);
            if self.ascii_case_insensitive {
                self.add_one_byte(opposite_ascii_case(byte));
            }
        }
    }

    fn add_one_byte(&mut self, byte: u8) {
        if !self.byteset[byte as usize] {
            self.byteset[byte as usize] = true;
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}
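
// A sketch of the two bail-out conditions described in `build` above, added
// for exposition (not part of the original source): more than 3 distinct
// starting bytes, or a non-ASCII starting byte, cause `build` to decline to
// produce a prefilter.
#[cfg(test)]
#[test]
fn start_bytes_builder_limits_sketch() {
    let mut too_many = StartBytesBuilder::new();
    too_many.add(b"apple");
    too_many.add(b"banana");
    too_many.add(b"cherry");
    too_many.add(b"date");
    assert!(too_many.build().is_none());

    let mut non_ascii = StartBytesBuilder::new();
    non_ascii.add("λ".as_bytes());
    assert!(non_ascii.build().is_none());
}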

/// A prefilter for scanning for a single starting byte.
#[derive(Clone, Debug)]
struct StartBytesOne {
    byte1: u8,
}

impl Prefilter for StartBytesOne {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr(self.byte1, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A prefilter for scanning for two starting bytes.
#[derive(Clone, Debug)]
struct StartBytesTwo {
    byte1: u8,
    byte2: u8,
}

impl Prefilter for StartBytesTwo {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr2(self.byte1, self.byte2, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A prefilter for scanning for three starting bytes.
#[derive(Clone, Debug)]
struct StartBytesThree {
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for StartBytesThree {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// Return the next candidate reported by the given prefilter while
/// simultaneously updating the given prestate.
///
/// The caller is responsible for checking the prestate before deciding whether
/// to initiate a search.
#[inline]
pub fn next<P: Prefilter>(
    prestate: &mut PrefilterState,
    prefilter: P,
    haystack: &[u8],
    at: usize,
) -> Candidate {
    let cand = prefilter.next_candidate(prestate, haystack, at);
    match cand {
        Candidate::None => {
            prestate.update_skipped_bytes(haystack.len() - at);
        }
        Candidate::Match(ref m) => {
            prestate.update_skipped_bytes(m.start() - at);
        }
        Candidate::PossibleStartOfMatch(i) => {
            prestate.update_skipped_bytes(i - at);
        }
    }
    cand
}
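
// A sketch of how `next` credits skipped bytes to the state, added for
// exposition (not part of the original source): a candidate at offset 3 from
// a starting position of 0 records one skip of 3 bytes.
#[cfg(test)]
#[test]
fn next_updates_state_sketch() {
    let mut state = PrefilterState::new(1);
    let pre = StartBytesOne { byte1: b'z' };
    match next(&mut state, &pre, b"aaaz", 0) {
        Candidate::PossibleStartOfMatch(i) => assert_eq!(i, 3),
        cand => panic!("unexpected candidate: {:?}", cand),
    }
    assert_eq!((state.skips, state.skipped), (1, 3));
}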

/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
/// unchanged.
pub fn opposite_ascii_case(b: u8) -> u8 {
    if b'A' <= b && b <= b'Z' {
        b.to_ascii_lowercase()
    } else if b'a' <= b && b <= b'z' {
        b.to_ascii_uppercase()
    } else {
        b
    }
}
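
// A quick sketch of the case-flipping helper, added for exposition (not part
// of the original source): ASCII letters swap case, everything else passes
// through unchanged.
#[cfg(test)]
#[test]
fn opposite_ascii_case_sketch() {
    assert_eq!(opposite_ascii_case(b'A'), b'a');
    assert_eq!(opposite_ascii_case(b'z'), b'Z');
    assert_eq!(opposite_ascii_case(b'1'), b'1');
}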

/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
    use byte_frequencies::BYTE_FREQUENCIES;
    BYTE_FREQUENCIES[b as usize]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn scratch() {
        let mut b = Builder::new(MatchKind::LeftmostFirst);
        b.add(b"Sherlock");
        b.add(b"locjaw");
        // b.add(b"Sherlock");
        // b.add(b"Holmes");
        // b.add(b"Watson");
        // b.add("Шерлок Холмс".as_bytes());
        // b.add("Джон Уотсон".as_bytes());

        let s = b.build().unwrap();
        println!("{:?}", s);
    }
955}