aho_corasick/
prefilter.rs

1use std::cmp;
2use std::fmt;
3use std::panic::{RefUnwindSafe, UnwindSafe};
4use std::u8;
5
6use memchr::{memchr, memchr2, memchr3};
7
8use ahocorasick::MatchKind;
9use packed;
10use Match;
11
12/// A candidate is the result of running a prefilter on a haystack at a
13/// particular position. The result is either no match, a confirmed match or
14/// a possible match.
15///
16/// When no match is returned, the prefilter is guaranteeing that no possible
17/// match can be found in the haystack, and the caller may trust this. That is,
18/// all correct prefilters must never report false negatives.
19///
20/// In some cases, a prefilter can confirm a match very quickly, in which case,
21/// the caller may use this to stop what it's doing and report the match. In
22/// this case, prefilter implementations must never report a false positive.
23/// In other cases, the prefilter can only report a potential match, in which
24/// case the callers must attempt to confirm the match. In this case, prefilter
25/// implementations are permitted to return false positives.
26#[derive(Clone, Debug)]
27pub enum Candidate {
28    None,
29    Match(Match),
30    PossibleStartOfMatch(usize),
31}
32
33impl Candidate {
34    /// Convert this candidate into an option. This is useful when callers
35    /// do not distinguish between true positives and false positives (i.e.,
36    /// the caller must always confirm the match in order to update some other
37    /// state).
38    pub fn into_option(self) -> Option<usize> {
39        match self {
40            Candidate::None => None,
41            Candidate::Match(ref m) => Some(m.start()),
42            Candidate::PossibleStartOfMatch(start) => Some(start),
43        }
44    }
45}
46
47/// A prefilter describes the behavior of fast literal scanners for quickly
48/// skipping past bytes in the haystack that we know cannot possibly
49/// participate in a match.
50pub trait Prefilter:
51    Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
52{
53    /// Returns the next possible match candidate. This may yield false
54    /// positives, so callers must confirm a match starting at the position
55    /// returned. This, however, must never produce false negatives. That is,
56    /// this must, at minimum, return the starting position of the next match
57    /// in the given haystack after or at the given position.
58    fn next_candidate(
59        &self,
60        state: &mut PrefilterState,
61        haystack: &[u8],
62        at: usize,
63    ) -> Candidate;
64
65    /// A method for cloning a prefilter, to work-around the fact that Clone
66    /// is not object-safe.
67    fn clone_prefilter(&self) -> Box<dyn Prefilter>;
68
69    /// Returns the approximate total amount of heap used by this prefilter, in
70    /// units of bytes.
71    fn heap_bytes(&self) -> usize;
72
73    /// Returns true if and only if this prefilter never returns false
74    /// positives. This is useful for completely avoiding the automaton
75    /// when the prefilter can quickly confirm its own matches.
76    ///
77    /// By default, this returns true, which is conservative; it is always
78    /// correct to return `true`. Returning `false` here and reporting a false
79    /// positive will result in incorrect searches.
80    fn reports_false_positives(&self) -> bool {
81        true
82    }
83}
84
85impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
86    #[inline]
87    fn next_candidate(
88        &self,
89        state: &mut PrefilterState,
90        haystack: &[u8],
91        at: usize,
92    ) -> Candidate {
93        (**self).next_candidate(state, haystack, at)
94    }
95
96    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
97        (**self).clone_prefilter()
98    }
99
100    fn heap_bytes(&self) -> usize {
101        (**self).heap_bytes()
102    }
103
104    fn reports_false_positives(&self) -> bool {
105        (**self).reports_false_positives()
106    }
107}
108
109/// A convenience object for representing any type that implements Prefilter
110/// and is cloneable.
111#[derive(Debug)]
112pub struct PrefilterObj(Box<dyn Prefilter>);
113
114impl Clone for PrefilterObj {
115    fn clone(&self) -> Self {
116        PrefilterObj(self.0.clone_prefilter())
117    }
118}
119
120impl PrefilterObj {
121    /// Create a new prefilter object.
122    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
123        PrefilterObj(Box::new(t))
124    }
125
126    /// Return the underlying prefilter trait object.
127    pub fn as_ref(&self) -> &dyn Prefilter {
128        &*self.0
129    }
130}
131
132/// PrefilterState tracks state associated with the effectiveness of a
133/// prefilter. It is used to track how many bytes, on average, are skipped by
134/// the prefilter. If this average dips below a certain threshold over time,
135/// then the state renders the prefilter inert and stops using it.
136///
137/// A prefilter state should be created for each search. (Where creating an
138/// iterator via, e.g., `find_iter`, is treated as a single search.)
139#[derive(Clone, Debug)]
140pub struct PrefilterState {
141    /// The number of skips that has been executed.
142    skips: usize,
143    /// The total number of bytes that have been skipped.
144    skipped: usize,
145    /// The maximum length of a match. This is used to help determine how many
146    /// bytes on average should be skipped in order for a prefilter to be
147    /// effective.
148    max_match_len: usize,
149    /// Once this heuristic has been deemed permanently ineffective, it will be
150    /// inert throughout the rest of its lifetime. This serves as a cheap way
151    /// to check inertness.
152    inert: bool,
153    /// The last (absolute) position at which a prefilter scanned to.
154    /// Prefilters can use this position to determine whether to re-scan or
155    /// not.
156    ///
157    /// Unlike other things that impact effectiveness, this is a fleeting
158    /// condition. That is, a prefilter can be considered ineffective if it is
159    /// at a position before `last_scan_at`, but can become effective again
160    /// once the search moves past `last_scan_at`.
161    ///
162    /// The utility of this is to both avoid additional overhead from calling
163    /// the prefilter and to avoid quadratic behavior. This ensures that a
164    /// prefilter will scan any particular byte at most once. (Note that some
165    /// prefilters, like the start-byte prefilter, do not need to use this
166    /// field at all, since it only looks for starting bytes.)
167    last_scan_at: usize,
168}
169
170impl PrefilterState {
171    /// The minimum number of skip attempts to try before considering whether
172    /// a prefilter is effective or not.
173    const MIN_SKIPS: usize = 40;
174
175    /// The minimum amount of bytes that skipping must average, expressed as a
176    /// factor of the multiple of the length of a possible match.
177    ///
178    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
179    /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
180    /// prefilter outed to be rendered inert.
181    const MIN_AVG_FACTOR: usize = 2;
182
183    /// Create a fresh prefilter state.
184    pub fn new(max_match_len: usize) -> PrefilterState {
185        PrefilterState {
186            skips: 0,
187            skipped: 0,
188            max_match_len,
189            inert: false,
190            last_scan_at: 0,
191        }
192    }
193
194    /// Update this state with the number of bytes skipped on the last
195    /// invocation of the prefilter.
196    #[inline]
197    fn update_skipped_bytes(&mut self, skipped: usize) {
198        self.skips += 1;
199        self.skipped += skipped;
200    }
201
202    /// Updates the position at which the last scan stopped. This may be
203    /// greater than the position of the last candidate reported. For example,
204    /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
205    /// will report a candidate at position `0`, but the end of its last scan
206    /// will be at position `3`.
207    ///
208    /// This position factors into the effectiveness of this prefilter. If the
209    /// current position is less than the last position at which a scan ended,
210    /// then the prefilter should not be re-run until the search moves past
211    /// that position.
212    #[inline]
213    fn update_at(&mut self, at: usize) {
214        if at > self.last_scan_at {
215            self.last_scan_at = at;
216        }
217    }
218
219    /// Return true if and only if this state indicates that a prefilter is
220    /// still effective.
221    ///
222    /// The given pos should correspond to the current starting position of the
223    /// search.
224    #[inline]
225    pub fn is_effective(&mut self, at: usize) -> bool {
226        if self.inert {
227            return false;
228        }
229        if at < self.last_scan_at {
230            return false;
231        }
232        if self.skips < PrefilterState::MIN_SKIPS {
233            return true;
234        }
235
236        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
237        if self.skipped >= min_avg * self.skips {
238            return true;
239        }
240
241        // We're inert.
242        self.inert = true;
243        false
244    }
245}
246
247/// A builder for constructing the best possible prefilter. When constructed,
248/// this builder will heuristically select the best prefilter it can build,
249/// if any, and discard the rest.
250#[derive(Debug)]
251pub struct Builder {
252    count: usize,
253    ascii_case_insensitive: bool,
254    start_bytes: StartBytesBuilder,
255    rare_bytes: RareBytesBuilder,
256    packed: Option<packed::Builder>,
257}
258
259impl Builder {
260    /// Create a new builder for constructing the best possible prefilter.
261    pub fn new(kind: MatchKind) -> Builder {
262        let pbuilder = kind
263            .as_packed()
264            .map(|kind| packed::Config::new().match_kind(kind).builder());
265        Builder {
266            count: 0,
267            ascii_case_insensitive: false,
268            start_bytes: StartBytesBuilder::new(),
269            rare_bytes: RareBytesBuilder::new(),
270            packed: pbuilder,
271        }
272    }
273
274    /// Enable ASCII case insensitivity. When set, byte strings added to this
275    /// builder will be interpreted without respect to ASCII case.
276    pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
277        self.ascii_case_insensitive = yes;
278        self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
279        self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
280        self
281    }
282
283    /// Return a prefilter suitable for quickly finding potential matches.
284    ///
285    /// All patterns added to an Aho-Corasick automaton should be added to this
286    /// builder before attempting to construct the prefilter.
287    pub fn build(&self) -> Option<PrefilterObj> {
288        match (self.start_bytes.build(), self.rare_bytes.build()) {
289            // If we could build both start and rare prefilters, then there are
290            // a few cases in which we'd want to use the start-byte prefilter
291            // over the rare-byte prefilter, since the former has lower
292            // overhead.
293            (prestart @ Some(_), prerare @ Some(_)) => {
294                // If the start-byte prefilter can scan for a smaller number
295                // of bytes than the rare-byte prefilter, then it's probably
296                // faster.
297                let has_fewer_bytes =
298                    self.start_bytes.count < self.rare_bytes.count;
299                // Otherwise, if the combined frequency rank of the detected
300                // bytes in the start-byte prefilter is "close" to the combined
301                // frequency rank of the rare-byte prefilter, then we pick
302                // the start-byte prefilter even if the rare-byte prefilter
303                // heuristically searches for rare bytes. This is because the
304                // rare-byte prefilter has higher constant costs, so we tend to
305                // prefer the start-byte prefilter when we can.
306                let has_rarer_bytes =
307                    self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
308                if has_fewer_bytes || has_rarer_bytes {
309                    prestart
310                } else {
311                    prerare
312                }
313            }
314            (prestart @ Some(_), None) => prestart,
315            (None, prerare @ Some(_)) => prerare,
316            (None, None) if self.ascii_case_insensitive => None,
317            (None, None) => self
318                .packed
319                .as_ref()
320                .and_then(|b| b.build())
321                .map(|s| PrefilterObj::new(Packed(s))),
322        }
323    }
324
325    /// Add a literal string to this prefilter builder.
326    pub fn add(&mut self, bytes: &[u8]) {
327        self.count += 1;
328        self.start_bytes.add(bytes);
329        self.rare_bytes.add(bytes);
330        if let Some(ref mut pbuilder) = self.packed {
331            pbuilder.add(bytes);
332        }
333    }
334}
335
336/// A type that wraps a packed searcher and implements the `Prefilter`
337/// interface.
338#[derive(Clone, Debug)]
339struct Packed(packed::Searcher);
340
341impl Prefilter for Packed {
342    fn next_candidate(
343        &self,
344        _state: &mut PrefilterState,
345        haystack: &[u8],
346        at: usize,
347    ) -> Candidate {
348        self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
349    }
350
351    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
352        Box::new(self.clone())
353    }
354
355    fn heap_bytes(&self) -> usize {
356        self.0.heap_bytes()
357    }
358
359    fn reports_false_positives(&self) -> bool {
360        false
361    }
362}
363
364/// A builder for constructing a rare byte prefilter.
365///
366/// A rare byte prefilter attempts to pick out a small set of rare bytes that
367/// occurr in the patterns, and then quickly scan to matches of those rare
368/// bytes.
369#[derive(Clone, Debug)]
370struct RareBytesBuilder {
371    /// Whether this prefilter should account for ASCII case insensitivity or
372    /// not.
373    ascii_case_insensitive: bool,
374    /// A set of byte offsets associated with detected rare bytes. An entry is
375    /// only set if a rare byte is detected in a pattern.
376    byte_offsets: RareByteOffsets,
377    /// Whether this is available as a prefilter or not. This can be set to
378    /// false during construction if a condition is seen that invalidates the
379    /// use of the rare-byte prefilter.
380    available: bool,
381    /// The number of bytes set to an active value in `byte_offsets`.
382    count: usize,
383    /// The sum of frequency ranks for the rare bytes detected. This is
384    /// intended to give a heuristic notion of how rare the bytes are.
385    rank_sum: u16,
386}
387
388/// A set of rare byte offsets, keyed by byte.
389#[derive(Clone, Copy)]
390struct RareByteOffsets {
391    /// When an item in this set has an offset of u8::MAX (255), then it is
392    /// considered unset.
393    set: [RareByteOffset; 256],
394}
395
396impl RareByteOffsets {
397    /// Create a new empty set of rare byte offsets.
398    pub fn empty() -> RareByteOffsets {
399        RareByteOffsets { set: [RareByteOffset::default(); 256] }
400    }
401
402    /// Add the given offset for the given byte to this set. If the offset is
403    /// greater than the existing offset, then it overwrites the previous
404    /// value and returns false. If there is no previous value set, then this
405    /// sets it and returns true.
406    ///
407    /// The given offset must be active, otherwise this panics.
408    pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool {
409        assert!(off.is_active());
410
411        let existing = &mut self.set[byte as usize];
412        if !existing.is_active() {
413            *existing = off;
414            true
415        } else {
416            if existing.max < off.max {
417                *existing = off;
418            }
419            false
420        }
421    }
422}
423
424impl fmt::Debug for RareByteOffsets {
425    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
426        let mut offsets = vec![];
427        for off in self.set.iter() {
428            if off.is_active() {
429                offsets.push(off);
430            }
431        }
432        f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
433    }
434}
435
436/// Offsets associated with an occurrence of a "rare" byte in any of the
437/// patterns used to construct a single Aho-Corasick automaton.
438#[derive(Clone, Copy, Debug)]
439struct RareByteOffset {
440    /// The maximum offset at which a particular byte occurs from the start
441    /// of any pattern. This is used as a shift amount. That is, when an
442    /// occurrence of this byte is found, the candidate position reported by
443    /// the prefilter is `position_of_byte - max`, such that the automaton
444    /// will begin its search at a position that is guaranteed to observe a
445    /// match.
446    ///
447    /// To avoid accidentally quadratic behavior, a prefilter is considered
448    /// ineffective when it is asked to start scanning from a position that it
449    /// has already scanned past.
450    ///
451    /// N.B. The maximum value for this is 254. A value of 255 indicates that
452    /// this is unused. If a rare byte is found at an offset of 255 or greater,
453    /// then the rare-byte prefilter is disabled for simplicity.
454    max: u8,
455}
456
457impl Default for RareByteOffset {
458    fn default() -> RareByteOffset {
459        RareByteOffset { max: u8::MAX }
460    }
461}
462
463impl RareByteOffset {
464    /// Create a new rare byte offset. If the given offset is too big, then
465    /// an inactive `RareByteOffset` is returned.
466    fn new(max: usize) -> RareByteOffset {
467        if max > (u8::MAX - 1) as usize {
468            RareByteOffset::default()
469        } else {
470            RareByteOffset { max: max as u8 }
471        }
472    }
473
474    /// Returns true if and only if this offset is active. If it's inactive,
475    /// then it should not be used.
476    fn is_active(&self) -> bool {
477        self.max < u8::MAX
478    }
479}
480
481impl RareBytesBuilder {
482    /// Create a new builder for constructing a rare byte prefilter.
483    fn new() -> RareBytesBuilder {
484        RareBytesBuilder {
485            ascii_case_insensitive: false,
486            byte_offsets: RareByteOffsets::empty(),
487            available: true,
488            count: 0,
489            rank_sum: 0,
490        }
491    }
492
493    /// Enable ASCII case insensitivity. When set, byte strings added to this
494    /// builder will be interpreted without respect to ASCII case.
495    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
496        self.ascii_case_insensitive = yes;
497        self
498    }
499
500    /// Build the rare bytes prefilter.
501    ///
502    /// If there are more than 3 distinct starting bytes, or if heuristics
503    /// otherwise determine that this prefilter should not be used, then `None`
504    /// is returned.
505    fn build(&self) -> Option<PrefilterObj> {
506        if !self.available || self.count > 3 {
507            return None;
508        }
509        let (mut bytes, mut len) = ([0; 3], 0);
510        for b in 0..256 {
511            if self.byte_offsets.set[b].is_active() {
512                bytes[len] = b as u8;
513                len += 1;
514            }
515        }
516        match len {
517            0 => None,
518            1 => Some(PrefilterObj::new(RareBytesOne {
519                byte1: bytes[0],
520                offset: self.byte_offsets.set[bytes[0] as usize],
521            })),
522            2 => Some(PrefilterObj::new(RareBytesTwo {
523                offsets: self.byte_offsets,
524                byte1: bytes[0],
525                byte2: bytes[1],
526            })),
527            3 => Some(PrefilterObj::new(RareBytesThree {
528                offsets: self.byte_offsets,
529                byte1: bytes[0],
530                byte2: bytes[1],
531                byte3: bytes[2],
532            })),
533            _ => unreachable!(),
534        }
535    }
536
537    /// Add a byte string to this builder.
538    ///
539    /// All patterns added to an Aho-Corasick automaton should be added to this
540    /// builder before attempting to construct the prefilter.
541    fn add(&mut self, bytes: &[u8]) {
542        // If we've already blown our budget, then don't waste time looking
543        // for more rare bytes.
544        if self.count > 3 {
545            self.available = false;
546            return;
547        }
548        let mut rarest = match bytes.get(0) {
549            None => return,
550            Some(&b) => (b, 0, freq_rank(b)),
551        };
552        // The idea here is to look for the rarest byte in each pattern, and
553        // add that to our set. As a special exception, if we see a byte that
554        // we've already added, then we immediately stop and choose that byte,
555        // even if there's another rare byte in the pattern. This helps us
556        // apply the rare byte optimization in more cases by attempting to pick
557        // bytes that are in common between patterns. So for example, if we
558        // were searching for `Sherlock` and `lockjaw`, then this would pick
559        // `k` for both patterns, resulting in the use of `memchr` instead of
560        // `memchr2` for `k` and `j`.
561        for (pos, &b) in bytes.iter().enumerate() {
562            if self.byte_offsets.set[b as usize].is_active() {
563                self.add_rare_byte(b, pos);
564                return;
565            }
566            let rank = freq_rank(b);
567            if rank < rarest.2 {
568                rarest = (b, pos, rank);
569            }
570        }
571        self.add_rare_byte(rarest.0, rarest.1);
572    }
573
574    fn add_rare_byte(&mut self, byte: u8, pos: usize) {
575        self.add_one_byte(byte, pos);
576        if self.ascii_case_insensitive {
577            self.add_one_byte(opposite_ascii_case(byte), pos);
578        }
579    }
580
581    fn add_one_byte(&mut self, byte: u8, pos: usize) {
582        let off = RareByteOffset::new(pos);
583        if !off.is_active() {
584            self.available = false;
585            return;
586        }
587        if self.byte_offsets.apply(byte, off) {
588            self.count += 1;
589            self.rank_sum += freq_rank(byte) as u16;
590        }
591    }
592}
593
594/// A prefilter for scanning for a single "rare" byte.
595#[derive(Clone, Debug)]
596struct RareBytesOne {
597    byte1: u8,
598    offset: RareByteOffset,
599}
600
601impl Prefilter for RareBytesOne {
602    fn next_candidate(
603        &self,
604        state: &mut PrefilterState,
605        haystack: &[u8],
606        at: usize,
607    ) -> Candidate {
608        memchr(self.byte1, &haystack[at..])
609            .map(|i| {
610                let pos = at + i;
611                state.last_scan_at = pos;
612                cmp::max(at, pos.saturating_sub(self.offset.max as usize))
613            })
614            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
615    }
616
617    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
618        Box::new(self.clone())
619    }
620
621    fn heap_bytes(&self) -> usize {
622        0
623    }
624}
625
626/// A prefilter for scanning for two "rare" bytes.
627#[derive(Clone, Debug)]
628struct RareBytesTwo {
629    offsets: RareByteOffsets,
630    byte1: u8,
631    byte2: u8,
632}
633
634impl Prefilter for RareBytesTwo {
635    fn next_candidate(
636        &self,
637        state: &mut PrefilterState,
638        haystack: &[u8],
639        at: usize,
640    ) -> Candidate {
641        memchr2(self.byte1, self.byte2, &haystack[at..])
642            .map(|i| {
643                let pos = at + i;
644                state.update_at(pos);
645                let offset = self.offsets.set[haystack[pos] as usize].max;
646                cmp::max(at, pos.saturating_sub(offset as usize))
647            })
648            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
649    }
650
651    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
652        Box::new(self.clone())
653    }
654
655    fn heap_bytes(&self) -> usize {
656        0
657    }
658}
659
660/// A prefilter for scanning for three "rare" bytes.
661#[derive(Clone, Debug)]
662struct RareBytesThree {
663    offsets: RareByteOffsets,
664    byte1: u8,
665    byte2: u8,
666    byte3: u8,
667}
668
669impl Prefilter for RareBytesThree {
670    fn next_candidate(
671        &self,
672        state: &mut PrefilterState,
673        haystack: &[u8],
674        at: usize,
675    ) -> Candidate {
676        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
677            .map(|i| {
678                let pos = at + i;
679                state.update_at(pos);
680                let offset = self.offsets.set[haystack[pos] as usize].max;
681                cmp::max(at, pos.saturating_sub(offset as usize))
682            })
683            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
684    }
685
686    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
687        Box::new(self.clone())
688    }
689
690    fn heap_bytes(&self) -> usize {
691        0
692    }
693}
694
695/// A builder for constructing a starting byte prefilter.
696///
697/// A starting byte prefilter is a simplistic prefilter that looks for possible
698/// matches by reporting all positions corresponding to a particular byte. This
699/// generally only takes affect when there are at most 3 distinct possible
700/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
701/// distinct starting bytes (`f` and `b`), and this prefiler returns all
702/// occurrences of either `f` or `b`.
703///
704/// In some cases, a heuristic frequency analysis may determine that it would
705/// be better not to use this prefilter even when there are 3 or fewer distinct
706/// starting bytes.
707#[derive(Clone, Debug)]
708struct StartBytesBuilder {
709    /// Whether this prefilter should account for ASCII case insensitivity or
710    /// not.
711    ascii_case_insensitive: bool,
712    /// The set of starting bytes observed.
713    byteset: Vec<bool>,
714    /// The number of bytes set to true in `byteset`.
715    count: usize,
716    /// The sum of frequency ranks for the rare bytes detected. This is
717    /// intended to give a heuristic notion of how rare the bytes are.
718    rank_sum: u16,
719}
720
721impl StartBytesBuilder {
722    /// Create a new builder for constructing a start byte prefilter.
723    fn new() -> StartBytesBuilder {
724        StartBytesBuilder {
725            ascii_case_insensitive: false,
726            byteset: vec![false; 256],
727            count: 0,
728            rank_sum: 0,
729        }
730    }
731
732    /// Enable ASCII case insensitivity. When set, byte strings added to this
733    /// builder will be interpreted without respect to ASCII case.
734    fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
735        self.ascii_case_insensitive = yes;
736        self
737    }
738
739    /// Build the starting bytes prefilter.
740    ///
741    /// If there are more than 3 distinct starting bytes, or if heuristics
742    /// otherwise determine that this prefilter should not be used, then `None`
743    /// is returned.
744    fn build(&self) -> Option<PrefilterObj> {
745        if self.count > 3 {
746            return None;
747        }
748        let (mut bytes, mut len) = ([0; 3], 0);
749        for b in 0..256 {
750            if !self.byteset[b] {
751                continue;
752            }
753            // We don't handle non-ASCII bytes for now. Getting non-ASCII
754            // bytes right is trickier, since we generally don't want to put
755            // a leading UTF-8 code unit into a prefilter that isn't ASCII,
756            // since they can frequently. Instead, it would be better to use a
757            // continuation byte, but this requires more sophisticated analysis
758            // of the automaton and a richer prefilter API.
759            if b > 0x7F {
760                return None;
761            }
762            bytes[len] = b as u8;
763            len += 1;
764        }
765        match len {
766            0 => None,
767            1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
768            2 => Some(PrefilterObj::new(StartBytesTwo {
769                byte1: bytes[0],
770                byte2: bytes[1],
771            })),
772            3 => Some(PrefilterObj::new(StartBytesThree {
773                byte1: bytes[0],
774                byte2: bytes[1],
775                byte3: bytes[2],
776            })),
777            _ => unreachable!(),
778        }
779    }
780
781    /// Add a byte string to this builder.
782    ///
783    /// All patterns added to an Aho-Corasick automaton should be added to this
784    /// builder before attempting to construct the prefilter.
785    fn add(&mut self, bytes: &[u8]) {
786        if self.count > 3 {
787            return;
788        }
789        if let Some(&byte) = bytes.get(0) {
790            self.add_one_byte(byte);
791            if self.ascii_case_insensitive {
792                self.add_one_byte(opposite_ascii_case(byte));
793            }
794        }
795    }
796
797    fn add_one_byte(&mut self, byte: u8) {
798        if !self.byteset[byte as usize] {
799            self.byteset[byte as usize] = true;
800            self.count += 1;
801            self.rank_sum += freq_rank(byte) as u16;
802        }
803    }
804}
805
806/// A prefilter for scanning for a single starting byte.
807#[derive(Clone, Debug)]
808struct StartBytesOne {
809    byte1: u8,
810}
811
812impl Prefilter for StartBytesOne {
813    fn next_candidate(
814        &self,
815        _state: &mut PrefilterState,
816        haystack: &[u8],
817        at: usize,
818    ) -> Candidate {
819        memchr(self.byte1, &haystack[at..])
820            .map(|i| at + i)
821            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
822    }
823
824    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
825        Box::new(self.clone())
826    }
827
828    fn heap_bytes(&self) -> usize {
829        0
830    }
831}
832
833/// A prefilter for scanning for two starting bytes.
834#[derive(Clone, Debug)]
835struct StartBytesTwo {
836    byte1: u8,
837    byte2: u8,
838}
839
840impl Prefilter for StartBytesTwo {
841    fn next_candidate(
842        &self,
843        _state: &mut PrefilterState,
844        haystack: &[u8],
845        at: usize,
846    ) -> Candidate {
847        memchr2(self.byte1, self.byte2, &haystack[at..])
848            .map(|i| at + i)
849            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
850    }
851
852    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
853        Box::new(self.clone())
854    }
855
856    fn heap_bytes(&self) -> usize {
857        0
858    }
859}
860
861/// A prefilter for scanning for three starting bytes.
862#[derive(Clone, Debug)]
863struct StartBytesThree {
864    byte1: u8,
865    byte2: u8,
866    byte3: u8,
867}
868
869impl Prefilter for StartBytesThree {
870    fn next_candidate(
871        &self,
872        _state: &mut PrefilterState,
873        haystack: &[u8],
874        at: usize,
875    ) -> Candidate {
876        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
877            .map(|i| at + i)
878            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
879    }
880
881    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
882        Box::new(self.clone())
883    }
884
885    fn heap_bytes(&self) -> usize {
886        0
887    }
888}
889
890/// Return the next candidate reported by the given prefilter while
891/// simultaneously updating the given prestate.
892///
893/// The caller is responsible for checking the prestate before deciding whether
894/// to initiate a search.
895#[inline]
896pub fn next<P: Prefilter>(
897    prestate: &mut PrefilterState,
898    prefilter: P,
899    haystack: &[u8],
900    at: usize,
901) -> Candidate {
902    let cand = prefilter.next_candidate(prestate, haystack, at);
903    match cand {
904        Candidate::None => {
905            prestate.update_skipped_bytes(haystack.len() - at);
906        }
907        Candidate::Match(ref m) => {
908            prestate.update_skipped_bytes(m.start() - at);
909        }
910        Candidate::PossibleStartOfMatch(i) => {
911            prestate.update_skipped_bytes(i - at);
912        }
913    }
914    cand
915}
916
917/// If the given byte is an ASCII letter, then return it in the opposite case.
918/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
919/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
920pub fn opposite_ascii_case(b: u8) -> u8 {
921    if b'A' <= b && b <= b'Z' {
922        b.to_ascii_lowercase()
923    } else if b'a' <= b && b <= b'z' {
924        b.to_ascii_uppercase()
925    } else {
926        b
927    }
928}
929
930/// Return the frequency rank of the given byte. The higher the rank, the more
931/// common the byte (heuristically speaking).
932fn freq_rank(b: u8) -> u8 {
933    use byte_frequencies::BYTE_FREQUENCIES;
934    BYTE_FREQUENCIES[b as usize]
935}
936
937#[cfg(test)]
938mod tests {
939    use super::*;
940
941    #[test]
942    fn scratch() {
943        let mut b = Builder::new(MatchKind::LeftmostFirst);
944        b.add(b"Sherlock");
945        b.add(b"locjaw");
946        // b.add(b"Sherlock");
947        // b.add(b"Holmes");
948        // b.add(b"Watson");
949        // b.add("Шерлок Холмс".as_bytes());
950        // b.add("Джон Уотсон".as_bytes());
951
952        let s = b.build().unwrap();
953        println!("{:?}", s);
954    }
955}