aho_corasick/buffer.rs
1use std::cmp;
2use std::io;
3use std::ptr;
4
5/// The default buffer capacity that we use for the stream buffer.
6const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB
7
8/// A fairly simple roll buffer for supporting stream searches.
9///
10/// This buffer acts as a temporary place to store a fixed amount of data when
11/// reading from a stream. Its central purpose is to allow "rolling" some
12/// suffix of the data to the beginning of the buffer before refilling it with
13/// more data from the stream. For example, let's say we are trying to match
14/// "foobar" on a stream. When we report the match, we'd like to not only
15/// report the correct offsets at which the match occurs, but also the matching
16/// bytes themselves. So let's say our stream is a file with the following
17/// contents: `test test foobar test test`. Now assume that we happen to read
18/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
19/// Naively, it would not be possible to report a single contiguous `foobar`
20/// match, but this roll buffer allows us to do that. Namely, after the second
21/// read, the contents of the buffer should be `st foobar test test`, where the
22/// search should ultimately resume immediately after `foo`. (The prefix `st `
23/// is included because the roll buffer saves N bytes at the end of the buffer,
24/// where N is the maximum possible length of a match.)
25///
26/// A lot of the logic for dealing with this is unfortunately split out between
27/// this roll buffer and the `StreamChunkIter`.
28#[derive(Debug)]
29pub struct Buffer {
30 /// The raw buffer contents. This has a fixed size and never increases.
31 buf: Vec<u8>,
32 /// The minimum size of the buffer, which is equivalent to the maximum
33 /// possible length of a match. This corresponds to the amount that we
34 /// roll
35 min: usize,
36 /// The end of the contents of this buffer.
37 end: usize,
38}
39
40impl Buffer {
41 /// Create a new buffer for stream searching. The minimum buffer length
42 /// given should be the size of the maximum possible match length.
43 pub fn new(min_buffer_len: usize) -> Buffer {
44 let min = cmp::max(1, min_buffer_len);
45 // The minimum buffer amount is also the amount that we roll our
46 // buffer in order to support incremental searching. To this end,
47 // our actual capacity needs to be at least 1 byte bigger than our
48 // minimum amount, otherwise we won't have any overlap. In actuality,
49 // we want our buffer to be a bit bigger than that for performance
50 // reasons, so we set a lower bound of `8 * min`.
51 //
52 // TODO: It would be good to find a way to test the streaming
53 // implementation with the minimal buffer size.
54 let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
55 Buffer { buf: vec![0; capacity], min, end: 0 }
56 }
57
58 /// Return the contents of this buffer.
59 #[inline]
60 pub fn buffer(&self) -> &[u8] {
61 &self.buf[..self.end]
62 }
63
64 /// Return the minimum size of the buffer. The only way a buffer may be
65 /// smaller than this is if the stream itself contains less than the
66 /// minimum buffer amount.
67 #[inline]
68 pub fn min_buffer_len(&self) -> usize {
69 self.min
70 }
71
72 /// Return the total length of the contents in the buffer.
73 #[inline]
74 pub fn len(&self) -> usize {
75 self.end
76 }
77
78 /// Return all free capacity in this buffer.
79 fn free_buffer(&mut self) -> &mut [u8] {
80 &mut self.buf[self.end..]
81 }
82
83 /// Refill the contents of this buffer by reading as much as possible into
84 /// this buffer's free capacity. If no more bytes could be read, then this
85 /// returns false. Otherwise, this reads until it has filled the buffer
86 /// past the minimum amount.
87 pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
88 let mut readany = false;
89 loop {
90 let readlen = rdr.read(self.free_buffer())?;
91 if readlen == 0 {
92 return Ok(readany);
93 }
94 readany = true;
95 self.end += readlen;
96 if self.len() >= self.min {
97 return Ok(true);
98 }
99 }
100 }
101
102 /// Roll the contents of the buffer so that the suffix of this buffer is
103 /// moved to the front and all other contents are dropped. The size of the
104 /// suffix corresponds precisely to the minimum buffer length.
105 ///
106 /// This should only be called when the entire contents of this buffer have
107 /// been searched.
108 pub fn roll(&mut self) {
109 let roll_start = self
110 .end
111 .checked_sub(self.min)
112 .expect("buffer capacity should be bigger than minimum amount");
113 let roll_len = self.min;
114
115 assert!(roll_start + roll_len <= self.end);
116 unsafe {
117 // SAFETY: A buffer contains Copy data, so there's no problem
118 // moving it around. Safety also depends on our indices being in
119 // bounds, which they always should be, given the assert above.
120 ptr::copy(
121 self.buf[roll_start..].as_ptr(),
122 self.buf.as_mut_ptr(),
123 roll_len,
124 );
125 }
126 self.end = roll_len;
127 }
128}