// regex/re_builder.rs

/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders defined in this file each wrap one `RegexOptions` value,
/// mutate it through their setter methods, and hand a clone of it to
/// `ExecBuilder::new_options` when `build` is called.
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The pattern strings to compile, in the order they were supplied.
    pub pats: Vec<String>,
    /// Approximate size limit, in bytes, of a single compiled program.
    pub size_limit: usize,
    /// Approximate size, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth permitted for the parsed abstract syntax tree.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal escape syntax (e.g. `\141`) is supported.
    pub octal: bool,
}
17
18impl Default for RegexOptions {
19    fn default() -> Self {
20        RegexOptions {
21            pats: vec![],
22            size_limit: 10 * (1 << 20),
23            dfa_size_limit: 2 * (1 << 20),
24            nest_limit: 250,
25            case_insensitive: false,
26            multi_line: false,
27            dot_matches_new_line: false,
28            swap_greed: false,
29            ignore_whitespace: false,
30            unicode: true,
31            octal: false,
32        }
33    }
34}
35
// Generates a public module containing a `RegexBuilder` type.
//
// * `$name`      - identifier of the generated module.
// * `$regex_mod` - module the `Regex` type is imported from
//                  (`use $regex_mod::Regex`).
// * `$only_utf8` - expression forwarded to `ExecBuilder::only_utf8` when the
//                  builder's `build` method runs.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use error::Error;
            use exec::ExecBuilder;

            use $regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for example, by
            /// setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Clone, Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given pattern.
                ///
                /// All other options start at their `RegexOptions::default()` values.
                ///
                /// If the pattern is invalid, then an error will be returned when
                /// `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut builder = RegexBuilder(RegexOptions::default());
                    builder.0.pats.push(pattern.to_owned());
                    builder
                }

                /// Compile the regular expression using the options set so far.
                ///
                /// The builder is only borrowed: its options are cloned for the
                /// compilation, so the builder can be adjusted and used again
                /// afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will produce the
                /// pattern given to `new` verbatim. Notably, it will not incorporate any
                /// of the flags set on this builder.
                ///
                /// # Errors
                ///
                /// Returns an `Error` if compilation fails, for example because
                /// the pattern is invalid or the compiled program exceeds the
                /// configured size limit.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper case and
                /// lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches the
                /// end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in `.` matches
                /// anything when `s` is set and matches anything except for new line when
                /// it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is disabled and
                /// means "any valid UTF-8 encoding of any Unicode scalar value" when
                /// Unicode is enabled.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be ignored
                /// between expressions of the pattern, and `#` can be used to start a
                /// comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as `\w` only
                /// match ASCII word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
229
230define_builder!(bytes, re_bytes, false);
231define_builder!(unicode, re_unicode, true);
232
// Generates a public module containing a `RegexSetBuilder` type.
//
// * `$name`      - identifier of the generated module.
// * `$regex_mod` - submodule of `re_set` the `RegexSet` type is imported from
//                  (`use re_set::$regex_mod::RegexSet`).
// * `$only_utf8` - expression forwarded to `ExecBuilder::only_utf8` when the
//                  builder's `build` method runs.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use error::Error;
            use exec::ExecBuilder;

            use re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built, for example,
            /// by setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Clone, Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// The patterns are stored in iteration order. All other options
                /// start at their `RegexOptions::default()` values.
                ///
                /// If any pattern is invalid, then an error will be returned when
                /// `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder.0.pats.extend(
                        patterns.into_iter().map(|pat| pat.as_ref().to_owned()),
                    );
                    builder
                }

                /// Compile the regular expressions into a set using the options
                /// set so far.
                ///
                /// The builder is only borrowed: its options are cloned for the
                /// compilation, so the builder can be adjusted and used again
                /// afterwards.
                ///
                /// # Errors
                ///
                /// Returns an `Error` if compilation fails, for example because
                /// a pattern is invalid or the compiled program exceeds the
                /// configured size limit.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                pub fn multi_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in `.` matches
                /// anything when `s` is set and matches anything except for new line when
                /// it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
                /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
                /// expressions.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                pub fn swap_greed(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(
                    &mut self,
                    limit: u32,
                ) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
417
418define_set_builder!(set_bytes, bytes, false);
419define_set_builder!(set_unicode, unicode, true);