regex_syntax/hir/
print.rs

1/*!
2This module provides a regular expression printer for `Hir`.
3*/
4
5use std::fmt;
6
7use hir::visitor::{self, Visitor};
8use hir::{self, Hir, HirKind};
9use is_meta_character;
10
11/// A builder for constructing a printer.
12///
13/// Note that since a printer doesn't have any configuration knobs, this type
14/// remains unexported.
15#[derive(Clone, Debug)]
16struct PrinterBuilder {
17    _priv: (),
18}
19
20impl Default for PrinterBuilder {
21    fn default() -> PrinterBuilder {
22        PrinterBuilder::new()
23    }
24}
25
26impl PrinterBuilder {
27    fn new() -> PrinterBuilder {
28        PrinterBuilder { _priv: () }
29    }
30
31    fn build(&self) -> Printer {
32        Printer { _priv: () }
33    }
34}
35
36/// A printer for a regular expression's high-level intermediate
37/// representation.
38///
39/// A printer converts a high-level intermediate representation (HIR) to a
40/// regular expression pattern string. This particular printer uses constant
41/// stack space and heap space proportional to the size of the HIR.
42///
43/// Since this printer is only using the HIR, the pattern it prints will likely
44/// not resemble the original pattern at all. For example, a pattern like
45/// `\pL` will have its entire class written out.
46///
47/// The purpose of this printer is to provide a means to mutate an HIR and then
48/// build a regular expression from the result of that mutation. (A regex
49/// library could provide a constructor from this HIR explicitly, but that
50/// creates an unnecessary public coupling between the regex library and this
51/// specific HIR representation.)
52#[derive(Debug)]
53pub struct Printer {
54    _priv: (),
55}
56
57impl Printer {
58    /// Create a new printer.
59    pub fn new() -> Printer {
60        PrinterBuilder::new().build()
61    }
62
63    /// Print the given `Ast` to the given writer. The writer must implement
64    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66    /// implementations) or a `&mut String`.
67    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68        visitor::visit(hir, Writer { printer: self, wtr: wtr })
69    }
70}
71
72#[derive(Debug)]
73struct Writer<'p, W> {
74    printer: &'p mut Printer,
75    wtr: W,
76}
77
78impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
79    type Output = ();
80    type Err = fmt::Error;
81
82    fn finish(self) -> fmt::Result {
83        Ok(())
84    }
85
86    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
87        match *hir.kind() {
88            HirKind::Empty
89            | HirKind::Repetition(_)
90            | HirKind::Concat(_)
91            | HirKind::Alternation(_) => {}
92            HirKind::Literal(hir::Literal::Unicode(c)) => {
93                self.write_literal_char(c)?;
94            }
95            HirKind::Literal(hir::Literal::Byte(b)) => {
96                self.write_literal_byte(b)?;
97            }
98            HirKind::Class(hir::Class::Unicode(ref cls)) => {
99                self.wtr.write_str("[")?;
100                for range in cls.iter() {
101                    if range.start() == range.end() {
102                        self.write_literal_char(range.start())?;
103                    } else {
104                        self.write_literal_char(range.start())?;
105                        self.wtr.write_str("-")?;
106                        self.write_literal_char(range.end())?;
107                    }
108                }
109                self.wtr.write_str("]")?;
110            }
111            HirKind::Class(hir::Class::Bytes(ref cls)) => {
112                self.wtr.write_str("(?-u:[")?;
113                for range in cls.iter() {
114                    if range.start() == range.end() {
115                        self.write_literal_class_byte(range.start())?;
116                    } else {
117                        self.write_literal_class_byte(range.start())?;
118                        self.wtr.write_str("-")?;
119                        self.write_literal_class_byte(range.end())?;
120                    }
121                }
122                self.wtr.write_str("])")?;
123            }
124            HirKind::Anchor(hir::Anchor::StartLine) => {
125                self.wtr.write_str("(?m:^)")?;
126            }
127            HirKind::Anchor(hir::Anchor::EndLine) => {
128                self.wtr.write_str("(?m:$)")?;
129            }
130            HirKind::Anchor(hir::Anchor::StartText) => {
131                self.wtr.write_str(r"\A")?;
132            }
133            HirKind::Anchor(hir::Anchor::EndText) => {
134                self.wtr.write_str(r"\z")?;
135            }
136            HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
137                self.wtr.write_str(r"\b")?;
138            }
139            HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
140                self.wtr.write_str(r"\B")?;
141            }
142            HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
143                self.wtr.write_str(r"(?-u:\b)")?;
144            }
145            HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
146                self.wtr.write_str(r"(?-u:\B)")?;
147            }
148            HirKind::Group(ref x) => match x.kind {
149                hir::GroupKind::CaptureIndex(_) => {
150                    self.wtr.write_str("(")?;
151                }
152                hir::GroupKind::CaptureName { ref name, .. } => {
153                    write!(self.wtr, "(?P<{}>", name)?;
154                }
155                hir::GroupKind::NonCapturing => {
156                    self.wtr.write_str("(?:")?;
157                }
158            },
159        }
160        Ok(())
161    }
162
163    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
164        match *hir.kind() {
165            // Handled during visit_pre
166            HirKind::Empty
167            | HirKind::Literal(_)
168            | HirKind::Class(_)
169            | HirKind::Anchor(_)
170            | HirKind::WordBoundary(_)
171            | HirKind::Concat(_)
172            | HirKind::Alternation(_) => {}
173            HirKind::Repetition(ref x) => {
174                match x.kind {
175                    hir::RepetitionKind::ZeroOrOne => {
176                        self.wtr.write_str("?")?;
177                    }
178                    hir::RepetitionKind::ZeroOrMore => {
179                        self.wtr.write_str("*")?;
180                    }
181                    hir::RepetitionKind::OneOrMore => {
182                        self.wtr.write_str("+")?;
183                    }
184                    hir::RepetitionKind::Range(ref x) => match *x {
185                        hir::RepetitionRange::Exactly(m) => {
186                            write!(self.wtr, "{{{}}}", m)?;
187                        }
188                        hir::RepetitionRange::AtLeast(m) => {
189                            write!(self.wtr, "{{{},}}", m)?;
190                        }
191                        hir::RepetitionRange::Bounded(m, n) => {
192                            write!(self.wtr, "{{{},{}}}", m, n)?;
193                        }
194                    },
195                }
196                if !x.greedy {
197                    self.wtr.write_str("?")?;
198                }
199            }
200            HirKind::Group(_) => {
201                self.wtr.write_str(")")?;
202            }
203        }
204        Ok(())
205    }
206
207    fn visit_alternation_in(&mut self) -> fmt::Result {
208        self.wtr.write_str("|")
209    }
210}
211
212impl<'p, W: fmt::Write> Writer<'p, W> {
213    fn write_literal_char(&mut self, c: char) -> fmt::Result {
214        if is_meta_character(c) {
215            self.wtr.write_str("\\")?;
216        }
217        self.wtr.write_char(c)
218    }
219
220    fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
221        let c = b as char;
222        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
223            self.write_literal_char(c)
224        } else {
225            write!(self.wtr, "(?-u:\\x{:02X})", b)
226        }
227    }
228
229    fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
230        let c = b as char;
231        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
232            self.write_literal_char(c)
233        } else {
234            write!(self.wtr, "\\x{:02X}", b)
235        }
236    }
237}
238
#[cfg(test)]
mod tests {
    use super::Printer;
    use ParserBuilder;

    /// Round trips `given` using a parser with default settings.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Round trips `given` using a parser that permits invalid UTF-8.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Parses `given` with a parser configured by `f`, prints the resulting
    /// HIR, checks that the printed pattern parses with the same
    /// configuration, and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);

        let parsed = builder.build().parse(given).unwrap();
        let mut printed = String::new();
        Printer::new().print(&parsed, &mut printed).unwrap();

        // The printed pattern must itself be a valid pattern.
        builder.build().parse(&printed).unwrap();

        assert_eq!(expected, printed);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        let unicode_cases = [
            (r"[a]", r"[a]"),
            (r"[a-z]", r"[a-z]"),
            (r"[a-z--b-c--x-y]", r"[ad-wz]"),
            (r"[^\x01-\u{10FFFF}]", "[\u{0}]"),
            (r"[-]", r"[\-]"),
            (r"[☃-⛄]", r"[☃-⛄]"),
        ];
        for &(given, expected) in unicode_cases.iter() {
            roundtrip(given, expected);
        }

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // Meta characters inside a Unicode class must come out escaped.
        let escaped_unicode_cases = [
            (r"[\[]", r"[\[]"),
            (r"[Z-_]", r"[Z-_]"),
            (r"[Z-_--Z]", r"[\[-_]"),
        ];
        for &(given, expected) in escaped_unicode_cases.iter() {
            roundtrip(given, expected);
        }

        // Meta characters inside a byte oriented class must come out
        // escaped as well.
        let escaped_byte_cases = [
            (r"(?-u)[\[]", r"(?-u:[\[])"),
            (r"(?-u)[Z-_]", r"(?-u:[Z-_])"),
            (r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"),
        ];
        for &(given, expected) in escaped_byte_cases.iter() {
            roundtrip_bytes(given, expected);
        }
    }

    #[test]
    fn print_anchor() {
        let cases = [
            (r"^", r"\A"),
            (r"$", r"\z"),
            (r"(?m)^", r"(?m:^)"),
            (r"(?m)$", r"(?m:$)"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        let cases = [
            ("a?", "a?"),
            ("a??", "a??"),
            ("(?U)a?", "a??"),
            ("a*", "a*"),
            ("a*?", "a*?"),
            ("(?U)a*", "a*?"),
            ("a+", "a+"),
            ("a+?", "a+?"),
            ("(?U)a+", "a+?"),
            ("a{1}", "a{1}"),
            ("a{1,}", "a{1,}"),
            ("a{1,5}", "a{1,5}"),
            ("a{1}?", "a{1}?"),
            ("a{1,}?", "a{1,}?"),
            ("a{1,5}?", "a{1,5}?"),
            ("(?U)a{1}", "a{1}?"),
            ("(?U)a{1,}", "a{1,}?"),
            ("(?U)a{1,5}", "a{1,5}?"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_group() {
        let cases = [
            ("()", "()"),
            ("(?P<foo>)", "(?P<foo>)"),
            ("(?:)", "(?:)"),
            ("(a)", "(a)"),
            ("(?P<foo>a)", "(?P<foo>a)"),
            ("(?:a)", "(?:a)"),
            ("((((a))))", "((((a))))"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_alternation() {
        let cases = [
            ("|", "|"),
            ("||", "||"),
            ("a|b", "a|b"),
            ("a|b|c", "a|b|c"),
            ("foo|bar|quux", "foo|bar|quux"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }
}