htmlescape/
decode.rs

1use std::io::{self, Write, BufRead, Cursor};
2use std::char;
3use self::DecodeState::*;
4use self::DecodeErrKind::*;
5use io_support::{self, write_char, CharsError};
6use entities::*;
7
/// The reason a decoding operation failed.
#[derive(Debug)]
pub enum DecodeErrKind {
    /// The input referenced a named entity that does not exist.
    ///
    /// Example: `&thisentitydoesnotexist;`
    UnknownEntity,

    /// A numeric escape (`&#` or `&#x`) contained a character that is not a valid digit.
    ///
    /// Examples: `&#32a;`, `&#xfoo;`
    MalformedNumEscape,

    /// A numeric escape (`&#` or `&#x`) resolved to a value that is not a valid
    /// Unicode code point.
    ///
    /// Example: `&#xffffff;`
    InvalidCharacter,

    /// The input ended too early, i.e. in the middle of an entity that was never
    /// terminated.
    PrematureEnd,

    /// An IO error occurred.
    IoError(io::Error),

    /// The supplied reader produced invalid UTF-8.
    EncodingError,
}
31
32impl PartialEq for DecodeErrKind {
33    fn eq(&self, other: &DecodeErrKind) -> bool {
34        match (self, other) {
35            (&UnknownEntity, &UnknownEntity) => true,
36            (&MalformedNumEscape, &MalformedNumEscape) => true,
37            (&InvalidCharacter, &InvalidCharacter) => true,
38            (&PrematureEnd, &PrematureEnd) => true,
39            (&IoError(_), &IoError(_)) => true,
40            (&EncodingError, &EncodingError) => true,
41            _ => false
42        }
43    }
44}
45
46impl Eq for DecodeErrKind {}
47
48/// Error from decoding a entity-encoded string.
49#[derive(Debug, Eq, PartialEq)]
50pub struct DecodeErr {
51    /// Number of characters read from the input before encountering an error
52    pub position: usize,
53    /// Type of error
54    pub kind: DecodeErrKind
55}
56
/// States of the entity decoder's state machine.
#[derive(Eq, PartialEq)]
enum DecodeState {
    /// Outside of any entity; characters are copied through unchanged.
    Normal,
    /// An `&` has just been read.
    Entity,
    /// Inside the name of a named entity.
    Named,
    /// `&#` has been read; the next character decides between decimal and hex.
    Numeric,
    /// Inside the digits of a hexadecimal escape.
    Hex,
    /// Inside the digits of a decimal escape.
    Dec,
}
66
/// Evaluates to the `Ok` value of `$parse`. On `Err(kind)` it returns early from
/// the enclosing function with a `DecodeErr` of that kind at position `$pos`.
macro_rules! try_parse {
    ($parse:expr, $pos:expr) => {
        match $parse {
            Ok(value) => value,
            Err(kind) => return Err(DecodeErr { position: $pos, kind: kind }),
        }
    };
}
74
/// Evaluates to the `Ok` value of the IO result `$io`. On `Err(e)` it returns
/// early from the enclosing function with a `DecodeErr` whose kind is
/// `IoError(e)` and whose position is `$pos`.
macro_rules! try_dec_io {
    ($io:expr, $pos:expr) => {
        match $io {
            Ok(value) => value,
            Err(cause) => return Err(DecodeErr { position: $pos, kind: IoError(cause) }),
        }
    };
}
82
83/// Decodes an entity-encoded string from a reader to a writer.
84///
85/// Similar to `decode_html`, except reading from a reader rather than a string, and
86/// writing to a writer rather than returning a `String`.
87///
88/// # Arguments
89/// - `reader` - UTF-8 encoded data is read from here.
90/// - `writer` - UTF8- decoded data is written to here.
91///
92/// # Errors
93/// Errors can be caused by IO errors, `reader` producing invalid UTF-8, or by syntax errors.
94pub fn decode_html_rw<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
95    let mut state: DecodeState = Normal;
96    let mut pos = 0;
97    let mut good_pos = 0;
98    let mut buf = String::with_capacity(8);
99    for c in io_support::chars(reader) {
100        let c = match c {
101            Err(e) => {
102                let kind = match e {
103                    CharsError::NotUtf8   => EncodingError,
104                    CharsError::Other(io) => IoError(io)
105                };
106                return Err(DecodeErr{ position: pos, kind: kind });
107            }
108            Ok(c) => c
109        };
110        match state {
111            Normal if c == '&' => state = Entity,
112            Normal => try_dec_io!(write_char(writer, c), good_pos),
113            Entity if c == '#' => state = Numeric,
114            Entity if c == ';' => return Err(DecodeErr{ position: good_pos, kind: UnknownEntity }),
115            Entity => {
116                state = Named;
117                buf.push(c);
118            }
119            Named if c == ';' => {
120                state = Normal;
121                let ch = try_parse!(decode_named_entity(&buf), good_pos);
122                try_dec_io!(write_char(writer, ch), good_pos);
123                buf.clear();
124            }
125            Named => buf.push(c),
126            Numeric if is_digit(c) => {
127                state = Dec;
128                buf.push(c);
129            }
130            Numeric if c == 'x' => state = Hex,
131            Dec if c == ';' => {
132                state = Normal;
133                let ch = try_parse!(decode_numeric(&buf, 10), good_pos);
134                try_dec_io!(write_char(writer, ch), good_pos);
135                buf.clear();
136            }
137            Hex if c == ';' => {
138                state = Normal;
139                let ch = try_parse!(decode_numeric(&buf, 16), good_pos);
140                try_dec_io!(write_char(writer, ch), good_pos);
141                buf.clear();
142            }
143            Hex if is_hex_digit(c) => buf.push(c),
144            Dec if is_digit(c) => buf.push(c),
145            Numeric | Hex | Dec => return Err(DecodeErr{ position: good_pos, kind: MalformedNumEscape}),
146        }
147        pos += 1;
148        if state == Normal {
149            good_pos = pos;
150        }
151    }
152    if state != Normal {
153        Err(DecodeErr{ position: good_pos, kind: PrematureEnd})
154    } else {
155        Ok(())
156    }
157}
158
159/// Decodes an entity-encoded string.
160///
161/// Decodes an entity encoded string, replacing HTML entities (`&amp;`, `&#20;` ...) with the
162/// the corresponding character. Case matters for named entities, ie. `&Amp;` is invalid.
163/// Case does not matter for hex entities, so `&#x2E;` and `&#x2e;` are treated the same.
164///
165/// # Arguments
166/// - `s` - Entity-encoded string to decode.
167///
168/// # Failure
169/// The function will fail if input string contains invalid named entities (eg. `&nosuchentity;`),
170/// invalid hex entities (eg. `&#xRT;`), invalid decimal entities (eg. `&#-1;), unclosed entities
171/// (`s == "&amp hej och hå"`) or otherwise malformed entities.
172///
173/// This function will never return errors with `kind` set to `IoError` or `EncodingError`.
174pub fn decode_html(s: &str) -> Result<String, DecodeErr> {
175    let mut writer = Vec::with_capacity(s.len());
176    let bytes = s.as_bytes();
177    let mut reader = Cursor::new(bytes);
178    let res = decode_html_rw(&mut reader, &mut writer);
179    match res {
180        Ok(_) => Ok(String::from_utf8(writer).unwrap()),
181        Err(err) => Err(err)
182    }
183}
184
/// Returns `true` if `c` is an ASCII decimal digit (`0` through `9`).
fn is_digit(c: char) -> bool {
    c.is_digit(10)
}
186
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f` or `A`-`F`).
fn is_hex_digit(c: char) -> bool {
    c.is_digit(16)
}
190
191fn decode_named_entity(entity: &str) -> Result<char, DecodeErrKind> {
192    match NAMED_ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) {
193        Err(..) => Err(UnknownEntity),
194        Ok(idx) => {
195            let (_, c) = NAMED_ENTITIES[idx];
196            Ok(c)
197        }
198    }
199}
200
201fn decode_numeric(esc: &str, radix: u32) -> Result<char, DecodeErrKind> {
202    match u32::from_str_radix(esc, radix) {
203        Ok(n) => match char::from_u32(n) {
204            Some(c) => Ok(c),
205            None => Err(InvalidCharacter)
206        },
207        Err(..) => Err(MalformedNumEscape)
208    }
209}
210