1use std::io::{self, Write, BufRead, Cursor};
2use std::char;
3use self::DecodeState::*;
4use self::DecodeErrKind::*;
5use io_support::{self, write_char, CharsError};
6use entities::*;
7
/// The reason a decode operation failed.
#[derive(Debug)]
pub enum DecodeErrKind {
    /// A named entity whose name was not found in the entity table, or an
    /// entity with an empty name (`&;`).
    UnknownEntity,

    /// A numeric escape (`&#...;` or `&#x...;`) that contains a character which
    /// is not a digit of the expected base, or whose digits could not be
    /// parsed as a `u32`.
    MalformedNumEscape,

    /// A numeric escape whose value is not a valid `char`.
    InvalidCharacter,

    /// The input ended while an entity was still being read.
    PrematureEnd,

    /// Reading the input or writing the output failed with the wrapped error.
    IoError(io::Error),

    /// The character reader reported that the input is not valid UTF-8.
    EncodingError,
}
31
32impl PartialEq for DecodeErrKind {
33 fn eq(&self, other: &DecodeErrKind) -> bool {
34 match (self, other) {
35 (&UnknownEntity, &UnknownEntity) => true,
36 (&MalformedNumEscape, &MalformedNumEscape) => true,
37 (&InvalidCharacter, &InvalidCharacter) => true,
38 (&PrematureEnd, &PrematureEnd) => true,
39 (&IoError(_), &IoError(_)) => true,
40 (&EncodingError, &EncodingError) => true,
41 _ => false
42 }
43 }
44}
45
46impl Eq for DecodeErrKind {}
47
48#[derive(Debug, Eq, PartialEq)]
50pub struct DecodeErr {
51 pub position: usize,
53 pub kind: DecodeErrKind
55}
56
/// States of the entity decoder's state machine.
#[derive(PartialEq, Eq)]
enum DecodeState {
    /// Outside of any entity; characters are copied to the output.
    Normal,
    /// An `&` has just been read.
    Entity,
    /// Reading the name of a named entity (`&name;`).
    Named,
    /// `&#` has been read; the next character selects decimal or hexadecimal.
    Numeric,
    /// Reading the hexadecimal digits of a `&#x...;` escape.
    Hex,
    /// Reading the decimal digits of a `&#...;` escape.
    Dec,
}
66
/// Evaluates to the `Ok` value of `$parse`.
///
/// On `Err(kind)` it returns early from the enclosing function with a
/// `DecodeErr` that carries `kind` and the position `$pos`.
macro_rules! try_parse {
    ($parse:expr, $pos:expr) => {
        match $parse {
            Ok(value) => value,
            Err(kind) => {
                return Err(DecodeErr {
                    position: $pos,
                    kind: kind,
                })
            }
        }
    };
}
74
/// Evaluates to the `Ok` value of the I/O result `$io`.
///
/// On `Err(e)` it returns early from the enclosing function with a `DecodeErr`
/// whose kind is `IoError(e)` and whose position is `$pos`.
macro_rules! try_dec_io {
    ($io:expr, $pos:expr) => {
        match $io {
            Ok(value) => value,
            Err(io_err) => {
                return Err(DecodeErr {
                    position: $pos,
                    kind: IoError(io_err),
                })
            }
        }
    };
}
82
83pub fn decode_html_rw<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
95 let mut state: DecodeState = Normal;
96 let mut pos = 0;
97 let mut good_pos = 0;
98 let mut buf = String::with_capacity(8);
99 for c in io_support::chars(reader) {
100 let c = match c {
101 Err(e) => {
102 let kind = match e {
103 CharsError::NotUtf8 => EncodingError,
104 CharsError::Other(io) => IoError(io)
105 };
106 return Err(DecodeErr{ position: pos, kind: kind });
107 }
108 Ok(c) => c
109 };
110 match state {
111 Normal if c == '&' => state = Entity,
112 Normal => try_dec_io!(write_char(writer, c), good_pos),
113 Entity if c == '#' => state = Numeric,
114 Entity if c == ';' => return Err(DecodeErr{ position: good_pos, kind: UnknownEntity }),
115 Entity => {
116 state = Named;
117 buf.push(c);
118 }
119 Named if c == ';' => {
120 state = Normal;
121 let ch = try_parse!(decode_named_entity(&buf), good_pos);
122 try_dec_io!(write_char(writer, ch), good_pos);
123 buf.clear();
124 }
125 Named => buf.push(c),
126 Numeric if is_digit(c) => {
127 state = Dec;
128 buf.push(c);
129 }
130 Numeric if c == 'x' => state = Hex,
131 Dec if c == ';' => {
132 state = Normal;
133 let ch = try_parse!(decode_numeric(&buf, 10), good_pos);
134 try_dec_io!(write_char(writer, ch), good_pos);
135 buf.clear();
136 }
137 Hex if c == ';' => {
138 state = Normal;
139 let ch = try_parse!(decode_numeric(&buf, 16), good_pos);
140 try_dec_io!(write_char(writer, ch), good_pos);
141 buf.clear();
142 }
143 Hex if is_hex_digit(c) => buf.push(c),
144 Dec if is_digit(c) => buf.push(c),
145 Numeric | Hex | Dec => return Err(DecodeErr{ position: good_pos, kind: MalformedNumEscape}),
146 }
147 pos += 1;
148 if state == Normal {
149 good_pos = pos;
150 }
151 }
152 if state != Normal {
153 Err(DecodeErr{ position: good_pos, kind: PrematureEnd})
154 } else {
155 Ok(())
156 }
157}
158
159pub fn decode_html(s: &str) -> Result<String, DecodeErr> {
175 let mut writer = Vec::with_capacity(s.len());
176 let bytes = s.as_bytes();
177 let mut reader = Cursor::new(bytes);
178 let res = decode_html_rw(&mut reader, &mut writer);
179 match res {
180 Ok(_) => Ok(String::from_utf8(writer).unwrap()),
181 Err(err) => Err(err)
182 }
183}
184
/// Returns `true` if `c` is an ASCII decimal digit (`0`–`9`).
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
186
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0`–`9`, `a`–`f` or `A`–`F`).
fn is_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
190
191fn decode_named_entity(entity: &str) -> Result<char, DecodeErrKind> {
192 match NAMED_ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) {
193 Err(..) => Err(UnknownEntity),
194 Ok(idx) => {
195 let (_, c) = NAMED_ENTITIES[idx];
196 Ok(c)
197 }
198 }
199}
200
201fn decode_numeric(esc: &str, radix: u32) -> Result<char, DecodeErrKind> {
202 match u32::from_str_radix(esc, radix) {
203 Ok(n) => match char::from_u32(n) {
204 Some(c) => Ok(c),
205 None => Err(InvalidCharacter)
206 },
207 Err(..) => Err(MalformedNumEscape)
208 }
209}
210