htmlescape/
io_support.rs

1use std::io::{Write, Read, Error, ErrorKind};
2use std::io;
3use std::fmt;
4
5pub fn write_char<W: Write>(writer: &mut W, c: char) -> io::Result<()> {
6    let mut buf = [0u8;4];
7    let utf8 = encode_char_utf8(c, &mut buf);
8    writer.write_all(utf8)
9}
10
/// Encodes `c` as UTF-8 into the front of `buf` and returns the sub-slice
/// holding the encoded bytes (between one and four bytes long).
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoding of `c`.
fn encode_char_utf8<'a>(c: char, buf: &'a mut [u8]) -> &'a [u8] {
    let code = c as u32;
    // Continuation byte: marker bits `10` followed by six payload bits taken
    // from `code` after shifting it right by `shift`.
    let cont = |shift: u32| 0b1000_0000 | ((code >> shift) & 0b0011_1111) as u8;

    // Bytes are written last-to-first, so a too-short buffer panics on the
    // highest index before anything has been stored.
    let len = if code < 0x80 {
        buf[0] = code as u8;
        1
    } else if code < 0x800 {
        buf[1] = cont(0);
        buf[0] = 0b1100_0000 | ((code >> 6) & 0b0001_1111) as u8;
        2
    } else if code < 0x1_0000 {
        buf[2] = cont(0);
        buf[1] = cont(6);
        buf[0] = 0b1110_0000 | ((code >> 12) & 0b0000_1111) as u8;
        3
    } else {
        buf[3] = cont(0);
        buf[2] = cont(6);
        buf[1] = cont(12);
        buf[0] = 0b1111_0000 | ((code >> 18) & 0b0000_0111) as u8;
        4
    };
    &buf[..len]
}
33
/// Returns the total length in bytes of the UTF-8 sequence announced by the
/// lead byte `first`, or `0` when `first` cannot start a sequence.
///
/// Only the pattern of leading one bits is inspected:
/// none means 1 byte, two mean 2 bytes, three mean 3 bytes, four mean 4 bytes.
/// A single leading one (a continuation byte) or five or more yield `0`.
fn utf8_char_bytes(first: u8) -> usize {
    // Inverting the byte turns its leading ones into leading zeros, which
    // can be counted directly.
    match (!first).leading_zeros() {
        0 => 1,
        2 => 2,
        3 => 3,
        4 => 4,
        _ => 0,
    }
}
47
48
// `Chars` code copied and modified from `std`.
// The reason for doing this is that when using `Chars` from `std`, `read_one_byte()` isn't inlined.
// This is very painful when the backing reader is just a `Cursor` over a byte array,
// where `read_one_byte()` shouldn't amount to much more than `return buf[idx++]`
// (plus a check for the end, of course).
53
/// Iterator adapter that decodes the bytes of a reader into `char`s.
///
/// Created by [`chars`]. When `R` implements `Read`, iterating yields
/// `Result<char, CharsError>` items.
#[derive(Debug)]
pub struct Chars<R> {
    /// The wrapped byte source.
    inner: R,
}
57
58pub fn chars<R: Read>(reader: R) -> Chars<R> {
59    Chars { inner: reader }
60}
61
/// Error produced while iterating over a [`Chars`].
#[derive(Debug)]
pub enum CharsError {
    /// The bytes were read without an I/O failure, but they do not form
    /// valid UTF-8.
    NotUtf8,

    /// Reading from the underlying stream failed; the I/O error is attached.
    Other(io::Error),
}
71
72impl<R: Read> Iterator for Chars<R> {
73    type Item = Result<char, CharsError>;
74
75    fn next(&mut self) -> Option<Result<char, CharsError>> {
76        let first_byte = match read_a_byte(&mut self.inner) {
77            None => return None,
78            Some(Ok(b)) => b,
79            Some(Err(e)) => return Some(Err(CharsError::Other(e))),
80        };
81        let width = utf8_char_bytes(first_byte);
82        if width == 1 { return Some(Ok(first_byte as char)) }
83        if width == 0 { return Some(Err(CharsError::NotUtf8)) }
84        let mut buf = [first_byte, 0, 0, 0];
85        {
86            let mut start = 1;
87            while start < width {
88                match self.inner.read(&mut buf[start..width]) {
89                    Ok(0) => return Some(Err(CharsError::NotUtf8)),
90                    Ok(n) => start += n,
91                    Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
92                    Err(e) => return Some(Err(CharsError::Other(e))),
93                }
94            }
95        }
96        Some(match ::std::str::from_utf8(&buf[..width]).ok() {
97            Some(s) => Ok(s.chars().next().unwrap()),
98            None => Err(CharsError::NotUtf8),
99        })
100    }
101}
102
/// Reads exactly one byte from `reader`.
///
/// Returns `None` when the reader reports end of input (a zero-length read),
/// `Some(Ok(byte))` on success, and `Some(Err(e))` for any I/O error other
/// than `ErrorKind::Interrupted`, which is retried.
fn read_a_byte<R: Read>(reader: &mut R) -> Option<io::Result<u8>> {
    let mut slot = [0u8; 1];
    loop {
        match reader.read(&mut slot) {
            Ok(0) => return None,
            Ok(_) => return Some(Ok(slot[0])),
            // Retry: fall through to the next loop iteration.
            Err(ref err) if err.kind() == ErrorKind::Interrupted => {}
            Err(err) => return Some(Err(err)),
        }
    }
}
114
115impl fmt::Display for CharsError {
116    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
117        match *self {
118            CharsError::NotUtf8 => {
119                "byte stream did not contain valid utf8".fmt(f)
120            }
121            CharsError::Other(ref e) => e.fmt(f),
122        }
123    }
124}
125
#[cfg(test)]
mod test {

    use super::encode_char_utf8;

    /// Asserts that encoding `c` into a fresh 4-byte buffer yields exactly
    /// `expected`.
    fn do_test_encode_char_utf8(c: char, expected: &[u8]) {
        let mut scratch = [0u8; 4];
        assert_eq!(encode_char_utf8(c, &mut scratch), expected);
    }

    #[test]
    fn test_encode_char_utf8() {
        // One sample for each encoded length, from one byte up to four.
        let cases: &[(char, &[u8])] = &[
            ('$', &[0x24]),
            ('¢', &[0xc2, 0xa2]),
            ('€', &[0xe2, 0x82, 0xac]),
            ('\u{10348}', &[0xf0, 0x90, 0x8d, 0x88]),
        ];
        for &(c, expected) in cases.iter() {
            do_test_encode_char_utf8(c, expected);
        }
    }
}