Skip to content

Commit 9554502

Browse files
committed
Move char decoding iterators into a separate private module.
1 parent 9396924 commit 9554502

File tree

3 files changed

+265
-249
lines changed

3 files changed

+265
-249
lines changed

src/libcore/char/decode.rs

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
//! UTF-8 and UTF-16 decoding iterators
12+
13+
use fmt;
14+
use iter::FusedIterator;
15+
use super::from_u32_unchecked;
16+
17+
/// An iterator over an iterator of bytes of the characters the bytes represent
18+
/// as UTF-8
19+
#[unstable(feature = "decode_utf8", issue = "33906")]
20+
#[derive(Clone, Debug)]
21+
pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
22+
23+
/// Decodes an `Iterator` of bytes as UTF-8.
24+
#[unstable(feature = "decode_utf8", issue = "33906")]
25+
#[inline]
26+
pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
27+
DecodeUtf8(i.into_iter().peekable())
28+
}
29+
30+
/// `<DecodeUtf8 as Iterator>::next` returns this for an invalid input sequence.
31+
#[unstable(feature = "decode_utf8", issue = "33906")]
32+
#[derive(PartialEq, Eq, Debug)]
33+
pub struct InvalidSequence(());
34+
35+
#[unstable(feature = "decode_utf8", issue = "33906")]
36+
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
37+
type Item = Result<char, InvalidSequence>;
38+
#[inline]
39+
40+
fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
41+
self.0.next().map(|first_byte| {
42+
// Emit InvalidSequence according to
43+
// Unicode §5.22 Best Practice for U+FFFD Substitution
44+
// http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630
45+
46+
// Roughly: consume at least one byte,
47+
// then validate one byte at a time and stop before the first unexpected byte
48+
// (which might be the valid start of the next byte sequence).
49+
50+
let mut code_point;
51+
macro_rules! first_byte {
52+
($mask: expr) => {
53+
code_point = u32::from(first_byte & $mask)
54+
}
55+
}
56+
macro_rules! continuation_byte {
57+
() => { continuation_byte!(0x80...0xBF) };
58+
($range: pat) => {
59+
match self.0.peek() {
60+
Some(&byte @ $range) => {
61+
code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
62+
self.0.next();
63+
}
64+
_ => return Err(InvalidSequence(()))
65+
}
66+
}
67+
}
68+
69+
match first_byte {
70+
0x00...0x7F => {
71+
first_byte!(0b1111_1111);
72+
}
73+
0xC2...0xDF => {
74+
first_byte!(0b0001_1111);
75+
continuation_byte!();
76+
}
77+
0xE0 => {
78+
first_byte!(0b0000_1111);
79+
continuation_byte!(0xA0...0xBF); // 0x80...0x9F here are overlong
80+
continuation_byte!();
81+
}
82+
0xE1...0xEC | 0xEE...0xEF => {
83+
first_byte!(0b0000_1111);
84+
continuation_byte!();
85+
continuation_byte!();
86+
}
87+
0xED => {
88+
first_byte!(0b0000_1111);
89+
continuation_byte!(0x80...0x9F); // 0xA0..0xBF here are surrogates
90+
continuation_byte!();
91+
}
92+
0xF0 => {
93+
first_byte!(0b0000_0111);
94+
continuation_byte!(0x90...0xBF); // 0x80..0x8F here are overlong
95+
continuation_byte!();
96+
continuation_byte!();
97+
}
98+
0xF1...0xF3 => {
99+
first_byte!(0b0000_0111);
100+
continuation_byte!();
101+
continuation_byte!();
102+
continuation_byte!();
103+
}
104+
0xF4 => {
105+
first_byte!(0b0000_0111);
106+
continuation_byte!(0x80...0x8F); // 0x90..0xBF here are beyond char::MAX
107+
continuation_byte!();
108+
continuation_byte!();
109+
}
110+
_ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX
111+
}
112+
unsafe {
113+
Ok(from_u32_unchecked(code_point))
114+
}
115+
})
116+
}
117+
118+
#[inline]
119+
fn size_hint(&self) -> (usize, Option<usize>) {
120+
let (lower, upper) = self.0.size_hint();
121+
122+
// A code point is at most 4 bytes long.
123+
let min_code_points = lower / 4;
124+
125+
(min_code_points, upper)
126+
}
127+
}
128+
129+
#[unstable(feature = "decode_utf8", issue = "33906")]
130+
impl<I: FusedIterator<Item = u8>> FusedIterator for DecodeUtf8<I> {}
131+
132+
/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
133+
#[stable(feature = "decode_utf16", since = "1.9.0")]
134+
#[derive(Clone, Debug)]
135+
pub struct DecodeUtf16<I>
136+
where I: Iterator<Item = u16>
137+
{
138+
iter: I,
139+
buf: Option<u16>,
140+
}
141+
142+
/// An error that can be returned when decoding UTF-16 code points.
143+
#[stable(feature = "decode_utf16", since = "1.9.0")]
144+
#[derive(Debug, Clone, Eq, PartialEq)]
145+
pub struct DecodeUtf16Error {
146+
code: u16,
147+
}
148+
149+
/// Create an iterator over the UTF-16 encoded code points in `iter`,
150+
/// returning unpaired surrogates as `Err`s.
151+
///
152+
/// # Examples
153+
///
154+
/// Basic usage:
155+
///
156+
/// ```
157+
/// use std::char::decode_utf16;
158+
///
159+
/// fn main() {
160+
/// // 𝄞mus<invalid>ic<invalid>
161+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
162+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
163+
/// 0xD834];
164+
///
165+
/// assert_eq!(decode_utf16(v.iter().cloned())
166+
/// .map(|r| r.map_err(|e| e.unpaired_surrogate()))
167+
/// .collect::<Vec<_>>(),
168+
/// vec![Ok('𝄞'),
169+
/// Ok('m'), Ok('u'), Ok('s'),
170+
/// Err(0xDD1E),
171+
/// Ok('i'), Ok('c'),
172+
/// Err(0xD834)]);
173+
/// }
174+
/// ```
175+
///
176+
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
177+
///
178+
/// ```
179+
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
180+
///
181+
/// fn main() {
182+
/// // 𝄞mus<invalid>ic<invalid>
183+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
184+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
185+
/// 0xD834];
186+
///
187+
/// assert_eq!(decode_utf16(v.iter().cloned())
188+
/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
189+
/// .collect::<String>(),
190+
/// "𝄞mus�ic�");
191+
/// }
192+
/// ```
193+
#[stable(feature = "decode_utf16", since = "1.9.0")]
194+
#[inline]
195+
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
196+
DecodeUtf16 {
197+
iter: iter.into_iter(),
198+
buf: None,
199+
}
200+
}
201+
202+
#[stable(feature = "decode_utf16", since = "1.9.0")]
203+
impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
204+
type Item = Result<char, DecodeUtf16Error>;
205+
206+
fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> {
207+
let u = match self.buf.take() {
208+
Some(buf) => buf,
209+
None => self.iter.next()?
210+
};
211+
212+
if u < 0xD800 || 0xDFFF < u {
213+
// not a surrogate
214+
Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
215+
} else if u >= 0xDC00 {
216+
// a trailing surrogate
217+
Some(Err(DecodeUtf16Error { code: u }))
218+
} else {
219+
let u2 = match self.iter.next() {
220+
Some(u2) => u2,
221+
// eof
222+
None => return Some(Err(DecodeUtf16Error { code: u })),
223+
};
224+
if u2 < 0xDC00 || u2 > 0xDFFF {
225+
// not a trailing surrogate so we're not a valid
226+
// surrogate pair, so rewind to redecode u2 next time.
227+
self.buf = Some(u2);
228+
return Some(Err(DecodeUtf16Error { code: u }));
229+
}
230+
231+
// all ok, so lets decode it.
232+
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
233+
Some(Ok(unsafe { from_u32_unchecked(c) }))
234+
}
235+
}
236+
237+
#[inline]
238+
fn size_hint(&self) -> (usize, Option<usize>) {
239+
let (low, high) = self.iter.size_hint();
240+
// we could be entirely valid surrogates (2 elements per
241+
// char), or entirely non-surrogates (1 element per char)
242+
(low / 2, high)
243+
}
244+
}
245+
246+
impl DecodeUtf16Error {
247+
/// Returns the unpaired surrogate which caused this error.
248+
#[stable(feature = "decode_utf16", since = "1.9.0")]
249+
pub fn unpaired_surrogate(&self) -> u16 {
250+
self.code
251+
}
252+
}
253+
254+
#[stable(feature = "decode_utf16", since = "1.9.0")]
255+
impl fmt::Display for DecodeUtf16Error {
256+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
257+
write!(f, "unpaired surrogate found: {:x}", self.code)
258+
}
259+
}

0 commit comments

Comments
 (0)