Skip to content

Commit 72a19fa

Browse files
committed
Switch to WHATWG’s MIME type parsing algorithm
… and a custom Rust type for MIME type records, because the mime crate does not have a constructor from components: hyperium/mime#78
1 parent 16e0e57 commit 72a19fa

File tree

4 files changed

+196
-44
lines changed

4 files changed

+196
-44
lines changed

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ authors = ["Simon Sapin <simon.sapin@exyr.org>"]
55

66
[dependencies]
77
matches = "0.1"
8-
mime = "0.3"
98

109
[dev-dependencies]
1110
rustc-test = "0.3"

src/lib.rs

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,25 @@
88
//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
99
//! let (body, fragment) = url.decode_to_vec().unwrap();
1010
//!
11-
//! assert_eq!(url.mime_type().type_(), mime::TEXT);
12-
//! assert_eq!(url.mime_type().subtype(), mime::PLAIN);
13-
//! assert_eq!(url.mime_type().get_param(mime::CHARSET).unwrap(), "US-ASCII");
11+
//! assert_eq!(url.mime_type().type_, "text");
12+
//! assert_eq!(url.mime_type().subtype, "plain");
13+
//! assert_eq!(url.mime_type().parameters, [("charset".into(), "US-ASCII".into())]);
1414
//! assert_eq!(body, b"Hello World!");
1515
//! assert!(fragment.is_none());
1616
//! ```
1717
1818
#[macro_use] extern crate matches;
19-
pub extern crate mime;
2019

21-
use forgiving_base64::{InvalidBase64, DecodeError};
/// Guard for functions returning `Option`: when the condition is false,
/// return `None` from the enclosing function; otherwise fall through.
macro_rules! require {
    ($check:expr) => {
        if $check {
            // Condition holds: continue with the code after the macro call.
        } else {
            return None;
        }
    };
}
2227

2328
pub mod forgiving_base64;
29+
pub mod mime;
2430

2531
pub struct DataUrl<'a> {
2632
mime_type: mime::Mime,
@@ -57,20 +63,20 @@ impl<'a> DataUrl<'a> {
5763
/// Streaming-decode the data URL’s body to `write_body_bytes`,
5864
/// and return the URL’s fragment identifier if it has one.
5965
pub fn decode<F, E>(&self, write_body_bytes: F)
60-
-> Result<Option<FragmentIdentifier<'a>>, DecodeError<E>>
66+
-> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>>
6167
where F: FnMut(&[u8]) -> Result<(), E>
6268
{
6369
if self.base64 {
6470
decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes)
6571
} else {
6672
decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes)
67-
.map_err(DecodeError::WriteError)
73+
.map_err(forgiving_base64::DecodeError::WriteError)
6874
}
6975
}
7076

7177
/// Return the decoded body, and the URL’s fragment identifier if it has one.
7278
pub fn decode_to_vec(&self)
73-
-> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), InvalidBase64>
79+
-> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64>
7480
{
7581
let mut body = Vec::new();
7682
let fragment = self.decode(|bytes| Ok(body.extend_from_slice(bytes)))?;
@@ -101,14 +107,6 @@ impl<'a> FragmentIdentifier<'a> {
101107
}
102108
}
103109

104-
macro_rules! require {
105-
($condition: expr) => {
106-
if !$condition {
107-
return None
108-
}
109-
}
110-
}
111-
112110
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
113111
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
114112
///
@@ -196,7 +194,11 @@ fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) {
196194
// FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm?
197195
// <https://mimesniff.spec.whatwg.org/#parse-a-mime-type>
198196
let mime_type = string.parse().unwrap_or_else(|_| {
199-
"text/plain;charset=US-ASCII".parse().unwrap()
197+
mime::Mime {
198+
type_: String::from("text"),
199+
subtype: String::from("plain"),
200+
parameters: vec![(String::from("charset"), String::from("US-ASCII"))],
201+
}
200202
});
201203

202204
(mime_type, base64)
@@ -289,7 +291,7 @@ fn decode_without_base64<F, E>(encoded_body_plus_fragment: &str, mut write_bytes
289291
/// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with
290292
/// <https://infra.spec.whatwg.org/#forgiving-base64-decode>.
291293
fn decode_with_base64<F, E>(encoded_body_plus_fragment: &str, write_bytes: F)
292-
-> Result<Option<FragmentIdentifier>, DecodeError<E>>
294+
-> Result<Option<FragmentIdentifier>, forgiving_base64::DecodeError<E>>
293295
where F: FnMut(&[u8]) -> Result<(), E>
294296
{
295297
let mut decoder = forgiving_base64::Decoder::new(write_bytes);

src/mime.rs

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
use std::fmt::{self, Write};
2+
use std::str::FromStr;
3+
/// A MIME type record: type, subtype and an ordered list of parameters.
///
/// <https://mimesniff.spec.whatwg.org/#mime-type-representation>
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Mime {
    /// The part before the `/`. The parser in this module stores it ASCII-lowercased.
    pub type_: String,
    /// The part after the `/`. The parser in this module stores it ASCII-lowercased.
    pub subtype: String,
    /// (name, value) pairs, in the order they were accepted by the parser.
    /// Names are stored ASCII-lowercased by the parser; values keep their case.
    pub parameters: Vec<(String, String)>,
}
12+
/// Error returned when a string cannot be parsed as a MIME type.
///
/// The single field is private, so values can only be created inside this module.
#[derive(Debug)]
pub struct MimeParsingError(());

impl fmt::Display for MimeParsingError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("invalid MIME type")
    }
}

// Lets callers use `?` into `Box<Error>`-style error types.
impl ::std::error::Error for MimeParsingError {}
15+
16+
/// <https://mimesniff.spec.whatwg.org/#parsing-a-mime-type>
17+
impl FromStr for Mime {
18+
type Err = MimeParsingError;
19+
20+
fn from_str(s: &str) -> Result<Self, Self::Err> {
21+
parse(s).ok_or(MimeParsingError(()))
22+
}
23+
}
24+
25+
fn parse(s: &str) -> Option<Mime> {
26+
let trimmed = s.trim_matches(ascii_whitespace);
27+
28+
let (type_, rest) = split2(trimmed, '/');
29+
require!(only_http_token_code_points(type_) && !type_.is_empty());
30+
31+
let (subtype, rest) = split2(rest?, ';');
32+
let subtype = subtype.trim_right_matches(ascii_whitespace);
33+
require!(only_http_token_code_points(subtype) && !subtype.is_empty());
34+
35+
let mut parameters = Vec::new();
36+
if let Some(rest) = rest {
37+
parse_parameters(rest, &mut parameters)
38+
}
39+
40+
Some(Mime {
41+
type_: type_.to_ascii_lowercase(),
42+
subtype: subtype.to_ascii_lowercase(),
43+
parameters,
44+
})
45+
}
46+
/// Splits `s` at the first occurrence of `separator`.
///
/// Returns the text before the separator and, if the separator occurs at all,
/// the (possibly empty) text after it. Without a separator the result is `(s, None)`.
fn split2(s: &str, separator: char) -> (&str, Option<&str>) {
    match s.find(separator) {
        Some(at) => {
            let (head, tail) = s.split_at(at);
            // `tail` still starts with the separator; skip over it.
            (head, Some(&tail[separator.len_utf8()..]))
        }
        None => (s, None),
    }
}
52+
53+
fn parse_parameters(s: &str, parameters: &mut Vec<(String, String)>) {
54+
let mut semicolon_separated = s.split(';');
55+
56+
while let Some(piece) = semicolon_separated.next() {
57+
let piece = piece.trim_left_matches(ascii_whitespace);
58+
let (name, value) = split2(piece, '=');
59+
if name.is_empty() || !only_http_token_code_points(name) || contains(&parameters, name) {
60+
continue
61+
}
62+
if let Some(value) = value {
63+
let value = if value.starts_with('"') {
64+
let max_len = value.len().saturating_sub(2); // without start or end quotes
65+
let mut unescaped_value = String::with_capacity(max_len);
66+
let mut chars = value[1..].chars();
67+
'until_closing_quote: loop {
68+
while let Some(c) = chars.next() {
69+
match c {
70+
'"' => break 'until_closing_quote,
71+
'\\' => unescaped_value.push(chars.next().unwrap_or('\\')),
72+
_ => unescaped_value.push(c)
73+
}
74+
}
75+
if let Some(piece) = semicolon_separated.next() {
76+
// A semicolon inside a quoted value is not a separator
77+
// for the next parameter, but part of the value.
78+
unescaped_value.push(';');
79+
chars = piece.chars()
80+
} else {
81+
break
82+
}
83+
}
84+
if !valid_value(&unescaped_value) {
85+
continue
86+
}
87+
unescaped_value
88+
} else {
89+
let value = value.trim_right_matches(ascii_whitespace);
90+
if !valid_value(value) {
91+
continue
92+
}
93+
value.to_owned()
94+
};
95+
parameters.push((name.to_ascii_lowercase(), value))
96+
}
97+
}
98+
}
99+
/// Reports whether `parameters` already holds an entry named `name`.
///
/// Parameter names are case-insensitive and are stored ASCII-lowercased, so the
/// comparison ignores ASCII case; otherwise `CHARSET` would not be recognised
/// as a duplicate of an already stored `charset`.
fn contains(parameters: &[(String, String)], name: &str) -> bool {
    parameters
        .iter()
        .any(|(existing, _)| existing.eq_ignore_ascii_case(name))
}
103+
/// Reports whether `s` is acceptable as a parameter value: non-empty and made
/// only of HTTP quoted-string token code points (tab, U+0020 to U+007E, and
/// U+0080 to U+00FF).
///
/// <https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point>
fn valid_value(s: &str) -> bool {
    // `..=` instead of the deprecated `...` range pattern, which newer editions reject.
    !s.is_empty()
        && s.chars()
            .all(|c| matches!(c, '\t' | ' '..='~' | '\u{80}'..='\u{FF}'))
}
110+
111+
/// <https://mimesniff.spec.whatwg.org/#serializing-a-mime-type>
112+
impl fmt::Display for Mime {
113+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
114+
f.write_str(&self.type_)?;
115+
f.write_str("/")?;
116+
f.write_str(&self.subtype)?;
117+
for &(ref name, ref value) in &self.parameters {
118+
f.write_str(";")?;
119+
f.write_str(name)?;
120+
f.write_str("=")?;
121+
if only_http_token_code_points(value) {
122+
f.write_str(value)?
123+
} else {
124+
f.write_str("\"")?;
125+
for c in value.chars() {
126+
if c == '"' || c == '\\' {
127+
f.write_str("\\")?
128+
}
129+
f.write_char(c)?
130+
}
131+
f.write_str("\"")?
132+
}
133+
}
134+
Ok(())
135+
}
136+
}
137+
/// Predicate for the five ASCII whitespace characters: space, tab, line feed,
/// carriage return and form feed. Used as a pattern for the `trim_*_matches` calls.
fn ascii_whitespace(c: char) -> bool {
    // `char::is_ascii_whitespace` covers exactly U+0020, U+0009, U+000A, U+000C and U+000D.
    c.is_ascii_whitespace()
}
141+
142+
fn only_http_token_code_points(s: &str) -> bool {
143+
s.bytes().all(|byte| IS_HTTP_TOKEN[byte as usize])
144+
}
145+
/// Builds a `bool` array from a comma-terminated list of integer flags:
/// each non-zero flag becomes `true`, each zero becomes `false`.
macro_rules! byte_map {
    ($($bit:expr,)*) => {
        [$($bit != 0),*]
    };
}
151+
152+
// Copied from https://github.com/hyperium/mime/blob/v0.3.5/src/parse.rs#L293
153+
static IS_HTTP_TOKEN: [bool; 256] = byte_map![
154+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
155+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
156+
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
157+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
158+
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
160+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
162+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
163+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
164+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170+
];

tests/wpt.rs

Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ fn run_data_url(input: String, expected_mime: Option<String>, expected_body: Opt
99
let url = url.unwrap();
1010
let (body, _) = url.decode_to_vec().unwrap();
1111
if expected_mime == "" {
12-
assert_eq!(*url.mime_type(), "text/plain;charset=US-ASCII")
12+
assert_eq!(url.mime_type().to_string(), "text/plain;charset=US-ASCII")
1313
} else {
14-
assert_eq!(*url.mime_type(), &*expected_mime)
14+
assert_eq!(url.mime_type().to_string(), expected_mime)
1515
}
1616
if let Some(expected_body) = expected_body {
1717
assert_eq!(body, expected_body)
@@ -26,19 +26,6 @@ fn collect_data_url<F>(add_test: &mut F)
2626
{
2727
let known_failures = [
2828
"data://test:test/,X",
29-
"data:;%62ase64,WA",
30-
"data:;base 64,WA",
31-
"data:;base64;,WA",
32-
"data:;base64;base64,WA",
33-
"data:;charset =x,X",
34-
"data:;charset,X",
35-
"data:;charset=,X",
36-
"data:text/plain;,X",
37-
"data:text/plain;a=\",\",X",
38-
"data:x/x;base64;base64,WA",
39-
"data:x/x;base64;base64x,WA",
40-
"data:x/x;base64;charset=x,WA",
41-
"data:x/x;base64;charset=x;base64,WA",
4229
];
4330

4431
#[derive(Deserialize)]
@@ -98,8 +85,8 @@ fn collect_base64<F>(add_test: &mut F)
9885
fn run_mime(input: String, expected: Option<String>) {
9986
let result = input.parse::<data_url::mime::Mime>();
10087
match (result, expected) {
101-
(Ok(bytes), Some(expected)) => assert_eq!(bytes, &*expected),
102-
(Ok(bytes), None) => panic!("Expected error, got {:?}", bytes),
88+
(Ok(mime), Some(expected)) => assert_eq!(mime.to_string(), expected),
89+
(Ok(mime), None) => panic!("Expected error, got {:?}", mime),
10390
(Err(e), Some(expected)) => panic!("Expected {:?}, got error {:?}", expected, e),
10491
(Err(_), None) => {}
10592
}
@@ -109,13 +96,7 @@ fn run_mime(input: String, expected: Option<String>) {
10996
fn collect_mime<F>(add_test: &mut F)
11097
where F: FnMut(String, bool, rustc_test::TestFn)
11198
{
112-
// Many WPT tests fail with the mime crate’s parser,
113-
// since that parser is not written for the same spec.
114-
// Only run a few of them for now, since listing all the failures individually is not useful.
115-
let only_run_first_n_entries = 5;
116-
let known_failures = [
117-
"text/html;charset=gbk(",
118-
];
99+
let known_failures = [];
119100

120101
#[derive(Deserialize)]
121102
#[serde(untagged)]
@@ -129,7 +110,7 @@ fn collect_mime<F>(add_test: &mut F)
129110
let entries = v.into_iter().chain(v2);
130111

131112
let mut last_comment = None;
132-
for entry in entries.take(only_run_first_n_entries) {
113+
for entry in entries {
133114
let (input, expected) = match entry {
134115
Entry::TestCase { input, output } => (input, output),
135116
Entry::Comment(s) => {

0 commit comments

Comments
 (0)