Skip to content

Commit 8d72494

Browse files
committed
Align mime type parser to spec
The two most notable changes are: a) the parser now uses HTTP whitespace instead of ASCII whitespace; b) the parser now correctly handles empty quoted values. The commit additionally fixes a parser bug that caused parsing to fail when an invalid name was encountered and its corresponding value contained a quoted semicolon.
1 parent c34de27 commit 8d72494

File tree

4 files changed

+182
-15
lines changed

4 files changed

+182
-15
lines changed

data-url/src/mime.rs

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ impl FromStr for Mime {
3535
}
3636

3737
fn parse(s: &str) -> Option<Mime> {
38-
let trimmed = s.trim_matches(ascii_whitespace);
38+
let trimmed = s.trim_matches(http_whitespace);
3939

4040
let (type_, rest) = split2(trimmed, '/');
4141
require!(only_http_token_code_points(type_) && !type_.is_empty());
4242

4343
let (subtype, rest) = split2(rest?, ';');
44-
let subtype = subtype.trim_end_matches(ascii_whitespace);
44+
let subtype = subtype.trim_end_matches(http_whitespace);
4545
require!(only_http_token_code_points(subtype) && !subtype.is_empty());
4646

4747
let mut parameters = Vec::new();
@@ -66,11 +66,12 @@ fn parse_parameters(s: &str, parameters: &mut Vec<(String, String)>) {
6666
let mut semicolon_separated = s.split(';');
6767

6868
while let Some(piece) = semicolon_separated.next() {
69-
let piece = piece.trim_start_matches(ascii_whitespace);
69+
let piece = piece.trim_start_matches(http_whitespace);
7070
let (name, value) = split2(piece, '=');
71-
if name.is_empty() || !only_http_token_code_points(name) || contains(parameters, name) {
72-
continue;
73-
}
71+
// We cannot return early on an invalid name here, because the value
72+
// parsing later may consume more semicolon-separated pieces.
73+
let name_valid =
74+
!name.is_empty() && only_http_token_code_points(name) && !contains(parameters, name);
7475
if let Some(value) = value {
7576
let value = if let Some(stripped) = value.strip_prefix('"') {
7677
let max_len = stripped.len().saturating_sub(1); // without end quote
@@ -80,7 +81,17 @@ fn parse_parameters(s: &str, parameters: &mut Vec<(String, String)>) {
8081
while let Some(c) = chars.next() {
8182
match c {
8283
'"' => break 'until_closing_quote,
83-
'\\' => unescaped_value.push(chars.next().unwrap_or('\\')),
84+
'\\' => unescaped_value.push(chars.next().unwrap_or_else(|| {
85+
semicolon_separated
86+
.next()
87+
.map(|piece| {
88+
// A semicolon inside a quoted value is not a separator
89+
// for the next parameter, but part of the value.
90+
chars = piece.chars();
91+
';'
92+
})
93+
.unwrap_or('\\')
94+
})),
8495
_ => unescaped_value.push(c),
8596
}
8697
}
@@ -93,13 +104,16 @@ fn parse_parameters(s: &str, parameters: &mut Vec<(String, String)>) {
93104
break;
94105
}
95106
}
96-
if !valid_value(&unescaped_value) {
107+
if !name_valid || !valid_value(value) {
97108
continue;
98109
}
99110
unescaped_value
100111
} else {
101-
let value = value.trim_end_matches(ascii_whitespace);
102-
if !valid_value(value) {
112+
let value = value.trim_end_matches(http_whitespace);
113+
if value.is_empty() {
114+
continue;
115+
}
116+
if !name_valid || !valid_value(value) {
103117
continue;
104118
}
105119
value.to_owned()
@@ -117,7 +131,7 @@ fn valid_value(s: &str) -> bool {
117131
s.chars().all(|c| {
118132
// <https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point>
119133
matches!(c, '\t' | ' '..='~' | '\u{80}'..='\u{FF}')
120-
}) && !s.is_empty()
134+
})
121135
}
122136

123137
/// <https://mimesniff.spec.whatwg.org/#serializing-a-mime-type>
@@ -130,7 +144,7 @@ impl fmt::Display for Mime {
130144
f.write_str(";")?;
131145
f.write_str(name)?;
132146
f.write_str("=")?;
133-
if only_http_token_code_points(value) {
147+
if only_http_token_code_points(value) && !value.is_empty() {
134148
f.write_str(value)?
135149
} else {
136150
f.write_str("\"")?;
@@ -147,8 +161,8 @@ impl fmt::Display for Mime {
147161
}
148162
}
149163

150-
fn ascii_whitespace(c: char) -> bool {
151-
matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0C')
164+
fn http_whitespace(c: char) -> bool {
165+
matches!(c, ' ' | '\t' | '\n' | '\r')
152166
}
153167

154168
fn only_http_token_code_points(s: &str) -> bool {

data-url/tests/base64.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
["ab=c=", null],
5050
["abc=d", null],
5151
["abc=d=", null],
52+
["ab\u000Bcd", null],
53+
["ab\u3000cd", null],
54+
["ab\u3001cd", null],
5255
["ab\tcd", [105, 183, 29]],
5356
["ab\ncd", [105, 183, 29]],
5457
["ab\fcd", [105, 183, 29]],

data-url/tests/data-urls.json

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@
5252
["data:text/plain;Charset=UTF-8,%C2%B1",
5353
"text/plain;charset=UTF-8",
5454
[194, 177]],
55+
["data:text/plain;charset=windows-1252,áñçə💩",
56+
"text/plain;charset=windows-1252",
57+
[195, 161, 195, 177, 195, 167, 201, 153, 240, 159, 146, 169]],
58+
["data:text/plain;charset=UTF-8,áñçə💩",
59+
"text/plain;charset=UTF-8",
60+
[195, 161, 195, 177, 195, 167, 201, 153, 240, 159, 146, 169]],
5561
["data:image/gif,%C2%B1",
5662
"image/gif",
5763
[194, 177]],
@@ -100,14 +106,23 @@
100106
["data:image/png,X X",
101107
"image/png",
102108
[88, 32, 88]],
109+
["data:application/javascript,X X",
110+
"application/javascript",
111+
[88, 32, 88]],
103112
["data:application/xml,X X",
104113
"application/xml",
105114
[88, 32, 88]],
115+
["data:text/javascript,X X",
116+
"text/javascript",
117+
[88, 32, 88]],
118+
["data:text/plain,X X",
119+
"text/plain",
120+
[88, 32, 88]],
106121
["data:unknown/unknown,X X",
107122
"unknown/unknown",
108123
[88, 32, 88]],
109124
["data:text/plain;a=\",\",X",
110-
"text/plain",
125+
"text/plain;a=\"\"",
111126
[34, 44, 88]],
112127
["data:text/plain;a=%2C,X",
113128
"text/plain;a=%2C",

data-url/tests/mime-types.json

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232
"navigable": true,
3333
"encoding": "GBK"
3434
},
35+
{
36+
"input": "text/html;charset=();charset=GBK",
37+
"output": "text/html;charset=\"()\"",
38+
"navigable": true,
39+
"encoding": null
40+
},
3541
"Spaces",
3642
{
3743
"input": "text/html;charset =gbk",
@@ -57,6 +63,37 @@
5763
"navigable": true,
5864
"encoding": "GBK"
5965
},
66+
{
67+
"input": "text/html;charset= \"gbk\"",
68+
"output": "text/html;charset=\" \\\"gbk\\\"\"",
69+
"navigable": true,
70+
"encoding": null
71+
},
72+
"0x0B and 0x0C",
73+
{
74+
"input": "text/html;charset=\u000Bgbk",
75+
"output": "text/html",
76+
"navigable": true,
77+
"encoding": null
78+
},
79+
{
80+
"input": "text/html;charset=\u000Cgbk",
81+
"output": "text/html",
82+
"navigable": true,
83+
"encoding": null
84+
},
85+
{
86+
"input": "text/html;\u000Bcharset=gbk",
87+
"output": "text/html",
88+
"navigable": true,
89+
"encoding": null
90+
},
91+
{
92+
"input": "text/html;\u000Ccharset=gbk",
93+
"output": "text/html",
94+
"navigable": true,
95+
"encoding": null
96+
},
6097
"Single quotes are a token, not a delimiter",
6198
{
6299
"input": "text/html;charset='gbk'",
@@ -76,6 +113,12 @@
76113
"navigable": true,
77114
"encoding": null
78115
},
116+
{
117+
"input": "text/html;charset=';charset=GBK",
118+
"output": "text/html;charset='",
119+
"navigable": true,
120+
"encoding": null
121+
},
79122
"Invalid parameters",
80123
{
81124
"input": "text/html;test;charset=gbk",
@@ -113,6 +156,18 @@
113156
"navigable": true,
114157
"encoding": "GBK"
115158
},
159+
{
160+
"input": "text/html;charset= \"\u007F;charset=GBK",
161+
"output": "text/html;charset=GBK",
162+
"navigable": true,
163+
"encoding": "GBK"
164+
},
165+
{
166+
"input": "text/html;charset=\"\u007F;charset=foo\";charset=GBK",
167+
"output": "text/html;charset=GBK",
168+
"navigable": true,
169+
"encoding": "GBK"
170+
},
116171
"Double quotes",
117172
{
118173
"input": "text/html;charset=\"gbk\"",
@@ -138,6 +193,12 @@
138193
"navigable": true,
139194
"encoding": "GBK"
140195
},
196+
{
197+
"input": "text/html;charset=\"gbk \"",
198+
"output": "text/html;charset=\"gbk \"",
199+
"navigable": true,
200+
"encoding": "GBK"
201+
},
141202
{
142203
"input": "text/html;charset=\"\\ gbk\"",
143204
"output": "text/html;charset=\" gbk\"",
@@ -156,6 +217,18 @@
156217
"navigable": true,
157218
"encoding": "GBK"
158219
},
220+
{
221+
"input": "text/html;charset=\"\";charset=GBK",
222+
"output": "text/html;charset=\"\"",
223+
"navigable": true,
224+
"encoding": null
225+
},
226+
{
227+
"input": "text/html;charset=\";charset=GBK",
228+
"output": "text/html;charset=\";charset=GBK\"",
229+
"navigable": true,
230+
"encoding": null
231+
},
159232
"Unexpected code points",
160233
{
161234
"input": "text/html;charset={gbk}",
@@ -175,6 +248,20 @@
175248
"input": "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789",
176249
"output": "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"
177250
},
251+
"Invalid names",
252+
{
253+
"input": "text/html;a]=bar;b[=bar;c=bar",
254+
"output": "text/html;c=bar"
255+
},
256+
"Semicolons in value",
257+
{
258+
"input": "text/html;valid=\";\";foo=bar",
259+
"output": "text/html;valid=\";\";foo=bar"
260+
},
261+
{
262+
"input": "text/html;in]valid=\";asd=foo\";foo=bar",
263+
"output": "text/html;foo=bar"
264+
},
178265
"Valid",
179266
{
180267
"input": "!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
@@ -202,6 +289,18 @@
202289
"input": "x/x;x=\t",
203290
"output": "x/x"
204291
},
292+
{
293+
"input": "x/x\n\r\t ;x=x",
294+
"output": "x/x;x=x"
295+
},
296+
{
297+
"input": "\n\r\t x/x;x=x\n\r\t ",
298+
"output": "x/x;x=x"
299+
},
300+
{
301+
"input": "x/x;\n\r\t x=x\n\r\t ;x=y",
302+
"output": "x/x;x=x"
303+
},
205304
"Latin1",
206305
{
207306
"input": "text/html;test=\u00FF;charset=gbk",
@@ -215,6 +314,22 @@
215314
"output": "x/x;x=x"
216315
},
217316
"Failure",
317+
{
318+
"input": "\u000Bx/x",
319+
"output": null
320+
},
321+
{
322+
"input": "\u000Cx/x",
323+
"output": null
324+
},
325+
{
326+
"input": "x/x\u000B",
327+
"output": null
328+
},
329+
{
330+
"input": "x/x\u000C",
331+
"output": null
332+
},
218333
{
219334
"input": "",
220335
"output": null
@@ -223,6 +338,10 @@
223338
"input": "\t",
224339
"output": null
225340
},
341+
{
342+
"input": "/",
343+
"output": null
344+
},
226345
{
227346
"input": "bogus",
228347
"output": null
@@ -247,6 +366,10 @@
247366
"input": "(/)",
248367
"output": null
249368
},
369+
{
370+
"input": "ÿ/ÿ",
371+
"output": null
372+
},
250373
{
251374
"input": "text/html(;doesnot=matter",
252375
"output": null
@@ -258,5 +381,17 @@
258381
{
259382
"input": "\u0100/\u0100",
260383
"output": null
384+
},
385+
{
386+
"input": "text /html",
387+
"output": null
388+
},
389+
{
390+
"input": "text/ html",
391+
"output": null
392+
},
393+
{
394+
"input": "\"text/html\"",
395+
"output": null
261396
}
262397
]

0 commit comments

Comments
 (0)