Skip to content

Commit 318c748

Browse files
AngledLuffaStanford NLP
authored and committed
Only url decode text if the www-form content type was given
1 parent 0b12f6c commit 318c748

File tree

1 file changed

+18
-18
lines changed

1 file changed

+18
-18
lines changed

src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -273,18 +273,17 @@ private static Map<String, String> getURLParams(URI uri) throws UnsupportedEncod
273273
}
274274
}
275275

276+
// TODO(AngledLuffa): this must be a constant somewhere, but I couldn't find it
277+
static final String URL_ENCODED = "application/x-www-form-urlencoded";
278+
276279
/**
277280
* Reads the POST contents of the request and parses it into an Annotation object, ready to be annotated.
278281
* This method can also read a serialized document, if the input format is set to be serialized.
279282
*
280-
* The POST contents will be treated as UTF-8 unless the strict property is set to true (in which case they will
283+
* The POST contents will be treated as UTF-8 unless the strict property is set to true, in which case they will
281284
* be treated as ISO-8859-1. They should be application/x-www-form-urlencoded, and decoding will be done via the
282-
* java.net.URLDecoder.decode(String, String) function. This will attempt to decode each
283-
* percent sign followed by two characters as a byte in hexadecimal. Other characters are passed through unchanged.
284-
* However, it normally also works fine to simply pass text to this method in a POST. Our method doesn't accept
285-
* encoding ' ' as '+' (a plus will be hex encoded) and converts any '%' not followed by a hex digit to "%25".
286-
* The only thing you really must do is to escape a '%' character that is a percent followed by two potential
287-
* hex digit characters as "%25".
285+
* java.net.URLDecoder.decode(String, String) function. It is also allowed to send in text/plain requests,
286+
* which will not be decoded. In fact, nothing other than x-www-form-urlencoded will be decoded.
288287
*
289288
* @param props The properties we are annotating with. This is where the input format is retrieved from.
290289
* @param httpExchange The exchange we are reading POST data from.
@@ -303,10 +302,16 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
303302
// real users of CoreNLP would likely assume UTF-8 by default.
304303
String defaultEncoding = this.strict ? "ISO-8859-1" : "UTF-8";
305304
// Get the encoding
306-
Headers h = httpExchange.getRequestHeaders();
305+
Headers headers = httpExchange.getRequestHeaders();
307306
String encoding;
308-
if (h.containsKey("Content-type")) {
309-
String[] charsetPair = Arrays.stream(h.getFirst("Content-type").split(";"))
307+
// the original default behavior of the server was to
308+
// unescape, so let's assume by default that the input text is
309+
// escaped. if the Content-type is set to text we will know
310+
// we shouldn't unescape after all
311+
String contentType = URL_ENCODED;
312+
if (headers.containsKey("Content-type")) {
313+
contentType = headers.getFirst("Content-type").split(";")[0].trim();
314+
String[] charsetPair = Arrays.stream(headers.getFirst("Content-type").split(";"))
310315
.map(x -> x.split("="))
311316
.filter(x -> x.length > 0 && "charset".equals(x[0]))
312317
.findFirst().orElse(new String[]{"charset", defaultEncoding});
@@ -320,14 +325,9 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
320325
}
321326

322327
String text = IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding));
323-
// System.err.println("The text I got was |" + text + '|');
324-
325-
// Remove the % and + characters that mess up the URL decoding.
326-
text = text.replaceAll("%(?![0-9a-fA-F]{2})", "%25"); // Escape a percent that isn't followed by a hex byte
327-
text = text.replaceAll("\\+", "%2B");
328-
// System.err.println("The text fiddled: |" + text + '|');
329-
text = URLDecoder.decode(text, encoding);
330-
// System.err.println("The text decoded: |" + text + '|');
328+
if (contentType.equals(URL_ENCODED)) {
329+
text = URLDecoder.decode(text, encoding);
330+
}
331331
// We used to trim. But now we don't. It seems like doing that is illegitimate. text = text.trim();
332332

333333
// Read the annotation

0 commit comments

Comments
 (0)