@@ -273,18 +273,17 @@ private static Map<String, String> getURLParams(URI uri) throws UnsupportedEncod
273
273
}
274
274
}
275
275
276
+ // TODO(AngledLuffa): this must be a constant somewhere, but I couldn't find it
277
+ static final String URL_ENCODED = "application/x-www-form-urlencoded" ;
278
+
276
279
/**
277
280
* Reads the POST contents of the request and parses it into an Annotation object, ready to be annotated.
278
281
* This method can also read a serialized document, if the input format is set to be serialized.
279
282
*
280
- * The POST contents will be treated as UTF-8 unless the strict property is set to true ( in which case they will
283
+ * The POST contents will be treated as UTF-8 unless the strict property is set to true, in which case they will
281
284
* be treated as ISO-8859-1. They should be application/x-www-form-urlencoded, and decoding will be done via the
282
- * java.net.URLDecoder.decode(String, String) function. This will attempt to decode each
283
- * percent sign followed by two characters as a byte in hexadecimal. Other characters are passed through unchanged.
284
- * However, it normally also works fine to simple pass text to this method in a POST. Our method doesn't accept
285
- * encoding ' ' as '+' (a plus will be hex encoded) and converts any '%' not followed by a hex digit to "%25".
286
- * The only thing you really must do is to escape a '%' character that is a percent followed by two potential
287
- * hex digit characters as "%25".
285
+ * java.net.URLDecoder.decode(String, String) function. It is also allowed to send in text/plain requests,
286
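+ * which will not be decoded. In fact, nothing other than application/x-www-form-urlencoded will be decoded; a request with no Content-type header is assumed to be application/x-www-form-urlencoded and is therefore decoded.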
288
287
*
289
288
* @param props The properties we are annotating with. This is where the input format is retrieved from.
290
289
* @param httpExchange The exchange we are reading POST data from.
@@ -303,10 +302,16 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
303
302
// real users of CoreNLP would likely assume UTF-8 by default.
304
303
String defaultEncoding = this .strict ? "ISO-8859-1" : "UTF-8" ;
305
304
// Get the encoding
306
- Headers h = httpExchange .getRequestHeaders ();
305
+ Headers headers = httpExchange .getRequestHeaders ();
307
306
String encoding ;
308
- if (h .containsKey ("Content-type" )) {
309
- String [] charsetPair = Arrays .stream (h .getFirst ("Content-type" ).split (";" ))
307
+ // the original default behavior of the server was to
308
+ // unescape, so let's assume by default that the input text is
309
+ // escaped. if the Content-type is set to text we will know
310
+ // we shouldn't unescape after all
311
+ String contentType = URL_ENCODED ;
312
+ if (headers .containsKey ("Content-type" )) {
313
+ contentType = headers .getFirst ("Content-type" ).split (";" )[0 ].trim ();
314
+ String [] charsetPair = Arrays .stream (headers .getFirst ("Content-type" ).split (";" ))
310
315
.map (x -> x .split ("=" ))
311
316
.filter (x -> x .length > 0 && "charset" .equals (x [0 ]))
312
317
.findFirst ().orElse (new String []{"charset" , defaultEncoding });
@@ -320,14 +325,9 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
320
325
}
321
326
322
327
String text = IOUtils .slurpReader (IOUtils .encodedInputStreamReader (httpExchange .getRequestBody (), encoding ));
323
- // System.err.println("The text I got was |" + text + '|');
324
-
325
- // Remove the % and + characters that mess up the URL decoding.
326
- text = text .replaceAll ("%(?![0-9a-fA-F]{2})" , "%25" ); // Escape a percent that isn't followed by a hex byte
327
- text = text .replaceAll ("\\ +" , "%2B" );
328
- // System.err.println("The text fiddled: |" + text + '|');
329
- text = URLDecoder .decode (text , encoding );
330
- // System.err.println("The text decoded: |" + text + '|');
328
+ if (contentType .equals (URL_ENCODED )) {
329
+ text = URLDecoder .decode (text , encoding );
330
+ }
331
331
// We used to trim. But now we don't. It seems like doing that is illegitimate. text = text.trim();
332
332
333
333
// Read the annotation
0 commit comments