@@ -92,6 +92,7 @@ public void testComplexJSON() {
92
92
// BEGIN TESTS FOR ANNOTATION WRITING
93
93
// -----
94
94
95
+
95
96
@ Test
96
97
public void testSimpleDocument () throws IOException {
97
98
Annotation ann = new Annotation ("JSON is neat. Better than XML." );
@@ -189,4 +190,118 @@ public void testSimpleDocument() throws IOException {
189
190
Assert .assertEquals (expected , actual );
190
191
}
191
192
193
+ /**
+  * Checks the JSON produced by {@code JSONOutputter} when the pipeline is built with
+  * {@code tokenize.codepoint=true}.
+  *
+  * The input text ends with an emoji, so the expected output lists both
+  * {@code characterOffsetBegin/End} and {@code codepointOffsetBegin/End} for every token; the two
+  * pairs are equal up to the emoji and differ from the emoji's end offset onwards.
+  * The expected text is passed through {@code indent(...)}, a helper defined outside this view,
+  * and then compared to the outputter's result with {@code Assert.assertEquals}.
+  *
+  * Structurally this mirrors testSimpleDocument (build annotation, annotate, print, compare);
+  * the original author noted the two could be refactored to share code.
+  *
+  * @throws IOException declared by the signature; presumably raised by the outputter - TODO confirm
+  */
194
+ @ Test
195
+ public void testCodepointDocument () throws IOException {
196
+ Annotation ann = new Annotation ("JSON is neat. Better than 😺." );
197
+ StanfordCoreNLP pipeline = new StanfordCoreNLP (PropertiesUtils .asProperties ("annotators" , "tokenize" , "tokenize.codepoint" , "true" )); // tokenize.codepoint is the option under test
198
+ pipeline .annotate (ann );
199
+ String actual = new JSONOutputter ().print (ann );
200
+ String expected = indent (
201
+ "{\n " +
202
+ "\t \" sentences\" : [\n " +
203
+ "\t \t {\n " +
204
+ "\t \t \t \" index\" : 0,\n " +
205
+ "\t \t \t \" tokens\" : [\n " +
206
+ "\t \t \t \t {\n " +
207
+ "\t \t \t \t \t \" index\" : 1,\n " +
208
+ "\t \t \t \t \t \" word\" : \" JSON\" ,\n " +
209
+ "\t \t \t \t \t \" originalText\" : \" JSON\" ,\n " +
210
+ "\t \t \t \t \t \" characterOffsetBegin\" : 0,\n " +
211
+ "\t \t \t \t \t \" characterOffsetEnd\" : 4,\n " +
212
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 0,\n " +
213
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 4,\n " +
214
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
215
+ "\t \t \t \t \t \" after\" : \" \" \n " +
216
+ "\t \t \t \t },\n " +
217
+ "\t \t \t \t {\n " +
218
+ "\t \t \t \t \t \" index\" : 2,\n " +
219
+ "\t \t \t \t \t \" word\" : \" is\" ,\n " +
220
+ "\t \t \t \t \t \" originalText\" : \" is\" ,\n " +
221
+ "\t \t \t \t \t \" characterOffsetBegin\" : 5,\n " +
222
+ "\t \t \t \t \t \" characterOffsetEnd\" : 7,\n " +
223
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 5,\n " +
224
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 7,\n " +
225
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
226
+ "\t \t \t \t \t \" after\" : \" \" \n " +
227
+ "\t \t \t \t },\n " +
228
+ "\t \t \t \t {\n " +
229
+ "\t \t \t \t \t \" index\" : 3,\n " +
230
+ "\t \t \t \t \t \" word\" : \" neat\" ,\n " +
231
+ "\t \t \t \t \t \" originalText\" : \" neat\" ,\n " +
232
+ "\t \t \t \t \t \" characterOffsetBegin\" : 8,\n " +
233
+ "\t \t \t \t \t \" characterOffsetEnd\" : 12,\n " +
234
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 8,\n " +
235
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 12,\n " +
236
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
237
+ "\t \t \t \t \t \" after\" : \" \" \n " +
238
+ "\t \t \t \t },\n " +
239
+ "\t \t \t \t {\n " +
240
+ "\t \t \t \t \t \" index\" : 4,\n " +
241
+ "\t \t \t \t \t \" word\" : \" .\" ,\n " +
242
+ "\t \t \t \t \t \" originalText\" : \" .\" ,\n " +
243
+ "\t \t \t \t \t \" characterOffsetBegin\" : 12,\n " +
244
+ "\t \t \t \t \t \" characterOffsetEnd\" : 13,\n " +
245
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 12,\n " +
246
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 13,\n " +
247
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
248
+ "\t \t \t \t \t \" after\" : \" \" \n " +
249
+ "\t \t \t \t }\n " +
250
+ "\t \t \t ]\n " +
251
+ "\t \t },\n " +
252
+ "\t \t {\n " +
253
+ "\t \t \t \" index\" : 1,\n " +
254
+ "\t \t \t \" tokens\" : [\n " +
255
+ "\t \t \t \t {\n " +
256
+ "\t \t \t \t \t \" index\" : 1,\n " +
257
+ "\t \t \t \t \t \" word\" : \" Better\" ,\n " +
258
+ "\t \t \t \t \t \" originalText\" : \" Better\" ,\n " +
259
+ "\t \t \t \t \t \" characterOffsetBegin\" : 14,\n " +
260
+ "\t \t \t \t \t \" characterOffsetEnd\" : 20,\n " +
261
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 14,\n " +
262
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 20,\n " +
263
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
264
+ "\t \t \t \t \t \" after\" : \" \" \n " +
265
+ "\t \t \t \t },\n " +
266
+ "\t \t \t \t {\n " +
267
+ "\t \t \t \t \t \" index\" : 2,\n " +
268
+ "\t \t \t \t \t \" word\" : \" than\" ,\n " +
269
+ "\t \t \t \t \t \" originalText\" : \" than\" ,\n " +
270
+ "\t \t \t \t \t \" characterOffsetBegin\" : 21,\n " +
271
+ "\t \t \t \t \t \" characterOffsetEnd\" : 25,\n " +
272
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 21,\n " +
273
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 25,\n " +
274
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
275
+ "\t \t \t \t \t \" after\" : \" \" \n " +
276
+ "\t \t \t \t },\n " +
277
+ "\t \t \t \t {\n " +
278
+ "\t \t \t \t \t \" index\" : 3,\n " +
279
+ "\t \t \t \t \t \" word\" : \" 😺\" ,\n " +
280
+ "\t \t \t \t \t \" originalText\" : \" 😺\" ,\n " +
281
+ "\t \t \t \t \t \" characterOffsetBegin\" : 26,\n " +
282
+ "\t \t \t \t \t \" characterOffsetEnd\" : 28,\n " + // the emoji is expected to be two chars wide (26-28) ...
283
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 26,\n " +
284
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 27,\n " + // ... but only one codepoint wide (26-27)
285
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
286
+ "\t \t \t \t \t \" after\" : \" \" \n " +
287
+ "\t \t \t \t },\n " +
288
+ "\t \t \t \t {\n " +
289
+ "\t \t \t \t \t \" index\" : 4,\n " +
290
+ "\t \t \t \t \t \" word\" : \" .\" ,\n " +
291
+ "\t \t \t \t \t \" originalText\" : \" .\" ,\n " +
292
+ "\t \t \t \t \t \" characterOffsetBegin\" : 28,\n " + // after the emoji the char offsets run one ahead of the codepoint offsets
293
+ "\t \t \t \t \t \" characterOffsetEnd\" : 29,\n " +
294
+ "\t \t \t \t \t \" codepointOffsetBegin\" : 27,\n " +
295
+ "\t \t \t \t \t \" codepointOffsetEnd\" : 28,\n " +
296
+ "\t \t \t \t \t \" before\" : \" \" ,\n " +
297
+ "\t \t \t \t \t \" after\" : \" \" \n " +
298
+ "\t \t \t \t }\n " +
299
+ "\t \t \t ]\n " +
300
+ "\t \t }\n " +
301
+ "\t ]\n " +
302
+ "}\n " );
303
+
304
+ Assert .assertEquals (expected , actual );
305
+ }
306
+
192
307
}
0 commit comments