Skip to content

Commit e0f6185

Browse files
committed
Fix broken codepoint offsets in JSON output
1 parent e519193 commit e0f6185

File tree

2 files changed

+122
-2
lines changed

2 files changed

+122
-2
lines changed

src/edu/stanford/nlp/pipeline/JSONOutputter.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
188188
l3.set("characterOffsetEnd", token.endPosition());
189189
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
190190
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
191-
l3.set("codepointOffsetBegin", token.beginPosition());
192-
l3.set("codepointOffsetEnd", token.endPosition());
191+
l3.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
192+
l3.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
193193
}
194194
l3.set("pos", token.tag());
195195
l3.set("ner", token.ner());
@@ -216,6 +216,11 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
216216
l2.set("originalText", token.originalText());
217217
l2.set("characterOffsetBegin", token.beginPosition());
218218
l2.set("characterOffsetEnd", token.endPosition());
219+
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
220+
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
221+
l2.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
222+
l2.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
223+
}
219224
}));
220225
}
221226
}

test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ public void testComplexJSON() {
9292
// BEGIN TESTS FOR ANNOTATION WRITING
9393
// -----
9494

95+
9596
@Test
9697
public void testSimpleDocument() throws IOException {
9798
Annotation ann = new Annotation("JSON is neat. Better than XML.");
@@ -189,4 +190,118 @@ public void testSimpleDocument() throws IOException {
189190
Assert.assertEquals(expected, actual);
190191
}
191192

193+
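/** Checks that JSON output reports codepoint offsets separately from character offsets when tokenize.codepoint is enabled: the emoji token spans two chars but one codepoint, so the two offsets diverge from that token onward. Could be refactored to share setup with the other document tests. */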
194+
@Test
195+
public void testCodepointDocument() throws IOException {
196+
Annotation ann = new Annotation("JSON is neat. Better than 😺.");
197+
StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.asProperties("annotators", "tokenize", "tokenize.codepoint", "true"));
198+
pipeline.annotate(ann);
199+
String actual = new JSONOutputter().print(ann);
200+
String expected = indent(
201+
"{\n" +
202+
"\t\"sentences\": [\n" +
203+
"\t\t{\n" +
204+
"\t\t\t\"index\": 0,\n" +
205+
"\t\t\t\"tokens\": [\n" +
206+
"\t\t\t\t{\n" +
207+
"\t\t\t\t\t\"index\": 1,\n" +
208+
"\t\t\t\t\t\"word\": \"JSON\",\n" +
209+
"\t\t\t\t\t\"originalText\": \"JSON\",\n" +
210+
"\t\t\t\t\t\"characterOffsetBegin\": 0,\n" +
211+
"\t\t\t\t\t\"characterOffsetEnd\": 4,\n" +
212+
"\t\t\t\t\t\"codepointOffsetBegin\": 0,\n" +
213+
"\t\t\t\t\t\"codepointOffsetEnd\": 4,\n" +
214+
"\t\t\t\t\t\"before\": \"\",\n" +
215+
"\t\t\t\t\t\"after\": \" \"\n" +
216+
"\t\t\t\t},\n" +
217+
"\t\t\t\t{\n" +
218+
"\t\t\t\t\t\"index\": 2,\n" +
219+
"\t\t\t\t\t\"word\": \"is\",\n" +
220+
"\t\t\t\t\t\"originalText\": \"is\",\n" +
221+
"\t\t\t\t\t\"characterOffsetBegin\": 5,\n" +
222+
"\t\t\t\t\t\"characterOffsetEnd\": 7,\n" +
223+
"\t\t\t\t\t\"codepointOffsetBegin\": 5,\n" +
224+
"\t\t\t\t\t\"codepointOffsetEnd\": 7,\n" +
225+
"\t\t\t\t\t\"before\": \" \",\n" +
226+
"\t\t\t\t\t\"after\": \" \"\n" +
227+
"\t\t\t\t},\n" +
228+
"\t\t\t\t{\n" +
229+
"\t\t\t\t\t\"index\": 3,\n" +
230+
"\t\t\t\t\t\"word\": \"neat\",\n" +
231+
"\t\t\t\t\t\"originalText\": \"neat\",\n" +
232+
"\t\t\t\t\t\"characterOffsetBegin\": 8,\n" +
233+
"\t\t\t\t\t\"characterOffsetEnd\": 12,\n" +
234+
"\t\t\t\t\t\"codepointOffsetBegin\": 8,\n" +
235+
"\t\t\t\t\t\"codepointOffsetEnd\": 12,\n" +
236+
"\t\t\t\t\t\"before\": \" \",\n" +
237+
"\t\t\t\t\t\"after\": \"\"\n" +
238+
"\t\t\t\t},\n" +
239+
"\t\t\t\t{\n" +
240+
"\t\t\t\t\t\"index\": 4,\n" +
241+
"\t\t\t\t\t\"word\": \".\",\n" +
242+
"\t\t\t\t\t\"originalText\": \".\",\n" +
243+
"\t\t\t\t\t\"characterOffsetBegin\": 12,\n" +
244+
"\t\t\t\t\t\"characterOffsetEnd\": 13,\n" +
245+
"\t\t\t\t\t\"codepointOffsetBegin\": 12,\n" +
246+
"\t\t\t\t\t\"codepointOffsetEnd\": 13,\n" +
247+
"\t\t\t\t\t\"before\": \"\",\n" +
248+
"\t\t\t\t\t\"after\": \" \"\n" +
249+
"\t\t\t\t}\n" +
250+
"\t\t\t]\n" +
251+
"\t\t},\n" +
252+
"\t\t{\n" +
253+
"\t\t\t\"index\": 1,\n" +
254+
"\t\t\t\"tokens\": [\n" +
255+
"\t\t\t\t{\n" +
256+
"\t\t\t\t\t\"index\": 1,\n" +
257+
"\t\t\t\t\t\"word\": \"Better\",\n" +
258+
"\t\t\t\t\t\"originalText\": \"Better\",\n" +
259+
"\t\t\t\t\t\"characterOffsetBegin\": 14,\n" +
260+
"\t\t\t\t\t\"characterOffsetEnd\": 20,\n" +
261+
"\t\t\t\t\t\"codepointOffsetBegin\": 14,\n" +
262+
"\t\t\t\t\t\"codepointOffsetEnd\": 20,\n" +
263+
"\t\t\t\t\t\"before\": \" \",\n" +
264+
"\t\t\t\t\t\"after\": \" \"\n" +
265+
"\t\t\t\t},\n" +
266+
"\t\t\t\t{\n" +
267+
"\t\t\t\t\t\"index\": 2,\n" +
268+
"\t\t\t\t\t\"word\": \"than\",\n" +
269+
"\t\t\t\t\t\"originalText\": \"than\",\n" +
270+
"\t\t\t\t\t\"characterOffsetBegin\": 21,\n" +
271+
"\t\t\t\t\t\"characterOffsetEnd\": 25,\n" +
272+
"\t\t\t\t\t\"codepointOffsetBegin\": 21,\n" +
273+
"\t\t\t\t\t\"codepointOffsetEnd\": 25,\n" +
274+
"\t\t\t\t\t\"before\": \" \",\n" +
275+
"\t\t\t\t\t\"after\": \" \"\n" +
276+
"\t\t\t\t},\n" +
277+
"\t\t\t\t{\n" +
278+
"\t\t\t\t\t\"index\": 3,\n" +
279+
"\t\t\t\t\t\"word\": \"😺\",\n" +
280+
"\t\t\t\t\t\"originalText\": \"😺\",\n" +
281+
"\t\t\t\t\t\"characterOffsetBegin\": 26,\n" +
282+
"\t\t\t\t\t\"characterOffsetEnd\": 28,\n" +
283+
"\t\t\t\t\t\"codepointOffsetBegin\": 26,\n" +
284+
"\t\t\t\t\t\"codepointOffsetEnd\": 27,\n" +
285+
"\t\t\t\t\t\"before\": \" \",\n" +
286+
"\t\t\t\t\t\"after\": \"\"\n" +
287+
"\t\t\t\t},\n" +
288+
"\t\t\t\t{\n" +
289+
"\t\t\t\t\t\"index\": 4,\n" +
290+
"\t\t\t\t\t\"word\": \".\",\n" +
291+
"\t\t\t\t\t\"originalText\": \".\",\n" +
292+
"\t\t\t\t\t\"characterOffsetBegin\": 28,\n" +
293+
"\t\t\t\t\t\"characterOffsetEnd\": 29,\n" +
294+
"\t\t\t\t\t\"codepointOffsetBegin\": 27,\n" +
295+
"\t\t\t\t\t\"codepointOffsetEnd\": 28,\n" +
296+
"\t\t\t\t\t\"before\": \"\",\n" +
297+
"\t\t\t\t\t\"after\": \"\"\n" +
298+
"\t\t\t\t}\n" +
299+
"\t\t\t]\n" +
300+
"\t\t}\n" +
301+
"\t]\n" +
302+
"}\n");
303+
304+
Assert.assertEquals(expected, actual);
305+
}
306+
192307
}

0 commit comments

Comments
 (0)