Fix broken codepoint offsets in JSON output

AngledLuffa · AngledLuffa · commit e0f6185addda · 2022-05-26T00:21:37.000-07:00
diff --git a/src/edu/stanford/nlp/pipeline/JSONOutputter.java b/src/edu/stanford/nlp/pipeline/JSONOutputter.java
@@ -188,8 +188,8 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
               l3.set("characterOffsetEnd", token.endPosition());
               if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
                   token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
-                l3.set("codepointOffsetBegin", token.beginPosition());
-                l3.set("codepointOffsetEnd", token.endPosition());
+                l3.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
+                l3.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
               }
               l3.set("pos", token.tag());
               l3.set("ner", token.ner());
@@ -216,6 +216,11 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
                 l2.set("originalText", token.originalText());
                 l2.set("characterOffsetBegin", token.beginPosition());
                 l2.set("characterOffsetEnd", token.endPosition());
+                if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
+                    token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
+                  l2.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
+                  l2.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
+                }
           }));
         }
       }
diff --git a/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java b/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java
@@ -92,6 +92,7 @@ public void testComplexJSON() {
   // BEGIN TESTS FOR ANNOTATION WRITING
   // -----
 
+
   @Test
   public void testSimpleDocument() throws IOException {
     Annotation ann = new Annotation("JSON is neat. Better than XML.");
@@ -189,4 +190,118 @@ public void testSimpleDocument() throws IOException {
     Assert.assertEquals(expected, actual);
   }
 
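+  /** Checks that JSON output reports codepoint offsets distinct from character offsets once the text contains a surrogate-pair emoji; largely mirrors testSimpleDocument and could be refactored to share the expected-JSON scaffolding. */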
+  @Test
+  public void testCodepointDocument() throws IOException {
+    Annotation ann = new Annotation("JSON is neat. Better than 😺.");
+    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.asProperties("annotators", "tokenize", "tokenize.codepoint", "true"));
+    pipeline.annotate(ann);
+    String actual = new JSONOutputter().print(ann);
+    String expected = indent(
+        "{\n" +
+        "\t\"sentences\": [\n" +
+        "\t\t{\n" +
+        "\t\t\t\"index\": 0,\n" +
+        "\t\t\t\"tokens\": [\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 1,\n" +
+        "\t\t\t\t\t\"word\": \"JSON\",\n" +
+        "\t\t\t\t\t\"originalText\": \"JSON\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 0,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 4,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 0,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 4,\n" +
+        "\t\t\t\t\t\"before\": \"\",\n" +
+        "\t\t\t\t\t\"after\": \" \"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 2,\n" +
+        "\t\t\t\t\t\"word\": \"is\",\n" +
+        "\t\t\t\t\t\"originalText\": \"is\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 5,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 7,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 5,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 7,\n" +
+        "\t\t\t\t\t\"before\": \" \",\n" +
+        "\t\t\t\t\t\"after\": \" \"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 3,\n" +
+        "\t\t\t\t\t\"word\": \"neat\",\n" +
+        "\t\t\t\t\t\"originalText\": \"neat\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 8,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 12,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 8,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 12,\n" +
+        "\t\t\t\t\t\"before\": \" \",\n" +
+        "\t\t\t\t\t\"after\": \"\"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 4,\n" +
+        "\t\t\t\t\t\"word\": \".\",\n" +
+        "\t\t\t\t\t\"originalText\": \".\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 12,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 13,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 12,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 13,\n" +
+        "\t\t\t\t\t\"before\": \"\",\n" +
+        "\t\t\t\t\t\"after\": \" \"\n" +
+        "\t\t\t\t}\n" +
+        "\t\t\t]\n" +
+        "\t\t},\n" +
+        "\t\t{\n" +
+        "\t\t\t\"index\": 1,\n" +
+        "\t\t\t\"tokens\": [\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 1,\n" +
+        "\t\t\t\t\t\"word\": \"Better\",\n" +
+        "\t\t\t\t\t\"originalText\": \"Better\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 14,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 20,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 14,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 20,\n" +
+        "\t\t\t\t\t\"before\": \" \",\n" +
+        "\t\t\t\t\t\"after\": \" \"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 2,\n" +
+        "\t\t\t\t\t\"word\": \"than\",\n" +
+        "\t\t\t\t\t\"originalText\": \"than\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 21,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 25,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 21,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 25,\n" +
+        "\t\t\t\t\t\"before\": \" \",\n" +
+        "\t\t\t\t\t\"after\": \" \"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 3,\n" +
+        "\t\t\t\t\t\"word\": \"😺\",\n" +
+        "\t\t\t\t\t\"originalText\": \"😺\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 26,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 28,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 26,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 27,\n" +
+        "\t\t\t\t\t\"before\": \" \",\n" +
+        "\t\t\t\t\t\"after\": \"\"\n" +
+        "\t\t\t\t},\n" +
+        "\t\t\t\t{\n" +
+        "\t\t\t\t\t\"index\": 4,\n" +
+        "\t\t\t\t\t\"word\": \".\",\n" +
+        "\t\t\t\t\t\"originalText\": \".\",\n" +
+        "\t\t\t\t\t\"characterOffsetBegin\": 28,\n" +
+        "\t\t\t\t\t\"characterOffsetEnd\": 29,\n" +
+        "\t\t\t\t\t\"codepointOffsetBegin\": 27,\n" +
+        "\t\t\t\t\t\"codepointOffsetEnd\": 28,\n" +
+        "\t\t\t\t\t\"before\": \"\",\n" +
+        "\t\t\t\t\t\"after\": \"\"\n" +
+        "\t\t\t\t}\n" +
+        "\t\t\t]\n" +
+        "\t\t}\n" +
+        "\t]\n" +
+        "}\n");
+
+    Assert.assertEquals(expected, actual);
+  }
+
 }