Skip to content

Commit e1de675

Browse files
J38Stanford NLP
authored and committed
update version
1 parent 40989cb commit e1de675

File tree

9 files changed

+259
-16953
lines changed

9 files changed

+259
-16953
lines changed

doc/corenlp/pom-full.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
</license>
1515
</licenses>
1616
<scm>
17-
<url>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</url>
18-
<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</connection>
17+
<url>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-07-31.zip</url>
18+
<connection>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-07-31.zip</connection>
1919
</scm>
2020
<developers>
2121
<developer>

doc/releasenotes/v4.1.0/corenlp.out

Lines changed: 0 additions & 16938 deletions
This file was deleted.
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package edu.stanford.nlp.international.german.process;
2+
3+
import static org.junit.Assert.assertEquals;
4+
5+
import java.util.Arrays;
6+
import java.util.List;
7+
import java.util.Properties;
8+
import java.util.stream.Collectors;
9+
10+
import org.junit.Before;
11+
import org.junit.Test;
12+
13+
import edu.stanford.nlp.ling.CoreAnnotations;
14+
import edu.stanford.nlp.ling.CoreLabel;
15+
import edu.stanford.nlp.pipeline.CoreDocument;
16+
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
17+
import edu.stanford.nlp.util.StringUtils;
18+
19+
/**
20+
* Testing for the module that alters tokenization during German NER.
21+
* German tokenization matches the CoNLL 2018 standard, but during
22+
* NER we want to use a tokenization that doesn't split on hyphens for
23+
* improved F1. The GermanNERCoreLabelProcessor merges tokens originally
24+
* split on hyphen, passes that token list to the CRFClassifier, and then
25+
* creates a final tokens list with the correct NER-related annotations.
26+
*/
27+
28+
public class GermanNERCoreLabelProcessorITest {
29+
30+
public StanfordCoreNLP pipeline;
31+
public GermanNERCoreLabelProcessor germanNERCoreLabelProcessor;
32+
33+
@Before
34+
public void setUp() {
35+
Properties props = StringUtils.argsToProperties("-props", "german");
36+
props.setProperty("annotators", "tokenize,ssplit");
37+
pipeline = new StanfordCoreNLP(props);
38+
germanNERCoreLabelProcessor = new GermanNERCoreLabelProcessor();
39+
}
40+
41+
public void runTestProcessorExampleNoTags(String text, List<String> goldProcessedTokens) {
42+
runTestProcessorExample(text, goldProcessedTokens, null, null);
43+
}
44+
45+
public void runTestProcessorExample(String text, List<String> goldProcessedTokens, List<String> tagsToApply,
46+
List<String> finalTags) {
47+
// tokenize and sentence split
48+
CoreDocument document = new CoreDocument(pipeline.process(text));
49+
// test processed tokens match expected tokenization pattern
50+
System.err.println("---");
51+
System.err.println("original tokens: "+document.sentences().get(0).tokensAsStrings());
52+
List<CoreLabel> processedTokens = germanNERCoreLabelProcessor.process(document.sentences().get(0).tokens());
53+
System.err.println("processed tokens: "+processedTokens.stream().map(
54+
tok -> tok.word()).collect(Collectors.toList()));
55+
assertEquals(goldProcessedTokens, processedTokens.stream().map(tok -> tok.word()).collect(Collectors.toList()));
56+
// if there are tags to apply, apply them
57+
if (tagsToApply != null) {
58+
for (int i = 0 ; i < tagsToApply.size() ; i++) {
59+
processedTokens.get(i).setNER(tagsToApply.get(i));
60+
}
61+
}
62+
List<CoreLabel> restoredTokens = germanNERCoreLabelProcessor.restore(document.tokens(), processedTokens);
63+
// test if tags were set correctly for the restored list
64+
if (finalTags != null) {
65+
List<String> restoredTags = restoredTokens.stream().map(tok -> tok.ner()).collect(Collectors.toList());
66+
assertEquals(finalTags, restoredTags);
67+
for (int i = 0 ; i < finalTags.size() ; i++) {
68+
restoredTokens.get(i).remove(CoreAnnotations.NamedEntityTagAnnotation.class);
69+
}
70+
}
71+
// test that restored tokens match the original tokenization
72+
System.err.println("restored tokens :"+restoredTokens.stream().map(tok -> tok.word()).collect(Collectors.toList()));
73+
assertEquals(document.tokens(), restoredTokens);
74+
}
75+
76+
@Test
77+
public void testProcessor() {
78+
// basic example
79+
// should split "Microsoft-Aktie" into "Microsoft", "-", and "Aktie", merge during NER, then restore to split
80+
String basicExample = "Die Microsoft-Aktie sank daraufhin an der Wall Street um über vier Dollar auf 89,87 Dollar.";
81+
List<String> basicGoldTokens = Arrays.asList("Die", "Microsoft-Aktie", "sank", "daraufhin", "an", "der",
82+
"Wall", "Street", "um", "über", "vier", "Dollar", "auf", "89,87", "Dollar", ".");
83+
List<String> basicTagsAppliedToProcessed = Arrays.asList("O", "MISC", "O", "O", "O", "O", "LOC", "LOC", "O", "O",
84+
"O", "O", "O", "O", "O", "O");
85+
List<String> basicFinalTags = Arrays.asList("O", "MISC", "MISC", "MISC", "O", "O", "O", "O", "LOC", "LOC", "O", "O", "O",
86+
"O", "O", "O", "O", "O");
87+
runTestProcessorExample(basicExample, basicGoldTokens, basicTagsAppliedToProcessed, basicFinalTags);
88+
// basic example with multiple instances
89+
// split both "Mathematik-Lehrpläne" and "Nordrhein-Westfalen"
90+
String multiBasicExample = "Es sei wichtig, daß sich ein breiter Kreis mit den von Heymann vorgelegten Thesen " +
91+
"beschäftigt, meint Ringel, denn immerhin sei dieser offizieller Berater bei der Erarbeitung neuer " +
92+
"Mathematik-Lehrpläne in Nordrhein-Westfalen.";
93+
List<String> multiBasicGoldTokens = Arrays.asList("Es", "sei", "wichtig", ",", "daß", "sich", "ein", "breiter",
94+
"Kreis", "mit", "den", "von", "Heymann", "vorgelegten", "Thesen", "beschäftigt", ",", "meint", "Ringel", ",",
95+
"denn", "immerhin", "sei", "dieser", "offizieller", "Berater", "bei", "der", "Erarbeitung", "neuer",
96+
"Mathematik-Lehrpläne", "in", "Nordrhein-Westfalen", ".");
97+
List<String> multiBasicTagsAppliedToProcessed = Arrays.asList("O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
98+
"O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "MISC", "O",
99+
"LOC", "O");
100+
List<String> multiBasicFinalTags = Arrays.asList("O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
101+
"O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "MISC",
102+
"MISC", "MISC", "O", "LOC", "LOC", "LOC", "O");
103+
runTestProcessorExample(multiBasicExample, multiBasicGoldTokens, multiBasicTagsAppliedToProcessed,
104+
multiBasicFinalTags);
105+
// not just letters example
106+
String notJustLettersExample = "Darüber hinaus gibt es Adobe - Trainings, Firmen-/Inhouse-Seminare sowie Coaching " +
107+
"für 1-2 Personen.";
108+
// triple example
109+
String tripleHyphenatedExample = "IG-Metall-Chef Klaus Zwickel hat in Mannheim überaus deutlich an diese Partei, " +
110+
"die auch die seine ist, als Verbündete appelliert.";
111+
// original tokenization throughout
112+
String originalHyphenSplitExample = "Mit dem Carsch - Haus verfügt der Stadtteil über ein Luxuskaufhaus.";
113+
// original tokenization throughout
114+
String wordEndingWithHyphenExample = "Rund 30 000 Tier- und Pflanzenarten sind weltweit in ihrer Existenz bedroht.";
115+
// original tokenization throughout
116+
String doubleHyphenExample = "Er will Ende November in Washington mit der US -- Regierung über den Friedensprozeß " +
117+
"sprechen.";
118+
}
119+
120+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package edu.stanford.nlp.pipeline;
2+
3+
import edu.stanford.nlp.international.german.process.GermanNERCoreLabelProcessor;
4+
import edu.stanford.nlp.ling.*;
5+
6+
import java.util.*;
7+
8+
/**
9+
* An Annotator that matches tokenization when German text is sent to CRFClassifier.
10+
* Note, this annotator just alters the document tokens, and is only meant for a
11+
* benchmarking test. This won't alter the list of sentence tokens.
12+
*/
13+
14+
public class GermanNERTokenizerAnnotator implements Annotator {
15+
16+
public GermanNERCoreLabelProcessor germanCoreLabelProcessor = new GermanNERCoreLabelProcessor();
17+
18+
@Override
19+
public void annotate(Annotation ann) {
20+
ann.set(CoreAnnotations.TokensAnnotation.class,
21+
germanCoreLabelProcessor.process(ann.get(CoreAnnotations.TokensAnnotation.class)));
22+
}
23+
24+
@Override
25+
public Set<Class<? extends CoreAnnotation>> requires() {
26+
return Collections.emptySet();
27+
}
28+
29+
@Override
30+
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
31+
return Collections.emptySet();
32+
}
33+
34+
35+
36+
}

pom-java-11.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<modelVersion>4.0.0</modelVersion>
33
<groupId>edu.stanford.nlp</groupId>
44
<artifactId>stanford-corenlp</artifactId>
5-
<version>4.1.0</version>
5+
<version>4.0.0</version>
66
<packaging>jar</packaging>
77
<name>Stanford CoreNLP</name>
88
<description>Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.</description>
@@ -14,8 +14,8 @@
1414
</license>
1515
</licenses>
1616
<scm>
17-
<url>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</url>
18-
<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</connection>
17+
<url>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-04-16.zip</url>
18+
<connection>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-04-16.zip</connection>
1919
</scm>
2020
<developers>
2121
<developer>
@@ -193,7 +193,7 @@
193193
<configuration>
194194
<artifacts>
195195
<artifact>
196-
<file>${project.basedir}/stanford-corenlp-4.1.0-models.jar</file>
196+
<file>${project.basedir}/stanford-corenlp-4.0.0-models.jar</file>
197197
<type>jar</type>
198198
<classifier>models</classifier>
199199
</artifact>

pom.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<modelVersion>4.0.0</modelVersion>
33
<groupId>edu.stanford.nlp</groupId>
44
<artifactId>stanford-corenlp</artifactId>
5-
<version>4.1.0</version>
5+
<version>4.0.0</version>
66
<packaging>jar</packaging>
77
<name>Stanford CoreNLP</name>
88
<description>Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.</description>
@@ -14,8 +14,8 @@
1414
</license>
1515
</licenses>
1616
<scm>
17-
<url>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</url>
18-
<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.1.0.zip</connection>
17+
<url>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-04-16.zip</url>
18+
<connection>https://nlp.stanford.edu/software/stanford-corenlp-full-2020-04-16.zip</connection>
1919
</scm>
2020
<developers>
2121
<developer>
@@ -195,7 +195,7 @@
195195
<configuration>
196196
<artifacts>
197197
<artifact>
198-
<file>${project.basedir}/stanford-corenlp-4.1.0-models.jar</file>
198+
<file>${project.basedir}/stanford-corenlp-4.0.0-models.jar</file>
199199
<type>jar</type>
200200
<classifier>models</classifier>
201201
</artifact>

src/edu/stanford/nlp/ie/util/RelationTriple.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package edu.stanford.nlp.ie.util;
22

3-
import java.io.Serializable;
43
import java.text.DecimalFormat;
54
import java.util.*;
65
import java.util.function.ToIntFunction;
@@ -23,9 +22,7 @@
2322
* @author Gabor Angeli
2423
*/
2524
@SuppressWarnings("UnusedDeclaration")
26-
public class RelationTriple implements Comparable<RelationTriple>, Iterable<CoreLabel>, Serializable {
27-
28-
private static final long serialVersionUID = 43758623469716523L;
25+
public class RelationTriple implements Comparable<RelationTriple>, Iterable<CoreLabel> {
2926

3027
/** The subject (first argument) of this triple */
3128
public final List<CoreLabel> subject;
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package edu.stanford.nlp.international.german.process;
2+
3+
import edu.stanford.nlp.ling.*;
4+
import edu.stanford.nlp.process.*;
5+
6+
import java.util.*;
7+
import java.util.function.*;
8+
9+
10+
/**
11+
* Class for mapping CoNLL 2018 tokenized German text to German NER tokenization
12+
* and vice versa. The CoNLL 2018 German tokenization splits words such as
13+
* "CDU-Parlamentarier" into "CDU", "-", and "Parlamentarier". This causes
14+
* a performance drop for German NER of several F1 points, so this module will
15+
* facilitate retokenizing CoNLL 2018 tokenized text to match our internal German
16+
* NER training data which does not split on hyphens. Post classification this
17+
* module can restore the tokenization to the CoNLL 2018 style.
18+
*/
19+
20+
public class GermanNERCoreLabelProcessor extends CoreLabelProcessor {
21+
22+
/** Check that after() is not null and the empty string **/
23+
public Function<CoreLabel, Boolean> afterIsEmpty = tok ->
24+
tok.containsKey(CoreAnnotations.AfterAnnotation.class) && tok.after().equals("");
25+
26+
/**
27+
* merge the contents of two tokens
28+
**/
29+
public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
30+
// NOTE: right now the merged tokens get the part-of-speech tag of the first token
31+
token.setWord(token.word() + nextToken.word());
32+
token.setAfter(nextToken.after());
33+
token.setEndPosition(nextToken.endPosition());
34+
token.setValue(token.word()+"-"+token.sentIndex());
35+
}
36+
37+
@Override
38+
public List<CoreLabel> process(List<CoreLabel> tokens) {
39+
List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
40+
for (CoreLabel currToken : tokens) {
41+
CoreLabel processedToken = new CoreLabel(currToken);
42+
CoreLabel lastProcessedToken =
43+
processedTokens.size() > 0 ? processedTokens.get(processedTokens.size() - 1) : null;
44+
if (lastProcessedToken != null && afterIsEmpty.apply(lastProcessedToken) && currToken.word().equals("-")) {
45+
mergeTokens(lastProcessedToken, currToken);
46+
} else if (lastProcessedToken != null && lastProcessedToken.word().endsWith("-") &&
47+
afterIsEmpty.apply(lastProcessedToken)) {
48+
mergeTokens(lastProcessedToken, currToken);
49+
} else {
50+
processedTokens.add(processedToken);
51+
}
52+
}
53+
return processedTokens;
54+
}
55+
56+
@Override
57+
public List<CoreLabel> restore(List<CoreLabel> originalTokens, List<CoreLabel> processedTokens) {
58+
List<CoreLabel> restoredTokens = new ArrayList<>();
59+
for (int i = 0, j = 0 ; i < processedTokens.size() ; i++) {
60+
// for each processed token, loop through the 1 or more original tokens
61+
// that correspond to the merged token
62+
CoreLabel processedToken = processedTokens.get(i);
63+
while (j < originalTokens.size()) {
64+
CoreLabel originalToken = originalTokens.get(j);
65+
if (originalToken.beginPosition() >= processedToken.endPosition())
66+
break;
67+
// copy most info from processed token (such as NER tag)
68+
CoreLabel restoredToken = new CoreLabel(processedToken);
69+
// copy text and character info from original token
70+
restoredToken.setWord(originalToken.word());
71+
restoredToken.setOriginalText(originalToken.originalText());
72+
restoredToken.setBeginPosition(originalToken.beginPosition());
73+
restoredToken.setEndPosition(originalToken.endPosition());
74+
restoredToken.set(CoreAnnotations.TokenBeginAnnotation.class, originalToken.get(
75+
CoreAnnotations.TokenBeginAnnotation.class));
76+
restoredToken.set(CoreAnnotations.TokenEndAnnotation.class, originalToken.get(
77+
CoreAnnotations.TokenEndAnnotation.class));
78+
restoredToken.setAfter(originalToken.after());
79+
restoredToken.setBefore(originalToken.before());
80+
restoredToken.setIndex(j+1);
81+
restoredToken.setValue(restoredToken.word());
82+
// add restored token to list
83+
restoredTokens.add(restoredToken);
84+
// move on to next original token
85+
j++;
86+
}
87+
}
88+
return restoredTokens;
89+
}
90+
91+
}

src/edu/stanford/nlp/pipeline/demo/corenlp-brat.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
<nav class="navbar navbar-default navbar-static-top">
2929
<div class="container">
3030
<div class="navbar-header">
31-
<a class="navbar-brand" href="https://stanfordnlp.github.io/CoreNLP/">Stanford CoreNLP 4.0.0 (updated 2020-04-16)</a>
31+
<a class="navbar-brand" href="https://stanfordnlp.github.io/CoreNLP/">Stanford CoreNLP 4.1.0 (updated 2020-07-31)</a>
3232
</div>
3333
</div>
3434
</nav>

0 commit comments

Comments
 (0)