Commit b3bee86

Merge pull request #1084 from stanfordnlp/angel-coref
Angel's coref branch
2 parents 5b62b0d + be96b61 commit b3bee86

27 files changed (+2632, -787 lines)

src/edu/stanford/nlp/coref/CorefAlgorithm.java

Lines changed: 13 additions & 0 deletions
@@ -10,6 +10,7 @@
 import edu.stanford.nlp.coref.neural.NeuralCorefAlgorithm;
 import edu.stanford.nlp.coref.statistical.ClusteringCorefAlgorithm;
 import edu.stanford.nlp.coref.statistical.StatisticalCorefAlgorithm;
+import edu.stanford.nlp.util.PropertiesUtils;
 
 /**
  * A CorefAlgorithms make coreference decisions on the provided {@link Document} after
@@ -31,6 +32,18 @@ static CorefAlgorithm fromProps(Properties props, Dictionaries dictionaries) {
       return new NeuralCorefAlgorithm(props, dictionaries);
     } else if (algorithm == CorefAlgorithmType.FASTNEURAL) {
       return new FastNeuralCorefAlgorithm(props, dictionaries);
+    } else if (algorithm == CorefAlgorithmType.CUSTOM) {
+      String classname = PropertiesUtils.getString(props, "coref.algorithm.class", null);
+      try {
+        if (classname != null) {
+          Class clazz = Class.forName(classname);
+          return (CorefAlgorithm) clazz.getConstructor(Properties.class, Dictionaries.class).newInstance(props, dictionaries);
+        } else {
+          throw new RuntimeException("Please specify coref.algorithm.class");
+        }
+      } catch (Exception e) {
+        throw new RuntimeException("Error creating custom coref system", e);
+      }
     } else {
       try {
         return new HybridCorefSystem(props, dictionaries);

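Note on usage: the new CUSTOM branch lets a coreference algorithm be plugged in by fully-qualified class name. A minimal sketch of such a plug-in, assuming the CorefAlgorithm interface exposes a single runCoref(Document) method (the package and class name below are hypothetical):

// Hypothetical plug-in; fromProps looks up the (Properties, Dictionaries) constructor reflectively.
package com.example.coref;

import java.util.Properties;
import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;

public class NoOpCorefAlgorithm implements CorefAlgorithm {

  public NoOpCorefAlgorithm(Properties props, Dictionaries dictionaries) {
    // read any custom properties here
  }

  @Override
  public void runCoref(Document document) {
    // make no coreference decisions; document.corefClusters is left untouched
  }
}

Selecting it should then only require two properties: coref.algorithm = custom and coref.algorithm.class = com.example.coref.NoOpCorefAlgorithm.
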
src/edu/stanford/nlp/coref/CorefPrinter.java

Lines changed: 18 additions & 9 deletions
@@ -6,6 +6,7 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 
+import edu.stanford.nlp.coref.data.CorefCluster;
 import edu.stanford.nlp.coref.data.Document;
 import edu.stanford.nlp.coref.data.Mention;
 import edu.stanford.nlp.ling.CoreAnnotations;
@@ -24,12 +25,18 @@ public static String printConllOutput(Document document, boolean gold) {
     return printConllOutput(document, gold, false);
   }
 
-  public static String printConllOutput(Document document, boolean gold, boolean filterSingletons) {
+  public static String printConllOutput(Document document, boolean gold, boolean filterSingletons)
+  {
+    return printConllOutput(document, gold, filterSingletons, document.corefClusters);
+  }
+
+  public static String printConllOutput(Document document, boolean gold, boolean filterSingletons,
+                                        Map<Integer, CorefCluster> corefClusters) {
     List<List<Mention>> orderedMentions = gold ? document.goldMentions : document.predictedMentions;
     if (filterSingletons) {
       orderedMentions = orderedMentions.stream().map(
-          ml -> ml.stream().filter(m -> document.corefClusters.get(m.corefClusterID) != null &&
-              document.corefClusters.get(m.corefClusterID).size() > 1)
+          ml -> ml.stream().filter(m -> corefClusters.get(m.corefClusterID) != null &&
+              corefClusters.get(m.corefClusterID).size() > 1)
          .collect(Collectors.toList()))
          .collect(Collectors.toList());
     }
@@ -39,14 +46,14 @@ public static String printConllOutput(Document document, boolean gold, boolean f
   public static String printConllOutput(Document document,
       List<List<Mention>> orderedMentions, boolean gold) {
     Annotation anno = document.annotation;
-    List<List<String[]>> conllDocSentences = document.conllDoc.sentenceWordLists;
+    List<List<String[]>> conllDocSentences = document.getSentenceWordLists();
     String docID = anno.get(CoreAnnotations.DocIDAnnotation.class);
     StringBuilder sb = new StringBuilder();
     sb.append("#begin document ").append(docID).append("\n");
     List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
     for(int sentNum = 0 ; sentNum < sentences.size() ; sentNum++){
       List<CoreLabel> sentence = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
-      List<String[]> conllSentence = conllDocSentences.get(sentNum);
+      List<String[]> conllSentence = (conllDocSentences != null)? conllDocSentences.get(sentNum) : null;
       Map<Integer,Set<Mention>> mentionBeginOnly = Generics.newHashMap();
       Map<Integer,Set<Mention>> mentionEndOnly = Generics.newHashMap();
       Map<Integer,Set<Mention>> mentionBeginEnd = Generics.newHashMap();
@@ -91,10 +98,12 @@ public static String printConllOutput(Document document,
       }
       if(sb2.length() == 0) sb2.append("-");
 
-      String[] columns = conllSentence.get(i);
-      for(int j = 0 ; j < columns.length-1 ; j++){
-        String column = columns[j];
-        sb.append(column).append("\t");
+      if (conllSentence != null) {
+        String[] columns = conllSentence.get(i);
+        for (int j = 0; j < columns.length - 1; j++) {
+          String column = columns[j];
+          sb.append(column).append("\t");
+        }
       }
       sb.append(sb2).append("\n");
     }

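Because printConllOutput can now take an explicit cluster map, output can be restricted to a subset of a document's clusters. A small sketch, assuming an already-annotated Document (the helper class below is hypothetical; field and method names are the ones used in the diff above):

import java.util.Map;
import java.util.stream.Collectors;

import edu.stanford.nlp.coref.CorefPrinter;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Document;

final class FilteredConllPrinter {
  /** CoNLL output limited to clusters with at least minSize mentions. */
  static String printLargeClusters(Document document, int minSize) {
    Map<Integer, CorefCluster> kept = document.corefClusters.entrySet().stream()
        .filter(e -> e.getValue().size() >= minSize)
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    return CorefPrinter.printConllOutput(document, false, true, kept);
  }
}
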
src/edu/stanford/nlp/coref/CorefProperties.java

Lines changed: 30 additions & 2 deletions
@@ -1,11 +1,23 @@
 package edu.stanford.nlp.coref;
 
+import java.util.List;
 import java.util.Locale;
 import java.util.Properties;
-
+import java.util.Set;
+import java.util.function.Predicate;
+
+import edu.stanford.nlp.coref.data.CorefChain;
+import edu.stanford.nlp.coref.data.CorefCluster;
+import edu.stanford.nlp.coref.data.Mention;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.trees.HeadFinder;
 import edu.stanford.nlp.trees.SemanticHeadFinder;
 import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder;
+import edu.stanford.nlp.util.CollectionUtils;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.Pair;
 import edu.stanford.nlp.util.PropertiesUtils;
 
 /**
@@ -20,7 +32,7 @@ private CorefProperties() {} // static methods
 
   //---------- Coreference Algorithms ----------
 
-  public enum CorefAlgorithmType {CLUSTERING, STATISTICAL, NEURAL, FASTNEURAL, HYBRID}
+  public enum CorefAlgorithmType {CLUSTERING, STATISTICAL, NEURAL, FASTNEURAL, HYBRID, CUSTOM}
 
   public static CorefAlgorithmType algorithm(Properties props) {
     String type = PropertiesUtils.getString(props, "coref.algorithm",
@@ -56,6 +68,10 @@ public static boolean removeSingletonClusters(Properties props) {
     return PropertiesUtils.getBool(props, "coref.removeSingletonClusters", true);
   }
 
+  public static boolean removeXmlMentions(Properties props) {
+    return PropertiesUtils.getBool(props, "coref.removeXmlMentions", false);
+  }
+
   // ---------- Heuristic Mention Filtering ----------
 
   public static int maxMentionDistance(Properties props) {
@@ -183,4 +199,16 @@ public static HeadFinder getHeadFinder(Properties props) {
     }
   }
 
+  public static Predicate<Pair<CorefChain.CorefMention, List<CoreLabel>>> getCorefMentionFilter(Properties props) {
+    String filterCorefChain = props.getProperty("coref.evaluate.filter");
+    if (filterCorefChain != null) {
+      if ("filterCustomerAbstractPronouns".equals(filterCorefChain)) {
+        return CorefUtils.filterCustomerAbstractPronouns;
+      } else {
+        throw new RuntimeException("Cannot create coref.evaluate.filter " + filterCorefChain);
+      }
+    } else {
+      return null;
+    }
+  }
 }

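A short sketch of how the two new knobs might be exercised (CorefUtils.filterCustomerAbstractPronouns is presumably added to CorefUtils elsewhere in this changeset; the demo class itself is hypothetical):

import java.util.List;
import java.util.Properties;
import java.util.function.Predicate;

import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.Pair;

public class CorefPropsDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("coref.removeXmlMentions", "true");
    props.setProperty("coref.evaluate.filter", "filterCustomerAbstractPronouns");

    // true here; defaults to false when the property is absent
    System.out.println(CorefProperties.removeXmlMentions(props));

    // null when coref.evaluate.filter is unset; throws on an unrecognized filter name
    Predicate<Pair<CorefChain.CorefMention, List<CoreLabel>>> filter =
        CorefProperties.getCorefMentionFilter(props);
    System.out.println(filter != null);
  }
}
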
src/edu/stanford/nlp/coref/CorefRules.java

Lines changed: 67 additions & 2 deletions
@@ -103,6 +103,19 @@ public static boolean entityPersonDisagree(Document document, CorefCluster menti
     if(disagree) return true;
     else return false;
   }
+  public static boolean entityPersonCompatible(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict){
+    boolean disagree = false;
+    for(Mention m : mentionCluster.getCorefMentions()) {
+      for(Mention ant : potentialAntecedent.getCorefMentions()) {
+        if(!entityPersonCompatible(document, m, ant, dict)) {
+          disagree = true;
+          break;
+        }
+      }
+    }
+    if(disagree) return false;
+    else return true;
+  }
 
   private static final List<String> entityWordsToExclude =
       Arrays.asList(new String[]{ "the","this", "mr.", "miss", "mrs.", "dr.", "ms.", "inc.", "ltd.", "corp.", "'s"});
@@ -744,8 +757,12 @@ public static boolean entityPersonDisagree(Document document, Mention m, Mention
       if ((m.person == Person.IT && ant.person == Person.THEY)
          || (m.person == Person.THEY && ant.person == Person.IT) || (m.person == Person.THEY && ant.person == Person.THEY)) {
         return false;
-      } else if (m.person != Person.UNKNOWN && ant.person != Person.UNKNOWN)
+      } else if (m.person != Person.UNKNOWN && ant.person != Person.UNKNOWN) {
         return true;
+      } else if (((m.person == Person.I || m.person == Person.YOU) && (dict.determiners.contains(ant.spanToString()))) ||
+          ((ant.person == Person.I || ant.person == Person.YOU) && (dict.determiners.contains(m.spanToString())))) {
+        return true;
+      }
     }
     if(sameSpeaker) {
       if(!ant.isPronominal()) {
@@ -788,6 +805,42 @@ public static boolean entityPersonDisagree(Document document, Mention m, Mention
     return false;
   }
 
+  public static boolean entityPersonCompatible(Document document, Mention m, Mention ant, Dictionaries dict) {
+    // Returns if the entity person is compatible based on the speaker
+    boolean sameSpeaker = entitySameSpeaker(document, m, ant);
+
+    if (sameSpeaker && m.person!=ant.person) {
+      if ((m.person == Person.IT && ant.person == Person.THEY)
+         || (m.person == Person.THEY && ant.person == Person.IT) || (m.person == Person.THEY && ant.person == Person.THEY)) {
+        return true;
+      } else if (m.person != Person.UNKNOWN && ant.person != Person.UNKNOWN) {
+        return false;
+      } else if (((m.person == Person.I || m.person == Person.YOU) && (dict.determiners.contains(ant.spanToString().toLowerCase()))) ||
+          ((ant.person == Person.I || ant.person == Person.YOU) && (dict.determiners.contains(m.spanToString().toLowerCase())))) {
+        return false;
+      }
+    }
+    if(sameSpeaker) {
+      if(!ant.isPronominal()) {
+        if(m.person==Person.I || m.person==Person.WE || m.person==Person.YOU) return false;
+      } else if(!m.isPronominal()) {
+        if(ant.person==Person.I || ant.person==Person.WE || ant.person==Person.YOU) return false;
+      }
+    }
+    boolean differentSpeaker = entityDifferentSpeaker(document, m, ant);
+    if (differentSpeaker) {
+      if (ant.person == Person.I && m.person == Person.I) {
+        return false;
+      }
+      if (document.numberOfSpeakers() == 2) {
+        if (ant.person == Person.YOU && m.person == Person.YOU) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
   /** Do the mentions share the same speaker? */
   public static boolean entitySameSpeaker(Document document, Mention m, Mention ant) {
     String mSpeakerStr = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
@@ -814,6 +867,19 @@ public static boolean entitySameSpeaker(Document document, Mention m, Mention an
     }
   }
 
+  public static boolean entityDifferentSpeaker(Document document, Mention m, Mention ant) {
+    String mSpeakerStr = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
+    if (mSpeakerStr == null) {
+      return false;
+    }
+    String antSpeakerStr = ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
+    if (antSpeakerStr == null) {
+      return false;
+    }
+
+    return !entitySameSpeaker(document, m, ant);
+  }
+
   /**
    * Given the name of a speaker, returns the coref cluster id it belongs to (-1 if no cluster)
    * @param document The document to search in
@@ -981,5 +1047,4 @@ private static boolean isContextOverlapping(Mention m1, Mention m2) {
     return Sets.intersects(context1, context2);
   }
 
-
 }

src/edu/stanford/nlp/coref/CorefScorer.java

Lines changed: 4 additions & 1 deletion
@@ -49,6 +49,7 @@ public static void printScoreSummary(String summary, Logger logger, boolean afte
       for(String line : lines) {
         if(line.startsWith("Identification of Mentions")) {
           Redwood.log(line);
+          logger.info(line);
           return;
         }
       }
@@ -61,6 +62,7 @@ public static void printScoreSummary(String summary, Logger logger, boolean afte
         }
       }
       Redwood.log(sb.toString());
+      logger.info(sb.toString());
     }
   }
 
@@ -76,10 +78,11 @@ public static double getFinalConllScore(String summary) {
     return finalScore;
   }
 
-  public static void printFinalConllScore(String summary) {
+  public static void printFinalConllScore(String summary, Logger logger) {
    double finalScore = getFinalConllScore(summary);
    Redwood.log(
        "Final conll score ((muc+bcub+ceafe)/3) = " + (new DecimalFormat("#.##")).format(finalScore));
+   logger.info("Final conll score ((muc+bcub+ceafe)/3) = " + (new DecimalFormat("#.##")).format(finalScore));
   }
 
   public static double getFinalConllScoreFromOutputDir(String corefOutputDir, String scorerPath) {

src/edu/stanford/nlp/coref/CorefSystem.java

Lines changed: 44 additions & 5 deletions
@@ -1,11 +1,16 @@
 package edu.stanford.nlp.coref;
 
+import java.io.File;
 import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.Calendar;
 import java.util.Map;
 import java.util.Properties;
+import java.util.logging.FileHandler;
+import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.stream.Collectors;
 
 import edu.stanford.nlp.coref.data.CorefChain;
 import edu.stanford.nlp.coref.data.CorefCluster;
@@ -15,6 +20,7 @@
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.util.Generics;
 import edu.stanford.nlp.util.StringUtils;
+import edu.stanford.nlp.util.logging.NewlineLogFormatter;
 import edu.stanford.nlp.util.logging.Redwood;
 
 /**
@@ -69,16 +75,36 @@ public void annotate(Annotation ann) {
     ann.set(CorefCoreAnnotations.CorefChainAnnotation.class, result);
   }
 
+  public void initLogger(Logger logger, String logFileName) {
+    try {
+      FileHandler fh = new FileHandler(logFileName, false);
+      logger.addHandler(fh);
+      logger.setLevel(Level.FINE);
+      fh.setFormatter(new NewlineLogFormatter());
+    } catch (SecurityException | IOException e) {
+      throw new RuntimeException("Cannot initialize logger!", e);
+    }
+  }
+
   public void runOnConll(Properties props) throws Exception {
-    String baseName = CorefProperties.conllOutputPath(props) +
-        Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
+    File f = new File(CorefProperties.conllOutputPath(props));
+    if (! f.exists()) {
+      f.mkdirs();
+    }
+    String timestamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
+    String baseName = CorefProperties.conllOutputPath(props) + timestamp;
     String goldOutput = baseName + ".gold.txt";
     String beforeCorefOutput = baseName + ".predicted.txt";
     String afterCorefOutput = baseName + ".coref.predicted.txt";
     PrintWriter writerGold = new PrintWriter(new FileOutputStream(goldOutput));
     PrintWriter writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
     PrintWriter writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
 
+    Logger logger = Logger.getLogger(CorefSystem.class.getName());
+    initLogger(logger,baseName + ".log");
+    logger.info(timestamp);
+    logger.info(props.toString());
+
     (new CorefDocumentProcessor() {
       @Override
       public void process(int id, Document document) {
@@ -94,7 +120,14 @@ public void process(int id, Document document) {
         if (verbose) {
           CorefUtils.printHumanReadableCoref(document);
         }
-        writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true));
+        if (document.filterMentionSet != null) {
+          Map<Integer,CorefCluster> filteredClusters = document.corefClusters
+              .values().stream().filter(x -> CorefUtils.filterClustersWithMentionSpans(x, document.filterMentionSet) )
+              .collect(Collectors.toMap(x -> x.clusterID, x -> x));
+          writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true, filteredClusters));
+        } else {
+          writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true));
+        }
       }
 
       @Override
@@ -106,14 +139,20 @@ public String getName() {
       }
     }).run(docMaker);
 
-    Logger logger = Logger.getLogger(CorefSystem.class.getName());
     String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props),
        goldOutput, beforeCorefOutput);
+
+    logger.info("Before Coref");
    CorefScorer.printScoreSummary(summary, logger, false);
+    CorefScorer.printScoreSummary(summary, logger, true);
+    CorefScorer.printFinalConllScore(summary, logger);
+
    summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput,
        afterCorefOutput);
+    logger.info("After Coref");
+    CorefScorer.printScoreSummary(summary, logger, false);
    CorefScorer.printScoreSummary(summary, logger, true);
-    CorefScorer.printFinalConllScore(summary);
+    CorefScorer.printFinalConllScore(summary, logger);
 
    writerGold.close();
    writerBeforeCoref.close();

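For context, a sketch of how the reworked runOnConll might be driven. The CorefSystem(Properties) constructor is not shown in this diff, and the property keys below are guesses from the method names (conllOutputPath, getScorerPath); check CorefProperties for the exact keys:

import java.util.Properties;

import edu.stanford.nlp.coref.CorefSystem;

public class RunConllEval {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("coref.conllOutputPath", "/tmp/coref-out/");  // assumed key; output dir is now created if missing
    props.setProperty("coref.scorer", "/path/to/scorer.pl");        // assumed key for the CoNLL scorer script
    CorefSystem coref = new CorefSystem(props);  // assumes a Properties constructor
    // Writes gold/predicted CoNLL files under a timestamped base name and logs
    // the before/after score summaries to <baseName>.log as well as to Redwood.
    coref.runOnConll(props);
  }
}
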