Skip to content

Commit f362675

Browse files
committed
Make loading the Wikidict (entitylink) annotator slightly faster
1 parent d5b9106 commit f362675

File tree

1 file changed

+20
-6
lines changed

1 file changed

+20
-6
lines changed

src/edu/stanford/nlp/pipeline/WikidictAnnotator.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,14 @@ public WikidictAnnotator(String name, Properties properties) {
6666
try {
6767
int i = 0;
6868
String[] fields = new String[3];
69-
// Note that using our own Interner means that there will be no
70-
// overlap with other models which store large amounts of words.
71-
// However, this is much faster at loading
72-
Interner<String> interner = new Interner<>();
69+
// Keeping track of the previous link will let us reuse String
70+
// objects, assuming the file is sorted by the second column.
71+
// TODO: we don't actually know where the dictionary creation
72+
// code is. If it gets updated later and the dictionary is
73+
// rebuilt, please remember to change the code to sort it by
74+
// the second column.
75+
String previousLink = "";
76+
int reuse = 0;
7377
for (String line : IOUtils.readLines(wikidictPath, "UTF-8")) {
7478
if (line.charAt(0) == '\t') {
7579
continue;
@@ -89,11 +93,21 @@ public WikidictAnnotator(String name, Properties properties) {
8993
String surfaceForm = fields[0];
9094
if (wikidictCaseless)
9195
surfaceForm = surfaceForm.toLowerCase();
92-
String link = interner.intern(fields[1]); // intern, as most entities have multiple surface forms
96+
// save memory by reusing the string without using an interner
97+
// requires that the dictionary be sorted by link
98+
String link = fields[1];
99+
if (link.equals(previousLink)) {
100+
link = previousLink;
101+
reuse++;
102+
}
93103
// Add the entry
94104
dictionary.put(surfaceForm, link);
105+
previousLink = link;
106+
}
107+
log.info("Done reading Wikidict (" + dictionary.size() + " links read; " + (dictionary.size() - reuse) + " unique entities; " + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed)");
108+
if ((dictionary.size() - reuse) / (float) dictionary.size() > 0.35) {
109+
log.error("We expected a much higher fraction of key reuse in the dictionary. It is possible the dictionary was recreated and then not sorted. Please sort the dictionary by the second column and update the dictionary creation code to sort this way before writing. This will save quite a bit of time loading without sacrificing memory performance.");
95110
}
96-
log.info("Done reading Wikidict (" + dictionary.size() + " links read; " + interner.size() + " unique entities; " + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed)");
97111
} catch (Exception e) {
98112
throw new RuntimeException(e);
99113
}

0 commit comments

Comments
 (0)