Skip to content

Commit 8adcbfe

Browse files
committed
A few lemmatizer updates: use "enroll" and "appall" as lemmas instead of "enrol" and "appal"; add "de-" as a verb prefix (presumably this doesn't break any exceptions); and add "blog" and "xfer" to the list of consonant-doubling exceptions.
1 parent 2dd08da commit 8adcbfe

File tree

2 files changed

+75060
-74803
lines changed

2 files changed

+75060
-74803
lines changed

src/edu/stanford/nlp/process/Morpha.flex

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,15 @@ import edu.stanford.nlp.util.logging.Redwood;
4949
/** A list of verbs that have doubling of consonants
5050
* this list can be rebuilt with the main method in process.Morphology!
5151
* the verb stem list lives in "/u/nlp/data/morph/verbstem.list"
52+
*<br>
53+
* Removed "appal" and "enrol" so that the American (rather than British) spellings are used as the lemmas for those words
5254
*/
5355
private static final String[] verbStems = { "abat",
5456
"abet", "abhor", "abut", "accur", "acquit",
5557
"adlib", "admit", "aerobat", "aerosol", "agendaset",
56-
"allot", "alot", "anagram", "annul", "appal",
58+
"allot", "alot", "anagram", "annul",
5759
"apparel", "armbar", "aver", "babysit", "airdrop",
58-
"appal", "blackleg", "bobsled", "bur", "chum",
60+
"blackleg", "bobsled", "bur", "chum",
5961
"confab", "counterplot", "curet", "dib", "backdrop",
6062
"backfil", "backflip", "backlog", "backpedal", "backslap",
6163
"backstab", "bag", "balfun", "ballot", "ban",
@@ -67,7 +69,7 @@ import edu.stanford.nlp.util.logging.Redwood;
6769
"bevel", "bewig", "bib", "bid", "billet",
6870
"bin", "bip", "bit", "bitmap", "blab",
6971
"blag", "blam", "blan", "blat", "bles",
70-
"blim", "blip", "blob", "bloodlet", "blot",
72+
"blim", "blip", "blob", "blog", "bloodlet", "blot",
7173
"blub", "blur", "bob", "bodypop", "bog",
7274
"booby-trap", "boobytrap", "booksel", "bootleg", "bop",
7375
"bot", "bowel", "bracket", "brag", "brig",
@@ -102,13 +104,13 @@ import edu.stanford.nlp.util.logging.Redwood;
102104
"disembowel", "dishevel", "disinter", "dispel", "disprefer",
103105
"distil", "dog", "dognap", "don", "doorstep",
104106
"dot", "dowel", "drag", "drat", "driftnet",
105-
"distil", "egotrip", "enrol", "enthral", "extol",
107+
"distil", "egotrip", "enthral", "extol",
106108
"fulfil", "gaffe", "golliwog", "idyl", "inspan",
107109
"drip", "drivel", "drop", "drub", "drug",
108110
"drum", "dub", "duel", "dun", "dybbuk",
109111
"earwig", "eavesdrop", "ecolabel", "eitherspigot", "electroblot",
110112
"embed", "emit", "empanel", "enamel", "endlabel",
111-
"endtrim", "enrol", "enthral", "entrammel", "entrap",
113+
"endtrim", "enthral", "entrammel", "entrap",
112114
"enwrap", "equal", "equip", "estop", "exaggerat",
113115
"excel", "expel", "extol", "fag", "fan",
114116
"farewel", "fat", "featherbed", "feget", "fet",
@@ -277,7 +279,7 @@ import edu.stanford.nlp.util.logging.Redwood;
277279
"wet", "wham", "whet", "whip", "whir",
278280
"whiteskin", "whiz", "whup", "wildcat", "win",
279281
"windmil", "wit", "woodchop", "woodcut", "wor",
280-
"worship", "wrap", "will", "wiretap", "yen",
282+
"worship", "wrap", "will", "wiretap", "xfer", "yen",
281283
"yak", "yap", "yarnspin", "yip", "yodel",
282284
"zag", "zap", "zig", "zig-zag", "zigzag",
283285
"zip", "ztrip" };
@@ -456,7 +458,7 @@ CXY = [bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]
456458
CXY2 = "bb"|"cc"|"dd"|"ff"|"gg"|"hh"|"jj"|"kk"|"ll"|"mm"|"nn"|"pp"|"qq"|"rr"|"ss"|"tt"|"vv"|"ww"|"xx"|"zz"
457459
S2 = "ss"|"zz"
458460
S = [sxzSXZ]|([csCS]"h")
459-
PRE = "be"|"ex"|"in"|"mis"|"pre"|"pro"|"re"
461+
PRE = "be"|"de"|"ex"|"in"|"mis"|"pre"|"pro"|"re"
460462
EDING = "ed"|"ing"
461463
ESEDING = "es"|"ed"|"ing"
462464
G = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_]

0 commit comments

Comments
 (0)