Skip to content

Commit 9d9b1ae

Browse files
committed
A few updates to Morphology to better match UD standards
1 parent 6f520d4 commit 9d9b1ae

File tree

3 files changed

+72175
-70630
lines changed

3 files changed

+72175
-70630
lines changed

src/edu/stanford/nlp/process/Morpha.flex

Lines changed: 31 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,7 @@ import edu.stanford.nlp.util.logging.Redwood;
448448

449449

450450

451-
A = ['+a-zA-Z0-9]
451+
A = ['+a-zA-Z0-9]
452452
V = [aeiouAEIOU]
453453
VY = [aeiouyAEIOUY]
454454
C = [bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ]
@@ -501,12 +501,12 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
501501
<verb,any>"did" { return(stem(3,"do","ed")); }
502502
<verb,any>"done" { return(stem(4,"do","en")); }
503503
<verb,any>"didst" { return(stem(5,"do","ed")); } /* disprefer */
504-
<verb,any>"'ll" { return(stem(3,"will","")); }
505-
<verb,any>"'m" { return(stem(2,"be","")); } /* disprefer */
504+
<verb,any>['’]"ll" { return(stem(3,"will","")); }
505+
<verb,any>['’]"m" { return(stem(2,"be","")); } /* disprefer */
506506
<verb,any>"m" { return(stem(1,"be","")); } /* disprefer */
507-
<verb,any>"'re" { return(stem(3,"be","")); } /* disprefer */
507+
<verb,any>['’]"re" { return(stem(3,"be","")); } /* disprefer */
508508
<verb,any>"r" { return(stem(1,"be","")); } /* disprefer */
509-
<verb,any>"'ve" { return(stem(3,"have","")); }
509+
<verb,any>['’]"ve" { return(stem(3,"have","")); }
510510
<verb,any>"ve" { return(stem(2,"have","")); }
511511
<verb,any>"v" { return(stem(1,"have","")); }
512512
<verb,any>"no" { return(stem(2,"know","")); }
@@ -971,7 +971,7 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
971971
<verb,any>"shrunken" { return(stem(5,"ink","en")); } /* disprefer */
972972
<verb,any>"sightsaw" { return(stem(3,"see","ed")); }
973973
<verb,any>"sightseen" { return(stem(3,"ee","en")); }
974-
<verb,any>"ski'd" { return(stem(3,"i","ed")); } /* en */
974+
<verb,any>"ski"['’]"d" { return(stem(3,"i","ed")); } /* en */
975975
<verb,any>"skydove" { return(stem(3,"ive","ed")); } /* en */
976976
<verb,any>"slain" { return(stem(3,"ay","en")); }
977977
<verb,any>"slept" { return(stem(3,"eep","ed")); } /* en */
@@ -1160,7 +1160,7 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
11601160
<noun,any>("Brahman"|"German"|"dragoman"|"ottoman"|"shaman"|"talisman"|"Norman"|"Pullman"|"Roman")"s" { return(stem(1,"","s")); }
11611161
<noun,any>("Czech"|"diptych"|"Sassenach"|"abdomen"|"alibi"|"aria"|"bandit"|"begonia"|"bikini"|"caryatid"|"colon"|"cornucopia"|"cromlech"|"cupola"|"dryad"|"eisteddfod"|"encyclopaedia"|"epoch"|"eunuch"|"flotilla"|"gardenia"|"gestalt"|"gondola"|"hierarch"|"hose"|"impediment"|"koala"|"loch"|"mania"|"manservant"|"martini"|"matriarch"|"monarch"|"oligarch"|"omen"|"parabola"|"pastorale"|"patriarch"|"pea"|"peninsula"|"pfennig"|"phantasmagoria"|"pibroch"|"poly"|"real"|"safari"|"sari"|"specimen"|"standby"|"stomach"|"swami"|"taxi"|"tech"|"toccata"|"triptych"|"villa"|"yogi"|"zloty")"s" { return(stem(1,"","s")); }
11621162
<noun,any>("asyl"|"sanct"|"rect"|"pl"|"pendul"|"mausole"|"hoodl"|"for")"ums" { return(stem(1,"","s")); }
1163-
<noun,any>("Bantu"|"Bengalese"|"Beninese"|"Boche"|"Burmese"|"Chinese"|"Congolese"|"Gabonese"|"Guyanese"|"Japanese"|"Javanese"|"Lebanese"|"Maltese"|"Olympics"|"Portuguese"|"Senegalese"|"Siamese"|"Singhalese"|"Sinhalese"|"Sioux"|"Sudanese"|"Swiss"|"Taiwanese"|"Togolese"|"Vietnamese"|"aircraft"|"anopheles"|"apparatus"|"asparagus"|"barracks"|"bellows"|"bison"|"bluefish"|"bob"|"bourgeois"|"bream"|"brill"|"butterfingers"|"carp"|"catfish"|"chassis"|"chub"|"cod"|"codfish"|"coley"|"contretemps"|"corps"|"crawfish"|"crayfish"|"crossroads"|"cuttlefish"|"dace"|"dice"|"dogfish"|"doings"|"dory"|"downstairs"|"eldest"|"finnan"|"firstborn"|"fish"|"flatfish"|"flounder"|"fowl"|"fry"|"fries"|{A}+"-works"|"gasworks"|"glassworks"|"globefish"|"goldfish"|"grand"|"gudgeon"|"gulden"|"haddock"|"hake"|"halibut"|"headquarters"|"herring"|"hertz"|"horsepower"|"hovercraft"|"hundredweight"|"ironworks"|"jackanapes"|"kilohertz"|"kurus"|"kwacha"|"ling"|"lungfish"|"mackerel"|"means"|"megahertz"|"moorfowl"|"moorgame"|"mullet"|"offspring"|"pampas"|"parr"|"patois"|"pekinese"|"penn'orth"|"perch"|"pickerel"|"pike"|"pince-nez"|"plaice"|"precis"|"quid"|"rand"|"rendezvous"|"revers"|"roach"|"roux"|"salmon"|"samurai"|"series"|"shad"|"sheep"|"shellfish"|"smelt"|"spacecraft"|"species"|"starfish"|"stockfish"|"sunfish"|"superficies"|"sweepstakes"|"swordfish"|"tench"|"tope"|"triceps"|"trout"|"tuna"|"tunafish"|"tunny"|"turbot"|"undersigned"|"veg"|"waterfowl"|"waterworks"|"waxworks"|"whiting"|"wildfowl"|"woodworm"|"yen") { return(xnull_stem()); }
1163+
<noun,any>("Bantu"|"Bengalese"|"Beninese"|"Boche"|"Burmese"|"Chinese"|"Congolese"|"Gabonese"|"Guyanese"|"Japanese"|"Javanese"|"Lebanese"|"Maltese"|"Olympics"|"Portuguese"|"Senegalese"|"Siamese"|"Singhalese"|"Sinhalese"|"Sioux"|"Sudanese"|"Swiss"|"Taiwanese"|"Togolese"|"Vietnamese"|"aircraft"|"anopheles"|"apparatus"|"asparagus"|"barracks"|"bellows"|"bison"|"bluefish"|"bob"|"bourgeois"|"bream"|"brill"|"butterfingers"|"carp"|"catfish"|"chassis"|"chub"|"cod"|"codfish"|"coley"|"contretemps"|"corps"|"crawfish"|"crayfish"|"crossroads"|"cuttlefish"|"dace"|"dice"|"dogfish"|"doings"|"dory"|"downstairs"|"eldest"|"feces"|"finnan"|"firstborn"|"fish"|"flatfish"|"flounder"|"fowl"|"fry"|"fries"|{A}+"-works"|"gasworks"|"glassworks"|"globefish"|"goldfish"|"grand"|"gudgeon"|"gulden"|"haddock"|"hake"|"halibut"|"headquarters"|"herring"|"hertz"|"horsepower"|"hovercraft"|"hundredweight"|"ironworks"|"jackanapes"|"kilohertz"|"kurus"|"kwacha"|"ling"|"lungfish"|"mackerel"|"means"|"megahertz"|"moorfowl"|"moorgame"|"mullet"|"offspring"|"pampas"|"parr"|"patois"|"pekinese"|"penn'orth"|"perch"|"pickerel"|"pike"|"pince-nez"|"plaice"|"precis"|"quid"|"rand"|"rendezvous"|"revers"|"roach"|"roux"|"salmon"|"samurai"|"series"|"shad"|"sheep"|"shellfish"|"smelt"|"spacecraft"|"species"|"starfish"|"stockfish"|"sunfish"|"superficies"|"sweepstakes"|"swordfish"|"tench"|"tope"|"triceps"|"trout"|"tuna"|"tunafish"|"tunny"|"turbot"|"undersigned"|"veg"|"waterfowl"|"waterworks"|"waxworks"|"whiting"|"wildfowl"|"woodworm"|"yen") { return(xnull_stem()); }
11641164
<noun,any>"Aries" { return(stem(1,"s","s")); }
11651165
<noun,any>"Pisces" { return(stem(1,"s","s")); }
11661166
<noun,any>"Bengali" { return(stem(1,"i","s")); }
@@ -1250,7 +1250,7 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
12501250
<noun,any>"clutches" { return(stem(2,"","s")); }
12511251
<noun,any>"continua" { return(stem(1,"um","s")); }
12521252
<noun,any>"diggings" { return(stem(1,"","s")); }
1253-
<noun,any>"K's" { return(stem(2,"","s")); }
1253+
<noun,any>"K"['’]"s" { return(stem(2,"","s")); }
12541254
<noun,any>"seychellois" { return(stem(1,"s","s")); }
12551255
<noun,any>"afterlives" { return(stem(3,"fe","s")); }
12561256
<noun,any>"avens" { return(stem(1,"s","s")); }
@@ -1469,7 +1469,7 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
14691469
<noun,any>"maxima" { return(stem(2,"mum","s")); }
14701470
<noun,any>"memoranda" { return(stem(2,"dum","s")); }
14711471
<noun,any>"men-at-arms" { return(stem(10,"an-at-arms","s")); }
1472-
<noun,any>"men-o'-war" { return(stem(9,"an-of-war","s")); } /* disprefer */
1472+
<noun,any>"men-o"['’]"-war" { return(stem(9,"an-of-war","s")); } /* disprefer */
14731473
<noun,any>"men-of-war" { return(stem(9,"an-of-war","s")); }
14741474
<noun,any>"menservants" { return(stem(10,"anservant","s")); } /* disprefer */
14751475
<noun,any>"mesdemoiselles" { return(stem(13,"ademoiselle","s")); }
@@ -1874,11 +1874,11 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
18741874
/* -o / -oe */
18751875

18761876
<verb,any>("bastinado"|"bunco"|"bunko"|"carbonado"|"contango"|"crescendo"|"ditto"|"echo"|"embargo"|"fresco"|"hallo"|"halo"|"lasso"|"niello"|"radio"|"solo"|"stiletto"|"stucco"|"tally-ho"|"tango"|"torpedo"|"veto"|"zero")"ed" { return(stem(2,"","ed")); } /* en */
1877-
<verb,any>"ko'd" { return(stem(3,"o","ed")); } /* en */
1878-
<verb,any>"ko'ing" { return(stem(4,"","ing")); }
1879-
<verb,any>"ko's" { return(stem(2,"","s")); }
1880-
<verb,any>"tally-ho'd" { return(stem(3,"","ed")); } /* en */ /* disprefer */
1881-
<noun,any>("co"|"do"|"ko"|"no")"'s" { return(stem(2,"","s")); }
1877+
<verb,any>"ko"['’]"d" { return(stem(3,"o","ed")); } /* en */
1878+
<verb,any>"ko"['’]"ing" { return(stem(4,"","ing")); }
1879+
<verb,any>"ko"['’]"s" { return(stem(2,"","s")); }
1880+
<verb,any>"tally-ho"['’]"d" { return(stem(3,"o","ed")); } /* en */ /* disprefer */ /* deletes "o'd" and restores "o", same as the ko'd rule, so the lemma is "tally-ho" rather than "tally-h" */
1881+
<noun,any>("co"|"do"|"ko"|"no")['’]"s" { return(stem(2,"","s")); }
18821882

18831883
<noun,any>("aloe"|"archfoe"|"canoe"|"doe"|"felloe"|"floe"|"foe"|"hammertoe"|"hoe"|"icefloe"|"mistletoe"|"oboe"|"roe"|({A}*"shoe")|"sloe"|"throe"|"tiptoe"|"toe"|"voe"|"woe")"s" { return(stem(1,"","s")); }
18841884
<verb,any>("canoe"|"hoe"|"outwoe"|"rehoe"|({A}*"shoe")|"tiptoe"|"toe")"s" { return(stem(1,"","s")); }
@@ -1919,7 +1919,7 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
19191919
<noun,any>"m.p.s." { return(stem(6,"m.p.","s")); }
19201920
<noun,any>("cons"|"miss"|"mrs"|"ms"|"n-s"|"pres"|"ss")"." { return(cnull_stem()); }
19211921
<noun,any>({A}|".")+".s." { return(cnull_stem()); }
1922-
<noun,any>({A}|".")+".'s." { return(stem(4,".","s")); } /* disprefer */
1922+
<noun,any>({A}|".")+"."['’]"s." { return(stem(4,".","s")); } /* disprefer */
19231923
<noun,any>({A}|".")+"s." { return(stem(2,".","s")); }
19241924

19251925
<noun,any>{A}*"men" { return(stem(2,"an","s")); }
@@ -2042,19 +2042,19 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
20422042
<scan>"is"/_VBZ { return(stem(2,"be","s")); }
20432043
<scan>"du"/_VBP { return(stem(2,"do","")); } /* In dunno */
20442044
<scan>"no"/_VB { return(stem(2,"know","")); } /* In dunno */
2045-
<scan>"'d"/_VH { return(stem(2,"have","ed")); } /* disprefer */
2046-
<scan>"'d"/_VBD { return(stem(2,"have","ed")); } /* disprefer */
2047-
<scan>"'d"/_VM { return(stem(2,"would","")); }
2048-
<scan>"'d"/_MD { return(stem(2,"would","")); }
2045+
<scan>['’]"d"/_VH { return(stem(2,"have","ed")); } /* disprefer */
2046+
<scan>['’]"d"/_VBD { return(stem(2,"have","ed")); } /* disprefer */
2047+
<scan>['’]"d"/_VM { return(stem(2,"would","")); }
2048+
<scan>['’]"d"/_MD { return(stem(2,"would","")); }
20492049
<scan>"d"/_MD { return(stem(1,"would","")); }
2050-
<scan>"'s"/_VBZ { return(stem(2,"be","s")); } /* disprefer */ /* could really be have */
2050+
<scan>['’]"s"/_VBZ { return(stem(2,"be","s")); } /* disprefer */ /* could really be have */
20512051
<scan>"s"/_VBZ { return(stem(1,"be","s")); } /* disprefer */ /* could really be have */
2052-
<scan>"'s"/_VDZ { return(stem(2,"do","s")); } /* disprefer */
2053-
<scan>"'s"/_VHZ { return(stem(2,"have","s")); } /* disprefer */
2054-
<scan>"'s"/_"$" { return(stem(2,"'s","")); }
2055-
<scan>"'s"/_POS { return(stem(2,"'s","")); }
2056-
<scan>"'s"/_CSA { return(stem(2,"as","")); }
2057-
<scan>"'s"/_CJS { return(stem(2,"as","")); }
2052+
<scan>['’]"s"/_VDZ { return(stem(2,"do","s")); } /* disprefer */
2053+
<scan>['’]"s"/_VHZ { return(stem(2,"have","s")); } /* disprefer */
2054+
<scan>['’]"s"/_"$" { return(stem(2,"'s","")); }
2055+
<scan>['’]"s"/_POS { return(stem(2,"'s","")); }
2056+
<scan>['’]"s"/_CSA { return(stem(2,"as","")); }
2057+
<scan>['’]"s"/_CJS { return(stem(2,"as","")); }
20582058
<scan>"not"/_XX { return(stem(3,"not","")); }
20592059
<scan>"ai"/_VB { return(stem(2,"be","")); } /* disprefer */
20602060
<scan>"ai"/_VH { return(stem(2,"have","")); } /* disprefer */
@@ -2064,12 +2064,13 @@ COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
20642064
<scan>"sha"/_MD { return(stem(3,"shall","")); }
20652065
<scan>"wo"/_VM { return(stem(2,"will","")); } /* disprefer */
20662066
<scan>"wo"/_MD { return(stem(2,"will","")); } /* disprefer */
2067-
<scan>"'ll"/_MD { return(stem(3,"will","")); }
2067+
<scan>['’]"ll"/_MD { return(stem(3,"will","")); }
20682068
<scan>"ll"/_MD { return(stem(2,"will","")); }
20692069
<scan>"wilt"/_MD { return(stem(4,"will","")); }
2070-
<scan>"n't"/_XX { return(stem(3,"not","")); } /* disprefer */
2071-
<scan>"n't"/_RB { return(stem(3,"not","")); } /* cdm add; disprefer */
2072-
<scan>"n"/_RB { return(stem(1,"not","")); } /* cdm add; disprefer */
2070+
<scan>"n"['’]"t"/_XX { return(stem(3,"not","")); } /* disprefer */
2071+
<scan>"n"['’]"t"/_RB { return(stem(3,"not","")); } /* cdm add; disprefer */
2072+
<scan>"nt"/_RB { return(stem(2,"not","")); } /* luffa add; disprefer? */
2073+
<scan>"n"/_RB { return(stem(1,"not","")); } /* cdm add; disprefer */
20732074
<scan>"him"/_P { return(stem(3,"he","")); }
20742075
<scan>"her"/_P { return(stem(3,"she","")); }
20752076
<scan>"them"/_P { return(stem(1,"y","")); }

0 commit comments

Comments
 (0)