Skip to content

Commit 40fee82

Browse files
committed
Remove commas from numbers and patch tokens that end with -, although they could still be lemmatized better
1 parent 0fba443 commit 40fee82

File tree

2 files changed

+80406
-77332
lines changed

2 files changed

+80406
-77332
lines changed

src/edu/stanford/nlp/process/Morpha.flex

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,7 @@ ESEDING = "es"|"ed"|"ing"
462462
G = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_]
463463
GM = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_-]
464464
SKIP = [ \t\r\n\u2028\u2029\u000B\u000C\u0085]
465+
DIGIT = [0-9]
465466

466467
/* adjectives such as tame which become tamer, tamest */
467468
E_ADJS = "able"|"absolute"|"abstruse"|"acute"|"ample"|"austere"|"bare"|"base"|"blithe"|"blonde"|"blue"|"brave"|"brittle"|"brusque"|"capable"|"chaste"|"choice"|"close"|"coarse"|"complete"|"concise"|"crude"|"cute"|"demure"|"dense"|"dire"|"divine"|"doggone"|"eerie"|"extreme"|"false"|"feeble"|"fickle"|"fierce"|"fine"|"free"|"game"|"gauche"|"gentle"|"gladsome"|"grave"|"grewsome"|"gruesome"|"hale"|"handsome"|"hoarse"|"huge"|"humane"|"humble"|"idle"|"immense"|"inane"|"insane"|"intense"|"irate"|"kittle"|"lame"|"large"|"late"|"lithe"|"little"|"loose"|"mature"|"mere"|"mickle"|"minute"|"mute"|"naive"|"naïve"|"negative"|"nice"|"nimble"|"noble"|"nude"|"obscene"|"obscure"|"obtuse"|"opaque"|"pale"|"polite"|"positive"|"possible"|"precise"|"private"|"pure"|"purple"|"rare"|"rathe"|"remote"|"resolute"|"rife"|"ripe"|"rude"|"safe"|"sage"|"sane"|"savage"|"scarce"|"secure"|"sensible"|"serene"|"severe"|"simple"|"sincere"|"sore"|"spare"|"sparse"|"spruce"|"square"|"stable"|"stale"|"strange"|"suave"|"sublime"|"subtile"|"subtle"|"supple"|"supreme"|"sure"|"svelte"|"tame"|"tense"|"terse"|"trite"|"true"|"unique"|"unripe"|"unsafe"|"unstable"|"untrue"|"unwise"|"urbane"|"vague"|"vile"|"white"|"wholesome"|"wide"|"winsome"|"wise"|"yare"
@@ -2027,7 +2028,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
20272028
<verb,noun,any>{GM}*"-" { // The first word isn't stemmed separately, but the second half can be
20282029
String stem = common_noun_stem();
20292030
String n = next();
2030-
if (n == null) {
2031+
if (n == null || n.startsWith("_")) {
2032+
// TODO: would be nice to use the rest of the lemma rules
2033+
// when the token just ends with "-"
20312034
return stem;
20322035
} else {
20332036
return stem.concat(n);
@@ -2107,6 +2110,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
21072110
<scan>{S_ENDING_DEMONYMS}/_(NN(P?)(S?)|JJ) { return(capitalise(common_noun_stem())); }
21082111
<scan>{S_ENDING_DEMONYMS}s/_(NN(P?)(S?)|JJ) { return(capitalise(stem(1, "", "s"))); }
21092112

2113+
/* Remove commas from numbers tagged CD, e.g. 5,000 -> 5000 */
2114+
/* Strip thousands separators from a CD-tagged number (the part before the _CD lookahead).
 * The pattern requires at least one digit, so a token made up only of commas cannot match
 * and be reduced to an empty lemma; such tokens fall through to the later rules instead.
 * String.replace is a literal replacement, so no regex is compiled per token. */
<scan>","*{DIGIT}({DIGIT}|",")*([.]{DIGIT}+)?/_CD { return(yytext().replace(",", "")); }
2115+
21102116
<scan>"worse"/_JJR { return(stem(5, "bad", "")); }
21112117
<scan>"worst"/_JJS { return(stem(5, "bad", "")); }
21122118
<scan>"worse"/_RBR { return(stem(5, "badly", "")); }

0 commit comments

Comments
 (0)