@@ -462,6 +462,7 @@ ESEDING = "es"|"ed"|"ing"
462
462
/* G: any single character that is neither "_" nor one of the excluded blank/line-break
   characters (space, tab, CR, LF, U+2028, U+2029, U+000B, U+000C, U+0085). */
G = [^ \t\r\n \u2028\u2029\u000B\u000C\u0085 _]
463
463
/* GM: any single character that is not "_", not "-", and not one of the excluded
   blank/line-break characters (space, tab, CR, LF, U+2028, U+2029, U+000B, U+000C, U+0085). */
GM = [^ \t\r\n \u2028\u2029\u000B\u000C\u0085 _-]
464
464
/* SKIP: a single blank or line-break character (space, tab, CR, LF, U+2028, U+2029,
   U+000B, U+000C, U+0085). */
SKIP = [ \t\r\n \u2028\u2029\u000B\u000C\u0085 ]
465
+ /* DIGIT: one ASCII decimal digit, 0 through 9. No blanks inside the brackets:
+    whitespace in a character class is taken literally. */
+ DIGIT = [0-9]
465
466
466
467
/* adjectives ending in "e", such as "tame", whose comparative and superlative add only -r / -st (tamer, tamest) */
467
468
E_ADJS = "able"|"absolute"|"abstruse"|"acute"|"ample"|"austere"|"bare"|"base"|"blithe"|"blonde"|"blue"|"brave"|"brittle"|"brusque"|"capable"|"chaste"|"choice"|"close"|"coarse"|"complete"|"concise"|"crude"|"cute"|"demure"|"dense"|"dire"|"divine"|"doggone"|"eerie"|"extreme"|"false"|"feeble"|"fickle"|"fierce"|"fine"|"free"|"game"|"gauche"|"gentle"|"gladsome"|"grave"|"grewsome"|"gruesome"|"hale"|"handsome"|"hoarse"|"huge"|"humane"|"humble"|"idle"|"immense"|"inane"|"insane"|"intense"|"irate"|"kittle"|"lame"|"large"|"late"|"lithe"|"little"|"loose"|"mature"|"mere"|"mickle"|"minute"|"mute"|"naive"|"naïve"|"negative"|"nice"|"nimble"|"noble"|"nude"|"obscene"|"obscure"|"obtuse"|"opaque"|"pale"|"polite"|"positive"|"possible"|"precise"|"private"|"pure"|"purple"|"rare"|"rathe"|"remote"|"resolute"|"rife"|"ripe"|"rude"|"safe"|"sage"|"sane"|"savage"|"scarce"|"secure"|"sensible"|"serene"|"severe"|"simple"|"sincere"|"sore"|"spare"|"sparse"|"spruce"|"square"|"stable"|"stale"|"strange"|"suave"|"sublime"|"subtile"|"subtle"|"supple"|"supreme"|"sure"|"svelte"|"tame"|"tense"|"terse"|"trite"|"true"|"unique"|"unripe"|"unsafe"|"unstable"|"untrue"|"unwise"|"urbane"|"vague"|"vile"|"white"|"wholesome"|"wide"|"winsome"|"wise"|"yare"
@@ -2027,7 +2028,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
2027
2028
<verb,noun,any> {GM} * "-" { // The first word isn't stemmed separately, but the second half can be
2028
2029
String stem = common_noun_stem();
2029
2030
String n = next();
2030
- if (n == null ) {
2031
+ if (n == null || n. startsWith(" _" )) {
2032
+ // TODO: would be nice to use the rest of the lemma rules
2033
+ // when the token just ends with "-"
2031
2034
return stem;
2032
2035
} else {
2033
2036
return stem. concat(n);
@@ -2107,6 +2110,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
2107
2110
/* Demonym rules, active in the scan state. Each matches an S_ENDING_DEMONYMS entry (the second
   one with an extra trailing "s") only when the text that follows matches the tag pattern
   _(NN(P?)(S?)|JJ); the part after "/" is trailing context and is not part of the match.
   The first rule returns capitalise(common_noun_stem()); the second returns capitalise(stem(1, ...)).
   NOTE(review): presumably stem(1, ...) removes the final "s" -- confirm against stem(). */
<scan> {S_ENDING_DEMONYMS} / _( NN( P?)( S?)| JJ) { return (capitalise(common_noun_stem())); }
2108
2111
<scan> {S_ENDING_DEMONYMS} s/ _( NN( P?)( S?)| JJ) { return (capitalise(stem(1 , " " , " s" ))); }
2109
2112
2113
+ /* remove commas from numbers, eg 5,000 -> 5000 */
2114
+ <scan> ({DIGIT}|",")*{DIGIT}({DIGIT}|",")*([.]{DIGIT}+)?/_CD {
+     // Strip grouping commas from a number that is followed by the _CD tag (5,000 -> 5000).
+     // The pattern requires at least one digit, so a token made only of commas is left to
+     // the other rules instead of being reduced to an empty string.
+     // String.replace is used because "," is a literal, not a regular expression.
+     return yytext().replace(",", "");
+ }
2115
+
2110
2116
/* Irregular comparative/superlative forms, active in the scan state. "worse" followed by _JJR
   and "worst" followed by _JJS are passed to stem(5, ...) with the "bad" replacement; "worse"
   followed by _RBR is passed with the "badly" replacement. The tag after "/" is trailing
   context and is not part of the match.
   NOTE(review): presumably stem(5, x, ...) drops the five matched characters and substitutes x --
   confirm against stem(). */
<scan> "worse" / _JJR { return (stem(5 , " bad" , " " )); }
2111
2117
<scan> "worst" / _JJS { return (stem(5 , " bad" , " " )); }
2112
2118
<scan> "worse" / _RBR { return (stem(5 , " badly" , " " )); }
0 commit comments