Skip to content

Commit 3bd8fad

Browse files
committed
Port Czech and Slovak stemmers from Solr
1 parent 0e6c3c7 commit 3bd8fad

11 files changed

+776
-4
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ version = "${property("plugin.version")}-${property("elasticsearch.version")}"
3030
esplugin {
3131
name 'monitora_utils'
3232
description 'Utils for Elasticsearch'
33-
classname 'cz.monitora.elasticsearch.LowerCase'
33+
classname 'cz.monitora.elasticsearch.MonitoraESPlugin'
3434
licenseFile rootProject.file('LICENSE')
3535
}
3636

Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
package cz.monitora.elasticsearch;
22

3+
import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory;
4+
import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory;
5+
import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory;
36
import org.elasticsearch.index.analysis.TokenFilterFactory;
47
import org.elasticsearch.indices.analysis.AnalysisModule;
58
import org.elasticsearch.plugins.AnalysisPlugin;
@@ -8,12 +11,14 @@
811
import java.util.HashMap;
912
import java.util.Map;
1013

11-
public class LowerCase extends Plugin implements AnalysisPlugin {
14+
public class MonitoraESPlugin extends Plugin implements AnalysisPlugin {
1215

1316
@Override
1417
public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
1518
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
1619
extra.put("monitora_lowercase", AnalysisPlugin.requiresAnalysisSettings(LowerCaseTokenFilterFactory::new));
20+
extra.put("czech_stem", CzechStemFilterFactory::new);
21+
extra.put("slovak_stem", SlovakStemFilterFactory::new);
1722
return extra;
1823
}
1924
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package cz.monitora.elasticsearch.analyzer.czech;
18+
19+
import java.io.IOException;
20+
21+
import org.apache.lucene.analysis.cz.CzechStemmer;
22+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
23+
import org.apache.lucene.analysis.TokenFilter;
24+
import org.apache.lucene.analysis.TokenStream;
25+
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
26+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27+
28+
29+
/**
30+
* A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
31+
* <p>
32+
* To prevent terms from being stemmed use an instance of
33+
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
34+
* the {@link KeywordAttribute} before this {@link TokenStream}.
35+
* </p>
36+
* <p><b>NOTE</b>: Input is expected to be in lowercase,
37+
* but with diacritical marks</p>
38+
* @see SetKeywordMarkerFilter
39+
*/
40+
public final class CzechStemFilter extends TokenFilter {
41+
private final CzechStemmer stemmer = new CzechStemmer();
42+
private final CzechStemmerASCIIFold stemmerASCIIFold = new CzechStemmerASCIIFold();
43+
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
44+
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
45+
private final boolean withASCIIFold;
46+
47+
public CzechStemFilter(TokenStream input, boolean withASCIIFold) {
48+
super(input);
49+
this.withASCIIFold = withASCIIFold;
50+
}
51+
52+
@Override
53+
public boolean incrementToken() throws IOException {
54+
if (input.incrementToken()) {
55+
if(!keywordAttr.isKeyword()) {
56+
final int newlen = (
57+
withASCIIFold ?
58+
stemmerASCIIFold.stem(termAttr.buffer(), termAttr.length()) :
59+
stemmer.stem(termAttr.buffer(), termAttr.length()));
60+
termAttr.setLength(newlen);
61+
}
62+
return true;
63+
} else {
64+
return false;
65+
}
66+
}
67+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package cz.monitora.elasticsearch.analyzer.czech;
2+
3+
4+
import java.util.Map;
5+
6+
import org.apache.lucene.analysis.TokenStream;
7+
import org.elasticsearch.common.settings.Settings;
8+
import org.elasticsearch.env.Environment;
9+
import org.elasticsearch.index.IndexSettings;
10+
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
11+
12+
public class CzechStemFilterFactory extends AbstractTokenFilterFactory {
13+
private final boolean withASCIIFold;
14+
15+
/** Creates a new CzechStemFilterFactory */
16+
public CzechStemFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
17+
super(name, settings);
18+
withASCIIFold = settings.getAsBoolean("with_asciifold", true);
19+
}
20+
21+
@Override
22+
public TokenStream create(TokenStream input) {
23+
return new CzechStemFilter(input, withASCIIFold);
24+
}
25+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package cz.monitora.elasticsearch.analyzer.czech;
18+
19+
20+
import static org.apache.lucene.analysis.util.StemmerUtil.*;
21+
22+
/**
23+
* Light Stemmer for Czech.
24+
* <p>
25+
* Implements the algorithm described in:
26+
* <i>
27+
* Indexing and stemming approaches for the Czech language
28+
* </i>
29+
* http://portal.acm.org/citation.cfm?id=1598600
30+
* </p>
31+
*/
32+
public class CzechStemmerASCIIFold {
33+
34+
/**
35+
* Stem an input buffer of Czech text.
36+
*
37+
* @param s input buffer
38+
* @param len length of input buffer
39+
* @return length of input buffer after normalization
40+
*
41+
* <p><b>NOTE</b>: Input is expected to be in lowercase,
42+
* but with diacritical marks</p>
43+
*/
44+
public int stem(char s[], int len) {
45+
len = removeCase(s, len);
46+
len = removePossessives(s, len);
47+
if (len > 0) {
48+
len = normalize(s, len);
49+
}
50+
return len;
51+
}
52+
53+
private int removeCase(char s[], int len) {
54+
if (len > 7 && endsWith(s, len, "atech"))
55+
return len - 5;
56+
57+
if (len > 6 &&
58+
(endsWith(s, len,"etem") ||
59+
endsWith(s, len,"etem") ||
60+
endsWith(s, len,"atum")))
61+
return len - 4;
62+
63+
if (len > 5 &&
64+
(endsWith(s, len, "ech") ||
65+
endsWith(s, len, "ich") ||
66+
endsWith(s, len, "ich") ||
67+
endsWith(s, len, "eho") ||
68+
endsWith(s, len, "emi") ||
69+
endsWith(s, len, "emi") ||
70+
endsWith(s, len, "emu") ||
71+
endsWith(s, len, "ete") ||
72+
endsWith(s, len, "ete") ||
73+
endsWith(s, len, "eti") ||
74+
endsWith(s, len, "eti") ||
75+
endsWith(s, len, "iho") ||
76+
endsWith(s, len, "iho") ||
77+
endsWith(s, len, "imi") ||
78+
endsWith(s, len, "imu") ||
79+
endsWith(s, len, "imu") ||
80+
endsWith(s, len, "ach") ||
81+
endsWith(s, len, "ata") ||
82+
endsWith(s, len, "aty") ||
83+
endsWith(s, len, "ych") ||
84+
endsWith(s, len, "ama") ||
85+
endsWith(s, len, "ami") ||
86+
endsWith(s, len, "ove") ||
87+
endsWith(s, len, "ovi") ||
88+
endsWith(s, len, "ymi")))
89+
return len - 3;
90+
91+
if (len > 4 &&
92+
(endsWith(s, len, "em") ||
93+
endsWith(s, len, "es") ||
94+
endsWith(s, len, "em") ||
95+
endsWith(s, len, "im") ||
96+
endsWith(s, len, "um") ||
97+
endsWith(s, len, "at") ||
98+
endsWith(s, len, "am") ||
99+
endsWith(s, len, "os") ||
100+
endsWith(s, len, "us") ||
101+
endsWith(s, len, "ym") ||
102+
endsWith(s, len, "mi") ||
103+
endsWith(s, len, "ou")))
104+
return len - 2;
105+
106+
if (len > 3) {
107+
switch (s[len - 1]) {
108+
case 'a':
109+
case 'e':
110+
case 'i':
111+
case 'o':
112+
case 'u':
113+
//case 'u':
114+
case 'y':
115+
//case 'a':
116+
//case 'e':
117+
//case 'i':
118+
//case 'y':
119+
//case 'e':
120+
return len - 1;
121+
}
122+
}
123+
124+
return len;
125+
}
126+
127+
private int removePossessives(char s[], int len) {
128+
if (len > 5 &&
129+
(endsWith(s, len, "ov") ||
130+
endsWith(s, len, "in") ||
131+
endsWith(s, len, "uv")))
132+
return len - 2;
133+
134+
return len;
135+
}
136+
137+
private int normalize(char s[], int len) {
138+
if (endsWith(s, len, "ct")) { // ct -> ck
139+
s[len - 2] = 'c';
140+
s[len - 1] = 'k';
141+
return len;
142+
}
143+
144+
if (endsWith(s, len, "st")) { // st -> sk
145+
s[len - 2] = 's';
146+
s[len - 1] = 'k';
147+
return len;
148+
}
149+
150+
switch(s[len - 1]) {
151+
case 'c': // [cc] -> k
152+
//case 'c':
153+
s[len - 1] = 'k';
154+
return len;
155+
case 'z': // [zz] -> h
156+
//case 'z':
157+
s[len - 1] = 'h';
158+
return len;
159+
}
160+
161+
if (len > 1 && s[len - 2] == 'e') {
162+
s[len - 2] = s[len - 1]; // e* > *
163+
return len - 1;
164+
}
165+
166+
if (len > 2 && s[len - 2] == 'u') {
167+
s[len - 2] = 'o'; // *u* -> *o*
168+
return len;
169+
}
170+
171+
return len;
172+
}
173+
}

src/main/java/cz/monitora/elasticsearch/LowerCaseFilter.java renamed to src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseFilter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package cz.monitora.elasticsearch;
1+
package cz.monitora.elasticsearch.analyzer.lowercase;
22

33
import org.apache.lucene.analysis.CharacterUtils;
44
import org.apache.lucene.analysis.TokenFilter;

src/main/java/cz/monitora/elasticsearch/LowerCaseTokenFilterFactory.java renamed to src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseTokenFilterFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package cz.monitora.elasticsearch;
1+
package cz.monitora.elasticsearch.analyzer.lowercase;
22

33
import org.apache.lucene.analysis.TokenStream;
44
import org.elasticsearch.common.settings.Settings;
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package cz.monitora.elasticsearch.analyzer.slovak;
2+
3+
import java.io.IOException;
4+
5+
import org.apache.lucene.analysis.TokenFilter;
6+
import org.apache.lucene.analysis.TokenStream;
7+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8+
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
9+
10+
11+
public final class SlovakStemFilter extends TokenFilter {
12+
private final SlovakStemmer stemmer = new SlovakStemmer();
13+
private final SlovakStemmerASCIIFold stemmerASCIIFold = new SlovakStemmerASCIIFold();
14+
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
15+
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
16+
private final boolean withASCIIFold;
17+
18+
public SlovakStemFilter(TokenStream input, boolean withASCIIFold) {
19+
super(input);
20+
this.withASCIIFold = withASCIIFold;
21+
}
22+
23+
@Override
24+
public boolean incrementToken() throws IOException {
25+
if (input.incrementToken()) {
26+
if(!keywordAttr.isKeyword()) {
27+
final int newlen = (
28+
withASCIIFold ?
29+
stemmerASCIIFold.stem(termAttr.buffer(), termAttr.length()) :
30+
stemmer.stem(termAttr.buffer(), termAttr.length()));
31+
termAttr.setLength(newlen);
32+
}
33+
return true;
34+
} else {
35+
return false;
36+
}
37+
}
38+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package cz.monitora.elasticsearch.analyzer.slovak;
2+
3+
import java.util.Map;
4+
5+
import org.apache.lucene.analysis.TokenStream;
6+
import org.elasticsearch.common.settings.Settings;
7+
import org.elasticsearch.env.Environment;
8+
import org.elasticsearch.index.IndexSettings;
9+
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
10+
11+
12+
public class SlovakStemFilterFactory extends AbstractTokenFilterFactory {
13+
private final boolean withASCIIFold;
14+
15+
public SlovakStemFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
16+
super(name, settings);
17+
withASCIIFold = settings.getAsBoolean("with_asciifold", true);
18+
}
19+
20+
@Override
21+
public TokenStream create(TokenStream input) {
22+
return new SlovakStemFilter(input, withASCIIFold);
23+
}
24+
}

0 commit comments

Comments
 (0)