Skip to content

Commit 8a996d6

Browse files
authored
hc liga stemming (#1)
* fix "lize" (locative of "liga") stemming
1 parent 1f432f8 commit 8a996d6

File tree

5 files changed

+232
-1
lines changed

5 files changed

+232
-1
lines changed

src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemFilter.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
import java.io.IOException;
2020

21-
import org.apache.lucene.analysis.cz.CzechStemmer;
2221
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
2322
import org.apache.lucene.analysis.TokenFilter;
2423
import org.apache.lucene.analysis.TokenStream;
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package cz.monitora.elasticsearch.analyzer.czech;
18+
19+
import static org.apache.lucene.analysis.util.StemmerUtil.*;
20+
21+
/**
22+
* Light Stemmer for Czech.
23+
*
24+
* <p>Implements the algorithm described in: <i> Indexing and stemming approaches for the Czech
25+
* language </i> http://portal.acm.org/citation.cfm?id=1598600
26+
*/
27+
public class CzechStemmer {
28+
29+
/**
30+
* Stem an input buffer of Czech text.
31+
*
32+
* @param s input buffer
33+
* @param len length of input buffer
34+
* @return length of input buffer after normalization
35+
* <p><b>NOTE</b>: Input is expected to be in lowercase, but with diacritical marks
36+
*/
37+
public int stem(char[] s, int len) {
38+
len = removeCase(s, len);
39+
len = removePossessives(s, len);
40+
if (len > 0) {
41+
len = normalize(s, len);
42+
}
43+
return len;
44+
45+
}
46+
47+
private int removeCase(char[] s, int len) {
48+
if (len > 7 && endsWith(s, len, "atech")) return len - 5;
49+
50+
if (len > 6
51+
&& (endsWith(s, len, "ětem") || endsWith(s, len, "etem") || endsWith(s, len, "atům")))
52+
return len - 4;
53+
54+
if (len > 5
55+
&& (endsWith(s, len, "ech")
56+
|| endsWith(s, len, "ich")
57+
|| endsWith(s, len, "ích")
58+
|| endsWith(s, len, "ého")
59+
|| endsWith(s, len, "ěmi")
60+
|| endsWith(s, len, "emi")
61+
|| endsWith(s, len, "ému")
62+
|| endsWith(s, len, "ěte")
63+
|| endsWith(s, len, "ete")
64+
|| endsWith(s, len, "ěti")
65+
|| endsWith(s, len, "eti")
66+
|| endsWith(s, len, "ího")
67+
|| endsWith(s, len, "iho")
68+
|| endsWith(s, len, "ími")
69+
|| endsWith(s, len, "ímu")
70+
|| endsWith(s, len, "imu")
71+
|| endsWith(s, len, "ách")
72+
|| endsWith(s, len, "ata")
73+
|| endsWith(s, len, "aty")
74+
|| endsWith(s, len, "ých")
75+
|| endsWith(s, len, "ama")
76+
|| endsWith(s, len, "ami")
77+
|| endsWith(s, len, "ové")
78+
|| endsWith(s, len, "ovi")
79+
|| endsWith(s, len, "ými"))) return len - 3;
80+
81+
if (len > 4
82+
&& (endsWith(s, len, "em")
83+
|| endsWith(s, len, "es")
84+
|| endsWith(s, len, "ém")
85+
|| endsWith(s, len, "ím")
86+
|| endsWith(s, len, "ům")
87+
|| endsWith(s, len, "at")
88+
|| endsWith(s, len, "ám")
89+
|| endsWith(s, len, "os")
90+
|| endsWith(s, len, "us")
91+
|| endsWith(s, len, "ým")
92+
|| endsWith(s, len, "mi")
93+
|| endsWith(s, len, "ou"))) return len - 2;
94+
95+
/*
96+
special case for the locative (6th case) of "liga" - "lize" -
97+
which we want to be stemmed as "lig", like the rest of the cases;
98+
if we moved this check any further down, we might damage more words,
99+
so we just do it here
100+
*/
101+
if (len == 4
102+
&& endsWith(s, len, "lize")) {
103+
s[len - 2] = 'g';
104+
return len - 1;
105+
}
106+
107+
if (len > 3) {
108+
switch (s[len - 1]) {
109+
case 'a':
110+
case 'e':
111+
case 'i':
112+
case 'o':
113+
case 'u':
114+
case 'ů':
115+
case 'y':
116+
case 'á':
117+
case 'é':
118+
case 'í':
119+
case 'ý':
120+
case 'ě':
121+
return len - 1;
122+
}
123+
}
124+
125+
return len;
126+
}
127+
128+
private int removePossessives(char[] s, int len) {
129+
if (len > 5 && (endsWith(s, len, "ov") || endsWith(s, len, "in") || endsWith(s, len, "ův")))
130+
return len - 2;
131+
132+
return len;
133+
}
134+
135+
private int normalize(char[] s, int len) {
136+
if (endsWith(s, len, "čt")) { // čt -> ck
137+
s[len - 2] = 'c';
138+
s[len - 1] = 'k';
139+
return len;
140+
}
141+
142+
if (endsWith(s, len, "št")) { // št -> sk
143+
s[len - 2] = 's';
144+
s[len - 1] = 'k';
145+
return len;
146+
}
147+
148+
switch (s[len - 1]) {
149+
case 'c': // [cč] -> k
150+
case 'č':
151+
s[len - 1] = 'k';
152+
return len;
153+
case 'z': // [zž] -> h
154+
case 'ž':
155+
s[len - 1] = 'h';
156+
return len;
157+
}
158+
159+
if (len > 1 && s[len - 2] == 'e') {
160+
s[len - 2] = s[len - 1]; // e* > *
161+
return len - 1;
162+
}
163+
164+
if (len > 2 && s[len - 2] == 'ů') {
165+
s[len - 2] = 'o'; // *ů* -> *o*
166+
return len;
167+
}
168+
169+
return len;
170+
}
171+
}

src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemmerASCIIFold.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,17 @@ private int removeCase(char s[], int len) {
103103
endsWith(s, len, "ou")))
104104
return len - 2;
105105

106+
/*
107+
special case for the locative (6th case) of "liga" - "lize" -
108+
which we want to be stemmed as "lig", like the rest of the cases;
109+
if we moved this check any further down, we might damage more words,
110+
so we just do it here
111+
*/
112+
if (len == 4 && endsWith(s, len, "lize")) {
113+
s[len - 2] = 'g';
114+
return len - 1;
115+
}
116+
106117
if (len > 3) {
107118
switch (s[len - 1]) {
108119
case 'a':
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package cz.monitora.elasticsearch.analyzer.czech;
2+
3+
import java.util.Arrays;
4+
import org.junit.jupiter.params.ParameterizedTest;
5+
import org.junit.jupiter.params.provider.CsvSource;
6+
import static org.junit.jupiter.api.Assertions.*;
7+
8+
public class CzechStemmerASCIIFoldTest {
9+
10+
// grep 'ze$' /usr/share/hunspell/cs_CZ.dic | head
11+
@ParameterizedTest
12+
@CsvSource({
13+
"starenka, starenk",
14+
"ruzove, ruh",
15+
16+
// from our synonyms
17+
"liga, lig",
18+
"lize, lig",
19+
})
20+
public void test_stem(String val, String exp) {
21+
final CzechStemmerASCIIFold stemmer = new CzechStemmerASCIIFold();
22+
char[] ch = val.toCharArray();
23+
assertEquals(exp, new String(Arrays.copyOfRange(ch, 0, stemmer.stem(ch, ch.length))));
24+
}
25+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package cz.monitora.elasticsearch.analyzer.czech;
2+
3+
import java.util.Arrays;
4+
import org.junit.jupiter.params.ParameterizedTest;
5+
import org.junit.jupiter.params.provider.CsvSource;
6+
import static org.junit.jupiter.api.Assertions.*;
7+
8+
public class CzechStemmerTest {
9+
10+
// grep 'ze$' /usr/share/hunspell/cs_CZ.dic | head
11+
@ParameterizedTest
12+
@CsvSource({
13+
"stařenk, stařenk",
14+
"růžové, růh",
15+
16+
// from our synonyms
17+
"liga, lig",
18+
"lize, lig",
19+
})
20+
public void test_stem(String val, String exp) {
21+
final CzechStemmer stemmer = new CzechStemmer();
22+
char[] ch = val.toCharArray();
23+
assertEquals(exp, new String(Arrays.copyOfRange(ch, 0, stemmer.stem(ch, ch.length))));
24+
}
25+
}

0 commit comments

Comments
 (0)