Port Czech and Slovak stemmers from Solr

crabhi · crabhi · commit 3bd8fad4a0da · 2023-02-01T21:56:23.000+01:00
diff --git a/build.gradle b/build.gradle
@@ -30,7 +30,7 @@ version = "${property("plugin.version")}-${property("elasticsearch.version")}"
 esplugin {
   name 'monitora_utils'
   description 'Utils for Elasticsearch'
-  classname 'cz.monitora.elasticsearch.LowerCase'
+  // Fully qualified name of the plugin's entry-point class.
+  classname 'cz.monitora.elasticsearch.MonitoraESPlugin'
   licenseFile rootProject.file('LICENSE')
 }
 
diff --git a/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java b/src/main/java/cz/monitora/elasticsearch/MonitoraESPlugin.java
@@ -1,5 +1,8 @@
 package cz.monitora.elasticsearch;
 
+import cz.monitora.elasticsearch.analyzer.czech.CzechStemFilterFactory;
+import cz.monitora.elasticsearch.analyzer.lowercase.LowerCaseTokenFilterFactory;
+import cz.monitora.elasticsearch.analyzer.slovak.SlovakStemFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -8,12 +11,14 @@
 import java.util.HashMap;
 import java.util.Map;
 
-public class LowerCase extends Plugin implements AnalysisPlugin {
+/**
+ * Elasticsearch plugin entry point that contributes this project's token filters.
+ */
+public class MonitoraESPlugin extends Plugin implements AnalysisPlugin {
 
+    /**
+     * Builds the token filter providers contributed by this plugin.
+     *
+     * @return a new mutable map from filter name to provider, containing
+     *         {@code monitora_lowercase}, {@code czech_stem} and {@code slovak_stem}
+     */
     @Override
     public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
         extra.put("monitora_lowercase", AnalysisPlugin.requiresAnalysisSettings(LowerCaseTokenFilterFactory::new));
+        // NOTE(review): "czech_stem" looks like a name that Elasticsearch's bundled
+        // analysis-common module may also register - confirm there is no clash at node startup.
+        extra.put("czech_stem", CzechStemFilterFactory::new);
+        extra.put("slovak_stem", SlovakStemFilterFactory::new);
         return extra;
     }
 }
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.czech;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.cz.CzechStemmer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+/**
+ * {@link TokenFilter} that shortens each Czech token to its stem.
+ * <p>
+ * Depending on the {@code withASCIIFold} constructor flag, stemming is delegated
+ * either to Lucene's {@link CzechStemmer} or to {@link CzechStemmerASCIIFold}.
+ * </p>
+ * <p>
+ * Tokens flagged through {@link KeywordAttribute} are passed on unchanged, so a
+ * {@link SetKeywordMarkerFilter} (or any other {@link TokenFilter} that sets the
+ * attribute) placed earlier in the {@link TokenStream} chain protects terms from
+ * being stemmed.
+ * </p>
+ * <p><b>NOTE</b>: Input is expected to be in lowercase. With {@link CzechStemmer}
+ * it should still carry its diacritical marks; {@link CzechStemmerASCIIFold}
+ * matches unaccented suffixes only, so in that mode the input is presumably
+ * expected to be folded to ASCII beforehand.</p>
+ * @see SetKeywordMarkerFilter
+ */
+public final class CzechStemFilter extends TokenFilter {
+  private final CzechStemmer stemmer = new CzechStemmer();
+  private final CzechStemmerASCIIFold stemmerASCIIFold = new CzechStemmerASCIIFold();
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+  private final boolean withASCIIFold;
+
+  /**
+   * Creates a stemming filter on top of {@code input}.
+   *
+   * @param input the token stream to read tokens from
+   * @param withASCIIFold {@code true} to stem with {@link CzechStemmerASCIIFold},
+   *        {@code false} to stem with {@link CzechStemmer}
+   */
+  public CzechStemFilter(TokenStream input, boolean withASCIIFold) {
+    super(input);
+    this.withASCIIFold = withASCIIFold;
+  }
+
+  /**
+   * Advances the wrapped stream and stems the current term in place.
+   *
+   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+   * @throws IOException if the wrapped stream fails to advance
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      // Keyword tokens are protected from stemming.
+      return true;
+    }
+
+    final int stemmedLength;
+    if (withASCIIFold) {
+      stemmedLength = stemmerASCIIFold.stem(termAttr.buffer(), termAttr.length());
+    } else {
+      stemmedLength = stemmer.stem(termAttr.buffer(), termAttr.length());
+    }
+    // The stemmers edit the term buffer in place and report the new length.
+    termAttr.setLength(stemmedLength);
+    return true;
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemFilterFactory.java
@@ -0,0 +1,25 @@
+package cz.monitora.elasticsearch.analyzer.czech;
+
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Factory that builds {@link CzechStemFilter} instances.
+ * <p>
+ * Reads the boolean {@code with_asciifold} setting (default {@code true}) once at
+ * construction time and passes it to every filter it creates.
+ * </p>
+ */
+public class CzechStemFilterFactory extends AbstractTokenFilterFactory {
+  /** Key of the setting that selects the stemmer variant. */
+  private static final String WITH_ASCIIFOLD_SETTING = "with_asciifold";
+
+  private final boolean withASCIIFold;
+
+  /**
+   * Creates a new CzechStemFilterFactory.
+   *
+   * @param indexSettings index-level settings; not read by this factory
+   * @param env node environment; not read by this factory
+   * @param name name forwarded to the superclass constructor
+   * @param settings filter settings, forwarded to the superclass and consulted
+   *        for {@code with_asciifold}
+   */
+  public CzechStemFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    super(name, settings);
+    this.withASCIIFold = settings.getAsBoolean(WITH_ASCIIFOLD_SETTING, true);
+  }
+
+  /**
+   * Wraps {@code input} in a {@link CzechStemFilter} configured with this factory's flag.
+   *
+   * @param input the token stream to wrap
+   * @return the stemming filter
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new CzechStemFilter(input, withASCIIFold);
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemmerASCIIFold.java b/src/main/java/cz/monitora/elasticsearch/analyzer/czech/CzechStemmerASCIIFold.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cz.monitora.elasticsearch.analyzer.czech;
+
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light stemmer for Czech whose rules are written entirely with unaccented letters.
+ * <p>
+ * Follows the structure of the algorithm described in:
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ * <p><b>NOTE</b>: every suffix and normalization rule below is ASCII-only, so the
+ * input is presumably expected to be lowercase text that has already been folded
+ * to ASCII (diacritics removed).</p>
+ */
+public class CzechStemmerASCIIFold {
+
+  /** Four-letter case endings; stripped from terms longer than six characters. */
+  private static final String[] CASE_SUFFIXES_LEN4 = {"etem", "atum"};
+
+  /** Three-letter case endings; stripped from terms longer than five characters. */
+  private static final String[] CASE_SUFFIXES_LEN3 = {
+    "ech", "ich", "eho", "emi", "emu", "ete", "eti", "iho", "imi", "imu",
+    "ach", "ata", "aty", "ych", "ama", "ami", "ove", "ovi", "ymi"
+  };
+
+  /** Two-letter case endings; stripped from terms longer than four characters. */
+  private static final String[] CASE_SUFFIXES_LEN2 = {
+    "em", "es", "im", "um", "at", "am", "os", "us", "ym", "mi", "ou"
+  };
+
+  /** Two-letter possessive endings; stripped from terms longer than five characters. */
+  private static final String[] POSSESSIVE_SUFFIXES = {"ov", "in", "uv"};
+
+  /**
+   * Stem an input buffer of Czech text in place.
+   * <p>
+   * Case endings are removed first, then possessive endings, and finally the
+   * remaining stem is normalized (skipped when nothing is left).
+   * </p>
+   *
+   * @param s input buffer; may be modified
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int stem(char[] s, int len) {
+    len = removeCase(s, len);
+    len = removePossessives(s, len);
+    if (len > 0) {
+      len = normalize(s, len);
+    }
+    return len;
+  }
+
+  /** Returns true if the first {@code len} chars of {@code s} end with any of {@code suffixes}. */
+  private static boolean endsWithAny(char[] s, int len, String[] suffixes) {
+    for (String suffix : suffixes) {
+      if (endsWith(s, len, suffix)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Strips at most one case ending, trying the longest endings first.
+   * Each ending length has its own minimum term length.
+   */
+  private int removeCase(char[] s, int len) {
+    if (len > 7 && endsWith(s, len, "atech")) {
+      return len - 5;
+    }
+
+    if (len > 6 && endsWithAny(s, len, CASE_SUFFIXES_LEN4)) {
+      return len - 4;
+    }
+
+    if (len > 5 && endsWithAny(s, len, CASE_SUFFIXES_LEN3)) {
+      return len - 3;
+    }
+
+    if (len > 4 && endsWithAny(s, len, CASE_SUFFIXES_LEN2)) {
+      return len - 2;
+    }
+
+    if (len > 3) {
+      switch (s[len - 1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o':
+        case 'u':
+        case 'y':
+          return len - 1;
+        default:
+          break;
+      }
+    }
+
+    return len;
+  }
+
+  /** Strips a trailing possessive ending from terms longer than five characters. */
+  private int removePossessives(char[] s, int len) {
+    if (len > 5 && endsWithAny(s, len, POSSESSIVE_SUFFIXES)) {
+      return len - 2;
+    }
+
+    return len;
+  }
+
+  /**
+   * Applies at most one normalization rule to the end of the stem.
+   * Callers must pass {@code len > 0}.
+   */
+  private int normalize(char[] s, int len) {
+    // ct -> ck, st -> sk: only the final letter actually changes.
+    if (endsWith(s, len, "ct") || endsWith(s, len, "st")) {
+      s[len - 1] = 'k';
+      return len;
+    }
+
+    switch (s[len - 1]) {
+      case 'c': // c -> k
+        s[len - 1] = 'k';
+        return len;
+      case 'z': // z -> h
+        s[len - 1] = 'h';
+        return len;
+      default:
+        break;
+    }
+
+    if (len > 1 && s[len - 2] == 'e') {
+      s[len - 2] = s[len - 1]; // e* > *
+      return len - 1;
+    }
+
+    // NOTE(review): this fires for every penultimate 'u'; it presumably derives
+    // from a rule for an accented letter that folding can no longer tell apart
+    // from plain 'u' - confirm the broader match is intended.
+    if (len > 2 && s[len - 2] == 'u') {
+      s[len - 2] = 'o'; // *u* -> *o*
+      return len;
+    }
+
+    return len;
+  }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseFilter.java
@@ -1,4 +1,4 @@
-package cz.monitora.elasticsearch;
+package cz.monitora.elasticsearch.analyzer.lowercase;
 
 import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.TokenFilter;
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseTokenFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/lowercase/LowerCaseTokenFilterFactory.java
@@ -1,4 +1,4 @@
-package cz.monitora.elasticsearch;
+package cz.monitora.elasticsearch.analyzer.lowercase;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.common.settings.Settings;
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemFilter.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemFilter.java
@@ -0,0 +1,38 @@
+package cz.monitora.elasticsearch.analyzer.slovak;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+
+/**
+ * {@link TokenFilter} that shortens each Slovak token to its stem.
+ * <p>
+ * Depending on the {@code withASCIIFold} constructor flag, stemming is delegated
+ * either to {@link SlovakStemmer} or to {@link SlovakStemmerASCIIFold}. Tokens
+ * flagged through {@link KeywordAttribute} are passed on unchanged.
+ * </p>
+ */
+public final class SlovakStemFilter extends TokenFilter {
+	private final SlovakStemmer stemmer = new SlovakStemmer();
+	private final SlovakStemmerASCIIFold stemmerASCIIFold = new SlovakStemmerASCIIFold();
+	private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+	private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+	private final boolean withASCIIFold;
+
+	/**
+	 * Creates a stemming filter on top of {@code input}.
+	 *
+	 * @param input the token stream to read tokens from
+	 * @param withASCIIFold {@code true} to stem with {@link SlovakStemmerASCIIFold},
+	 *        {@code false} to stem with {@link SlovakStemmer}
+	 */
+	public SlovakStemFilter(TokenStream input, boolean withASCIIFold) {
+		super(input);
+		this.withASCIIFold = withASCIIFold;
+	}
+
+	/**
+	 * Advances the wrapped stream and stems the current term in place.
+	 *
+	 * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+	 * @throws IOException if the wrapped stream fails to advance
+	 */
+	@Override
+	public boolean incrementToken() throws IOException {
+		if (!input.incrementToken()) {
+			return false;
+		}
+		if (keywordAttr.isKeyword()) {
+			// Keyword tokens are protected from stemming.
+			return true;
+		}
+
+		final int stemmedLength;
+		if (withASCIIFold) {
+			stemmedLength = stemmerASCIIFold.stem(termAttr.buffer(), termAttr.length());
+		} else {
+			stemmedLength = stemmer.stem(termAttr.buffer(), termAttr.length());
+		}
+		// The stemmer's return value is used as the new term length.
+		termAttr.setLength(stemmedLength);
+		return true;
+	}
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemFilterFactory.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemFilterFactory.java
@@ -0,0 +1,24 @@
+package cz.monitora.elasticsearch.analyzer.slovak;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+
+/**
+ * Factory that builds {@link SlovakStemFilter} instances.
+ * <p>
+ * Reads the boolean {@code with_asciifold} setting (default {@code true}) once at
+ * construction time and passes it to every filter it creates.
+ * </p>
+ */
+public class SlovakStemFilterFactory extends AbstractTokenFilterFactory {
+    /** Key of the setting that selects the stemmer variant. */
+    private static final String WITH_ASCIIFOLD_SETTING = "with_asciifold";
+
+    private final boolean withASCIIFold;
+
+    /**
+     * Creates a new SlovakStemFilterFactory.
+     *
+     * @param indexSettings index-level settings; not read by this factory
+     * @param env node environment; not read by this factory
+     * @param name name forwarded to the superclass constructor
+     * @param settings filter settings, forwarded to the superclass and consulted
+     *        for {@code with_asciifold}
+     */
+    public SlovakStemFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(name, settings);
+        this.withASCIIFold = settings.getAsBoolean(WITH_ASCIIFOLD_SETTING, true);
+    }
+
+    /**
+     * Wraps {@code input} in a {@link SlovakStemFilter} configured with this factory's flag.
+     *
+     * @param input the token stream to wrap
+     * @return the stemming filter
+     */
+    @Override
+    public TokenStream create(TokenStream input) {
+        return new SlovakStemFilter(input, withASCIIFold);
+    }
+}
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemmer.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemmer.java
diff --git a/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemmerASCIIFold.java b/src/main/java/cz/monitora/elasticsearch/analyzer/slovak/SlovakStemmerASCIIFold.java

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ version = "${property("plugin.version")}-${property("elasticsearch.version")}"`
`30`	`30`	`esplugin {`
`31`	`31`	`name 'monitora_utils'`
`32`	`32`	`description 'Utils for Elasticsearch'`
`33`		`- classname 'cz.monitora.elasticsearch.LowerCase'`
	`33`	`+ classname 'cz.monitora.elasticsearch.MonitoraESPlugin'`
`34`	`34`	`licenseFile rootProject.file('LICENSE')`
`35`	`35`	`}`
`36`	`36`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-package cz.monitora.elasticsearch;`
	`1`	`+package cz.monitora.elasticsearch.analyzer.lowercase;`
`2`	`2`
`3`	`3`	`import org.apache.lucene.analysis.CharacterUtils;`
`4`	`4`	`import org.apache.lucene.analysis.TokenFilter;`