
Commit 088c1a5

Add BERT Tokenizer as OpenSearch built-in analyzer (#3719)
* bert analyzer
* add license header
* add rest test case
* load from zip
* address comments
* retry for init

Signed-off-by: zhichao-aws <zhichaog@amazon.com>
1 parent 37d79e6 commit 088c1a5

19 files changed (+923, -8 lines)

ml-algorithms/src/main/java/org/opensearch/ml/engine/MLEngine.java

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,7 @@ public class MLEngine {

     public static final String REGISTER_MODEL_FOLDER = "register";
     public static final String DEPLOY_MODEL_FOLDER = "deploy";
+    public static final String ANALYSIS_FOLDER = "analysis";
     private final String MODEL_REPO = "https://artifacts.opensearch.org/models/ml-models";

     @Getter
@@ -114,6 +115,10 @@ public Path getModelCacheRootPath() {
         return mlModelsCachePath.resolve("models");
     }

+    public Path getAnalysisRootPath() {
+        return mlModelsCachePath.resolve(ANALYSIS_FOLDER);
+    }
+
     public MLModel train(Input input) {
         validateMLInput(input);
         MLInput mlInput = (MLInput) input;

ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/tokenize/SparseTokenizerModel.java

Lines changed: 2 additions & 8 deletions
@@ -6,11 +6,8 @@
 package org.opensearch.ml.engine.algorithms.tokenize;

 import static org.opensearch.ml.common.CommonValue.ML_MAP_RESPONSE_KEY;
-import static org.opensearch.ml.common.utils.StringUtils.gson;

 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.lang.reflect.Type;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -31,10 +28,9 @@
 import org.opensearch.ml.common.output.model.ModelTensorOutput;
 import org.opensearch.ml.common.output.model.ModelTensors;
 import org.opensearch.ml.engine.algorithms.DLModel;
+import org.opensearch.ml.engine.analysis.DJLUtils;
 import org.opensearch.ml.engine.annotation.Function;

-import com.google.gson.reflect.TypeToken;
-
 import ai.djl.MalformedModelException;
 import ai.djl.huggingface.tokenizers.Encoding;
 import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;
@@ -110,9 +106,7 @@ protected void doLoadModel(
         tokenizer = HuggingFaceTokenizer.builder().optPadding(true).optTokenizerPath(modelPath.resolve("tokenizer.json")).build();
         idf = new HashMap<>();
         if (Files.exists(modelPath.resolve(IDF_FILE_NAME))) {
-            Type mapType = new TypeToken<Map<String, Float>>() {
-            }.getType();
-            idf = gson.fromJson(new InputStreamReader(Files.newInputStream(modelPath.resolve(IDF_FILE_NAME))), mapType);
+            idf = DJLUtils.fetchTokenWeights(modelPath.resolve(IDF_FILE_NAME));
        }
         log.info("sparse tokenize Model {} is successfully deployed", modelId);
     }
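
The refactor above delegates IDF parsing to the new DJLUtils.fetchTokenWeights helper. As a rough illustration of the file format it expects (a flat JSON object of token-to-weight entries parsed with Gson into a Map<String, Float>), here is a minimal, self-contained sketch that mirrors the same parsing pattern; the temp file and sample weights are made up for the example.

import java.io.InputStreamReader;
import java.lang.reflect.Type;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class TokenWeightsSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical token-weights content; real files ship alongside the model artifacts.
        Path file = Files.createTempFile("token-weights", ".json");
        Files.writeString(file, "{\"hello\": 6.9375, \"world\": 5.0}");

        // Same Gson pattern fetchTokenWeights uses internally.
        Type mapType = new TypeToken<Map<String, Float>>() {
        }.getType();
        Map<String, Float> weights = new Gson().fromJson(new InputStreamReader(Files.newInputStream(file)), mapType);
        System.out.println(weights.get("hello")); // 6.9375
    }
}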

ml-algorithms/src/main/java/org/opensearch/ml/engine/analysis/DJLUtils.java

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.ml.engine.analysis;

import static org.opensearch.ml.common.utils.StringUtils.gson;

import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.reflect.Type;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.util.Map;
import java.util.concurrent.Callable;

import org.opensearch.ml.engine.MLEngine;

import com.google.gson.reflect.TypeToken;

import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;
import lombok.Getter;
import lombok.Setter;

/**
 * Utility class for DJL (Deep Java Library) operations related to tokenization and model handling.
 */
public class DJLUtils {
    @Getter
    @Setter
    private static MLEngine mlEngine;

    private static <T> T withDJLContext(Callable<T> action) throws PrivilegedActionException {
        return AccessController.doPrivileged((PrivilegedExceptionAction<T>) () -> {
            ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader();
            try {
                System.setProperty("java.library.path", mlEngine.getMlCachePath().toAbsolutePath().toString());
                System.setProperty("DJL_CACHE_DIR", mlEngine.getMlCachePath().toAbsolutePath().toString());
                Thread.currentThread().setContextClassLoader(ai.djl.Model.class.getClassLoader());

                return action.call();
            } finally {
                Thread.currentThread().setContextClassLoader(contextClassLoader);
            }
        });
    }

    /**
     * Creates a new HuggingFaceTokenizer instance for the given resource path.
     * @param resourcePath The resource path of the tokenizer to create
     * @return A new HuggingFaceTokenizer instance
     * @throws RuntimeException if tokenizer initialization fails
     */
    public static HuggingFaceTokenizer buildHuggingFaceTokenizer(Path resourcePath) {
        try {
            return withDJLContext(() -> { return HuggingFaceTokenizer.newInstance(resourcePath); });
        } catch (PrivilegedActionException e) {
            throw new RuntimeException("Failed to initialize Hugging Face tokenizer. " + e);
        }
    }

    /**
     * Fetches token weights from a specified file.
     * @param resourcePath The path of the token weights file to parse
     * @return A map of token to weight mappings
     * @throws RuntimeException if file fetching or parsing fails
     */
    public static Map<String, Float> fetchTokenWeights(Path resourcePath) {
        try {
            Type mapType = new TypeToken<Map<String, Float>>() {
            }.getType();
            return gson.fromJson(new InputStreamReader(Files.newInputStream(resourcePath)), mapType);
        } catch (IOException e) {
            throw new RuntimeException("Failed to parse token weights file. " + e);
        }
    }
}
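
A minimal sketch of how these helpers might be called, assuming an already-constructed MLEngine instance and a tokenizer.json plus a token-weights file sitting under the analysis root; the file names below are illustrative, not taken from this commit.

import java.nio.file.Path;
import java.util.Map;

import org.opensearch.ml.engine.MLEngine;
import org.opensearch.ml.engine.analysis.DJLUtils;

import ai.djl.huggingface.tokenizers.Encoding;
import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;

public class DJLUtilsSketch {
    public static void demo(MLEngine engine) {
        DJLUtils.setMlEngine(engine);                      // Lombok @Setter on the static field
        Path analysisRoot = engine.getAnalysisRootPath();  // <ml cache>/analysis, added in MLEngine above

        // The tokenizer is built inside the privileged DJL context set up by withDJLContext.
        HuggingFaceTokenizer tokenizer = DJLUtils.buildHuggingFaceTokenizer(analysisRoot.resolve("tokenizer.json"));
        Encoding encoding = tokenizer.encode("hello world");
        System.out.println(encoding.getTokens().length);

        // Token weights are a JSON map of token -> float ("idf.json" is a made-up name).
        Map<String, Float> weights = DJLUtils.fetchTokenWeights(analysisRoot.resolve("idf.json"));
        System.out.println(weights.size());
    }
}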

ml-algorithms/src/main/java/org/opensearch/ml/engine/analysis/HFModelAnalyzer.java

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.ml.engine.analysis;

import java.util.function.Supplier;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Custom Lucene Analyzer that uses the HFModelTokenizer for text analysis.
 * Provides a way to process text using Hugging Face models within OpenSearch.
 */
public class HFModelAnalyzer extends Analyzer {
    Supplier<Tokenizer> tokenizerSupplier;

    public HFModelAnalyzer(Supplier<Tokenizer> tokenizerSupplier) {
        this.tokenizerSupplier = tokenizerSupplier;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer src = tokenizerSupplier.get();
        return new TokenStreamComponents(src, src);
    }
}
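
A minimal usage sketch of the analyzer's supplier-based wiring, not part of the commit: it substitutes a stock Lucene WhitespaceTokenizer for HFModelTokenizer (which appears later in this diff) just to show how createComponents consumes the supplied tokenizer.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.opensearch.ml.engine.analysis.HFModelAnalyzer;

public class HFModelAnalyzerSketch {
    public static void main(String[] args) throws IOException {
        // Any Supplier<Tokenizer> works; the provider below wires in HFModelTokenizerFactory::create instead.
        Analyzer analyzer = new HFModelAnalyzer(WhitespaceTokenizer::new);
        try (TokenStream ts = analyzer.tokenStream("field", "hello world")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // prints "hello", then "world"
            }
            ts.end();
        }
        analyzer.close();
    }
}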

ml-algorithms/src/main/java/org/opensearch/ml/engine/analysis/HFModelAnalyzerProvider.java

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.ml.engine.analysis;

import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider;

/**
 * Provider class for HFModelAnalyzer instances.
 * Handles the creation and configuration of HFModelAnalyzer instances within OpenSearch.
 */
public class HFModelAnalyzerProvider extends AbstractIndexAnalyzerProvider<HFModelAnalyzer> {
    private final HFModelAnalyzer analyzer;

    public HFModelAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        HFModelTokenizerFactory tokenizerFactory = new HFModelTokenizerFactory(indexSettings, environment, name, settings);
        analyzer = new HFModelAnalyzer(tokenizerFactory::create);
    }

    @Override
    public HFModelAnalyzer get() {
        return analyzer;
    }
}
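
For context, a provider with this constructor shape can be exposed through OpenSearch's AnalysisPlugin extension point. The sketch below is hypothetical and not taken from this commit (the commit's actual registration as a built-in/pre-built analyzer happens in files not shown in this excerpt); the analyzer name "hf_model_analyzer" is chosen only for illustration.

import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.opensearch.index.analysis.AnalyzerProvider;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.ml.engine.analysis.HFModelAnalyzerProvider;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        // "hf_model_analyzer" is an illustrative name; HFModelAnalyzerProvider's
        // (IndexSettings, Environment, String, Settings) constructor matches AnalysisProvider#get.
        return Map.of("hf_model_analyzer", HFModelAnalyzerProvider::new);
    }
}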

ml-algorithms/src/main/java/org/opensearch/ml/engine/analysis/HFModelTokenizer.java

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.ml.engine.analysis;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Map;
import java.util.Objects;
import java.util.function.Supplier;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

import com.google.common.io.CharStreams;

import ai.djl.huggingface.tokenizers.Encoding;
import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;
import lombok.extern.log4j.Log4j2;

/**
 * A Lucene Tokenizer implementation that uses a Hugging Face tokenizer for tokenization.
 * Supports token weighting and handles overflow scenarios.
 */
@Log4j2
public class HFModelTokenizer extends Tokenizer {
    public static final String NAME = "hf_model_tokenizer";
    private static final Float DEFAULT_TOKEN_WEIGHT = 1.0f;

    private final CharTermAttribute termAtt;
    private final PayloadAttribute payloadAtt;
    private final OffsetAttribute offsetAtt;
    private final Supplier<HuggingFaceTokenizer> tokenizerSupplier;
    private final Supplier<Map<String, Float>> tokenWeightsSupplier;

    private Encoding encoding;
    private int tokenIdx = 0;
    private int overflowingIdx = 0;

    public HFModelTokenizer(Supplier<HuggingFaceTokenizer> huggingFaceTokenizerSupplier) {
        this(huggingFaceTokenizerSupplier, null);
    }

    public HFModelTokenizer(Supplier<HuggingFaceTokenizer> huggingFaceTokenizerSupplier, Supplier<Map<String, Float>> weightsSupplier) {
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
        if (Objects.nonNull(weightsSupplier)) {
            payloadAtt = addAttribute(PayloadAttribute.class);
        } else {
            payloadAtt = null;
        }
        tokenizerSupplier = huggingFaceTokenizerSupplier;
        tokenWeightsSupplier = weightsSupplier;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokenIdx = 0;
        overflowingIdx = -1;
        String inputStr = CharStreams.toString(input);
        // For the pre-built analyzer, reset() is called with empty input in checkVersions when a new index service is created.
        // We want to lazy-load the tokenizer only when it is really needed, so we use a supplier and skip empty input.
        encoding = StringUtils.isEmpty(inputStr) ? null : tokenizerSupplier.get().encode(inputStr, false, true);
    }

    private static boolean isLastTokenInEncodingSegment(int idx, Encoding encodingSegment) {
        return idx >= encodingSegment.getTokens().length || encodingSegment.getAttentionMask()[idx] == 0;
    }

    public static byte[] floatToBytes(float value) {
        return ByteBuffer.allocate(4).putFloat(value).array();
    }

    public static float bytesToFloat(byte[] bytes) {
        return ByteBuffer.wrap(bytes).getFloat();
    }

    @Override
    final public boolean incrementToken() throws IOException {
        clearAttributes();
        if (Objects.isNull(encoding))
            return false;
        Encoding curEncoding = overflowingIdx == -1 ? encoding : encoding.getOverflowing()[overflowingIdx];

        while (!isLastTokenInEncodingSegment(tokenIdx, curEncoding) || overflowingIdx < encoding.getOverflowing().length) {
            if (isLastTokenInEncodingSegment(tokenIdx, curEncoding)) {
                // finished the current segment; move to the next overflowing segment
                // until overflowingIdx reaches encoding.getOverflowing().length
                tokenIdx = 0;
                overflowingIdx++;
                if (overflowingIdx >= encoding.getOverflowing().length) {
                    return false;
                }
                curEncoding = encoding.getOverflowing()[overflowingIdx];
            } else {
                termAtt.append(curEncoding.getTokens()[tokenIdx]);
                offsetAtt
                    .setOffset(curEncoding.getCharTokenSpans()[tokenIdx].getStart(), curEncoding.getCharTokenSpans()[tokenIdx].getEnd());
                if (Objects.nonNull(tokenWeightsSupplier)) {
                    // for neural sparse query, write the token weight to the payload field
                    payloadAtt
                        .setPayload(
                            new BytesRef(
                                floatToBytes(
                                    tokenWeightsSupplier.get().getOrDefault(curEncoding.getTokens()[tokenIdx], DEFAULT_TOKEN_WEIGHT)
                                )
                            )
                        );
                }
                tokenIdx++;
                return true;
            }
        }

        return false;
    }
}
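
The payload written in incrementToken is just a 4-byte float (ByteBuffer's default big-endian order), so a neural sparse query can recover each token's weight from the term payload. A small round-trip sketch using the helpers above; the weight value is arbitrary.

import org.apache.lucene.util.BytesRef;
import org.opensearch.ml.engine.analysis.HFModelTokenizer;

public class PayloadEncodingSketch {
    public static void main(String[] args) {
        float weight = 2.375f;                                         // arbitrary example weight
        byte[] encoded = HFModelTokenizer.floatToBytes(weight);        // 4 bytes, big-endian
        BytesRef payload = new BytesRef(encoded);                      // what gets stored on the term
        float decoded = HFModelTokenizer.bytesToFloat(payload.bytes);  // freshly wrapped, so offset 0 / length 4
        System.out.println(decoded);                                   // 2.375
    }
}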
