diff --git a/docs/model_serving_framework/run.ipynb b/docs/model_serving_framework/run.ipynb
new file mode 100644
index 0000000000..81cab5c644
--- /dev/null
+++ b/docs/model_serving_framework/run.ipynb
@@ -0,0 +1,590 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "b32dbef8",
+   "metadata": {
+    "vscode": {
+     "languageId": "raw"
+    }
+   },
+   "source": [
+    "/*\n",
+    " * Copyright OpenSearch Contributors\n",
+    " * SPDX-License-Identifier: Apache-2.0\n",
+    " */"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6ff4e2d",
+   "metadata": {},
+   "source": [
+    "> **_NOTE:_** **This script is supposed to be executed on a SageMaker Notebook!**\n",
+    "\n",
+    "## Prerequisites\n",
+    "- We have set up a **SageMaker Notebook** instance and an **S3 bucket** to store the bundle, and configured their permissions\n",
+    "\n",
+    "## Step 1\n",
+    "Use git to clone this repository to your SageMaker Notebook instance, and open run.ipynb in your SageMaker Notebook\n",
+    "\n",
+    "## Step 2\n",
+    "Prepare the model file for SageMaker. Run the code blocks below in sequence."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6bc7a23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir handler\n",
+    "!mkdir handler/code\n",
+    "!mkdir handler/MAR-INF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4ca2c0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/code/requirements.txt\n",
+    "transformers==4.44.1\n",
+    "sentencepiece==0.1.99"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "071ff5c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/MAR-INF/MANIFEST.json\n",
+    "{\n",
+    "  \"runtime\": \"python\",\n",
+    "  \"model\": {\n",
+    "    \"modelName\": \"neuralsparse\",\n",
+    "    \"handler\": \"neural_sparse_handler.py\",\n",
+    "    \"modelVersion\": \"1.0\",\n",
+    "    \"configFile\": \"neural_sparse_config.yaml\"\n",
+    "  },\n",
+    "  \"archiverVersion\": \"0.9.0\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c7d23ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/neural_sparse_config.yaml\n",
+    "## configs for dynamic batch inference\n",
+    "batchSize: 16\n",
+    "maxBatchDelay: 5\n",
+    "responseTimeout: 300"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ee8d2bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/neural_sparse_handler.py\n",
+    "import os\n",
+    "import re\n",
+    "import itertools\n",
+    "import json\n",
+    "import torch\n",
+    "import transformers\n",
+    "from ts.torch_handler.base_handler import BaseHandler\n",
+    "\n",
+    "model_id = os.environ.get(\n",
+    "    \"MODEL_ID\", \"opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill\"\n",
+    ")\n",
+    "max_bs = int(os.environ.get(\"MAX_BS\", 32))\n",
+    "prune_ratio = float(os.environ.get(\"PRUNE_RATIO\", 0.1))\n",
+    "\n",
+    "version_match = re.search(r\"v(\\\\d+)\", model_id)\n",
+    "use_l0 = False\n",
+    "if version_match:\n",
+    "    version = int(version_match.group(1))\n",
+    "    use_l0 = version >= 3\n",
+    "\n",
+    "\n",
+    "class SparseEncodingModelHandler(BaseHandler):\n",
+    "    class SparseModel(torch.nn.Module):\n",
+    "        @staticmethod\n",
+    "        def from_pretrained(path):\n",
+    "            return SparseEncodingModelHandler.SparseModel(path)\n",
+    "\n",
+    "        def __init__(self, model_id):\n",
+    "            super().__init__()\n",
+    "            self.backbone = transformers.AutoModelForMaskedLM.from_pretrained(model_id)\n",
+    "            self.special_token_ids = []\n",
+    "\n",
+    "        def set_special_token_ids(self, special_token_ids):\n",
+    "            self.special_token_ids = special_token_ids\n",
+    "\n",
+    "        def forward(self, **kwargs):\n",
+    "            output = self.backbone(**kwargs)[0]\n",
+    "            values, _ = torch.max(\n",
+    "                output * kwargs.get(\"attention_mask\").unsqueeze(-1), dim=1\n",
+    "            )\n",
+    "            values = torch.log1p(torch.relu(values))\n",
+    "            if use_l0:\n",
+    "                values = torch.log1p(values)\n",
+    "            values[:, self.special_token_ids] = 0\n",
+    "            max_values = values.max(dim=-1)[0].unsqueeze(1) * prune_ratio\n",
+    "            return values * (values > max_values)\n",
+    "\n",
+    "    class SparsePostProcessor(object):\n",
+    "        def __init__(self, tokenizer):\n",
+    "            self.tokenizer = tokenizer\n",
+    "            self.id_to_token = [\"\" for i in range(tokenizer.vocab_size)]\n",
+    "            for token, _id in tokenizer.vocab.items():\n",
+    "                self.id_to_token[_id] = token\n",
+    "\n",
+    "        def __call__(self, sparse_vector):\n",
+    "            sample_indices, token_indices = torch.nonzero(sparse_vector, as_tuple=True)\n",
+    "            non_zero_values = sparse_vector[(sample_indices, token_indices)].tolist()\n",
+    "            number_of_tokens_for_each_sample = (\n",
+    "                torch.bincount(sample_indices).cpu().tolist()\n",
+    "            )\n",
+    "            tokens = [self.id_to_token[_id] for _id in token_indices.tolist()]\n",
+    "\n",
+    "            output = []\n",
+    "            end_idxs = list(\n",
+    "                itertools.accumulate([0] + number_of_tokens_for_each_sample)\n",
+    "            )\n",
+    "            for i in range(len(end_idxs) - 1):\n",
+    "                token_strings = tokens[end_idxs[i] : end_idxs[i + 1]]\n",
+    "                weights = non_zero_values[end_idxs[i] : end_idxs[i + 1]]\n",
+    "                output.append(dict(zip(token_strings, weights)))\n",
+    "            return output\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.special_token_ids = None\n",
+    "        self.tokenizer = None\n",
+    "        self.all_tokens = None\n",
+    "        self.initialized = False\n",
+    "\n",
+    "    def initialize(self, context):\n",
+    "        self.manifest = context.manifest\n",
+    "        properties = context.system_properties\n",
+    "\n",
+    "        # Print initialization parameters\n",
+    "        print(f\"Initializing SparseEncodingModelHandler with model_id: {model_id}\")\n",
+    "        print(\n",
+    "            f\"Configuration parameters - use_l0: {use_l0}, max_bs: {max_bs}, prune_ratio: {prune_ratio}\"\n",
+    "        )\n",
+    "\n",
+    "        # load model and tokenizer\n",
+    "        self.device = torch.device(\n",
+    "            \"cuda:\" + str(properties.get(\"gpu_id\"))\n",
+    "            if torch.cuda.is_available()\n",
+    "            else \"cpu\"\n",
+    "        )\n",
+    "        print(f\"Using device: {self.device}\")\n",
+    "        self.model = SparseEncodingModelHandler.SparseModel.from_pretrained(model_id)\n",
+    "        self.model.to(self.device)\n",
+    "        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)\n",
+    "\n",
+    "        self.post_processor = SparseEncodingModelHandler.SparsePostProcessor(\n",
+    "            tokenizer=self.tokenizer\n",
+    "        )\n",
+    "        self.special_token_ids = [\n",
+    "            self.tokenizer.vocab[token]\n",
+    "            for token in self.tokenizer.special_tokens_map.values()\n",
+    "        ]\n",
+    "        self.model.set_special_token_ids(self.special_token_ids)\n",
+    "\n",
+    "        self.initialized = True\n",
+    "\n",
+    "    def preprocess(self, requests):\n",
+    "        inputSentence = []\n",
+    "        batch_idx = []\n",
+    "        for request in requests:\n",
+    "            request_body = request.get(\"body\")\n",
+    "            if isinstance(request_body, bytearray):\n",
+    "                request_body = request_body.decode(\"utf-8\")\n",
+    "            request_body = json.loads(request_body)\n",
+    "\n",
+    "            if isinstance(request_body, list):\n",
+    "                inputSentence += request_body\n",
+    "                batch_idx.append(len(request_body))\n",
+    "            else:\n",
+    "                inputSentence.append(request_body)\n",
+    "                batch_idx.append(1)\n",
+    "\n",
+    "        input_data = self.tokenizer(\n",
+    "            inputSentence,\n",
+    "            padding=True,\n",
+    "            truncation=True,\n",
+    "            max_length=self.tokenizer.model_max_length,\n",
+    "            return_tensors=\"pt\",\n",
+    "            return_attention_mask=True,\n",
+    "            return_token_type_ids=False,\n",
+    "        )\n",
+    "\n",
+    "        tokens = input_data[\"attention_mask\"].sum(dim=-1).numpy().tolist()\n",
+    "        input_data = input_data.to(self.device)\n",
+    "        return {\"input\": input_data, \"batch_l\": batch_idx, \"tokens\": tokens}\n",
+    "\n",
+    "    def inference(self, data, *args, **kwargs):\n",
+    "        batch_idx = data[\"batch_l\"]\n",
+    "        tokens = data[\"tokens\"]\n",
+    "        data_input = data[\"input\"]\n",
+    "\n",
+    "        total_samples = len(tokens)\n",
+    "        outputs = []\n",
+    "\n",
+    "        # split the (possibly large) dynamic batch into chunks of at most max_bs\n",
+    "        for start_idx in range(0, total_samples, max_bs):\n",
+    "            end_idx = min(start_idx + max_bs, total_samples)\n",
+    "\n",
+    "            batch_data = {\n",
+    "                \"input_ids\": data_input[\"input_ids\"][start_idx:end_idx],\n",
+    "                \"attention_mask\": data_input[\"attention_mask\"][start_idx:end_idx],\n",
+    "            }\n",
+    "\n",
+    "            with torch.cuda.amp.autocast(), torch.no_grad():\n",
+    "                output = self.model(**batch_data)\n",
+    "            outputs.append(output)\n",
+    "\n",
+    "        output = torch.cat(outputs, dim=0)\n",
+    "        return {\"pred\": output, \"batch_l\": batch_idx, \"tokens\": tokens}\n",
+    "\n",
+    "    def postprocess(self, prediction):\n",
+    "        batch_idx = prediction[\"batch_l\"]\n",
+    "        output = prediction[\"pred\"]\n",
+    "        tokens = prediction[\"tokens\"]\n",
+    "        output = self.post_processor(output)\n",
+    "        usage = [\n",
+    "            {\"inputTokens\": tokens[i], \"outputTokens\": 0} for i in range(len(output))\n",
+    "        ]\n",
+    "\n",
+    "        # return the inference results to each request according to batch size\n",
+    "        outputs = []\n",
+    "        index = 0\n",
+    "        for b in batch_idx:\n",
+    "            outputs.append(\n",
+    "                {\n",
+    "                    \"response\": output[index : index + b],\n",
+    "                    \"tokens\": usage[index : index + b],\n",
+    "                }\n",
+    "            )\n",
+    "            index += b\n",
+    "        return outputs\n",
+    "\n",
+    "    def handle(self, data, context):\n",
+    "        model_input = self.preprocess(data)\n",
+    "        model_output = self.inference(model_input)\n",
+    "        model_output = self.postprocess(model_output)\n",
+    "        return model_output"
+   ]
+  },
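+  {
+   "cell_type": "markdown",
+   "id": "a1f20c01",
+   "metadata": {},
+   "source": [
+    "(Optional) Before packaging, you can sanity-check the encoding logic locally. The cell below is a minimal sketch that mirrors the handler's forward pass, without the special-token masking and pruning done above; it downloads the model from Hugging Face, so it may take a few minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1f20c02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional local sanity check (sketch): encode one sentence and show top tokens\n",
+    "import torch\n",
+    "import transformers\n",
+    "\n",
+    "model_id = \"opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill\"\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)\n",
+    "backbone = transformers.AutoModelForMaskedLM.from_pretrained(model_id)\n",
+    "\n",
+    "features = tokenizer(\n",
+    "    [\"Currently New York is rainy.\"],\n",
+    "    padding=True,\n",
+    "    truncation=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    return_token_type_ids=False,\n",
+    ")\n",
+    "with torch.no_grad():\n",
+    "    output = backbone(**features)[0]\n",
+    "values, _ = torch.max(output * features[\"attention_mask\"].unsqueeze(-1), dim=1)\n",
+    "values = torch.log1p(torch.relu(values))\n",
+    "\n",
+    "# print the ten highest-weighted tokens\n",
+    "top = torch.topk(values[0], 10)\n",
+    "for weight, idx in zip(top.values.tolist(), top.indices.tolist()):\n",
+    "    print(tokenizer.convert_ids_to_tokens(idx), round(weight, 3))"
+   ]
+  },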
+  {
+   "cell_type": "markdown",
+   "id": "0de74dbf",
+   "metadata": {},
+   "source": [
+    "Wrap the handler folder into a tarball and upload it to your S3 bucket.\n",
+    "\n",
+    "In handler/neural_sparse_handler.py, we define the model loading, pre-processing, inference, and post-processing. We use mixed precision to accelerate the inference.\n",
+    "\n",
+    "In handler/neural_sparse_config.yaml, we define some configs for TorchServe (including dynamic micro-batching)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebf38b1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "bucket_name = \"your_bucket_name\"\n",
+    "os.system(\"tar -czvf neural-sparse-handler.tar.gz -C handler/ .\")\n",
+    "os.system(\n",
+    "    f\"aws s3 cp neural-sparse-handler.tar.gz s3://{bucket_name}/neural-sparse-handler.tar.gz\"\n",
+    ")"
+   ]
+  },
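+  {
+   "cell_type": "markdown",
+   "id": "b2e31d01",
+   "metadata": {},
+   "source": [
+    "You can verify the archive layout before deploying: MAR-INF/MANIFEST.json, neural_sparse_handler.py, neural_sparse_config.yaml, and code/requirements.txt should all sit at the archive root (that is why the tar command above uses `-C handler/ .`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2e31d02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!tar -tzf neural-sparse-handler.tar.gz"
+   ]
+  },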
+  {
+   "cell_type": "markdown",
+   "id": "161796c1",
+   "metadata": {},
+   "source": [
+    "## Step 3\n",
+    "Use the SageMaker Python SDK to deploy the tarball to a real-time inference endpoint.\n",
+    "\n",
+    "Here we use ml.g5.xlarge, a GPU instance with good price-performance.\n",
+    "\n",
+    "Please modify the region according to your settings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d16be94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# constants that can be customized for models\n",
+    "model_id = \"opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill\"\n",
+    "max_batch_size = \"32\"\n",
+    "prune_ratio = \"0.1\"\n",
+    "\n",
+    "# constants related to deployment\n",
+    "model_name = \"ns-handler\"\n",
+    "endpoint_name = \"ns-handler\"\n",
+    "instance_type = \"ml.g5.xlarge\"\n",
+    "initial_instance_count = 1\n",
+    "\n",
+    "# run this cell\n",
+    "import boto3\n",
+    "import sagemaker\n",
+    "from sagemaker.model import Model\n",
+    "from sagemaker.predictor import Predictor\n",
+    "from sagemaker.serializers import JSONSerializer\n",
+    "from sagemaker.deserializers import JSONDeserializer\n",
+    "\n",
+    "role = sagemaker.get_execution_role()\n",
+    "sess = boto3.Session()\n",
+    "region = sess.region_name\n",
+    "smsess = sagemaker.Session(boto_session=sess)\n",
+    "\n",
+    "envs = {\n",
+    "    \"TS_ASYNC_LOGGING\": \"true\",\n",
+    "    \"MODEL_ID\": model_id,\n",
+    "    \"MAX_BS\": max_batch_size,\n",
+    "    \"PRUNE_RATIO\": prune_ratio,\n",
+    "}\n",
+    "\n",
+    "baseimage = sagemaker.image_uris.retrieve(\n",
+    "    framework=\"pytorch\",\n",
+    "    region=region,\n",
+    "    py_version=\"py312\",\n",
+    "    image_scope=\"inference\",\n",
+    "    version=\"2.6\",\n",
+    "    instance_type=instance_type,\n",
+    ")\n",
+    "\n",
+    "model = Model(\n",
+    "    model_data=f\"s3://{bucket_name}/neural-sparse-handler.tar.gz\",\n",
+    "    image_uri=baseimage,\n",
+    "    role=role,\n",
+    "    predictor_cls=Predictor,\n",
+    "    name=model_name,\n",
+    "    sagemaker_session=smsess,\n",
+    "    env=envs,\n",
+    ")\n",
+    "\n",
+    "# ml.g5 instances come with local NVMe storage, so no EBS volume_size is set here\n",
+    "predictor = model.deploy(\n",
+    "    instance_type=instance_type,\n",
+    "    initial_instance_count=initial_instance_count,\n",
+    "    endpoint_name=endpoint_name,\n",
+    "    serializer=JSONSerializer(),\n",
+    "    deserializer=JSONDeserializer(),\n",
+    "    model_data_download_timeout=3600,\n",
+    "    container_startup_health_check_timeout=3600,\n",
+    ")\n",
+    "\n",
+    "print(predictor.endpoint_name)"
+   ]
+  },
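+  {
+   "cell_type": "markdown",
+   "id": "c3d42e01",
+   "metadata": {},
+   "source": [
+    "Because the predictor was created with a JSON serializer and deserializer, you can also invoke the endpoint directly through the SageMaker SDK. This is equivalent to the boto3 call in Step 4 below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3d42e02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# quick check through the SageMaker SDK predictor\n",
+    "predictor.predict([\"Currently New York is rainy.\"])"
+   ]
+  },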
+  {
+   "cell_type": "markdown",
+   "id": "0863ed26",
+   "metadata": {},
+   "source": [
+    "## Step 4\n",
+    "\n",
+    "After we create the endpoint, send a sample request to see how it works"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1a7918c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run this cell\n",
+    "import json\n",
+    "\n",
+    "body = [\"Currently New York is rainy.\"]\n",
+    "amz = boto3.client(\"sagemaker-runtime\")\n",
+    "\n",
+    "response = amz.invoke_endpoint(\n",
+    "    EndpointName=predictor.endpoint_name,\n",
+    "    Body=json.dumps(body),\n",
+    "    ContentType=\"application/json\",\n",
+    ")\n",
+    "\n",
+    "res = response[\"Body\"].read()\n",
+    "results = json.loads(res.decode(\"utf8\"))\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae30128b",
+   "metadata": {
+    "vscode": {
+     "languageId": "raw"
+    }
+   },
+   "source": [
+    "response:\n",
+    "```python\n",
+    "{'response': [{'has': 0.19832642376422882,\n",
+    "   'new': 0.9849710464477539,\n",
+    "   'like': 0.20112557709217072,\n",
+    "   'now': 0.7473171949386597,\n",
+    "   'state': 0.20818853378295898,\n",
+    "   'still': 0.26296505331993103,\n",
+    "   'going': 0.17759032547473907,\n",
+    "   'york': 1.5465646982192993,\n",
+    "   'water': 0.5180262327194214,\n",
+    "   'present': 0.24726435542106628,\n",
+    "   'today': 0.5316043496131897,\n",
+    "   'currently': 0.6706798672676086,\n",
+    "   'current': 0.9104140996932983,\n",
+    "   'dry': 0.2999960780143738,\n",
+    "   'rain': 1.3858059644699097,\n",
+    "   'weather': 1.4669378995895386,\n",
+    "   'climate': 0.392688512802124,\n",
+    "   'wet': 1.070887804031372,\n",
+    "   'happening': 0.3875649571418762,\n",
+    "   'ny': 1.4108916521072388,\n",
+    "   'brooklyn': 0.2983669638633728,\n",
+    "   'yorkshire': 0.15651951730251312,\n",
+    "   'manhattan': 0.969535231590271,\n",
+    "   'flood': 0.2403770089149475,\n",
+    "   'flooding': 0.4161500036716461,\n",
+    "   'rainfall': 0.9889746904373169,\n",
+    "   'damp': 0.38938602805137634,\n",
+    "   'moist': 0.32199856638908386,\n",
+    "   'mist': 0.2026219218969345,\n",
+    "   'precipitation': 0.5729197263717651,\n",
+    "   'drought': 0.41227778792381287,\n",
+    "   'rains': 0.8187123537063599,\n",
+    "   'rainy': 1.4709837436676025,\n",
+    "   'nyc': 1.308121681213379,\n",
+    "   'yorker': 0.6350979804992676,\n",
+    "   'monsoon': 0.6218147873878479,\n",
+    "   'raining': 0.9827804565429688,\n",
+    "   'cloudy': 0.6314691305160522,\n",
+    "   'nyu': 0.7196483612060547}],\n",
+    " 'tokens': [{'inputTokens': 8, 'outputTokens': 0}]}\n",
+    "```"
+   ]
+  },
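+  {
+   "cell_type": "markdown",
+   "id": "d4e53f01",
+   "metadata": {},
+   "source": [
+    "The handler also accepts a list of several sentences in one request (preprocess flattens list bodies and postprocess splits the results back per request). A batch call looks like this; the second sentence is just a made-up example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4e53f02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# batch request: one sparse vector is returned per input sentence\n",
+    "body = [\n",
+    "    \"Currently New York is rainy.\",\n",
+    "    \"The weather in Seattle is cloudy.\",\n",
+    "]\n",
+    "\n",
+    "response = amz.invoke_endpoint(\n",
+    "    EndpointName=predictor.endpoint_name,\n",
+    "    Body=json.dumps(body),\n",
+    "    ContentType=\"application/json\",\n",
+    ")\n",
+    "\n",
+    "results = json.loads(response[\"Body\"].read().decode(\"utf8\"))\n",
+    "len(results[\"response\"])  # 2, one entry per sentence"
+   ]
+  },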
+  {
+   "cell_type": "markdown",
+   "id": "33035889",
+   "metadata": {},
+   "source": [
+    "## Step 5\n",
+    "> **_NOTE:_** **This step is supposed to be executed on an instance that has access to the OpenSearch cluster!**\n",
+    "\n",
+    "Register this SageMaker endpoint with your OpenSearch cluster.\n",
+    "\n",
+    "Please check the OpenSearch documentation for more information. Here we provide one demo request body using access_key and secret_key. Please choose the authentication method according to your use case.\n",
+    "\n",
+    "### create connector\n",
+    "\n",
+    "(Fill in the region and predictor.endpoint_name in the request body.)\n",
+    "```json\n",
+    "POST /_plugins/_ml/connectors/_create\n",
+    "{\n",
+    "  \"name\": \"test\",\n",
+    "  \"description\": \"Test connector for Sagemaker model\",\n",
+    "  \"version\": 1,\n",
+    "  \"protocol\": \"aws_sigv4\",\n",
+    "  \"credential\": {\n",
+    "    \"access_key\": \"your access key\",\n",
+    "    \"secret_key\": \"your secret key\"\n",
+    "  },\n",
+    "  \"parameters\": {\n",
+    "    \"region\": \"{region}\",\n",
+    "    \"service_name\": \"sagemaker\",\n",
+    "    \"input_docs_processed_step_size\": 2\n",
+    "  },\n",
+    "  \"actions\": [\n",
+    "    {\n",
+    "      \"action_type\": \"predict\",\n",
+    "      \"method\": \"POST\",\n",
+    "      \"headers\": {\n",
+    "        \"content-type\": \"application/json\"\n",
+    "      },\n",
+    "      \"url\": \"https://runtime.sagemaker.{region}.amazonaws.com/endpoints/{predictor.endpoint_name}/invocations\",\n",
+    "      \"request_body\": \"${parameters.input}\"\n",
+    "    }\n",
+    "  ],\n",
+    "  \"client_config\": {\n",
+    "    \"max_retry_times\": -1,\n",
+    "    \"max_connection\": 60,\n",
+    "    \"retry_backoff_millis\": 10\n",
+    "  }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "### register model\n",
+    "```json\n",
+    "POST /_plugins/_ml/models/_register?deploy=true\n",
+    "{\n",
+    "  \"name\": \"test\",\n",
+    "  \"function_name\": \"remote\",\n",
+    "  \"version\": \"1.0.0\",\n",
+    "  \"connector_id\": \"{connector id}\",\n",
+    "  \"description\": \"Test connector for Sagemaker model\"\n",
+    "}\n",
+    "```"
+   ]
+  },
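+  {
+   "cell_type": "markdown",
+   "id": "e5f64a01",
+   "metadata": {},
+   "source": [
+    "### test the model\n",
+    "Once the register call returns a model ID, you can smoke-test the remote model with the ML Commons Predict API. The request below is a sketch that mirrors the payload used in Step 4; fill in the model ID from the register response.\n",
+    "```json\n",
+    "POST /_plugins/_ml/models/{model id}/_predict\n",
+    "{\n",
+    "  \"parameters\": {\n",
+    "    \"input\": [\"Currently New York is rainy.\"]\n",
+    "  }\n",
+    "}\n",
+    "```"
+   ]
+  },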
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}