diff --git a/docs/model_serving_framework/deploy_sparse_model_to_SageMaker.ipynb b/docs/model_serving_framework/deploy_sparse_model_to_SageMaker.ipynb
new file mode 100644
index 0000000000..7997346c93
--- /dev/null
+++ b/docs/model_serving_framework/deploy_sparse_model_to_SageMaker.ipynb
@@ -0,0 +1,502 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "b32dbef8",
+   "metadata": {
+    "vscode": {
+     "languageId": "raw"
+    }
+   },
+   "source": [
+    "/*\n",
+    " * Copyright OpenSearch Contributors\n",
+    " * SPDX-License-Identifier: Apache-2.0\n",
+    " */"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6ff4e2d",
+   "metadata": {},
+   "source": [
+    "> **_NOTE:_** **This script is supposed to be executed in a SageMaker Notebook!**\n",
+    "\n",
+    "## Prerequisites\n",
+    "- We have set up a **SageMaker Notebook** and an **S3 bucket** to store the model bundle, and configured their permissions\n",
+    "\n",
+    "## Step 1\n",
+    "Use git to clone this repository to your SageMaker Notebook instance, and open this notebook in your SageMaker Notebook\n",
+    "\n",
+    "## Step 2\n",
+    "Prepare the model file for SageMaker. Run the code blocks below in sequence."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6bc7a23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p handler/code handler/MAR-INF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4ca2c0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/code/requirements.txt\n",
+    "sentence-transformers==5.0.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "071ff5c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/MAR-INF/MANIFEST.json\n",
+    "{\n",
+    "  \"runtime\": \"python\",\n",
+    "  \"model\": {\n",
+    "    \"modelName\": \"neuralsparse\",\n",
+    "    \"handler\": \"neural_sparse_handler.py\",\n",
+    "    \"modelVersion\": \"1.0\",\n",
+    "    \"configFile\": \"neural_sparse_config.yaml\"\n",
+    "  },\n",
+    "  \"archiverVersion\": \"0.9.0\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c7d23ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/neural_sparse_config.yaml\n",
+    "## Configs for dynamic batch inference: https://docs.pytorch.org/serve/batch_inference_with_ts.html\n",
+    "## batchSize: the maximum number of requests to aggregate. Each request can contain multiple documents.\n",
+    "## maxBatchDelay: the maximum time (in ms) TorchServe waits to aggregate a batch.\n",
+    "batchSize: 16\n",
+    "maxBatchDelay: 5\n",
+    "responseTimeout: 300"
+   ]
+  },
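+  {
+   "cell_type": "markdown",
+   "id": "f1e2d3c4",
+   "metadata": {},
+   "source": [
+    "(Optional) Before writing the handler, we can sanity-check the `SparseEncoder` encode/decode flow it will rely on. The next cell is a minimal sketch that assumes `sentence-transformers==5.0.0` is installed in the notebook kernel; it downloads the model and is not required for the deployment itself."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1e2d3c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check of the SparseEncoder flow used by the handler in the next cell.\n",
+    "# Assumes sentence-transformers==5.0.0 is installed in this kernel; not required for deployment.\n",
+    "from sentence_transformers.sparse_encoder import SparseEncoder\n",
+    "\n",
+    "encoder = SparseEncoder(\n",
+    "    \"opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte\",\n",
+    "    trust_remote_code=True,\n",
+    ")\n",
+    "embeddings = encoder.encode_document([\"Currently New York is rainy.\"])\n",
+    "# decode() maps each sparse embedding back to (token, weight) pairs\n",
+    "print(dict(encoder.decode(embeddings)[0]))"
+   ]
+  },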
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ee8d2bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile handler/neural_sparse_handler.py\n",
+    "\n",
+    "import os\n",
+    "import itertools\n",
+    "import json\n",
+    "import torch\n",
+    "\n",
+    "from ts.torch_handler.base_handler import BaseHandler\n",
+    "from sentence_transformers.sparse_encoder import SparseEncoder\n",
+    "\n",
+    "model_id = os.environ.get(\n",
+    "    \"MODEL_ID\", \"opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte\"\n",
+    ")\n",
+    "max_bs = int(os.environ.get(\"MAX_BS\", 32))\n",
+    "trust_remote_code = model_id.endswith(\"gte\")\n",
+    "\n",
+    "class SparseEncodingModelHandler(BaseHandler):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.initialized = False\n",
+    "\n",
+    "    def initialize(self, context):\n",
+    "        self.manifest = context.manifest\n",
+    "        properties = context.system_properties\n",
+    "\n",
+    "        # Print initialization parameters\n",
+    "        print(f\"Initializing SparseEncodingModelHandler with model_id: {model_id}\")\n",
+    "\n",
+    "        # Load the model onto GPU if available, otherwise CPU\n",
+    "        self.device = torch.device(\n",
+    "            \"cuda:\" + str(properties.get(\"gpu_id\"))\n",
+    "            if torch.cuda.is_available()\n",
+    "            else \"cpu\"\n",
+    "        )\n",
+    "        print(f\"Using device: {self.device}\")\n",
+    "        self.model = SparseEncoder(model_id, device=self.device, trust_remote_code=trust_remote_code)\n",
+    "        self.initialized = True\n",
+    "\n",
+    "    def preprocess(self, requests):\n",
+    "        # Flatten the documents from all aggregated requests and remember\n",
+    "        # how many documents each request contributed.\n",
+    "        inputSentence = []\n",
+    "        batch_idx = []\n",
+    "\n",
+    "        for request in requests:\n",
+    "            request_body = request.get(\"body\")\n",
+    "            if isinstance(request_body, (bytes, bytearray)):\n",
+    "                request_body = request_body.decode(\"utf-8\")\n",
+    "            request_body = json.loads(request_body)\n",
+    "            if isinstance(request_body, list):\n",
+    "                inputSentence += request_body\n",
+    "                batch_idx.append(len(request_body))\n",
+    "            else:\n",
+    "                inputSentence.append(request_body)\n",
+    "                batch_idx.append(1)\n",
+    "\n",
+    "        return inputSentence, batch_idx\n",
+    "\n",
+    "    def handle(self, data, context):\n",
+    "        inputSentence, batch_idx = self.preprocess(data)\n",
+    "        # Encode all documents in micro-batches of at most max_bs documents\n",
+    "        model_output = self.model.encode_document(inputSentence, batch_size=max_bs)\n",
+    "        sparse_embedding = list(map(dict, self.model.decode(model_output)))\n",
+    "\n",
+    "        # Split the flat embedding list back into one list per request\n",
+    "        outputs = [sparse_embedding[s:e]\n",
+    "                   for s, e in zip([0]+list(itertools.accumulate(batch_idx))[:-1],\n",
+    "                                   itertools.accumulate(batch_idx))]\n",
+    "        return outputs"
+   ]
+  },
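+  {
+   "cell_type": "markdown",
+   "id": "b2c3d4e5",
+   "metadata": {},
+   "source": [
+    "The only subtle part of the handler is splitting the flat list of embeddings back into one list per request. The next cell is a minimal pure-Python sketch of that bookkeeping; the stand-in values are hypothetical and no model is needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2c3d4e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Demonstrates how the handler splits flat results back into per-request lists.\n",
+    "import itertools\n",
+    "\n",
+    "batch_idx = [2, 1, 3]  # number of documents contributed by each request\n",
+    "flat = [\"d0\", \"d1\", \"d2\", \"d3\", \"d4\", \"d5\"]  # stand-ins for sparse embeddings\n",
+    "starts = [0] + list(itertools.accumulate(batch_idx))[:-1]\n",
+    "ends = itertools.accumulate(batch_idx)\n",
+    "print([flat[s:e] for s, e in zip(starts, ends)])\n",
+    "# -> [['d0', 'd1'], ['d2'], ['d3', 'd4', 'd5']]"
+   ]
+  },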
+  {
+   "cell_type": "markdown",
+   "id": "0de74dbf",
+   "metadata": {},
+   "source": [
+    "Wrap the handler folder into a tarball and upload it to your S3 bucket.\n",
+    "\n",
+    "In handler/neural_sparse_handler.py, we define the model loading, pre-processing, inference, and post-processing. TorchServe aggregates concurrent requests into one batch, and the handler encodes all of their documents together with `encode_document`.\n",
+    "\n",
+    "In handler/neural_sparse_config.yaml, we define the configs for TorchServe (including dynamic micro-batching)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebf38b1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "bucket_name = \"your_bucket_name\"\n",
+    "os.system(\"tar -czvf neural-sparse-handler.tar.gz -C handler/ .\")\n",
+    "os.system(\n",
+    "    f\"aws s3 cp neural-sparse-handler.tar.gz s3://{bucket_name}/neural-sparse-handler.tar.gz\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "161796c1",
+   "metadata": {},
+   "source": [
+    "## Step 3\n",
+    "Use the SageMaker Python SDK to deploy the tarball to a real-time inference endpoint\n",
+    "\n",
+    "Here we use ml.g5.xlarge. It's a GPU instance with good price-performance.\n",
+    "\n",
+    "Please modify the constants below according to your settings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d16be94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# constants that can be customized for models\n",
+    "model_id = \"opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte\"\n",
+    "## The maximum number of documents to encode in a single inference step. A value that is too large will cause CUDA OOM.\n",
+    "## Even if batchSize is set to 16, the actual number of documents can be larger, because one request can contain multiple documents.\n",
+    "max_batch_size = \"32\"\n",
+    "\n",
+    "# constants related to deployment\n",
+    "model_name = \"ns-handler\"\n",
+    "endpoint_name = \"ns-handler\"\n",
+    "instance_type = \"ml.g5.xlarge\"\n",
+    "initial_instance_count = 1\n",
+    "\n",
+    "# run this cell\n",
+    "import boto3\n",
+    "import sagemaker\n",
+    "from sagemaker.model import Model\n",
+    "from sagemaker.predictor import Predictor\n",
+    "from sagemaker.serializers import JSONSerializer\n",
+    "from sagemaker.deserializers import JSONDeserializer\n",
+    "\n",
+    "role = sagemaker.get_execution_role()\n",
+    "sess = boto3.Session()\n",
+    "region = sess.region_name\n",
+    "smsess = sagemaker.Session(boto_session=sess)\n",
+    "\n",
+    "envs = {\n",
+    "    \"TS_ASYNC_LOGGING\": \"true\",\n",
+    "    \"MODEL_ID\": model_id,\n",
+    "    \"MAX_BS\": max_batch_size,\n",
+    "}\n",
+    "\n",
+    "baseimage = sagemaker.image_uris.retrieve(\n",
+    "    framework=\"pytorch\",\n",
+    "    region=region,\n",
+    "    py_version=\"py312\",\n",
+    "    image_scope=\"inference\",\n",
+    "    version=\"2.6\",\n",
+    "    instance_type=instance_type,\n",
+    ")\n",
+    "\n",
+    "model = Model(\n",
+    "    model_data=f\"s3://{bucket_name}/neural-sparse-handler.tar.gz\",\n",
+    "    image_uri=baseimage,\n",
+    "    role=role,\n",
+    "    predictor_cls=Predictor,\n",
+    "    name=model_name,\n",
+    "    sagemaker_session=smsess,\n",
+    "    env=envs,\n",
+    ")\n",
+    "\n",
+    "# ml.g5 instances come with local NVMe storage, so no extra EBS volume is configured\n",
+    "predictor = model.deploy(\n",
+    "    instance_type=instance_type,\n",
+    "    initial_instance_count=initial_instance_count,\n",
+    "    endpoint_name=endpoint_name,\n",
+    "    serializer=JSONSerializer(),\n",
+    "    deserializer=JSONDeserializer(),\n",
+    "    model_data_download_timeout=3600,\n",
+    "    container_startup_health_check_timeout=3600,\n",
+    ")\n",
+    "\n",
+    "print(predictor.endpoint_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0863ed26",
+   "metadata": {},
+   "source": [
+    "## Step 4\n",
+    "\n",
+    "After the endpoint is created, send a sample request to see how it works"
+   ]
+  },
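+  {
+   "cell_type": "markdown",
+   "id": "c3d4e5f6",
+   "metadata": {},
+   "source": [
+    "Since `model.deploy` returned a `Predictor` configured with JSON (de)serializers, you can invoke the endpoint directly through it. The next cell is a minimal sketch equivalent to the boto3 call that follows; the handler accepts either a single string or a list of strings as the JSON body."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3d4e5f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch: invoke the endpoint via the Predictor returned by model.deploy.\n",
+    "# Equivalent to the boto3 invoke_endpoint call in the next cell.\n",
+    "sparse_vectors = predictor.predict([\"Currently New York is rainy.\"])\n",
+    "print(sparse_vectors)"
+   ]
+  },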
"metadata": {}, + "outputs": [], + "source": [ + "# run this cell\n", + "import json\n", + "\n", + "body = [\"Currently New York is rainy.\"]\n", + "amz = boto3.client(\"sagemaker-runtime\")\n", + "\n", + "response = amz.invoke_endpoint(\n", + " EndpointName=predictor.endpoint_name,\n", + " Body=json.dumps(body),\n", + " ContentType=\"application/json\",\n", + ")\n", + "\n", + "res = response[\"Body\"].read()\n", + "results = json.loads(res.decode(\"utf8\"))\n", + "results" + ] + }, + { + "cell_type": "markdown", + "id": "ae30128b", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "response:\n", + "```json\n", + "[{'weather': 1.0386549234390259,\n", + " 'york': 1.0295555591583252,\n", + " 'ny': 0.9703458547592163,\n", + " 'rain': 0.9549243450164795,\n", + " 'rainy': 0.9478658437728882,\n", + " 'nyc': 0.8449130058288574,\n", + " 'new': 0.6880059838294983,\n", + " 'raining': 0.6789529323577881,\n", + " 'current': 0.6762993931770325,\n", + " 'wet': 0.6448248028755188,\n", + " 'rainfall': 0.6405332088470459,\n", + " 'currently': 0.6092915534973145,\n", + " 'now': 0.586189329624176,\n", + " 'manhattan': 0.5858010053634644,\n", + " 'today': 0.5322379469871521,\n", + " 'temperature': 0.5275187492370605,\n", + " 'climate': 0.48528429865837097,\n", + " 'is': 0.481422483921051,\n", + " 'y': 0.4586825370788574,\n", + " '##yo': 0.45718008279800415,\n", + " 'cloudy': 0.41763371229171753,\n", + " 'it': 0.41397932171821594,\n", + " 'forecast': 0.38210317492485046,\n", + " 'rains': 0.3785228431224823,\n", + " 'rained': 0.35427314043045044,\n", + " 'yorkshire': 0.31092309951782227,\n", + " 'snow': 0.30391135811805725,\n", + " 'yorker': 0.28260838985443115,\n", + " 'time': 0.27697092294692993,\n", + " 'sunny': 0.2620435059070587,\n", + " 'nyu': 0.2503677308559418,\n", + " 'in': 0.24964851140975952,\n", + " 'windy': 0.2452678382396698,\n", + " 'presently': 0.22908653318881989,\n", + " 'stormy': 0.21931196749210358,\n", + " 'temperatures': 0.21101005375385284,\n", + " 'tonight': 0.20632436871528625,\n", + " 'present': 0.20109090209007263,\n", + " 'this': 0.20102401077747345,\n", + " 'us': 0.1935725212097168,\n", + " 'nj': 0.18026664853096008,\n", + " 'storm': 0.17380213737487793,\n", + " 'week': 0.17336463928222656,\n", + " 'news': 0.16366833448410034,\n", + " '##storm': 0.16161945462226868,\n", + " 'here': 0.14572882652282715,\n", + " 'temps': 0.13970820605754852,\n", + " 'lately': 0.13716177642345428,\n", + " '##weather': 0.13432787358760834,\n", + " 'te': 0.1198926791548729,\n", + " 'yesterday': 0.11460382491350174,\n", + " 'or': 0.11349867284297943,\n", + " 'storms': 0.11013525724411011,\n", + " 'sunshine': 0.09905409067869186,\n", + " 'usa': 0.09774350374937057,\n", + " 'clouds': 0.09281915426254272,\n", + " 'humidity': 0.09233205765485764,\n", + " 'humid': 0.086763896048069,\n", + " 'daylight': 0.08338665962219238,\n", + " 'state': 0.08252169191837311,\n", + " 'winter': 0.07992527633905411,\n", + " 'summer': 0.07536710053682327,\n", + " 'fog': 0.06763386726379395,\n", + " 'mood': 0.06538641452789307,\n", + " 'like': 0.06360717862844467,\n", + " 'hurricane': 0.062024328857660294,\n", + " 'water': 0.061854153871536255,\n", + " 'hudson': 0.0577932633459568,\n", + " 'gloom': 0.04488009959459305,\n", + " 'flu': 0.04299859702587128,\n", + " 'sunday': 0.039578113704919815,\n", + " 'brooklyn': 0.03740933537483215,\n", + " 'season': 0.03519425913691521,\n", + " 'month': 0.026503682136535645,\n", + " 'america': 0.025791412219405174,\n", + " 'monsoon': 0.01986435428261757,\n", + " 
+  {
+   "cell_type": "markdown",
+   "id": "33035889",
+   "metadata": {},
+   "source": [
+    "## Step 5\n",
+    "> **_NOTE:_** **This step is supposed to be executed on an instance that has access to the OpenSearch cluster!**\n",
+    "\n",
+    "Register this SageMaker endpoint on your OpenSearch cluster\n",
+    "\n",
+    "Please check the OpenSearch documentation for more information. Here we provide one demo request body using access_key and secret_key. Please choose the authentication method according to your use case.\n",
+    "\n",
+    "### create connector\n",
+    "\n",
+    "(Fill in the region and predictor.endpoint_name in the request body)\n",
+    "```json\n",
+    "POST /_plugins/_ml/connectors/_create\n",
+    "{\n",
+    "  \"name\": \"test\",\n",
+    "  \"description\": \"Test connector for Sagemaker model\",\n",
+    "  \"version\": 1,\n",
+    "  \"protocol\": \"aws_sigv4\",\n",
+    "  \"credential\": {\n",
+    "    \"access_key\": \"your access key\",\n",
+    "    \"secret_key\": \"your secret key\"\n",
+    "  },\n",
+    "  \"parameters\": {\n",
+    "    \"region\": \"{region}\",\n",
+    "    \"service_name\": \"sagemaker\",\n",
+    "    \"input_docs_processed_step_size\": 2\n",
+    "  },\n",
+    "  \"actions\": [\n",
+    "    {\n",
+    "      \"action_type\": \"predict\",\n",
+    "      \"method\": \"POST\",\n",
+    "      \"headers\": {\n",
+    "        \"content-type\": \"application/json\"\n",
+    "      },\n",
+    "      \"url\": \"https://runtime.sagemaker.{region}.amazonaws.com/endpoints/{predictor.endpoint_name}/invocations\",\n",
+    "      \"request_body\": \"${parameters.input}\"\n",
+    "    }\n",
+    "  ],\n",
+    "  \"client_config\": {\n",
+    "    \"max_retry_times\": -1,\n",
+    "    \"max_connection\": 60,\n",
+    "    \"retry_backoff_millis\": 10\n",
+    "  }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "### register model\n",
+    "```json\n",
+    "POST /_plugins/_ml/models/_register?deploy=true\n",
+    "{\n",
+    "  \"name\": \"test\",\n",
+    "  \"function_name\": \"remote\",\n",
+    "  \"version\": \"1.0.0\",\n",
+    "  \"connector_id\": \"{connector id}\",\n",
+    "  \"description\": \"Test connector for Sagemaker model\"\n",
+    "}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1fda443",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}