Commit cd2c8f3

ODSC 39392/triton (#128)

2 parents: a9df825 + b66c02f

4 files changed: +161 additions, -13 deletions

ads/model/deployment/common/utils.py

Lines changed: 5 additions & 10 deletions

@@ -119,19 +119,14 @@ def send_request(
     Returns:
         A JSON representative of a requests.Response object.
     """
-    headers = dict()
     if is_json_payload:
-        headers["Content-Type"] = (
-            header.get("content_type") or DEFAULT_CONTENT_TYPE_JSON
-        )
+        header["Content-Type"] = header.pop("content_type", DEFAULT_CONTENT_TYPE_JSON) or DEFAULT_CONTENT_TYPE_JSON
         request_kwargs = {"json": data}
     else:
-        headers["Content-Type"] = (
-            header.get("content_type") or DEFAULT_CONTENT_TYPE_BYTES
-        )
+        header["Content-Type"] = header.pop("content_type", DEFAULT_CONTENT_TYPE_BYTES) or DEFAULT_CONTENT_TYPE_BYTES
         request_kwargs = {"data": data}  # should pass bytes when using data
-
-    request_kwargs["headers"] = headers
+
+    request_kwargs["headers"] = header

     if dry_run:
         request_kwargs["headers"]["Accept"] = "*/*"
@@ -140,7 +135,7 @@ def send_request(
             return json.loads(req.body)
         return req.body
     else:
-        request_kwargs["auth"] = header.get("signer")
+        request_kwargs["auth"] = header.pop("signer")
         return requests.post(endpoint, **request_kwargs).json()
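The rewritten header handling forwards the caller's `header` dict to `requests` directly, using `dict.pop` so that bookkeeping keys (`content_type`, and `signer` in the non-dry-run branch) are consumed rather than sent as HTTP headers. A minimal, self-contained sketch of the pop-with-fallback pattern; the constant's value is an assumption mirroring the library's default:

    # Sketch only: DEFAULT_CONTENT_TYPE_JSON mirrors the constant in utils.py (assumed value).
    DEFAULT_CONTENT_TYPE_JSON = "application/json"

    def resolve_content_type(header: dict) -> dict:
        # pop() applies its default only when the key is absent;
        # the trailing `or` also catches an explicit None value.
        header["Content-Type"] = (
            header.pop("content_type", DEFAULT_CONTENT_TYPE_JSON) or DEFAULT_CONTENT_TYPE_JSON
        )
        return header

    print(resolve_content_type({}))                            # {'Content-Type': 'application/json'}
    print(resolve_content_type({"content_type": None}))        # {'Content-Type': 'application/json'}
    print(resolve_content_type({"content_type": "text/csv"}))  # {'Content-Type': 'text/csv'}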

ads/model/deployment/model_deployment.py

Lines changed: 21 additions & 3 deletions

@@ -17,7 +17,6 @@
 from ads.common import auth as authutil
 import pandas as pd
 from ads.model.serde.model_input import JsonModelInputSERDE
-from ads.common import auth, oci_client
 from ads.common.oci_logging import (
     LOG_INTERVAL,
     LOG_RECORDS_LIMIT,
@@ -63,6 +62,7 @@

 MODEL_DEPLOYMENT_KIND = "deployment"
 MODEL_DEPLOYMENT_TYPE = "modelDeployment"
+MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON = "TRITON"

 MODEL_DEPLOYMENT_INSTANCE_SHAPE = "VM.Standard.E4.Flex"
 MODEL_DEPLOYMENT_INSTANCE_OCPUS = 1
@@ -828,6 +828,8 @@ def predict(
         data: Any = None,
         serializer: "ads.model.ModelInputSerializer" = model_input_serializer,
         auto_serialize_data: bool = False,
+        model_name: str = None,
+        model_version: str = None,
         **kwargs,
     ) -> dict:
         """Returns prediction of input data run against the model deployment endpoint.
@@ -860,6 +862,10 @@ def predict(
             If `auto_serialize_data=False`, `data` required to be bytes or json serializable
             and `json_input` required to be json serializable. If `auto_serialize_data` set
             to True, data will be serialized before sending to model deployment endpoint.
+        model_name: str
+            Defaults to None. When `inference_server="triton"`, the name of the model to invoke.
+        model_version: str
+            Defaults to None. When `inference_server="triton"`, the version of the model to invoke.
         kwargs:
             content_type: str
                 Used to indicate the media type of the resource.
@@ -878,6 +884,7 @@ def predict(
             "signer": signer,
             "content_type": kwargs.get("content_type", None),
         }
+        header.update(kwargs.pop("headers", {}))

         if data is None and json_input is None:
             raise AttributeError(
@@ -916,9 +923,13 @@ def predict(
                 raise TypeError(
                     "`data` is not bytes or json serializable. Set `auto_serialize_data` to `True` to serialize the input data."
                 )
-
+        if model_name and model_version:
+            header["model-name"] = model_name
+            header["model-version"] = model_version
+        elif bool(model_version) ^ bool(model_name):
+            raise ValueError("`model_name` and `model_version` have to be provided together.")
         prediction = send_request(
-            data=data, endpoint=endpoint, is_json_payload=is_json_payload, header=header
+            data=data, endpoint=endpoint, is_json_payload=is_json_payload, header=header,
         )
         return prediction
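For Triton-backed deployments, `model_name` and `model_version` are forwarded as the `model-name` and `model-version` request headers; supplying only one of them trips the XOR check and raises `ValueError`. A hedged usage sketch, where the OCID, payload, and model identifiers are all placeholders:

    from ads.model.deployment import ModelDeployment

    deployment = ModelDeployment.from_id("<model_deployment_ocid>")  # placeholder OCID
    prediction = deployment.predict(
        data=b"...",            # placeholder bytes payload
        model_name="my_model",  # hypothetical Triton model name
        model_version="1",      # must accompany model_name, and vice versa
    )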

@@ -1390,6 +1401,10 @@ def _update_from_oci_model(self, oci_model_instance) -> "ModelDeployment":
             infrastructure.CONST_WEB_CONCURRENCY,
             runtime.env.get("WEB_CONCURRENCY", None),
         )
+        if runtime.env.get("CONTAINER_TYPE", None) == MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON:
+            runtime.set_spec(
+                runtime.CONST_INFERENCE_SERVER, MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON.lower()
+            )

         self.set_spec(self.CONST_INFRASTRUCTURE, infrastructure)
         self.set_spec(self.CONST_RUNTIME, runtime)
@@ -1566,6 +1581,9 @@ def _build_model_deployment_configuration_details(self) -> Dict:
                 infrastructure.web_concurrency
             )
             runtime.set_spec(runtime.CONST_ENV, environment_variables)
+        if hasattr(runtime, "inference_server") and runtime.inference_server and runtime.inference_server.upper() == MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON:
+            environment_variables["CONTAINER_TYPE"] = MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
+            runtime.set_spec(runtime.CONST_ENV, environment_variables)
         environment_configuration_details = {
             runtime.CONST_ENVIRONMENT_CONFIG_TYPE: runtime.environment_config_type,
             runtime.CONST_ENVIRONMENT_VARIABLES: runtime.env,
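The Triton choice round-trips through an environment variable: when the runtime's `inference_server` is "triton", building the configuration injects `CONTAINER_TYPE=TRITON` into the runtime env, and `_update_from_oci_model` restores the runtime spec from that same variable when a deployment is loaded back from OCI. A dict-based sketch of the round-trip; this is standalone illustration, not the real runtime object:

    MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON = "TRITON"

    def build_env(spec: dict) -> dict:
        # Serialize: runtime spec -> env vars sent to the service.
        env = dict(spec.get("env", {}))
        if (spec.get("inferenceServer") or "").upper() == MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON:
            env["CONTAINER_TYPE"] = MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
        return env

    def restore_spec(env: dict) -> dict:
        # Deserialize: env vars read back from OCI -> runtime spec.
        spec = {"env": dict(env)}
        if env.get("CONTAINER_TYPE") == MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON:
            spec["inferenceServer"] = MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON.lower()
        return spec

    assert restore_spec(build_env({"inferenceServer": "triton"}))["inferenceServer"] == "triton"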

ads/model/deployment/model_deployment_runtime.py

Lines changed: 56 additions & 0 deletions

@@ -330,6 +330,7 @@ class ModelDeploymentContainerRuntime(ModelDeploymentRuntime):
     CONST_ENTRYPOINT = "entrypoint"
     CONST_SERVER_PORT = "serverPort"
     CONST_HEALTH_CHECK_PORT = "healthCheckPort"
+    CONST_INFERENCE_SERVER = "inferenceServer"

     attribute_map = {
         **ModelDeploymentRuntime.attribute_map,
@@ -339,6 +340,7 @@ class ModelDeploymentContainerRuntime(ModelDeploymentRuntime):
         CONST_ENTRYPOINT: "entrypoint",
         CONST_SERVER_PORT: "server_port",
         CONST_HEALTH_CHECK_PORT: "health_check_port",
+        CONST_INFERENCE_SERVER: "inference_server"
     }

     payload_attribute_map = {
@@ -532,3 +534,57 @@ def with_health_check_port(
         The ModelDeploymentContainerRuntime instance (self).
         """
         return self.set_spec(self.CONST_HEALTH_CHECK_PORT, health_check_port)
+
+    @property
+    def inference_server(self) -> str:
+        """Returns the inference server.
+
+        Returns
+        -------
+        str
+            The inference server.
+        """
+        return self.get_spec(self.CONST_INFERENCE_SERVER, None)
+
+    def with_inference_server(self, inference_server: str = "triton") -> "ModelDeploymentRuntime":
+        """Sets the inference server. Currently the only supported inference server is "triton".
+        Note that if you are using BYOC, you do not need to set the inference server.
+
+        Parameters
+        ----------
+        inference_server: str
+            The inference server to set.
+
+        Returns
+        -------
+        ModelDeploymentRuntime
+            The ModelDeploymentRuntime instance (self).
+
+        Example
+        -------
+        >>> from ads.model.deployment import ModelDeployment, ModelDeploymentContainerRuntime, ModelDeploymentInfrastructure
+        >>> import ads
+        >>> ads.set_auth("resource_principal")
+        >>> infrastructure = ModelDeploymentInfrastructure()\
+        ...     .with_project_id(<project_id>)\
+        ...     .with_compartment_id(<compartment_id>)\
+        ...     .with_shape_name("VM.Standard.E4.Flex")\
+        ...     .with_replica(2)\
+        ...     .with_bandwidth_mbps(10)\
+        ...     .with_access_log(log_group_id=<deployment_log_group_id>, log_id=<deployment_access_log_id>)\
+        ...     .with_predict_log(log_group_id=<deployment_log_group_id>, log_id=<deployment_predict_log_id>)
+        >>> runtime = ModelDeploymentContainerRuntime()\
+        ...     .with_image(<container_image>)\
+        ...     .with_server_port(<server_port>)\
+        ...     .with_health_check_port(<health_check_port>)\
+        ...     .with_model_uri(<model_id>)\
+        ...     .with_env({"key":"value", "key2":"value2"})\
+        ...     .with_inference_server("triton")
+        >>> deployment = ModelDeployment()\
+        ...     .with_display_name("Triton Example")\
+        ...     .with_infrastructure(infrastructure)\
+        ...     .with_runtime(runtime)
+        >>> deployment.deploy()
+        """
+        return self.set_spec(self.CONST_INFERENCE_SERVER, inference_server.lower())
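Because `with_inference_server` lower-cases its argument before storing it under the `inferenceServer` spec key, the `inference_server` getter always returns the normalized value. A quick round-trip check, assuming a local ADS install that includes this commit:

    from ads.model.deployment import ModelDeploymentContainerRuntime

    runtime = ModelDeploymentContainerRuntime().with_inference_server("TRITON")
    assert runtime.inference_server == "triton"  # stored lower-cased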

tests/unitary/default_setup/model_deployment/test_model_deployment_v2.py

Lines changed: 79 additions & 0 deletions

@@ -308,6 +308,62 @@ def initialize_model_deployment_from_spec(self):
                 "runtime": runtime,
             }
         )
+
+    def initialize_model_deployment_triton_builder(self):
+        infrastructure = ModelDeploymentInfrastructure()\
+            .with_compartment_id("fakeid.compartment.oc1..xxx")\
+            .with_project_id("fakeid.datascienceproject.oc1.iad.xxx")\
+            .with_shape_name("VM.Standard.E4.Flex")\
+            .with_replica(2)\
+            .with_bandwidth_mbps(10)
+
+        runtime = ModelDeploymentContainerRuntime()\
+            .with_image("fake_image")\
+            .with_server_port(5000)\
+            .with_health_check_port(5000)\
+            .with_model_uri("fake_model_id")\
+            .with_env({"key":"value", "key2":"value2"})\
+            .with_inference_server("triton")
+
+        deployment = ModelDeployment()\
+            .with_display_name("triton case")\
+            .with_infrastructure(infrastructure)\
+            .with_runtime(runtime)
+        return deployment
+
+    def initialize_model_deployment_triton_yaml(self):
+        yaml_string = """
+        kind: deployment
+        spec:
+          displayName: triton
+          infrastructure:
+            kind: infrastructure
+            spec:
+              bandwidthMbps: 10
+              compartmentId: fake_compartment_id
+              deploymentType: SINGLE_MODEL
+              policyType: FIXED_SIZE
+              replica: 2
+              shapeConfigDetails:
+                memoryInGBs: 16.0
+                ocpus: 1.0
+              shapeName: VM.Standard.E4.Flex
+            type: datascienceModelDeployment
+          runtime:
+            kind: runtime
+            spec:
+              env:
+                key: value
+                key2: value2
+              inference_server: triton
+              healthCheckPort: 8000
+              image: fake_image
+              modelUri: fake_model_id
+              serverPort: 8000
+            type: container
+        """
+        deployment_from_yaml = ModelDeployment.from_yaml(yaml_string)
+        return deployment_from_yaml

     def initialize_model_deployment_from_kwargs(self):
         infrastructure = (
@@ -435,11 +491,34 @@ def test_initialize_model_deployment_with_error(self):
             },
         )

+
     def test_initialize_model_deployment_with_spec_kwargs(self):
         model_deployment_kwargs = self.initialize_model_deployment_from_kwargs()
         model_deployment_builder = self.initialize_model_deployment()

         assert model_deployment_kwargs.to_dict() == model_deployment_builder.to_dict()
+
+    def test_initialize_model_deployment_triton_builder(self):
+        temp_model_deployment = self.initialize_model_deployment_triton_builder()
+        assert isinstance(
+            temp_model_deployment.runtime, ModelDeploymentContainerRuntime
+        )
+        assert isinstance(
+            temp_model_deployment.infrastructure, ModelDeploymentInfrastructure
+        )
+        assert temp_model_deployment.runtime.inference_server == "triton"
+
+    def test_initialize_model_deployment_triton_yaml(self):
+        temp_model_deployment = self.initialize_model_deployment_triton_yaml()
+        assert isinstance(
+            temp_model_deployment.runtime, ModelDeploymentContainerRuntime
+        )
+        assert isinstance(
+            temp_model_deployment.infrastructure, ModelDeploymentInfrastructure
+        )
+        assert temp_model_deployment.runtime.inference_server == "triton"
+

     def test_model_deployment_to_dict(self):
         model_deployment = self.initialize_model_deployment()
