diff --git a/docs/.values-table.md b/docs/.values-table.md index 0847bf8..1116c8c 100644 --- a/docs/.values-table.md +++ b/docs/.values-table.md @@ -34,9 +34,12 @@ | envoy.rate_limiter.listener_level.max_tokens | int | `5` | Maximum number of simultaneous connections to the Envoy Proxy. Each new connection takes a "token" from the "bucket" which initially contains ``max_tokens`` tokens. | | envoy.rate_limiter.listener_level.tokens_per_fill | int | `1` | ``tokens_per_fill`` tokens are added to the "bucket" every ``fill_interval``, allowing new connections to be established. | | envoy.rate_limiter.listener_level.fill_interval | string | `"12s"` | For example, adding a new token every 12 seconds allows 5 new connections every minute. | -| envoy.rate_limiter.prometheus_based | object | `{"enabled":false,"luaConfig":"cfg/envoy-filter.lua"}` | This rate limiter rejects new connections based on metric extracted from Prometheus (e.g. inference queue latency). The metric is taken from parameter ``prometheus.serverLoadMetric``, and the threshold is set by ``prometheus.serverLoadThreshold``. These parameters are the same as those used by the KEDA autoscaler. | +| envoy.rate_limiter.prometheus_based | object | `{"enabled":false}` | This rate limiter rejects new connections based on metric extracted from Prometheus (e.g. inference queue latency). The metric is taken from parameter ``prometheus.serverLoadMetric``, and the threshold is set by ``prometheus.serverLoadThreshold``. These parameters are the same as those used by the KEDA autoscaler. | | envoy.rate_limiter.prometheus_based.enabled | bool | `false` | Enable rate limiter | | envoy.loadBalancerPolicy | string | `"LEAST_REQUEST"` | Envoy load balancer policy. 
Options: ROUND_ROBIN, LEAST_REQUEST, RING_HASH, RANDOM, MAGLEV | +| envoy.lua_filter.enabled | bool | `false` | | +| envoy.lua_filter.lua_config | string | `"cfg/envoy-filter.lua"` | | +| envoy.dynamic_routing | object | `{"enabled":false}` | Enable dynamic routing in Envoy proxy. | | envoy.auth.enabled | bool | `false` | Enable authentication in Envoy proxy | | envoy.auth.jwt_issuer | string | `""` | | | envoy.auth.jwt_remote_jwks_uri | string | `""` | |
diff --git a/helm/supersonic/cfg/envoy-filter-dynamic.lua b/helm/supersonic/cfg/envoy-filter-dynamic.lua
new file mode 100644
index 0000000..0740052
--- /dev/null
+++ b/helm/supersonic/cfg/envoy-filter-dynamic.lua
@@ -0,0 +1,163 @@
+-- Envoy Lua HTTP filter used when dynamic routing is enabled.
+--
+-- For every gRPC request it sets the "route-to" header, which the Envoy route
+-- passes to the dynamic_forward_proxy filter as host_rewrite_header:
+--   * ModelInfer calls are sent to a per-model service whose host name is
+--     built from the model name and version found in the request message;
+--   * all other gRPC calls are sent to the default Triton service.
+--
+-- The release-name and namespace placeholder tokens inside the host strings
+-- are substituted by the Helm chart when this file is rendered.
+
+local ROUTE_HEADER = "route-to"
+local MODEL_INFER_PATH = "/inference.GRPCInferenceService/ModelInfer"
+local DEFAULT_ROUTE = "RELEASE-triton.NAMESPACE.svc.cluster.local:8001"
+
+-- gRPC message prefix: 1-byte compressed flag + 4-byte message length.
+local GRPC_PREFIX_LEN = 5
+-- Protobuf tags are (field_number * 8 + wire_type); wire type 2 is length-delimited.
+local TAG_MODEL_NAME = 0x0A    -- field 1, wire type 2
+local TAG_MODEL_VERSION = 0x12 -- field 2, wire type 2
+-- Both fields sit at the very start of the message, so only a short prefix of
+-- the (possibly large) inference payload is copied into a Lua string.
+local MAX_SCAN_BYTES = 1024
+
+--- True for "application/grpc" and its "+<format>" variants (e.g. "+proto").
+local function is_grpc(content_type)
+  if not content_type then
+    return false
+  end
+  return content_type == "application/grpc"
+    or content_type:sub(1, 17) == "application/grpc+"
+end
+
+--- True when `s` is non-empty and only holds characters that cannot break out
+-- of the host name it is embedded in (letters, digits, "-", "_", ".").
+local function is_safe_label(s)
+  return s:find("^[%w%-_%.]+$") ~= nil
+end
+
+--- Decode a protobuf base-128 varint starting at `pos`.
+-- @return the value and the position after it, or nil if `data` ends early.
+local function read_varint(data, pos)
+  local value, scale = 0, 1
+  -- Five bytes encode up to 2^35 - 1, far beyond any realistic field length.
+  for _ = 1, 5 do
+    local b = data:byte(pos)
+    if not b then
+      return nil
+    end
+    pos = pos + 1
+    if b < 128 then
+      return value + b * scale, pos
+    end
+    value = value + (b - 128) * scale
+    scale = scale * 128
+  end
+  return nil
+end
+
+--- Read a length-delimited payload whose length varint starts at `pos`.
+-- @return the payload and the position after it, or nil if it is truncated.
+local function read_length_delimited(data, pos)
+  local len, start = read_varint(data, pos)
+  if not len or start + len - 1 > #data then
+    return nil
+  end
+  return data:sub(start, start + len - 1), start + len
+end
+
+--- Extract model_name and model_version from a buffered ModelInfer request.
+--
+-- Only the beginning of the message is inspected: model_name (field 1) is
+-- expected first, optionally followed directly by model_version (field 2).
+-- NOTE(review): protobuf does not guarantee field order on the wire; a request
+-- serialized differently is reported as "not found" - confirm with clients.
+--
+-- @param request_handle Envoy stream handle of the current request.
+-- @return model_name, model_version; each is "" when it could not be read.
+function extract_model_name_and_version(request_handle)
+  local model_name, model_version = "", ""
+
+  -- body() can return nil when the request has no body.
+  local buffer = request_handle:body()
+  if not buffer then
+    return model_name, model_version
+  end
+
+  local body = buffer:getBytes(0, math.min(buffer:length(), MAX_SCAN_BYTES))
+  if not body or #body <= GRPC_PREFIX_LEN then
+    return model_name, model_version
+  end
+
+  -- A non-zero flag byte marks a compressed message, which is not parsed here.
+  if body:byte(1) ~= 0 then
+    request_handle:logWarn("Compressed gRPC message, cannot extract model name")
+    return model_name, model_version
+  end
+
+  local msg = body:sub(GRPC_PREFIX_LEN + 1)
+  if msg:byte(1) ~= TAG_MODEL_NAME then
+    request_handle:logErr("Unexpected protobuf tag: " .. string.format("0x%02X", msg:byte(1)))
+    return model_name, model_version
+  end
+
+  local name, offset = read_length_delimited(msg, 2)
+  if not name then
+    request_handle:logErr("model_name field is truncated or too long")
+    return model_name, model_version
+  end
+  model_name = name
+
+  -- `tag` is nil when the scanned data ends right after model_name; it must
+  -- not reach string.format("%02X"), which raises on nil.
+  local tag = msg:byte(offset)
+  if tag == TAG_MODEL_VERSION then
+    local version = read_length_delimited(msg, offset + 1)
+    if version then
+      model_version = version
+    else
+      request_handle:logWarn("model_version field is truncated or too long")
+    end
+  else
+    local found = tag and string.format("0x%02X", tag) or "end of data"
+    request_handle:logWarn(string.format(
+      "No model_version field (expected tag 0x12 at offset %d, got %s)", offset, found))
+  end
+
+  return model_name, model_version
+end
+
+--- Envoy entry point, invoked for each request that reaches this filter.
+-- @param request_handle Envoy stream handle of the current request.
+function envoy_on_request(request_handle)
+  local path = request_handle:headers():get(":path")
+  local content_type = request_handle:headers():get("content-type")
+
+  -- The routing header selects the upstream host, so a value supplied by the
+  -- client must never survive: drop it here and set it only from this script.
+  request_handle:headers():remove(ROUTE_HEADER)
+
+  if not is_grpc(content_type) then
+    return
+  end
+
+  -- Non-inference calls, and inference calls whose target cannot be
+  -- determined, are forwarded to the default service.
+  local target = DEFAULT_ROUTE
+
+  if path == MODEL_INFER_PATH then
+    local model_name, model_version = extract_model_name_and_version(request_handle)
+    -- Empty strings are truthy in Lua, so a plain `if model_name` check would
+    -- always pass; validate the extracted values explicitly instead.
+    if is_safe_label(model_name) and is_safe_label(model_version) then
+      target = "RELEASE-" .. model_name .. "-v" .. model_version
+        .. ".NAMESPACE.svc.cluster.local:8001"
+      request_handle:logInfo("route-to = " .. target)
+    else
+      request_handle:logWarn("Model name/version missing or invalid, routing to default service")
+    end
+  end
+
+  request_handle:headers():replace(ROUTE_HEADER, target)
+end
\ No newline at end of file
diff --git a/helm/supersonic/templates/envoy/configmaps.yaml b/helm/supersonic/templates/envoy/configmaps.yaml index a96eaec..dd2f187 100644 --- a/helm/supersonic/templates/envoy/configmaps.yaml +++ b/helm/supersonic/templates/envoy/configmaps.yaml @@ -57,11 +57,22 @@ static_resources: routes: - match: prefix: "/" + {{- if .envoy.dynamic_routing.enabled }} + typed_per_filter_config: + envoy.filters.http.dynamic_forward_proxy: + "@type": type.googleapis.com/envoy.extensions.filters.http.dynamic_forward_proxy.v3.PerRouteConfig + host_rewrite_header: "route-to" + route: + cluster: dynamic_forward_proxy_cluster + timeout: {{ .envoy.grpc_route_timeout }} + {{- else }} route: cluster: triton_grpc_service timeout: {{ .envoy.grpc_route_timeout }} + {{- end }} + http_filters: - {{- with .envoy.rate_limiter.prometheus_based }} + {{- with .envoy.lua_filter }} {{- if .enabled }} - name: envoy.filters.http.lua typed_config: @@ -94,6 +105,16 @@ static_resources: provider_name: provider_icecube {{- end }} {{- end }} + {{- if .envoy.dynamic_routing.enabled }} + - name: envoy.filters.http.dynamic_forward_proxy + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.dynamic_forward_proxy.v3.FilterConfig + dns_cache_config: + name: dynamic_cache + dns_lookup_family: ALL + dns_cache_circuit_breaker: + max_pending_requests: 1024 + {{- end }} - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router @@ -176,6 +197,22 @@ static_resources: socket_address: address: {{ .tritonName }} port_value: {{ .tritonGrpcPort }} + {{- if .envoy.dynamic_routing.enabled }} + - name: dynamic_forward_proxy_cluster + connect_timeout: 2s + lb_policy: CLUSTER_PROVIDED + http2_protocol_options: + max_concurrent_streams: 1000 + cluster_type: + name: envoy.clusters.dynamic_forward_proxy + typed_config: +
"@type": type.googleapis.com/envoy.extensions.clusters.dynamic_forward_proxy.v3.ClusterConfig + dns_cache_config: + name: dynamic_cache + dns_lookup_family: ALL + dns_cache_circuit_breaker: + max_pending_requests: 1024 + {{- end }} {{- end }} {{- end }} @@ -226,7 +263,7 @@ data: {{ include "envoy.configuration.yaml" $envoyContext | indent 4 }} --- -{{- if .Values.envoy.rate_limiter.prometheus_based.enabled }} +{{- if .Values.envoy.lua_filter.enabled }} {{- /* Create a ConfigMap for the Lua filter */}} apiVersion: v1 kind: ConfigMap @@ -239,12 +276,14 @@ metadata: data: envoy-filter.lua: |- {{- /* Read and process the Lua configuration file */}} - {{- $luaConfig := $.Files.Get .Values.envoy.rate_limiter.prometheus_based.luaConfig | nindent 4 }} + {{- $luaConfig := $.Files.Get .Values.envoy.lua_filter.lua_config | nindent 4 }} {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_METRIC" (include "supersonic.defaultMetric" . | quote) }} {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.serverLoadThreshold) }} {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" (include "supersonic.prometheusScheme" .) }} {{- $luaConfig = $luaConfig | replace "PROMETHEUS_HOST" (include "supersonic.prometheusHost" .) }} {{- $luaConfig = $luaConfig | replace "PROMETHEUS_PORT" (include "supersonic.prometheusPort" .) }} + {{- $luaConfig = $luaConfig | replace "RELEASE" .Release.Name }} + {{- $luaConfig = $luaConfig | replace "NAMESPACE" .Release.Namespace }} {{ $luaConfig | indent 4 }} --- diff --git a/helm/supersonic/templates/envoy/deployment.yaml b/helm/supersonic/templates/envoy/deployment.yaml index 7b2666c..5a48944 100644 --- a/helm/supersonic/templates/envoy/deployment.yaml +++ b/helm/supersonic/templates/envoy/deployment.yaml @@ -47,7 +47,7 @@ spec: volumeMounts: - name: {{ include "supersonic.name" . 
}}-envoy-config mountPath: /etc/envoy - {{- if .Values.envoy.rate_limiter.prometheus_based.enabled }} + {{- if .Values.envoy.lua_filter.enabled }} - name: {{ include "supersonic.name" . }}-lua-volume mountPath: /etc/envoy/lua readOnly: true @@ -58,7 +58,7 @@ spec: - name: {{ include "supersonic.name" . }}-envoy-config configMap: name: {{ include "supersonic.name" . }}-envoy-config - {{- if .Values.envoy.rate_limiter.prometheus_based.enabled }} + {{- if .Values.envoy.lua_filter.enabled }} - name: {{ include "supersonic.name" . }}-lua-volume configMap: name: {{ include "supersonic.name" . }}-lua-config diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json index 4ad91bd..fb3d1c7 100644 --- a/helm/supersonic/values.schema.json +++ b/helm/supersonic/values.schema.json @@ -370,14 +370,10 @@ "properties": { "enabled": { "type": "boolean" - }, - "luaConfig": { - "type": "string" } }, "required": [ - "enabled", - "luaConfig" + "enabled" ] } }, @@ -389,6 +385,32 @@ "loadBalancerPolicy": { "type": "string" }, + "lua_filter": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "lua_config": { + "type": "string" + } + }, + "required": [ + "enabled", + "lua_config" + ] + }, + "dynamic_routing": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + }, "auth": { "type": "object", "properties": { @@ -424,11 +446,13 @@ "required": [ "args", "auth", + "dynamic_routing", "enabled", "grpc_route_timeout", "image", "ingress", "loadBalancerPolicy", + "lua_filter", "rate_limiter", "replicas", "resources", diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml index c261f19..042a9f1 100644 --- a/helm/supersonic/values.yaml +++ b/helm/supersonic/values.yaml @@ -156,12 +156,19 @@ envoy: prometheus_based: # -- Enable rate limiter enabled: false - luaConfig: "cfg/envoy-filter.lua" # -- Envoy load balancer policy. 
# Options: ROUND_ROBIN, LEAST_REQUEST, RING_HASH, RANDOM, MAGLEV loadBalancerPolicy: "LEAST_REQUEST" + lua_filter: + enabled: false + lua_config: "cfg/envoy-filter.lua" + + # -- Enable dynamic routing in Envoy proxy. + dynamic_routing: + enabled: false + auth: # -- Enable authentication in Envoy proxy enabled: false diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 453ae6e..4e87ac0 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -11,10 +11,12 @@ triton: --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \ + --model-control-mode=explicit \ --allow-gpu-metrics=true \ --log-verbose=0 \ --strict-model-config=false \ --exit-timeout-secs=60 + resources: limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} @@ -37,6 +39,14 @@ envoy: enabled: true hostName: sonic-cms.geddes.rcac.purdue.edu ingressClassName: public + rate_limiter: + prometheus_based: + enabled: false + dynamic_routing: + enabled: true + lua_filter: + enabled: true + lua_config: "cfg/envoy-filter-dynamic.lua" autoscaler: enabled: true @@ -55,15 +65,20 @@ tolerations: effect: NoSchedule prometheus: - enabled: true - server: - ingress: - enabled: true - hostName: prometheus-cms.geddes.rcac.purdue.edu - ingressClassName: public + # enabled: false + external: + enabled: true + url: "prometheus-cms.geddes.rcac.purdue.edu" + port: 443 + scheme: https + # server: + # ingress: + # enabled: true + # hostName: prometheus-cms.geddes.rcac.purdue.edu + # ingressClassName: public grafana: - enabled: 
true + enabled: false ingress: enabled: true hostName: grafana-cms.geddes.rcac.purdue.edu