From 228be136182837ffd65e15d3e1831ecc7eb7201b Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 15 Oct 2025 17:05:25 +0530 Subject: [PATCH 1/2] feat: add support for debug tracer for sending otel data on debug session --- apisix/debug_tracer.lua | 238 +++++++++++++++++++++++++++++++ apisix/init.lua | 26 +++- apisix/plugins/opentelemetry.lua | 67 +++++++-- 3 files changed, 319 insertions(+), 12 deletions(-) create mode 100644 apisix/debug_tracer.lua diff --git a/apisix/debug_tracer.lua b/apisix/debug_tracer.lua new file mode 100644 index 000000000000..cce1a8cb0df5 --- /dev/null +++ b/apisix/debug_tracer.lua @@ -0,0 +1,238 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- +local core = require("apisix.core") +local process = require("ngx.process") + +local always_on_sampler_new = require("opentelemetry.trace.sampling.always_on_sampler").new +local exporter_client_new = require("opentelemetry.trace.exporter.http_client").new +local otlp_exporter_new = require("opentelemetry.trace.exporter.otlp").new +local batch_span_processor_new = require("opentelemetry.trace.batch_span_processor").new +local tracer_provider_new = require("opentelemetry.trace.tracer_provider").new +local resource_new = require("opentelemetry.resource").new +local attr = require("opentelemetry.attribute") +local span_kind = require("opentelemetry.trace.span_kind") + +local context = require("opentelemetry.context").new() + +local _M = { version = 0.1 } + +local hostname + +function _M.init() + if process.type() ~= "worker" then + return + end + hostname = core.utils.gethostname() +end + + + +local DebugTracerProvider = {} +DebugTracerProvider.__index = DebugTracerProvider + +function DebugTracerProvider.new(collector_config, resource_attrs) + local self = setmetatable({ + collector_config = collector_config or { + address = "127.0.0.1:4318", + request_timeout = 3, + request_headers = {} + }, + resource_attrs = resource_attrs or {}, + spans = {}, -- Buffered spans for this tracer instance + is_reporting = false + }, DebugTracerProvider) + + return self +end + +function DebugTracerProvider:start_span(span_name, options) + local span_id = core.utils.uuid() + local trace_id = core.utils.uuid() + local start_time = ngx.now() * 1000000000 -- Convert to nanoseconds + + local span = { + id = span_id, + trace_id = trace_id, + name = span_name, + start_time = start_time, + end_time = nil, + kind = options and options.kind or span_kind.internal, + attributes = options and options.attributes or {}, + parent_span_id = options and options.parent_span_id, + status = nil, + events = {} + } + + -- Store in buffered spans + self.spans[span_id] = span + + return { + span_id = span_id, + trace_id = trace_id, + name = span_name, + context = context + } +end + +function DebugTracerProvider:finish_span(span_token, end_time) + local span = self.spans[span_token.span_id] + if span then + span.end_time = end_time or (ngx.now() * 1000000000) + end + return span +end + +function DebugTracerProvider:add_event(span_token, event_name, attributes) + local span = self.spans[span_token.span_id] + if span then + table.insert(span.events, { + name = event_name, + time = ngx.now() * 1000000000, + attributes = attributes or {} + }) + end +end + +function DebugTracerProvider:set_attributes(span_token, attributes) + local span = self.spans[span_token.span_id] + if span then + for k, v in pairs(attributes) do + span.attributes[k] = v + end + end +end + +function DebugTracerProvider:set_status(span_token, status, description) + local span = self.spans[span_token.span_id] + if span then + span.status = { + code = status, + description = description + } + end +end + +function DebugTracerProvider:report_trace(debug_session_id) + if self.is_reporting then + core.log.warn("Debug tracer is already in reporting mode") + return + end + + self.is_reporting = true + + -- Create real OpenTelemetry tracer + local real_tracer = self:_create_real_tracer(debug_session_id) + + -- Convert all buffered spans to real spans + for span_id, buffered_span in pairs(self.spans) do + if buffered_span.end_time then + self:_convert_to_real_span(real_tracer, buffered_span) + end + end + + -- Force flush + real_tracer.provider:force_flush() + + core.log.info("Debug trace reported for session: ", debug_session_id) +end + +function DebugTracerProvider:_create_real_tracer(debug_session_id) + -- Build resource attributes + local resource_attrs = { attr.string("hostname", hostname) } + + -- Add service name if not provided + if not self.resource_attrs["service.name"] then + table.insert(resource_attrs, attr.string("service.name", "APISIX-Debug")) + end + + -- Add debug session ID + table.insert(resource_attrs, attr.string("debug.session.id", debug_session_id)) + + -- Add custom resource attributes + for k, v in pairs(self.resource_attrs) do + if type(v) == "string" then + table.insert(resource_attrs, attr.string(k, v)) + elseif type(v) == "number" then + table.insert(resource_attrs, attr.double(k, v)) + elseif type(v) == "boolean" then + table.insert(resource_attrs, attr.bool(k, v)) + end + end + + -- Create real tracer + local exporter = otlp_exporter_new( + exporter_client_new( + self.collector_config.address, + self.collector_config.request_timeout, + self.collector_config.request_headers + ) + ) + + local batch_span_processor = batch_span_processor_new( + exporter, + self.collector_config.batch_span_processor or {} + ) + + local sampler = always_on_sampler_new() -- Always sample debug traces + + local tp = tracer_provider_new(batch_span_processor, { + resource = resource_new(unpack(resource_attrs)), + sampler = sampler, + }) + + return tp:tracer("apisix-debug-tracer") +end + +function DebugTracerProvider:_convert_to_real_span(real_tracer, buffered_span) + -- Start span with original timing + local span_ctx = real_tracer:start(buffered_span.name, { + kind = buffered_span.kind, + attributes = buffered_span.attributes, + start_time = buffered_span.start_time + }) + + local span = span_ctx:span() + + -- Add events + for _, event in ipairs(buffered_span.events) do + -- Note: OpenTelemetry Lua might not have direct event API + -- We can add as attributes instead + span:set_attributes(event.attributes) + end + + -- Set status + if buffered_span.status then + span:set_status(buffered_span.status.code, buffered_span.status.description) + end + + -- Finish with original end time + span:finish(buffered_span.end_time) +end + +function DebugTracerProvider:get_buffered_spans_count() + local count = 0 + for _ in pairs(self.spans) do + count = count + 1 + end + return count +end + +function _M.create_tracer_provider(collector_config, resource_attrs) + return DebugTracerProvider.new(collector_config, resource_attrs) +end + +return _M diff --git a/apisix/init.lua b/apisix/init.lua index 430572e27e48..8ff0c8821b80 100644 --- a/apisix/init.lua +++ b/apisix/init.lua @@ -47,6 +47,8 @@ local debug = require("apisix.debug") local pubsub_kafka = require("apisix.pubsub.kafka") local resource = require("apisix.resource") local trusted_addresses_util = require("apisix.utils.trusted-addresses") +local span_kind = require("opentelemetry.trace.span_kind") +local debug_tracer = require("apisix.debug_tracer") local ngx = ngx local get_method = ngx.req.get_method local ngx_exit = ngx.exit @@ -203,8 +205,30 @@ function _M.ssl_client_hello_phase() local api_ctx = core.tablepool.fetch("api_ctx", 0, 32) ngx_ctx.api_ctx = api_ctx + -- if shared_dict:get("active_debug_sessions") then (Make it conditional?) + local tracer_provider = debug_tracer.create_tracer_provider() + api_ctx.debug_tracer = tracer_provider + local tls_span = tracer_provider:start_span("TLS Handshake", { + kind = span_kind.server, + attributes = { + tls_sni = sni, + tls_ext_status_request = tls_ext_status_req ~= nil + } + }) + api_ctx.tls_span = tls_span + -- end local ok, err = router.router_ssl.match_and_set(api_ctx, true, sni) - + if api_ctx.debug_tracer and api_ctx.matched_ssl then + local route_span = api_ctx.debug_tracer:start_span("SSL Route Matching", { + attributes = { + matched_sni = sni, + ssl_cert_found = api_ctx.matched_ssl ~= nil, + route_id = api_ctx.matched_ssl and api_ctx.matched_ssl.value and + api_ctx.matched_ssl.value.id or "none" + } + }) + api_ctx.route_span = route_span + end ngx_ctx.matched_ssl = api_ctx.matched_ssl core.tablepool.release("api_ctx", api_ctx) ngx_ctx.api_ctx = nil diff --git a/apisix/plugins/opentelemetry.lua b/apisix/plugins/opentelemetry.lua index d98ac44ae69d..b5974b8a8b19 100644 --- a/apisix/plugins/opentelemetry.lua +++ b/apisix/plugins/opentelemetry.lua @@ -18,6 +18,7 @@ local plugin_name = "opentelemetry" local core = require("apisix.core") local plugin = require("apisix.plugin") local process = require("ngx.process") +local debug_tracer = require("apisix.debug_tracer") local always_off_sampler_new = require("opentelemetry.trace.sampling.always_off_sampler").new local always_on_sampler_new = require("opentelemetry.trace.sampling.always_on_sampler").new @@ -305,6 +306,28 @@ local function inject_attributes(attributes, wanted_attributes, source, with_pre end end +function _M.access(conf, api_ctx) + -- Check if this request matches any debug session + local debug_session = check_debug_session_conditions(api_ctx) + if debug_session and api_ctx.debug_tracer then + api_ctx.active_debug_session = debug_session + + -- We've decided to report this trace, finish spans and report + if api_ctx.tls_span then + api_ctx.debug_tracer:finish_span(api_ctx.tls_span) + end + if api_ctx.route_span then + api_ctx.debug_tracer:finish_span(api_ctx.route_span) + end + if api_ctx.debug_main_span then + api_ctx.debug_tracer:finish_span(api_ctx.debug_main_span) + end + + -- Report the trace + api_ctx.debug_tracer:report_trace(debug_session.id) + api_ctx.debug_tracer = nil -- Clean up + end +end function _M.rewrite(conf, api_ctx) local metadata = plugin.plugin_metadata(plugin_name) @@ -380,6 +403,23 @@ function _M.rewrite(conf, api_ctx) -- inject trace context into the headers of upstream HTTP request trace_context_propagator:inject(ctx, ngx.req) + -- implement should_enable_debug_tracing + if should_enable_debug_tracing(api_ctx) then + local metadata = plugin.plugin_metadata(plugin_name) + if metadata then + local plugin_info = metadata.value + api_ctx.debug_tracer = debug_tracer.create_tracer_provider( + plugin_info.collector, + plugin_info.resource + ) + -- Start main request span in debug tracer + local debug_span = api_ctx.debug_tracer:start_span(span_name, { + kind = span_kind.server, + attributes = attributes + }) + api_ctx.debug_main_span = debug_span + end + end end @@ -407,18 +447,23 @@ end -- body_filter maybe not called because of empty http body response -- so we need to check if the span has finished in log phase function _M.log(conf, api_ctx) - if api_ctx.otel_context_token then - -- ctx:detach() is not necessary, because of ctx is stored in ngx.ctx - local upstream_status = core.response.get_upstream_status(api_ctx) - - -- get span from current context - local span = context:current():span() - if upstream_status and upstream_status >= 500 then - span:set_status(span_status.ERROR, - "upstream response status: " .. upstream_status) + if not api_ctx.active_debug_session then + -- implement check_response_debug_session + local debug_session = check_response_debug_session(api_ctx) + if debug_session and api_ctx.debug_tracer then + -- Finish all spans and report + for span_id, span in pairs(api_ctx.debug_tracer.spans) do + if not span.end_time then + api_ctx.debug_tracer:finish_span({span_id = span_id}) + end + end + api_ctx.debug_tracer:report_trace(debug_session.id) end - - span:finish() + end + + -- Cleanup: if debug tracer exists but no session matched, discard + if api_ctx.debug_tracer and not api_ctx.active_debug_session then + api_ctx.debug_tracer = nil end end From 30b4cd0ed2ecc292cb0c0182179c73bd6b82647b Mon Sep 17 00:00:00 2001 From: Ashish Tiwari Date: Wed, 15 Oct 2025 17:07:39 +0530 Subject: [PATCH 2/2] change name --- apisix/debug_tracer.lua | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/apisix/debug_tracer.lua b/apisix/debug_tracer.lua index cce1a8cb0df5..1765bd651652 100644 --- a/apisix/debug_tracer.lua +++ b/apisix/debug_tracer.lua @@ -41,10 +41,10 @@ end -local DebugTracerProvider = {} -DebugTracerProvider.__index = DebugTracerProvider +local debug_tracer_provider = {} +debug_tracer_provider.__index = debug_tracer_provider -function DebugTracerProvider.new(collector_config, resource_attrs) +function debug_tracer_provider.new(collector_config, resource_attrs) local self = setmetatable({ collector_config = collector_config or { address = "127.0.0.1:4318", @@ -54,12 +54,12 @@ function DebugTracerProvider.new(collector_config, resource_attrs) resource_attrs = resource_attrs or {}, spans = {}, -- Buffered spans for this tracer instance is_reporting = false - }, DebugTracerProvider) + }, debug_tracer_provider) return self end -function DebugTracerProvider:start_span(span_name, options) +function debug_tracer_provider:start_span(span_name, options) local span_id = core.utils.uuid() local trace_id = core.utils.uuid() local start_time = ngx.now() * 1000000000 -- Convert to nanoseconds @@ -88,7 +88,7 @@ function DebugTracerProvider:start_span(span_name, options) } end -function DebugTracerProvider:finish_span(span_token, end_time) +function debug_tracer_provider:finish_span(span_token, end_time) local span = self.spans[span_token.span_id] if span then span.end_time = end_time or (ngx.now() * 1000000000) @@ -96,7 +96,7 @@ function DebugTracerProvider:finish_span(span_token, end_time) return span end -function DebugTracerProvider:add_event(span_token, event_name, attributes) +function debug_tracer_provider:add_event(span_token, event_name, attributes) local span = self.spans[span_token.span_id] if span then table.insert(span.events, { @@ -107,7 +107,7 @@ function DebugTracerProvider:add_event(span_token, event_name, attributes) end end -function DebugTracerProvider:set_attributes(span_token, attributes) +function debug_tracer_provider:set_attributes(span_token, attributes) local span = self.spans[span_token.span_id] if span then for k, v in pairs(attributes) do @@ -116,7 +116,7 @@ function DebugTracerProvider:set_attributes(span_token, attributes) end end -function DebugTracerProvider:set_status(span_token, status, description) +function debug_tracer_provider:set_status(span_token, status, description) local span = self.spans[span_token.span_id] if span then span.status = { @@ -126,15 +126,12 @@ function DebugTracerProvider:set_status(span_token, status, description) end end -function DebugTracerProvider:report_trace(debug_session_id) +function debug_tracer_provider:report_trace(debug_session_id) if self.is_reporting then core.log.warn("Debug tracer is already in reporting mode") return end - self.is_reporting = true - - -- Create real OpenTelemetry tracer local real_tracer = self:_create_real_tracer(debug_session_id) -- Convert all buffered spans to real spans @@ -150,7 +147,7 @@ function DebugTracerProvider:report_trace(debug_session_id) core.log.info("Debug trace reported for session: ", debug_session_id) end -function DebugTracerProvider:_create_real_tracer(debug_session_id) +function debug_tracer_provider:_create_real_tracer(debug_session_id) -- Build resource attributes local resource_attrs = { attr.string("hostname", hostname) } @@ -197,7 +194,7 @@ function DebugTracerProvider:_create_real_tracer(debug_session_id) return tp:tracer("apisix-debug-tracer") end -function DebugTracerProvider:_convert_to_real_span(real_tracer, buffered_span) +function debug_tracer_provider:_convert_to_real_span(real_tracer, buffered_span) -- Start span with original timing local span_ctx = real_tracer:start(buffered_span.name, { kind = buffered_span.kind, @@ -223,7 +220,7 @@ function DebugTracerProvider:_convert_to_real_span(real_tracer, buffered_span) span:finish(buffered_span.end_time) end -function DebugTracerProvider:get_buffered_spans_count() +function debug_tracer_provider:get_buffered_spans_count() local count = 0 for _ in pairs(self.spans) do count = count + 1 @@ -232,7 +229,7 @@ function DebugTracerProvider:get_buffered_spans_count() end function _M.create_tracer_provider(collector_config, resource_attrs) - return DebugTracerProvider.new(collector_config, resource_attrs) + return debug_tracer_provider.new(collector_config, resource_attrs) end return _M