ktransformers backend support #2221
magikRUKKOLA
started this conversation in Show and tell
Replies: 2 comments
-
UPDATE: made sure it works with the latest version (v1.10.2 (?)).

upstream ollama { server localhost:11434; }
upstream ktransformers1 { server localhost:8080; }
upstream ktransformers2 { server 192.168.1.5:8080; }
proxy_cache_path /var/www/cache levels=1:2 keys_zone=inference_cache:10m inactive=60m use_temp_path=off;
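# one megabyte of keys_zone stores ~8k keys, so 10m holds roughly 80k entries;
# cached responses not accessed for 60m are evicted, and use_temp_path=off
# writes cache files directly into /var/www/cache instead of a temp dir first.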
lua_shared_dict shared_data 10m;
init_by_lua_block {
_G.common = {
blacklist_patterns = 'llava|gemma|granite|qwq|mixtral|llama3.2|r1:32b|r1:70b|distill',
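-- pipe-separated blacklist; each entry is matched case-insensitively as a Lua
-- pattern, so the '.' in 'llama3.2' is a one-character wildcard, not a literal dot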
-- Process CORS headers
handle_cors = function()
ngx.header['Access-Control-Allow-Origin'] = '*'
ngx.header['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
ngx.header['Access-Control-Allow-Headers'] = 'Authorization, Content-Type'
ngx.header['Access-Control-Max-Age'] = 1728000
ngx.header['Content-Type'] = 'text/plain; charset=utf-8'
ngx.header['Content-Length'] = 0
return ngx.exit(204)
end,
create_http_client = function()
local http = require "resty.http"
local httpc = http.new()
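-- connect/send/read timeouts in milliseconds: an unreachable backend fails
-- fast instead of stalling the merged model list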
httpc:set_timeouts(2000, 2000, 2000)
return httpc
end,
process_ollama_model = function(model, blacklist_patterns)
local should_include = true
for pattern in blacklist_patterns:gmatch("([^|]+)") do
if model.name and model.name:lower():find(pattern:lower()) then
should_include = false
break
end
end
return should_include
end,
set_response_headers = function()
ngx.header["Content-Type"] = "application/json; charset=utf-8"
ngx.header["Access-Control-Allow-Origin"] = "*"
ngx.header["Server"] = nil
end
}
}
server {
listen 80;
server_name localhost;
set $backend "";
set $request_body_hash "";
#access_log /var/log/nginx/llm_access.log;
#error_log /var/log/nginx/llm_error.log debug;
location = /_cors_preflight {
internal;
if ($request_method = 'OPTIONS') {
access_by_lua_block { common.handle_cors() }
}
}
location ~ ^/v1/models$ {
if ($request_method = 'OPTIONS') {
access_by_lua_block { common.handle_cors() }
}
content_by_lua_block {
ngx.exec("@handle_models")
}
}
location ~ ^/api/tags$ {
if ($request_method = 'OPTIONS') {
access_by_lua_block { common.handle_cors() }
}
content_by_lua_block {
ngx.exec("@handle_models")
}
}
location @handle_models {
internal;
keepalive_timeout 0;
content_by_lua_block {
local cjson = require "cjson.safe"
local http = require "resty.http"
common.set_response_headers()
local response = {}
if ngx.var.uri:match("^/v1/models") then
response.object = "list"
response.data = {}
else
response.models = {}
end
local services = {
{
url = "http://127.0.0.1:11434/api/tags",
process = function(data)
for _, model in ipairs(data.models or {}) do
if common.process_ollama_model(model, common.blacklist_patterns) then
if ngx.var.uri:match("^/v1/models") then
table.insert(response.data, {
id = model.name,
object = "model",
created = model.modified_at,
owned_by = "library"
})
else
table.insert(response.models, {
name = model.name,
model = model.model,
modified_at = model.modified_at,
size = model.size,
digest = model.digest,
details = model.details
})
end
end
end
end
},
{ url = "http://127.0.0.1:8080/v1/models" },
{ url = "http://192.168.1.5:8080/v1/models" }
}
for _, s in ipairs(services) do
local httpc = common.create_http_client()
local res, err = httpc:request_uri(s.url, {
method = "GET",
headers = {
["Content-Type"] = "application/json",
["Host"] = "localhost",
}
})
if res and res.status == 200 then
local ok, data = pcall(cjson.decode, res.body)
if ok and data then
if s.process then
s.process(data)
else
if data.data then
for _, model in ipairs(data.data or {}) do
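-- strip an optional "vendor/" prefix, e.g. "deepseek-ai/DeepSeek-R1" -> "DeepSeek-R1"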
local model_id = model.id:match("^[^/]*/(.*)") or model.id
if ngx.var.uri:match("^/v1/models") then
table.insert(response.data, {
id = model_id,
object = "model",
created = os.time(),
owned_by = "library"
})
else
table.insert(response.models, {
name = model_id,
model = model.id
})
end
end
end
end
end
end
httpc:set_keepalive()
end
-- Safer empty response check
local empty_response = false
if ngx.var.uri:match("^/v1/models") then
empty_response = (response.data == nil or #response.data == 0)
else
empty_response = (response.models == nil or #response.models == 0)
end
if empty_response then
ngx.status = 500
ngx.say('{"error":"No model providers available"}')
else
ngx.print(cjson.encode(response))
end
}
}
location ~ ^/(v1/(chat/completions|completions|embeddings)|api/(generate|chat)) {
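# OPTIONS preflights are redirected to the named CORS location through the
# otherwise unused 418 status (the error_page trick)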
error_page 418 = @handle_cors_preflight;
if ($request_method = 'OPTIONS') { return 418; }
access_by_lua_block {
local cjson = require "cjson.safe"
if ngx.req.get_method() == "POST" then
ngx.req.read_body()
local body = ngx.req.get_body_data() or ""
local args = cjson.decode(body) or {}
local is_naming_request = false
if args.messages and #args.messages > 0 then
local first_message = args.messages[1]
if first_message.content and
first_message.content:find("Based on the chat history, give this conversation a name") then
is_naming_request = true
end
end
local model = args.model or "DeepSeek-V3-0324-GGUF"
if is_naming_request then
ngx.var.backend = "ktransformers2"
else
ngx.var.backend = model:lower():find("r1%-1776%-gguf") and "ktransformers1"
or model:lower():find("v3%-") and "ktransformers2"
or "ollama"
end
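-- NOTE: when the body is large enough to be buffered to disk, this hashes the
-- temp-file *path* rather than its contents, so such requests are effectively
-- never served from cache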
ngx.var.request_body_hash = ngx.md5(ngx.req.get_body_file() or body)
end
}
proxy_pass http://$backend;
proxy_http_version 1.0;
proxy_set_header Connection "";
proxy_cache inference_cache;
proxy_cache_key "$request_method|$request_uri|$http_authorization|$request_body_hash";
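# the Authorization header and body hash are part of the key, so different
# users and different prompts never share a cached completion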
proxy_cache_valid 200 20m;
proxy_cache_use_stale error timeout updating;
proxy_cache_background_update on;
proxy_cache_lock on;
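# POST is normally not cacheable; enabling it is safe here only because the
# request body hash is baked into the cache key above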
proxy_cache_methods GET POST;
sub_filter_types application/json text/event-stream;
sub_filter_once off;
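# crude textual clean-ups on the streamed JSON: flip the final empty-content
# chunk to done:true and strip null fields that strict ollama clients choke on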
sub_filter 'content":""},"done":false' 'content":""},"done":true';
sub_filter ',"created_at":null' '';
sub_filter ',"message":null' '';
sub_filter ',"total_duration":null' '';
sub_filter ',"load_duration":null' '';
sub_filter ',"prompt_eval_count":null' '';
sub_filter ',"prompt_eval_duration":null' '';
sub_filter ',"eval_count":null' '';
sub_filter ',"eval_duration":null' '';
sub_filter ',}' '}';
sub_filter ',]' ']';
header_filter_by_lua_block { ngx.header.content_length = nil }
}
location @handle_cors_preflight {
internal;
access_by_lua_block { common.handle_cors() }
}
}
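A quick smoke test for the merged model list (a minimal sketch, assuming OpenResty's resty CLI and lua-resty-http are installed and the proxy is listening on localhost:80; save as list_models.lua and run with `resty list_models.lua`):

-- fetch the merged /v1/models list through the nginx wrapper
local http = require "resty.http"
local httpc = http.new()
local res, err = httpc:request_uri("http://127.0.0.1/v1/models", { method = "GET" })
if not res then
print("request failed: ", err)
return
end
print(res.status)
print(res.body)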
-
tool calls support added
-
https://github.com/kvcache-ai/ktransformers
One can use nginx and Lua to build a wrapper that keeps a preconfigured list of ollama/OpenAI API backends, queries each of them for the models available, then parses the incoming HTTP request and routes it to the backend that serves the exact LLM the client requested. An additional benefit is caching of the completed response: if a network interruption makes the chat app report a connection error, the request can simply be repeated later and the response is served in full from the nginx cache.
So it is now possible, for example, to point an Android app like Chatbox AI at this backend and switch between multiple ktransformers instances from the model dropdown in the Chatbox AI GUI, just as if it were a regular ollama API backend.
The code below also has additional functionality to blacklist certain models from inclusion in the resulting list (a standalone illustration of the matcher follows).
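For illustration, the blacklist matcher in process_ollama_model behaves like this standalone Lua snippet (runnable with plain lua5.1; the sample model names are hypothetical):

local blacklist = 'llava|gemma|granite|qwq|mixtral|llama3.2|r1:32b|r1:70b|distill'
local function blacklisted(name)
-- each '|'-separated entry is treated as a case-insensitive Lua pattern
for pattern in blacklist:gmatch("([^|]+)") do
if name:lower():find(pattern:lower()) then return true end
end
return false
end
print(blacklisted("gemma2:9b")) -- true: hidden from the merged list
print(blacklisted("DeepSeek-R1-1776")) -- false: kept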
The code below also reroutes the requests from the Chatbox AI app related to conversation title naming to the NON-reasoning model (to avoid seeing <think> tags in the result); a sketch of that detection follows.
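A standalone illustration of the title-naming detection and rerouting (a sketch assuming lua-cjson, which ships with OpenResty; the sample request body is hypothetical):

local cjson = require "cjson.safe"
local body = '{"model":"DeepSeek-R1-1776-GGUF","messages":[{"role":"user","content":"Based on the chat history, give this conversation a name."}]}'
local args = cjson.decode(body) or {}
local is_naming_request = false
if args.messages and #args.messages > 0 then
local first = args.messages[1]
if first.content and first.content:find("Based on the chat history, give this conversation a name") then
is_naming_request = true
end
end
-- naming requests go to the non-reasoning ktransformers2 backend
print(is_naming_request and "ktransformers2" or "model-based routing")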