You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
@@ -56,6 +59,10 @@ func (s *VllmSimulator) Start() error {
56
59
return err
57
60
}
58
61
62
+
// run request processing workers
63
+
for i := 1; i <= int(s.maxRunningReqs); i++ {
64
+
go s.reqProcessingWorker(ctx, i)
65
+
}
59
66
// start the http server
60
67
return s.startServer()
61
68
}
@@ -71,6 +78,7 @@ func (s *VllmSimulator) parseCommandParams() error {
71
78
pflag.StringVar(&lorasStr, "lora", "", "List of LoRA adapters, separated by comma")
72
79
pflag.IntVar(&s.maxLoras, "max-loras", 1, "Maximum number of LoRAs in a single batch")
73
80
pflag.IntVar(&s.maxCpuLoras, "max-cpu-loras", 0, "Maximum number of LoRAs to store in CPU memory")
81
+
pflag.Int64Var(&s.maxRunningReqs, "max-running-requests", 5, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
74
82
75
83
pflag.Parse()
76
84
@@ -250,32 +258,60 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
250
258
return
251
259
}
252
260
253
-
if s.isLora(model) {
254
-
// if current request's model is LoRA, add it to the list of running loras
0 commit comments