Skip to content

Commit 7f1f766

Browse files
authored
Choose latencies randomly (#103)
* Choose latencies randomly Signed-off-by: Ira <IRAR@il.ibm.com> * Improved code readability Signed-off-by: Ira <IRAR@il.ibm.com> --------- Signed-off-by: Ira <IRAR@il.ibm.com>
1 parent 15bfa0b commit 7f1f766

File tree

7 files changed

+188
-5
lines changed

7 files changed

+188
-5
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,11 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
9898
- `echo`: returns the same text that was sent in the request
9999
- `random`: returns a sentence chosen at random from a set of pre-defined sentences
100100
- `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
101+
- `time-to-first-token-std-dev`: standard deviation of the time before the first token is returned (in milliseconds), optional, by default zero. Cannot be more than 30% of `time-to-first-token`. The actual time to first token is clamped so that it never differs from `time-to-first-token` by more than 70%
101102
- `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
103+
- `inter-token-latency-std-dev`: standard deviation of the time between generated tokens (in milliseconds), optional, by default zero. Cannot be more than 30% of `inter-token-latency`. The actual inter-token latency is clamped so that it never differs from `inter-token-latency` by more than 70%
102104
- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
105+
- `kv-cache-transfer-latency-std-dev`: standard deviation of the time to "transfer" the KV-cache from another vLLM instance when P/D is activated (in milliseconds), optional, by default zero. Cannot be more than 30% of `kv-cache-transfer-latency`. The actual latency is clamped so that it never differs from `kv-cache-transfer-latency` by more than 70%
103106
- `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
104107
- `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
105108
- `min-tool-call-integer-param`: the minimum possible value of integer parameters in a tool call, optional, defaults to 0

pkg/llm-d-inference-sim/config.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,24 @@ type configuration struct {
5151

5252
// TimeToFirstToken time before the first token will be returned, in milliseconds
5353
TimeToFirstToken int `yaml:"time-to-first-token"`
54+
// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned,
55+
// in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
56+
// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
57+
TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev"`
5458
// InterTokenLatency time between generated tokens, in milliseconds
5559
InterTokenLatency int `yaml:"inter-token-latency"`
56-
// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
60+
// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
61+
// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
62+
// inter token latency to differ by more than 70% from InterTokenLatency
63+
InterTokenLatencyStdDev int `yaml:"inter-token-latency-std-dev"`
64+
// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated,
65+
// in milliseconds
5766
KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
67+
// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
68+
// vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more
69+
// than 30% of KVCacheTransferLatency, will not cause the actual latency to differ by more than 70% from
70+
// KVCacheTransferLatency
71+
KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev"`
5872

5973
// Mode defines the simulator response generation mode, valid values: echo, random
6074
Mode string `yaml:"mode"`
@@ -178,12 +192,30 @@ func (c *configuration) validate() error {
178192
if c.InterTokenLatency < 0 {
179193
return errors.New("inter token latency cannot be negative")
180194
}
195+
if c.InterTokenLatencyStdDev < 0 {
196+
return errors.New("inter token latency standard deviation cannot be negative")
197+
}
198+
if float32(c.InterTokenLatencyStdDev) > 0.3*float32(c.InterTokenLatency) {
199+
return errors.New("inter token latency standard deviation cannot be more than 30% of inter token latency")
200+
}
181201
if c.TimeToFirstToken < 0 {
182202
return errors.New("time to first token cannot be negative")
183203
}
204+
if c.TimeToFirstTokenStdDev < 0 {
205+
return errors.New("time to first token standard deviation cannot be negative")
206+
}
207+
if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) {
208+
return errors.New("time to first token standard deviation cannot be more than 30% of time to first token")
209+
}
184210
if c.KVCacheTransferLatency < 0 {
185211
return errors.New("kv-cache tranfer time cannot be negative")
186212
}
213+
if c.KVCacheTransferLatencyStdDev < 0 {
214+
return errors.New("kv-cache tranfer time standard deviation cannot be negative")
215+
}
216+
if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
217+
return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
218+
}
187219
if c.MaxLoras < 1 {
188220
return errors.New("max LoRAs cannot be less than 1")
189221
}

pkg/llm-d-inference-sim/config_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,36 @@ var _ = Describe("Simulator configuration", func() {
258258
args: []string{"cmd", "--object-tool-call-not-required-field-probability", "1210",
259259
"--config", "../../manifests/config.yaml"},
260260
},
261+
{
262+
name: "invalid time-to-first-token-std-dev",
263+
args: []string{"cmd", "--time-to-first-token-std-dev", "3000",
264+
"--config", "../../manifests/config.yaml"},
265+
},
266+
{
267+
name: "invalid (negative) time-to-first-token-std-dev",
268+
args: []string{"cmd", "--time-to-first-token-std-dev", "10", "--time-to-first-token-std-dev", "-1",
269+
"--config", "../../manifests/config.yaml"},
270+
},
271+
{
272+
name: "invalid inter-token-latency-std-dev",
273+
args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "301",
274+
"--config", "../../manifests/config.yaml"},
275+
},
276+
{
277+
name: "invalid (negative) inter-token-latency-std-dev",
278+
args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "-1",
279+
"--config", "../../manifests/config.yaml"},
280+
},
281+
{
282+
name: "invalid kv-cache-transfer-latency-std-dev",
283+
args: []string{"cmd", "--kv-cache-transfer-latency", "70", "--kv-cache-transfer-latency-std-dev", "35",
284+
"--config", "../../manifests/config.yaml"},
285+
},
286+
{
287+
name: "invalid (negative) kv-cache-transfer-latency-std-dev",
288+
args: []string{"cmd", "--kv-cache-transfer-latency-std-dev", "-35",
289+
"--config", "../../manifests/config.yaml"},
290+
},
261291
}
262292

263293
for _, test := range invalidTests {

pkg/llm-d-inference-sim/simulator.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
158158
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
159159
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
160160
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
161+
f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
162+
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
163+
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
161164
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
162165

163166
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
@@ -674,7 +677,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
674677

675678
// calculate how long to wait before returning the response, time is based on number of tokens
676679
numOfTokens := usageData.CompletionTokens
677-
totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + (numOfTokens-1)*s.config.InterTokenLatency
680+
totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens)
678681
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
679682

680683
// TODO - maybe add pod id to response header for testing
@@ -687,10 +690,29 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
687690

688691
// returns time to first token based on the current request's doRemotePrefill
689692
func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
693+
mean := float64(s.config.TimeToFirstToken)
694+
stddev := float64(s.config.TimeToFirstTokenStdDev)
690695
if doRemotePrefill {
691-
return s.config.KVCacheTransferLatency
696+
mean = float64(s.config.KVCacheTransferLatency)
697+
stddev = float64(s.config.KVCacheTransferLatencyStdDev)
692698
}
693-
return s.config.TimeToFirstToken
699+
return int(randomNorm(mean, stddev))
700+
}
701+
702+
// returns inter token latency
703+
func (s *VllmSimulator) getInterTokenLatency() int {
704+
mean := float64(s.config.InterTokenLatency)
705+
stddev := float64(s.config.InterTokenLatencyStdDev)
706+
return int(randomNorm(mean, stddev))
707+
}
708+
709+
// returns total inter token latency for the given number of tokens
710+
func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
711+
total := 0
712+
for range numOfTokens - 1 {
713+
total += s.getInterTokenLatency()
714+
}
715+
return total
694716
}
695717

696718
// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,4 +489,84 @@ var _ = Describe("Simulator", func() {
489489
Expect(string(body)).To(ContainSubstring("BadRequestError"))
490490
})
491491
})
492+
493+
// Checks that randomly sampled latencies never differ from their configured
// mean by more than 70%, even when the standard deviation is larger than
// configuration validation would allow.
Describe("Check random latencies", Ordered, func() {
	var simulator *VllmSimulator

	BeforeAll(func() {
		var err error
		simulator, err = New(klog.Background())
		Expect(err).NotTo(HaveOccurred())

		simulator.config = newConfig()
		simulator.config.TimeToFirstToken = 2048
		simulator.config.TimeToFirstTokenStdDev = 2048
		simulator.config.KVCacheTransferLatency = 2048
		simulator.config.KVCacheTransferLatencyStdDev = 2048
	})

	DescribeTable("should calculate inter token latency correctly",
		func(interTokenLatency int, stddev int) {
			simulator.config.InterTokenLatency = interTokenLatency
			simulator.config.InterTokenLatencyStdDev = stddev
			interToken := simulator.getInterTokenLatency()
			Expect(interToken).To(BeNumerically(">=", float32(interTokenLatency)*0.3))
			Expect(interToken).To(BeNumerically("<=", float32(interTokenLatency)*1.7))
		},
		func(interTokenLatency int, stddev int) string {
			return fmt.Sprintf("interTokenLatency: %d stddev: %d", interTokenLatency, stddev)
		},
		Entry(nil, 1000, 300),
		Entry(nil, 1000, 800), // invalid std dev, used for testing purposes
		Entry(nil, 1000, 900), // invalid std dev, used for testing purposes
		Entry(nil, 1000, 0),
	)

	DescribeTable("should calculate total inter token latency correctly",
		func(interTokenLatency int, stddev int, numberOfTokens int) {
			simulator.config.InterTokenLatency = interTokenLatency
			simulator.config.InterTokenLatencyStdDev = stddev
			latency := simulator.getTotalInterTokenLatency(numberOfTokens)
			// getTotalInterTokenLatency sums one sample per gap between tokens,
			// i.e. numberOfTokens-1 samples. Bounding by numberOfTokens samples
			// would make the lower bound higher than the smallest legal total
			// and cause sporadic failures.
			numberOfGaps := float32(numberOfTokens - 1)
			Expect(latency).To(BeNumerically(">=", float32(interTokenLatency)*0.3*numberOfGaps))
			Expect(latency).To(BeNumerically("<=", float32(interTokenLatency)*1.7*numberOfGaps))
		},
		func(interTokenLatency int, stddev int, numberOfTokens int) string {
			return fmt.Sprintf("interTokenLatency: %d stddev: %d, numberOfTokens: %d", interTokenLatency,
				stddev, numberOfTokens)
		},
		Entry(nil, 1000, 30, 100),
		Entry(nil, 1000, 800, 20), // invalid std dev, used for testing purposes
		Entry(nil, 1000, 900, 5),  // invalid std dev, used for testing purposes
		Entry(nil, 1000, 0, 50),
	)

	DescribeTable("should calculate time to first token correctly",
		func(timeToFirstToken int, timeToFirstTokenStdDev int,
			kvCacheLatency int, kvCacheLatencyStdDev int, doRemotePrefill bool) {
			simulator.config.TimeToFirstToken = timeToFirstToken
			simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
			simulator.config.KVCacheTransferLatency = kvCacheLatency
			simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
			timeToFirst := simulator.getTimeToFirstToken(doRemotePrefill)
			// With remote prefill the KV-cache transfer latency is the mean,
			// otherwise the time to first token is.
			if doRemotePrefill {
				Expect(timeToFirst).To(BeNumerically(">=", float32(kvCacheLatency)*0.3))
				Expect(timeToFirst).To(BeNumerically("<=", float32(kvCacheLatency)*1.7))
			} else {
				Expect(timeToFirst).To(BeNumerically(">=", float32(timeToFirstToken)*0.3))
				Expect(timeToFirst).To(BeNumerically("<=", float32(timeToFirstToken)*1.7))
			}
		},
		func(timeToFirstToken int, timeToFirstTokenStdDev int,
			kvCacheLatency int, kvCacheLatencyStdDev int, doRemotePrefill bool) string {
			return fmt.Sprintf("timeToFirstToken: %d stddev: %d kvCacheLatency: %d stddev: %d doREmotePrefill: %t",
				timeToFirstToken, timeToFirstTokenStdDev, kvCacheLatency, kvCacheLatencyStdDev, doRemotePrefill)
		},
		Entry(nil, 10000, 300, 1000, 200, true),
		Entry(nil, 10000, 300, 1000, 200, false),
		Entry(nil, 10000, 9000, 1000, 800, true),  // invalid std dev, used for testing purposes
		Entry(nil, 10000, 8000, 1000, 900, false), // invalid std dev, used for testing purposes
		Entry(nil, 10000, 0, 1000, 0, true),
		Entry(nil, 10000, 0, 1000, 0, false),
	)
})
492572
})

pkg/llm-d-inference-sim/streaming.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
9191

9292
for i, token := range tokens {
9393
if i != 0 {
94-
time.Sleep(time.Duration(s.config.InterTokenLatency) * time.Millisecond)
94+
time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond)
9595
}
9696
var toolChunkInsert *toolCall
9797
if tc != nil {

pkg/llm-d-inference-sim/utils.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,22 @@ func randomFloat(min float64, max float64) float64 {
151151
return randomGenerator.Float64()*(max-min) + min
152152
}
153153

154+
// Returns a normally distributed float64
155+
// If the generated value differs by more than 70% from mean, the returned
156+
// value will be 70% of mean
157+
func randomNorm(mean float64, stddev float64) float64 {
158+
if stddev == 0 {
159+
return mean
160+
}
161+
value := randomGenerator.NormFloat64()*stddev + mean
162+
if value < 0.3*mean {
163+
value = 0.3 * mean
164+
} else if value > 1.7*mean {
165+
value = 1.7 * mean
166+
}
167+
return value
168+
}
169+
154170
// Regular expression for the response tokenization
155171
var re *regexp.Regexp
156172

0 commit comments

Comments
 (0)