
Commit 2119638

irar2 and mayabar authored
Configuration improvements (#75)
* Configuration file, changes to command line parameters
  Signed-off-by: Ira <IRAR@il.ibm.com>

* Update README.md - fix mode options presentation
  Signed-off-by: Maya Barnea <mayab@il.ibm.com>

* Update README.md
  Signed-off-by: Maya Barnea <mayab@il.ibm.com>

* Update README.md - update indentation
  Signed-off-by: Maya Barnea <mayab@il.ibm.com>

* Added served-model-name to configuration
  Signed-off-by: Ira <IRAR@il.ibm.com>

---------

Signed-off-by: Ira <IRAR@il.ibm.com>
Signed-off-by: Maya Barnea <mayab@il.ibm.com>
Signed-off-by: Ira Rosen <irar@il.ibm.com>
Co-authored-by: Maya Barnea <mayab@il.ibm.com>
1 parent 5a9ba0c commit 2119638

File tree

9 files changed: +442 -94 lines changed

README.md

Lines changed: 29 additions & 7 deletions

@@ -85,18 +85,40 @@ API responses contains a subset of the fields provided by the OpenAI API.
 For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started/quickstart.html#openai-completions-api-with-vllm">vLLM documentation</a>
 
 ## Command line parameters
-- `port`: the port the simulator listents on, mandatory
+- `config`: the path to a yaml configuration file
+- `port`: the port the simulator listens on, default is 8000
 - `model`: the currently 'loaded' model, mandatory
-- `lora`: a list of available LoRA adapters, separated by commas, optional, by default empty
+- `served-model-name`: model names exposed by the API (comma-separated)
+- `lora-modules`: LoRA module configurations in JSON format: [{"name": "name", "path": "lora_path", "base_model_name": "id"}], optional, empty by default
+- `max-loras`: maximum number of LoRAs in a single batch, optional, default is one
+- `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= max-loras, default is max-loras
+- `max-num-seqs`: maximum number of sequences per iteration (maximum number of inference requests that could be processed at the same time), default is 5
 - `mode`: the simulator mode, optional, by default `random`
-- `echo`: returns the same text that was sent in the request
-- `random`: returns a sentence chosen at random from a set of pre-defined sentences
+  - `echo`: returns the same text that was sent in the request
+  - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
-- `max-loras`: maximum number of LoRAs in a single batch, optional, default is one
-- `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= than max_loras, default is max_loras
-- `max-running-requests`: maximum number of inference requests that could be processed at the same time
 
+In addition, as we are using klog, the following parameters are available:
+- `add_dir_header`: if true, adds the file directory to the header of the log messages
+- `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
+- `log_backtrace_at`: when logging hits line file:N, emit a stack trace (default :0)
+- `log_dir`: if non-empty, write log files in this directory (no effect when -logtostderr=true)
+- `log_file`: if non-empty, use this log file (no effect when -logtostderr=true)
+- `log_file_max_size`: defines the maximum size a log file can grow to (no effect when -logtostderr=true). Unit is megabytes. If the value is 0, the maximum file size is unlimited. (default 1800)
+- `logtostderr`: log to standard error instead of files (default true)
+- `one_output`: if true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true)
+- `skip_headers`: if true, avoid header prefixes in the log messages
+- `skip_log_headers`: if true, avoid headers when opening log files (no effect when -logtostderr=true)
+- `stderrthreshold`: logs at or above this threshold go to stderr when writing to files and stderr (no effect when -logtostderr=true or -alsologtostderr=true) (default 2)
+- `v`: number for the log level verbosity
+- `vmodule`: comma-separated list of pattern=N settings for file-filtered logging
+
+---
+
+## Migrating from releases prior to v0.2.0
+- `max-running-requests` was replaced by `max-num-seqs`
+- `lora` was replaced by `lora-modules`, which is now an array in JSON format, e.g., [{"name": "name", "path": "lora_path", "base_model_name": "id"}]
 
 ## Working with docker image
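
Note: the klog flags listed above are registered by klog itself, not defined by the simulator. This commit does not show the wiring, but the standard mechanism is klog.InitFlags; a minimal, self-contained Go sketch of that mechanism (illustrative only, not the simulator's code):

package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	// klog.InitFlags(nil) registers klog's flags (v, vmodule, logtostderr,
	// log_dir, ...) on the default flag set; parsing then makes them
	// available alongside any application-defined flags.
	klog.InitFlags(nil)
	flag.Parse()

	klog.Info("always printed")
	klog.V(2).Info("printed only when run with -v=2 or higher")
	klog.Flush()
}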

manifests/config.yaml

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
port: 8001
model: "Qwen/Qwen2-0.5B"
served-model-name: ["model1", "model2"]
max-loras: 2
max-cpu-loras: 5
max-num-seqs: 5
lora-modules: [{"name":"lora1","path":"/path/to/lora1"},{"name":"lora2","path":"/path/to/lora2"}]

mode: "random"
time-to-first-token: 2
inter-token-latency: 1
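
This file is plain YAML, so loading it is a single yaml.v3 unmarshal (the commit's configuration.load below does essentially this). A self-contained sketch with an abbreviated, illustratively named struct:

package main

import (
	"fmt"
	"os"

	"gopkg.in/yaml.v3"
)

// miniConfig is an illustrative subset of the simulator's configuration type.
type miniConfig struct {
	Port             int      `yaml:"port"`
	Model            string   `yaml:"model"`
	ServedModelNames []string `yaml:"served-model-name"`
	Mode             string   `yaml:"mode"`
}

func main() {
	data, err := os.ReadFile("manifests/config.yaml")
	if err != nil {
		panic(err)
	}
	var c miniConfig
	if err := yaml.Unmarshal(data, &c); err != nil {
		panic(err)
	}
	// With the manifest above this prints:
	// {Port:8001 Model:Qwen/Qwen2-0.5B ServedModelNames:[model1 model2] Mode:random}
	fmt.Printf("%+v\n", c)
}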

manifests/deployment.yaml

Lines changed: 2 additions & 2 deletions

@@ -20,8 +20,8 @@ spec:
 - "8000"
 - --max-loras
 - "2"
-- --lora
-- food-review-1
+- --lora-modules
+- '[{"name": "food-review-1"}]'
 image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.0
 imagePullPolicy: IfNotPresent
 name: vllm-sim
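
The migrated argument sets only "name"; decoded as JSON, the remaining fields stay empty, which the commit's validate() accepts (only an empty LoRA name, or a base model name that differs from the configured model, is rejected). A quick check using an illustrative stand-in for the commit's unexported loraModule type:

package main

import (
	"encoding/json"
	"fmt"
)

// loraEntry stands in for the commit's loraModule type.
type loraEntry struct {
	Name          string `json:"name"`
	Path          string `json:"path"`
	BaseModelName string `json:"base_model_name"`
}

func main() {
	arg := `[{"name": "food-review-1"}]` // the value passed to --lora-modules above
	var loras []loraEntry
	if err := json.Unmarshal([]byte(arg), &loras); err != nil {
		panic(err)
	}
	// Path and BaseModelName decode to empty strings.
	fmt.Printf("%+v\n", loras[0]) // {Name:food-review-1 Path: BaseModelName:}
}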

pkg/llm-d-inference-sim/config.go

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
/*
Copyright 2025 The llm-d-inference-sim Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package llmdinferencesim

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"

	"gopkg.in/yaml.v3"
)

type configuration struct {
	// Port defines on which port the simulator runs
	Port int `yaml:"port"`
	// Model defines the current base model name
	Model string `yaml:"model"`
	// ServedModelNames is one or many model names exposed by the API
	ServedModelNames []string `yaml:"served-model-name"`
	// MaxLoras defines maximum number of loaded LoRAs
	MaxLoras int `yaml:"max-loras"`
	// MaxCPULoras defines maximum number of LoRAs to store in CPU memory
	MaxCPULoras int `yaml:"max-cpu-loras"`
	// MaxNumSeqs is maximum number of sequences per iteration (the maximum
	// number of inference requests that could be processed at the same time)
	MaxNumSeqs int `yaml:"max-num-seqs"`
	// LoraModules is a list of LoRA adapters
	LoraModules loraModulesValue `yaml:"lora-modules"`

	// TimeToFirstToken time before the first token will be returned, in milliseconds
	TimeToFirstToken int `yaml:"time-to-first-token"`
	// InterTokenLatency time between generated tokens, in milliseconds
	InterTokenLatency int `yaml:"inter-token-latency"`
	// Mode defines the simulator response generation mode, valid values: echo, random
	Mode string `yaml:"mode"`
}

type loraModule struct {
	// Name is the LoRA's name
	Name string `yaml:"name"`
	// Path is the LoRA's path
	Path string `yaml:"path"`
	// BaseModelName is the LoRA's base model
	BaseModelName string `yaml:"base_model_name"`
}

type loraModulesValue []loraModule

func (l *loraModulesValue) String() string {
	b, _ := json.Marshal(l)
	return string(b)
}

func (l *loraModulesValue) Set(val string) error {
	return json.Unmarshal([]byte(val), l)
}

func (l *loraModulesValue) Type() string {
	return "loras"
}

// Implement custom YAML unmarshaling for just this type
func (l *loraModulesValue) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// Try parsing as an array of loraModule
	var arr []loraModule
	if err := unmarshal(&arr); err == nil {
		*l = arr
		return nil
	}
	// Try parsing as a JSON string
	var str string
	if err := unmarshal(&str); err == nil {
		return json.Unmarshal([]byte(str), l)
	}
	return errors.New("lora-modules: invalid format")
}

func newConfig() *configuration {
	return &configuration{
		Port:        vLLMDefaultPort,
		MaxLoras:    1,
		MaxCPULoras: 1,
		MaxNumSeqs:  5,
		Mode:        modeRandom,
	}
}

func (c *configuration) load(configFile string) error {
	configBytes, err := os.ReadFile(configFile)
	if err != nil {
		return fmt.Errorf("failed to read configuration file: %s", err)
	}

	if err := yaml.Unmarshal(configBytes, &c); err != nil {
		return fmt.Errorf("failed to unmarshal configuration: %s", err)
	}
	return nil
}

func (c *configuration) validate() error {
	if c.Model == "" {
		return errors.New("model parameter is empty")
	}
	// Upstream vLLM behaviour: when --served-model-name is not provided,
	// it falls back to using the value of --model as the single public name
	// returned by the API and exposed in Prometheus metrics.
	if len(c.ServedModelNames) == 0 {
		c.ServedModelNames = []string{c.Model}
	}

	if c.Mode != modeEcho && c.Mode != modeRandom {
		return fmt.Errorf("invalid mode '%s', valid values are 'random' and 'echo'", c.Mode)
	}
	if c.Port <= 0 {
		return fmt.Errorf("invalid port '%d'", c.Port)
	}
	if c.InterTokenLatency < 0 {
		return errors.New("inter token latency cannot be negative")
	}
	if c.TimeToFirstToken < 0 {
		return errors.New("time to first token cannot be negative")
	}
	if c.MaxLoras < 1 {
		return errors.New("max LoRAs cannot be less than 1")
	}
	if c.MaxCPULoras == 0 {
		// max CPU LoRAs by default is same as max LoRAs
		c.MaxCPULoras = c.MaxLoras
	}
	if c.MaxCPULoras < c.MaxLoras {
		return errors.New("max CPU LoRAs cannot be less than max LoRAs")
	}

	for _, lora := range c.LoraModules {
		if lora.Name == "" {
			return errors.New("empty LoRA name")
		}
		if lora.BaseModelName != "" && lora.BaseModelName != c.Model {
			return fmt.Errorf("unknown base model '%s' for LoRA '%s'", lora.BaseModelName, lora.Name)
		}
	}

	return nil
}
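
Two details in this file are worth calling out. First, the String/Set/Type trio on loraModulesValue matches the Value interface of github.com/spf13/pflag (the standard library's flag.Value has no Type method), so the type is presumably bound to --lora-modules via pflag.Var. Second, the UnmarshalYAML hook means a config file may spell lora-modules either as a native YAML list or as a JSON string. A standalone sketch of both accepted forms, using a copied illustrative type rather than importing the unexported original:

package main

import (
	"encoding/json"
	"fmt"

	"gopkg.in/yaml.v3"
)

type lora struct {
	Name string `yaml:"name" json:"name"`
	Path string `yaml:"path" json:"path"`
}

type loras []lora

// UnmarshalYAML accepts either a YAML list or a JSON string,
// mirroring the commit's loraModulesValue.
func (l *loras) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var arr []lora
	if err := unmarshal(&arr); err == nil {
		*l = arr
		return nil
	}
	var s string
	if err := unmarshal(&s); err == nil {
		return json.Unmarshal([]byte(s), l)
	}
	return fmt.Errorf("lora-modules: invalid format")
}

func main() {
	nativeList := `lora-modules:
  - name: lora1
    path: /path/to/lora1
`
	jsonString := `lora-modules: '[{"name":"lora2","path":"/path/to/lora2"}]'`

	for _, doc := range []string{nativeList, jsonString} {
		var cfg struct {
			LoraModules loras `yaml:"lora-modules"`
		}
		if err := yaml.Unmarshal([]byte(doc), &cfg); err != nil {
			panic(err)
		}
		// Prints [{Name:lora1 Path:/path/to/lora1}], then [{Name:lora2 Path:/path/to/lora2}].
		fmt.Printf("%+v\n", cfg.LoraModules)
	}
}
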
Lines changed: 155 additions & 0 deletions

@@ -0,0 +1,155 @@
/*
Copyright 2025 The llm-d-inference-sim Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package llmdinferencesim

import (
	"os"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"k8s.io/klog/v2"
)

func createSimConfig(args []string) (*configuration, error) {
	oldArgs := os.Args
	defer func() {
		os.Args = oldArgs
	}()
	os.Args = args

	s, err := New(klog.Background())
	if err != nil {
		return nil, err
	}
	if err := s.parseCommandParamsAndLoadConfig(); err != nil {
		return nil, err
	}
	return s.config, nil
}

type testCase struct {
	name           string
	args           []string
	expectedConfig *configuration
}

var _ = Describe("Simulator configuration", func() {
	tests := make([]testCase, 0)

	// Simple config with only model name set
	c := newConfig()
	c.Model = model
	c.ServedModelNames = []string{c.Model}
	test := testCase{
		name:           "simple",
		args:           []string{"cmd", "--model", model, "--mode", modeRandom},
		expectedConfig: c,
	}
	tests = append(tests, test)

	// Config from config.yaml file
	c = newConfig()
	c.Port = 8001
	c.Model = "Qwen/Qwen2-0.5B"
	c.ServedModelNames = []string{"model1", "model2"}
	c.MaxLoras = 2
	c.MaxCPULoras = 5
	c.MaxNumSeqs = 5
	c.TimeToFirstToken = 2
	c.InterTokenLatency = 1
	c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}}
	test = testCase{
		name:           "config file",
		args:           []string{"cmd", "--config", "../../manifests/config.yaml"},
		expectedConfig: c,
	}
	tests = append(tests, test)

	// Config from config.yaml file plus command line args
	c = newConfig()
	c.Port = 8002
	c.Model = model
	c.ServedModelNames = []string{"alias1", "alias2"}
	c.MaxLoras = 2
	c.MaxCPULoras = 5
	c.MaxNumSeqs = 5
	c.TimeToFirstToken = 2
	c.InterTokenLatency = 1
	c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}}
	test = testCase{
		name: "config file with command line args",
		args: []string{"cmd", "--model", model, "--config", "../../manifests/config.yaml", "--port", "8002",
			"--served-model-name", "alias1,alias2"},
		expectedConfig: c,
	}
	tests = append(tests, test)

	// Invalid configurations
	test = testCase{
		name: "invalid model",
		args: []string{"cmd", "--model", "", "--config", "../../manifests/config.yaml"},
	}
	tests = append(tests, test)

	test = testCase{
		name: "invalid port",
		args: []string{"cmd", "--port", "-50", "--config", "../../manifests/config.yaml"},
	}
	tests = append(tests, test)

	test = testCase{
		name: "invalid max-loras",
		args: []string{"cmd", "--max-loras", "15", "--config", "../../manifests/config.yaml"},
	}
	tests = append(tests, test)

	test = testCase{
		name: "invalid mode",
		args: []string{"cmd", "--mode", "hello", "--config", "../../manifests/config.yaml"},
	}
	tests = append(tests, test)

	test = testCase{
		name: "invalid lora",
		args: []string{"cmd", "--config", "../../manifests/config.yaml",
			"--lora-modules", "[{\"path\":\"/path/to/lora15\"}]"},
	}
	tests = append(tests, test)

	DescribeTable("check configurations",
		func(args []string, expectedConfig *configuration) {
			config, err := createSimConfig(args)
			Expect(err).NotTo(HaveOccurred())
			Expect(config).To(Equal(expectedConfig))
		},
		Entry(tests[0].name, tests[0].args, tests[0].expectedConfig),
		Entry(tests[1].name, tests[1].args, tests[1].expectedConfig),
		Entry(tests[2].name, tests[2].args, tests[2].expectedConfig),
	)

	DescribeTable("invalid configurations",
		func(args []string) {
			_, err := createSimConfig(args)
			Expect(err).To(HaveOccurred())
		},
		Entry(tests[3].name, tests[3].args),
		Entry(tests[4].name, tests[4].args),
		Entry(tests[5].name, tests[5].args),
		Entry(tests[6].name, tests[6].args),
		Entry(tests[7].name, tests[7].args),
	)
})
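
These Ginkgo specs still need a go test entry point to run; the package presumably defines one elsewhere, but a typical bootstrap (file, function, and suite names here are illustrative) looks like:

package llmdinferencesim

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// TestConfig hands control to Ginkgo, which then runs every
// Describe/DescribeTable spec registered in the package.
func TestConfig(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "llm-d-inference-sim suite")
}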
