@@ -1,6 +1,6 @@
 from __future__ import annotations

-import asyncio, time, typing
+import asyncio, time, typing, os
 import httpx, openai

 from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
@@ -19,8 +19,6 @@


 def prep_env_vars(bento: BentoInfo) -> None:
-    import os
-
     env_vars = bento.envs
     for env_var in env_vars:
         if not env_var.get('value'):
@@ -30,23 +28,57 @@ def prep_env_vars(bento: BentoInfo) -> None:
         os.environ[key] = value


-def _get_serve_cmd(bento: BentoInfo, port: int = 3000) -> tuple[list[str], EnvVars]:
+def _get_serve_cmd(
+    bento: BentoInfo, port: int = 3000, cli_args: typing.Optional[list[str]] = None
+) -> tuple[list[str], EnvVars]:
     cmd = ['bentoml', 'serve', bento.bentoml_tag]
     if port != 3000:
         cmd += ['--port', str(port)]
+
+    # Add CLI arguments if provided
+    if cli_args:
+        for arg in cli_args:
+            cmd += ['--arg', arg]
+
     return cmd, EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})


-def serve(bento: BentoInfo, port: int = 3000) -> None:
+def serve(
+    bento: BentoInfo,
+    port: int = 3000,
+    cli_envs: typing.Optional[list[str]] = None,
+    cli_args: typing.Optional[list[str]] = None,
+) -> None:
     prep_env_vars(bento)
-    cmd, env = _get_serve_cmd(bento, port=port)
+    cmd, env = _get_serve_cmd(bento, port=port, cli_args=cli_args)
+
+    # Add CLI environment variables if provided
+    if cli_envs:
+        for env_var in cli_envs:
+            if '=' in env_var:
+                key, value = env_var.split('=', 1)
+                env[key] = value
+            else:
+                env[env_var] = os.environ.get(env_var, '')
+
     venv = ensure_venv(bento, runtime_envs=env)
     output(f'Access the Chat UI at http://localhost:{port}/chat (or with you IP)')
     run_command(cmd, env=env, cwd=None, venv=venv)


-async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600) -> None:
-    cmd, env = _get_serve_cmd(bento, port)
+async def _run_model(
+    bento: BentoInfo,
+    port: int = 3000,
+    timeout: int = 600,
+    cli_env: typing.Optional[dict[str, typing.Any]] = None,
+    cli_args: typing.Optional[list[str]] = None,
+) -> None:
+    cmd, env = _get_serve_cmd(bento, port, cli_args=cli_args)
+
+    # Merge cli environment variables if provided
+    if cli_env:
+        env.update(cli_env)
+
     venv = ensure_venv(bento, runtime_envs=env)
     async with async_run_command(cmd, env=env, cwd=None, venv=venv, silent=False) as server_proc:
         output(f'Model server started {server_proc.pid}')
@@ -109,9 +141,26 @@ async def _run_model(bento: BentoInfo, port: int = 3000, timeout: int = 600) ->
            except KeyboardInterrupt:
                break
        output('\nStopping model server...', style='green')
-    output('Stopped model server', style='green')
+    output('Stopped model server', style='green')


-def run(bento: BentoInfo, port: int = 3000, timeout: int = 600) -> None:
+def run(
+    bento: BentoInfo,
+    port: int = 3000,
+    timeout: int = 600,
+    cli_envs: typing.Optional[list[str]] = None,
+    cli_args: typing.Optional[list[str]] = None,
+) -> None:
     prep_env_vars(bento)
-    asyncio.run(_run_model(bento, port=port, timeout=timeout))
+
+    # Add CLI environment variables to the process
+    env = {}
+    if cli_envs:
+        for env_var in cli_envs:
+            if '=' in env_var:
+                key, value = env_var.split('=', 1)
+                env[key] = value
+            else:
+                env[env_var] = os.environ.get(env_var, '')
+
+    asyncio.run(_run_model(bento, port=port, timeout=timeout, cli_env=env, cli_args=cli_args))
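For reference, the KEY=VALUE handling that this diff adds to both serve() and run() can be read in isolation. The sketch below is illustrative only (the parse_cli_envs helper is hypothetical, not part of the change) and mirrors what the two loops do: entries containing '=' are split on the first '=', and bare names are looked up in the current process environment, defaulting to an empty string.

import os
import typing


def parse_cli_envs(cli_envs: typing.Optional[list[str]]) -> dict[str, str]:
    # Hypothetical helper mirroring the loops added in serve() and run().
    env: dict[str, str] = {}
    for env_var in cli_envs or []:
        if '=' in env_var:
            # 'KEY=VALUE' -> split on the first '=' only
            key, value = env_var.split('=', 1)
            env[key] = value
        else:
            # bare 'KEY' -> inherit the value from the current process environment
            env[env_var] = os.environ.get(env_var, '')
    return env


if __name__ == '__main__':
    os.environ['HF_TOKEN'] = 'hf_dummy'
    print(parse_cli_envs(['OPENAI_API_KEY=sk-test', 'HF_TOKEN']))
    # -> {'OPENAI_API_KEY': 'sk-test', 'HF_TOKEN': 'hf_dummy'}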