zihugithub
diff --git a/examples/cloud/conf/serve.yaml b/examples/cloud/conf/serve.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/examples/cloud/conf/serve/cloud_model.yaml b/examples/cloud/conf/serve/cloud_model.yaml
Lines changed: 10 additions & 0 deletions
diff --git a/examples/deepseek_v3/conf/serve_auto_tuner.yaml b/examples/deepseek_v3/conf/serve_auto_tuner.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/examples/qwen2_5/conf/serve.yaml b/examples/qwen2_5/conf/serve.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/examples/qwen2_5/conf/serve_auto_tuner.yaml b/examples/qwen2_5/conf/serve_auto_tuner.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/examples/qwen2_5/conf/serve_disagg_xpyd.yaml b/examples/qwen2_5/conf/serve_disagg_xpyd.yaml
Lines changed: 9 additions & 9 deletions
diff --git a/examples/qwen2_5/conf/serve_multiple_instance.yaml b/examples/qwen2_5/conf/serve_multiple_instance.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/examples/qwen3/conf/serve.yaml b/examples/qwen3/conf/serve.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/examples/qwen3/conf/serve_atmb.yaml b/examples/qwen3/conf/serve_atmb.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/examples/robobrain/conf/serve.yaml b/examples/robobrain/conf/serve.yaml
Lines changed: 3 additions & 3 deletions
@@ -0,0 +1,29 @@
+defaults:
+- _self_
+- serve: cloud_model
+
+experiment:
+  exp_name: cloud
+  exp_dir: outputs/${experiment.exp_name}
+  task:
+    type: serve
+  runner:
+    type: cloud
+    hostfile: ${oc.env:HOSTFILE, /etc/hostfile}
+    master_addr: ${oc.env:MASTER_ADDR, 127.0.0.1}
+    master_port: 7396
+    device_type: ${oc.env:DEVICE_TYPE, gpu}
+    nproc_per_node: ${oc.env:AIRS_ACCELERATOR_NUM, 1}
+    deploy:
+      use_fs_serve: false
+  envs:
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
+
+action: run
+
+hydra:
+  run:
+    dir: ${experiment.exp_dir}/hydra
@@ -0,0 +1,10 @@
+- serve_id: vllm_model
+  engine: vllm
+  engine_args:
+    model: null
+    port: 8000
+    host: 0.0.0.0
+    served_model_name: cloud
+    tensor_parallel_size: 1
+    pipeline_parallel_size: 1
+    max_model_len: 32768
@@ -7,13 +7,13 @@ experiment:
   exp_dir: ./outputs/${experiment.exp_name}
   task:
     type: serve
-  deploy:
-    use_fs_serve: false
   runner:
     nnodes: 2
     nproc_per_node: 8
     hostfile: examples/deepseek/conf/hostfile.txt
     docker: ds
+    deploy:
+      use_fs_serve: false
   auto_tuner:
     space:
       tensor_model_parallel_size: [4, 8]
 
@@ -7,10 +7,10 @@ experiment:
   exp_dir: outputs/${experiment.exp_name}
   task:
     type: serve
-  deploy:
-    use_fs_serve: false
   runner:
     hostfile: null
+    deploy:
+      use_fs_serve: false
   envs:
     CUDA_VISIBLE_DEVICES: 0
     CUDA_DEVICE_MAX_CONNECTIONS: 1
 
@@ -7,12 +7,12 @@ experiment:
   exp_dir: ./outputs
   task:
     type: serve
-  deploy:
-    port: 6701
-    use_fs_serve: false
   runner:
     nnodes: 1
     nproc_per_node: 4
+    deploy:
+      port: 6701
+      use_fs_serve: false
   auto_tuner:
     space:
       tensor_model_parallel_size: "auto"
 
@@ -7,18 +7,18 @@ experiment:
   exp_dir: outputs/${experiment.exp_name}
   task:
     type: serve
-  deploy:
-    port: 10001
-    use_fs_serve: false
-    prefill_decode_disaggregation: true
-    prefill_num: 2
-    prefill_address: x.x.x.x # optional, default "auto"
-    decode_num: 2
-    decode_address: x.x.x.x # optional, default "auto"
-    prefill_decode_strategy: slo # optional, one of [slo|random|robin], default slo
   runner:
     hostfile: examples/qwen/conf/hostfile.txt
     docker: fr-v2
+    deploy:
+      port: 10001
+      use_fs_serve: false
+      prefill_decode_disaggregation: true
+      prefill_num: 2
+      prefill_address: x.x.x.x # optional, default "auto"
+      decode_num: 2
+      decode_address: x.x.x.x # optional, default "auto"
+      prefill_decode_strategy: slo # optional, one of [slo|random|robin], default slo
   envs:
     CUDA_DEVICE_MAX_CONNECTIONS: 1
     VLLM_USE_V1: 0
 
@@ -8,15 +8,15 @@ experiment:
   task:
     type: serve
     entrypoint: null
-  deploy:
-    port: 6701
-    use_fs_serve: true
   runner:
     hostfile: null # /path/to/hostfile.txt
     docker: ds
     ssh_port: 22
     nnodes: 2
     nproc_per_node: 8
+    deploy:
+      port: 6701
+      use_fs_serve: true
   auto_tuner:
     space:
       tensor_model_parallel_size: [2,4]
 
@@ -7,10 +7,10 @@ experiment:
   exp_dir: outputs/${experiment.exp_name}
   task:
     type: serve
-  deploy:
-    use_fs_serve: false
   runner:
     hostfile: null
+    deploy:
+      use_fs_serve: false
   envs:
     CUDA_VISIBLE_DEVICES: 0
     CUDA_DEVICE_MAX_CONNECTIONS: 1
 
@@ -8,12 +8,12 @@ experiment:
   exp_dir: outputs/${experiment.exp_name}
   task:
     type: serve
-  deploy:
-    port: 6701
-    use_fs_serve: false
   runner:
     nnodes: 1
     nproc_per_node: 4
+    deploy:
+      port: 6701
+      use_fs_serve: false
   envs:
     CUDA_VISIBLE_DEVICES: 0,1,2,3
     CUDA_DEVICE_MAX_CONNECTIONS: 1
 
@@ -8,13 +8,13 @@ experiment:
   task:
     type: serve
     entrypoint: null
-  deploy:
-    port: 6701
-    use_fs_serve: true
   runner:
     hostfile: null #examples/robobrain/conf/hostfile.txt
     docker: robobrain
     ssh_port: 22
+    deploy:
+      port: 6701
+      use_fs_serve: true
 
   cmds:
     before_start: export RAY_DEDUP_LOGS=0