
Commit b5b9192

eep basic
1 parent 7c12a76 commit b5b9192

22 files changed, +1544 -65 lines

experimental/bench.sh

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
#!/bin/bash

MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite-Chat"
HOST="localhost"
PORT=8006

vllm bench serve \
    --model $MODEL_NAME \
    --host $HOST \
    --port $PORT \
    --num-prompts 5
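
bench.sh assumes a vLLM server is already listening on localhost:8006, e.g. one started by experimental/serve_deepseek_v2.sh below. A minimal pre-flight check (a sketch, not part of this commit; it relies on the /health route exposed by the vLLM API server):

import requests

# Probe the benchmark target before running bench.sh; any HTTP 200 means
# the server is up and serving.
try:
    requests.get("http://localhost:8006/health", timeout=5).raise_for_status()
    print("server is up; safe to run experimental/bench.sh")
except requests.RequestException as exc:
    print(f"server not reachable: {exc}")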

experimental/nvshmem.patch

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001
From: Yongji Wu <wuyongji317@gmail.com>
Date: Tue, 20 May 2025 13:41:12 -0700
Subject: [PATCH] fix reinit issues due to states not cleaned up

fix double free
---
 src/host/init/init.cu                        | 10 ++++++++++
 .../internal/host/nvshmemi_mem_transport.hpp | 15 +++++++++++++++
 src/modules/bootstrap/uid/bootstrap_uid.cpp  |  5 +++++
 3 files changed, 30 insertions(+)

diff --git a/src/host/init/init.cu b/src/host/init/init.cu
index b1c5dbf..1fecb4b 100644
--- a/src/host/init/init.cu
+++ b/src/host/init/init.cu
@@ -43,6 +43,8 @@
 #include "internal/host/nvshmemi_types.h"
 #include "internal/host/shared_memory.h"
 #include "internal/host/nvshmemi_symmetric_heap.hpp"
+// eep-dev
+#include "internal/host/nvshmemi_mem_transport.hpp"
 
 extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d;
 static std::map<void *, int> registered_device_states;
@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) {
         /* Multi-init Multi-fini*/
         nvshmemi_state = NULL;
         nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0;
+
+        // eep-dev
+        nvshmemi_mem_p2p_transport::destroy_instance();
+        nvshmemi_mem_remote_transport::destroy_instance();
+        free(nvshmemi_default_session);
+        nvshmemi_default_session = nullptr;
+        nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false;
+
         nvshmemi_is_device_state_ready = false;
     } else
         nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle);
diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp
index 2495844..e4f408a 100644
--- a/src/include/internal/host/nvshmemi_mem_transport.hpp
+++ b/src/include/internal/host/nvshmemi_mem_transport.hpp
@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final {
             return p2p_objref_;
         }
     }
+    // eep-dev
+    static void destroy_instance(void) {
+        if (p2p_objref_ != nullptr) {
+            delete p2p_objref_;
+            p2p_objref_ = nullptr;
+        }
+    }
 
     void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj);
 
@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final {
         }
     }
 
+    // eep-dev
+    static void destroy_instance(void) {
+        if (remote_objref_ != nullptr) {
+            delete remote_objref_;
+            remote_objref_ = nullptr;
+        }
+    }
+
     int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size);
     /* On-demand registration and release of memory */
     int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx,
diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp
index a1fa748..788fa96 100644
--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp
+++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp
@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi
     // Discover the network for bootstrap, if not done previously.
     // This code needs to be stateful to be able to be called multiple times by the caller
     BOOTSTRAP_CHECK(bootstrap_net_init());
+    // eep-dev
+    if (handle->pre_init_ops != nullptr) {
+        BOOTSTRAP_PTR_FREE(handle->pre_init_ops);
+        handle->pre_init_ops = nullptr;
+    }
     if (handle->pre_init_ops == nullptr) {
         BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1);
         handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id;
--
2.43.0
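
In short, the patch makes NVSHMEM re-initializable within a single process: nvshmemid_hostlib_finalize() now tears down the p2p and remote memory-transport singletons, frees nvshmemi_default_session, and clears the bootstrapped flag, while the UID bootstrap plugin frees any stale pre_init_ops before reallocating it on re-init.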

experimental/serve_deepseek_v2.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#!/bin/bash

# Serve DeepSeek V2 model with vLLM
# This script demonstrates how to serve the DeepSeek V2 model using vLLM's V1 engine

# MODEL_NAME="gaunernst/DeepSeek-V2-Lite-Chat-FP8"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite-Chat"
HOST="0.0.0.0"
PORT=8006

DATA_PARALLEL_SIZE=3
DATA_PARALLEL_SIZE_LOCAL=$DATA_PARALLEL_SIZE

export VLLM_USE_V1=1
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1

# Launch the vLLM server
vllm serve $MODEL_NAME --trust-remote-code \
    --disable-log-requests \
    --host $HOST \
    --port $PORT \
    --tensor-parallel-size 1 \
    --enable-expert-parallel \
    --enable-eplb \
    --num-redundant-experts 32 \
    --enforce-eager \
    --data-parallel-backend ray \
    --data-parallel-size $DATA_PARALLEL_SIZE \
    --data-parallel-size-local $DATA_PARALLEL_SIZE_LOCAL \
    --data-parallel-start-rank 0
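
Once the server reports ready, a quick smoke test against the OpenAI-compatible chat endpoint (a sketch, not part of this commit; host, port, and model name assumed to match the script above):

import requests

# One short chat completion against the server launched by serve_deepseek_v2.sh.
resp = requests.post(
    "http://localhost:8006/v1/chat/completions",
    json={
        "model": "deepseek-ai/DeepSeek-V2-Lite-Chat",
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "max_tokens": 8,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])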

experimental/test_scale.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
import sys

import requests


def test_scale(host, port, new_dp_size):
    url = f"http://{host}:{port}/scale"
    payload = {"new_data_parallel_size": new_dp_size}
    headers = {"Content-Type": "application/json"}

    print(f"Sending scale request to {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        response = requests.post(url,
                                 json=payload,
                                 headers=headers,
                                 timeout=300)

        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.text}")

        if response.status_code == 200:
            print("Scale up/down request successful!")
            return True
        else:
            print("Scale up/down request failed!")
            return False

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(
        description="Test scale up/down functionality")
    parser.add_argument("--host", default="localhost", help="API server host")
    parser.add_argument("--port",
                        type=int,
                        default=8006,
                        help="API server port")
    parser.add_argument("--new_dp_size",
                        type=int,
                        default=2,
                        help="New data parallel size")

    args = parser.parse_args()

    success = test_scale(args.host, args.port, args.new_dp_size)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
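
Example invocation, assuming the serve script above is running with DATA_PARALLEL_SIZE=3: python experimental/test_scale.py --port 8006 --new_dp_size 2 scales down to two data-parallel ranks, and --new_dp_size 4 scales up; the script exits 0 only if the /scale endpoint returns HTTP 200.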

experimental/test_stateless_pg.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch.multiprocessing import spawn

from vllm.distributed.utils import (
    stateless_destroy_torch_distributed_process_group,
    stateless_init_torch_distributed_process_group)


def worker_process(rank: int, world_size: int, host: str, port1: int,
                   port2: int):
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Create first process group with all workers
    pg1 = stateless_init_torch_distributed_process_group(host=host,
                                                         port=port1,
                                                         rank=rank,
                                                         world_size=world_size,
                                                         backend="gloo")

    # Create second process group with world_size - 1 workers
    # (excluding the last rank)
    pg2 = None
    if rank < world_size - 1:
        pg2 = stateless_init_torch_distributed_process_group(
            host=host,
            port=port2,
            rank=rank,
            world_size=world_size - 1,
            backend="gloo")

    # Test both groups work simultaneously
    tensor1 = torch.tensor([rank], dtype=torch.float32)
    torch.distributed.all_reduce(tensor1, group=pg1)
    expected1 = sum(range(world_size))
    assert tensor1.item(
    ) == expected1, f"PG1 failed: got {tensor1.item()}, expected {expected1}"
    print(f"Rank {rank}: PG1 all_reduce passed")

    if pg2 is not None:
        tensor2 = torch.tensor([rank], dtype=torch.float32)
        torch.distributed.all_reduce(tensor2, group=pg2)
        expected2 = sum(range(world_size - 1))
        assert tensor2.item() == expected2, (
            f"PG2 failed: got {tensor2.item()}, expected {expected2}")
        print(f"Rank {rank}: PG2 all_reduce passed")

    # Destroy first process group
    stateless_destroy_torch_distributed_process_group(pg1)
    print(f"Rank {rank}: PG1 destroyed")

    # Last rank exits here
    if rank == world_size - 1:
        print(f"Rank {rank}: Exiting")
        return

    # Test second group still works after destroying
    # first group and last rank exit
    tensor3 = torch.tensor([rank * 10], dtype=torch.float32)
    torch.distributed.all_reduce(tensor3, group=pg2)
    expected3 = sum(i * 10 for i in range(world_size - 1))
    assert tensor3.item() == expected3, (
        f"PG2 after PG1 destroy failed: got {tensor3.item()}, "
        f"expected {expected3}")
    print(f"Rank {rank}: PG2 after PG1 destroy passed")

    # Clean up
    if pg2 is not None:
        stateless_destroy_torch_distributed_process_group(pg2)
        print(f"Rank {rank}: PG2 destroyed")


def test_stateless_process_groups():
    assert not torch.distributed.is_initialized(
    ), "torch.distributed should not be initialized"

    world_size = 4
    host = "127.0.0.1"
    port1 = 29600
    port2 = 29601

    print(f"Testing stateless process groups with world_size={world_size}")

    spawn(worker_process,
          args=(world_size, host, port1, port2),
          nprocs=world_size,
          join=True)

    print("Test completed successfully!")


if __name__ == "__main__":
    test_stateless_process_groups()
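
Note that the test spawns world_size = 4 local processes and runs the collectives over the gloo backend, but worker_process still calls torch.cuda.set_device(rank % torch.cuda.device_count()) up front, so at least one visible CUDA device is required. It can be run directly with python experimental/test_stateless_pg.py or collected by pytest.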

vllm/config.py

Lines changed: 14 additions & 0 deletions
@@ -1954,6 +1954,20 @@ def has_unfinished_dp(dp_group: "ProcessGroup",
         aggregated_has_unfinished = bool(tensor.item())
         return aggregated_has_unfinished
 
+    # eep-dev
+    @staticmethod
+    def sync_kv_cache_memory(dp_group: "ProcessGroup",
+                             kv_cache_memory: int) -> int:
+        if kv_cache_memory == -1:
+            kv_cache_memory = torch.iinfo(torch.int64).max
+        tensor = torch.tensor([kv_cache_memory],
+                              dtype=torch.int64,
+                              device="cpu")
+        # we cannot use broadcast for stateless dp group since it depends
+        # on global rank
+        torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
+        return tensor.item()
+
     def compute_hash(self):
         """
         Provide a hash that uniquely identifies all the configs
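
The new helper reduces each data-parallel rank's available KV-cache memory with a MIN all-reduce, mapping the sentinel value -1 (no local measurement) to the int64 maximum so it cannot win the minimum. A standalone sketch of that convention (illustrative only, not vLLM code):

import torch

INT64_MAX = torch.iinfo(torch.int64).max

def contribution(kv_cache_memory: int) -> int:
    # -1 means "not measured on this rank"; it must not drag the MIN down.
    return INT64_MAX if kv_cache_memory == -1 else kv_cache_memory

# Three hypothetical DP ranks; one has not measured its KV-cache budget yet.
local_budgets = [8 * 2**30, -1, 6 * 2**30]
synced = min(contribution(b) for b in local_budgets)
assert synced == 6 * 2**30  # the tightest measured budget binds every rank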
