
Commit eee364f

add some debugging to vllm docker (#454)

* add some debugging to vllm docker
* update
* check processes using GPU
* lint

1 parent 6d3489a

File tree: 2 files changed, +56 −3 lines changed


model-engine/model_engine_server/inference/vllm/Dockerfile

Lines changed: 7 additions & 0 deletions

@@ -1,5 +1,12 @@
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
+RUN apt-get update \
+    && apt-get install -y \
+        gdb \
+        psmisc \
+    && apt-get autoremove -y \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN pip uninstall torch -y
 COPY requirements.txt /workspace/requirements.txt
 RUN pip install -r requirements.txt
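
The two packages pulled in here back the debugging hooks added to the server below: psmisc provides fuser, which the startup check shells out to when GPU memory looks unbalanced, and gdb allows native stack traces to be taken from a live server process. As a sketch of how the installed gdb might be used (this helper is hypothetical, not part of the commit, and assumes the container permits ptrace, e.g. via CAP_SYS_PTRACE):

# Hypothetical helper, not part of this commit: use the gdb installed above
# to dump native stack traces from the running vLLM server process.
# Assumes gdb is on PATH and is allowed to attach to the target pid.
import subprocess
import sys


def dump_native_stacks(pid: int) -> str:
    """Attach gdb to `pid` in batch mode and return backtraces for all threads."""
    return subprocess.check_output(
        ["gdb", "-p", str(pid), "-batch", "-ex", "thread apply all bt"],
        stderr=subprocess.STDOUT,
    ).decode("utf-8")


if __name__ == "__main__":
    print(dump_native_stacks(int(sys.argv[1])))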

model-engine/model_engine_server/inference/vllm/vllm_server.py

Lines changed: 49 additions & 3 deletions

@@ -1,5 +1,9 @@
 import argparse
+import code
 import json
+import signal
+import subprocess
+import traceback
 from typing import AsyncGenerator
 
 import uvicorn
@@ -46,9 +50,9 @@ async def stream_results() -> AsyncGenerator[str, None]:
                 "text": request_output.outputs[-1].text[len(last_output_text) :],
                 "count_prompt_tokens": len(request_output.prompt_token_ids),
                 "count_output_tokens": len(request_output.outputs[0].token_ids),
-                "log_probs": request_output.outputs[0].logprobs[-1]
-                if sampling_params.logprobs
-                else None,
+                "log_probs": (
+                    request_output.outputs[0].logprobs[-1] if sampling_params.logprobs else None
+                ),
                 "finished": request_output.finished,
             }
             last_output_text = request_output.outputs[-1].text
@@ -88,7 +92,47 @@ async def abort_request() -> None:
     return Response(content=json.dumps(ret))
 
 
+def get_gpu_free_memory():
+    """Get GPU free memory using nvidia-smi."""
+    try:
+        output = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"]
+        ).decode("utf-8")
+        gpu_memory = [int(x) for x in output.strip().split("\n")]
+        return gpu_memory
+    except subprocess.CalledProcessError:
+        return None
+
+
+def check_unknown_startup_memory_usage():
+    """Check for unknown memory usage at startup."""
+    gpu_free_memory = get_gpu_free_memory()
+    if gpu_free_memory is not None:
+        min_mem = min(gpu_free_memory)
+        max_mem = max(gpu_free_memory)
+        if max_mem - min_mem > 10:
+            print(
+                f"WARNING: Unbalanced GPU memory usage at start up. This may cause OOM. Memory usage per GPU in MB: {gpu_free_memory}."
+            )
+            output = subprocess.check_output(["fuser -v /dev/nvidia*"], shell=True).decode("utf-8")
+            print(f"Processes using GPU: {output}")
+
+
+def debug(sig, frame):
+    """Interrupt the running process and provide a Python prompt for
+    interactive debugging."""
+    d = {"_frame": frame}  # Allow access to the frame object.
+    d.update(frame.f_globals)  # Unless shadowed by a local of the same name.
+    d.update(frame.f_locals)
+
+    i = code.InteractiveConsole(d)
+    message = "Signal received: entering python shell.\nTraceback:\n"
+    message += "".join(traceback.format_stack(frame))
+    i.interact(message)
+
+
 if __name__ == "__main__":
+    check_unknown_startup_memory_usage()
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default=None)  # None == IPv4 / IPv6 dualstack
     parser.add_argument("--port", type=int, default=5005)
@@ -98,6 +142,8 @@ async def abort_request() -> None:
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
+    signal.signal(signal.SIGUSR1, debug)
+
     uvicorn.run(
        app,
        host=args.host,
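
Taken together, the server changes add two debugging entry points: check_unknown_startup_memory_usage() warns at startup when per-GPU free memory differs by more than 10 MB (and runs fuser -v /dev/nvidia* to name the processes holding the GPU devices), and the SIGUSR1 handler drops a running server into an interactive Python shell at the interrupted frame. A minimal sketch of triggering the handler from the same host (the pgrep lookup is illustrative, not part of the commit, and assumes a single vllm_server.py process):

# Hypothetical trigger script, not part of this commit: send SIGUSR1 to the
# server so its `debug` handler opens a code.InteractiveConsole on the
# server's own stdin/stdout.
import os
import signal
import subprocess

pid = int(subprocess.check_output(["pgrep", "-f", "vllm_server.py"]).decode().strip())
os.kill(pid, signal.SIGUSR1)
# The server's terminal now shows the formatted traceback and a >>> prompt;
# the interrupted frame is reachable there as `_frame`, alongside its
# globals and locals.

Because code.InteractiveConsole reads from the server process's stdin, this is most useful when the server was started in an attached terminal (for example via docker run -it or a docker exec session).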
