@@ -1,5 +1,9 @@
 import argparse
+import code
 import json
+import signal
+import subprocess
+import traceback
 from typing import AsyncGenerator
 
 import uvicorn
@@ -46,9 +50,9 @@ async def stream_results() -> AsyncGenerator[str, None]:
                 "text": request_output.outputs[-1].text[len(last_output_text) :],
                 "count_prompt_tokens": len(request_output.prompt_token_ids),
                 "count_output_tokens": len(request_output.outputs[0].token_ids),
-                "log_probs": request_output.outputs[0].logprobs[-1]
-                if sampling_params.logprobs
-                else None,
+                "log_probs": (
+                    request_output.outputs[0].logprobs[-1] if sampling_params.logprobs else None
+                ),
                 "finished": request_output.finished,
             }
             last_output_text = request_output.outputs[-1].text
@@ -88,7 +92,47 @@ async def abort_request() -> None:
     return Response(content=json.dumps(ret))
 
 
+def get_gpu_free_memory():
+    """Get GPU free memory using nvidia-smi."""
+    try:
+        output = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"]
+        ).decode("utf-8")
+        gpu_memory = [int(x) for x in output.strip().split("\n")]
+        return gpu_memory
+    except subprocess.CalledProcessError:
+        return None
+
+
+def check_unknown_startup_memory_usage():
+    """Check for unknown memory usage at startup."""
+    gpu_free_memory = get_gpu_free_memory()
+    if gpu_free_memory is not None:
+        min_mem = min(gpu_free_memory)
+        max_mem = max(gpu_free_memory)
+        if max_mem - min_mem > 10:
+            print(
+                f"WARNING: Unbalanced GPU memory usage at start up. This may cause OOM. Memory usage per GPU in MB: {gpu_free_memory}."
+            )
+            output = subprocess.check_output(["fuser -v /dev/nvidia*"], shell=True).decode("utf-8")
+            print(f"Processes using GPU: {output}")
+
+
+def debug(sig, frame):
+    """Interrupt running process, and provide a python prompt for
+    interactive debugging."""
+    d = {"_frame": frame}  # Allow access to frame object.
+    d.update(frame.f_globals)  # Unless shadowed by global
+    d.update(frame.f_locals)
+
+    i = code.InteractiveConsole(d)
+    message = "Signal received : entering python shell.\nTraceback:\n"
+    message += "".join(traceback.format_stack(frame))
+    i.interact(message)
+
+
 if __name__ == "__main__":
+    check_unknown_startup_memory_usage()
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default=None)  # None == IPv4 / IPv6 dualstack
     parser.add_argument("--port", type=int, default=5005)
@@ -98,6 +142,8 @@ async def abort_request() -> None:
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
+    signal.signal(signal.SIGUSR1, debug)
+
     uvicorn.run(
         app,
         host=args.host,
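
Note on the new SIGUSR1 hook: once `signal.signal(signal.SIGUSR1, debug)` is registered, sending SIGUSR1 to the server process interrupts it, prints the current traceback, and opens a `code.InteractiveConsole` over the interrupted frame's globals and locals. A minimal sketch of triggering it from a separate Python process on the same host (the PID below is a placeholder, and this assumes the server was started in a foreground terminal so the console can use its stdin/stdout):

    import os
    import signal

    server_pid = 12345  # placeholder: PID of the running server process
    os.kill(server_pid, signal.SIGUSR1)  # debug() opens an interactive shell in the server

Exiting that console (Ctrl-D) returns control to the interrupted server.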