 1 |  1 | # SPDX-License-Identifier: Apache-2.0
 2 |  2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 3 |  3 | """
 4 |    | -Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
 5 |    | -See more details at:
 6 |    | -https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
 7 |    | -And see Ray Serve LLM documentation at:
 8 |    | -https://docs.ray.io/en/latest/serve/llm/serving-llms.html
   |  4 | +Deploy DeepSeek R1 or V3 with Ray Serve LLM.
   |  5 | +
   |  6 | +Ray Serve LLM is a scalable and production-grade model serving library built
   |  7 | +on the Ray distributed computing framework, with first-class support for the vLLM engine.
   |  8 | +
   |  9 | +Key features:
   | 10 | +- Automatic scaling, back-pressure, and load balancing across a Ray cluster.
   | 11 | +- Unified multi-node, multi-model deployment.
   | 12 | +- An OpenAI-compatible HTTP API.
   | 13 | +- Multi-LoRA support with shared base models.
 9 | 14 |
10 |    | -Run `python3 ray_serve_deepseek.py` to deploy the model.
   | 15 | +Run `python3 ray_serve_deepseek.py` to launch an endpoint.
   | 16 | +
   | 17 | +Learn more in the official Ray Serve LLM documentation:
   | 18 | +https://docs.ray.io/en/latest/serve/llm/serving-llms.html
11 | 19 | """
12 | 20 |
13 | 21 | from ray import serve

16 | 24 | llm_config = LLMConfig(
17 | 25 |     model_loading_config={
18 | 26 |         "model_id": "deepseek",
19 |    | -        # Since DeepSeek model is huge, it is recommended to pre-download
20 |    | -        # the model to local disk, say /path/to/the/model and specify:
21 |    | -        # model_source="/path/to/the/model"
   | 27 | +        # Pre-downloading the model to local storage is recommended since
   | 28 | +        # the model is large. Set model_source="/path/to/the/model".
22 | 29 |         "model_source": "deepseek-ai/DeepSeek-R1",
23 | 30 |     },
24 | 31 |     deployment_config={

27 | 34 |             "max_replicas": 1,
28 | 35 |         }
29 | 36 |     },
30 |    | -    # Change to the accelerator type of the node
   | 37 | +    # Set to the node's accelerator type.
31 | 38 |     accelerator_type="H100",
32 | 39 |     runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
33 |    | -    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
   | 40 | +    # Customize engine arguments as required (for example, vLLM engine kwargs).
34 | 41 |     engine_kwargs={
35 | 42 |         "tensor_parallel_size": 8,
36 | 43 |         "pipeline_parallel_size": 2,

44 | 51 |     },
45 | 52 | )
46 | 53 |
47 |    | -# Deploy the application
   | 54 | +# Deploy the application.
48 | 55 | llm_app = build_openai_app({"llm_configs": [llm_config]})
49 | 56 | serve.run(llm_app)
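
The new comment above recommends pre-downloading the weights to local storage and then pointing model_source at that path. As a minimal sketch, one way to do this is with huggingface_hub's snapshot_download; the target directory below is the same placeholder path used in the comment, not a real location:

    from huggingface_hub import snapshot_download

    # Download the checkpoint to local disk once, then set
    # model_source="/path/to/the/model" in model_loading_config above.
    snapshot_download(
        repo_id="deepseek-ai/DeepSeek-R1",
        local_dir="/path/to/the/model",
    )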
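
Once serve.run(llm_app) reports the application as ready, the deployment can be queried through its OpenAI-compatible HTTP API. A minimal sketch with the openai client, assuming the default Ray Serve HTTP address (http://localhost:8000), the OpenAI routes under /v1, and the "deepseek" model_id configured above; the openai client requires an api_key argument, so a placeholder value is used here:

    from openai import OpenAI

    # Ray Serve listens on port 8000 by default; adjust base_url if the cluster
    # is configured differently.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

    response = client.chat.completions.create(
        model="deepseek",
        messages=[{"role": "user", "content": "Explain pipeline parallelism in one sentence."}],
    )
    print(response.choices[0].message.content)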