Skip to content

Commit 4a7fc56

Browse files
authored
Merge pull request #32 from pytorch/add-h100-benchmark
Setup vLLM benchmark CI for H100
2 parents da80873 + 8dcda1b commit 4a7fc56

File tree

3 files changed

+192
-9
lines changed

3 files changed

+192
-9
lines changed

.github/workflows/vllm-benchmark.yml

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
name: vLLM Benchmark

on:
  schedule:
    # Run every 2 hours
    - cron: '0 */2 * * *'
  workflow_dispatch:
    inputs:
      vllm_branch:
        required: true
        type: string
        default: main
      vllm_commit:
        required: false
        type: string
  pull_request:
    paths:
      - .github/workflows/vllm-benchmark.yml

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  benchmark-h100:
    name: Run vLLM benchmarks
    runs-on: linux.aws.h100.4
    environment: pytorch-x-vllm
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm
          ref: ${{ inputs.vllm_branch || 'main' }}
          fetch-depth: 0

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'

      - name: Set GPU device name
        working-directory: vllm-benchmarks
        run: |
          export GPU_DEVICE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          echo "GPU_DEVICE=$GPU_DEVICE" >> $GITHUB_ENV

      - name: Install dependencies
        working-directory: vllm-benchmarks
        run: |
          set -eux
          pip install -r requirements.txt

      - name: Check for last benchmark commit
        working-directory: vllm-benchmarks
        env:
          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
          HEAD_SHA: ${{ inputs.vllm_commit || '' }}
          DOCKER_IMAGE_PREFIX: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
        run: |
          set -eux

          if [[ -z "${HEAD_SHA}" ]]; then
            pushd vllm
            # Looking back the latest 100 commits is enough
            for i in {0..99}
            do
              # Check if the image is there, if it doesn't then check an older one
              # because the commit is too recent
              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}"

              # No Docker image available yet because the commit is too recent
              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
                continue
              fi

              NOT_EXIST=0
              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${GPU_DEVICE}/benchmark_results.json"
              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1

              if [[ ${NOT_EXIST} == "1" ]]; then
                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
                break
              fi
            done
            popd
          fi

          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV

      - name: Setup GPU_FLAG for docker run
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
        run: |
          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"

      - name: Setup benchmark tests
        working-directory: vllm-benchmarks
        run: |
          pushd vllm
          git checkout "${HEAD_SHA}"
          popd

          # Set the list of benchmarks we want to cover in PyTorch infra
          cp -r benchmarks/*.json vllm/.buildkite/nightly-benchmarks/tests

      - name: Run vLLM benchmark
        env:
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${{ env.HEAD_SHA }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
        run: |
          set -x

          docker run \
            ${GPU_FLAG:-} \
            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e GPU_DEVICE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            --ipc=host \
            --tty \
            --security-opt seccomp=unconfined \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}" \
            bash -xc "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"

      - name: Upload the benchmark results
        working-directory: vllm-benchmarks
        env:
          BENCHMARK_RESULTS: vllm/benchmarks/results
        run: |
          set -eux

          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          python upload_benchmark_results.py \
            --vllm vllm \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device "${GPU_DEVICE}"

vllm-benchmarks/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ psutil==7.0.0
44
pynvml==12.0.0
55
boto3==1.36.21
66
awscli==1.37.21
7+
torch==2.7.0

vllm-benchmarks/upload_benchmark_results.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,25 @@ def __call__(
4040

4141
def parse_args() -> Any:
4242
parser = ArgumentParser("Upload vLLM benchmarks results to S3")
43-
parser.add_argument(
43+
vllm_metadata = parser.add_mutually_exclusive_group(required=True)
44+
vllm_metadata.add_argument(
4445
"--vllm",
4546
type=str,
46-
required=True,
4747
action=ValidateDir,
4848
help="the directory that vllm repo is checked out",
4949
)
50+
branch_commit = vllm_metadata.add_argument_group("vLLM branch and commit metadata")
51+
branch_commit.add_argument(
52+
"--head-branch",
53+
type=str,
54+
default="main",
55+
help="the name of the vLLM branch the benchmark runs on",
56+
)
57+
branch_commit.add_argument(
58+
"--head-sha",
59+
type=str,
60+
help="the commit SHA the benchmark runs on",
61+
)
5062
parser.add_argument(
5163
"--benchmark-results",
5264
type=str,
@@ -78,14 +90,19 @@ def parse_args() -> Any:
7890
def get_git_metadata(vllm_dir: str) -> Tuple[str, str, int]:
    """Return (branch name, commit SHA, commit timestamp) for the vLLM checkout.

    Args:
        vllm_dir: path to the directory where the vllm repo is checked out.

    Returns:
        A 3-tuple of the active branch name, the HEAD commit SHA, and the
        HEAD commit's Unix timestamp (``committed_date``).

    Note: the return annotation was ``Tuple[str, str]`` but the function
    returns three values since the committed_date was added — fixed to
    ``Tuple[str, str, int]``.
    """
    repo = Repo(vllm_dir)
    try:
        return (
            repo.active_branch.name,
            repo.head.object.hexsha,
            repo.head.object.committed_date,
        )
    except TypeError:
        # GitPython raises TypeError from repo.active_branch when HEAD is
        # detached (e.g. a CI checkout of a specific SHA); default the
        # branch name to main in that case.
        return "main", repo.head.object.hexsha, repo.head.object.committed_date
85101

86102

87-
def get_benchmark_metadata(head_branch: str, head_sha: str) -> Dict[str, Any]:
88-
timestamp = int(time.time())
103+
def get_benchmark_metadata(
104+
head_branch: str, head_sha: str, timestamp: int
105+
) -> Dict[str, Any]:
89106
return {
90107
"timestamp": timestamp,
91108
"schema_version": "v3",
@@ -104,6 +121,8 @@ def get_runner_info() -> Dict[str, Any]:
104121
name = "rocm"
105122
elif torch.cuda.is_available() and torch.version.cuda:
106123
name = "cuda"
124+
else:
125+
name = "unknown"
107126

108127
return {
109128
"name": name,
@@ -176,7 +195,6 @@ def upload_to_s3(
176195
f"{s3_bucket}",
177196
f"{s3_path}",
178197
).put(
179-
ACL="public-read",
180198
Body=gzip.compress(data.encode()),
181199
ContentEncoding="gzip",
182200
ContentType="application/json",
@@ -186,9 +204,17 @@ def upload_to_s3(
186204
def main() -> None:
187205
args = parse_args()
188206

189-
head_branch, head_sha = get_git_metadata(args.vllm)
207+
if args.vllm:
208+
head_branch, head_sha, timestamp = get_git_metadata(args.vllm)
209+
else:
210+
head_branch, head_sha, timestamp = (
211+
args.head_branch,
212+
args.head_sha,
213+
int(time.time()),
214+
)
215+
190216
# Gather some information about the benchmark
191-
metadata = get_benchmark_metadata(head_branch, head_sha)
217+
metadata = get_benchmark_metadata(head_branch, head_sha, timestamp)
192218
runner = get_runner_info()
193219

194220
# Extract and aggregate the benchmark results

0 commit comments

Comments (0)