diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md
index 03e0f93ca152..838d2209fb10 100644
--- a/docs/dev-docker/README.md
+++ b/docs/dev-docker/README.md
@@ -40,14 +40,14 @@ The table below shows performance data where a local inference client is fed req
 
 | Model | Precision | TP Size | Input | Output | Num Prompts | Max Num Seqs | Throughput (tokens/s) |
 |-------|-----------|---------|-------|--------|-------------|--------------|-----------------------|
-| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 3200 | 3200 | 16364.9 |
-| | | | 128 | 4096 | 1500 | 1500 | 12171.0 |
-| | | | 500 | 2000 | 2000 | 2000 | 13290.4 |
-| | | | 2048 | 2048 | 1500 | 1500 | 8216.5 |
-| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 1500 | 1500 | 4331.6 |
-| | | | 128 | 4096 | 1500 | 1500 | 3409.9 |
-| | | | 500 | 2000 | 2000 | 2000 | 3184.0 |
-| | | | 2048 | 2048 | 500 | 500 | 2154.3 |
+| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 3200 | 3200 | 16896.6 |
+| | | | 128 | 4096 | 1500 | 1500 | 13943.8 |
+| | | | 500 | 2000 | 2000 | 2000 | 13512.8 |
+| | | | 2048 | 2048 | 1500 | 1500 | 8444.5 |
+| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 1500 | 1500 | 4359.9 |
+| | | | 128 | 4096 | 1500 | 1500 | 3430.9 |
+| | | | 500 | 2000 | 2000 | 2000 | 3226.8 |
+| | | | 2048 | 2048 | 500 | 500 | 2228.2 |
 
 *TP stands for Tensor Parallelism.*
 
@@ -57,38 +57,38 @@ The table below shows latency measurement, which typically involves assessing th
 
 | Model | Precision | TP Size | Batch Size | Input | Output | MI300X Latency (sec) |
 |-------|-----------|----------|------------|--------|---------|-------------------|
-| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 17.411 |
-| | | | 2 | 128 | 2048 | 18.750 |
-| | | | 4 | 128 | 2048 | 19.059 |
-| | | | 8 | 128 | 2048 | 20.857 |
-| | | | 16 | 128 | 2048 | 22.670 |
-| | | | 32 | 128 | 2048 | 25.495 |
-| | | | 64 | 128 | 2048 | 34.187 |
-| | | | 128 | 128 | 2048 | 48.754 |
-| | | | 1 | 2048 | 2048 | 17.699 |
-| | | | 2 | 2048 | 2048 | 18.919 |
-| | | | 4 | 2048 | 2048 | 19.220 |
-| | | | 8 | 2048 | 2048 | 21.545 |
-| | | | 16 | 2048 | 2048 | 24.329 |
-| | | | 32 | 2048 | 2048 | 29.461 |
-| | | | 64 | 2048 | 2048 | 40.148 |
-| | | | 128 | 2048 | 2048 | 61.382 |
-| Llama 3.1 405B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 46.601 |
-| | | | 2 | 128 | 2048 | 46.947 |
-| | | | 4 | 128 | 2048 | 48.971 |
-| | | | 8 | 128 | 2048 | 53.021 |
-| | | | 16 | 128 | 2048 | 55.836 |
-| | | | 32 | 128 | 2048 | 64.947 |
-| | | | 64 | 128 | 2048 | 81.408 |
-| | | | 128 | 128 | 2048 | 115.296 |
-| | | | 1 | 2048 | 2048 | 46.998 |
-| | | | 2 | 2048 | 2048 | 47.619 |
-| | | | 4 | 2048 | 2048 | 51.086 |
-| | | | 8 | 2048 | 2048 | 55.706 |
-| | | | 16 | 2048 | 2048 | 61.049 |
-| | | | 32 | 2048 | 2048 | 75.842 |
-| | | | 64 | 2048 | 2048 | 103.074 |
-| | | | 128 | 2048 | 2048 | 157.705 |
+| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 15.427 |
+| | | | 2 | 128 | 2048 | 16.661 |
+| | | | 4 | 128 | 2048 | 17.326 |
+| | | | 8 | 128 | 2048 | 18.679 |
+| | | | 16 | 128 | 2048 | 20.642 |
+| | | | 32 | 128 | 2048 | 23.260 |
+| | | | 64 | 128 | 2048 | 30.498 |
+| | | | 128 | 128 | 2048 | 42.952 |
+| | | | 1 | 2048 | 2048 | 15.677 |
+| | | | 2 | 2048 | 2048 | 16.715 |
+| | | | 4 | 2048 | 2048 | 17.684 |
+| | | | 8 | 2048 | 2048 | 19.444 |
+| | | | 16 | 2048 | 2048 | 22.282 |
+| | | | 32 | 2048 | 2048 | 26.545 |
+| | | | 64 | 2048 | 2048 | 36.651 |
+| | | | 128 | 2048 | 2048 | 55.949 |
+| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 45.294 |
+| | | | 2 | 128 | 2048 | 46.166 |
+| | | | 4 | 128 | 2048 | 47.867 |
+| | | | 8 | 128 | 2048 | 51.065 |
+| | | | 16 | 128 | 2048 | 54.304 |
+| | | | 32 | 128 | 2048 | 63.078 |
+| | | | 64 | 128 | 2048 | 81.906 |
+| | | | 128 | 128 | 2048 | 108.097 |
+| | | | 1 | 2048 | 2048 | 46.003 |
+| | | | 2 | 2048 | 2048 | 46.596 |
+| | | | 4 | 2048 | 2048 | 49.273 |
+| | | | 8 | 2048 | 2048 | 53.762 |
+| | | | 16 | 2048 | 2048 | 59.629 |
+| | | | 32 | 2048 | 2048 | 73.753 |
+| | | | 64 | 2048 | 2048 | 103.530 |
+| | | | 128 | 2048 | 2048 | 151.785 |
 
 *TP stands for Tensor Parallelism.*
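+
+The numbers above can be reproduced with the benchmark scripts in the vLLM source tree. Exact script paths and flag names vary between vLLM releases, so treat the following as a sketch of how the first row of each table maps onto a command line:
+
+```bash
+# Sketch only: assumes the upstream vLLM benchmark scripts under benchmarks/;
+# flag names may differ between releases.
+
+# Throughput, first row of the throughput table above.
+python3 benchmarks/benchmark_throughput.py \
+    --model amd/Llama-3.1-70B-Instruct-FP8-KV \
+    --tensor-parallel-size 8 \
+    --input-len 128 \
+    --output-len 2048 \
+    --num-prompts 3200 \
+    --max-num-seqs 3200
+
+# Latency, first row of the latency table above (batch size 1).
+python3 benchmarks/benchmark_latency.py \
+    --model amd/Llama-3.1-70B-Instruct-FP8-KV \
+    --tensor-parallel-size 8 \
+    --batch-size 1 \
+    --input-len 128 \
+    --output-len 2048
+```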
@@ -490,23 +490,15 @@ To reproduce the release docker:
 
 ```bash
     git clone https://github.com/ROCm/vllm.git
    cd vllm
-    git checkout b8498bc4a1c2aae1e25cfc780db0eadbc4716c67
-    docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 .
-```
-
-### Building AITER Image
-
-Use AITER release candidate branch instead:
-
-```bash
-    git clone https://github.com/ROCm/vllm.git
-    cd vllm
-    git checkout aiter_integration_final
+    git checkout c43debd43c4d8a7e4fdeff4c069c5970e5e701c0
-    docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 .
+    docker build -f docker/Dockerfile.rocm -t vllm-rocm --build-arg USE_CYTHON=1 .  # tag name is arbitrary
 ```
+
+A sample `docker run` invocation for the resulting image is shown after the changelog below.
 
 ## Changelog
 
+20250415:
+- Updated hipBLASLt version to 0.15
+
 20250410_aiter:
 - 2-stage MoE
 - MLA from AITER
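+
+To launch the image built in the reproduction steps above, a typical ROCm container invocation looks like the following. This is a sketch rather than the project's documented command: device flags and mounts may need adjusting for your system, and `vllm-rocm` is whatever tag you chose at build time.
+
+```bash
+docker run -it --rm \
+    --network=host \
+    --ipc=host \
+    --group-add video \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --device /dev/kfd \
+    --device /dev/dri \
+    -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
+    vllm-rocm
+```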