-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
[WIP][Hardware][CPU] testing branch for mlperf #20473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
bigPYJ1151
wants to merge
19
commits into
vllm-project:main
Choose a base branch
from
bigPYJ1151:whisper
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
c533c6f
[Misc] Reduce logs on startup (#18649)
DarkLight1337 0e615db
downgrade to 2.6
bigPYJ1151 9d7dede
tp=3
bigPYJ1151 01a5fe2
torch.compile
bigPYJ1151 2408210
fix compressed-tensor
bigPYJ1151 05719df
cpu v1
bigPYJ1151 1216cf5
opt shm
bigPYJ1151 592c599
sgl-kernel
bigPYJ1151 0a09047
fix v1 compile
bigPYJ1151 0c87720
opt memcpy
bigPYJ1151 b4f0618
disable mla
bigPYJ1151 a0b2529
sgl kernel opt for prefill
bigPYJ1151 15bac8b
torch2.7
bigPYJ1151 8a52e6c
enable V1 fp8 kv
bigPYJ1151 eb6d401
whisper torch compile
bigPYJ1151 058eca2
optimize whisper cross attn with varlen
bigPYJ1151 4e75814
refine attn kernel dispatch for whisper
bigPYJ1151 6486c04
Remove kv_cache_dtype assert for quantized model
tianmu-li a4825a0
Revert "optimize whisper cross attn with varlen" (#218)
tianmu-li File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
#pragma once | ||
|
||
#include <ATen/ATen.h> | ||
#include <ATen/Parallel.h> | ||
#include <ATen/record_function.h> | ||
|
||
#if defined(_OPENMP) | ||
#include <omp.h> | ||
#endif | ||
|
||
namespace { | ||
|
||
// dispatch bool: instantiates the callable once per constexpr bool value,
// so BOOL_NAME can be used as a compile-time constant inside __VA_ARGS__.
#define AT_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \
  [&] { \
    if (BOOL_V) { \
      constexpr bool BOOL_NAME = true; \
      return __VA_ARGS__(); \
    } else { \
      constexpr bool BOOL_NAME = false; \
      return __VA_ARGS__(); \
    } \
  }()

// dispatch on the packed (reduced-precision / quantized) element type:
// bfloat16, float16, int8_t, fp8_e4m3. Inside __VA_ARGS__ the selected
// C++ type is available as `packed_t`.
#define CPU_DISPATCH_PACKED_TYPES(TYPE, ...) \
  [&] { \
    switch (TYPE) { \
      case at::ScalarType::BFloat16: { \
        using packed_t = at::BFloat16; \
        return __VA_ARGS__(); \
      } \
      case at::ScalarType::Half: { \
        using packed_t = at::Half; \
        return __VA_ARGS__(); \
      } \
      case at::ScalarType::Char: { \
        using packed_t = int8_t; \
        return __VA_ARGS__(); \
      } \
      case at::ScalarType::Float8_e4m3fn: { \
        using packed_t = at::Float8_e4m3fn; \
        return __VA_ARGS__(); \
      } \
      default: \
        /* The old message said "Unsupported floating data type." -- wrong */ \
        /* (int8_t is not floating) and it hid which dtype actually failed. */ \
        TORCH_CHECK(false, "CPU_DISPATCH_PACKED_TYPES: unsupported dtype ", TYPE); \
    } \
  }()
|
||
// Mark a deliberately-unused variable/parameter without compiler warnings.
#define UNUSED(x) (void)(x)

// Tensor precondition checks. Each raises via TORCH_CHECK with the
// offending tensor's source name (#x) in the message.
#define CHECK_CPU(x) TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")

#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Only the innermost dimension must be dense (stride 1); outer strides may
// be arbitrary (e.g. a row-sliced view is acceptable).
// Fix: the old message had no space after #x ("xmust be ...") and
// misspelled "dimension".
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x " must be contiguous at the last dimension")

#define CHECK_INPUT(x) \
  CHECK_CPU(x);        \
  CHECK_CONTIGUOUS(x)
#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
  CHECK_CPU(x);                            \
  CHECK_LAST_DIM_CONTIGUOUS(x)

#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")

#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)

// parallel routines
// Minimum per-thread work chunk for the parallel helpers below.
constexpr int GRAIN_SIZE = 1024;
|
||
// Integer ceiling division for integral types: smallest q with q * y >= x
// (for positive x, y).
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }

// Partition n work items across nth workers; worker ith's contiguous slice
// is written to [n_start, n_end).
//
// Uses the PyTorch/ATen partition pattern: every worker is assigned
// ceil(n / nth) items, with the final slice clamped to n.
// NOTE: when ith * ceil(n / nth) >= n the resulting slice is empty
// (n_start >= n_end) -- callers must tolerate empty ranges.
//
// (A previously disabled `#if 0` oneDNN-style partition was removed as
// dead code; it never compiled.)
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
  T n_my = div_up(n, nth);
  n_start = ith * n_my;
  n_end = std::min(n_start + n_my, n);
}
|
||
// Run f over [0, n): with OpenMP each thread receives its own contiguous
// [begin, end) slice computed by balance211; without OpenMP the entire
// range is handled serially on the calling thread.
template <typename func_t>
inline void parallel_for(int n, const func_t& f) {
#if defined(_OPENMP)
  #pragma omp parallel
  {
    int num_threads = omp_get_num_threads();
    int thread_id = omp_get_thread_num();
    int begin, end;
    balance211(n, num_threads, thread_id, begin, end);
    f(begin, end);
  }
#else
  // Single serial chunk covering the full range.
  f(0, n);
#endif
}
|
||
// for 1d parallel, use `actual_nth` | ||
// for 2d parallel, use even nths, e.g. 43->42 | ||
int inline adjust_num_threads(int m) { | ||
int actual_nth = at::get_num_threads(); | ||
if (m == 1) { | ||
return actual_nth; | ||
} | ||
return std::max(1, (actual_nth >> 1) * 2); | ||
} | ||
|
||
// Parallelize an m x n iteration space over a 2d grid of OpenMP threads.
// Each thread gets one rectangular tile and is called as
// f(begin_m, end_m, begin_n, end_n). Without OpenMP, f is invoked once
// with the whole (0, m, 0, n) range.
template <typename func_t>
inline void parallel_2d(int m, int n, const func_t& f) {

  // make sure we have even num_threads
  int nth = adjust_num_threads(m);

  // [NOTE] thread blocking:
  //
  //   1) prefer square block per thread
  //   2) use even number of CPU cores
  //   3) use all `num_threads` cores
  //
  //   we have:
  //     TM * TN = T
  //     BM / TM = BN / TN
  //   then:
  //     TM = ((BM / BN) * T) ^ 0.5
  //
  float r = float(m) / n;
  int nth_m = std::ceil(std::sqrt(r * nth));
  int nth_n = 1;
  // Walk nth_m down until it divides nth evenly; the loop always
  // terminates by nth_m == 1 (where nth_n == nth), so the resulting
  // nth_m x nth_n grid uses all nth threads.
  for (; nth_m > 0; --nth_m) {
    nth_n = nth / nth_m;
    if (nth_m * nth_n == nth) {
      break;
    }
  }

#if defined(_OPENMP)
  #pragma omp parallel num_threads(nth)
  {
    // Map the linear thread id onto the (nth_m x nth_n) grid.
    int ith = omp_get_thread_num();
    int ith_m = ith / nth_n;
    int ith_n = ith % nth_n;

    // Per-thread tile extents (ceiling division).
    int thread_block_m = div_up(m, nth_m);
    int thread_block_n = div_up(n, nth_n);

    // NOTE(review): trailing threads can get begin >= end (an empty
    // tile) when the grid over-covers m or n -- presumably f is expected
    // to tolerate empty ranges; verify against callers.
    int begin_m = ith_m * thread_block_m;
    int end_m = std::min(m, begin_m + thread_block_m);
    int begin_n = ith_n * thread_block_n;
    int end_n = std::min(n, begin_n + thread_block_n);

    f(begin_m, end_m, begin_n, end_n);
  }
#else
  // No OpenMP: process everything serially in one call.
  f(0, m, 0, n);
#endif
}
|
||
// How many BLOCK_SIZE x K tiles of element type T fit in the cache budget
// (half of an assumed 2MB L2). Always returns at least 1.
template <typename T>
int get_cache_blocks(int BLOCK_SIZE, int K) {
  // L2 2MB and ratio of 50%
  const int cache_budget = 2048 * 1024 >> 1;
  const auto tile_bytes = BLOCK_SIZE * K * sizeof(T);
  const int fitting = int(cache_budget / tile_bytes);
  return std::max(1, fitting);
}
|
||
// data indexing for dimension collapse
// Unpack a flattened row-major offset into per-dimension indices, passed
// as (index_ref, dim_size) pairs with the innermost dimension last.
// Base case: no dimensions remain, the leftover offset is returned.
template <typename T>
inline T data_index_init(T offset) {
  return offset;
}

// Recursive case: peel off the trailing (innermost) dimensions first,
// then derive this dimension's index from the reduced offset. Returns the
// offset still remaining for any outer dimensions.
template <typename T, typename... Args>
inline T data_index_init(T offset, T& idx, const T& range, Args&&... args) {
  const T reduced = data_index_init(offset, std::forward<Args>(args)...);
  idx = reduced % range;
  return reduced / range;
}
|
||
// Advance a multi-dimensional index by one step (row-major, innermost
// dimension fastest). Returns true when every dimension wrapped back to
// zero, i.e. the iteration carried out of the outermost dimension.
// Base case: an empty index always "carries".
inline bool data_index_step() {
  return true;
}

// Recursive case: only move this dimension when all inner dimensions
// carried over; wrap to zero and propagate the carry at the range bound.
template <typename T, typename... Args>
inline bool data_index_step(T& idx, const T& range, Args&&... args) {
  if (!data_index_step(std::forward<Args>(args)...)) {
    return false;  // inner dimensions did not carry; nothing to do here
  }
  ++idx;
  if (idx == range) {
    idx = 0;
    return true;  // carry into the next-outer dimension
  }
  return false;
}
|
||
// forced unroll for perf critical path

#if __has_attribute(always_inline)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

// Compile-time loop unroller: Unroll<n>{}(body, extra...) expands to
//   body(0, extra...); body(1, extra...); ...; body(n - 1, extra...);
// where the first argument is a std::integral_constant<int, k>, so each
// invocation can use its index as a constant expression. Requires n >= 1.
template <int n>
struct Unroll {
  template <typename F, typename... Args>
  ALWAYS_INLINE void operator()(const F& body, Args... extra) const {
    // Emit iterations 0..n-2 first, then this one, preserving call order.
    Unroll<n - 1>{}(body, extra...);
    body(std::integral_constant<int, n - 1>{}, extra...);
  }
};

// Base case: a single iteration with index 0.
template <>
struct Unroll<1> {
  template <typename F, typename... Args>
  ALWAYS_INLINE void operator()(const F& body, Args... extra) const {
    body(std::integral_constant<int, 0>{}, extra...);
  }
};
|
||
} // anonymous namespace |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are inconsistent preprocessor definitions for `CPU_CAPABILITY_AVX512` across the codebase, which could lead to incorrect builds or runtime behavior.

- `cmake/cpu_extension.cmake:245` defines `CPU_CAPABILITY_AVX512` if `ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI`.
- `csrc/cpu/sgl-kernels/vec.h:3-5` defines `CPU_CAPABILITY_AVX512` if `__AVX512F__ && __AVX512BF16__ && __AMX_BF16__`.
- `csrc/cpu/torch_bindings.cpp:242` guards new op registrations with `#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)`.

These conditions are different. For example, the `vec.h` definition depends on `__AMX_BF16__`, which is not checked in CMake, while the CMake logic depends on `AVX512VNNI`, which is absent from the `vec.h` condition.

To ensure correctness and maintainability, these definitions should be unified. I recommend a single source of truth for this capability check — most likely the CMake file — with that definition used throughout the C++ code. The definition in `vec.h` should be guarded with `#ifndef CPU_CAPABILITY_AVX512` to avoid redefinition warnings and to defer to the CMake-provided definition.