76
76
#define GGML_CUDA_CC_IS_CDNA (cc ) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
77
77
78
78
// Moore Threads
79
- #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210 ) // MTT S80, MTT S3000
80
- #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220 ) // MTT S4000
81
- #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310 ) // TBD
79
+ #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210 )
80
+
81
+ #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210 ) // MTT S80, MTT S3000
82
+ #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220 ) // MTT S4000
83
+ #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310 ) // TBD
82
84
83
85
#define GGML_CUDA_CC_IS_MTHREADS (cc ) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
84
86
#define GGML_CUDA_CC_IS_QY1 (cc ) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
@@ -215,9 +217,9 @@ typedef float2 dfloat2;
215
217
#define FAST_FP16_AVAILABLE
216
218
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
217
219
218
- #if (! defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
220
+ #if !( defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
219
221
#define FP16_MMA_AVAILABLE
220
- #endif // (! defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
222
+ #endif // !( defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
221
223
222
224
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
223
225
#define FP16_MMA_AVAILABLE
@@ -231,9 +233,9 @@ typedef float2 dfloat2;
231
233
#define CP_ASYNC_AVAILABLE
232
234
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
233
235
234
- #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220 )
236
+ #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1 )
235
237
#define FLASH_ATTN_AVAILABLE
236
- #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220 )
238
+ #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1 )
237
239
238
240
static bool fp16_available (const int cc) {
239
241
return ggml_cuda_highest_compiled_arch (cc) >= GGML_CUDA_CC_PASCAL;
@@ -245,8 +247,7 @@ static bool fast_fp16_available(const int cc) {
245
247
246
248
// To be used for feature selection of external libraries, e.g. cuBLAS.
247
249
static bool fast_fp16_hardware_available (const int cc) {
248
- return (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 ) || GGML_CUDA_CC_IS_AMD (cc) ||
249
- (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2);
250
+ return (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 ) || GGML_CUDA_CC_IS_AMD (cc);
250
251
}
251
252
252
253
// Any FP16 tensor core instructions are available for ggml code.
@@ -255,8 +256,7 @@ static bool fp16_mma_available(const int cc) {
255
256
return false ;
256
257
#else
257
258
if ((GGML_CUDA_CC_IS_NVIDIA (cc) && ggml_cuda_highest_compiled_arch (cc) >= GGML_CUDA_CC_VOLTA) ||
258
- GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) ||
259
- GGML_CUDA_CC_IS_MTHREADS (cc)) {
259
+ GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc)) {
260
260
return true ;
261
261
} else if (GGML_CUDA_CC_IS_RDNA4 (cc)) {
262
262
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@@ -273,16 +273,7 @@ static bool fp16_mma_available(const int cc) {
273
273
// To be used for feature selection of external libraries, e.g. cuBLAS.
274
274
static bool fp16_mma_hardware_available (const int cc) {
275
275
return (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_VOLTA) ||
276
- GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) || GGML_CUDA_CC_IS_RDNA4 (cc) ||
277
- (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2);
278
- }
279
-
280
- static bool bf16_mma_hardware_available (const int cc) {
281
- return (GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA (cc) || cc >= GGML_CUDA_CC_RDNA3;
282
- }
283
-
284
- static bool fp32_mma_hardware_available (const int cc) {
285
- return GGML_CUDA_CC_IS_CDNA (cc);
276
+ GGML_CUDA_CC_IS_CDNA (cc) || GGML_CUDA_CC_IS_RDNA3 (cc) || GGML_CUDA_CC_IS_RDNA4 (cc);
286
277
}
287
278
288
279
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
0 commit comments