CUDA: refactor host code, dyn. par. blocks

JohannesGaessler · JohannesGaessler · commit 34f93bbb3996 · 2024-04-18T13:15:32.000+02:00
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -141,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpb = prop.sharedMemPerBlock;
+        info.devices[id].nsm  = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
@@ -390,6 +390,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)
 
+#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_MMA_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+                                        defined(RDNA3) : __CUDA_ARCH__ >= CC_VOLTA
+
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
@@ -403,6 +408,7 @@ struct ggml_cuda_device_info {
 
     struct cuda_device_info {
         int     cc;                 // compute capability
+        int     nsm;                // number of streaming multiprocessors
         size_t  smpb;               // max. shared memory per block
         bool    vmm;                // virtual memory support
         size_t  vmm_granularity;    // granularity of virtual memory
diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu

Original file line number	Diff line number	Diff line change
`@@ -141,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {`
`141`	`141`	`info.devices[id].cc = 100*prop.major + 10*prop.minor;`
`142`	`142`	`#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)`
`143`	`143`	`info.devices[id].smpb = prop.sharedMemPerBlock;`
	`144`	`+ info.devices[id].nsm = prop.multiProcessorCount;`
`144`	`145`	`}`
`145`	`146`
`146`	`147`	`for (int id = 0; id < info.device_count; ++id) {`