Skip to content

Commit 34f93bb

Browse files
CUDA: refactor host code, dyn. par. blocks
1 parent 5668c79 commit 34f93bb

File tree

3 files changed

+248
-301
lines changed

3 files changed

+248
-301
lines changed

ggml-cuda.cu

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
141141
info.devices[id].cc = 100*prop.major + 10*prop.minor;
142142
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
143143
info.devices[id].smpb = prop.sharedMemPerBlock;
144+
info.devices[id].nsm = prop.multiProcessorCount;
144145
}
145146

146147
for (int id = 0; id < info.device_count; ++id) {

ggml-cuda/common.cuh

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -390,6 +390,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
390390
}
391391
#endif // defined(GGML_USE_HIPBLAS)
392392

393+
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
394+
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
395+
#define FP16_MMA_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
396+
defined(RDNA3) : __CUDA_ARCH__ >= CC_VOLTA
397+
393398
// TODO: move to ggml-common.h
394399
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
395400

@@ -403,6 +408,7 @@ struct ggml_cuda_device_info {
403408

404409
struct cuda_device_info {
405410
int cc; // compute capability
411+
int nsm; // number of streaming multiprocessors
406412
size_t smpb; // max. shared memory per block
407413
bool vmm; // virtual memory support
408414
size_t vmm_granularity; // granularity of virtual memory

0 commit comments

Comments
 (0)