
Commit a558769

cuda : fix jetson compile error (#4560)

* fix old Jetson compile error
* Update Makefile
* update Jetson detection and CUDA version detection
* update CUDA macro definitions
* update Makefile and CUDA code, fix some issues
* Update README.md
* Update Makefile
* Update README.md

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 6724ef1 commit a558769

File tree

Makefile
README.md
ggml-cuda.cu
ggml-quants.c

4 files changed: +31 -5 lines changed

Makefile

Lines changed: 19 additions & 3 deletions
```diff
@@ -282,8 +282,17 @@ endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
     # Apple M1, M2, etc.
     # Raspberry Pi 3, 4, Zero 2 (64-bit)
+    # Nvidia Jetson
     MK_CFLAGS   += -mcpu=native
     MK_CXXFLAGS += -mcpu=native
+    JETSON_RELEASE_INFO = $(shell jetson_release)
+    ifdef JETSON_RELEASE_INFO
+        ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
+            JETSON_EOL_MODULE_DETECT = 1
+            CC = aarch64-unknown-linux-gnu-gcc
+            cxx = aarch64-unknown-linux-gnu-g++
+        endif
+    endif
 endif

 ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -357,10 +366,13 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-    MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-    MK_LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+    MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
+    MK_LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
     OBJS        += ggml-cuda.o
-    MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+    MK_NVCCFLAGS = -use_fast_math
+ifndef JETSON_EOL_MODULE_DETECT
+    MK_NVCCFLAGS += --forward-unknown-to-host-compiler
+endif # JETSON_EOL_MODULE_DETECT

 ifdef LLAMA_DEBUG
     MK_NVCCFLAGS += -lineinfo
@@ -417,7 +429,11 @@ ifdef LLAMA_CUDA_CCBIN
     MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ifdef JETSON_EOL_MODULE_DETECT
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+else
 	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
```

README.md

Lines changed: 3 additions & 0 deletions
````diff
@@ -396,6 +396,9 @@ Building the program with BLAS support may lead to some performance improvements
 - #### cuBLAS

   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+
+  For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older module (Nano/TX2), some additional steps are needed before compiling.
+
 - Using `make`:
   ```bash
   make LLAMA_CUBLAS=1
````

ggml-cuda.cu

Lines changed: 7 additions & 0 deletions
```diff
@@ -90,6 +90,13 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+// CUDA 10.2 does not have these macro definitions.
+#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif
 #endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
```
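
These fallbacks work because CUDA 11 renamed the cuBLAS compute-type constants: on CUDA 11+, `cublasGemmEx` takes a `cublasComputeType_t`, while on CUDA 10.2 (the last CUDA release shipped for the EOL Jetson Nano/TX2 modules) the same parameter is a `cudaDataType_t` and the `CUBLAS_COMPUTE_*` names do not exist. Below is a minimal sketch of a call that compiles either way under these macros; the `gemm_f32` helper is illustrative, not code from this commit:

```c
#include <cublas_v2.h>

// Hypothetical helper: one cublasGemmEx call that builds on both CUDA 10.2
// and CUDA 11+. With the fallback macros above, CUBLAS_COMPUTE_32F expands
// to CUDA_R_32F on 10.2, which is what the older API signature expects.
static cublasStatus_t gemm_f32(cublasHandle_t handle, int m, int n, int k,
                               const float * alpha, const float * A, int lda,
                               const float * B, int ldb,
                               const float * beta, float * C, int ldc) {
    return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                        alpha, A, CUDA_R_32F, lda,
                               B, CUDA_R_32F, ldb,
                        beta,  C, CUDA_R_32F, ldc,
                        CUBLAS_COMPUTE_32F,   // cudaDataType_t on CUDA 10.2
                        CUBLAS_GEMM_DEFAULT);
}
```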

ggml-quants.c

Lines changed: 2 additions & 2 deletions
```diff
@@ -3677,7 +3677,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

         const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
         const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                        vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
         const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -6626,7 +6626,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};

         const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                    vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
```
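
The double braces matter because, on typical AArch64 builds, `ggml_int16x8x2_t` is NEON's `int16x8x2_t`: a struct whose only member is the array `val[2]`. A fully braced initializer therefore needs one brace level for the struct and a second for the array; the single-brace form relies on brace elision, which older GCC toolchains such as those shipped for EOL Jetson modules reject or flag via `-Wmissing-braces`. A self-contained sketch (assumes an AArch64/NEON compiler; the `pair` variable is illustrative):

```c
#include <arm_neon.h>

int main(void) {
    // int16x8x2_t is struct { int16x8_t val[2]; }: the outer braces
    // initialize the struct, the inner braces initialize val[].
    const int16x8x2_t pair = {{ vdupq_n_s16(1), vdupq_n_s16(2) }};

    // Lane 0 of each vector: 1 + 2, so the program exits with 0.
    return vgetq_lane_s16(pair.val[0], 0) + vgetq_lane_s16(pair.val[1], 0) - 3;
}
```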
