|
16 | 16 | #include <arm_sve.h>
|
17 | 17 | #endif // __ARM_FEATURE_SVE
|
18 | 18 |
|
| 19 | +#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
| 20 | +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
| 21 | +// |
| 22 | +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
| 23 | +// |
| 24 | +#include <arm_neon.h> |
| 25 | +#endif |
| 26 | + |
19 | 27 | #if defined(__F16C__)
|
20 | 28 | #include <immintrin.h>
|
21 | 29 | #endif
|
@@ -303,35 +311,29 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
303 | 311 |
|
304 | 312 | // FP16 to FP32 conversion
|
305 | 313 |
|
306 |
| -// 16-bit float |
307 |
| -// on Arm, we use __fp16 |
308 |
| -// on x86, we use uint16_t |
309 |
| -// |
310 |
| -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 |
311 |
| -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 |
312 |
| -// |
313 |
| -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
314 |
| - |
315 |
| - // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
316 |
| - // |
317 |
| - // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
318 |
| - // |
319 |
| - #include <arm_neon.h> |
| 314 | +#if defined(__ARM_NEON) |
| 315 | + #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
| 316 | + typedef uint16_t ggml_fp16_internal_t; |
| 317 | + #else |
| 318 | + typedef __fp16 ggml_fp16_internal_t; |
| 319 | + #endif |
| 320 | +#endif |
320 | 321 |
|
| 322 | +#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
321 | 323 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
322 | 324 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
323 | 325 |
|
324 | 326 | #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
325 | 327 |
|
326 | 328 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
327 |
| - __fp16 tmp; |
| 329 | + ggml_fp16_internal_t tmp; |
328 | 330 | memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
329 | 331 | return (float)tmp;
|
330 | 332 | }
|
331 | 333 |
|
332 | 334 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
333 | 335 | ggml_fp16_t res;
|
334 |
| - __fp16 tmp = f; |
| 336 | + ggml_fp16_internal_t tmp = f; |
335 | 337 | memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
336 | 338 | return res;
|
337 | 339 | }
|
@@ -483,7 +485,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
483 | 485 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
484 | 486 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
485 | 487 |
|
486 |
| -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
| 488 | +#endif // defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) |
487 | 489 |
|
488 | 490 | // precomputed f32 table for f16 (256 KB)
|
489 | 491 | // defined in ggml.c, initialized in ggml_init()
|
|
0 commit comments