|
# tinyBLAS backend target. No STATIC/SHARED keyword is given, so the library
# type follows BUILD_SHARED_LIBS.
add_library(ggml-tinyblas ggml-tinyblas.cpp)

# ggml-base is linked PRIVATE, so it is not propagated to consumers.
target_link_libraries(ggml-tinyblas
    PRIVATE
        ggml-base)

# Headers are looked up in this directory and in its parent.
target_include_directories(ggml-tinyblas
    PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}"
        "${CMAKE_CURRENT_SOURCE_DIR}/..")
| 7 | + |
# Optional Apple Accelerate support, requested with GGML_ACCELERATE.
# A missing framework only produces a warning; the build continues without it.
if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
        message(STATUS "Accelerate framework found")

        # Target-scoped on purpose: add_compile_definitions() would apply the
        # macros to every target in this directory, not just ggml-tinyblas.
        target_compile_definitions(ggml-tinyblas PRIVATE
            GGML_USE_ACCELERATE
            ACCELERATE_NEW_LAPACK
            ACCELERATE_LAPACK_ILP64)

        target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
    endif()
endif()
| 22 | + |
# Optional OpenMP support, requested with GGML_OPENMP.
# A missing OpenMP installation only produces a warning.
if (GGML_OPENMP)
    find_package(OpenMP)
    if (OpenMP_FOUND)
        message(STATUS "OpenMP found")

        # Target-scoped on purpose: add_compile_definitions() would apply the
        # macro to every target in this directory, not just ggml-tinyblas.
        target_compile_definitions(ggml-tinyblas PRIVATE GGML_USE_OPENMP)

        target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
    else()
        message(WARNING "OpenMP not found")
    endif()
endif()
| 36 | + |
# Attach the SGEMM implementation and its header to the backend target.
target_sources(ggml-tinyblas
    PRIVATE
        sgemm.cpp
        sgemm.h
)
| 40 | + |
# Per-architecture SIMD configuration for ggml-tinyblas.
#
# Picks the branch matching the target architecture (ARM, x86, PowerPC,
# LoongArch), appends the matching compiler switches to ARCH_FLAGS and defines
# feature macros on the target according to the GGML_* options.
#
# NOTE(review): ARCH_FLAGS is only appended to in this file, never reset, so a
# value inherited from the parent scope is extended - confirm this is intended.

# Idempotent includes: make sure the check_* commands used below exist even if
# no parent CMakeLists.txt has loaded these modules.
include(CheckCXXSourceCompiles)
include(CheckCXXCompilerFlag)

if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND
     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))

    message(STATUS "ARM detected")

    if (MSVC)
        # Target-scoped so the macros do not leak to other targets.
        target_compile_definitions(ggml-tinyblas PRIVATE
            __aarch64__ # MSVC defines _M_ARM64 instead
            __ARM_NEON
            __ARM_FEATURE_FMA)

        # Probe the optional ARM extensions with /arch:armv8.2 enabled, then
        # restore the previous CMAKE_REQUIRED_FLAGS.
        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            target_compile_definitions(ggml-tinyblas PRIVATE __ARM_FEATURE_DOTPROD)
        endif()

        # The int8 matrix-multiply intrinsic is vmmlaq_s32; probing with the
        # float multiply-accumulate vmlaq_f32 says nothing about i8mm support.
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
            target_compile_definitions(ggml-tinyblas PRIVATE __ARM_FEATURE_MATMUL_INT8)
        endif()

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            target_compile_definitions(ggml-tinyblas PRIVATE __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif()

        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        # CMAKE_SYSTEM_PROCESSOR is quoted in the tests below so that an empty
        # value yields "no match" instead of a malformed if() expression.
        if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "armv6")
            # Raspberry Pi 1, Zero
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "armv7")
            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
        if (GGML_SVE)
            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
        endif()
    endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include("${CMAKE_CURRENT_SOURCE_DIR}/../ggml-cpu/cmake/FindSIMD.cmake")
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags that enable specific AVX512
            # extensions, nor does it define the macros corresponding to
            # those extensions, so define them manually.
            if (GGML_AVX512_VBMI)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vbmi)
                endif()
            endif()
            if (GGML_AVX512_VNNI)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vnni)
                endif()
            endif()
            if (GGML_AVX512_BF16)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512bf16)
                endif()
            endif()
            if (GGML_AMX_TILE)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                target_compile_definitions(ggml-tinyblas PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        # GCC/Clang style: one -m switch per enabled GGML_* option.
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    # Looks for POWER10 in /proc/cpuinfo of the machine running CMake.
    # NOTE(review): this inspects the build host, which may differ from the
    # target when cross-compiling - confirm.
    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
    string(FIND "${POWER10_M}" "POWER10" substring_index)
    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
        set(substring_index -1)
    endif()

    if (${substring_index} GREATER_EQUAL 0)
        list(APPEND ARCH_FLAGS -mcpu=power10)
    elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "loongarch64")
    message(STATUS "loongarch64 detected")

    list(APPEND ARCH_FLAGS -march=loongarch64)
    if (GGML_LASX)
        list(APPEND ARCH_FLAGS -mlasx)
    endif()
    if (GGML_LSX)
        list(APPEND ARCH_FLAGS -mlsx)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()
| 224 | + |
# Pass the collected ARCH_FLAGS to C++ and C compilations of the backend.
# Each generator expression is quoted so the ';' separators of the list stay
# inside it.
target_compile_options(ggml-tinyblas
    PRIVATE
        "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>"
        "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
| 227 | + |
# Emscripten builds: compile the backend with -msimd128.
if (EMSCRIPTEN)
    # target_compile_options() appends to the target's options. The deprecated
    # COMPILE_FLAGS property set through set_target_properties() would replace
    # any value assigned to it earlier.
    target_compile_options(ggml-tinyblas PRIVATE -msimd128)
endif()
0 commit comments