Skip to content

Commit 4a5ac65

Browse files
committed
extract llamafile in new tinyblas backend
1 parent bcdb7a2 commit 4a5ac65

File tree

12 files changed

+1180
-250
lines changed

12 files changed

+1180
-250
lines changed

Makefile

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -568,8 +568,8 @@ ifdef GGML_NVPL
568568
endif # GGML_NVPL
569569

570570
ifndef GGML_NO_LLAMAFILE
571-
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
572-
OBJ_GGML += ggml/src/ggml-cpu/llamafile/sgemm.o
571+
MK_CPPFLAGS += -DGGML_USE_TINYBLAS
572+
OBJ_GGML += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o
573573
endif
574574

575575
ifndef GGML_NO_AMX
@@ -1173,12 +1173,20 @@ ggml/src/ggml-blas/ggml-blas.o: \
11731173
ggml/include/ggml-blas.h
11741174
$(CXX) $(CXXFLAGS) -c $< -o $@
11751175

1176+
# TODO: rename to GGML_NO_TINYBLAS
11761177
ifndef GGML_NO_LLAMAFILE
1177-
ggml/src/ggml-cpu/llamafile/sgemm.o: \
1178-
ggml/src/ggml-cpu/llamafile/sgemm.cpp \
1179-
ggml/src/ggml-cpu/llamafile/sgemm.h \
1178+
ggml/src/ggml-tinyblas/ggml-tinyblas.o: \
1179+
ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \
1180+
ggml/include/ggml-tinyblas.h \
1181+
ggml/src/ggml-tinyblas/sgemm.h \
11801182
ggml/include/ggml.h
1181-
$(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu
1183+
$(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@
1184+
1185+
ggml/src/ggml-tinyblas/sgemm.o: \
1186+
ggml/src/ggml-tinyblas/sgemm.cpp \
1187+
ggml/src/ggml-tinyblas/sgemm.h \
1188+
ggml/include/ggml.h
1189+
$(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@
11821190
endif # GGML_NO_LLAMAFILE
11831191

11841192
ifndef GGML_NO_AMX
@@ -1337,7 +1345,6 @@ clean:
13371345
rm -rvf ggml/src/*.o
13381346
rm -rvf common/build-info.cpp
13391347
rm -rvf ggml/src/ggml-cpu/*.o
1340-
rm -rvf ggml/src/ggml-cpu/llamafile/*.o
13411348
rm -vrf ggml/src/ggml-amx/*.o
13421349
rm -vrf ggml/src/ggml-blas/*.o
13431350
rm -vrf ggml/src/ggml-cann/*.o
@@ -1350,6 +1357,7 @@ clean:
13501357
rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal
13511358
rm -vrf ggml/src/ggml-rpc/*.o
13521359
rm -vrf ggml/src/ggml-sycl/*.o
1360+
rm -vrf ggml/src/ggml-tinyblas/*.o
13531361
rm -vrf ggml/src/ggml-vulkan/*.o
13541362
rm -vrf ggml/src/ggml-musa/*.o
13551363
rm -rvf $(BUILD_TARGETS)

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ extern "C" {
124124
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
125125
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
126126
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
127-
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
128127

129128
// Internal types and functions exposed for tests and benchmarks
130129

ggml/include/ggml-tinyblas.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#pragma once
2+
3+
#include "ggml.h"
4+
#include "ggml-backend.h"
5+
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
// backend register
12+
GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void);
13+
14+
15+
#ifdef __cplusplus
16+
}
17+
#endif

ggml/src/ggml-backend-reg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
#include "ggml-blas.h"
2828
#endif
2929

30+
#ifdef GGML_USE_TINYBLAS
31+
#include "ggml-tinyblas.h"
32+
#endif
33+
3034
#ifdef GGML_USE_RPC
3135
#include "ggml-rpc.h"
3236
#endif
@@ -66,6 +70,9 @@ struct ggml_backend_registry {
6670
#ifdef GGML_USE_BLAS
6771
register_backend(ggml_backend_blas_reg());
6872
#endif
73+
#ifdef GGML_USE_TINYBLAS
74+
register_backend(ggml_backend_tinyblas_reg());
75+
#endif
6976
#ifdef GGML_USE_RPC
7077
register_backend(ggml_backend_rpc_reg());
7178
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13868,14 +13868,6 @@ int ggml_cpu_has_wasm_simd(void) {
1386813868
#endif
1386913869
}
1387013870

13871-
int ggml_cpu_has_llamafile(void) {
13872-
#if defined(GGML_USE_LLAMAFILE)
13873-
return 1;
13874-
#else
13875-
return 0;
13876-
#endif
13877-
}
13878-
1387913871
int ggml_cpu_has_sse3(void) {
1388013872
#if defined(__SSE3__)
1388113873
return 1;

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -616,9 +616,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
616616
if (ggml_cpu_has_wasm_simd()) {
617617
features.push_back({ "WASM_SIMD", "1" });
618618
}
619-
if (ggml_cpu_has_llamafile()) {
620-
features.push_back({ "LLAMAFILE", "1" });
621-
}
622619

623620
features.push_back({ nullptr, nullptr });
624621

ggml/src/ggml-cpu/llamafile/sgemm.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

ggml/src/ggml-tinyblas/CMakeLists.txt

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
add_library(ggml-tinyblas
2+
ggml-tinyblas.cpp
3+
)
4+
5+
target_link_libraries(ggml-tinyblas PRIVATE ggml-base)
6+
target_include_directories(ggml-tinyblas PRIVATE . ..)
7+
8+
if (APPLE AND GGML_ACCELERATE)
9+
find_library(ACCELERATE_FRAMEWORK Accelerate)
10+
if (ACCELERATE_FRAMEWORK)
11+
message(STATUS "Accelerate framework found")
12+
13+
# Scope the Accelerate definitions to the ggml-tinyblas target instead of the whole directory.
target_compile_definitions(ggml-tinyblas PRIVATE GGML_USE_ACCELERATE)
14+
target_compile_definitions(ggml-tinyblas PRIVATE ACCELERATE_NEW_LAPACK)
15+
target_compile_definitions(ggml-tinyblas PRIVATE ACCELERATE_LAPACK_ILP64)
16+
17+
target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK})
18+
else()
19+
message(WARNING "Accelerate framework not found")
20+
endif()
21+
endif()
22+
23+
if (GGML_OPENMP)
24+
find_package(OpenMP)
25+
if (OpenMP_FOUND)
26+
message(STATUS "OpenMP found")
27+
28+
# Scope the OpenMP definition to the ggml-tinyblas target instead of the whole directory.
target_compile_definitions(ggml-tinyblas PRIVATE GGML_USE_OPENMP)
29+
30+
target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
31+
32+
else()
33+
message(WARNING "OpenMP not found")
34+
endif()
35+
endif()
36+
37+
target_sources(ggml-tinyblas PRIVATE
38+
sgemm.cpp
39+
sgemm.h)
40+
41+
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
42+
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
43+
(NOT CMAKE_OSX_ARCHITECTURES AND
44+
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
45+
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
46+
47+
message(STATUS "ARM detected")
48+
49+
if (MSVC)
50+
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
51+
add_compile_definitions(__ARM_NEON)
52+
add_compile_definitions(__ARM_FEATURE_FMA)
53+
54+
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
55+
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
56+
57+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
58+
if (GGML_COMPILER_SUPPORT_DOTPROD)
59+
add_compile_definitions(__ARM_FEATURE_DOTPROD)
60+
endif ()
61+
62+
# Probe for the Int8 matrix-multiply extension: vmmlaq_s32 is the ACLE intrinsic gated by
# __ARM_FEATURE_MATMUL_INT8. The previous probe compiled vmlaq_f32 (a float32 multiply-accumulate),
# which does not exercise this feature, so the result said nothing about int8 matmul support.
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
63+
64+
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
65+
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
66+
endif ()
67+
68+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
69+
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
70+
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
71+
endif ()
72+
73+
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
74+
else()
75+
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
76+
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
77+
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
78+
endif()
79+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
80+
# Raspberry Pi 1, Zero
81+
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
82+
endif()
83+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
84+
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
85+
# Android armeabi-v7a
86+
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
87+
else()
88+
# Raspberry Pi 2
89+
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
90+
endif()
91+
endif()
92+
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
93+
# Android arm64-v8a
94+
# Raspberry Pi 3, 4, Zero 2 (32-bit)
95+
list(APPEND ARCH_FLAGS -mno-unaligned-access)
96+
endif()
97+
if (GGML_SVE)
98+
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
99+
endif()
100+
endif()
101+
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
102+
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
103+
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
104+
message(STATUS "x86 detected")
105+
if (MSVC)
106+
# instruction set detection for MSVC only
107+
if (GGML_NATIVE)
108+
# TODO: improve, should not reference files from the parent folder
109+
include(../ggml-cpu/cmake/FindSIMD.cmake)
110+
endif ()
111+
if (GGML_AVX512)
112+
list(APPEND ARCH_FLAGS /arch:AVX512)
113+
# MSVC has no compile-time flags enabling specific
114+
# AVX512 extensions, neither it defines the
115+
# macros corresponding to the extensions.
116+
# Do it manually.
117+
if (GGML_AVX512_VBMI)
118+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
119+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
120+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
121+
list(APPEND ARCH_FLAGS -mavx512vbmi)
122+
endif()
123+
endif()
124+
if (GGML_AVX512_VNNI)
125+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
126+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
127+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
128+
list(APPEND ARCH_FLAGS -mavx512vnni)
129+
endif()
130+
endif()
131+
if (GGML_AVX512_BF16)
132+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
133+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
134+
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
135+
list(APPEND ARCH_FLAGS -mavx512bf16)
136+
endif()
137+
endif()
138+
if (GGML_AMX_TILE)
139+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
140+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
141+
endif()
142+
if (GGML_AMX_INT8)
143+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
144+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
145+
endif()
146+
if (GGML_AMX_BF16)
147+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
148+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
149+
endif()
150+
elseif (GGML_AVX2)
151+
list(APPEND ARCH_FLAGS /arch:AVX2)
152+
elseif (GGML_AVX)
153+
list(APPEND ARCH_FLAGS /arch:AVX)
154+
endif()
155+
else()
156+
if (GGML_NATIVE)
157+
list(APPEND ARCH_FLAGS -march=native)
158+
endif()
159+
if (GGML_F16C)
160+
list(APPEND ARCH_FLAGS -mf16c)
161+
endif()
162+
if (GGML_FMA)
163+
list(APPEND ARCH_FLAGS -mfma)
164+
endif()
165+
if (GGML_AVX)
166+
list(APPEND ARCH_FLAGS -mavx)
167+
endif()
168+
if (GGML_AVX2)
169+
list(APPEND ARCH_FLAGS -mavx2)
170+
endif()
171+
if (GGML_AVX512)
172+
list(APPEND ARCH_FLAGS -mavx512f)
173+
list(APPEND ARCH_FLAGS -mavx512dq)
174+
list(APPEND ARCH_FLAGS -mavx512bw)
175+
endif()
176+
if (GGML_AVX512_VBMI)
177+
list(APPEND ARCH_FLAGS -mavx512vbmi)
178+
endif()
179+
if (GGML_AVX512_VNNI)
180+
list(APPEND ARCH_FLAGS -mavx512vnni)
181+
endif()
182+
if (GGML_AVX512_BF16)
183+
list(APPEND ARCH_FLAGS -mavx512bf16)
184+
endif()
185+
if (GGML_AMX_TILE)
186+
list(APPEND ARCH_FLAGS -mamx-tile)
187+
endif()
188+
if (GGML_AMX_INT8)
189+
list(APPEND ARCH_FLAGS -mamx-int8)
190+
endif()
191+
if (GGML_AMX_BF16)
192+
list(APPEND ARCH_FLAGS -mamx-bf16)
193+
endif()
194+
endif()
195+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
196+
message(STATUS "PowerPC detected")
197+
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
198+
string(FIND "${POWER10_M}" "POWER10" substring_index)
199+
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
200+
set(substring_index -1)
201+
endif()
202+
203+
if (${substring_index} GREATER_EQUAL 0)
204+
list(APPEND ARCH_FLAGS -mcpu=power10)
205+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
206+
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
207+
else()
208+
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
209+
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
210+
endif()
211+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
212+
message(STATUS "loongarch64 detected")
213+
214+
list(APPEND ARCH_FLAGS -march=loongarch64)
215+
if (GGML_LASX)
216+
list(APPEND ARCH_FLAGS -mlasx)
217+
endif()
218+
if (GGML_LSX)
219+
list(APPEND ARCH_FLAGS -mlsx)
220+
endif()
221+
else()
222+
message(STATUS "Unknown architecture")
223+
endif()
224+
225+
target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
226+
target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
227+
228+
if (EMSCRIPTEN)
229+
# Enable WebAssembly SIMD for this target. target_compile_options appends to COMPILE_OPTIONS
# rather than overwriting the superseded COMPILE_FLAGS string property.
target_compile_options(ggml-tinyblas PRIVATE -msimd128)
230+
endif()

0 commit comments

Comments
 (0)