Commit afc1738

Merge remote-tracking branch 'origin/master' into GraniteFour
* origin/master:
  cmake : do not search for curl libraries by ourselves (ggml-org#14613)
  SYCL: Initial set_rows kernel implementation (ggml-org#14562)
  llama : minor coding style fix for smollm3 (ggml-org#14605)
  cmake : bump llguidance version to v1.0.1 (ggml-org#14609)
  cmake : llguidance build parser library only (ggml-org#14608)
  cuda : support Falcon-H1 state size for SSM_SCAN (ggml-org#14602)

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 parents d02d3dd + a457551 commit afc1738

10 files changed: +181 -27 lines changed

common/CMakeLists.txt

Lines changed: 4 additions & 5 deletions

@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions

@@ -3335,8 +3335,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SSM_SCAN: {
             if (op->src[3]->ne[0] == 1) {
                 // Mamba2
-                // (kernel only supports d_state == 128 && d_head % 16 == 0)
-                return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0;
+                // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
+                return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
             } else {
                 // Mamba
                 // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
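For orientation, the relaxed check boils down to a predicate on the SSM state size and head dimension (read from op->src[0]->ne in the real code). A minimal standalone sketch, with the hypothetical names d_state and d_head taken from the comment rather than from actual ggml fields:

```cpp
#include <cstdint>
#include <cstdio>

// Hedged restatement of the Mamba-2 / Falcon-H1 branch of supports_op above:
// the fused kernel handles a state size of 128 or 256, provided the head
// dimension is a multiple of 16.
static bool supports_ssm_scan_mamba2(int64_t d_state, int64_t d_head) {
    return (d_state == 128 || d_state == 256) && d_head % 16 == 0;
}

int main() {
    std::printf("%d %d %d\n",
        supports_ssm_scan_mamba2(128, 64),  // Mamba-2 shape: accepted
        supports_ssm_scan_mamba2(256, 64),  // Falcon-H1 shape: now accepted
        supports_ssm_scan_mamba2(192, 64)); // anything else: still rejected
    return 0;
}
```

Falcon-H1's state size of 256 is now accepted, while other sizes continue to return false and the op is not offloaded to CUDA.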

ggml/src/ggml-cuda/ssm-scan.cu

Lines changed: 13 additions & 2 deletions

@@ -201,11 +201,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                               const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
                               const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
                               cudaStream_t stream) {
-    const int threads = 128;
     // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
     if (src3_nb1 == sizeof(float)) {
         // Mamba-2
         if (d_state == 128) {
+            const int threads = 128;
             GGML_ASSERT(d_state % threads == 0);
             // NOTE: can be any power of two between 4 and 64
             const int splitH = 16;
@@ -215,10 +215,21 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                 src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                 src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
+        } else if (d_state == 256) { // Falcon-H1
+            const int threads = 256;
+            // NOTE: can be any power of two between 8 and 64
+            const int splitH = 16;
+            GGML_ASSERT(head_dim % splitH == 0);
+            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
+            ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
+                src0, src1, src2, src3, src4, src5, src6, dst,
+                src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
+                src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
         } else {
-            GGML_ABORT("doesn't support d_state!=128.");
+            GGML_ABORT("doesn't support d_state!=(128 or 256).");
         }
     } else {
+        const int threads = 128;
         // Mamba-1
         GGML_ASSERT(n_head % threads == 0);
         GGML_ASSERT(head_dim == 1);
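The new Falcon-H1 branch mirrors the existing Mamba-2 path: one thread per state element (hence 256 threads instead of 128) and the head dimension split into chunks of splitH = 16 rows per block. A small sketch of the resulting launch geometry, assuming the shape used by the new Falcon-H1 test case below (head_dim = 64, n_head = 8, n_seqs = 4); the reading of the test parameters is an assumption, not something stated in the kernel:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed Falcon-H1 test shape
    const int64_t d_state  = 256;
    const int64_t head_dim = 64;
    const int64_t n_head   = 8;
    const int64_t n_seq    = 4;

    const int threads = (int) d_state;  // one thread per state element
    const int splitH  = 16;             // rows of the head dimension per block
    const int64_t blocks_x = (n_head * head_dim + (splitH - 1)) / splitH;

    // grid is (blocks_x, n_seq, 1), i.e. (32, 4, 1) for these values
    std::printf("threads=%d grid=(%lld, %lld, 1)\n",
                threads, (long long) blocks_x, (long long) n_seq);
    return 0;
}
```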

ggml/src/ggml-sycl/backend.hpp

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@
 #include "outprod.hpp"
 #include "quants.hpp"
 #include "rope.hpp"
+#include "set_rows.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "wkv.hpp"

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 5 additions & 1 deletion

@@ -41,6 +41,7 @@
 #include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
+#include "ggml-sycl/set_rows.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml.h"
@@ -3605,6 +3606,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_GET_ROWS:
             ggml_sycl_get_rows(ctx, dst);
             break;
+        case GGML_OP_SET_ROWS:
+            ggml_sycl_op_set_rows(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_sycl_dup(ctx, dst);
             break;
@@ -4299,7 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         {
             // TODO: add support
            // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-            return false;
+            return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
         } break;
         case GGML_OP_CPY:
         {
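The supports_op change above can be read as a small type predicate. The sketch below restates it with an illustrative stand-in enum rather than the real ggml_type values:

```cpp
// Illustrative stand-in for ggml_type; only the combinations used here matter.
enum class Type { F32, F16, I64, Other };

// Hedged paraphrase of the new SYCL rule for GGML_OP_SET_ROWS: either the
// destination is F32, or it is F16 with an F32 source and I64 row indices.
static bool sycl_supports_set_rows(Type dst, Type src, Type idx) {
    return dst == Type::F32 || (dst == Type::F16 && src == Type::F32 && idx == Type::I64);
}
```

Other destination types still return false, so those SET_ROWS cases are not offloaded to SYCL yet.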

ggml/src/ggml-sycl/set_rows.cpp

Lines changed: 131 additions & 0 deletions

@@ -0,0 +1,131 @@
+#include "set_rows.hpp"
+
+namespace utils {
+    template<typename T>
+    static constexpr bool is_arithmetic_v() {
+        return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
+    }
+}
+template<typename TIn, typename TOut>
+static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
+convert (const char* src, char* dst) {
+    auto src_val = *reinterpret_cast<const TIn*>(src);
+    auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
+    *reinterpret_cast<TOut*>(dst) = dst_val;;
+}
+
+template<typename TIn, typename TOut>
+static void k_set_rows(
+        const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        const size_t src_type_size, const size_t dst_type_size,
+        const sycl::nd_item<3> & item_ct1) {
+
+    const int i03 = item_ct1.get_group(0);
+    const int i02 = item_ct1.get_group(1);
+    const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    const int i12 = i03 % ne12;
+    const int i11 = i02 % ne11;
+    const int i10 = i01;
+
+    const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
+
+    const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
+    char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
+
+    for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
+        const char * src_elem = src0_row + col * src_type_size;
+        char * dst_elem = dst_row_ptr + col * dst_type_size;
+        convert<TIn, TOut>(src_elem, dst_elem);
+    }
+}
+
+template<typename TIn, typename TOut>
+static void set_rows_sycl(
+        const char * src0_d, const int64_t * src1_d, char * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        const size_t src_type_size, const size_t dst_type_size,
+        queue_ptr stream) {
+
+    constexpr int max_threads_per_row = 64; // KEEPING 64 for now
+    const int threads_per_row = std::min((int)ne00, max_threads_per_row);
+
+    constexpr int max_threads_per_block = 64;
+    const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row);
+
+    const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
+    const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
+
+    sycl_parallel_for(
+        stream,
+        sycl::nd_range<3>(grid_size * block_size, block_size),
+        [=](sycl::nd_item<3> item_ct1) {
+            k_set_rows<TIn, TOut>(
+                src0_d, src1_d, dst_d,
+                ne00, ne01, ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                src_type_size, dst_type_size,
+                item_ct1
+            );
+        }
+    );
+}
+
+
+void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t * src1_dd = static_cast<const int64_t *>(src1->data);
+
+    dpct::queue_ptr stream = ctx.stream();
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            set_rows_sycl<float, float>(
+                (const char *)src0->data, src1_dd, (char *)dst->data,
+                ne00, ne01, ne02, ne03,
+                ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                sizeof(float), sizeof(float),
+                stream
+            );
+            break;
+        case GGML_TYPE_F16:
+            dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+            set_rows_sycl<float, sycl::half>(
+                (const char *)src0->data, src1_dd, (char *)dst->data,
+                ne00, ne01, ne02, ne03,
+                ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                sizeof(float), sizeof(sycl::half),
+                stream
+            );
+            break;
+        default:
+            GGML_ABORT("Unsupported tensor type!");
+            break;
+    }
+}
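Semantically, the new kernel scatters whole rows of src0 into dst at positions given by the I64 tensor src1, with src1 broadcast over the two outer dimensions (i11 = i02 % ne11, i12 = i03 % ne12). A scalar reference sketch of that indexing, assuming densely packed float buffers (illustrative only; the real code walks the nb* byte strides and also supports an F16 destination via the convert<> helper):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hedged scalar reference of the kernel's indexing. The source has ne01 rows
// of ne00 elements per (i02, i03) slice, the destination has dst_rows rows,
// and the index tensor is broadcast over the two outer dims.
static void set_rows_ref(const float * src, const int64_t * rows, float * dst,
                         int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                         int64_t ne11, int64_t ne12, int64_t dst_rows) {
    for (int64_t i03 = 0; i03 < ne03; ++i03) {
        for (int64_t i02 = 0; i02 < ne02; ++i02) {
            for (int64_t i01 = 0; i01 < ne01; ++i01) {
                const int64_t i12 = i03 % ne12;  // broadcast of the index tensor
                const int64_t i11 = i02 % ne11;
                const int64_t dst_row = rows[(i12 * ne11 + i11) * ne01 + i01];
                for (int64_t c = 0; c < ne00; ++c) {
                    dst[((i03 * ne02 + i02) * dst_rows + dst_row) * ne00 + c] =
                        src[((i03 * ne02 + i02) * ne01 + i01) * ne00 + c];
                }
            }
        }
    }
}

int main() {
    // 1x1 batch: 2 source rows of 3 floats scattered into a 4-row destination
    const std::vector<float>   src  = { 1, 2, 3,   4, 5, 6 };
    const std::vector<int64_t> rows = { 2, 0 };  // src row 0 -> dst row 2, src row 1 -> dst row 0
    std::vector<float>         dst(4 * 3, 0.0f);

    set_rows_ref(src.data(), rows.data(), dst.data(),
                 /*ne00=*/3, /*ne01=*/2, /*ne02=*/1, /*ne03=*/1,
                 /*ne11=*/1, /*ne12=*/1, /*dst_rows=*/4);

    for (float v : dst) std::printf("%.0f ", v);
    std::printf("\n"); // expected: 4 5 6 0 0 0 1 2 3 0 0 0
    return 0;
}
```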

ggml/src/ggml-sycl/set_rows.hpp

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_SET_ROWS_HPP
+#define GGML_SYCL_SET_ROWS_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_SET_ROWS_HPP

src/llama-arch.cpp

Lines changed: 15 additions & 15 deletions

@@ -1814,26 +1814,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         },
     },
     {
-        LLM_ARCH_UNKNOWN,
+        LLM_ARCH_SMOLLM3,
         {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
     {
-        LLM_ARCH_SMOLLM3,
+        LLM_ARCH_UNKNOWN,
         {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
         },
     },
 };

tests/test-backend-ops.cpp

Lines changed: 1 addition & 0 deletions

@@ -5069,6 +5069,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1
     test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 16, 2, 32, 4)); // Mamba-2
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 256, 64, 8, 2, 32, 4)); // Falcon-H1
 
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));

tools/run/CMakeLists.txt

Lines changed: 1 addition & 2 deletions

@@ -7,8 +7,7 @@ if (LLAMA_CURL)
     find_package(CURL REQUIRED)
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 install(TARGETS ${TARGET} RUNTIME)
