Commit 8b794f9

Merge branch 'master' into huydt/mb

2 parents: 6751e69 + 5787b5d

File tree: 40 files changed, +467 / -135 lines

.github/workflows/build-linux-cross.yml

Lines changed: 113 additions & 0 deletions

@@ -231,3 +231,116 @@ jobs:
                 -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
           cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-cpu-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-vulkan-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu \
+                  libvulkan-dev:loong64
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_VULKAN=ON \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
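Both new jobs follow the pattern of the existing cross-compile jobs in this workflow: the Debian container image and the snapshot.debian.org repositories are pinned to fixed digests/timestamps for reproducibility, loong64 is registered as a foreign dpkg architecture, and the build uses the gcc-14-loongarch64-linux-gnu cross toolchain with CMake pointed at the target sysroot via CMAKE_FIND_ROOT_PATH. The Vulkan variant additionally installs glslc (shaders are compiled on the build host) and the target-architecture libvulkan-dev:loong64, and turns on -DGGML_VULKAN=ON.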

common/common.cpp

Lines changed: 2 additions & 2 deletions

@@ -934,7 +934,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1041,7 +1041,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (llama_model_has_decoder(model)) {
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
     }
-    llama_kv_self_clear(lctx);
+    llama_memory_clear(llama_get_memory(lctx), true);
     llama_synchronize(lctx);
     llama_perf_context_reset(lctx);
     llama_set_warmup(lctx, false);
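Every remaining change in this commit follows the same mechanical pattern: the llama_kv_self_* helpers that took a llama_context directly are replaced by llama_memory_* functions that operate on the memory handle returned by llama_get_memory(). A minimal before/after sketch, assuming only the functions visible in this diff (the surrounding setup is illustrative, and the /*data=*/ flag name is taken from the upstream header):

    // before: the KV-cache helpers took the context directly
    //   llama_kv_self_clear(lctx);
    //   bool ok = llama_kv_self_can_shift(lctx);

    // after: fetch the memory handle once, then operate on it
    auto * mem = llama_get_memory(lctx);
    if (params.ctx_shift && !llama_memory_can_shift(mem)) {
        params.ctx_shift = false; // this memory type cannot shift positions
    }
    llama_memory_clear(mem, /*data=*/true); // true wipes the data buffers as well as the metadata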

common/speculative.cpp

Lines changed: 6 additions & 4 deletions

@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl = spec->smpl;
     auto & prompt = spec->prompt;
 
+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;
 
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);
 
         prompt.clear();
     } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
     }
 
     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
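Two things change here: the memory handle is fetched once into mem instead of passing ctx to every call, and the three cache operations move to their llama_memory_* forms. For readers new to these calls: llama_memory_seq_rm(mem, seq, p0, p1) removes the cells of sequence seq whose positions fall in [p0, p1), a negative p1 meaning "to the end", and llama_memory_seq_add(mem, seq, p0, p1, delta) shifts the positions of the cells in that range by delta. A sketch of the trimming pattern above with hypothetical concrete values:

    // Hypothetical: sequence 0 holds positions 0..9; keep the window 3..7,
    // re-anchored at position 0 (the reuse_i / reuse_n pattern above).
    auto * mem = llama_get_memory(ctx);

    llama_memory_seq_rm (mem, 0, 0, 3);      // drop the prefix [0, 3)
    llama_memory_seq_add(mem, 0, 3, -1, -3); // shift everything from 3 onward left by 3
    llama_memory_seq_rm (mem, 0, 5, -1);     // after the shift, drop the stale tail [5, inf)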

examples/batched.swift/Sources/main.swift

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
 }
 
 for i in 1 ..< n_parallel {
-    llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
 }
 
 if n_parallel > 1 {
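This is the standard setup for parallel generation from a shared prompt: the prompt is decoded once into sequence 0, and llama_memory_seq_cp then shares those cells with sequences 1 through n_parallel - 1, so none of the streams re-decodes the prefix. The equivalent C++ loop, as a sketch (variable names illustrative):

    // fan the shared prompt out to the other parallel sequences;
    // [0, n_tokens) is the position range occupied by the prompt
    auto * mem = llama_get_memory(ctx);
    for (int32_t i = 1; i < n_parallel; ++i) {
        llama_memory_seq_cp(mem, /*seq_src=*/0, /*seq_dst=*/i, /*p0=*/0, /*p1=*/n_tokens);
    }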

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

examples/gritlm/gritlm.cpp

Lines changed: 2 additions & 2 deletions

@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_token eos_token = llama_vocab_eos(vocab);
 
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
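gritlm drives a single context in two modes, so each switch clears the memory and reconfigures attention in one step: embeddings on with non-causal attention for encoding, embeddings off with causal attention for generation. Both clears pass true, since nothing from the previous mode should survive the switch.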

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 4 additions & 4 deletions

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     }
 
     batch->logits[batch->n_tokens - 1] = true;
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
 
     const auto t_pp_start = ggml_time_us();
     if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     LOGi("Benchmark text generation (tg)");
 
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
     const auto t_tg_start = ggml_time_us();
     for (i = 0; i < tg; i++) {
 
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     const auto t_tg_end = ggml_time_us();
 
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
 
     const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
     const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }
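Note the split in the second argument: the three benchmark clears pass false, resetting only the cache metadata between timed runs, while the user-facing kv_cache_clear JNI entry point passes true and wipes the data buffers as well. The Swift benchmark below makes the same distinction.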

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 4 additions & 4 deletions

@@ -210,7 +210,7 @@ actor LlamaContext {
         }
         batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -223,7 +223,7 @@ actor LlamaContext {
 
         // bench text generation
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -242,7 +242,7 @@ actor LlamaContext {
 
         let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
         let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

examples/lookahead/lookahead.cpp

Lines changed: 8 additions & 6 deletions

@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
@@ -94,7 +96,7 @@ int main(int argc, char ** argv) {
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }
 
     const auto t_enc_end = ggml_time_us();
@@ -427,17 +429,17 @@ int main(int argc, char ** argv) {
 
         // KV cache management
        // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-        llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+        llama_memory_seq_rm(mem, -1, n_past, -1);
 
         if (seq_id_best != 0) {
             // if a verification token matched, we keep the best sequence and remove the rest
             // this leads to some KV cache fragmentation
-            llama_kv_self_seq_keep(ctx, seq_id_best);
-            llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
-            llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
+            llama_memory_seq_keep(mem, seq_id_best);
+            llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
+            llama_memory_seq_rm (mem, seq_id_best, -1, -1);
 
             for (int s = 1; s < W + G + 1; ++s) {
-                llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                llama_memory_seq_cp(mem, 0, s, -1, -1);
             }
         }
     }
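This is the most involved use of the new API in the commit: when a verification token matches, the winning draft sequence seq_id_best is compacted into sequence 0 and the result is re-shared with all W + G lookahead sequences. A condensed sketch of that compaction, using exactly the calls from the diff (-1, -1 spans the whole position range):

    llama_memory_seq_keep(mem, seq_id_best);            // drop every sequence except the winner
    llama_memory_seq_cp  (mem, seq_id_best, 0, -1, -1); // copy the winner's cells over to sequence 0
    llama_memory_seq_rm  (mem, seq_id_best, -1, -1);    // remove the now-redundant original

    for (int s = 1; s < W + G + 1; ++s) {
        llama_memory_seq_cp(mem, 0, s, -1, -1);         // re-share with all lookahead sequences
    }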

examples/lookup/lookup.cpp

Lines changed: 1 addition & 1 deletion

@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
 
         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+        llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
 
         common_batch_clear(batch_tgt);
         common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
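lookup.cpp needs only the one-shot form: with a single call site there is no point caching the handle, so llama_get_memory(ctx) is called inline to roll sequence 0 back to n_past, discarding whatever draft tokens the target model rejected.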
