
Commit f3d1227

Merge branch 'ggerganov:master' into snowflake-arctic-clean
2 parents: 0cffda8 + d46dbc7


136 files changed: +8,572 / −3,402 lines


.flake8

Lines changed: 15 additions & 1 deletion
@@ -1,3 +1,17 @@
 [flake8]
 max-line-length = 125
-ignore = W503
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10
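For reference, a minimal sketch of exercising the new lint configuration locally (assumes Python and pip are available; the flake8-no-print plugin matches the workflow change below):

```bash
# From the repository root: flake8 picks up the .flake8 file automatically,
# so the expanded ignore/exclude lists apply without extra flags.
pip install flake8 flake8-no-print
flake8
```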

.github/workflows/bench.yml

Lines changed: 13 additions & 1 deletion
@@ -52,7 +52,19 @@ jobs:
       ftype: q4_0
       pr_comment_enabled: "true"

-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )

     steps:
       - name: Clone
         id: checkout
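The rewritten condition keys off inputs.gpu-series, which suggests a workflow_dispatch trigger. If so, a manual run could be started with the GitHub CLI (illustrative sketch; the input name is taken from the expression above, everything else is an assumption):

```bash
# Manually dispatch the bench workflow with the T4 GPU series selected.
gh workflow run bench.yml -f gpu-series=Standard_NC4as_T4_v3
```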

.github/workflows/python-lint.yml

Lines changed: 1 addition & 2 deletions
@@ -20,5 +20,4 @@ jobs:
     - name: flake8 Lint
       uses: py-actions/flake8@v2
       with:
-        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-        exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
+        plugins: "flake8-no-print"

.pre-commit-config.yaml

Lines changed: 3 additions & 2 deletions
@@ -3,13 +3,14 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.6.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
   - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
+  rev: 7.0.0
   hooks:
   - id: flake8
+    additional_dependencies: [flake8-no-print]
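To pick up the bumped hook revisions locally, the standard pre-commit workflow applies (a sketch; assumes pre-commit is installed via pip):

```bash
# Reinstall the git hooks and run them once across the whole tree;
# pre-commit resolves the pinned revs (v4.6.0, flake8 7.0.0) and the
# flake8-no-print additional dependency on first use.
pip install pre-commit
pre-commit install
pre-commit run --all-files
```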

CMakeLists.txt

Lines changed: 12 additions & 2 deletions
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)

@@ -403,12 +405,16 @@ if (LLAMA_CUDA)
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

     add_compile_definitions(GGML_USE_CUDA)
+    add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)

@@ -425,7 +431,7 @@ if (LLAMA_CUDA)

     if (LLAMA_STATIC)
         if (WIN32)
-            # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
         else ()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)

@@ -434,7 +440,11 @@ if (LLAMA_CUDA)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+    if (LLAMA_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+    endif()

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
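A minimal configure/build sketch using the new option (assumes a CUDA toolchain is installed; the flag names are taken from the build files above):

```bash
# With LLAMA_CUDA_NO_VMM=ON the build defines GGML_CUDA_NO_VMM and no longer
# links CUDA::cuda_driver; CUDA graphs support is enabled via GGML_CUDA_USE_GRAPHS.
cmake -B build -DLLAMA_CUDA=ON -DLLAMA_CUDA_NO_VMM=ON
cmake --build build --config Release
```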

Makefile

Lines changed: 2 additions & 3 deletions
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
 		./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 		continue; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

@@ -434,7 +433,7 @@ ifdef LLAMA_CUDA
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
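The Makefile path mirrors the CMake change by appending -DGGML_CUDA_USE_GRAPHS to MK_CPPFLAGS. A sketch of the corresponding make-based build (assumes a CUDA install at the default CUDA_PATH and the vocab model files present for the test target):

```bash
# Build with the CUDA backend via make; GGML_CUDA_USE_GRAPHS is now defined
# automatically. `make test` then exercises the updated tokenizer vocab list above.
make LLAMA_CUDA=1
make test
```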

README.md

Lines changed: 24 additions & 54 deletions
@@ -2,7 +2,7 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

-- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225

@@ -139,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
-- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -175,6 +175,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

@@ -712,6 +713,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

+Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
 ls ./models

@@ -933,25 +936,35 @@ If your issue is with model generation quality, then please at least scan the fo

 ### Android

-#### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```

-First, install the essential packages for termux:
+It's recommended to move your model inside the `~/` directory for best performance:
 ```
-pkg install clang wget git cmake
+cd storage/downloads
+mv model.gguf ~/
 ```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:

-You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.

+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
 $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)

@@ -973,53 +986,10 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```

-Here is a demo of an interactive session running on Pixel 5 phone:
+Here's a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
-```
-apt install libopenblas
-```
-
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
-```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
-```
-
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
-```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
-```
-
-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
-```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
-```
-
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
-```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-
-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
-
 ### Docker

 #### Prerequisites
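Following the new note about LLaMA 3, a hypothetical conversion run might look like this (the checkpoint directory and output path are placeholders; exact converter flags can vary by version):

```bash
# Convert a Hugging Face LLaMA 3 checkout with convert-hf-to-gguf.py,
# since convert.py does not support LLaMA 3.
python3 convert-hf-to-gguf.py path/to/Meta-Llama-3-8B --outfile models/llama-3-8b.gguf
```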

ci/run.sh

Lines changed: 6 additions & 5 deletions
@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {

     set -e

-    # TODO: too slow, run on dedicated node
-    #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

     set +e
 }

@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small

-    test $ret -eq 0 && gg_run test_scripts_debug
-    test $ret -eq 0 && gg_run test_scripts_release
+    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
+        test $ret -eq 0 && gg_run test_scripts_debug
+        test $ret -eq 0 && gg_run test_scripts_release
+    fi

     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
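With the new gating, the example test scripts run when GG_BUILD_CLOUD is unset or GG_BUILD_EXTRA_TESTS_0 is set. An illustrative local invocation (the positional output/mount directories are placeholders and assume the script's usual calling convention):

```bash
# Force the gguf-split/quantize test scripts on by setting the new variable.
mkdir -p tmp/results tmp/mnt
GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```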

common/common.cpp

Lines changed: 14 additions & 7 deletions
@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"

@@ -76,7 +78,7 @@ int32_t get_num_physical_cores() {
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
             break; // no more cpus

@@ -911,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.instruct = true;
         return true;
     }
+    if (arg == "-cnv" || arg == "--conversation") {
+        params.conversation = true;
+        return true;
+    }
     if (arg == "-cml" || arg == "--chatml") {
         params.chatml = true;
         return true;

@@ -1417,6 +1423,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --version                 show version and build info\n");
     printf("  -i, --interactive         run in interactive mode\n");
     printf("  --interactive-first       run in interactive mode and wait for input right away\n");
+    printf("  -cnv, --conversation      run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct          run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml            run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input         allows you to write or paste multiple lines without ending each in '\\'\n");

@@ -1964,18 +1971,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     try {
         metadata_in >> metadata;
         fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-        if (metadata.contains("url") && metadata["url"].is_string()) {
-            auto previous_url = metadata["url"].get<std::string>();
+        if (metadata.contains("url") && metadata.at("url").is_string()) {
+            auto previous_url = metadata.at("url").get<std::string>();
             if (previous_url != url) {
                 fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                 return false;
             }
         }
-        if (metadata.contains("etag") && metadata["etag"].is_string()) {
-            etag = metadata["etag"];
+        if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+            etag = metadata.at("etag");
         }
-        if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
-            last_modified = metadata["lastModified"];
+        if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+            last_modified = metadata.at("lastModified");
         }
     } catch (const nlohmann::json::exception & e) {
         fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ struct gpt_params {
     bool random_prompt    = false; // do not randomize prompt if none provided
     bool use_color        = false; // use color to distinguish generations and inputs
     bool interactive      = false; // interactive mode
+    bool conversation     = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml           = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
