
Commit e3d3de6

Revert "Squashed commit of the following:"
This reverts commit 8a8218c.
1 parent 8a8218c commit e3d3de6


47 files changed (+2222 / -7032 lines)

CMakeLists.txt

Lines changed: 6 additions & 14 deletions
@@ -41,10 +41,9 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -78,11 +77,8 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

     add_compile_definitions(GGML_USE_CUBLAS)
-    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
-
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -94,15 +90,6 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
 else()
     message(WARNING "cuBLAS not found")
 endif()
@@ -213,6 +200,11 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
     if (MSVC)
         # TODO: arm msvc?
     else()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+            # Apple M1, M2, etc.
+            # Raspberry Pi 3, 4, Zero 2 (64-bit)
+            add_compile_options(-mcpu=native)
+        endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
             add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

Makefile

Lines changed: 11 additions & 30 deletions
@@ -144,18 +144,16 @@ ifdef LLAMA_CUBLAS
 	CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
 	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
@@ -177,40 +175,23 @@ ifdef LLAMA_HIPBLAS
 	ROCM_PATH ?= /opt/rocm
 	CC := $(ROCM_PATH)/llvm/bin/clang
 	CXX := $(ROCM_PATH)/llvm/bin/clang++
-	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
+	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030
 	LLAMA_CUDA_DMMV_X ?= 64
-	LLAMA_CUDA_MMV_Y ?= 2
-	LLAMA_CUDA_FORCE_DMMV = true
+	LLAMA_CUDA_DMMV_Y ?= 2
 	CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 	CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 	LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
 	OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o

-ifdef LLAMA_CUDA_DMMV_X
-	CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-else
-	CXXFLAGS += -DGGML_CUDA_DMMV_X=32
-endif
-ifeq ($(LLAMA_CUDA_FORCE_DMMV), true)
-	CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif
-ifdef LLAMA_CUDA_MMV_Y
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-else ifdef LLAMA_CUDA_DMMV_Y
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
-else
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=1
-endif
-
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	CXXFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif

-ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
-
-
+ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
+	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
+	-DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 # DGGML_CUDA_DMMV_F16 does not currently work with AMD.
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
@@ -278,11 +259,11 @@ else
 	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 endif
 ifdef LLAMA_CLBLAST
-ifeq ($(UNAME_S),Darwin)
-	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-else
-	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-endif
+ifeq ($(UNAME_S),Darwin)
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+else
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+endif
 endif

 ifdef LLAMA_CUBLAS

README.md

Lines changed: 1 addition & 18 deletions
@@ -1,22 +1,5 @@
-# koboldcpp-ROCM
+# koboldcpp

-To install, run
-```make LLAMA_HIPBLAS=1```
-To use ROCM, set GPU layers with --gpulayers when starting koboldcpp
-Original [llama.cpp rocm port](https://github.com/ggerganov/llama.cpp/pull/1087) by SlyEcho, ported to koboldcpp by yellowrosecx
-
-Comparison with OpenCL using 6800xt
-| Model | Offloading Method | Time Taken - Processing 593 tokens| Time Taken - Generating 200 tokens| Total Time | Perf. Diff.
-|-----------------|----------------------------|--------------------|--------------------|------------|---|
-| Robin 7b q6_K |CLBLAST 6-t, All Layers on GPU | 6.8s (11ms/T) | 12.0s (60ms/T) | 18.7s (10.7T/s) | 1x
-| Robin 7b q6_K |ROCM 1-t, All Layers on GPU | 1.4s (2ms/T) | 5.5s (28ms/T) | 6.9s (29.1T/s)| **2.71x**
-| Robin 13b q5_K_M |CLBLAST 6-t, All Layers on GPU | 10.9s (18ms/T) | 16.7s (83ms/T) | 27.6s (7.3T/s) | 1x
-| Robin 13b q5_K_M |ROCM 1-t, All Layers on GPU | 2.4s (4ms/T) | 7.8s (39ms/T) | 10.2s (19.6T/s)| **2.63x**
-| Robin 33b q4_K_S |CLBLAST 6-t, 46/63 Layers on GPU | 23.2s (39ms/T) | 48.6s (243ms/T) | 71.9s (2.8T/s) | 1x
-| Robin 33b q4_K_S |CLBLAST 6-t, 50/63 Layers on GPU | 25.5s (43ms/T) | 44.6s (223ms/T) | 70.0s (2.9T/s) | 1x
-| Robin 33b q4_K_S |ROCM 6-t, 46/63 Layers on GPU | 14.6s (25ms/T) | 44.1s (221ms/T) | 58.7s (3.4T/s)| **1.19x**
-
---------
 A self contained distributable from Concedo that exposes llama.cpp function bindings, allowing it to be used via a simulated Kobold API endpoint.

 What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. In a tiny package around 20 MB in size, excluding model weights.

convert.py

Lines changed: 5 additions & 42 deletions
@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
-    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
+    return 1

 @dataclass
 class Params:
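The hunk above reverts find_n_mult() to a silent fallback: instead of raising when no feed-forward multiple matches, it returns 1. A minimal sketch of the resulting behaviour, assuming the usual candidate multiples that convert.py iterates over (the loop itself is outside this hunk):

```python
# Sketch only: the candidate list (256, 64, 4, 1) is an assumption; the hunk
# shows just the loop body and the changed fallback.
def find_n_mult(n_ff: int, n_embd: int) -> int:
    for n_mult in (256, 64, 4, 1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    return 1  # post-revert: fall back to 1 instead of raising an Exception

print(find_n_mult(11008, 4096))  # 7B LLaMA dimensions -> 256
```

The trade-off is that an unrecognised feed-forward size now converts with n_mult = 1 rather than aborting with an explicit error.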
@@ -154,15 +154,9 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

-        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
-
         n_head=n_embd // 128 # guessed

         return Params(
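For reference, the layer-count guess that survives the revert probes consecutive per-layer keys until one is missing. A self-contained sketch of that idiom with a dummy state dict (the key names match the hunk; the dict contents are made up):

```python
import itertools

# Dummy stand-in for a loaded checkpoint: 32 transformer layers.
model = {f"model.layers.{i}.self_attn.q_proj.weight": None for i in range(32)}

n_layer = next(i for i in itertools.count()
               if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
print(n_layer)  # 32
```

With the `n_layer < 1` guard removed, a checkpoint matching neither naming scheme would fall through with `n_layer = 0` and fail later, instead of stopping with the config.json hint.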
@@ -327,10 +321,6 @@ def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
     def permute(self, n_head: int) -> 'Tensor': ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def part(self, n_part: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...


@@ -355,14 +345,6 @@ def astype(self, data_type: DataType) -> Tensor:
     def to_ggml(self) -> 'UnquantizedTensor':
         return self

-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
-    def part(self, n_part: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
     def permute(self, n_head: int) -> 'UnquantizedTensor':
         return UnquantizedTensor(permute(self.ndarray, n_head))

@@ -660,19 +642,6 @@ def load() -> Tensor:
         return lazy_tensor.load().permute(n_head)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)

-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute_part(n_part, n_head)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
-
-def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().part(n_part)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)

 def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
@@ -681,17 +650,11 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out["output.weight"] = model["lm_head.weight"]

     for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
-        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
-            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
-        else:
+        if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

         out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

examples/alpaca.sh

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+./main -m ./models/ggml-alpaca-7b-q4.bin \
     --color \
     -f ./prompts/alpaca.txt \
     --ctx_size 2048 \

examples/common.h

Lines changed: 1 addition & 2 deletions
@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,7 +59,6 @@ struct gpt_params {
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base = ""; // base model path for the lora adapter

-    bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp

Lines changed: 4 additions & 7 deletions
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed == LLAMA_DEFAULT_SEED) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
@@ -210,12 +210,9 @@ llama_token sampling_id(struct MyModel* mymodel) {
 const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
-    static std::string ret;
-    if (id == llama_token_eos()) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_str(ctx, id);
-    }
+    std::string ret;
+    if (id == llama_token_eos()) ret = "</s>";
+    else ret = llama_token_to_str(ctx, id);
     eval_id(mymodel, id);
     return ret.c_str();
 }

examples/embd-input/embd-input.h

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,7 @@
 #include "llama.h"
 #include "build-info.h"

+
 extern "C" {

 typedef struct MyModel {
@@ -13,13 +14,14 @@ typedef struct MyModel {
     int n_past = 0;
 } MyModel;

+
 struct MyModel* create_mymodel(int argc, char ** argv);

 bool eval_float(void* model, float* input, int N);
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 bool eval_id(struct MyModel* mymodel, int id);
 bool eval_string(struct MyModel* mymodel, const char* str);
-const char * sampling(struct MyModel* mymodel);
+const char* sampling(struct MyModel* mymodel);
 llama_token sampling_id(struct MyModel* mymodel);
 void free_mymodel(struct MyModel* mymodel);

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
