
Commit e3d3de6

Revert "Squashed commit of the following:"
This reverts commit 8a8218c.
1 parent 8a8218c commit e3d3de6


47 files changed (+2222 / -7032 lines)

CMakeLists.txt

Lines changed: 6 additions & 14 deletions
@@ -41,10 +41,9 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -78,11 +77,8 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

     add_compile_definitions(GGML_USE_CUBLAS)
-    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
-
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -94,15 +90,6 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
 else()
     message(WARNING "cuBLAS not found")
 endif()
@@ -213,6 +200,11 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
     if (MSVC)
         # TODO: arm msvc?
     else()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+            # Apple M1, M2, etc.
+            # Raspberry Pi 3, 4, Zero 2 (64-bit)
+            add_compile_options(-mcpu=native)
+        endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
             add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

Makefile

Lines changed: 11 additions & 30 deletions
@@ -144,18 +144,16 @@ ifdef LLAMA_CUBLAS
 	CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
 	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
@@ -177,40 +175,23 @@ ifdef LLAMA_HIPBLAS
 	ROCM_PATH ?= /opt/rocm
 	CC := $(ROCM_PATH)/llvm/bin/clang
 	CXX := $(ROCM_PATH)/llvm/bin/clang++
-	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
+	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030
 	LLAMA_CUDA_DMMV_X ?= 64
-	LLAMA_CUDA_MMV_Y ?= 2
-	LLAMA_CUDA_FORCE_DMMV = true
+	LLAMA_CUDA_DMMV_Y ?= 2
 	CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 	CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 	LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
 	OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o

-ifdef LLAMA_CUDA_DMMV_X
-	CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-else
-	CXXFLAGS += -DGGML_CUDA_DMMV_X=32
-endif
-ifeq ($(LLAMA_CUDA_FORCE_DMMV), true)
-	CXXFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif
-ifdef LLAMA_CUDA_MMV_Y
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-else ifdef LLAMA_CUDA_DMMV_Y
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
-else
-	CXXFLAGS += -DGGML_CUDA_MMV_Y=1
-endif
-
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	CXXFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif

-ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
-
-
+ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
+	-DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
+	-DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 # DGGML_CUDA_DMMV_F16 does not currently work with AMD.
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
@@ -278,11 +259,11 @@ else
 	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 endif
 ifdef LLAMA_CLBLAST
-ifeq ($(UNAME_S),Darwin)
-	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-else
-	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
-endif
+ifeq ($(UNAME_S),Darwin)
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+else
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+endif
 endif

 ifdef LLAMA_CUBLAS

README.md

Lines changed: 1 addition & 18 deletions
@@ -1,22 +1,5 @@
-# koboldcpp-ROCM
+# koboldcpp

-To install, run
-```make LLAMA_HIPBLAS=1```
-To use ROCM, set GPU layers with --gpulayers when starting koboldcpp
-Original [llama.cpp rocm port](https://github.com/ggerganov/llama.cpp/pull/1087) by SlyEcho, ported to koboldcpp by yellowrosecx
-
-Comparison with OpenCL using 6800xt
-| Model | Offloading Method | Time Taken - Processing 593 tokens| Time Taken - Generating 200 tokens| Total Time | Perf. Diff.
-|-----------------|----------------------------|--------------------|--------------------|------------|---|
-| Robin 7b q6_K |CLBLAST 6-t, All Layers on GPU | 6.8s (11ms/T) | 12.0s (60ms/T) | 18.7s (10.7T/s) | 1x
-| Robin 7b q6_K |ROCM 1-t, All Layers on GPU | 1.4s (2ms/T) | 5.5s (28ms/T) | 6.9s (29.1T/s)| **2.71x**
-| Robin 13b q5_K_M |CLBLAST 6-t, All Layers on GPU | 10.9s (18ms/T) | 16.7s (83ms/T) | 27.6s (7.3T/s) | 1x
-| Robin 13b q5_K_M |ROCM 1-t, All Layers on GPU | 2.4s (4ms/T) | 7.8s (39ms/T) | 10.2s (19.6T/s)| **2.63x**
-| Robin 33b q4_K_S |CLBLAST 6-t, 46/63 Layers on GPU | 23.2s (39ms/T) | 48.6s (243ms/T) | 71.9s (2.8T/s) | 1x
-| Robin 33b q4_K_S |CLBLAST 6-t, 50/63 Layers on GPU | 25.5s (43ms/T) | 44.6s (223ms/T) | 70.0s (2.9T/s) | 1x
-| Robin 33b q4_K_S |ROCM 6-t, 46/63 Layers on GPU | 14.6s (25ms/T) | 44.1s (221ms/T) | 58.7s (3.4T/s)| **1.19x**
-
---------
 A self contained distributable from Concedo that exposes llama.cpp function bindings, allowing it to be used via a simulated Kobold API endpoint.

 What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. In a tiny package around 20 MB in size, excluding model weights.

convert.py

Lines changed: 5 additions & 42 deletions
@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
-    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
+    return 1

 @dataclass
 class Params:
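The hunk above reverts find_n_mult() to a silent fallback: instead of raising when no feed-forward multiple matches, it returns 1. A minimal sketch of the resulting behaviour, assuming the usual candidate multiples that convert.py iterates over (the loop itself is outside this hunk):

```python
# Sketch only: the candidate list (256, 64, 4, 1) is an assumption; the hunk
# shows just the loop body and the changed fallback.
def find_n_mult(n_ff: int, n_embd: int) -> int:
    for n_mult in (256, 64, 4, 1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    return 1  # post-revert: fall back to 1 instead of raising an Exception

print(find_n_mult(11008, 4096))  # 7B LLaMA dimensions -> 256
```

The trade-off is that an unrecognised feed-forward size now converts with n_mult = 1 rather than aborting with an explicit error.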
@@ -154,15 +154,9 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

-        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
-
         n_head=n_embd // 128 # guessed

         return Params(
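For reference, the layer-count guess that survives the revert probes consecutive per-layer keys until one is missing. A self-contained sketch of that idiom with a dummy state dict (the key names match the hunk; the dict contents are made up):

```python
import itertools

# Dummy stand-in for a loaded checkpoint: 32 transformer layers.
model = {f"model.layers.{i}.self_attn.q_proj.weight": None for i in range(32)}

n_layer = next(i for i in itertools.count()
               if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
print(n_layer)  # 32
```

With the `n_layer < 1` guard removed, a checkpoint matching neither naming scheme would fall through with `n_layer = 0` and fail later, instead of stopping with the config.json hint.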
@@ -327,10 +321,6 @@ def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
     def permute(self, n_head: int) -> 'Tensor': ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def part(self, n_part: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...


@@ -355,14 +345,6 @@ def astype(self, data_type: DataType) -> Tensor:
     def to_ggml(self) -> 'UnquantizedTensor':
         return self

-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
-    def part(self, n_part: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
     def permute(self, n_head: int) -> 'UnquantizedTensor':
         return UnquantizedTensor(permute(self.ndarray, n_head))

@@ -660,19 +642,6 @@ def load() -> Tensor:
         return lazy_tensor.load().permute(n_head)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)

-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute_part(n_part, n_head)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
-
-def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().part(n_part)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)

 def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
@@ -681,17 +650,11 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out["output.weight"] = model["lm_head.weight"]

     for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
-        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
-            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
-        else:
+        if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

         out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

examples/alpaca.sh

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+./main -m ./models/ggml-alpaca-7b-q4.bin \
     --color \
     -f ./prompts/alpaca.txt \
     --ctx_size 2048 \

examples/common.h

Lines changed: 1 addition & 2 deletions
@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,7 +59,6 @@ struct gpt_params {
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base = ""; // base model path for the lora adapter

-    bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp

Lines changed: 4 additions & 7 deletions
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed == LLAMA_DEFAULT_SEED) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
@@ -210,12 +210,9 @@ llama_token sampling_id(struct MyModel* mymodel) {
 const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
-    static std::string ret;
-    if (id == llama_token_eos()) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_str(ctx, id);
-    }
+    std::string ret;
+    if (id == llama_token_eos()) ret = "</s>";
+    else ret = llama_token_to_str(ctx, id);
     eval_id(mymodel, id);
     return ret.c_str();
 }

examples/embd-input/embd-input.h

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,7 @@
 #include "llama.h"
 #include "build-info.h"

+
 extern "C" {

 typedef struct MyModel {
@@ -13,13 +14,14 @@ typedef struct MyModel {
     int n_past = 0;
 } MyModel;

+
 struct MyModel* create_mymodel(int argc, char ** argv);

 bool eval_float(void* model, float* input, int N);
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 bool eval_id(struct MyModel* mymodel, int id);
 bool eval_string(struct MyModel* mymodel, const char* str);
-const char * sampling(struct MyModel* mymodel);
+const char* sampling(struct MyModel* mymodel);
 llama_token sampling_id(struct MyModel* mymodel);
 void free_mymodel(struct MyModel* mymodel);

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
