Commit b2f5a0a

Merge remote-tracking branch 'ggerganov/master' into fix_decoding

* ggerganov/master:
  sync : ggml vulkan (ggml/0)
  yolo : add backend support (ggml/924)
  ggml : fix typo in ggml-quants.c comment (ggml/922)
  feat: add new `sin` and `cos` operators (ggml/919)
  readme : fix broken links (ggml-org#2358)
  examples : use colorblind friendly TTY color scheme (ggml-org#2360)

2 parents d64f20a + 9e3c534, commit b2f5a0a

51 files changed: +1630 / -145007 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ Supported platforms:
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
 - [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
 
-The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
+The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
 The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
 
 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.

examples/common.h
Lines changed: 37 additions & 5 deletions

@@ -9,6 +9,7 @@
 #include <thread>
 #include <ctime>
 #include <fstream>
+#include <sstream>
 
 #define COMMON_SAMPLE_RATE 16000
 
@@ -288,12 +289,43 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
 // Terminal utils
 //
 
-
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
+#define SQR(X)    ((X) * (X))
+#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
+
+/**
+ * Quantizes 24-bit RGB to xterm256 code range [16,256).
+ */
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5;
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+static std::string set_xterm256_foreground(int r, int g, int b) {
+    int x = rgb2xterm256(r, g, b);
+    std::ostringstream oss;
+    oss << "\033[38;5;" << x << "m";
+    return oss.str();
+}
+
+// Lowest is red, middle is yellow, highest is green. Color scheme from
+// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
 const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
 };
 
 //

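Note (not part of this commit): the `k_colors` palette above is what the example programs use to colorize output by per-token confidence. A minimal, hypothetical C++ sketch of how a caller might map a confidence value p in [0, 1] to one of these palette entries and reset the terminal afterwards; the helper name `colorize` is made up for illustration:

// Hypothetical usage sketch; assumes a palette like k_colors from examples/common.h.
#include <algorithm>
#include <string>
#include <vector>

static std::string colorize(const std::string & token, float p, const std::vector<std::string> & colors) {
    // clamp p*N to a valid palette index; low confidence -> red end, high confidence -> green end
    const int idx = std::max(0, std::min((int) colors.size() - 1, (int) (p * (float) colors.size())));
    return colors[idx] + token + "\033[0m"; // "\033[0m" resets terminal attributes after the token
}
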
ggml/include/ggml.h
Lines changed: 18 additions & 0 deletions

@@ -451,6 +451,8 @@ extern "C" {
         GGML_OP_SQR,
         GGML_OP_SQRT,
         GGML_OP_LOG,
+        GGML_OP_SIN,
+        GGML_OP_COS,
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
@@ -967,6 +969,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sin(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sin_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return scalar
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,

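Note (not part of this commit): a minimal CPU-only sketch of how the new operators could be used with the existing ggml context/graph API. It assumes the default CPU backend and the `ggml_graph_compute_with_ctx` helper; buffer sizes and the input values are arbitrary:

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL, false }; // 16 MB arena, tensor data allocated in-context
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * s = ggml_sin(ctx, x); // element-wise sin
    struct ggml_tensor * c = ggml_cos(ctx, x); // element-wise cos

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, s);
    ggml_build_forward_expand(gf, c);

    float * xd = (float *) x->data;
    for (int i = 0; i < 4; ++i) xd[i] = 0.5f*(float)i;

    ggml_graph_compute_with_ctx(ctx, gf, 1); // 1 thread, CPU backend

    for (int i = 0; i < 4; ++i) {
        printf("sin(%.2f) = %.4f, cos(%.2f) = %.4f\n",
               xd[i], ((float *) s->data)[i], xd[i], ((float *) c->data)[i]);
    }

    ggml_free(ctx);
    return 0;
}
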
ggml/src/ggml-cuda.cu
Lines changed: 12 additions & 0 deletions

@@ -2181,6 +2181,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ADD:
             ggml_cuda_op_add(ctx, dst);
             break;
+        case GGML_OP_SUB:
+            ggml_cuda_op_sub(ctx, dst);
+            break;
         case GGML_OP_ACC:
             ggml_cuda_op_acc(ctx, dst);
             break;
@@ -2267,6 +2270,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SQRT:
             ggml_cuda_op_sqrt(ctx, dst);
             break;
+        case GGML_OP_SIN:
+            ggml_cuda_op_sin(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cuda_op_cos(ctx, dst);
+            break;
         case GGML_OP_CLAMP:
             ggml_cuda_op_clamp(ctx, dst);
             break;
@@ -2853,12 +2862,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
        case GGML_OP_SQR:
         case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:

ggml/src/ggml-cuda/binbcast.cu
Lines changed: 8 additions & 0 deletions

@@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
     return a + b;
 }
 
+static __device__ __forceinline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
     return a * b;
 }
@@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }

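Note (not part of this commit): the new wrapper only has to instantiate the shared broadcasting kernel with the new device operator; the shape handling and launch configuration are reused. A rough host-side analogy of that compile-time dispatch pattern, with hypothetical names rather than the actual CUDA implementation:

#include <cstddef>

// element-wise operators, mirroring op_add/op_sub above
static float op_add_host(float a, float b) { return a + b; }
static float op_sub_host(float a, float b) { return a - b; }

// the operator is a template parameter, so each instantiation is a separate,
// fully inlined loop; the real kernel additionally handles shape broadcasting
template <float (*bin_op)(float, float)>
static void bin_op_host(const float * a, const float * b, float * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = bin_op(a[i], b[i]);
    }
}

// usage: bin_op_host<op_sub_host>(a, b, dst, n);
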
ggml/src/ggml-cuda/binbcast.cuh
Lines changed: 1 addition & 0 deletions

@@ -2,5 +2,6 @@
 
 void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/unary.cu
Lines changed: 56 additions & 0 deletions

@@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
     dst[i] = sqrtf(x[i]);
 }
 
+static __global__ void sin_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sinf(x[i]);
+}
+
+static __global__ void cos_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = cosf(x[i]);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
+    sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
+    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
@@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
+
+void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}

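Note (not part of this commit): like the other unary kernels, the launchers above round the grid size up with a ceiling division and rely on the `if (i >= k) return;` guard in the kernel to mask the extra threads in the last, partially filled block. A small standalone sketch of that arithmetic, using an assumed helper name `num_blocks`:

#include <cassert>

constexpr int kBlockSize = 256; // same value as CUDA_SIN_BLOCK_SIZE / CUDA_COS_BLOCK_SIZE

// ceil(k / kBlockSize): number of thread blocks needed to cover k elements
constexpr int num_blocks(int k) {
    return (k + kBlockSize - 1) / kBlockSize;
}

int main() {
    assert(num_blocks(1)    == 1); // a single element still needs one block
    assert(num_blocks(256)  == 1); // exactly one full block
    assert(num_blocks(257)  == 2); // one extra element spills into a second block
    assert(num_blocks(1000) == 4); // 4 * 256 = 1024 threads; 24 are masked by the bounds check
    return 0;
}
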
ggml/src/ggml-cuda/unary.cuh
Lines changed: 6 additions & 0 deletions

@@ -9,6 +9,8 @@
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_SQRT_BLOCK_SIZE 256
+#define CUDA_SIN_BLOCK_SIZE 256
+#define CUDA_COS_BLOCK_SIZE 256
 
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
@@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-metal.m
Lines changed: 61 additions & 1 deletion

@@ -31,6 +31,8 @@
 enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_ADD,
     GGML_METAL_KERNEL_TYPE_ADD_ROW,
+    GGML_METAL_KERNEL_TYPE_SUB,
+    GGML_METAL_KERNEL_TYPE_SUB_ROW,
     GGML_METAL_KERNEL_TYPE_MUL,
     GGML_METAL_KERNEL_TYPE_MUL_ROW,
     GGML_METAL_KERNEL_TYPE_DIV,
@@ -205,6 +207,9 @@
     GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
     GGML_METAL_KERNEL_TYPE_CONCAT,
     GGML_METAL_KERNEL_TYPE_SQR,
+    GGML_METAL_KERNEL_TYPE_SQRT,
+    GGML_METAL_KERNEL_TYPE_SIN,
+    GGML_METAL_KERNEL_TYPE_COS,
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
 
     GGML_METAL_KERNEL_TYPE_COUNT
@@ -491,6 +496,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
 
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,     add,     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB,     sub,     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,     mul,     true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,     div,     true);
@@ -665,6 +672,9 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,         concat,         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,            sqr,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,           sqrt,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,            sin,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,            cos,            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,       sum_rows,       true);
     }
 
@@ -765,15 +775,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
         case GGML_OP_PERMUTE:
         case GGML_OP_CONCAT:
         case GGML_OP_ADD:
+        case GGML_OP_SUB:
         case GGML_OP_ACC:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_REPEAT:
         case GGML_OP_SCALE:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
-            return true;
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
@@ -1050,6 +1065,7 @@ static enum ggml_status ggml_metal_graph_compute(
                     [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 } break;
             case GGML_OP_ADD:
+            case GGML_OP_SUB:
            case GGML_OP_MUL:
            case GGML_OP_DIV:
                 {
@@ -1073,6 +1089,7 @@ static enum ggml_status ggml_metal_graph_compute(
                         nb = ne00 / 4;
                         switch (dst->op) {
                             case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                            case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
                             case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
                             case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
                             default: GGML_ABORT("fatal error");
@@ -1082,6 +1099,7 @@ static enum ggml_status ggml_metal_graph_compute(
                     } else {
                         switch (dst->op) {
                             case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                            case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
                             case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
                             case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
                             default: GGML_ABORT("fatal error");
@@ -1409,6 +1427,48 @@ static enum ggml_status ggml_metal_graph_compute(
 
                     const int64_t n = ggml_nelements(dst);
 
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case GGML_OP_SQRT:
+                {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case GGML_OP_SIN:
+                {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case GGML_OP_COS:
+                {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
             case GGML_OP_SUM_ROWS:
