ggml : add ggml_scale_bias #14417
Changes from 12 commits
@@ -351,6 +351,70 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }

+inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, const float b) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &s, y, 1, n);
+    vDSP_vsadd(y, 1, &b, y, 1, n);
Review comment: There is vDSP_vsmsa.
Reply: Implemented in 563aca0.
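For reference, vDSP_vsmsa computes D[n] = A[n]*B + C in one pass, so the vsmul/vsadd pair above collapses into a single call. A minimal sketch of that form (the wrapper name here is illustrative, not from the PR):

    #include <Accelerate/Accelerate.h>

    // y[i] = y[i]*s + b with one fused Accelerate call instead of vsmul followed by vsadd
    static void vec_mad1_accelerate(const int n, float * y, const float s, const float b) {
        vDSP_vsmsa(y, 1, &s, &b, y, 1, (vDSP_Length) n);
    }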
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+    const int ggml_f32_epr  = sve_register_length / 32; // SVE128:4, SVE256:8, SVE512:16
+    const int ggml_f32_step = 2 * ggml_f32_epr;
+
+    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+    const int np = (n & ~(ggml_f32_step - 1));
+    svfloat32_t ay1;
+    svfloat32_t ay2;
+    for (int i = 0; i < np; i += ggml_f32_step) {
+        ay1 = GGML_F32_VEC_LOAD(y + i);
+        ay1 = GGML_F32_VEC_FMA(vb, ay1, vs); // y*s + b, since GGML_F32_VEC_FMA(a, b, c) computes a + b*c
+        GGML_F32_VEC_STORE(y + i, ay1);
+
+        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+        ay2 = GGML_F32_VEC_FMA(vb, ay2, vs);
+        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+    }
+    // leftovers
+    // the number of leftover elements is less than ggml_f32_epr; apply predicated mul/add on the available elements only
+    if (np < n) {
+        svbool_t pg = svwhilelt_b32(np, n);
+        ay1 = svld1_f32(pg, y + np);
+        ay1 = svmul_f32_m(pg, ay1, vs);
+        ay1 = svadd_f32_m(pg, ay1, vb);
+        svst1_f32(pg, y + np, ay1);
+    }
+    #else
Review comment: Remove this SVE implementation - we don't have hardware to test it yet.
Reply: Done in 50c678f.
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs); // y*s + b, since GGML_F32_VEC_FMA(a, b, c) computes a + b*c
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = y[i]*s + b;
+    }
+    #endif
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = y[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
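To pin down the intended semantics of ggml_vec_mad1_f32 independent of the SIMD paths, here is a self-contained scalar reference with a worked example (illustration only, not part of the PR):

    #include <stdio.h>

    // reference semantics: y[i] = y[i]*s + b, in place
    static void vec_mad1_ref(const int n, float * y, const float s, const float b) {
        for (int i = 0; i < n; ++i) {
            y[i] = y[i]*s + b;
        }
    }

    int main(void) {
        float y[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        vec_mad1_ref(4, y, 2.0f, 0.5f); // y becomes {2.5, 4.5, 6.5, 8.5}
        for (int i = 0; i < 4; ++i) {
            printf("%.1f ", y[i]);
        }
        printf("\n");
        return 0;
    }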
@@ -1,18 +1,18 @@
#include "scale.cuh" | ||
|
||
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { | ||
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) { | ||
const int i = blockDim.x*blockIdx.x + threadIdx.x; | ||
|
||
if (i >= k) { | ||
return; | ||
} | ||
|
||
dst[i] = scale * x[i]; | ||
dst[i] = scale * x[i] + bias; | ||
} | ||
|
||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { | ||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) { | ||
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; | ||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); | ||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k); | ||
} | ||
|
||
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||
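As an aside on the launch math: num_blocks uses the usual ceiling division so every element is covered. Assuming CUDA_SCALE_BLOCK_SIZE is 256, k = 1000 gives num_blocks = (1000 + 255) / 256 = 4; the last block is only partially used, and the early-return guard in the kernel keeps its excess threads from touching memory.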
@@ -25,7 +25,9 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

     float scale;
+    float bias;
     memcpy(&scale, dst->op_params, sizeof(float));
+    memcpy(&bias, (float *) dst->op_params + 1, sizeof(float));
Review comment: Make this more consistent:

    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
    memcpy(&bias,  (float *) dst->op_params + 1, sizeof(float));
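For background, op_params on ggml_tensor is a raw int32_t array, so float operands are bit-copied in and out with memcpy rather than stored through typed fields. A hypothetical write-side helper (not from the PR) showing the same slot layout the reads above assume, scale at float slot 0 and bias at slot 1:

    #include <string.h>
    #include "ggml.h"

    // hypothetical helper: pack scale and bias into the op's param block
    static void set_scale_bias_params(struct ggml_tensor * t, float scale, float bias) {
        const float params[2] = { scale, bias };
        memcpy(t->op_params, params, sizeof(params)); // bit-copy floats into the int32_t-typed storage
    }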
-    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
+    scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream);
 }
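End to end, the op now computes dst = src0*scale + bias elementwise. A minimal sketch of graph-level usage, assuming the API added by this PR is ggml_scale_bias(ctx, a, s, b) alongside the existing ggml_scale:

    #include "ggml.h"

    // y = 2*x + 1 via the new op (signature assumed from the PR title)
    static struct ggml_tensor * apply_scale_bias(struct ggml_context * ctx, struct ggml_tensor * x) {
        return ggml_scale_bias(ctx, x, 2.0f, 1.0f);
    }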