Skip to content

ggml-cpu : Add GGML_CPU_FFAST_MATH for sine autovectorization #1243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
# Opt-in (default OFF): when set on a non-MSVC toolchain, ggml-cpu/unary-ops-ffast-math.cpp
# (the sin/cos ops) is compiled with -ffast-math; see src/ggml-cpu/CMakeLists.txt.
option(GGML_CPU_FFAST_MATH "ggml: use approximate math" OFF)
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
Expand Down
6 changes: 6 additions & 0 deletions src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/binary-ops.cpp
ggml-cpu/unary-ops.h
ggml-cpu/unary-ops.cpp
ggml-cpu/unary-ops.inc
ggml-cpu/unary-ops-ffast-math.cpp
ggml-cpu/simd-mappings.h
ggml-cpu/vec.h
ggml-cpu/vec.cpp
Expand Down Expand Up @@ -64,6 +66,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
endif()

# Add -ffast-math to ggml-cpu/unary-ops-ffast-math.cpp only; no other source file
# receives the flag from this option.
# In the RelWithDebInfo configuration -O3 is appended as well
# (presumably so the loop is vectorized in that config too -- TODO confirm).
# The block is skipped for MSVC, so the GCC/Clang-style flags are never passed to it.
if (GGML_CPU_FFAST_MATH AND NOT MSVC)
set_source_files_properties(ggml-cpu/unary-ops-ffast-math.cpp PROPERTIES COMPILE_FLAGS "-ffast-math $<$<CONFIG:RelWithDebInfo>:-O3>")
endif()

if (GGML_LLAMAFILE)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)

Expand Down
21 changes: 21 additions & 0 deletions src/ggml-cpu/unary-ops-ffast-math.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include "unary-ops.inc"

// This file is compiled with -ffast-math only when GGML_CPU_FFAST_MATH is enabled.
// With fast-math, libmvec can vectorize sinf/cosf, but its results are not bit-identical to libm's.
// Other backends (e.g. CUDA) aren't bit-identical either, but users more often expect the CPU backend to be.

// Scalar sine of x, computed with sinf.
// Used as the `op` template argument of unary_op in ggml_compute_forward_sin.
// Kept as a bare sinf call; per the file comment above, the goal is to let the
// compiler vectorize it when this file is built with -ffast-math.
static inline float op_sin(float x) {
return sinf(x);
}

// Scalar cosine of x, computed with cosf.
// Used as the `op` template argument of unary_op in ggml_compute_forward_cos.
// Kept as a bare cosf call; per the file comment above, the goal is to let the
// compiler vectorize it when this file is built with -ffast-math.
static inline float op_cos(float x) {
return cosf(x);
}

// Element-wise sine: applies op_sin to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sin>(params, dst);
}

// Element-wise cosine: applies op_cos to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_cos>(params, dst);
}
75 changes: 1 addition & 74 deletions src/ggml-cpu/unary-ops.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "unary-ops.h"
#include "unary-ops.inc"

static inline float op_abs(float x) {
return fabsf(x);
Expand Down Expand Up @@ -52,75 +52,10 @@ static inline float op_sqrt(float x) {
return sqrtf(x);
}

// Scalar sine of x, computed with sinf.
// Used as the `op` template argument of unary_op in ggml_compute_forward_sin.
static inline float op_sin(float x) {
return sinf(x);
}

// Scalar cosine of x, computed with cosf.
// Used as the `op` template argument of unary_op in ggml_compute_forward_cos.
static inline float op_cos(float x) {
return cosf(x);
}

// Scalar natural logarithm of x, computed with logf.
// Used as the `op` template argument of unary_op in ggml_compute_forward_log.
static inline float op_log(float x) {
return logf(x);
}

// Applies `op` to each of the `n` elements of `x` and stores the results in `y`.
//
// Every element is converted to float with type_conversion_table<src0_t>::to_f32,
// passed through `op`, and converted to the destination type with
// type_conversion_table<dst_t>::from_f32.
//
// n: number of elements to process
// y: destination buffer, written at indices [0, n)
// x: source buffer, read at indices [0, n)
template <float (*op)(float), typename src0_t, typename dst_t>
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;

    // The counter has the same width as n: an `int` counter would overflow
    // (undefined behaviour) before reaching n when n > INT_MAX.
    for (int64_t i = 0; i < n; i++) {
        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
    }
}

// Runs `op` element-wise over the rows assigned to the calling thread.
//
// The input is dst->src[0]. Both tensors must satisfy ggml_is_contiguous_1 and
// have the same shape, and their element strides must equal sizeof(src0_t) /
// sizeof(dst_t). The row range comes from get_thread_range(params, src0); each
// row index is decomposed into (i03, i02, i01) and the row of ne0 elements is
// handed to vec_unary_op.
template <float (*op)(float), typename src0_t, typename dst_t>
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));

    GGML_TENSOR_UNARY_OP_LOCALS

    GGML_ASSERT( nb0 == sizeof(dst_t));
    GGML_ASSERT(nb00 == sizeof(src0_t));

    const auto [row_begin, row_end] = get_thread_range(params, src0);

    char       * const out_base = (char *) dst->data;
    const char * const in_base  = (const char *) src0->data;

    for (int64_t row = row_begin; row < row_end; ++row) {
        // Split the flat row index into its dim-3 / dim-2 / dim-1 coordinates.
        const int64_t i03  = row/(ne02*ne01);
        const int64_t rest = row - i03*ne02*ne01;
        const int64_t i02  = rest/ne01;
        const int64_t i01  = rest - i02*ne01;

        dst_t        * out_row = (dst_t *)        (out_base + i03*nb3  + i02*nb2  + i01*nb1 );
        const src0_t * in_row  = (const src0_t *) (in_base  + i03*nb03 + i02*nb02 + i01*nb01);

        vec_unary_op<op>(ne0, out_row, in_row);
    }
}

// Dispatches `op` to the apply_unary_op instantiation matching the element
// types of dst->src[0] and dst. Handled combinations (src0 -> dst):
// F32 -> F32, F16 -> F16, BF16 -> BF16, BF16 -> F32, F16 -> F32.
// Any other combination is reported on stderr and aborts.
//
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
template <float (*op)(float)>
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    const auto in_type  = src0->type;
    const auto out_type = dst->type;

    if (in_type == GGML_TYPE_F32 && out_type == GGML_TYPE_F32) { // all f32
        apply_unary_op<op, float, float>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_F16 && out_type == GGML_TYPE_F16) { // all f16
        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_BF16 && out_type == GGML_TYPE_BF16) { // all bf16
        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_BF16 && out_type == GGML_TYPE_F32) {
        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_F16 && out_type == GGML_TYPE_F32) {
        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
        return;
    }

    // No kernel for this type pair.
    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
        ggml_type_name(dst->type), ggml_type_name(src0->type));
    GGML_ABORT("fatal error");
}

// Element-wise absolute value: applies op_abs to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_abs>(params, dst);
}
Expand Down Expand Up @@ -173,14 +108,6 @@ void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor *
unary_op<op_sqrt>(params, dst);
}

// Element-wise sine: applies op_sin to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_sin>(params, dst);
}

// Element-wise cosine: applies op_cos to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_cos>(params, dst);
}

// Element-wise natural logarithm: applies op_log to dst->src[0] and stores the result in dst.
// params: forwarded unchanged to unary_op.
// dst:    destination tensor; its src[0] is the input. unary_op aborts on
//         src/dst type combinations it does not handle.
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_log>(params, dst);
}
58 changes: 58 additions & 0 deletions src/ggml-cpu/unary-ops.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#include "unary-ops.h"

// Applies `op` to each of the `n` elements of `x` and stores the results in `y`.
//
// Every element is converted to float with type_conversion_table<src0_t>::to_f32,
// passed through `op`, and converted to the destination type with
// type_conversion_table<dst_t>::from_f32.
//
// n: number of elements to process
// y: destination buffer, written at indices [0, n)
// x: source buffer, read at indices [0, n)
template <float (*op)(float), typename src0_t, typename dst_t>
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;

    // The counter has the same width as n: an `int` counter would overflow
    // (undefined behaviour) before reaching n when n > INT_MAX.
    for (int64_t i = 0; i < n; i++) {
        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
    }
}

// Runs `op` element-wise over the rows assigned to the calling thread.
//
// The input is dst->src[0]. Both tensors must satisfy ggml_is_contiguous_1 and
// have the same shape, and their element strides must equal sizeof(src0_t) /
// sizeof(dst_t). The row range comes from get_thread_range(params, src0); each
// row index is decomposed into (i03, i02, i01) and the row of ne0 elements is
// handed to vec_unary_op.
template <float (*op)(float), typename src0_t, typename dst_t>
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));

    GGML_TENSOR_UNARY_OP_LOCALS

    GGML_ASSERT( nb0 == sizeof(dst_t));
    GGML_ASSERT(nb00 == sizeof(src0_t));

    const auto [row_begin, row_end] = get_thread_range(params, src0);

    char       * const out_base = (char *) dst->data;
    const char * const in_base  = (const char *) src0->data;

    for (int64_t row = row_begin; row < row_end; ++row) {
        // Split the flat row index into its dim-3 / dim-2 / dim-1 coordinates.
        const int64_t i03  = row/(ne02*ne01);
        const int64_t rest = row - i03*ne02*ne01;
        const int64_t i02  = rest/ne01;
        const int64_t i01  = rest - i02*ne01;

        dst_t        * out_row = (dst_t *)        (out_base + i03*nb3  + i02*nb2  + i01*nb1 );
        const src0_t * in_row  = (const src0_t *) (in_base  + i03*nb03 + i02*nb02 + i01*nb01);

        vec_unary_op<op>(ne0, out_row, in_row);
    }
}

// Dispatches `op` to the apply_unary_op instantiation matching the element
// types of dst->src[0] and dst. Handled combinations (src0 -> dst):
// F32 -> F32, F16 -> F16, BF16 -> BF16, BF16 -> F32, F16 -> F32.
// Any other combination is reported on stderr and aborts.
//
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
template <float (*op)(float)>
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    const auto in_type  = src0->type;
    const auto out_type = dst->type;

    if (in_type == GGML_TYPE_F32 && out_type == GGML_TYPE_F32) { // all f32
        apply_unary_op<op, float, float>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_F16 && out_type == GGML_TYPE_F16) { // all f16
        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_BF16 && out_type == GGML_TYPE_BF16) { // all bf16
        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_BF16 && out_type == GGML_TYPE_F32) {
        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
        return;
    }
    if (in_type == GGML_TYPE_F16 && out_type == GGML_TYPE_F32) {
        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
        return;
    }

    // No kernel for this type pair.
    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
        ggml_type_name(dst->type), ggml_type_name(src0->type));
    GGML_ABORT("fatal error");
}