Add support for google-benchmark #1237

Open · wants to merge 1 commit into master
CMakeLists.txt (7 additions, 2 deletions)
@@ -209,8 +209,9 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
option(GGML_BUILD_BENCHMARKS "ggml: build benchmarks" ${GGML_STANDALONE})

#
# dependencies
@@ -247,6 +248,10 @@ if (GGML_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif ()

if (GGML_BUILD_BENCHMARKS)
    add_subdirectory(benchmarks)
endif ()

#
# install
#
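With the new option in place, the benchmarks are presumably built by configuring with it enabled, for example "cmake -B build -DGGML_BUILD_BENCHMARKS=ON" followed by "cmake --build build --target benchmark-mul-mat". The option defaults to the value of GGML_STANDALONE, so standalone builds of ggml should pick it up automatically; any additional backend flags (CUDA, Metal) depend on the local setup.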
benchmarks/CMakeLists.txt (16 additions, 0 deletions)
@@ -0,0 +1,16 @@
include(ExternalProject)
find_package(Git REQUIRED)

include(FetchContent)

FetchContent_Declare(
    googlebenchmark
    GIT_REPOSITORY https://github.com/google/benchmark.git
    GIT_TAG        v1.9.1)
set(BENCHMARK_ENABLE_TESTING
    OFF
    CACHE BOOL "Turn off BENCHMARK_ENABLE_TESTING" FORCE)
FetchContent_MakeAvailable(googlebenchmark)

add_executable(benchmark-mul-mat ${PROJECT_SOURCE_DIR}/benchmarks/benchmark-mul-mat.cpp)
target_link_libraries(benchmark-mul-mat PRIVATE benchmark::benchmark ggml)
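The harness dependency is resolved at configure time: FetchContent pins google/benchmark to v1.9.1, and BENCHMARK_ENABLE_TESTING is forced off so that Google Benchmark's own unit tests are not built. The first configure therefore presumably needs network access, or an already-populated FetchContent source directory.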
benchmarks/benchmark-mul-mat.cpp (216 additions, 0 deletions)
@@ -0,0 +1,216 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"

#include <benchmark/benchmark.h>

#include <cassert>
#include <cstdint>
#include <cstring>
#include <random>
#include <stdexcept>
#include <thread>
#include <type_traits>
#include <vector>

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

template <typename ValueT>
void randomize(std::vector<ValueT>& data) {
    static_assert(std::is_floating_point_v<ValueT>);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<ValueT> dist(-1.0, 1.0);
    for (auto& element : data) {
        element = dist(gen);
    }
}

enum class BackendDevice { kCPU, kCUDA, kMETAL };

// initialize the requested backend; throws if it fails or was not compiled in
template <BackendDevice Device>
ggml_backend_t init_backend() {
    ggml_backend_t backend = NULL;

    if constexpr (Device == BackendDevice::kCPU) {
        backend = ggml_backend_cpu_init();
        if (!backend) {
            throw std::runtime_error("Failed to initialize CPU backend.");
        }
        return backend;
    } else if constexpr (Device == BackendDevice::kCUDA) {
#ifdef GGML_USE_CUDA
        backend = ggml_backend_cuda_init(0);
        if (!backend) {
            throw std::runtime_error("Failed to initialize CUDA backend.");
        }
        return backend;
#endif
    } else if constexpr (Device == BackendDevice::kMETAL) {
#ifdef GGML_USE_METAL
        backend = ggml_backend_metal_init();
        if (!backend) {
            throw std::runtime_error("Failed to initialize METAL backend.");
        }
        return backend;
#endif
    } else {
        throw std::runtime_error("Unknown backend type.");
    }
    // reached only when the requested backend was not compiled in (e.g. kCUDA without GGML_USE_CUDA)
    if (!backend) {
        throw std::runtime_error("Failed to initialize backend.");
    }
    return NULL;
}


template <BackendDevice Device, typename InputT, ggml_type GGML_TYPE>
static void bmMulMat(benchmark::State& state) {
    // define the matrix size; use power-of-two square matrices of equal size for simplicity
    const uint8_t log_2_size = state.range(0);
    assert(log_2_size < sizeof(size_t) * 8);
    const size_t M = 1 << log_2_size;
    const size_t N = 1 << log_2_size;
    const size_t K = 1 << log_2_size;

    const uint8_t log_2_threads = state.range(1);
    assert(log_2_threads < sizeof(uint8_t) * 8);
    const uint8_t num_cpu_threads = 1 << log_2_threads;
    if (num_cpu_threads > std::thread::hardware_concurrency()) {
        state.SkipWithError("Not enough CPU threads available.");
        return;  // skip the setup below; the benchmark is reported as skipped
    }

    // define input and output matrices
    std::vector<InputT> matrixA(M * K);
    std::vector<InputT> matrixB(K * N);
    std::vector<InputT> matrixC(M * N);

    // randomize input matrices
    randomize<InputT>(matrixA);
    randomize<InputT>(matrixB);

    // calculate GGML buffer sizes (A is M x K, B is K x N)
    const size_t buffer_size_A = (M * K) * ggml_type_size(GGML_TYPE);
    const size_t buffer_size_B = (K * N) * ggml_type_size(GGML_TYPE);
    const size_t overhead      = 1024;
    const size_t buffer_size   = buffer_size_A + buffer_size_B + overhead;

    constexpr uint8_t num_tensors = 2;
    struct ggml_init_params params{
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    // initialize the backend
    ggml_backend_t backend = init_backend<Device>();

    // allocate buffers
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buffer_size);

    // create context
    struct ggml_context* ctx = ggml_init(params);

    // create tensors
    struct ggml_tensor* a = ggml_new_tensor_2d(ctx, GGML_TYPE, K, M);
    struct ggml_tensor* b = ggml_new_tensor_2d(ctx, GGML_TYPE, K, N);

    // create an allocator
    struct ggml_tallocr alloc = ggml_tallocr_new(buffer);

    // alloc memory
    if (ggml_tallocr_alloc(&alloc, a) != GGML_STATUS_SUCCESS) {
        throw std::runtime_error("Failed to allocate tensor a.");
    }
    if (ggml_tallocr_alloc(&alloc, b) != GGML_STATUS_SUCCESS) {
        throw std::runtime_error("Failed to allocate tensor b.");
    }

    // load data to buffer
    if (ggml_backend_is_cpu(backend)
#ifdef GGML_USE_METAL
        || ggml_backend_is_metal(backend)
#endif
    ) {
        memcpy(a->data, matrixA.data(), ggml_nbytes(a));
        memcpy(b->data, matrixB.data(), ggml_nbytes(b));
    } else {
        ggml_backend_tensor_set(a, matrixA.data(), 0, ggml_nbytes(a));
        ggml_backend_tensor_set(b, matrixB.data(), 0, ggml_nbytes(b));
    }

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,  // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };

    // create a temporary context to build the graph
    struct ggml_context* ctx0 = ggml_init(params0);

    struct ggml_cgraph* gf = ggml_new_graph(ctx0);

    // zT = x @ yT
    struct ggml_tensor* result = ggml_mul_mat(ctx0, a, ggml_cont(ctx0, b));

    // z = (zT)T
    ggml_build_forward_expand(gf, ggml_cont(ctx0, ggml_transpose(ctx0, result)));

    // compute the required memory
    ggml_gallocr_reserve(allocr, gf);

    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, num_cpu_threads);
    }

    // only the graph computation is timed; setup and teardown stay outside the loop
    for (auto _ : state) {
        ggml_backend_graph_compute(backend, gf);
    }

    // each of the M*N output elements needs K multiply-add pairs -> 2*M*N*K flops per compute
    const size_t flops_per_iteration = 2 * M * N * K;
    state.counters["Flop/s"] = benchmark::Counter(flops_per_iteration, benchmark::Counter::kIsIterationInvariantRate,
                                                  benchmark::Counter::kIs1000);

    // free memory
    ggml_free(ctx0);
    ggml_free(ctx);
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
    ggml_gallocr_free(allocr);
}

const std::vector<int64_t> log_2_sizes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
const std::vector<int64_t> log_2_threads{0, 1, 2, 3, 4, 5, 6, 7, 8};
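// benchmark arguments: range(0) = log2 of the square matrix dimension, range(1) = log2 of the CPU thread count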

// CPU single-threaded, MxNxK: 1 x 1 x 1 -> 2048 x 2048 x 2048
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCPU, float, GGML_TYPE_F16)->ArgsProduct({log_2_sizes, {0}});
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCPU, float, GGML_TYPE_F32)->ArgsProduct({log_2_sizes, {0}});
// CPU multithreaded, MxNxK: 2048 x 2048 x 2048
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCPU, float, GGML_TYPE_F16)->ArgsProduct({{11}, log_2_threads});
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCPU, float, GGML_TYPE_F32)->ArgsProduct({{11}, log_2_threads});

#ifdef GGML_USE_CUDA
// CUDA single-device, MxNxK: 1 x 1 x 1 -> 2048 x 2048 x 2048
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCUDA, float, GGML_TYPE_F16)->ArgsProduct({log_2_sizes, {0}});
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kCUDA, float, GGML_TYPE_F32)->ArgsProduct({log_2_sizes, {0}});
#endif

#ifdef GGML_USE_METAL
// METAL single-device, MxNxK: 1 x 1 x 1 -> 2048 x 2048 x 2048
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kMETAL, float, GGML_TYPE_F16)->ArgsProduct({log_2_sizes, {0}});
BENCHMARK_TEMPLATE(bmMulMat, BackendDevice::kMETAL, float, GGML_TYPE_F32)->ArgsProduct({log_2_sizes, {0}});
#endif

BENCHMARK_MAIN();
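BENCHMARK_MAIN() provides the command-line entry point, so the binary accepts the standard Google Benchmark flags; a subset of the cases can presumably be selected with something like "./benchmark-mul-mat --benchmark_filter=kCPU", and "--benchmark_min_time" can be raised for more stable Flop/s readings on the larger sizes.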