Commit beff9ca

ggml/ggml-vulkan/test-backend-ops: adds CONV_2D op with a faster, more memory-efficient direct GEMM-based Vulkan implementation
* ggml: adds op GGML_OP_CONV_2D and ggml_conv_2d_direct
* ggml-vulkan: adds an f32 scalar shader that computes the 2D convolution directly with GEMM (no im2col needed)
* test-backend-ops: adds test_case_ref to check the validity/performance of ops against reference implementations that use different graphs
Parent: bb16041 · Commit: beff9ca

File tree

9 files changed: +947 -4 lines changed


ggml/include/ggml-backend.h

Lines changed: 2 additions & 0 deletions
@@ -340,6 +340,8 @@ extern "C" {

    // Compare the output of two backends
    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+   // Compare the output of two backends, graphs can be different and only the selected nodes will be compared
+   GGML_API bool ggml_backend_compare_graph_backend_node(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph1, struct ggml_cgraph * graph2, ggml_backend_eval_callback callback, void * user_data, char* op_name_out_1, char* op_name_out_2);

    // Tensor initialization
    GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
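
For orientation, a minimal sketch of how the new comparison entry point might be driven from a test harness; the callback shape follows ggml_backend_eval_callback, while the backends, graphs, and node names below are illustrative assumptions rather than code from this commit:

// Sketch only: the backends and graphs are assumed to be built elsewhere,
// and the node names "out_direct"/"out_reference" are placeholders.
#include "ggml-backend.h"

static bool compare_cb(int /*node_index*/, ggml_tensor * t1, ggml_tensor * t2, void * /*user_data*/) {
    // t1/t2 are the selected output nodes from graph1/graph2; compute an
    // error metric here (for example an NMSE threshold) and return false
    // to report a mismatch.
    return true;
}

// bool ok = ggml_backend_compare_graph_backend_node(
//         backend_vulkan, backend_reference,
//         graph_direct, graph_im2col,
//         compare_cb, /*user_data=*/nullptr,
//         /*op_name_out_1=*/"out_direct", /*op_name_out_2=*/"out_reference");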

ggml/include/ggml.h

Lines changed: 12 additions & 0 deletions
@@ -481,6 +481,7 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
+       GGML_OP_CONV_2D,
        GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
@@ -1663,6 +1664,17 @@ extern "C" {
            int                   d0,  // dilation dimension 0
            int                   d1); // dilation dimension 1

+   GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,   // convolution kernel
+           struct ggml_tensor  * b,   // data
+           int                   stride0,    // stride dimension 0
+           int                   stride1,    // stride dimension 1
+           int                   padding0,   // padding dimension 0
+           int                   padding1,   // padding dimension 1
+           int                   dilation0,  // dilation dimension 0
+           int                   dilation1); // dilation dimension 1
+
    // kernel size is a->ne[0] x a->ne[1]
    // stride is equal to kernel size
    // padding is zero
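
As a usage illustration (a sketch under assumed shapes and context setup, not code from this commit), ggml_conv_2d_direct takes the kernel in [KW, KH, Cin, Cout] layout and the input in [W, H, Cin, N], matching the layouts noted in the Vulkan implementation below:

// Hypothetical shapes: 3x3 kernel, 64 -> 128 channels, one 56x56 image.
struct ggml_init_params params = { /*mem_size=*/ 256u*1024*1024, /*mem_buffer=*/ NULL, /*no_alloc=*/ false };
struct ggml_context * ctx = ggml_init(params);

struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 64, 128); // [KW, KH, Cin, Cout]
struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 56, 56, 64,   1); // [W,  H,  Cin, N]

// stride 1x1, padding 1x1, dilation 1x1 -> output [56, 56, 128, 1]
struct ggml_tensor * out = ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);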

ggml/src/ggml-backend.cpp

Lines changed: 49 additions & 0 deletions
@@ -1864,6 +1864,55 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
    return true;
}

+bool ggml_backend_compare_graph_backend_node(
+    ggml_backend_t backend1,
+    ggml_backend_t backend2,
+    struct ggml_cgraph * graph1,
+    struct ggml_cgraph * graph2,
+    ggml_backend_eval_callback callback, void * user_data, char* op_name_out_1, char* op_name_out_2) {
+
+    ggml_tensor * out1 = NULL;
+    ggml_tensor * out2 = NULL;
+
+    struct ggml_cgraph * g1 = graph1;
+    struct ggml_cgraph * g2 = graph2;
+
+    for (int i = 0; i < g1->n_nodes; i++) {
+        struct ggml_tensor * t1 = g1->nodes[i];
+        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+        ggml_backend_graph_compute(backend1, &g1v);
+        if (ggml_is_view_op(t1->op)) {
+            continue;
+        }
+        if(strcmp(t1 -> name, op_name_out_1) == 0){
+            out1 = t1;
+        }
+    }
+
+    for (int i = 0; i < g2->n_nodes; i++) {
+        struct ggml_tensor * t2 = g2->nodes[i];
+        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+        ggml_backend_graph_compute(backend2, &g2v);
+        if (ggml_is_view_op(t2->op)) {
+            continue;
+        }
+        if(strcmp(t2 -> name, op_name_out_2) == 0){
+            out2 = t2;
+        }
+    }
+
+    assert(out1 != NULL);
+    assert(out2 != NULL);
+    assert(ggml_are_same_layout(out1, out2));
+
+    // compare results, calculate rms etc
+    if (!callback(0, out1, out2, user_data)) {
+        return false;
+    }
+
+    return true;
+}
+
// CPU backend - buffer

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions
@@ -1858,6 +1858,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+       case GGML_OP_CONV_2D:
+           {
+               GGML_ABORT("Op not supported on CPU yet.");
+           } break;
        case GGML_OP_CONV_2D_DW:
            {
                ggml_compute_forward_conv_2d_dw(params, tensor);

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 180 additions & 1 deletion
@@ -457,6 +457,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
    vk_pipeline pipeline_opt_step_adamw_f32;
+   vk_pipeline pipeline_conv2d_f32;
    vk_pipeline pipeline_conv2d_dw_whcn_f32;
    vk_pipeline pipeline_conv2d_dw_cwhn_f32;

@@ -816,6 +817,38 @@ struct vk_op_rwkv_wkv7_push_constants {
    uint32_t H;
};

+struct vk_op_conv2d_push_constants {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t N;
+
+    uint32_t KW;
+    uint32_t KH;
+    uint32_t W;
+    uint32_t H;
+    uint32_t OW;
+    uint32_t OH;
+
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t p0;
+    uint32_t p1;
+    uint32_t d0;
+    uint32_t d1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+
+    uint32_t nb1;
+    uint32_t nb2;
+    uint32_t nb3;
+};
+
struct vk_op_conv2d_dw_push_constants {
    uint32_t ne;
    uint32_t batches;
@@ -916,16 +949,33 @@ class vk_memory_logger {
class vk_perf_logger {
public:
    void print_timings() {
+       if(timings.empty()){
+           return;
+       }
        std::cerr << "----------------\nVulkan Timings:" << std::endl;
        for (const auto& t : timings) {
            uint64_t total = 0;
            for (const auto& time : t.second) {
                total += time;
            }
-           std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
+           std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
+
+           // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
+           auto it = flops.find(t.first);
+           if(it != flops.end() && (it->second).size() == t.second.size()){
+               uint64_t total_nflops = 0;
+               for(const auto& elem : it->second){
+                   total_nflops += elem;
+               }
+               std::cout << " (" << (double(total_nflops)/(1000.0*1000.0*1000.0)) / (double(total)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
+           }
+
+           std::cerr << std::endl;
        }

        timings.clear();
+       flops.clear();
    }

    void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -944,12 +994,33 @@ class vk_perf_logger {
                name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
            }
            timings[name].push_back(time);
+           flops[name].push_back( m*n*(k+(k-1)) );
            return;
        }
+       if(node->op == GGML_OP_CONV_2D){
+           std::string name = ggml_op_name(node->op);
+           ggml_tensor * knl = node->src[0];
+           uint64_t OW = node->ne[0];
+           uint64_t OH = node->ne[1];
+           uint64_t N = node->ne[3];
+           uint64_t Cout = node->ne[2];
+           uint64_t KW = knl->ne[0];
+           uint64_t KH = knl->ne[1];
+           uint64_t Cin = knl->ne[2];
+           // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
+           uint64_t size_M = Cout;
+           uint64_t size_K = Cin*KW*KH;
+           uint64_t size_N = N*OW*OH;
+           uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
+           flops[name].push_back(n_flops);
+           timings[name].push_back(time);
+           return;
+       }
        timings[ggml_op_name(node->op)].push_back(time);
    }
private:
    std::map<std::string, std::vector<uint64_t>> timings;
+   std::map<std::string, std::vector<uint64_t>> flops;
};

struct ggml_backend_vk_context {
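
A note on the FLOP counts logged above (explanatory, not part of the diff): for an M x K by K x N matrix product, each output element costs K multiplications and K - 1 additions, so

    \mathrm{FLOPs} = M \cdot N \cdot (2K - 1)

which is m*n*(k+(k-1)) for MUL_MAT and size_M*size_N*(size_K+(size_K-1)) for CONV_2D, with M = Cout, K = Cin*KW*KH and N = N*OW*OH as in the KxCRS @ CRSxNPQ comment.
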
@@ -2806,6 +2877,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

    ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

+   ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
+
    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);

@@ -6578,6 +6651,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_leaky_relu_f32;
        }
        return nullptr;
+   case GGML_OP_CONV_2D:
+       if (src0->type == GGML_TYPE_F32 &&
+           src1->type == GGML_TYPE_F32 &&
+           dst->type == GGML_TYPE_F32 &&
+           ggml_is_contiguous(src0) &&
+           ggml_is_contiguous(src1) &&
+           ggml_is_contiguous(dst)) {
+           return ctx->device->pipeline_conv2d_f32;
+       }
+       return nullptr;
    case GGML_OP_CONV_2D_DW:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            if (ggml_is_contiguous(src1)) {
@@ -6899,6 +6982,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            const uint32_t OW = dst->ne[0];
            elements = { N * OC * OH * OW, 1, 1};
        } break;
+   case GGML_OP_CONV_2D:
+       {
+           // src0 - kernel: [KW, KH, Cin, Cout]
+           // src1 - input:  [W, H, Cin, N]
+           // dst  - result: [OW, OH, Cout, N]
+
+           // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+           auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+               return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+           };
+           // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
+           int64_t W = src1->ne[0];
+           int64_t H = src1->ne[1];
+           int64_t KW = src0->ne[0];
+           int64_t KH = src0->ne[1];
+           int64_t Cout = src0->ne[3];
+           int64_t N = src1->ne[3];
+           int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+           int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+           int64_t NPQ = N*OW*OH;
+
+           // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
+           elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
+       } break;
    case GGML_OP_ADD:
    case GGML_OP_SUB:
    case GGML_OP_DIV:
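
A quick worked check of the output-size formula above (illustrative numbers, not from the diff): with input extent 64, kernel size 3, stride 1, padding 1 and dilation 1,

    OW = \left\lfloor \frac{64 + 2 \cdot 1 - 1 \cdot (3 - 1) - 1}{1} \right\rfloor + 1 = 64

so a 'same'-padded 3x3 convolution preserves the spatial size; with stride 2 the same expression gives floor(63 / 2) + 1 = 32.
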
@@ -7753,6 +7860,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
    }, dryrun);
}

+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    vk_op_conv2d_push_constants p{};
+    p.Cout = static_cast<uint32_t>(ne03);
+    p.Cin = static_cast<uint32_t>(ne02);
+    p.N = static_cast<uint32_t>(ne13);
+
+    p.KW = static_cast<uint32_t>(ne00);
+    p.KH = static_cast<uint32_t>(ne01);
+    p.W = static_cast<uint32_t>(ne10);
+    p.H = static_cast<uint32_t>(ne11);
+    p.OW = static_cast<uint32_t>(ne0);
+    p.OH = static_cast<uint32_t>(ne1);
+
+    p.s0 = static_cast<uint32_t>(dst->op_params[0]);
+    p.s1 = static_cast<uint32_t>(dst->op_params[1]);
+    p.p0 = static_cast<uint32_t>(dst->op_params[2]);
+    p.p1 = static_cast<uint32_t>(dst->op_params[3]);
+    p.d0 = static_cast<uint32_t>(dst->op_params[4]);
+    p.d1 = static_cast<uint32_t>(dst->op_params[5]);
+
+    p.nb01 = static_cast<uint32_t>(nb01/nb00);
+    p.nb02 = static_cast<uint32_t>(nb02/nb00);
+    p.nb03 = static_cast<uint32_t>(nb03/nb00);
+
+    p.nb11 = static_cast<uint32_t>(nb11/nb10);
+    p.nb12 = static_cast<uint32_t>(nb12/nb10);
+    p.nb13 = static_cast<uint32_t>(nb13/nb10);
+
+    p.nb1 = static_cast<uint32_t>(nb1 / nb0);
+    p.nb2 = static_cast<uint32_t>(nb2 / nb0);
+    p.nb3 = static_cast<uint32_t>(nb3 / nb0);
+
+    GGML_ASSERT(ne03 == ne2);
+    GGML_ASSERT(ne02 == ne12);
+
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+}
+
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
    vk_op_conv2d_dw_push_constants p{};
    p.ne = ggml_nelements(dst);
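
To spell out the GEMM mapping used by ggml_vk_conv_2d and the conv2d_f32 shader (an explanatory note following the KxCRS @ CRSxNPQ comment above, not text from the diff): the convolution is evaluated as

    D = A B, \quad A \in \mathbb{R}^{Cout \times (Cin \cdot KH \cdot KW)}, \quad B \in \mathbb{R}^{(Cin \cdot KH \cdot KW) \times (N \cdot OH \cdot OW)}

where A is the flattened kernel and each column of B is an input patch gathered on the fly from src1 using s0/s1, p0/p1, d0/d1 and the element strides passed in the push constants, so no intermediate im2col matrix is materialized.
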
@@ -8799,6 +8955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_TIMESTEP_EMBEDDING:
    case GGML_OP_CONV_TRANSPOSE_1D:
    case GGML_OP_POOL_2D:
+   case GGML_OP_CONV_2D:
    case GGML_OP_CONV_2D_DW:
    case GGML_OP_RWKV_WKV6:
    case GGML_OP_RWKV_WKV7:
@@ -8864,6 +9021,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_TIMESTEP_EMBEDDING:
    case GGML_OP_CONV_TRANSPOSE_1D:
    case GGML_OP_POOL_2D:
+   case GGML_OP_CONV_2D:
    case GGML_OP_CONV_2D_DW:
    case GGML_OP_LEAKY_RELU:
        {
@@ -9042,6 +9200,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_POOL_2D:
        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);

+       break;
+   case GGML_OP_CONV_2D:
+       ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+
        break;
    case GGML_OP_CONV_2D_DW:
        ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9168,6 +9330,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
    case GGML_OP_TIMESTEP_EMBEDDING:
    case GGML_OP_CONV_TRANSPOSE_1D:
    case GGML_OP_POOL_2D:
+   case GGML_OP_CONV_2D:
    case GGML_OP_CONV_2D_DW:
    case GGML_OP_RWKV_WKV6:
    case GGML_OP_RWKV_WKV7:
@@ -10242,6 +10405,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            return true;
        case GGML_OP_CONV_TRANSPOSE_1D:
            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+       case GGML_OP_CONV_2D:
+           // Channel-contiguous format is not supported yet.
+           return (op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   op->type == GGML_TYPE_F32 &&
+                   ggml_is_contiguous(op->src[0]) &&
+                   ggml_is_contiguous(op->src[1]) &&
+                   ggml_is_contiguous(op));
        default:
            return false;
    }
@@ -10765,6 +10936,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
        const int32_t p1 = tensor->op_params[6];

        tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+   } else if (tensor->op == GGML_OP_CONV_2D) {
+       const int32_t s0 = tensor->op_params[0];
+       const int32_t s1 = tensor->op_params[1];
+       const int32_t p0 = tensor->op_params[2];
+       const int32_t p1 = tensor->op_params[3];
+       const int32_t d0 = tensor->op_params[4];
+       const int32_t d1 = tensor->op_params[5];
+       tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
        const float * op_params = (const float *)tensor->op_params;
        tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
