
Commit 720b483

ggml/ggml-vulkan/test-backend-ops: adds CONV_2D for Vulkan
* ggml-vulkan: adds an f32 scalar shader that computes 2D convolution directly with GEMM (no need for im2col)
* test-backend-ops: adds test_case_ref to check the validity/performance of ops against reference implementations with different graphs; adds tests
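
Background for the first change (paraphrasing the comments in the Vulkan diff below; the exact tiling lives in the new shader): the convolution is computed as an implicit GEMM, i.e. the kernel acts as the left matrix and the input patches are gathered on the fly instead of being materialized by im2col:

    M = Cout                  (K in the shader's KxCRS naming)
    K = Cin * KW * KH         (CRS)
    N = N_batch * OW * OH     (NPQ)
    dst viewed as M x N  =  kernel viewed as M x K  *  implicit im2col matrix of size K x N

The same M/K/N sizes are reused below for the FLOP accounting added to the Vulkan perf logger.
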
1 parent c31e606 commit 720b483

6 files changed: 1004 additions (+), 4 deletions (−)

ggml/include/ggml-backend.h

Lines changed: 2 additions & 0 deletions
@@ -340,6 +340,8 @@ extern "C" {
 
     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
+    // Compare the output of two backends, graphs can be different and only the selected nodes will be compared
+    GGML_API bool ggml_backend_compare_graph_backend_node(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph1, struct ggml_cgraph * graph2, ggml_backend_eval_callback callback, void * user_data, char * op_name_out_1, char * op_name_out_2);
 
     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

ggml/src/ggml-backend.cpp

Lines changed: 49 additions & 0 deletions
@@ -1882,6 +1882,55 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     return true;
 }
 
+bool ggml_backend_compare_graph_backend_node(
+    ggml_backend_t backend1,
+    ggml_backend_t backend2,
+    struct ggml_cgraph * graph1,
+    struct ggml_cgraph * graph2,
+    ggml_backend_eval_callback callback, void * user_data, char * op_name_out_1, char * op_name_out_2) {
+
+    ggml_tensor * out1 = NULL;
+    ggml_tensor * out2 = NULL;
+
+    struct ggml_cgraph * g1 = graph1;
+    struct ggml_cgraph * g2 = graph2;
+
+    for (int i = 0; i < g1->n_nodes; i++) {
+        struct ggml_tensor * t1 = g1->nodes[i];
+        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+        ggml_backend_graph_compute(backend1, &g1v);
+        if (ggml_is_view_op(t1->op)) {
+            continue;
+        }
+        if (strcmp(t1->name, op_name_out_1) == 0) {
+            out1 = t1;
+        }
+    }
+
+    for (int i = 0; i < g2->n_nodes; i++) {
+        struct ggml_tensor * t2 = g2->nodes[i];
+        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+        ggml_backend_graph_compute(backend2, &g2v);
+        if (ggml_is_view_op(t2->op)) {
+            continue;
+        }
+        if (strcmp(t2->name, op_name_out_2) == 0) {
+            out2 = t2;
+        }
+    }
+
+    assert(out1 != NULL);
+    assert(out2 != NULL);
+    assert(ggml_are_same_layout(out1, out2));
+
+    // compare results, calculate rms etc
+    if (!callback(0, out1, out2, user_data)) {
+        return false;
+    }
+
+    return true;
+}
+
 // CPU backend - buffer
 
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
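
A minimal usage sketch of the new entry point (not part of this commit; the backends, graphs, node names and the callback below are illustrative assumptions, and both graphs are expected to have been built and allocated elsewhere with their output tensors tagged via ggml_set_name):

    #include "ggml-backend.h"

    // matches ggml_backend_eval_callback: return false to flag a mismatch
    static bool check_nodes(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        // read both tensors back with ggml_backend_tensor_get() and compare them here (e.g. RMS error)
        (void) node_index; (void) t1; (void) t2; (void) user_data;
        return true;
    }

    static bool run_conv2d_check(ggml_backend_t backend_vk, ggml_backend_t backend_cpu,
                                 struct ggml_cgraph * graph_direct, struct ggml_cgraph * graph_reference) {
        // graph_direct contains the new GGML_OP_CONV_2D node, graph_reference an im2col-based equivalent;
        // only the two named output tensors are compared, the graphs themselves may differ
        return ggml_backend_compare_graph_backend_node(
            backend_vk, backend_cpu,
            graph_direct, graph_reference,
            check_nodes, /*user_data=*/NULL,
            /*op_name_out_1=*/(char *) "conv2d_direct_out",
            /*op_name_out_2=*/(char *) "conv2d_ref_out");
    }
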

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 180 additions & 1 deletion
@@ -482,6 +482,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
+    vk_pipeline pipeline_conv2d_f32;
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;

@@ -875,6 +876,38 @@ struct vk_op_rwkv_wkv7_push_constants {
     uint32_t H;
 };
 
+struct vk_op_conv2d_push_constants {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t N;
+
+    uint32_t KW;
+    uint32_t KH;
+    uint32_t W;
+    uint32_t H;
+    uint32_t OW;
+    uint32_t OH;
+
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t p0;
+    uint32_t p1;
+    uint32_t d0;
+    uint32_t d1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+
+    uint32_t nb1;
+    uint32_t nb2;
+    uint32_t nb3;
+};
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;

@@ -976,16 +1009,33 @@ class vk_memory_logger {
 class vk_perf_logger {
 public:
     void print_timings() {
+        if (timings.empty()) {
+            return;
+        }
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
         for (const auto& t : timings) {
             uint64_t total = 0;
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
+
+            // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
+            auto it = flops.find(t.first);
+            if (it != flops.end() && (it->second).size() == t.second.size()) {
+                uint64_t total_nflops = 0;
+                for (const auto& elem : it->second) {
+                    total_nflops += elem;
+                }
+                std::cout << " (" << (double(total_nflops)/(1000.0*1000.0*1000.0)) / (double(total)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
+            }
+
+            std::cerr << std::endl;
         }
 
         timings.clear();
+        flops.clear();
     }
 
     void log_timing(const ggml_tensor * node, uint64_t time) {

@@ -1004,12 +1054,33 @@ class vk_perf_logger {
                 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
             }
             timings[name].push_back(time);
+            flops[name].push_back( m*n*(k+(k-1)) );
             return;
         }
+        if (node->op == GGML_OP_CONV_2D) {
+            std::string name = ggml_op_name(node->op);
+            ggml_tensor * knl = node->src[0];
+            uint64_t OW = node->ne[0];
+            uint64_t OH = node->ne[1];
+            uint64_t N = node->ne[3];
+            uint64_t Cout = node->ne[2];
+            uint64_t KW = knl->ne[0];
+            uint64_t KH = knl->ne[1];
+            uint64_t Cin = knl->ne[2];
+            // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
+            uint64_t size_M = Cout;
+            uint64_t size_K = Cin*KW*KH;
+            uint64_t size_N = N*OW*OH;
+            uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
+            flops[name].push_back(n_flops);
+            timings[name].push_back(time);
+            return;
+        }
         timings[ggml_op_name(node->op)].push_back(time);
     }
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
+    std::map<std::string, std::vector<uint64_t>> flops;
 };
 
 struct ggml_backend_vk_context {
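
A hand-worked example of the FLOP bookkeeping above (shapes picked purely for illustration): a CONV_2D with Cin = 3, KW = KH = 3, Cout = 64, N = 1 and OW = OH = 224 gives size_M = 64, size_K = 27 and size_N = 50176, hence n_flops = 64 * 50176 * (2*27 - 1) ≈ 1.7e8. Since timings are accumulated in nanoseconds, a node that averages 1 ms would be reported as roughly 170 GFLOPS/s.
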
@@ -2955,6 +3026,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);

@@ -6803,6 +6876,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_leaky_relu_f32;
         }
         return nullptr;
+    case GGML_OP_CONV_2D:
+        if (src0->type == GGML_TYPE_F32 &&
+            src1->type == GGML_TYPE_F32 &&
+            dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) &&
+            ggml_is_contiguous(src1) &&
+            ggml_is_contiguous(dst)) {
+            return ctx->device->pipeline_conv2d_f32;
+        }
+        return nullptr;
     case GGML_OP_CONV_2D_DW:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             if (ggml_is_contiguous(src1)) {

@@ -7125,6 +7208,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             const uint32_t OW = dst->ne[0];
             elements = { N * OC * OH * OW, 1, 1};
         } break;
+    case GGML_OP_CONV_2D:
+        {
+            // src0 - kernel: [KW, KH, Cin, Cout]
+            // src1 - input:  [W, H, Cin, N]
+            // dst - result:  [OW, OH, Cout, N]
+
+            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+            };
+            // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
+            int64_t W = src1->ne[0];
+            int64_t H = src1->ne[1];
+            int64_t KW = src0->ne[0];
+            int64_t KH = src0->ne[1];
+            int64_t Cout = src0->ne[3];
+            int64_t N = src1->ne[3];
+            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+            int64_t NPQ = N*OW*OH;
+
+            // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
+            elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
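
Two sanity checks on the dispatch math above, with illustrative numbers only. The output-size lambda is the standard convolution formula: ins = 32, ks = 3, s = 1, p = 1, d = 1 gives (32 + 2 - 2 - 1)/1 + 1 = 32 (a same-size output), while s = 2 gives 31/2 + 1 = 16. The {128, 128, 1} passed when the conv2d_f32 pipeline is created are the per-workgroup tile sizes these element counts get divided by, so elements = {Cout, NPQ, 1} should translate into about ceil(Cout/128) x ceil(NPQ/128) workgroups; for Cout = 256 and NPQ = 1*224*224 = 50176 that is 2 x 392 tiles of the output matrix.
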
@@ -7991,6 +8098,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     }, dryrun);
 }
 
+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    vk_op_conv2d_push_constants p{};
+    p.Cout = static_cast<uint32_t>(ne03);
+    p.Cin = static_cast<uint32_t>(ne02);
+    p.N = static_cast<uint32_t>(ne13);
+
+    p.KW = static_cast<uint32_t>(ne00);
+    p.KH = static_cast<uint32_t>(ne01);
+    p.W = static_cast<uint32_t>(ne10);
+    p.H = static_cast<uint32_t>(ne11);
+    p.OW = static_cast<uint32_t>(ne0);
+    p.OH = static_cast<uint32_t>(ne1);
+
+    p.s0 = static_cast<uint32_t>(dst->op_params[0]);
+    p.s1 = static_cast<uint32_t>(dst->op_params[1]);
+    p.p0 = static_cast<uint32_t>(dst->op_params[2]);
+    p.p1 = static_cast<uint32_t>(dst->op_params[3]);
+    p.d0 = static_cast<uint32_t>(dst->op_params[4]);
+    p.d1 = static_cast<uint32_t>(dst->op_params[5]);
+
+    p.nb01 = static_cast<uint32_t>(nb01/nb00);
+    p.nb02 = static_cast<uint32_t>(nb02/nb00);
+    p.nb03 = static_cast<uint32_t>(nb03/nb00);
+
+    p.nb11 = static_cast<uint32_t>(nb11/nb10);
+    p.nb12 = static_cast<uint32_t>(nb12/nb10);
+    p.nb13 = static_cast<uint32_t>(nb13/nb10);
+
+    p.nb1 = static_cast<uint32_t>(nb1 / nb0);
+    p.nb2 = static_cast<uint32_t>(nb2 / nb0);
+    p.nb3 = static_cast<uint32_t>(nb3 / nb0);
+
+    GGML_ASSERT(ne03 == ne2);
+    GGML_ASSERT(ne02 == ne12);
+
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+}
+
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     vk_op_conv2d_dw_push_constants p{};
     p.ne = ggml_nelements(dst);
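
The nbXY push constants set above are strides in elements rather than bytes: each byte stride is divided by the element size, which the asserts pin to sizeof(float) for nb00, nb10 and nb0, because the f32 shader indexes whole floats. As an illustrative example, a contiguous kernel with (KW, KH, Cin, Cout) = (3, 3, 8, 16) has byte strides nb01 = 12, nb02 = 36, nb03 = 288, so the shader receives nb01 = 3, nb02 = 9 and nb03 = 72.
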
@@ -9053,6 +9209,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:

@@ -9120,6 +9277,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
    case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_LEAKY_RELU:
     {

@@ -9326,6 +9484,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_POOL_2D:
         ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_CONV_2D:
+        ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_CONV_2D_DW:
         ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);

@@ -9456,6 +9618,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:

@@ -10617,6 +10780,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return true;
         case GGML_OP_CONV_TRANSPOSE_1D:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_CONV_2D:
+            // Channel-contiguous format is not supported yet.
+            return (op->src[0]->type == GGML_TYPE_F32 &&
+                    op->src[1]->type == GGML_TYPE_F32 &&
+                    op->type == GGML_TYPE_F32 &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op->src[1]) &&
+                    ggml_is_contiguous(op));
         default:
             return false;
         }

@@ -11175,6 +11346,14 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         const int32_t p1 = tensor->op_params[6];
 
         tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+    } else if (tensor->op == GGML_OP_CONV_2D) {
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t s1 = tensor->op_params[1];
+        const int32_t p0 = tensor->op_params[2];
+        const int32_t p1 = tensor->op_params[3];
+        const int32_t d0 = tensor->op_params[4];
+        const int32_t d1 = tensor->op_params[5];
+        tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
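
Worth noting: the reference clone here is built with ggml_conv_2d, which ggml lowers to an im2col step followed by a matrix multiplication, so the backend's result checking (ggml_vk_check_results_0) effectively validates the new direct shader against the existing im2col-based path. A rough sketch of that reference lowering, using the public ggml_im2col API (the intermediate shape is indicative, not exact):

    // gather the receptive fields: roughly [Cin*KW*KH, OW, OH, N]; ggml_conv_2d uses f16 here by default
    struct ggml_tensor * cols = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1],
                                             s0, s1, p0, p1, d0, d1,
                                             /*is_2D=*/true, GGML_TYPE_F16);
    // a mul_mat against the reshaped kernel, followed by reshapes, yields [OW, OH, Cout, N]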
