@@ -482,6 +482,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
+    vk_pipeline pipeline_conv2d_f32;
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;
@@ -875,6 +876,38 @@ struct vk_op_rwkv_wkv7_push_constants {
     uint32_t H;
 };
 
+struct vk_op_conv2d_push_constants {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t N;
+
+    uint32_t KW;
+    uint32_t KH;
+    uint32_t W;
+    uint32_t H;
+    uint32_t OW;
+    uint32_t OH;
+
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t p0;
+    uint32_t p1;
+    uint32_t d0;
+    uint32_t d1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+
+    uint32_t nb1;
+    uint32_t nb2;
+    uint32_t nb3;
+};
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;
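This struct has to mirror, field for field, the push-constant block declared in the new conv2d shader; all `nb*` strides are passed in elements, not bytes (see `ggml_vk_conv_2d` further down). As a back-of-the-envelope check — a sketch, not part of the diff — the block fits comfortably inside the 128-byte push-constant budget that the Vulkan spec guarantees on every device:

```cpp
// Hypothetical sanity check, not in the PR: 24 uint32_t fields = 96 bytes,
// under the 128-byte minimum maxPushConstantsSize guaranteed by Vulkan.
static_assert(sizeof(vk_op_conv2d_push_constants) <= 128,
              "conv2d push constants must fit the guaranteed push-constant budget");
```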
@@ -976,16 +1009,33 @@ class vk_memory_logger {
 class vk_perf_logger {
 public:
     void print_timings() {
+        if (timings.empty()) {
+            return;
+        }
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
         for (const auto& t : timings) {
             uint64_t total = 0;
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
+
+            // If we have as many FLOP-count entries as timing entries for the op, compute and log the FLOPS/s.
+            auto it = flops.find(t.first);
+            if (it != flops.end() && it->second.size() == t.second.size()) {
+                uint64_t total_nflops = 0;
+                for (const auto& elem : it->second) {
+                    total_nflops += elem;
+                }
+                std::cerr << " (" << (double(total_nflops) / (1000.0 * 1000.0 * 1000.0)) / (double(total) / (1000.0 * 1000.0 * 1000.0)) << " GFLOPS/s)";
+            }
+
+            std::cerr << std::endl;
         }
 
         timings.clear();
+        flops.clear();
     }
 
     void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -1004,12 +1054,33 @@ class vk_perf_logger {
                 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
             }
             timings[name].push_back(time);
+            flops[name].push_back(m * n * (k + (k - 1)));
             return;
         }
+        if (node->op == GGML_OP_CONV_2D) {
+            std::string name = ggml_op_name(node->op);
+            ggml_tensor * knl = node->src[0];
+            uint64_t OW = node->ne[0];
+            uint64_t OH = node->ne[1];
+            uint64_t N = node->ne[3];
+            uint64_t Cout = node->ne[2];
+            uint64_t KW = knl->ne[0];
+            uint64_t KH = knl->ne[1];
+            uint64_t Cin = knl->ne[2];
+            // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
+            uint64_t size_M = Cout;
+            uint64_t size_K = Cin * KW * KH;
+            uint64_t size_N = N * OW * OH;
+            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            flops[name].push_back(n_flops);
+            timings[name].push_back(time);
+            return;
+        }
         timings[ggml_op_name(node->op)].push_back(time);
     }
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
+    std::map<std::string, std::vector<uint64_t>> flops;
 };
 
 struct ggml_backend_vk_context {
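The `m * n * (k + (k - 1))` expression is the classic dense-GEMM FLOP count: each of the `m*n` output elements costs `k` multiplies plus `k-1` additions, i.e. `2k-1` FLOPs, and the conv2d branch reuses it by treating the convolution as a KxCRS @ CRSxNPQ matmul. A minimal sketch of how the logged GFLOPS/s figure comes out; the problem size and the 100 us timing are hypothetical, and the nanosecond unit matches the logger's:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical conv treated as GEMM: Cout=64, Cin=32, 3x3 kernel,
    // batch 1, 112x112 output.
    uint64_t size_M = 64;            // Cout
    uint64_t size_K = 32 * 3 * 3;    // Cin*KW*KH (CRS)
    uint64_t size_N = 1 * 112 * 112; // N*OW*OH   (NPQ)

    uint64_t n_flops = size_M * size_N * (2 * size_K - 1); // k mults + k-1 adds per output
    uint64_t time_ns = 100000;                             // assumed 100 us measurement

    // Same arithmetic as print_timings(): GFLOP divided by seconds.
    double gflops_per_s = (double(n_flops) / 1e9) / (double(time_ns) / 1e9);
    std::printf("%llu FLOPs -> %.1f GFLOPS/s\n",
                (unsigned long long) n_flops, gflops_per_s);
    return 0;
}
```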
@@ -2955,6 +3026,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -6803,6 +6876,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_leaky_relu_f32;
         }
         return nullptr;
+    case GGML_OP_CONV_2D:
+        if (src0->type == GGML_TYPE_F32 &&
+            src1->type == GGML_TYPE_F32 &&
+            dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) &&
+            ggml_is_contiguous(src1) &&
+            ggml_is_contiguous(dst)) {
+            return ctx->device->pipeline_conv2d_f32;
+        }
+        return nullptr;
     case GGML_OP_CONV_2D_DW:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             if (ggml_is_contiguous(src1)) {
@@ -7125,6 +7208,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             const uint32_t OW = dst->ne[0];
             elements = { N * OC * OH * OW, 1, 1};
         } break;
+    case GGML_OP_CONV_2D:
+        {
+            // src0 - kernel:  [KW, KH, Cin, Cout]
+            // src1 - input:   [W, H, Cin, N]
+            // dst  - result:  [OW, OH, Cout, N]
+
+            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+            };
+            // parallelized in {Cout/BS_K, NPQ/BS_NPQ, 1} workgroups
+            int64_t W = src1->ne[0];
+            int64_t H = src1->ne[1];
+            int64_t KW = src0->ne[0];
+            int64_t KH = src0->ne[1];
+            int64_t Cout = src0->ne[3];
+            int64_t N = src1->ne[3];
+            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+            int64_t NPQ = N * OW * OH;
+
+            // Tile the output matrix into (Cout/BS_K, NPQ/BS_NPQ, 1) workgroups
+            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
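The lambda reproduces ggml's standard convolution output-size formula, `OW = (W + 2p - d(KW - 1) - 1) / s + 1` with floor division. Worked through on a hypothetical "same" convolution — 224x224 input, 3x3 kernel, stride 1, padding 1, dilation 1 — and its stride-2 variant:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    };
    // (224 + 2*1 - 1*(3-1) - 1) / 1 + 1 = 224: padding 1 preserves the size.
    assert(calc_conv_output_size(224, 3, 1, 1, 1) == 224);
    // Stride 2 halves it, rounding down: (224 + 2 - 2 - 1) / 2 + 1 = 112.
    assert(calc_conv_output_size(224, 3, 2, 1, 1) == 112);
    return 0;
}
```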
@@ -7991,6 +8098,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     }, dryrun);
 }
 
+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    vk_op_conv2d_push_constants p{};
+    p.Cout = static_cast<uint32_t>(ne03);
+    p.Cin  = static_cast<uint32_t>(ne02);
+    p.N    = static_cast<uint32_t>(ne13);
+
+    p.KW = static_cast<uint32_t>(ne00);
+    p.KH = static_cast<uint32_t>(ne01);
+    p.W  = static_cast<uint32_t>(ne10);
+    p.H  = static_cast<uint32_t>(ne11);
+    p.OW = static_cast<uint32_t>(ne0);
+    p.OH = static_cast<uint32_t>(ne1);
+
+    p.s0 = static_cast<uint32_t>(dst->op_params[0]);
+    p.s1 = static_cast<uint32_t>(dst->op_params[1]);
+    p.p0 = static_cast<uint32_t>(dst->op_params[2]);
+    p.p1 = static_cast<uint32_t>(dst->op_params[3]);
+    p.d0 = static_cast<uint32_t>(dst->op_params[4]);
+    p.d1 = static_cast<uint32_t>(dst->op_params[5]);
+
+    p.nb01 = static_cast<uint32_t>(nb01 / nb00);
+    p.nb02 = static_cast<uint32_t>(nb02 / nb00);
+    p.nb03 = static_cast<uint32_t>(nb03 / nb00);
+
+    p.nb11 = static_cast<uint32_t>(nb11 / nb10);
+    p.nb12 = static_cast<uint32_t>(nb12 / nb10);
+    p.nb13 = static_cast<uint32_t>(nb13 / nb10);
+
+    p.nb1 = static_cast<uint32_t>(nb1 / nb0);
+    p.nb2 = static_cast<uint32_t>(nb2 / nb0);
+    p.nb3 = static_cast<uint32_t>(nb3 / nb0);
+
+    GGML_ASSERT(ne03 == ne2);
+    GGML_ASSERT(ne02 == ne12);
+
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+}
+
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     vk_op_conv2d_dw_push_constants p{};
     p.ne = ggml_nelements(dst);
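ggml stores strides (`nb*`) in bytes, while the shader indexes its buffers in elements — hence the divisions by `nb00`/`nb10`/`nb0` in `ggml_vk_conv_2d` above. A sketch of what that works out to for a hypothetical contiguous f32 kernel tensor of shape [KW=3, KH=3, Cin=32, Cout=64]:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // Byte strides of a contiguous f32 [3, 3, 32, 64] tensor, laid out the
    // way ggml lays out contiguous tensors: nb00 is the element size and
    // each later stride is the previous one times the dimension it skips.
    uint64_t nb00 = sizeof(float); // 4
    uint64_t nb01 = nb00 * 3;      // one kernel row
    uint64_t nb02 = nb01 * 3;      // one KWxKH plane
    uint64_t nb03 = nb02 * 32;     // one output filter (all input channels)

    // Element strides, as packed into vk_op_conv2d_push_constants:
    assert(nb01 / nb00 == 3);   // p.nb01
    assert(nb02 / nb00 == 9);   // p.nb02
    assert(nb03 / nb00 == 288); // p.nb03 = KW*KH*Cin
    return 0;
}
```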
@@ -9053,6 +9209,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:
@@ -9120,6 +9277,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_LEAKY_RELU:
         {
@@ -9326,6 +9484,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_POOL_2D:
         ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_CONV_2D:
+        ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_CONV_2D_DW:
         ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9456,6 +9618,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:
@@ -10617,6 +10780,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return true;
         case GGML_OP_CONV_TRANSPOSE_1D:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_CONV_2D:
+            // Channel-contiguous format is not supported yet.
+            return (op->src[0]->type == GGML_TYPE_F32 &&
+                    op->src[1]->type == GGML_TYPE_F32 &&
+                    op->type == GGML_TYPE_F32 &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op->src[1]) &&
+                    ggml_is_contiguous(op));
         default:
             return false;
     }
@@ -11175,6 +11346,14 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         const int32_t p1 = tensor->op_params[6];
 
         tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+    } else if (tensor->op == GGML_OP_CONV_2D) {
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t s1 = tensor->op_params[1];
+        const int32_t p0 = tensor->op_params[2];
+        const int32_t p1 = tensor->op_params[3];
+        const int32_t d0 = tensor->op_params[4];
+        const int32_t d1 = tensor->op_params[5];
+        tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);