@@ -457,6 +457,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
+    vk_pipeline pipeline_conv2d_f32;
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;
 
@@ -816,6 +817,38 @@ struct vk_op_rwkv_wkv7_push_constants {
     uint32_t H;
 };
 
+struct vk_op_conv2d_push_constants {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t N;
+
+    uint32_t KW;
+    uint32_t KH;
+    uint32_t W;
+    uint32_t H;
+    uint32_t OW;
+    uint32_t OH;
+
+    uint32_t s0;
+    uint32_t s1;
+    uint32_t p0;
+    uint32_t p1;
+    uint32_t d0;
+    uint32_t d1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+
+    uint32_t nb1;
+    uint32_t nb2;
+    uint32_t nb3;
+};
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;
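
The struct above carries the implicit-GEMM view of the convolution that the shader works with: the kernel is treated as a Cout x (Cin*KW*KH) matrix and the unfolded input as (Cin*KW*KH) x (N*OW*OH). A minimal standalone sketch of that dimension mapping, with made-up shapes (none of these values come from the patch):

#include <cassert>
#include <cstdint>

int main() {
    // Illustrative shapes only: 3x3 kernel, 16 input channels, 32 output channels,
    // a single 64x64 image, stride 1, no padding, no dilation.
    const uint32_t Cout = 32, Cin = 16, N = 1, KW = 3, KH = 3, W = 64, H = 64;
    const uint32_t OW = W - KW + 1, OH = H - KH + 1;

    // Implicit GEMM dimensions, matching the KxCRS @ CRSxNPQ naming used later in the patch.
    const uint64_t size_M = Cout;                   // rows of the kernel matrix
    const uint64_t size_K = Cin * KW * KH;          // shared (reduction) dimension
    const uint64_t size_N = uint64_t(N) * OW * OH;  // columns = output pixels

    assert(size_M == 32 && size_K == 144 && size_N == 62u * 62u);
    return 0;
}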
@@ -916,16 +949,33 @@ class vk_memory_logger {
 class vk_perf_logger {
 public:
     void print_timings() {
+        if (timings.empty()) {
+            return;
+        }
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
         for (const auto& t : timings) {
             uint64_t total = 0;
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
+
+            // If we have as many flops entries as timing entries for the op, then compute and log the flops/s.
+            auto it = flops.find(t.first);
+            if (it != flops.end() && (it->second).size() == t.second.size()) {
+                uint64_t total_nflops = 0;
+                for (const auto& elem : it->second) {
+                    total_nflops += elem;
+                }
+                std::cerr << " (" << (double(total_nflops) / (1000.0 * 1000.0 * 1000.0)) / (double(total) / (1000.0 * 1000.0 * 1000.0)) << " GFLOPS/s)";
+            }
+
+
+            std::cerr << std::endl;
         }
 
         timings.clear();
+        flops.clear();
     }
 
     void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -944,12 +994,33 @@ class vk_perf_logger {
                 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
             }
             timings[name].push_back(time);
+            flops[name].push_back(m * n * (k + (k - 1)));
             return;
         }
+        if (node->op == GGML_OP_CONV_2D) {
+            std::string name = ggml_op_name(node->op);
+            ggml_tensor * knl = node->src[0];
+            uint64_t OW = node->ne[0];
+            uint64_t OH = node->ne[1];
+            uint64_t N = node->ne[3];
+            uint64_t Cout = node->ne[2];
+            uint64_t KW = knl->ne[0];
+            uint64_t KH = knl->ne[1];
+            uint64_t Cin = knl->ne[2];
+            // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
+            uint64_t size_M = Cout;
+            uint64_t size_K = Cin * KW * KH;
+            uint64_t size_N = N * OW * OH;
+            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            flops[name].push_back(n_flops);
+            timings[name].push_back(time);
+            return;
+        }
         timings[ggml_op_name(node->op)].push_back(time);
     }
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
+    std::map<std::string, std::vector<uint64_t>> flops;
 };
 
 struct ggml_backend_vk_context {
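
The flops bookkeeping above counts one multiply per reduction element and one fewer addition per output element, i.e. m*n*(2k-1) for a matrix multiply. A tiny self-contained check of that formula with hypothetical sizes (not taken from the patch):

#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical matmul sizes, purely for illustration.
    const uint64_t m = 4, n = 5, k = 3;
    const uint64_t n_flops = m * n * (k + (k - 1));  // k multiplies + (k - 1) additions per output element
    assert(n_flops == m * n * (2 * k - 1));          // same count written the usual way
    return 0;
}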
@@ -2806,6 +2877,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
 
@@ -6578,6 +6651,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_leaky_relu_f32;
         }
         return nullptr;
+    case GGML_OP_CONV_2D:
+        if (src0->type == GGML_TYPE_F32 &&
+            src1->type == GGML_TYPE_F32 &&
+            dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) &&
+            ggml_is_contiguous(src1) &&
+            ggml_is_contiguous(dst)) {
+            return ctx->device->pipeline_conv2d_f32;
+        }
+        return nullptr;
     case GGML_OP_CONV_2D_DW:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             if (ggml_is_contiguous(src1)) {
@@ -6899,6 +6982,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             const uint32_t OW = dst->ne[0];
             elements = { N * OC * OH * OW, 1, 1};
         } break;
+    case GGML_OP_CONV_2D:
+        {
+            // src0 - kernel: [KW, KH, Cin, Cout]
+            // src1 - input:  [W, H, Cin, N]
+            // dst  - result: [OW, OH, Cout, N]
+
+            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+            };
+            // parallelize in {Cout/BS_K, NPQ/BS_NPQ, 1}
+            int64_t W = src1->ne[0];
+            int64_t H = src1->ne[1];
+            int64_t KW = src0->ne[0];
+            int64_t KH = src0->ne[1];
+            int64_t Cout = src0->ne[3];
+            int64_t N = src1->ne[3];
+            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+            int64_t NPQ = N * OW * OH;
+
+            // Tile the Cout x NPQ output matrix into (Cout/BS_K, NPQ/BS_NPQ, 1) workgroups
+            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
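
calc_conv_output_size above is the standard convolution output-size formula. As a quick sanity check under assumed parameters (the sizes below are illustrative, not from the patch):

#include <cassert>
#include <cstdint>

// Same formula as the lambda in the hunk above.
static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

int main() {
    assert(calc_conv_output_size(224, 3, 1, 1, 1) == 224);  // 3x3 kernel, stride 1, pad 1: "same" size
    assert(calc_conv_output_size(224, 3, 2, 1, 1) == 112);  // stride 2 halves the output
    return 0;
}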
@@ -7753,6 +7860,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     }, dryrun);
 }
 
+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    vk_op_conv2d_push_constants p{};
+    p.Cout = static_cast<uint32_t>(ne03);
+    p.Cin = static_cast<uint32_t>(ne02);
+    p.N = static_cast<uint32_t>(ne13);
+
+    p.KW = static_cast<uint32_t>(ne00);
+    p.KH = static_cast<uint32_t>(ne01);
+    p.W = static_cast<uint32_t>(ne10);
+    p.H = static_cast<uint32_t>(ne11);
+    p.OW = static_cast<uint32_t>(ne0);
+    p.OH = static_cast<uint32_t>(ne1);
+
+    p.s0 = static_cast<uint32_t>(dst->op_params[0]);
+    p.s1 = static_cast<uint32_t>(dst->op_params[1]);
+    p.p0 = static_cast<uint32_t>(dst->op_params[2]);
+    p.p1 = static_cast<uint32_t>(dst->op_params[3]);
+    p.d0 = static_cast<uint32_t>(dst->op_params[4]);
+    p.d1 = static_cast<uint32_t>(dst->op_params[5]);
+
+    p.nb01 = static_cast<uint32_t>(nb01/nb00);
+    p.nb02 = static_cast<uint32_t>(nb02/nb00);
+    p.nb03 = static_cast<uint32_t>(nb03/nb00);
+
+    p.nb11 = static_cast<uint32_t>(nb11/nb10);
+    p.nb12 = static_cast<uint32_t>(nb12/nb10);
+    p.nb13 = static_cast<uint32_t>(nb13/nb10);
+
+    p.nb1 = static_cast<uint32_t>(nb1 / nb0);
+    p.nb2 = static_cast<uint32_t>(nb2 / nb0);
+    p.nb3 = static_cast<uint32_t>(nb3 / nb0);
+
+    GGML_ASSERT(ne03 == ne2);
+    GGML_ASSERT(ne02 == ne12);
+
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+
+}
+
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     vk_op_conv2d_dw_push_constants p{};
     p.ne = ggml_nelements(dst);
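
ggml_vk_conv_2d above converts ggml's byte strides (nb01, nb02, ...) into element strides before handing them to the shader. For a contiguous f32 tensor these reduce to plain products of the leading dimensions, as this illustrative check (assumed shapes, not part of the patch) shows:

#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical contiguous f32 kernel of shape [KW, KH, Cin, Cout].
    const uint64_t KW = 3, KH = 3, Cin = 16;
    const uint64_t nb00 = sizeof(float);  // byte stride between neighbouring elements
    const uint64_t nb01 = nb00 * KW;      // byte stride between kernel rows
    const uint64_t nb02 = nb01 * KH;      // byte stride between input channels
    const uint64_t nb03 = nb02 * Cin;     // byte stride between output channels

    // Element strides as they would appear in the push constants.
    assert(nb01 / nb00 == KW);
    assert(nb02 / nb00 == KW * KH);
    assert(nb03 / nb00 == KW * KH * Cin);
    return 0;
}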
@@ -8799,6 +8955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:
@@ -8864,6 +9021,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_LEAKY_RELU:
         {
@@ -9042,6 +9200,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_POOL_2D:
         ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_CONV_2D:
+        ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_CONV_2D_DW:
         ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9168,6 +9330,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
+    case GGML_OP_CONV_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
     case GGML_OP_RWKV_WKV7:
@@ -10242,6 +10405,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return true;
         case GGML_OP_CONV_TRANSPOSE_1D:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_CONV_2D:
+            // Channel-contiguous format is not supported yet.
+            return (op->src[0]->type == GGML_TYPE_F32 &&
+                    op->src[1]->type == GGML_TYPE_F32 &&
+                    op->type == GGML_TYPE_F32 &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op->src[1]) &&
+                    ggml_is_contiguous(op));
         default:
             return false;
     }
@@ -10765,6 +10936,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t p1 = tensor->op_params[6];
 
         tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+    } else if (tensor->op == GGML_OP_CONV_2D) {
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t s1 = tensor->op_params[1];
+        const int32_t p0 = tensor->op_params[2];
+        const int32_t p1 = tensor->op_params[3];
+        const int32_t d0 = tensor->op_params[4];
+        const int32_t d1 = tensor->op_params[5];
+        tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);