
Commit daa9623

Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early. (ggml-org#9118)
* Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early.
* fix compile issues
* Fix issues where the last submit wasn't executed or handled properly.
* remove trailing whitespace
* Repair GGML_VULKAN_CHECK_RESULTS
* Increase submit counter only if actual work has been submitted and increase submit count to 100.
* Fix some nodes not being checked with GGML_VULKAN_CHECK_RESULTS enabled.
Parent: e079bff
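
The core idea, sketched below in plain C++ for orientation: instead of recording the whole graph into one command buffer and submitting once at the end, the loop submits roughly every 100 nodes, so the GPU can execute batch N while the CPU records batch N+1; only the final batch is waited on. record_node() and submit_batch() are hypothetical stand-ins for the real ggml_vk_build_graph() / ggml_vk_submit() machinery, not the backend's API.

#include <cstddef>
#include <vector>

struct Node { /* one graph operation */ };

void record_node(const Node&) { /* encode the op into the open cmdbuffer */ }
void submit_batch(bool wait_for_gpu) { /* hand the recorded cmdbuffer to the queue */ }

void run_graph(const std::vector<Node>& nodes) {
    constexpr int submit_count = 100; // batch size this commit settles on
    int recorded = 0;
    for (std::size_t i = 0; i < nodes.size(); ++i) {
        record_node(nodes[i]);
        ++recorded;
        const bool last = (i + 1 == nodes.size());
        if (recorded >= submit_count || last) {
            // Submitting early lets the GPU chew on this batch while the CPU
            // keeps recording the next one; only the final batch is fenced
            // and waited on.
            submit_batch(/*wait_for_gpu=*/last);
            recorded = 0;
        }
    }
}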


ggml/src/ggml-vulkan.cpp

Lines changed: 80 additions & 50 deletions
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if the node has enqueued work into the queue, false otherwise
+// If submit is true, all operations queued so far are submitted to Vulkan, to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
    }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
 
-        ggml_vk_submit(subctx, ctx->fence);
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
64826507
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
64836508

64846509
for (int i = 0; i < cgraph->n_nodes; i++) {
6485-
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
6510+
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
64866511
}
64876512
ggml_vk_preallocate_buffers(ctx);
64886513
ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
+#endif
         }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
         }
-#endif
-        GGML_ASSERT(ok);
     }
 
 #ifdef GGML_VULKAN_PERF
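
For intuition, here is a self-contained toy (no Vulkan; the GGML_VULKAN_CHECK_RESULTS special case is dropped) that reproduces just the batch bookkeeping of the loop above. The enqueues vector fakes which nodes report work; nodes that enqueue nothing do not advance the counter, and the node that opened a batch is remembered so the whole batch can be handed to the submit path:

#include <cstdio>
#include <vector>

int main() {
    std::vector<bool> enqueues = {true, true, false, true, true, true, false, true};
    constexpr int submit_count = 3; // tiny batch size so boundaries are visible
    const int last_node = (int)enqueues.size() - 1;

    bool first_node_in_batch = true;
    int submit_node_idx = 0;
    int submitted_nodes = 0;

    for (int i = 0; i <= last_node; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i; // remember who opens this batch
        }
        // decided before the current node is recorded, as in the real loop
        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
        if (enqueues[i]) {       // stand-in for ggml_vk_build_graph returning true
            ++submitted_nodes;
            first_node_in_batch = false;
        }
        if (submit) {
            printf("submit batch [%d..%d] (%d enqueued nodes)\n",
                   submit_node_idx, i, submitted_nodes);
            first_node_in_batch = true;
            submitted_nodes = 0;
        }
    }
}

// prints:
//   submit batch [0..4] (4 enqueued nodes)
//   submit batch [5..7] (2 enqueued nodes)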
