Skip to content

Commit d84635b

Browse files
authored
opencl: improve profiling (ggml-org#12442)
* opencl: more profiling timing * opencl: generate trace for profiling * opencl: reduce profiling overhead * Populate profiling timing info at the end rather than after each kernel run * opencl: fix for chrome tracing
1 parent 75422e8 commit d84635b

File tree

1 file changed

+95
-22
lines changed

1 file changed

+95
-22
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 95 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
297297
struct ProfilingInfo {
298298
std::string op_name;
299299
std::string kernel_name;
300-
// Kernel execution time in nanoseconds.
301-
cl_ulong duration_ns;
300+
301+
cl_kernel kernel;
302+
cl_event evt;
303+
304+
cl_ulong cmd_queued;
305+
cl_ulong cmd_submit;
306+
cl_ulong cmd_start;
307+
cl_ulong cmd_end;
308+
cl_ulong overhead_start;
309+
cl_ulong overhead_end;
310+
// For the times below, see spec for clGetEventProfilingInfo
311+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
312+
cl_ulong cmd_queued_duration_ns;
313+
// The time kernel spent for submission - START - SUBMIT
314+
cl_ulong cmd_submit_duration_ns;
315+
// Kernel execution time in nanoseconds - END - START
316+
cl_ulong cmd_duration_ns;
317+
// The time for the kernel to complete - COMPLETE - END
318+
cl_ulong cmd_complete_duration_ns;
319+
// Total time to finish the kernel - COMPELTE - QUEUED
320+
cl_ulong cmd_total_duration_ns;
302321
// Global and local work sizes.
303322
size_t global_size[3];
304323
size_t local_size[3];
@@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
903922
return;
904923
}
905924

925+
// Populate profiling info
926+
for (ProfilingInfo & info : g_profiling_info) {
927+
cl_ulong cmd_queued;
928+
cl_ulong cmd_submit;
929+
cl_ulong cmd_start;
930+
cl_ulong cmd_end;
931+
cl_ulong cmd_complete;
932+
933+
CL_CHECK(clWaitForEvents(1, &info.evt));
934+
CL_CHECK(clGetEventProfilingInfo(
935+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
936+
CL_CHECK(clGetEventProfilingInfo(
937+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
938+
CL_CHECK(clGetEventProfilingInfo(
939+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
940+
CL_CHECK(clGetEventProfilingInfo(
941+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
942+
CL_CHECK(clGetEventProfilingInfo(
943+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
944+
CL_CHECK(clReleaseEvent(info.evt));
945+
946+
char kernel_name[512];
947+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
948+
sizeof(kernel_name), kernel_name, NULL));
949+
info.kernel_name = kernel_name;
950+
951+
info.cmd_queued = cmd_queued;
952+
info.cmd_submit = cmd_submit;
953+
info.cmd_start = cmd_start;
954+
info.cmd_end = cmd_end;
955+
956+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958+
info.cmd_duration_ns = cmd_end - cmd_start;
959+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961+
}
962+
963+
// Dump a csv
906964
float total_kernel_time = 0;
907-
fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
965+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
908966
for (const ProfilingInfo & info : g_profiling_info) {
909-
total_kernel_time += info.duration_ns/1.e6f;
910-
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
911-
info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
967+
total_kernel_time += info.cmd_duration_ns/1.e6f;
968+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
969+
info.op_name.c_str(), info.kernel_name.c_str(),
970+
info.cmd_queued_duration_ns/1.e6f,
971+
info.cmd_submit_duration_ns/1.e6f,
972+
info.cmd_duration_ns/1.e6f,
973+
info.cmd_complete_duration_ns/1.e6f,
974+
info.cmd_total_duration_ns/1.e6f,
912975
info.global_size[0], info.global_size[1], info.global_size[2],
913976
info.local_size[0], info.local_size[2], info.local_size[2],
914977
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
915978
}
916979
fclose(fperf);
917980

918981
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
982+
983+
// Dump a simple chrome trace
984+
FILE* ftrace = fopen("cl_trace.json", "w");
985+
if (!ftrace) {
986+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
987+
return;
988+
}
989+
990+
fprintf(ftrace, "[\n");
991+
for (const ProfilingInfo & info : g_profiling_info) {
992+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
993+
info.kernel_name.c_str(), info.cmd_queued/1000);
994+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
995+
info.kernel_name.c_str(), info.cmd_submit/1000);
996+
997+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
998+
info.kernel_name.c_str(), info.cmd_start/1000);
999+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1000+
info.kernel_name.c_str(), info.cmd_end/1000);
1001+
}
1002+
fclose(ftrace);
9191003
#endif
9201004
}
9211005

@@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622146
// Profiling utility
20632147
//------------------------------------------------------------------------------
20642148
#ifdef GGML_OPENCL_PROFILING
2065-
void populateProfilingInfo(
2149+
static void populateProfilingInfo(
20662150
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672151
size_t global_size[3], size_t local_size[3],
20682152
const ggml_tensor * tensor) {
2069-
cl_ulong start;
2070-
cl_ulong end;
2071-
CL_CHECK(clWaitForEvents(1, &evt));
2072-
CL_CHECK(clGetEventProfilingInfo(
2073-
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
2074-
CL_CHECK(clGetEventProfilingInfo(
2075-
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
2076-
2077-
char kernel_name[512];
2078-
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
2079-
sizeof(kernel_name), kernel_name, NULL));
2080-
2081-
info.duration_ns = end - start;
2082-
info.op_name = tensor->name;
2083-
info.kernel_name = kernel_name;
2153+
info.op_name = tensor->name;
2154+
info.kernel = kernel;
2155+
info.evt = evt;
2156+
20842157
info.local_size[0] = local_size[0];
20852158
info.local_size[1] = local_size[1];
20862159
info.local_size[2] = local_size[2];

0 commit comments

Comments
 (0)