@@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
297
297
struct ProfilingInfo {
298
298
std::string op_name;
299
299
std::string kernel_name;
300
- // Kernel execution time in nanoseconds.
301
- cl_ulong duration_ns;
300
+
301
+ cl_kernel kernel;
302
+ cl_event evt;
303
+
304
+ cl_ulong cmd_queued;
305
+ cl_ulong cmd_submit;
306
+ cl_ulong cmd_start;
307
+ cl_ulong cmd_end;
308
+ cl_ulong overhead_start;
309
+ cl_ulong overhead_end;
310
+ // For the times below, see spec for clGetEventProfilingInfo
311
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
312
+ cl_ulong cmd_queued_duration_ns;
313
+ // The time kernel spent for submission - START - SUBMIT
314
+ cl_ulong cmd_submit_duration_ns;
315
+ // Kernel execution time in nanoseconds - END - START
316
+ cl_ulong cmd_duration_ns;
317
+ // The time for the kernel to complete - COMPLETE - END
318
+ cl_ulong cmd_complete_duration_ns;
319
+ // Total time to finish the kernel - COMPELTE - QUEUED
320
+ cl_ulong cmd_total_duration_ns;
302
321
// Global and local work sizes.
303
322
size_t global_size[3 ];
304
323
size_t local_size[3 ];
@@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
903
922
return ;
904
923
}
905
924
925
+ // Populate profiling info
926
+ for (ProfilingInfo & info : g_profiling_info) {
927
+ cl_ulong cmd_queued;
928
+ cl_ulong cmd_submit;
929
+ cl_ulong cmd_start;
930
+ cl_ulong cmd_end;
931
+ cl_ulong cmd_complete;
932
+
933
+ CL_CHECK (clWaitForEvents (1 , &info.evt ));
934
+ CL_CHECK (clGetEventProfilingInfo (
935
+ info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
936
+ CL_CHECK (clGetEventProfilingInfo (
937
+ info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
938
+ CL_CHECK (clGetEventProfilingInfo (
939
+ info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
940
+ CL_CHECK (clGetEventProfilingInfo (
941
+ info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
942
+ CL_CHECK (clGetEventProfilingInfo (
943
+ info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
944
+ CL_CHECK (clReleaseEvent (info.evt ));
945
+
946
+ char kernel_name[512 ];
947
+ CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
948
+ sizeof (kernel_name), kernel_name, NULL ));
949
+ info.kernel_name = kernel_name;
950
+
951
+ info.cmd_queued = cmd_queued;
952
+ info.cmd_submit = cmd_submit;
953
+ info.cmd_start = cmd_start;
954
+ info.cmd_end = cmd_end;
955
+
956
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958
+ info.cmd_duration_ns = cmd_end - cmd_start;
959
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961
+ }
962
+
963
+ // Dump a csv
906
964
float total_kernel_time = 0 ;
907
- fprintf (fperf, " op name, kernel name, duration (ms), global size, local size, output size\n " );
965
+ fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
908
966
for (const ProfilingInfo & info : g_profiling_info) {
909
- total_kernel_time += info.duration_ns /1 .e6f ;
910
- fprintf (fperf, " %s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
911
- info.op_name .c_str (), info.kernel_name .c_str (), info.duration_ns /1 .e6f ,
967
+ total_kernel_time += info.cmd_duration_ns /1 .e6f ;
968
+ fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
969
+ info.op_name .c_str (), info.kernel_name .c_str (),
970
+ info.cmd_queued_duration_ns /1 .e6f ,
971
+ info.cmd_submit_duration_ns /1 .e6f ,
972
+ info.cmd_duration_ns /1 .e6f ,
973
+ info.cmd_complete_duration_ns /1 .e6f ,
974
+ info.cmd_total_duration_ns /1 .e6f ,
912
975
info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
913
976
info.local_size [0 ], info.local_size [2 ], info.local_size [2 ],
914
977
info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
915
978
}
916
979
fclose (fperf);
917
980
918
981
GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " , total_kernel_time);
982
+
983
+ // Dump a simple chrome trace
984
+ FILE* ftrace = fopen (" cl_trace.json" , " w" );
985
+ if (!ftrace) {
986
+ GGML_LOG_ERROR (" Failed to open cl_trace.json\n " );
987
+ return ;
988
+ }
989
+
990
+ fprintf (ftrace, " [\n " );
991
+ for (const ProfilingInfo & info : g_profiling_info) {
992
+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
993
+ info.kernel_name .c_str (), info.cmd_queued /1000 );
994
+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
995
+ info.kernel_name .c_str (), info.cmd_submit /1000 );
996
+
997
+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
998
+ info.kernel_name .c_str (), info.cmd_start /1000 );
999
+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
1000
+ info.kernel_name .c_str (), info.cmd_end /1000 );
1001
+ }
1002
+ fclose (ftrace);
919
1003
#endif
920
1004
}
921
1005
@@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
2062
2146
// Profiling utility
2063
2147
// ------------------------------------------------------------------------------
2064
2148
#ifdef GGML_OPENCL_PROFILING
2065
- void populateProfilingInfo (
2149
+ static void populateProfilingInfo (
2066
2150
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
2067
2151
size_t global_size[3 ], size_t local_size[3 ],
2068
2152
const ggml_tensor * tensor) {
2069
- cl_ulong start;
2070
- cl_ulong end;
2071
- CL_CHECK (clWaitForEvents (1 , &evt));
2072
- CL_CHECK (clGetEventProfilingInfo (
2073
- evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL ));
2074
- CL_CHECK (clGetEventProfilingInfo (
2075
- evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL ));
2076
-
2077
- char kernel_name[512 ];
2078
- CL_CHECK (clGetKernelInfo (kernel, CL_KERNEL_FUNCTION_NAME,
2079
- sizeof (kernel_name), kernel_name, NULL ));
2080
-
2081
- info.duration_ns = end - start;
2082
- info.op_name = tensor->name ;
2083
- info.kernel_name = kernel_name;
2153
+ info.op_name = tensor->name ;
2154
+ info.kernel = kernel;
2155
+ info.evt = evt;
2156
+
2084
2157
info.local_size [0 ] = local_size[0 ];
2085
2158
info.local_size [1 ] = local_size[1 ];
2086
2159
info.local_size [2 ] = local_size[2 ];
0 commit comments