1111# include <string.h>
1212# include <limits.h>
1313# include <ctype.h>
14+ # include <math.h>
1415# if defined(_WIN32 )
1516# include <windows.h>
1617# include <process.h>
@@ -154,17 +155,14 @@ void c_dbcsr_acc_opencl_configure(void) {
154155 const char * const env_rank = (NULL != getenv ("PMI_RANK" ) ? getenv ("PMI_RANK" ) : getenv ("OMPI_COMM_WORLD_LOCAL_RANK" ));
155156 const char * const env_nranks = getenv ("MPI_LOCALNRANKS" ), * const env_devsplit = getenv ("ACC_OPENCL_DEVSPLIT" );
156157 const char * const env_verbose = getenv ("ACC_OPENCL_VERBOSE" ), * const env_dump_acc = getenv ("ACC_OPENCL_DUMP" );
158+ const char * const env_profile = getenv ("ACC_OPENCL_PROFILE" ), * const env_nlocks = getenv ("ACC_OPENCL_NLOCKS" );
157159 const char * const env_debug = getenv ("ACC_OPENCL_DEBUG" ), * const env_timer = getenv ("ACC_OPENCL_TIMER" );
158160 const char * const env_dump = (NULL != env_dump_acc ? env_dump_acc : getenv ("IGC_ShaderDumpEnable" ));
159161 const char * const env_neo = getenv ("NEOReadDebugKeys" ), * const env_wa = getenv ("ACC_OPENCL_WA" );
162+ static char neo_enable_debug_keys [] = "NEOReadDebugKeys=1" ;
160163# if defined(ACC_OPENCL_STREAM_PRIORITIES )
161164 const char * const env_priority = getenv ("ACC_OPENCL_PRIORITY" );
162165# endif
163- # if defined(ACC_OPENCL_PROFILE_DBCSR )
164- const char * const env_profile = getenv ("ACC_OPENCL_PROFILE" );
165- # endif
166- const char * const env_nlocks = getenv ("ACC_OPENCL_NLOCKS" );
167- static char neo_enable_debug_keys [] = "NEOReadDebugKeys=1" ;
168166# if defined(ACC_OPENCL_NCCS )
169167 const char * const env_nccs = getenv ("ACC_OPENCL_NCCS" );
170168 const int nccs = (NULL == env_nccs ? ACC_OPENCL_NCCS : atoi (env_nccs ));
@@ -183,19 +181,16 @@ void c_dbcsr_acc_opencl_configure(void) {
183181 const char * const env_async = NULL ;
184182 const int async_default = 0 ;
185183# endif
186-
187-
188184 const int nlocks = (NULL == env_nlocks ? 1 /*default*/ : atoi (env_nlocks ));
189185 const int neo = (NULL == env_neo ? 1 : atoi (env_neo ));
190186 int i ;
191187# if defined(_OPENMP )
192188 const int max_threads = omp_get_max_threads (), num_threads = omp_get_num_threads ();
189+ memset (& c_dbcsr_acc_opencl_config , 0 , sizeof (c_dbcsr_acc_opencl_config ));
193190 c_dbcsr_acc_opencl_config .nthreads = (num_threads < max_threads ? max_threads : num_threads );
194- c_dbcsr_acc_opencl_config .nstreams = (num_threads < max_threads ? (ACC_OPENCL_MAXNITEMS * max_threads )
195- : (ACC_OPENCL_MAXNITEMS ));
196191# else
192+ memset (& c_dbcsr_acc_opencl_config , 0 , sizeof (c_dbcsr_acc_opencl_config ));
197193 c_dbcsr_acc_opencl_config .nthreads = 1 ;
198- c_dbcsr_acc_opencl_config .nstreams = ACC_OPENCL_MAXNITEMS ;
199194# endif
200195 c_dbcsr_acc_opencl_config .nranks = LIBXSMM_MAX (NULL != env_nranks ? atoi (env_nranks ) : 1 , 1 );
201196 c_dbcsr_acc_opencl_config .nrank = (NULL != env_rank ? atoi (env_rank ) : 0 ) % c_dbcsr_acc_opencl_config .nranks ;
@@ -219,9 +214,7 @@ void c_dbcsr_acc_opencl_configure(void) {
219214# if defined(ACC_OPENCL_STREAM_PRIORITIES )
220215 c_dbcsr_acc_opencl_config .priority = (NULL == env_priority ? /*default*/ 3 : atoi (env_priority ));
221216# endif
222- # if defined(ACC_OPENCL_PROFILE_DBCSR )
223217 c_dbcsr_acc_opencl_config .profile = (NULL == env_profile ? /*default*/ 0 : atoi (env_profile ));
224- # endif
225218 c_dbcsr_acc_opencl_config .xhints = (NULL == env_xhints ? xhints_default : atoi (env_xhints ));
226219 c_dbcsr_acc_opencl_config .async = (NULL == env_async ? async_default : atoi (env_async ));
227220 c_dbcsr_acc_opencl_config .dump = (NULL == env_dump ? /*default*/ 0 : atoi (env_dump ));
@@ -337,14 +330,16 @@ int c_dbcsr_acc_init(void) {
337330# endif
338331# if defined(ACC_OPENCL_PROFILE_DBCSR )
339332 int routine_handle ;
333+ c_dbcsr_acc_opencl_configure ();
340334 if (0 != c_dbcsr_acc_opencl_config .profile ) {
341335 static const char * const routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR ;
342336 static const int routine_name_len = (int )sizeof (LIBXSMM_FUNCNAME ) - (ACC_OPENCL_PROFILE_DBCSR + 1 );
343337 c_dbcsr_timeset ((const char * * )& routine_name_ptr , & routine_name_len , & routine_handle );
344338 }
339+ # else
340+ c_dbcsr_acc_opencl_configure ();
345341# endif
346342 /* eventually touch OpenCL/compute runtime after configure */
347- c_dbcsr_acc_opencl_configure ();
348343 if (0 == c_dbcsr_acc_opencl_config .ndevices && EXIT_SUCCESS == result ) { /* avoid to initialize multiple times */
349344 char buffer [ACC_OPENCL_BUFFERSIZE ];
350345 cl_platform_id platforms [ACC_OPENCL_MAXNDEVS ] = {NULL };
@@ -548,15 +543,16 @@ int c_dbcsr_acc_init(void) {
548543 assert (0 < c_dbcsr_acc_opencl_config .ndevices );
549544 assert (c_dbcsr_acc_opencl_config .ndevices < ACC_OPENCL_MAXNDEVS );
550545# if defined(ACC_OPENCL_MEM_DEVPTR )
551- c_dbcsr_acc_opencl_config .memptrs = NULL ;
552- c_dbcsr_acc_opencl_config .memptr_data = NULL ;
553- c_dbcsr_acc_opencl_config .nmemptrs = 0 ;
554- # endif
555- c_dbcsr_acc_opencl_config .streams = NULL ;
556- c_dbcsr_acc_opencl_config .events = NULL ;
557- c_dbcsr_acc_opencl_config .stream_data = NULL ;
558- c_dbcsr_acc_opencl_config .event_data = NULL ;
559- c_dbcsr_acc_opencl_config .nstreams = c_dbcsr_acc_opencl_config .nevents = 0 ;
546+ assert (NULL == c_dbcsr_acc_opencl_config .memptrs );
547+ assert (NULL == c_dbcsr_acc_opencl_config .memptr_data );
548+ assert (0 == c_dbcsr_acc_opencl_config .nmemptrs );
549+ # endif
550+ assert (NULL == c_dbcsr_acc_opencl_config .streams );
551+ assert (NULL == c_dbcsr_acc_opencl_config .events );
552+ assert (NULL == c_dbcsr_acc_opencl_config .stream_data );
553+ assert (NULL == c_dbcsr_acc_opencl_config .event_data );
554+ assert (0 == c_dbcsr_acc_opencl_config .nstreams );
555+ assert (0 == c_dbcsr_acc_opencl_config .nevents );
560556# if defined(ACC_OPENCL_CACHE_DID )
561557 c_dbcsr_acc_opencl_active_id = device_id + 1 ; /* update c_dbcsr_acc_opencl_active_id */
562558# endif
@@ -567,8 +563,8 @@ int c_dbcsr_acc_init(void) {
567563 c_dbcsr_acc_opencl_config .memptr_data = (c_dbcsr_acc_opencl_info_memptr_t * )malloc (
568564 sizeof (c_dbcsr_acc_opencl_info_memptr_t ) * nhandles );
569565 if (NULL != c_dbcsr_acc_opencl_config .memptrs && NULL != c_dbcsr_acc_opencl_config .memptr_data ) {
570- c_dbcsr_acc_opencl_pmalloc_init (NULL /*lock*/ , sizeof (c_dbcsr_acc_opencl_info_memptr_t ),
571- & c_dbcsr_acc_opencl_config . nmemptrs , (void * * )c_dbcsr_acc_opencl_config .memptrs , c_dbcsr_acc_opencl_config .memptr_data );
566+ c_dbcsr_acc_opencl_pmalloc_init (sizeof (c_dbcsr_acc_opencl_info_memptr_t ), & c_dbcsr_acc_opencl_config . nmemptrs ,
567+ (void * * )c_dbcsr_acc_opencl_config .memptrs , c_dbcsr_acc_opencl_config .memptr_data );
572568 }
573569 else {
574570 free (c_dbcsr_acc_opencl_config .memptrs );
@@ -585,7 +581,7 @@ int c_dbcsr_acc_init(void) {
585581 c_dbcsr_acc_opencl_config .stream_data = (c_dbcsr_acc_opencl_stream_t * )malloc (
586582 sizeof (c_dbcsr_acc_opencl_stream_t ) * nhandles );
587583 if (NULL != c_dbcsr_acc_opencl_config .streams && NULL != c_dbcsr_acc_opencl_config .stream_data ) {
588- c_dbcsr_acc_opencl_pmalloc_init (NULL /*lock*/ , sizeof (c_dbcsr_acc_opencl_stream_t ), & c_dbcsr_acc_opencl_config .nstreams ,
584+ c_dbcsr_acc_opencl_pmalloc_init (sizeof (c_dbcsr_acc_opencl_stream_t ), & c_dbcsr_acc_opencl_config .nstreams ,
589585 (void * * )c_dbcsr_acc_opencl_config .streams , c_dbcsr_acc_opencl_config .stream_data );
590586 }
591587 else {
@@ -601,7 +597,7 @@ int c_dbcsr_acc_init(void) {
601597 c_dbcsr_acc_opencl_config .events = (cl_event * * )malloc (sizeof (cl_event * ) * nhandles );
602598 c_dbcsr_acc_opencl_config .event_data = (cl_event * )malloc (sizeof (cl_event ) * nhandles );
603599 if (NULL != c_dbcsr_acc_opencl_config .events && NULL != c_dbcsr_acc_opencl_config .event_data ) {
604- c_dbcsr_acc_opencl_pmalloc_init (NULL /*lock*/ , sizeof (cl_event * ), & c_dbcsr_acc_opencl_config .nevents ,
600+ c_dbcsr_acc_opencl_pmalloc_init (sizeof (cl_event * ), & c_dbcsr_acc_opencl_config .nevents ,
605601 (void * * )c_dbcsr_acc_opencl_config .events , c_dbcsr_acc_opencl_config .event_data );
606602 }
607603 else {
@@ -612,6 +608,24 @@ int c_dbcsr_acc_init(void) {
612608 c_dbcsr_acc_opencl_config .nevents = 0 ;
613609 result = EXIT_FAILURE ;
614610 }
611+ if (
612+ # if defined(ACC_OPENCL_PROFILE_DBCSR )
613+ 2 <= c_dbcsr_acc_opencl_config .profile ||
614+ # else
615+ 1 <= c_dbcsr_acc_opencl_config .profile ||
616+ # endif
617+ 0 > c_dbcsr_acc_opencl_config .profile )
618+ {
619+ const int profile = LIBXSMM_MAX (c_dbcsr_acc_opencl_config .profile , 2 );
620+ c_dbcsr_acc_opencl_hist_create (& c_dbcsr_acc_opencl_config .hist_h2d , profile + 1 , profile * 4 , 2 );
621+ c_dbcsr_acc_opencl_hist_create (& c_dbcsr_acc_opencl_config .hist_d2h , profile + 1 , profile * 4 , 2 );
622+ c_dbcsr_acc_opencl_hist_create (& c_dbcsr_acc_opencl_config .hist_d2d , profile + 1 , profile * 4 , 2 );
623+ }
624+ else {
625+ assert (NULL == c_dbcsr_acc_opencl_config .hist_h2d );
626+ assert (NULL == c_dbcsr_acc_opencl_config .hist_d2h );
627+ assert (NULL == c_dbcsr_acc_opencl_config .hist_d2d );
628+ }
615629 if (EXIT_SUCCESS == result ) { /* lastly, print active device and list of devices */
616630# if defined(ACC_OPENCL_ACTIVATE )
617631 if (0 <= ACC_OPENCL_ACTIVATE && ACC_OPENCL_ACTIVATE < c_dbcsr_acc_opencl_config .ndevices ) {
@@ -694,6 +708,9 @@ LIBXSMM_ATTRIBUTE_DTOR void c_dbcsr_acc_opencl_finalize(void) {
694708 ACC_OPENCL_DESTROY ((ACC_OPENCL_LOCKTYPE * )(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE * i ));
695709 }
696710 /* release/reset buffers */
711+ c_dbcsr_acc_opencl_hist_free (c_dbcsr_acc_opencl_config .hist_h2d );
712+ c_dbcsr_acc_opencl_hist_free (c_dbcsr_acc_opencl_config .hist_d2h );
713+ c_dbcsr_acc_opencl_hist_free (c_dbcsr_acc_opencl_config .hist_d2d );
697714# if defined(ACC_OPENCL_MEM_DEVPTR )
698715 free (c_dbcsr_acc_opencl_config .memptrs );
699716 free (c_dbcsr_acc_opencl_config .memptr_data );
@@ -730,6 +747,7 @@ int c_dbcsr_acc_finalize(void) {
730747# endif
731748 assert (c_dbcsr_acc_opencl_config .ndevices < ACC_OPENCL_MAXNDEVS );
732749 if (0 != c_dbcsr_acc_opencl_config .ndevices && NULL != cleanup ) {
750+ const int precision [] = {0 , 1 };
733751 if (2 <= c_dbcsr_acc_opencl_config .verbosity || 0 > c_dbcsr_acc_opencl_config .verbosity ) {
734752 const cl_device_id device_id = c_dbcsr_acc_opencl_config .devices [c_dbcsr_acc_opencl_config .device_id ];
735753 int d ;
@@ -741,6 +759,9 @@ int c_dbcsr_acc_finalize(void) {
741759 }
742760 fprintf (stderr , "\n" );
743761 }
762+ c_dbcsr_acc_opencl_hist_print (stderr , c_dbcsr_acc_opencl_config .hist_h2d , "\nPROF ACC/OpenCL: H2D" , precision );
763+ c_dbcsr_acc_opencl_hist_print (stderr , c_dbcsr_acc_opencl_config .hist_d2h , "\nPROF ACC/OpenCL: D2H" , precision );
764+ c_dbcsr_acc_opencl_hist_print (stderr , c_dbcsr_acc_opencl_config .hist_d2d , "\nPROF ACC/OpenCL: D2D" , precision );
744765# if defined(__DBCSR_ACC )
745766 /* DBCSR may call c_dbcsr_acc_init as well as libsmm_acc_init() since both interface are used.
746767 * libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface).
@@ -772,7 +793,7 @@ int c_dbcsr_acc_get_ndevices(int* ndevices) {
772793 c_dbcsr_timeset ((const char * * )& routine_name_ptr , & routine_name_len , & routine_handle );
773794 }
774795# endif
775- # if defined(ACC_OPENCL_LAZYINIT )
796+ # if defined(__DBCSR_ACC ) /* lazy initialization */
776797 /* DBCSR calls c_dbcsr_acc_get_ndevices before calling c_dbcsr_acc_init. */
777798 result = c_dbcsr_acc_init ();
778799 if (EXIT_SUCCESS == result )
@@ -1135,7 +1156,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i
11351156 else devinfo -> wgsize [2 ] = 0 ;
11361157# if defined(ACC_OPENCL_XHINTS ) && defined(ACC_OPENCL_MEM_DEVPTR )
11371158 if (0 != (1 & c_dbcsr_acc_opencl_config .xhints ) && 2 <= * devinfo -> std_level && 0 != devinfo -> intel &&
1138- 0 == devinfo -> unified &&
1159+ 0 == devinfo -> unified && 0 == c_dbcsr_acc_opencl_config . profile &&
11391160 EXIT_SUCCESS == clGetDeviceInfo (active_id , CL_DEVICE_PLATFORM , sizeof (cl_platform_id ), & platform , NULL ) &&
11401161 EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor (active_id , "intel" , 2 /*platform vendor*/ ) &&
11411162 EXIT_SUCCESS == clGetDeviceInfo (active_id , 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/ , sizeof (cl_bitfield ),
@@ -1743,6 +1764,192 @@ int c_dbcsr_acc_opencl_set_kernel_ptr(cl_kernel kernel, cl_uint arg_index, const
17431764 : clSetKernelArg (kernel , arg_index , sizeof (cl_mem ), & arg_value ));
17441765}
17451766
1767+
1768+ double c_dbcsr_acc_opencl_duration (cl_event event , int * result_code ) {
1769+ cl_ulong begin = 0 , end = 0 ;
1770+ int r = EXIT_FAILURE ;
1771+ double result = 0 ;
1772+ if (NULL != event ) {
1773+ r = clGetEventProfilingInfo (event , CL_PROFILING_COMMAND_START , sizeof (cl_ulong ), & begin , NULL );
1774+ if (EXIT_SUCCESS == r ) {
1775+ r = clGetEventProfilingInfo (event , CL_PROFILING_COMMAND_END , sizeof (cl_ulong ), & end , NULL );
1776+ if (EXIT_SUCCESS == r ) {
1777+ result = 1E-9 * LIBXSMM_DELTA (begin , end ); /* Nanoseconds->seconds */
1778+ }
1779+ }
1780+ }
1781+ if (NULL != result_code ) * result_code = r ;
1782+ return result ;
1783+ }
1784+
1785+
/** State behind the opaque histogram handle used by the c_dbcsr_acc_opencl_hist_* functions. */
typedef struct c_dbcsr_acc_opencl_hist_t {
  /** Storage of nvals values per entry; holds queued samples first and per-bucket values later. */
  double* vals;
  /** Smallest leading value (vals[0] of a sample) seen while filling the queue. */
  double min;
  /** Largest leading value (vals[0] of a sample) seen while filling the queue. */
  double max;
  /** Per-bucket sample counters (nbuckets entries are allocated). */
  int* buckets;
  /** Number of buckets; may be reduced if fewer samples than buckets were collected. */
  int nbuckets;
  /** Number of samples to queue before bucketing; reset to zero once the queue was converted. */
  int nqueue;
  /** Number of values per sample. */
  int nvals;
  /** Number of samples added so far. */
  int n;
} c_dbcsr_acc_opencl_hist_t;
1790+
1791+
1792+ void c_dbcsr_acc_opencl_hist_create (void * * hist , int nbuckets , int nqueue , int nvals ) {
1793+ c_dbcsr_acc_opencl_hist_t * h = malloc (sizeof (c_dbcsr_acc_opencl_hist_t ));
1794+ assert (NULL != hist && 0 < nbuckets && 0 < nqueue && 0 < nvals );
1795+ if (NULL != h ) {
1796+ h -> vals = malloc (sizeof (double ) * LIBXSMM_MAX (nbuckets , nqueue ) * nvals );
1797+ h -> buckets = calloc (nbuckets , sizeof (int ));
1798+ if (NULL != h -> vals && NULL != h -> buckets ) {
1799+ union {
1800+ int raw ;
1801+ float value ;
1802+ } inf = {0 };
1803+ # if defined(INFINITY ) && /*overflow warning*/ !defined(_CRAYC )
1804+ inf .value = (float )(INFINITY );
1805+ # else
1806+ inf .raw = 0x7F800000 ;
1807+ # endif
1808+ h -> min = + inf .value ;
1809+ h -> max = - inf .value ;
1810+ h -> nbuckets = nbuckets ;
1811+ h -> nqueue = nqueue ;
1812+ h -> nvals = nvals ;
1813+ h -> n = 0 ;
1814+ }
1815+ else {
1816+ free (h -> buckets );
1817+ free (h -> vals );
1818+ free (h );
1819+ h = NULL ;
1820+ }
1821+ }
1822+ * hist = h ;
1823+ }
1824+
1825+
1826+ void c_dbcsr_acc_opencl_hist_add (ACC_OPENCL_LOCKTYPE * lock , void * hist , const double vals []) {
1827+ if (NULL != hist ) {
1828+ c_dbcsr_acc_opencl_hist_t * const h = (c_dbcsr_acc_opencl_hist_t * )hist ;
1829+ int i , j , k ;
1830+ if (NULL != lock ) ACC_OPENCL_ACQUIRE (lock );
1831+ if (h -> nqueue <= h -> n ++ ) {
1832+ const double * values , w = h -> max - h -> min ;
1833+ const int * buckets ;
1834+ if (h -> n == h -> nqueue ) {
1835+ c_dbcsr_acc_opencl_hist_get (NULL /*lock*/ , hist , & buckets , NULL /*nbuckets*/ , NULL /*range*/ , & values , NULL /*nvals*/ );
1836+ }
1837+ for (i = 1 ; i <= h -> nbuckets ; ++ i ) {
1838+ const double q = h -> min + i * w / h -> nbuckets ;
1839+ if (vals [0 ] <= q || h -> nbuckets == i ) {
1840+ for (k = 0 , j = (i - 1 ) * h -> nvals ; k < h -> nvals ; ++ k ) {
1841+ if (0 != h -> buckets [i - 1 ]) h -> vals [j + k ] += vals [k ];
1842+ else h -> vals [j + k ] = vals [k ];
1843+ }
1844+ ++ h -> buckets [i - 1 ];
1845+ break ;
1846+ }
1847+ }
1848+ }
1849+ else { /* fill-phase */
1850+ if (h -> min > vals [0 ]) h -> min = vals [0 ];
1851+ if (h -> max < vals [0 ]) h -> max = vals [0 ];
1852+ for (k = 0 , j = (h -> n - 1 ) * h -> nvals ; k < h -> nvals ; ++ k ) {
1853+ h -> vals [j + k ] = vals [k ];
1854+ }
1855+ }
1856+ if (NULL != lock ) ACC_OPENCL_RELEASE (lock );
1857+ }
1858+ }
1859+
1860+
1861+ void c_dbcsr_acc_opencl_hist_get (
1862+ ACC_OPENCL_LOCKTYPE * lock , void * hist , const int * * buckets , int * nbuckets , double range [2 ], const double * * vals , int * nvals ) {
1863+ int * b = NULL , m = 0 , n = 0 , i , j , k ;
1864+ double * v = NULL , r [] = {0 , 0 };
1865+ assert (NULL != buckets || NULL != range || NULL != vals );
1866+ if (NULL != hist ) {
1867+ c_dbcsr_acc_opencl_hist_t * const h = (c_dbcsr_acc_opencl_hist_t * )hist ;
1868+ if (NULL != lock ) ACC_OPENCL_ACQUIRE (lock );
1869+ if (h -> n <= h -> nqueue ) {
1870+ const double w = h -> max - h -> min ;
1871+ if (h -> n < h -> nbuckets ) h -> nbuckets = h -> n ;
1872+ for (i = 1 , j = 0 ; i <= h -> nbuckets ; j = h -> nvals * i ++ ) {
1873+ const double p = h -> min + (i - 1 ) * w / h -> nbuckets , q = h -> min + i * w / h -> nbuckets ;
1874+ for (n = 0 , m = 0 ; n < h -> n ; m = ++ n * h -> nvals ) {
1875+ if (0 == h -> buckets [n ] && (p < h -> vals [m ] || 1 == i ) && (h -> vals [m ] <= q || h -> nbuckets == i )) {
1876+ if (j != m ) {
1877+ if (0 != h -> buckets [i - 1 ]) { /* accumulate */
1878+ for (k = 0 ; k < h -> nvals ; ++ k ) {
1879+ h -> vals [j + k ] = 0.5 * (h -> vals [j + k ] + h -> vals [m + k ]);
1880+ }
1881+ }
1882+ else { /* initialize/swap */
1883+ for (k = 0 ; k < h -> nvals ; ++ k ) {
1884+ const double value = h -> vals [m + k ];
1885+ h -> vals [m + k ] = h -> vals [j + k ];
1886+ h -> vals [j + k ] = value ;
1887+ }
1888+ }
1889+ }
1890+ ++ h -> buckets [i - 1 ];
1891+ }
1892+ }
1893+ }
1894+ h -> nqueue = 0 ;
1895+ }
1896+ if (0 < h -> n ) {
1897+ r [0 ] = h -> min ;
1898+ r [1 ] = h -> max ;
1899+ b = h -> buckets ;
1900+ n = h -> nbuckets ;
1901+ v = h -> vals ;
1902+ m = h -> nvals ;
1903+ }
1904+ if (NULL != lock ) ACC_OPENCL_RELEASE (lock );
1905+ }
1906+ if (NULL != nbuckets ) * nbuckets = n ;
1907+ if (NULL != buckets ) * buckets = b ;
1908+ if (NULL != nvals ) * nvals = m ;
1909+ if (NULL != vals ) * vals = v ;
1910+ if (NULL != range ) {
1911+ range [0 ] = r [0 ];
1912+ range [1 ] = r [1 ];
1913+ }
1914+ }
1915+
1916+
/**
 * Print a histogram: an optional title followed by one line per bucket.
 *
 * @param stream  Output stream; nothing is printed if NULL.
 * @param hist    Histogram handle; nothing is printed if it provides no buckets/values.
 * @param title   Optional title (may be NULL), printed on its own line.
 * @param prec    Optional (may be NULL) number of decimals: prec[0] applies to the upper
 *                bound of a bucket, and prec[k] applies to the k-th value of a bucket
 *                (one entry per value is read). Default formatting is used if NULL.
 *
 * The histogram is queried via c_dbcsr_acc_opencl_hist_get (without lock) even
 * if stream is NULL.
 */
void c_dbcsr_acc_opencl_hist_print(FILE* stream, void* hist, const char title[], const int prec[]) {
  const int* counts = NULL;
  const double* values = NULL;
  double bounds[2], width;
  int ncounts = 0, nvalues = 0, bucket, k;
  c_dbcsr_acc_opencl_hist_get(NULL /*lock*/, hist, &counts, &ncounts, bounds, &values, &nvalues);
  if (NULL == stream || NULL == counts || 0 >= ncounts || NULL == values || 0 >= nvalues) {
    return; /* nothing to print */
  }
  width = bounds[1] - bounds[0];
  if (NULL != title) fprintf(stream, "%s\n", title);
  for (bucket = 1; bucket <= ncounts; ++bucket) {
    const int offset = nvalues * (bucket - 1); /* first value of this bucket */
    const int count = counts[bucket - 1];
    double upper = bounds[0] + bucket * width / ncounts;
    if (bucket == ncounts) { /* last bucket: the leading value may exceed the computed bound */
      upper = LIBXSMM_MAX(upper, values[offset]);
    }
    if (NULL != prec) {
      fprintf(stream, "\t#%i <= %.*f: %i", bucket, prec[0], upper, count);
    }
    else {
      fprintf(stream, "\t#%i <= %f: %i", bucket, upper, count);
    }
    if (0 != count) { /* values are only meaningful for non-empty buckets */
      fprintf(stream, " ->");
      for (k = 0; k < nvalues; ++k) {
        if (NULL != prec) {
          fprintf(stream, " %.*f", prec[k], values[offset + k]);
        }
        else {
          fprintf(stream, " %f", values[offset + k]);
        }
      }
    }
    fprintf(stream, "\n");
  }
}
1942+
1943+
1944+ void c_dbcsr_acc_opencl_hist_free (void * hist ) {
1945+ if (NULL != hist ) {
1946+ c_dbcsr_acc_opencl_hist_t * const h = (c_dbcsr_acc_opencl_hist_t * )hist ;
1947+ free (h -> buckets );
1948+ free (h -> vals );
1949+ free (h );
1950+ }
1951+ }
1952+
17461953# if defined(__cplusplus )
17471954 }
17481955# endif
0 commit comments