Skip to content

Commit 6da439d

Browse files
committed
ocl: histogram for profiling performance
- Ensure level-2 profile only for external transfers. - API: introduced c_dbcsr_acc_opencl_hist. - Simplified pool allocation (API). - Cleaned-up initialization.
1 parent 9b7a308 commit 6da439d

File tree

6 files changed

+344
-114
lines changed

6 files changed

+344
-114
lines changed

src/acc/opencl/acc_opencl.c

Lines changed: 235 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# include <string.h>
1212
# include <limits.h>
1313
# include <ctype.h>
14+
# include <math.h>
1415
# if defined(_WIN32)
1516
# include <windows.h>
1617
# include <process.h>
@@ -154,17 +155,14 @@ void c_dbcsr_acc_opencl_configure(void) {
154155
const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
155156
const char *const env_nranks = getenv("MPI_LOCALNRANKS"), *const env_devsplit = getenv("ACC_OPENCL_DEVSPLIT");
156157
const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP");
158+
const char *const env_profile = getenv("ACC_OPENCL_PROFILE"), *const env_nlocks = getenv("ACC_OPENCL_NLOCKS");
157159
const char *const env_debug = getenv("ACC_OPENCL_DEBUG"), *const env_timer = getenv("ACC_OPENCL_TIMER");
158160
const char* const env_dump = (NULL != env_dump_acc ? env_dump_acc : getenv("IGC_ShaderDumpEnable"));
159161
const char *const env_neo = getenv("NEOReadDebugKeys"), *const env_wa = getenv("ACC_OPENCL_WA");
162+
static char neo_enable_debug_keys[] = "NEOReadDebugKeys=1";
160163
# if defined(ACC_OPENCL_STREAM_PRIORITIES)
161164
const char* const env_priority = getenv("ACC_OPENCL_PRIORITY");
162165
# endif
163-
# if defined(ACC_OPENCL_PROFILE_DBCSR)
164-
const char* const env_profile = getenv("ACC_OPENCL_PROFILE");
165-
# endif
166-
const char* const env_nlocks = getenv("ACC_OPENCL_NLOCKS");
167-
static char neo_enable_debug_keys[] = "NEOReadDebugKeys=1";
168166
# if defined(ACC_OPENCL_NCCS)
169167
const char* const env_nccs = getenv("ACC_OPENCL_NCCS");
170168
const int nccs = (NULL == env_nccs ? ACC_OPENCL_NCCS : atoi(env_nccs));
@@ -183,19 +181,16 @@ void c_dbcsr_acc_opencl_configure(void) {
183181
const char* const env_async = NULL;
184182
const int async_default = 0;
185183
# endif
186-
187-
188184
const int nlocks = (NULL == env_nlocks ? 1 /*default*/ : atoi(env_nlocks));
189185
const int neo = (NULL == env_neo ? 1 : atoi(env_neo));
190186
int i;
191187
# if defined(_OPENMP)
192188
const int max_threads = omp_get_max_threads(), num_threads = omp_get_num_threads();
189+
memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config));
193190
c_dbcsr_acc_opencl_config.nthreads = (num_threads < max_threads ? max_threads : num_threads);
194-
c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_MAXNITEMS * max_threads)
195-
: (ACC_OPENCL_MAXNITEMS));
196191
# else
192+
memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config));
197193
c_dbcsr_acc_opencl_config.nthreads = 1;
198-
c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_MAXNITEMS;
199194
# endif
200195
c_dbcsr_acc_opencl_config.nranks = LIBXSMM_MAX(NULL != env_nranks ? atoi(env_nranks) : 1, 1);
201196
c_dbcsr_acc_opencl_config.nrank = (NULL != env_rank ? atoi(env_rank) : 0) % c_dbcsr_acc_opencl_config.nranks;
@@ -219,9 +214,7 @@ void c_dbcsr_acc_opencl_configure(void) {
219214
# if defined(ACC_OPENCL_STREAM_PRIORITIES)
220215
c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
221216
# endif
222-
# if defined(ACC_OPENCL_PROFILE_DBCSR)
223217
c_dbcsr_acc_opencl_config.profile = (NULL == env_profile ? /*default*/ 0 : atoi(env_profile));
224-
# endif
225218
c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? xhints_default : atoi(env_xhints));
226219
c_dbcsr_acc_opencl_config.async = (NULL == env_async ? async_default : atoi(env_async));
227220
c_dbcsr_acc_opencl_config.dump = (NULL == env_dump ? /*default*/ 0 : atoi(env_dump));
@@ -337,14 +330,16 @@ int c_dbcsr_acc_init(void) {
337330
# endif
338331
# if defined(ACC_OPENCL_PROFILE_DBCSR)
339332
int routine_handle;
333+
c_dbcsr_acc_opencl_configure();
340334
if (0 != c_dbcsr_acc_opencl_config.profile) {
341335
static const char* const routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR;
342336
static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1);
343337
c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle);
344338
}
339+
# else
340+
c_dbcsr_acc_opencl_configure();
345341
# endif
346342
/* eventually touch OpenCL/compute runtime after configure */
347-
c_dbcsr_acc_opencl_configure();
348343
if (0 == c_dbcsr_acc_opencl_config.ndevices && EXIT_SUCCESS == result) { /* avoid to initialize multiple times */
349344
char buffer[ACC_OPENCL_BUFFERSIZE];
350345
cl_platform_id platforms[ACC_OPENCL_MAXNDEVS] = {NULL};
@@ -548,15 +543,16 @@ int c_dbcsr_acc_init(void) {
548543
assert(0 < c_dbcsr_acc_opencl_config.ndevices);
549544
assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS);
550545
# if defined(ACC_OPENCL_MEM_DEVPTR)
551-
c_dbcsr_acc_opencl_config.memptrs = NULL;
552-
c_dbcsr_acc_opencl_config.memptr_data = NULL;
553-
c_dbcsr_acc_opencl_config.nmemptrs = 0;
554-
# endif
555-
c_dbcsr_acc_opencl_config.streams = NULL;
556-
c_dbcsr_acc_opencl_config.events = NULL;
557-
c_dbcsr_acc_opencl_config.stream_data = NULL;
558-
c_dbcsr_acc_opencl_config.event_data = NULL;
559-
c_dbcsr_acc_opencl_config.nstreams = c_dbcsr_acc_opencl_config.nevents = 0;
546+
assert(NULL == c_dbcsr_acc_opencl_config.memptrs);
547+
assert(NULL == c_dbcsr_acc_opencl_config.memptr_data);
548+
assert(0 == c_dbcsr_acc_opencl_config.nmemptrs);
549+
# endif
550+
assert(NULL == c_dbcsr_acc_opencl_config.streams);
551+
assert(NULL == c_dbcsr_acc_opencl_config.events);
552+
assert(NULL == c_dbcsr_acc_opencl_config.stream_data);
553+
assert(NULL == c_dbcsr_acc_opencl_config.event_data);
554+
assert(0 == c_dbcsr_acc_opencl_config.nstreams);
555+
assert(0 == c_dbcsr_acc_opencl_config.nevents);
560556
# if defined(ACC_OPENCL_CACHE_DID)
561557
c_dbcsr_acc_opencl_active_id = device_id + 1; /* update c_dbcsr_acc_opencl_active_id */
562558
# endif
@@ -567,8 +563,8 @@ int c_dbcsr_acc_init(void) {
567563
c_dbcsr_acc_opencl_config.memptr_data = (c_dbcsr_acc_opencl_info_memptr_t*)malloc(
568564
sizeof(c_dbcsr_acc_opencl_info_memptr_t) * nhandles);
569565
if (NULL != c_dbcsr_acc_opencl_config.memptrs && NULL != c_dbcsr_acc_opencl_config.memptr_data) {
570-
c_dbcsr_acc_opencl_pmalloc_init(NULL /*lock*/, sizeof(c_dbcsr_acc_opencl_info_memptr_t),
571-
&c_dbcsr_acc_opencl_config.nmemptrs, (void**)c_dbcsr_acc_opencl_config.memptrs, c_dbcsr_acc_opencl_config.memptr_data);
566+
c_dbcsr_acc_opencl_pmalloc_init(sizeof(c_dbcsr_acc_opencl_info_memptr_t), &c_dbcsr_acc_opencl_config.nmemptrs,
567+
(void**)c_dbcsr_acc_opencl_config.memptrs, c_dbcsr_acc_opencl_config.memptr_data);
572568
}
573569
else {
574570
free(c_dbcsr_acc_opencl_config.memptrs);
@@ -585,7 +581,7 @@ int c_dbcsr_acc_init(void) {
585581
c_dbcsr_acc_opencl_config.stream_data = (c_dbcsr_acc_opencl_stream_t*)malloc(
586582
sizeof(c_dbcsr_acc_opencl_stream_t) * nhandles);
587583
if (NULL != c_dbcsr_acc_opencl_config.streams && NULL != c_dbcsr_acc_opencl_config.stream_data) {
588-
c_dbcsr_acc_opencl_pmalloc_init(NULL /*lock*/, sizeof(c_dbcsr_acc_opencl_stream_t), &c_dbcsr_acc_opencl_config.nstreams,
584+
c_dbcsr_acc_opencl_pmalloc_init(sizeof(c_dbcsr_acc_opencl_stream_t), &c_dbcsr_acc_opencl_config.nstreams,
589585
(void**)c_dbcsr_acc_opencl_config.streams, c_dbcsr_acc_opencl_config.stream_data);
590586
}
591587
else {
@@ -601,7 +597,7 @@ int c_dbcsr_acc_init(void) {
601597
c_dbcsr_acc_opencl_config.events = (cl_event**)malloc(sizeof(cl_event*) * nhandles);
602598
c_dbcsr_acc_opencl_config.event_data = (cl_event*)malloc(sizeof(cl_event) * nhandles);
603599
if (NULL != c_dbcsr_acc_opencl_config.events && NULL != c_dbcsr_acc_opencl_config.event_data) {
604-
c_dbcsr_acc_opencl_pmalloc_init(NULL /*lock*/, sizeof(cl_event*), &c_dbcsr_acc_opencl_config.nevents,
600+
c_dbcsr_acc_opencl_pmalloc_init(sizeof(cl_event*), &c_dbcsr_acc_opencl_config.nevents,
605601
(void**)c_dbcsr_acc_opencl_config.events, c_dbcsr_acc_opencl_config.event_data);
606602
}
607603
else {
@@ -612,6 +608,24 @@ int c_dbcsr_acc_init(void) {
612608
c_dbcsr_acc_opencl_config.nevents = 0;
613609
result = EXIT_FAILURE;
614610
}
611+
if (
612+
# if defined(ACC_OPENCL_PROFILE_DBCSR)
613+
2 <= c_dbcsr_acc_opencl_config.profile ||
614+
# else
615+
1 <= c_dbcsr_acc_opencl_config.profile ||
616+
# endif
617+
0 > c_dbcsr_acc_opencl_config.profile)
618+
{
619+
const int profile = LIBXSMM_MAX(c_dbcsr_acc_opencl_config.profile, 2);
620+
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_h2d, profile + 1, profile * 4, 2);
621+
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2h, profile + 1, profile * 4, 2);
622+
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2d, profile + 1, profile * 4, 2);
623+
}
624+
else {
625+
assert(NULL == c_dbcsr_acc_opencl_config.hist_h2d);
626+
assert(NULL == c_dbcsr_acc_opencl_config.hist_d2h);
627+
assert(NULL == c_dbcsr_acc_opencl_config.hist_d2d);
628+
}
615629
if (EXIT_SUCCESS == result) { /* lastly, print active device and list of devices */
616630
# if defined(ACC_OPENCL_ACTIVATE)
617631
if (0 <= ACC_OPENCL_ACTIVATE && ACC_OPENCL_ACTIVATE < c_dbcsr_acc_opencl_config.ndevices) {
@@ -694,6 +708,9 @@ LIBXSMM_ATTRIBUTE_DTOR void c_dbcsr_acc_opencl_finalize(void) {
694708
ACC_OPENCL_DESTROY((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE * i));
695709
}
696710
/* release/reset buffers */
711+
c_dbcsr_acc_opencl_hist_free(c_dbcsr_acc_opencl_config.hist_h2d);
712+
c_dbcsr_acc_opencl_hist_free(c_dbcsr_acc_opencl_config.hist_d2h);
713+
c_dbcsr_acc_opencl_hist_free(c_dbcsr_acc_opencl_config.hist_d2d);
697714
# if defined(ACC_OPENCL_MEM_DEVPTR)
698715
free(c_dbcsr_acc_opencl_config.memptrs);
699716
free(c_dbcsr_acc_opencl_config.memptr_data);
@@ -730,6 +747,7 @@ int c_dbcsr_acc_finalize(void) {
730747
# endif
731748
assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS);
732749
if (0 != c_dbcsr_acc_opencl_config.ndevices && NULL != cleanup) {
750+
const int precision[] = {0, 1};
733751
if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
734752
const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[c_dbcsr_acc_opencl_config.device_id];
735753
int d;
@@ -741,6 +759,9 @@ int c_dbcsr_acc_finalize(void) {
741759
}
742760
fprintf(stderr, "\n");
743761
}
762+
c_dbcsr_acc_opencl_hist_print(stderr, c_dbcsr_acc_opencl_config.hist_h2d, "\nPROF ACC/OpenCL: H2D", precision);
763+
c_dbcsr_acc_opencl_hist_print(stderr, c_dbcsr_acc_opencl_config.hist_d2h, "\nPROF ACC/OpenCL: D2H", precision);
764+
c_dbcsr_acc_opencl_hist_print(stderr, c_dbcsr_acc_opencl_config.hist_d2d, "\nPROF ACC/OpenCL: D2D", precision);
744765
# if defined(__DBCSR_ACC)
745766
/* DBCSR may call c_dbcsr_acc_init as well as libsmm_acc_init() since both interface are used.
746767
* libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface).
@@ -772,7 +793,7 @@ int c_dbcsr_acc_get_ndevices(int* ndevices) {
772793
c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle);
773794
}
774795
# endif
775-
# if defined(ACC_OPENCL_LAZYINIT)
796+
# if defined(__DBCSR_ACC) /* lazy initialization */
776797
/* DBCSR calls c_dbcsr_acc_get_ndevices before calling c_dbcsr_acc_init. */
777798
result = c_dbcsr_acc_init();
778799
if (EXIT_SUCCESS == result)
@@ -1135,7 +1156,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i
11351156
else devinfo->wgsize[2] = 0;
11361157
# if defined(ACC_OPENCL_XHINTS) && defined(ACC_OPENCL_MEM_DEVPTR)
11371158
if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *devinfo->std_level && 0 != devinfo->intel &&
1138-
0 == devinfo->unified &&
1159+
0 == devinfo->unified && 0 == c_dbcsr_acc_opencl_config.profile &&
11391160
EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) &&
11401161
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) &&
11411162
EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield),
@@ -1743,6 +1764,192 @@ int c_dbcsr_acc_opencl_set_kernel_ptr(cl_kernel kernel, cl_uint arg_index, const
17431764
: clSetKernelArg(kernel, arg_index, sizeof(cl_mem), &arg_value));
17441765
}
17451766

1767+
1768+
/*
 * Queries the profiling timestamps of the given event and returns the time
 * between command start and command end in seconds.
 *
 * event:       event to be queried; NULL yields a duration of zero and
 *              the status EXIT_FAILURE.
 * result_code: optional (may be NULL); receives EXIT_FAILURE for a NULL event,
 *              otherwise the status of the last clGetEventProfilingInfo call.
 *
 * Returns zero if any query failed.
 */
double c_dbcsr_acc_opencl_duration(cl_event event, int* result_code) {
  double seconds = 0;
  int status = EXIT_FAILURE;
  if (NULL != event) {
    cl_ulong t_start = 0, t_end = 0;
    status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_start, NULL);
    if (EXIT_SUCCESS == status) { /* only ask for the end if the start was available */
      status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, NULL);
    }
    if (EXIT_SUCCESS == status) {
      /* timestamps are in nanoseconds; LIBXSMM_DELTA presumably yields the distance between both */
      seconds = 1E-9 * LIBXSMM_DELTA(t_start, t_end);
    }
  }
  if (NULL != result_code) *result_code = status;
  return seconds;
}
1784+
1785+
1786+
/* Internal state of a histogram (handed out to callers as an opaque void*). */
typedef struct c_dbcsr_acc_opencl_hist_t {
  /* nvals values per slot; first used as queue of raw samples, later as per-bucket data */
  double* vals;
  /* smallest key (vals[0] of a sample) seen while the queue was filled */
  double min;
  /* largest key (vals[0] of a sample) seen while the queue was filled */
  double max;
  /* number of samples per bucket */
  int* buckets;
  /* number of buckets */
  int nbuckets;
  /* number of samples to queue before buckets are formed (zero once buckets exist) */
  int nqueue;
  /* number of values per sample */
  int nvals;
  /* number of samples added so far */
  int n;
} c_dbcsr_acc_opencl_hist_t;
1790+
1791+
1792+
void c_dbcsr_acc_opencl_hist_create(void** hist, int nbuckets, int nqueue, int nvals) {
1793+
c_dbcsr_acc_opencl_hist_t* h = malloc(sizeof(c_dbcsr_acc_opencl_hist_t));
1794+
assert(NULL != hist && 0 < nbuckets && 0 < nqueue && 0 < nvals);
1795+
if (NULL != h) {
1796+
h->vals = malloc(sizeof(double) * LIBXSMM_MAX(nbuckets, nqueue) * nvals);
1797+
h->buckets = calloc(nbuckets, sizeof(int));
1798+
if (NULL != h->vals && NULL != h->buckets) {
1799+
union {
1800+
int raw;
1801+
float value;
1802+
} inf = {0};
1803+
# if defined(INFINITY) && /*overflow warning*/ !defined(_CRAYC)
1804+
inf.value = (float)(INFINITY);
1805+
# else
1806+
inf.raw = 0x7F800000;
1807+
# endif
1808+
h->min = +inf.value;
1809+
h->max = -inf.value;
1810+
h->nbuckets = nbuckets;
1811+
h->nqueue = nqueue;
1812+
h->nvals = nvals;
1813+
h->n = 0;
1814+
}
1815+
else {
1816+
free(h->buckets);
1817+
free(h->vals);
1818+
free(h);
1819+
h = NULL;
1820+
}
1821+
}
1822+
*hist = h;
1823+
}
1824+
1825+
1826+
/*
 * Adds one sample to the histogram.
 *
 * lock: optional lock (may be NULL) held while the histogram is updated.
 * hist: histogram handle; NULL makes the call a no-op.
 * vals: sample consisting of nvals values (as given at creation);
 *       vals[0] is the key which selects the bucket.
 *
 * The first nqueue samples are only queued and determine the range (min/max).
 * Once the queue is full, the queued samples are sorted into buckets (once),
 * and every further sample is accumulated into the bucket matching its key.
 */
void c_dbcsr_acc_opencl_hist_add(ACC_OPENCL_LOCKTYPE* lock, void* hist, const double vals[]) {
  if (NULL != hist) {
    c_dbcsr_acc_opencl_hist_t* const h = (c_dbcsr_acc_opencl_hist_t*)hist;
    int i, j, k;
    if (NULL != lock) ACC_OPENCL_ACQUIRE(lock);
    /* compare against the number of samples already stored (increment happens last) */
    if (h->nqueue <= h->n) { /* bucket-phase */
      double w;
      if (h->nqueue == h->n) { /* queue just became full: turn queued samples into buckets */
        const double* values = NULL;
        const int* buckets = NULL;
        c_dbcsr_acc_opencl_hist_get(NULL /*lock*/, hist, &buckets, NULL /*nbuckets*/, NULL /*range*/, &values, NULL /*nvals*/);
      }
      w = h->max - h->min;
      for (i = 1; i <= h->nbuckets; ++i) {
        const double q = h->min + i * w / h->nbuckets; /* upper bound of bucket i */
        if (vals[0] <= q || h->nbuckets == i) { /* last bucket also takes keys beyond max */
          for (k = 0, j = (i - 1) * h->nvals; k < h->nvals; ++k) {
            if (0 != h->buckets[i - 1]) h->vals[j + k] += vals[k];
            else h->vals[j + k] = vals[k]; /* first sample of this bucket */
          }
          ++h->buckets[i - 1];
          break;
        }
      }
    }
    else { /* fill-phase */
      if (h->min > vals[0]) h->min = vals[0];
      if (h->max < vals[0]) h->max = vals[0];
      for (k = 0, j = h->n * h->nvals; k < h->nvals; ++k) {
        h->vals[j + k] = vals[k];
      }
    }
    ++h->n;
    if (NULL != lock) ACC_OPENCL_RELEASE(lock);
  }
}
1859+
1860+
1861+
void c_dbcsr_acc_opencl_hist_get(
1862+
ACC_OPENCL_LOCKTYPE* lock, void* hist, const int** buckets, int* nbuckets, double range[2], const double** vals, int* nvals) {
1863+
int *b = NULL, m = 0, n = 0, i, j, k;
1864+
double *v = NULL, r[] = {0, 0};
1865+
assert(NULL != buckets || NULL != range || NULL != vals);
1866+
if (NULL != hist) {
1867+
c_dbcsr_acc_opencl_hist_t* const h = (c_dbcsr_acc_opencl_hist_t*)hist;
1868+
if (NULL != lock) ACC_OPENCL_ACQUIRE(lock);
1869+
if (h->n <= h->nqueue) {
1870+
const double w = h->max - h->min;
1871+
if (h->n < h->nbuckets) h->nbuckets = h->n;
1872+
for (i = 1, j = 0; i <= h->nbuckets; j = h->nvals * i++) {
1873+
const double p = h->min + (i - 1) * w / h->nbuckets, q = h->min + i * w / h->nbuckets;
1874+
for (n = 0, m = 0; n < h->n; m = ++n * h->nvals) {
1875+
if (0 == h->buckets[n] && (p < h->vals[m] || 1 == i) && (h->vals[m] <= q || h->nbuckets == i)) {
1876+
if (j != m) {
1877+
if (0 != h->buckets[i - 1]) { /* accumulate */
1878+
for (k = 0; k < h->nvals; ++k) {
1879+
h->vals[j + k] = 0.5 * (h->vals[j + k] + h->vals[m + k]);
1880+
}
1881+
}
1882+
else { /* initialize/swap */
1883+
for (k = 0; k < h->nvals; ++k) {
1884+
const double value = h->vals[m + k];
1885+
h->vals[m + k] = h->vals[j + k];
1886+
h->vals[j + k] = value;
1887+
}
1888+
}
1889+
}
1890+
++h->buckets[i - 1];
1891+
}
1892+
}
1893+
}
1894+
h->nqueue = 0;
1895+
}
1896+
if (0 < h->n) {
1897+
r[0] = h->min;
1898+
r[1] = h->max;
1899+
b = h->buckets;
1900+
n = h->nbuckets;
1901+
v = h->vals;
1902+
m = h->nvals;
1903+
}
1904+
if (NULL != lock) ACC_OPENCL_RELEASE(lock);
1905+
}
1906+
if (NULL != nbuckets) *nbuckets = n;
1907+
if (NULL != buckets) *buckets = b;
1908+
if (NULL != nvals) *nvals = m;
1909+
if (NULL != vals) *vals = v;
1910+
if (NULL != range) {
1911+
range[0] = r[0];
1912+
range[1] = r[1];
1913+
}
1914+
}
1915+
1916+
1917+
/*
 * Prints the histogram: one line per bucket showing the bucket's number,
 * its upper bound, the number of samples, and (if not empty) the values.
 *
 * stream: output stream; nothing is printed if NULL.
 * hist:   histogram handle; nothing is printed if NULL or empty.
 * title:  optional line printed ahead of the buckets (may be NULL).
 * prec:   optional precisions (may be NULL): prec[0] also formats the
 *         upper bound, and prec[k] formats the k-th value of a bucket.
 *
 * The histogram is queried using c_dbcsr_acc_opencl_hist_get (without lock).
 */
void c_dbcsr_acc_opencl_hist_print(FILE* stream, void* hist, const char title[], const int prec[]) {
  const double* values = NULL;
  const int* counts = NULL;
  int nbuckets = 0, nvals = 0, bucket, k;
  double range[2], width;
  c_dbcsr_acc_opencl_hist_get(NULL /*lock*/, hist, &counts, &nbuckets, range, &values, &nvals);
  if (NULL == stream || NULL == counts || 0 >= nbuckets || NULL == values || 0 >= nvals) {
    return; /* nothing to print */
  }
  width = range[1] - range[0];
  if (NULL != title) fprintf(stream, "%s\n", title);
  for (bucket = 0; bucket < nbuckets; ++bucket) {
    const int id = bucket + 1, offset = nvals * bucket, count = counts[bucket];
    const double q = range[0] + id * width / nbuckets;
    double upper = q;
    if (id == nbuckets) upper = LIBXSMM_MAX(q, values[offset]); /* last bucket is open-ended */
    if (NULL == prec) fprintf(stream, "\t#%i <= %f: %i", id, upper, count);
    else fprintf(stream, "\t#%i <= %.*f: %i", id, prec[0], upper, count);
    if (0 != count) {
      fprintf(stream, " ->");
      for (k = 0; k < nvals; ++k) {
        if (NULL == prec) fprintf(stream, " %f", values[offset + k]);
        else fprintf(stream, " %.*f", prec[k], values[offset + k]);
      }
    }
    fprintf(stream, "\n");
  }
}
1942+
1943+
1944+
void c_dbcsr_acc_opencl_hist_free(void* hist) {
1945+
if (NULL != hist) {
1946+
c_dbcsr_acc_opencl_hist_t* const h = (c_dbcsr_acc_opencl_hist_t*)hist;
1947+
free(h->buckets);
1948+
free(h->vals);
1949+
free(h);
1950+
}
1951+
}
1952+
17461953
# if defined(__cplusplus)
17471954
}
17481955
# endif

0 commit comments

Comments
 (0)