Skip to content

Commit 955e0c4

Browse files
committed
Fix a performance regression in datatype pack/unpack
When native types were introduced to the pack/unpack methods in 3063916, a set of divisions was added that the compiler cannot optimize out. This results in poor performance for small datatypes, where 4 divisions are executed for each chunk. This patch restores pack performance, raising it from 350MB/s back to 1.5GB/s on a datatype with 4 doubles in 2 cache lines. The patch moves the alignment checks and the stride computation into the macros, where the concrete datatype is available, so that the compiler can optimize the divisions/modulos. Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
1 parent 9c12f2a commit 955e0c4

File tree

1 file changed

+23
-28
lines changed

1 file changed

+23
-28
lines changed

opal/datatype/opal_datatype_pack_unpack_predefined.h

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,17 @@
179179
} \
180180
}
181181

182-
#define OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, type) \
182+
#define OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src_base, dest_base, count, blocklen, type, align) \
183183
{ \
184+
register unsigned long i = count; \
185+
if (((uintptr_t) src_base) % (align) || \
186+
((uintptr_t) dest_base) % (align) || \
187+
(elem->extent % (align) && cando_count > blocklen)) { \
188+
return OPAL_ERROR; \
189+
} \
184190
type *_src = (type *) src_base; \
185191
type *_dest = (type *) dest_base; \
186-
register unsigned long i = count; \
192+
size_t stride = elem->extent / sizeof(type); \
187193
if (blocklen == 1) { \
188194
OPAL_DATATYPE_PACK_PREDEFINED_BLOCKLEN_ONE(stride, blocklen); \
189195
} else if (blocklen == 2) { \
@@ -206,11 +212,18 @@
206212
dest_base = (unsigned char *) _dest; \
207213
}
208214

209-
#define OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src_base, dest_base, count, stride, blocklen, \
210-
type) \
215+
#define OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src_base, dest_base, count, blocklen, \
216+
type, align) \
211217
{ \
218+
if (((uintptr_t) src_base) % (align) || \
219+
((uintptr_t) dest_base) % (align) || \
220+
(elem->extent % (align) && cando_count > blocklen)) { \
221+
return OPAL_ERROR; \
222+
} \
212223
type *_src = (type *) src_base; \
213224
type *_dest = (type *) dest_base; \
225+
/* elem's extent but in terms of count rather than bytes */ \
226+
size_t stride = elem->extent / sizeof(type); \
214227
register unsigned long i = count; \
215228
/* (reversing the meanings of blocklen and stride and using the "PACK" macro) */ \
216229
if (blocklen == 1) { \
@@ -240,24 +253,15 @@ static inline int opal_datatype_unpack_predefined_element(unsigned char **rtn_sr
240253
size_t cando_count,
241254
const ddt_elem_desc_t *elem)
242255
{
243-
size_t stride; // elem's extent but in terms of count rather than bytes
244256
size_t blocklen;
245257
int id;
246-
int align;
247258

248259
id = elem->common.type;
249260
blocklen = elem->blocklen;
250-
stride = elem->extent / opal_datatype_basicDatatypes[id]->size;
251-
align = opal_datatype_basicDatatypes[id]->align;
252261

253262
unsigned char *src = *rtn_src;
254263
unsigned char *dest = *rtn_dest;
255264

256-
if ((uintptr_t) src % align || (uintptr_t) dest % align
257-
|| (elem->extent % align && cando_count > blocklen)) {
258-
return OPAL_ERROR;
259-
}
260-
261265
/*
262266
* Here as an example of how we want to call our macro, if the incoming id
263267
* were OPAL_DATATYPE_INT4, we want
@@ -279,9 +283,9 @@ static inline int opal_datatype_unpack_predefined_element(unsigned char **rtn_sr
279283
OPAL_DATATYPE_MYUNPACK_NOTAVAIL, 0); \
280284
} while (0)
281285

282-
#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
286+
#define OPAL_DATATYPE_MYUNPACK_AVAILABLE(TYPE, ALIGN, NAME, unused) \
283287
do { \
284-
OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
288+
OPAL_DATATYPE_UNPACK_PREDEFINED_ELEMENT(src, dest, cando_count, blocklen, TYPE, ALIGN); \
285289
success = true; \
286290
} while (0)
287291

@@ -375,34 +379,25 @@ static inline int opal_datatype_pack_predefined_element(unsigned char **rtn_src,
375379
size_t cando_count,
376380
const ddt_elem_desc_t *elem)
377381
{
378-
size_t stride; // elem's extent but in terms of count rather than bytes
379382
size_t blocklen;
380383
int id;
381-
int align;
382384

383385
id = elem->common.type;
384386
blocklen = elem->blocklen;
385-
stride = elem->extent / opal_datatype_basicDatatypes[id]->size;
386-
align = opal_datatype_basicDatatypes[id]->align;
387387

388388
unsigned char *src = *rtn_src;
389389
unsigned char *dest = *rtn_dest;
390390

391-
if ((uintptr_t) src % align || (uintptr_t) dest % align
392-
|| (elem->extent % align && cando_count > blocklen)) {
393-
return OPAL_ERROR;
394-
}
395-
396391
#define OPAL_DATATYPE_MYPACK(NAME) \
397392
do { \
398393
OPAL_DATATYPE_HANDLE_##NAME(OPAL_DATATYPE_MYPACK_AVAILABLE, OPAL_DATATYPE_MYPACK_NOTAVAIL, \
399394
0); \
400395
} while (0)
401396

402-
#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, unused_ALIGN, NAME, unused) \
403-
do { \
404-
OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src, dest, cando_count, stride, blocklen, TYPE); \
405-
success = true; \
397+
#define OPAL_DATATYPE_MYPACK_AVAILABLE(TYPE, ALIGN, NAME, unused) \
398+
do { \
399+
OPAL_DATATYPE_PACK_PREDEFINED_ELEMENT(src, dest, cando_count, blocklen, TYPE, ALIGN); \
400+
success = true; \
406401
} while (0)
407402

408403
#define OPAL_DATATYPE_MYPACK_NOTAVAIL(NAME, unused) \

0 commit comments

Comments
 (0)