From ff3b07fa21b0a0811f0bb6b081049412f7233854 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 22 Feb 2019 19:41:20 -0500 Subject: [PATCH] Add MCA parameters to define the size of memcpy chunks. Add support for vector copy, allowing the upper level to define specialized/optimized vector copy functions. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_copy.c | 23 ++++- opal/datatype/opal_datatype_copy.h | 123 ++++++++++++++++++------- opal/datatype/opal_datatype_internal.h | 5 +- opal/datatype/opal_datatype_module.c | 59 ++++++++---- 4 files changed, 152 insertions(+), 58 deletions(-) diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index 7bf94ef97b9..c1f51a5a25f 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -41,7 +41,15 @@ #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ -static size_t opal_datatype_memop_block_size = 128 * 1024; +size_t opal_datatype_memop_block_size = 128 * 1024; +size_t opal_datatype_cuda_memop_block_size = SIZE_MAX; /* or (size_t)-1 in pre C99 */ + +/* The MEM_OP_BLOCK_SIZE_CONST macro defines how a large contiguous memcpy + * should be split. In some cases having a pipeline might allow for + * cache write-backs, but in general (and certainly in the case of + * CUDA devices) this should be set to the largest size_t value. 
+ */ +#define MEM_OP_BLOCK_SIZE_CONST opal_datatype_memop_block_size /** * Non overlapping memory regions @@ -72,6 +80,10 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; #include "opal_datatype_copy.h" #if OPAL_CUDA_SUPPORT + +#undef MEM_OP_BLOCK_SIZE_CONST +#define MEM_OP_BLOCK_SIZE_CONST opal_datatype_cuda_memop_block_size + #include "opal_datatype_cuda.h" #undef MEM_OP_NAME @@ -92,9 +104,12 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; fct = copy_function; \ } \ } while(0) -#else + +#else /* OPAL_CUDA_SUPPORT */ + #define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function) -#endif + +#endif /* OPAL_CUDA_SUPPORT */ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count, char* destination_base, char* source_base ) diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 7aeac8e63ec..9e32cc50367 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. 
@@ -35,18 +35,61 @@ #endif -#define _predefined_data DT_CONCAT(MEM_OP_NAME,_predefined_data) -#define _contiguous_loop DT_CONCAT(MEM_OP_NAME,_contiguous_loop) #define _copy_content_same_ddt DT_CONCAT(MEM_OP_NAME,_copy_content_same_ddt) -static inline void _predefined_data( const dt_elem_desc_t* ELEM, - const opal_datatype_t* DATATYPE, - unsigned char* SOURCE_BASE, - size_t TOTAL_COUNT, - size_t COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE ) +#if !defined(MEM_OP_BLOCK_SIZE_CONST) +#error +#endif + +#if !defined(_memcpy_vector) + +#define _memcpy_vector DT_CONCAT(MEM_OP_NAME,_memcpy_vector) +#define __OPAL_DATATYPE_DEFINE__memcpy_vector + +static inline size_t +_memcpy_vector( unsigned char* dest, /* destination pointer of the copy */ + unsigned char* source, /* source pointer of the copy */ + size_t blength, /* size in bytes of each block */ + size_t count, /* the number of blocks */ + ptrdiff_t dstride, /* the stride at the destination of each block */ + ptrdiff_t sstride ) /* the stride at the source of each block */ +{ + size_t _length = 0; + if( (blength == (size_t)(sstride)) && (sstride == dstride) ) { + _length = count * blength; + /* both strides equal the block length: the blocks are contiguous, copy them in one shot */ + DO_DEBUG( opal_output( 0, "vector copy [*] %s( %p, %p, %" PRIsize_t " )\n", + STRINGIFY(MEM_OP_NAME), (void*)dest, (void*)source, _length ); ); + MEM_OP( dest, source, _length ); + } else { + for(size_t _i = 0; _i < count; _i++ ) { + /* the blocks are not contiguous on both sides: copy them one at a time */ + DO_DEBUG( opal_output( 0, "vector copy [%" PRIsize_t "] %s( %p, %p, %" PRIsize_t " )\n", + _i, STRINGIFY(MEM_OP_NAME), (void*)dest, (void*)source, blength ); ); + MEM_OP( dest, source, blength ); + _length += blength; + source += sstride; + dest += dstride; + } + } + return _length; +} +#endif /* !defined(_memcpy_vector) */ + +#if !defined(_predefined_data) + +#define _predefined_data DT_CONCAT(MEM_OP_NAME,_predefined_data) +#define 
__OPAL_DATATYPE_DEFINE__predefined_data + +static inline +void _predefined_data( const dt_elem_desc_t* ELEM, + const opal_datatype_t* DATATYPE, + unsigned char* SOURCE_BASE, + size_t TOTAL_COUNT, + size_t COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE ) { size_t _copy_count = (COUNT); size_t _copy_blength; @@ -57,38 +100,34 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; if( _copy_blength == (size_t)_elem->extent ) { - _copy_blength *= _copy_count; OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), (DATATYPE), (TOTAL_COUNT) ); - /* the extent and the size of the basic datatype are equals */ - DO_DEBUG( opal_output( 0, "copy 1. %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, _copy_blength, *(SPACE) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _copy_blength; - _destination += _copy_blength; } else { for(size_t _i = 0; _i < _copy_count; _i++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy 2. 
%s( %p, %p, %lu ) => space %lu\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEM_OP( _destination, _source, _copy_blength ); - _source += _elem->extent; - _destination += _elem->extent; } - _copy_blength *= _copy_count; } - *(SPACE) -= _copy_blength; + _copy_blength = _memcpy_vector( _destination, _source, + _copy_blength, _copy_count, + _elem->extent, _elem->extent ); + *(SPACE) -= _copy_blength; } +#endif /* !defined(_predefined_data) */ -static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, - const opal_datatype_t* DATATYPE, - unsigned char* SOURCE_BASE, - size_t TOTAL_COUNT, - size_t COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE ) +#if !defined(_contiguous_loop) +#define _contiguous_loop DT_CONCAT(MEM_OP_NAME,_contiguous_loop) +#define __OPAL_DATATYPE_DEFINE__contiguous_loop + +static inline +void _contiguous_loop( const dt_elem_desc_t* ELEM, + const opal_datatype_t* DATATYPE, + unsigned char* SOURCE_BASE, + size_t TOTAL_COUNT, + size_t COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE ) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); @@ -115,6 +154,7 @@ static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, } *(SPACE) -= _copy_loops; } +#endif /* !defined(_contiguous_loop) */ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count, char* destination_base, char* source_base ) @@ -146,7 +186,7 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i source += datatype->true_lb; if( (ptrdiff_t)datatype->size == extent ) { /* all contiguous == no gaps around */ size_t total_length = iov_len_local; - size_t memop_chunk = opal_datatype_memop_block_size; + size_t memop_chunk = MEM_OP_BLOCK_SIZE_CONST; while( total_length > 0 ) { if( 
memop_chunk > total_length ) memop_chunk = total_length; OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memop_chunk, @@ -251,3 +291,18 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i } } } + +#if defined(__OPAL_DATATYPE_DEFINE__memcpy_vector) +#undef __OPAL_DATATYPE_DEFINE__memcpy_vector +#undef _memcpy_vector +#endif /* defined(__OPAL_DATATYPE_DEFINE__memcpy_vector) */ + +#if defined(__OPAL_DATATYPE_DEFINE__predefined_data) +#undef __OPAL_DATATYPE_DEFINE__predefined_data +#undef _predefined_data +#endif /* defined(__OPAL_DATATYPE_DEFINE__predefined_data) */ + +#if defined(__OPAL_DATATYPE_DEFINE__contiguous_loop) +#undef __OPAL_DATATYPE_DEFINE__contiguous_loop +#undef _contiguous_loop +#endif /* defined(__OPAL_DATATYPE_DEFINE__contiguous_loop) */ diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index 955f003c5bc..edf161ddb28 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -515,5 +515,8 @@ extern bool opal_position_debug; extern bool opal_copy_debug; #endif /* OPAL_ENABLE_DEBUG */ +extern size_t opal_datatype_memop_block_size; +extern size_t opal_datatype_cuda_memop_block_size; + END_C_DECLS #endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 0d3d2687fce..75874895ed4 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -157,39 +157,39 @@ OPAL_DECLSPEC const opal_datatype_t* opal_datatype_basicDatatypes[OPAL_DATATYPE_ int opal_datatype_register_params(void) { -#if OPAL_ENABLE_DEBUG int ret; +#if OPAL_ENABLE_DEBUG ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_unpack_debug", - "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_unpack_debug); + "Whether to output debugging information in the ddt unpack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_unpack_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_pack_debug", - "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, 
OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_pack_debug); + "Whether to output debugging information in the ddt pack functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_pack_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_position_debug", - "Non zero lead to output generated by the datatype position functions", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_position_debug); + "Non zero lead to output generated by the datatype position functions", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_position_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "mpi", NULL, "ddt_copy_debug", - "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &opal_copy_debug); + "Whether to output debugging information in the ddt copy functions (nonzero = enabled)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &opal_copy_debug); if (0 > ret) { - return ret; + return ret; } ret = mca_base_var_register ("opal", "opal", NULL, "ddt_verbose", @@ -208,12 +208,33 @@ int opal_datatype_register_params(void) OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &opal_cuda_verbose); if (0 > ret) { - return ret; + return ret; } #endif #endif /* OPAL_ENABLE_DEBUG */ + /* Define the segment size for the data copy operations */ + ret = mca_base_var_register ("opal", "opal", NULL, "datatype_memcpy_block_size", + "Split all memory copies handled by the datatype engine", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, + 
&opal_datatype_memop_block_size); + if (0 > ret) { + return ret; + } + +#if OPAL_CUDA_SUPPORT + ret = mca_base_var_register ("opal", "opal", NULL, "datatype_cuda_memcpy_block_size", + "Split all CUDA memory copies handled by the datatype engine", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, + &opal_datatype_cuda_memop_block_size); + if (0 > ret) { + return ret; + } +#endif + return OPAL_SUCCESS; }