diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am index df548cc66ff..11799cfe3fe 100644 --- a/opal/mca/btl/uct/Makefile.am +++ b/opal/mca/btl/uct/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights # reserved. +# Copyright (c) 2025 Google, LLC. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -24,22 +25,31 @@ AM_CPPFLAGS = $(btl_uct_CPPFLAGS) amca_paramdir = $(AMCA_PARAM_SETS_DIR) -sources = \ +headers = \ btl_uct.h \ + btl_uct_rdma.h \ + btl_uct_endpoint.h \ + btl_uct_am.h \ + btl_uct_frag.h \ + btl_uct_types.h \ + btl_uct_device_context.h \ + btl_uct_discover.h \ + btl_uct_modex.h \ + btl_uct_include_list.h + +sources = \ btl_uct_module.c \ btl_uct_component.c \ - btl_uct_rdma.h \ btl_uct_rdma.c \ - btl_uct_endpoint.h \ btl_uct_endpoint.c \ btl_uct_amo.c \ - btl_uct_am.h \ btl_uct_am.c \ - btl_uct_frag.h \ btl_uct_frag.c \ btl_uct_tl.c \ - btl_uct_types.h \ - btl_uct_device_context.h + btl_uct_discover.c \ + btl_uct_modex.c \ + btl_uct_include_list.c \ + btl_uct_device_context.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la @@ -50,20 +60,22 @@ lib = lib_sources = component = mca_btl_uct.la component_sources = $(sources) +component_headers = $(headers) else lib = libmca_btl_uct.la lib_sources = $(sources) +lib_headers = ${headers} component = component_sources = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component) -mca_btl_uct_la_SOURCES = $(component_sources) +mca_btl_uct_la_SOURCES = $(component_sources) $(component_headers) mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la $(btl_uct_LIBS) noinst_LTLIBRARIES = $(lib) -libmca_btl_uct_la_SOURCES = $(lib_sources) +libmca_btl_uct_la_SOURCES = $(lib_sources) $(lib_headers) 
libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 65bc69fddb2..20b40783d46 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -64,6 +64,9 @@ struct mca_btl_uct_module_t { /** base BTL interface */ mca_btl_base_module_t super; + /** module index in the component module array */ + int module_index; + /** whether the module has been fully initialized or not */ bool initialized; @@ -76,31 +79,15 @@ struct mca_btl_uct_module_t { /** mutex to protect the module */ opal_recursive_mutex_t lock; - /** async context */ - ucs_async_context_t *ucs_async; - /** transport for active messaging */ mca_btl_uct_tl_t *am_tl; /** transport for RDMA/AMOs */ mca_btl_uct_tl_t *rdma_tl; - /** transport for forming connections (if needed) */ - mca_btl_uct_tl_t *conn_tl; - - /** array containing the am_tl and rdma_tl */ - mca_btl_uct_tl_t *comm_tls[2]; - -#if UCT_API >= UCT_VERSION(1, 7) - uct_component_h uct_component; -#endif - /** registration cache */ mca_rcache_base_module_t *rcache; - /** name of the memory domain backing this module */ - char *md_name; - /** am and rdma share endpoints */ bool shared_endpoints; @@ -119,8 +106,9 @@ struct mca_btl_uct_module_t { /** frags that were waiting on connections that are now ready to send */ opal_list_t pending_frags; - /** pending connection requests */ - opal_fifo_t pending_connection_reqs; + /** allowed transports */ + char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; }; typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; @@ -133,6 +121,9 @@ struct mca_btl_uct_component_t { /** base BTL component */ mca_btl_base_component_3_0_0_t super; + /** whether the component is initialized. controls cleanup. 
*/ + bool initialized; + /** number of TL modules */ int module_count; @@ -141,10 +132,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + /** transports to consider for forming connections */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; + /** number of worker contexts to create */ int num_contexts_per_module; @@ -158,6 +154,17 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h *uct_components; + unsigned num_uct_components; +#endif + + /** list of memory domains (btl_uct_md_t) */ + opal_list_t md_list; + + /** connection transport (if needed). reference is owned by conn_md */ + mca_btl_uct_tl_t *conn_tl; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -293,11 +300,16 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module, opal_proc_t *proc); -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); +mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, + size_t registration_size); + +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl); +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl); + /** * @brief Checks if a tl is suitable for using for RDMA * @@ -305,7 +317,7 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, */ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) { - 
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); } @@ -315,7 +327,7 @@ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); } @@ -326,7 +338,7 @@ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); } @@ -338,7 +350,11 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) { - return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); + if (NULL == tl) { + return false; + } + + return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } END_C_DECLS diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 1aae456842c..68e82329137 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +27,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl, mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; mca_btl_uct_base_frag_t *frag = NULL; - if (size <= (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + if (size <= (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { frag = mca_btl_uct_frag_alloc_short(uct_btl, endpoint); } else if (size <= uct_btl->super.btl_eager_limit) { frag = mca_btl_uct_frag_alloc_eager(uct_btl, endpoint); @@ -55,7 +56,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade { uint32_t iov_count = 1; struct iovec iov; - size_t length; if (header_size > 0) { assert(NULL != header); @@ -106,7 +106,7 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t frag->uct_iov.length = total_size; frag->base.order = order; frag->base.des_flags = flags; - if (total_size > (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + if (total_size > (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { frag->segments[0].seg_len = reserve; frag->segments[1].seg_len = *size; frag->segments[1].seg_addr.pval = data_ptr; @@ -182,7 +182,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t mca_btl_uct_context_lock(context); /* attempt to post the fragment */ if (NULL != frag->base.super.registration - && (context->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { + && (uct_btl->am_tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { frag->comp.dev_context = context; ucs_status = uct_ep_am_zcopy(ep_handle, MCA_BTL_UCT_FRAG, &frag->header, sizeof(frag->header), &frag->uct_iov, 1, 0, @@ -197,7 +197,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t /* short message */ if (1 == frag->base.des_segment_count && (frag->uct_iov.length + 8) - < MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 
0).cap.am.max_short) { + < uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, frag->header.value, frag->uct_iov.buffer, frag->uct_iov.length); @@ -233,7 +233,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t } OPAL_THREAD_LOCK(&uct_btl->lock); - mca_btl_uct_append_pending_frag(uct_btl, frag, context, true); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/true); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; @@ -260,14 +260,14 @@ int mca_btl_uct_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi OPAL_THREAD_LOCK(&uct_btl->lock); /* check one more time in case another thread is completing the connection now */ if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am(uct_btl, endpoint, context, &ep_handle)) { - mca_btl_uct_append_pending_frag(uct_btl, frag, context, false); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/false); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; } OPAL_THREAD_UNLOCK(&uct_btl->lock); } - return mca_btl_uct_send_frag(uct_btl, frag, true); + return mca_btl_uct_send_frag(uct_btl, frag, /*append=*/true); } struct mca_btl_uct_sendi_pack_args_t { @@ -291,9 +291,9 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg) return args->header_size + args->payload_size + 8; } -static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl, int context_id) +static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl) { - return MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context_id).cap.am.max_bcopy; + return uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy; } int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, @@ -313,7 +313,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo rc = mca_btl_uct_endpoint_check_am(uct_btl, endpoint, context, &ep_handle); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc - || msg_size > 
mca_btl_uct_max_sendi(uct_btl, context->context_id))) { + || msg_size > mca_btl_uct_max_sendi(uct_btl))) { if (descriptor) { *descriptor = mca_btl_uct_alloc(btl, endpoint, order, total_size, flags); } @@ -327,7 +327,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo if (0 == payload_size) { ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header, header_size); - } else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id) + } else if (msg_size < (size_t) uct_btl->am_tl->uct_iface_attr .cap.am.max_short) { int8_t *data = alloca(total_size); size_t packed_payload_size = payload_size; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 5eec97ec487..3392387a707 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -29,6 +29,8 @@ #include "opal_config.h" +#include "btl_uct_discover.h" +#include "btl_uct_modex.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/btl.h" #include "opal/mca/hwloc/base/base.h" @@ -44,21 +46,47 @@ #include "btl_uct_am.h" #include "btl_uct_device_context.h" -static int mca_btl_uct_component_register(void) +static void mca_btl_uct_cleanup(void) { - mca_btl_uct_module_t *module = &mca_btl_uct_module_template; + if (mca_btl_uct_component.initialized) { + return; + } + + BTL_VERBOSE(("in UCT btl cleanup")); + + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (NULL != mca_btl_uct_component.modules[i]) { + (void) mca_btl_uct_finalize(&mca_btl_uct_component.modules[i]->super); + } + } - mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4"; + OBJ_DESTRUCT(&mca_btl_uct_component.memory_domain_list); + OBJ_DESTRUCT(&mca_btl_uct_component.connection_domain_list); + + OPAL_LIST_DESTRUCT(&mca_btl_uct_component.md_list); + +#if UCT_API >= UCT_VERSION(1, 7) + if (NULL != mca_btl_uct_component.uct_components) { + 
uct_release_component_list(mca_btl_uct_component.uct_components); + mca_btl_uct_component.uct_components = NULL; + mca_btl_uct_component.num_uct_components = 0; + } +#endif +} + +static int mca_btl_uct_component_register(void) +{ + mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4,irdma0"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "memory_domains", "Comma-delimited list of memory domains of the form " "to use for communication. Memory domains MUST provide transports that " "support put, get, and amos. Special values: all (all available), none." - " (default: mlx5_0,mlx4_0,rocep0s4)", + " (default: mlx5_0,mlx4_0,rocep0s4,irdma0)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.memory_domains); - mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any"; + mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,rc_verbs,ud,ud_verbs,ugni_rdma,ugni_smsg,any"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "transports", "Comma-delimited list of transports to use sorted by increasing " @@ -67,6 +95,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. 
Special " + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -113,10 +150,23 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout); - /* for now we want this component to lose to btl/ugni and btl/vader */ - module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1; + OBJ_CONSTRUCT(&mca_btl_uct_component.md_list, opal_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.memory_domain_list, mca_btl_uct_include_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.connection_domain_list, mca_btl_uct_include_list_t); + + int rc = mca_btl_uct_component_discover_mds(); + if (OPAL_SUCCESS != rc) { + return rc; + } + + rc = mca_btl_uct_component_generate_modules(&mca_btl_uct_component.md_list); + if (OPAL_SUCCESS != rc) { + return rc; + } + + opal_finalize_register_cleanup(mca_btl_uct_cleanup); - return mca_btl_base_param_register(&mca_btl_uct_component.super.btl_version, &module->super); + return OPAL_SUCCESS; } static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) @@ -167,155 +217,17 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + mca_btl_uct_component.conn_tl = NULL; + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } - return OPAL_SUCCESS; -} - -static size_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) -{ - const size_t size = strlen(tl->uct_tl_name) + 1; - - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - /* pad out 
to a multiple of 4 bytes */ - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len - + MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len) - & ~3; - } - - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len) & ~3; -} + /* complete delayed cleanup */ + mca_btl_uct_component.initialized = false; + mca_btl_uct_cleanup(); -static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) -{ - size_t modex_size = 4 + strlen(module->md_name) + 1; - - if (module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->rdma_tl); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->am_tl); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->conn_tl); - } - - return modex_size; -} - -static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_data) -{ - mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0]; - size_t modex_size = mca_btl_uct_tl_modex_size(tl); - - *((uint32_t *) modex_data) = (uint32_t) modex_size; - modex_data += 4; - - strcpy((char *) modex_data, tl->uct_tl_name); - modex_data += strlen(tl->uct_tl_name) + 1; - - /* NTH: only the first context is available. i assume the device addresses of the - * contexts will be the same but they will have different iface addresses. i also - * am assuming that it doesn't really matter if all remote contexts connect to - * the same endpoint since we are only doing RDMA. if any of these assumptions are - * wrong then we can't delay creating the other contexts and must include their - * information in the modex. 
*/ - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len; - } - - uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len; - - return modex_size; -} - -static int mca_btl_uct_modex_send(void) -{ - size_t modex_size = sizeof(mca_btl_uct_modex_t); - mca_btl_uct_modex_t *modex; - uint8_t *modex_data; - int rc; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); - } - - modex = alloca(modex_size); - modex_data = modex->data; - - modex->module_count = mca_btl_uct_component.module_count; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } - } - - OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); - return rc; -} - -static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_btl_uct_md_t *md, - size_t registration_size) -{ - mca_btl_uct_module_t *module; - ucs_status_t ucs_status; - - module = malloc(sizeof(*module)); - if (NULL 
== module) { - return NULL; - } - - /* copy the module template */ - *module = mca_btl_uct_module_template; - - OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); - OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); - OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&module->pending_connection_reqs, opal_fifo_t); - - module->md = md; - module->md_name = strdup(md_name); - module->super.btl_registration_handle_size = registration_size; - - ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &module->ucs_async); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Could not create a UCT async context")); - mca_btl_uct_finalize(&module->super); - return NULL; - } - - return module; + return OPAL_SUCCESS; } ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsigned flags) @@ -341,170 +253,6 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign return UCS_OK; } -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) -#else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) -#endif -{ - mca_rcache_base_resources_t rcache_resources; - uct_tl_resource_desc_t *tl_desc; - mca_btl_uct_module_t *module; - uct_md_config_t *uct_config; - uct_md_attr_t md_attr; - mca_btl_uct_md_t *md; - bool found = false; - unsigned num_tls; - char *tmp; - ucs_status_t ucs_status; - - if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { - BTL_VERBOSE(("created the maximum number of allowable modules")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); - 
- for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; - } - } - - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; - } - - md = OBJ_NEW(mca_btl_uct_md_t); - -#if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(component, md_desc->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#else - ucs_status = uct_md_config_read(md_desc->md_name, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(md_desc->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#endif - uct_config_release(uct_config); - - ucs_status = uct_md_query(md->uct_md, &md_attr); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - - module = mca_btl_uct_alloc_module(md_desc->md_name, md, md_attr.rkey_packed_size); - if (NULL == module) { - uct_release_tl_resource_list(tl_desc); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - (void) 
mca_btl_uct_query_tls(module, md, tl_desc, num_tls); - - uct_release_tl_resource_list(tl_desc); - - /* release the initial reference to the md object. if any modules were created the UCT md will - * remain open until those modules are finalized. */ - OBJ_RELEASE(md); - - if (NULL == module->am_tl && NULL == module->rdma_tl) { - BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); - mca_btl_uct_finalize(&module->super); - return OPAL_ERR_NOT_AVAILABLE; - } - -#if UCT_API >= UCT_VERSION(1, 7) - module->uct_component = component; -#endif - - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. */ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) -{ - uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME - | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; - ucs_status_t ucs_status; - int rc; - - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - 
BTL_VERBOSE(("processing uct component %s", attr.name)); - - attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); - attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } - } - - free(attr.md_resources); - - return OPAL_SUCCESS; -} -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -520,8 +268,6 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ struct mca_btl_base_module_t **base_modules; - ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -533,54 +279,26 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { + rc = mca_btl_uct_enable_modules(mca_btl_uct_component.modules, mca_btl_uct_component.module_count); + if (OPAL_SUCCESS != rc) { return NULL; } - mca_btl_uct_component.module_count = 0; - -#if UCT_API >= UCT_VERSION(1, 7) - uct_component_h *components; - unsigned num_components; - - ucs_status = uct_query_components(&components, &num_components); - if (UCS_OK != ucs_status) { - BTL_ERROR(("could not query UCT components")); + rc = mca_btl_uct_component_maybe_setup_conn_tl(); + if (OPAL_SUCCESS != rc && OPAL_ERR_NOT_FOUND != rc) { return NULL; } - /* generate all suitable btl modules */ - for (unsigned i = 0; i < num_components; ++i) { - rc = 
mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_filter_mds(); + if (OPAL_SUCCESS != rc) { + return NULL; } - uct_release_component_list(components); - -#else /* UCT 1.6 and older */ - uct_md_resource_desc_t *resources; - unsigned resource_count; - - uct_query_md_resources(&resources, &resource_count); - - /* generate all suitable btl modules */ - for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_modex_send(); + if (OPAL_SUCCESS != rc) { + return NULL; } - uct_release_md_resource_list(resources); - -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - - opal_argv_free(allowed_ifaces); - mca_btl_uct_modex_send(); - /* pass module array back to caller */ base_modules = calloc(mca_btl_uct_component.module_count, sizeof(*base_modules)); if (NULL == base_modules) { @@ -590,6 +308,8 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, memcpy(base_modules, mca_btl_uct_component.modules, mca_btl_uct_component.module_count * sizeof(mca_btl_uct_component.modules[0])); + mca_btl_uct_component.initialized = true; + *num_btl_modules = mca_btl_uct_component.module_count; BTL_VERBOSE(("uct btl initialization complete. 
found %d suitable memory domains", @@ -633,7 +353,7 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) opal_list_remove_item(&uct_btl->pending_frags, (opal_list_item_t *) frag); - if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, false)) { + if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, /*append=*/false)) { opal_list_prepend(&uct_btl->pending_frags, (opal_list_item_t *) frag); } else { completed++; @@ -644,6 +364,36 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +static int mca_btl_uct_component_progress_connections (mca_btl_uct_tl_t *conn_tl) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &conn_tl->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + BTL_VERBOSE(("processing connection request....")); + if (conn_req->module_index >= mca_btl_uct_component.module_count) { + BTL_ERROR(("invalid connection request received")); + abort(); + } + int rc = mca_btl_uct_process_connection_request(mca_btl_uct_component.modules[conn_req->module_index], conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&conn_tl->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -654,38 +404,28 @@ static int mca_btl_uct_component_progress(void) int starting_index = mca_btl_uct_get_context_index(); unsigned ret = 0; - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { /* unlike ucp, uct actually tells us something useful! 
its almost like it was "inspired" * by the btl progress functions.... */ - ret += mca_btl_uct_tl_progress(module->rdma_tl, starting_index); - - if (module->am_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); - } - - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - request->request_data); - OBJ_RELEASE(request); - } + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + ret += mca_btl_uct_tl_progress(tl, starting_index); } + } + + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } + if (NULL != mca_btl_uct_component.conn_tl) { + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_tl); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_device_context.c b/opal/mca/btl/uct/btl_uct_device_context.c new file mode 100644 index 00000000000..0e6c284ecd9 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_device_context.c @@ -0,0 +1,143 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include +#include +#include + +#include "btl_uct.h" +#include "btl_uct_device_context.h" +#include "btl_uct_types.h" + +#include "opal/class/opal_free_list.h" +#include "opal/class/opal_object.h" + +#if HAVE_DECL_UCT_CB_FLAG_SYNC +# define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC +#else +# define MCA_BTL_UCT_CB_FLAG_SYNC 0 +#endif + +static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) +{ + if (!context->progress_enabled) { +#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE + uct_iface_progress_enable(context->uct_iface, + UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#else + uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#endif + context->progress_enabled = true; + } +} + +mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, + mca_btl_uct_tl_t *tl, int context_id, + bool enable_progress) +{ +#if UCT_API >= UCT_VERSION(1, 6) + uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE + | UCT_IFACE_PARAM_FIELD_DEVICE, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#else + uct_iface_params_t iface_params = {.rndv_cb = NULL, + .eager_cb = NULL, + .stats_root = NULL, + .rx_headroom = 0, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#endif + mca_btl_uct_device_context_t *context; + ucs_status_t ucs_status; + int rc; + + context = calloc(1, sizeof(*context)); + if (OPAL_UNLIKELY(NULL == context)) { + return NULL; + } + + context->context_id = context_id; + context->uct_btl = module; + OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); + OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&context->rdma_completions, 
opal_free_list_t); + + rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, + opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_context_destroy(context); + return NULL; + } + + /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to + * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their + * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the + * various UCT calls. */ + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not create a UCT worker")); + mca_btl_uct_context_destroy(context); + return NULL; + } + + ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, + tl->uct_tl_config, &context->uct_iface); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); + mca_btl_uct_context_destroy(context); + return NULL; + } + + if (module != NULL && tl == module->am_tl) { + BTL_VERBOSE(("installing AM handler for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); + uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, + context, MCA_BTL_UCT_CB_FLAG_SYNC); + } + + if (enable_progress) { + BTL_VERBOSE(("enabling progress for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); + mca_btl_uct_context_enable_progress(context); + } + + return context; +} + +void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) +{ + if (context->uct_iface) { + uct_iface_close(context->uct_iface); + context->uct_iface = NULL; + } + + if (context->uct_worker) { + uct_worker_destroy(context->uct_worker); + context->uct_worker = NULL; + } + + OBJ_DESTRUCT(&context->completion_fifo); + OBJ_DESTRUCT(&context->rdma_completions); + free(context); +} + diff --git a/opal/mca/btl/uct/btl_uct_device_context.h b/opal/mca/btl/uct/btl_uct_device_context.h index 7e25e0bef19..d264cc40610 100644 --- a/opal/mca/btl/uct/btl_uct_device_context.h +++ b/opal/mca/btl/uct/btl_uct_device_context.h @@ -94,14 +94,14 @@ mca_btl_uct_module_get_tl_context_specific(mca_btl_uct_module_t *module, mca_btl mca_btl_uct_device_context_t *context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { - OPAL_THREAD_LOCK(&module->lock); + OPAL_THREAD_LOCK(&tl->tl_lock); context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { context = tl->uct_dev_contexts[context_id] = mca_btl_uct_context_create(module, tl, context_id, true); } - OPAL_THREAD_UNLOCK(&module->lock); + OPAL_THREAD_UNLOCK(&tl->tl_lock); } return context; diff --git a/opal/mca/btl/uct/btl_uct_discover.c b/opal/mca/btl/uct/btl_uct_discover.c new file mode 100644 index 00000000000..7bb13db9837 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_discover.c @@ 
-0,0 +1,508 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. + * Copyright (c) 2019 Intel, Inc. All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "btl_uct_discover.h" +#include "btl_uct_include_list.h" + +#include "btl_uct.h" +#include "opal/class/opal_list.h" +#include "opal/util/printf.h" + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_md(uct_component_h component, + uct_md_resource_desc_t *md_desc) +#else +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) +#endif +{ + uct_tl_resource_desc_t *tl_desc; + uct_md_config_t *uct_config; + mca_btl_uct_md_t *md; + int list_rank; + unsigned num_tls; + ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); + + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; + } + + md = OBJ_NEW(mca_btl_uct_md_t); + md->md_name = strdup(md_desc->md_name); +#if UCT_API >= UCT_VERSION(1, 7) + md->uct_component = component; +#endif + md->connection_only_domain = consider_for_connection_module; + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return 
OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(component, md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#else + ucs_status = uct_md_config_read(md->md_name, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#endif + uct_config_release(uct_config); + + ucs_status = uct_md_query(md->uct_md, &md->md_attr); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + (void) mca_btl_uct_populate_tls(md, tl_desc, num_tls); + + uct_release_tl_resource_list(tl_desc); + opal_list_append(&mca_btl_uct_component.md_list, &md->super); + + return OPAL_SUCCESS; +} + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) +{ + uct_component_attr_t attr = { + .field_mask = UCT_COMPONENT_ATTR_FIELD_NAME + | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT, + }; + ucs_status_t ucs_status; + int rc; + + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + BTL_VERBOSE(("processing uct component %s", attr.name)); + + attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); + attr.field_mask |= 
UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + for (unsigned i = 0; i < attr.md_resource_count; ++i) { + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + free(attr.md_resources); + + return OPAL_SUCCESS; +} +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + +int mca_btl_uct_component_discover_mds(void) +{ + mca_btl_uct_include_list_parse(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_parse(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status_t ucs_status = uct_query_components(&mca_btl_uct_component.uct_components, + &mca_btl_uct_component.num_uct_components); + if (UCS_OK != ucs_status) { + BTL_ERROR(("could not query UCT components")); + return OPAL_ERROR; + } + + /* generate list of memory domains */ + for (unsigned i = 0; i < mca_btl_uct_component.num_uct_components; ++i) { + int rc = mca_btl_uct_component_process_uct_component(mca_btl_uct_component.uct_components[i]); + if (OPAL_SUCCESS != rc) { + break; + } + } +#else /* UCT 1.6 and older */ + uct_md_resource_desc_t *resources; + unsigned resource_count; + + uct_query_md_resources(&resources, &resource_count); + + /* generate all suitable btl modules */ + for (unsigned i = 0; i < resource_count; ++i) { + int rc = mca_btl_uct_component_process_uct_md(resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + uct_release_md_resource_list(resources); + +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_module_register_mca_var(mca_btl_uct_module_t *module) +{ + mca_base_component_t dummy_component; + /* mca_btl_uct_component starts with an mca_base_component_t structure */ + memcpy(&dummy_component, &mca_btl_uct_component, 
sizeof(dummy_component)); + snprintf(dummy_component.mca_component_name, sizeof(dummy_component.mca_component_name), + "uct_%s", module->md->md_name); + + BTL_VERBOSE(("registering MCA parameters for module uct_%s", module->md->md_name)); + + module->allowed_transports = mca_btl_uct_component.allowed_transports; + (void) mca_base_component_var_register( + &dummy_component, "transports", + "Comma-delimited list of transports to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. Special" + "values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &module->allowed_transports); + + return mca_btl_base_param_register(&dummy_component, &module->super); +} + +static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) +{ + mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; + mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; + + return tl_a->priority - tl_b->priority; +} + +static int mca_btl_uct_generate_module(mca_btl_uct_md_t *md) +{ + mca_btl_uct_tl_t *tl; + mca_btl_uct_module_t *module = mca_btl_uct_alloc_module(md, md->md_attr.rkey_packed_size); + + BTL_VERBOSE(("attempting to create a BTL module for memory domain: %s", md->md_name)); + + int rc = mca_btl_uct_module_register_mca_var(module); + if (OPAL_SUCCESS != rc) { + mca_btl_uct_finalize(&module->super); + return rc; + } + + mca_btl_uct_include_list_parse(module->allowed_transports, + &module->allowed_transport_list); + mca_btl_uct_tl_t *next; + OPAL_LIST_FOREACH_SAFE (tl, next, &md->tls, mca_btl_uct_tl_t) { + int rank = mca_btl_uct_include_list_rank(tl->uct_tl_name, &module->allowed_transport_list); + if (rank < 0) { + opal_list_remove_item(&md->tls, &tl->super); + OBJ_RELEASE(tl); + continue; + } + tl->priority = rank; + } + + opal_list_sort(&md->tls, tl_compare); + + /* Treat the flags specified by the user as a mask. 
*/ + uint32_t btl_flags = module->super.btl_flags; + uint32_t btl_atomic_flags = module->super.btl_atomic_flags; + + module->super.btl_flags = 0; + module->super.btl_atomic_flags = 0; + + OPAL_LIST_FOREACH (tl, &md->tls, mca_btl_uct_tl_t) { + mca_btl_uct_evaluate_tl(module, tl); + if (NULL != module->am_tl && NULL != module->rdma_tl) { + /* all done */ + break; + } + } + + module->super.btl_flags &= btl_flags; + module->super.btl_atomic_flags &= btl_atomic_flags; + + if (NULL == module->rdma_tl) { + /* no rdma tls */ + BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); + + module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; + module->super.btl_put = NULL; + module->super.btl_get = NULL; + module->super.btl_atomic_fop = NULL; + module->super.btl_atomic_op = NULL; + } + + if (NULL == module->am_tl) { + /* no active message tls == no send/recv */ + BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); + + module->super.btl_send = NULL; + module->super.btl_sendi = NULL; + module->super.btl_alloc = NULL; + module->super.btl_free = NULL; + } + + if (NULL == module->am_tl && NULL == module->rdma_tl) { + mca_btl_uct_finalize(&module->super); + return OPAL_ERR_NOT_AVAILABLE; + } + + module->module_index = mca_btl_uct_component.module_count; + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + return OPAL_SUCCESS; +} + +static void mca_btl_uct_enable_tl(mca_btl_uct_tl_t *tl) { + if (NULL == tl) { + return; + } + + if (tl->max_device_contexts < 1) { + tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; + } +} + +static int mca_btl_uct_enable_module(mca_btl_uct_module_t *module) +{ + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. 
*/ + char *tmp = NULL; + (void) opal_asprintf(&tmp, "uct.%s", module->md->md_name); + + mca_rcache_base_resources_t rcache_resources = { + .cache_name = tmp, + .reg_data = (void *) module, + .sizeof_reg = sizeof(mca_btl_uct_reg_t) + module->super.btl_registration_handle_size, + .register_mem = mca_btl_uct_reg_mem, + .deregister_mem = mca_btl_uct_dereg_mem, + }; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + return OPAL_ERROR; + } + + mca_btl_uct_enable_tl(module->rdma_tl); + mca_btl_uct_enable_tl(module->am_tl); + + return OPAL_SUCCESS; +} + +int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count) +{ + for (int i = 0 ; i < module_count ; ++i) { + int rc = mca_btl_uct_enable_module(modules[i]); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("could not enable module for memory domain %s", modules[i]->md->md_name)); + mca_btl_uct_finalize(&modules[i]->super); + } + } + + return OPAL_SUCCESS; +} + +int mca_btl_uct_component_generate_modules(opal_list_t *md_list) +{ + mca_btl_uct_component.module_count = 0; + + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, md_list, mca_btl_uct_md_t) { + if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { + BTL_VERBOSE(("created the maximum number of allowable modules")); + break; + } + + if (md->connection_only_domain) { + /* will not build a module for this domain */ + continue; + } + + int rc = mca_btl_uct_generate_module(md); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("could not create a module for memory domain %s", md->md_name)); + } + } + + return OPAL_SUCCESS; +} + +int mca_btl_uct_component_maybe_setup_conn_tl(void) +{ + bool connection_tl_required = false; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + connection_tl_required |= + 
mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->am_tl); + connection_tl_required |= + mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->rdma_tl); + if (connection_tl_required) { + break; + } + } + + if (!connection_tl_required) { + return OPAL_SUCCESS; + } + + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + mca_btl_uct_tl_t *tl, *next; + OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) { + if (mca_btl_uct_tl_supports_conn(tl)) { + break; + } + tl = NULL; + } + + if (NULL == mca_btl_uct_component.conn_tl) { + mca_btl_uct_component.conn_tl = tl; + } + + if (tl != NULL && (md->connection_only_domain || NULL == mca_btl_uct_component.conn_tl)) { + mca_btl_uct_component.conn_tl = tl; + if (md->connection_only_domain) { + /* not going to do better */ + break; + } + } + } + + if (NULL == mca_btl_uct_component.conn_tl) { + /* no connection tl found, will need to disable all connect-to-endpoint modules */ + BTL_VERBOSE(("could not find a suitable transport to support forming connections")); + return OPAL_ERR_NOT_FOUND; + } + + BTL_VERBOSE(("using transport %s::%s for connection management", + mca_btl_uct_component.conn_tl->uct_md->md_name, + mca_btl_uct_component.conn_tl->uct_tl_name)); + + return mca_btl_uct_enable_tl_conn(mca_btl_uct_component.conn_tl); +} + +int mca_btl_uct_component_filter_mds(void) +{ + int usable_module_count = mca_btl_uct_component.module_count; + /* clean out all unused mds, tls, and unusable modules */ + if (NULL == mca_btl_uct_component.conn_tl) { + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (!(mca_btl_uct_tl_requires_connection_tl(module->am_tl) || + mca_btl_uct_tl_requires_connection_tl(module->rdma_tl))) { + continue; + } + + /* module is unusable */ + mca_btl_uct_finalize(&module->super); + mca_btl_uct_component.modules[i] = NULL; + 
--usable_module_count; + } + } + + mca_btl_uct_md_t *md, *md_next; + OPAL_LIST_FOREACH_SAFE(md, md_next, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + mca_btl_uct_module_t *module = NULL; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + module = mca_btl_uct_component.modules[i]; + if (NULL != module && module->md == md) { + break; + } + module = NULL; + } + + mca_btl_uct_tl_t *tl, *next; + OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) { + if (tl == mca_btl_uct_component.conn_tl || (NULL != module && + (tl == module->rdma_tl || + tl == module->am_tl))) { + /* tl is in use */ + continue; + } + opal_list_remove_item(&md->tls, &tl->super); + OBJ_RELEASE(tl); + } + + if (opal_list_get_size(&md->tls) == 0) { + opal_list_remove_item(&mca_btl_uct_component.md_list, &md->super); + OBJ_RELEASE(md); + } + } + + /* remove holes in the module array */ + if (usable_module_count < mca_btl_uct_component.module_count) { + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i] == NULL) { + for (int j = i ; j < mca_btl_uct_component.module_count ; ++j) { + mca_btl_uct_component.modules[i++] = mca_btl_uct_component.modules[j]; + } + } + } + mca_btl_uct_component.module_count = usable_module_count; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/uct/btl_uct_discover.h b/opal/mca/btl/uct/btl_uct_discover.h new file mode 100644 index 00000000000..08b03899fc4 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_discover.h @@ -0,0 +1,43 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UCT_DISCOVER_H) +#define MCA_BTL_UCT_DISCOVER_H + +#include "btl_uct.h" +#include "opal/class/opal_list.h" + +/** + * @brief Query UCT for the available memory domains. 
This list will be limited by the component's memory_domains and connection_domains include lists. + */ +int mca_btl_uct_component_discover_mds(void); + +/** + * @brief Create BTL modules from the memory domain list. + * + * The modules are registered with MCA and must be shut down using + * mca_btl_module_finalize. + */ +int mca_btl_uct_component_generate_modules(opal_list_t *md_list); + +int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count); + +/** + * @brief Scan detected transports and find a connection transport (if needed). + */ +int mca_btl_uct_component_maybe_setup_conn_tl(void); + +/** + * @brief Clean out unused memory domains and transport layers. + */ +int mca_btl_uct_component_filter_mds(void); + + +#endif /* !defined(MCA_BTL_UCT_DISCOVER_H) */ diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 695fd754aa2..fe4217035e6 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -16,6 +16,7 @@ #include "btl_uct.h" #include "btl_uct_am.h" #include "btl_uct_device_context.h" +#include "btl_uct_modex.h" #include "opal/mca/timer/base/base.h" #include "opal/util/proc.h" @@ -24,7 +25,7 @@ static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint) memset(endpoint->uct_eps, 0, sizeof(endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module); endpoint->conn_ep = NULL; - OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t); } static void mca_btl_uct_endpoint_destruct(mca_btl_uct_endpoint_t *endpoint) @@ -63,53 +64,6 @@ mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create(opal_proc_t *proc) return (mca_btl_base_endpoint_t *) endpoint; } -static unsigned char *mca_btl_uct_process_modex_tl(unsigned char *modex_data) -{ - BTL_VERBOSE( - ("processing modex for tl %s. 
size: %u", modex_data + 4, *((uint32_t *) modex_data))); - - /* skip size and name */ - return modex_data + 4 + strlen((char *) modex_data + 4) + 1; -} - -static void mca_btl_uct_process_modex(mca_btl_uct_module_t *uct_btl, unsigned char *modex_data, - unsigned char **rdma_tl_data, unsigned char **am_tl_data, - unsigned char **conn_tl_data) -{ - BTL_VERBOSE(("processing remote modex data")); - - if (uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains RDMA data")); - if (rdma_tl_data) { - *rdma_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (rdma_tl_data) { - *rdma_tl_data = NULL; - } - - if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains active message data")); - if (am_tl_data) { - *am_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (am_tl_data) { - *am_tl_data = NULL; - } - - if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl - && uct_btl->conn_tl != uct_btl->am_tl) { - BTL_VERBOSE(("modex contains connection data")); - if (conn_tl_data) { - *conn_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (conn_tl_data) { - *conn_tl_data = NULL; - } -} - static inline ucs_status_t mca_btl_uct_ep_create_connected_compat(uct_iface_h iface, uct_device_addr_t *device_addr, uct_iface_addr_t *iface_addr, @@ -150,7 +104,7 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca /* easy case. just connect to the interface */ iface_addr = (uct_iface_addr_t *) tl_data; device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id) + + tl->uct_iface_attr .iface_addr_len); BTL_VERBOSE(("connecting endpoint to interface")); @@ -164,22 +118,6 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca return (UCS_OK == ucs_status) ? 
OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) -{ - ep->uct_ep = NULL; -} - -static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) -{ - if (ep->uct_ep) { - uct_ep_destroy(ep->uct_ep); - ep->uct_ep = NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, - mca_btl_uct_connection_ep_destruct); - struct mca_btl_uct_conn_completion_t { uct_completion_t super; volatile bool complete; @@ -203,24 +141,61 @@ static void mca_btl_uct_endpoint_flush_complete(uct_completion_t *self, ucs_stat } #endif +static void mca_btl_uct_flush_conn_endpoint(mca_btl_uct_connection_ep_t *conn_ep) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_ep->tl->uct_dev_contexts[0]; + mca_btl_uct_conn_completion_t completion + = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; + ucs_status_t ucs_status; + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, &completion.super); + }); + if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { + /* NTH: I don't know if this path is needed. For some networks we must use a completion. 
*/ + do { + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, NULL); + }); + mca_btl_uct_context_progress(conn_tl_context); + } while (UCS_INPROGRESS == ucs_status); + } else { + do { + mca_btl_uct_context_progress(conn_tl_context); + } while (!completion.complete); + } +} + +static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) +{ + ep->uct_ep = NULL; + ep->tl = NULL; +} + +static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) +{ + if (ep->uct_ep) { + mca_btl_uct_flush_conn_endpoint(ep); + uct_ep_destroy(ep->uct_ep); + ep->uct_ep = NULL; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, + mca_btl_uct_connection_ep_destruct); + static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_device_context_t *conn_tl_context, mca_btl_uct_conn_req_t *request, size_t request_length) { - mca_btl_uct_conn_completion_t completion - = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; - ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = mca_btl_uct_component.conn_tl->uct_dev_contexts[0]; BTL_VERBOSE( ("sending connection request to peer. 
context id: %d, type: %d, length: %" PRIsize_t, request->context_id, request->type, request_length)); - /* need to drop the lock to avoid hold-and-wait */ - opal_mutex_unlock(&endpoint->ep_lock); - do { + ucs_status_t ucs_status; MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = uct_ep_am_short(endpoint->conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request, request_length); @@ -233,75 +208,70 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_ERROR; } + /* need to drop the lock to avoid hold-and-wait */ + opal_mutex_unlock(&endpoint->ep_lock); /* some TLs (UD for example) need to be progressed to get resources */ mca_btl_uct_context_progress(conn_tl_context); + opal_mutex_lock(&endpoint->ep_lock); } while (1); - /* for now we just wait for the connection request to complete before continuing */ - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, &completion.super); - if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { - /* NTH: I don't know if this path is needed. For some networks we must use a completion. 
*/ - do { - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, NULL); - mca_btl_uct_context_progress(conn_tl_context); - } while (UCS_INPROGRESS == ucs_status); - } else { - do { - mca_btl_uct_context_progress(conn_tl_context); - } while (!completion.complete); - } - - opal_mutex_lock(&endpoint->ep_lock); - return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *conn_tl_data, int request_type) +static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + uint8_t *conn_tl_data) { - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; + if (NULL != endpoint->conn_ep) { + BTL_VERBOSE(("re-using existing connection endpoint")); + OBJ_RETAIN(endpoint->conn_ep); + return OPAL_SUCCESS; + } - assert(NULL != conn_tl); + mca_btl_uct_tl_t *conn_tl = mca_btl_uct_component.conn_tl; - BTL_VERBOSE(("connecting endpoint to remote endpoint")); + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print(endpoint->ep_proc->proc_name))); - if (NULL == endpoint->conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + + conn_tl->uct_iface_attr.iface_addr_len); - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + endpoint->conn_ep = 
OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + endpoint->conn_ep->tl = conn_tl; - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, device_addr, iface_addr, &endpoint->conn_ep->uct_ep); }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(endpoint->conn_ep); + if (UCS_OK != ucs_status) { + BTL_VERBOSE( + ("could not create an endpoint for forming connection to remote peer. 
code = %d", + ucs_status)); + return OPAL_ERROR; } + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_send_connection_data( + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type, int remote_module_index) +{ + ucs_status_t ucs_status; + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + size_t request_length = sizeof(mca_btl_uct_conn_req_t) - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; + + tl->uct_iface_attr.ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); /* fill in common request parameters */ @@ -309,6 +279,7 @@ static int mca_btl_uct_endpoint_send_connection_data( request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = request_type; + request->module_index = remote_module_index; /* fill in connection request */ ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); @@ -322,7 +293,7 @@ static int mca_btl_uct_endpoint_send_connection_data( /* let the remote side know that the connection has been established and * wait for the message to be sent */ - int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, request, request_length); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(endpoint->conn_ep); @@ -337,9 +308,9 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + 
mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr, int remote_module_index) { ucs_status_t ucs_status; @@ -367,20 +338,23 @@ static int mca_btl_uct_endpoint_connect_endpoint( if (UCS_OK != ucs_status) { return OPAL_ERROR; } - - mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint, - MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED); } opal_timer_t now = opal_timer_base_get_usec(); - if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) { - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS - : OPAL_ERR_OUT_OF_RESOURCE; + if ((now - tl_endpoint->last_connection_req) > mca_btl_uct_component.connection_retry_timeout || ep_addr) { + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, + /*request_type=*/!!ep_addr, remote_module_index); + if (OPAL_SUCCESS != rc) { + return rc; + } + } + + if (ep_addr) { + mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint, + MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED); } - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, - conn_tl_data, /*request_type=*/!!ep_addr); - return (OPAL_SUCCESS == rc) ? 
OPAL_ERR_OUT_OF_RESOURCE : rc; + return OPAL_ERR_OUT_OF_RESOURCE; } int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, @@ -392,9 +366,8 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp : uct_btl->am_tl; mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id); - uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; + uint8_t *conn_tl_data, *tl_data = NULL; mca_btl_uct_modex_t *modex; - uint8_t *modex_data; size_t msg_size; int rc; @@ -410,19 +383,20 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); opal_mutex_lock(&endpoint->ep_lock); - if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { - opal_mutex_unlock(&endpoint->ep_lock); - /* nothing more to do. someone else completed the connection */ - return OPAL_SUCCESS; - } - - /* dumpicate connection request. nothing to do until the endpoint data is received */ - if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } do { + if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { + /* nothing more to do. someone else completed the connection */ + rc = OPAL_SUCCESS; + break; + } + + /* dumpicate connection request. nothing to do until the endpoint data is received */ + if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + break; + } + /* read the modex. this is done both to start the connection and to process endpoint data */ OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, &endpoint->ep_proc->proc_name, (void **) &modex, &msg_size); @@ -434,45 +408,39 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp BTL_VERBOSE(("received modex of size %lu for proc %s. 
module count %d", (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE( - ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - - modex_data += strlen((char *) modex_data) + 1; - mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data, - &conn_tl_data); + int remote_module_index; + tl_data = mca_btl_uct_find_modex(modex, tl, &remote_module_index); + if (OPAL_UNLIKELY(NULL == tl_data)) { + BTL_ERROR(("could not find modex data for this transport")); + rc = OPAL_ERR_UNREACH; break; } - tl_data = (tl == uct_btl->rdma_tl) ? 
rdma_tl_data : am_tl_data; + /* connect the endpoint */ + if (mca_btl_uct_tl_requires_connection_tl(tl)) { + conn_tl_data = mca_btl_uct_find_modex(modex, mca_btl_uct_component.conn_tl, + /*remote_module_index=*/NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("could not find modex for connection module")); + break; + } - if (NULL == tl_data) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_UNREACH; - } - /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl(tl)) { - rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); + if (NULL == tl_endpoint->uct_ep) { + /* allocate or retain a connection endpoint */ + rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint, + conn_tl_data); + if (OPAL_SUCCESS != rc) { + break; + } + } + + rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, tl_endpoint, + tl_data, ep_addr, remote_module_index); } else { - rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, - tl_endpoint, tl_data, conn_tl_data, ep_addr); + rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); } - } while (0); opal_mutex_unlock(&endpoint->ep_lock); diff --git a/opal/mca/btl/uct/btl_uct_include_list.c b/opal/mca/btl/uct/btl_uct_include_list.c new file mode 100644 index 00000000000..5e989581612 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_include_list.c @@ -0,0 +1,78 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024-2025 Google, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include "opal_config.h" + +#include "btl_uct_include_list.h" +#include "btl_uct_types.h" +#include "opal/class/opal_object.h" +#include "opal/mca/btl/base/btl_base_error.h" +#include "opal/util/argv.h" + +void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list) { + list->list = NULL; + list->include = true; + + if (value == NULL) { + return; + } + + if (value[0] == '^') { + list->include = false; + value++; + } + + list->list = opal_argv_split(value, ','); +} + +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { + if (list->list == NULL) { + return -1; + } + + for (int i = 0; list->list[i]; ++i) { + regex_t preg; + + BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i])); + int rc = regcomp(&preg, list->list[i], REG_ICASE); + if (0 != rc) { + char errbuf[256]; + regerror(rc, &preg, errbuf, sizeof(errbuf)); + BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf)); + continue; + } + + int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0); + regfree(&preg); + if (0 == result) { + return list->include ? i + 1 : -(i + 1); + } + } + + return list->include ? 
-1 : 1; +} + +static void mca_btl_uct_include_list_construct (mca_btl_uct_include_list_t *list) +{ + list->list = NULL; +} + +static void mca_btl_uct_include_list_destruct (mca_btl_uct_include_list_t *list) +{ + opal_argv_free (list->list); + list->list = NULL; +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_include_list_t, opal_object_t, mca_btl_uct_include_list_construct, + mca_btl_uct_include_list_destruct); + + diff --git a/opal/mca/btl/uct/btl_uct_include_list.h b/opal/mca/btl/uct/btl_uct_include_list.h new file mode 100644 index 00000000000..69fba979d8d --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_include_list.h @@ -0,0 +1,34 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024-2025 Google, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_types.h" + +#if !defined(BTL_UCT_INCLUDE_LIST_H) +#define BTL_UCT_INCLUDE_LIST_H + +/** + * @brief Parse `value` to create an include list. + * + * @param[in] value Comma-delimeted string to parse. + * @param[in,out] list Include list object, must already be constructed. + */ +void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list); + +/** + * @brief Find the rank of `name` in the include list `list`. + * + * @param[in] name name to find + * @param[in] list list to search + * + * A negative result means the name is not present or the list is negated. 
+ */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); + +#endif /* !defined(BTL_UCT_INCLUDE_LIST_H) */ diff --git a/opal/mca/btl/uct/btl_uct_modex.c b/opal/mca/btl/uct/btl_uct_modex.c new file mode 100644 index 00000000000..7d6aa2f5450 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_modex.c @@ -0,0 +1,198 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. + * Copyright (c) 2019 Intel, Inc. All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "btl_uct_modex.h" +#include "btl_uct_types.h" +#include "btl_uct_device_context.h" +#include "opal/class/opal_list.h" +#include "opal/mca/pmix/pmix-internal.h" + +static uint16_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) +{ + uint16_t size = sizeof(mca_btl_uct_tl_modex_t); + + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + size += (uint16_t)tl->uct_iface_attr.iface_addr_len; + } + + /* pad out to a multiple of 4 bytes */ + return (3 + size + (uint16_t)tl->uct_iface_attr.device_addr_len) & ~3; +} + +static uint16_t mca_btl_uct_md_modex_size(mca_btl_uct_md_t *md) +{ + uint16_t modex_size = sizeof(mca_btl_uct_md_modex_t); + + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + modex_size += mca_btl_uct_tl_modex_size(tl); + } + + return modex_size; +} + +static uint8_t *mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, + uint8_t *modex_data) +{ + mca_btl_uct_device_context_t *dev_context = + mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0); + + mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)modex_data; + tl_modex->size = mca_btl_uct_tl_modex_size(tl); + + memset(tl_modex->tl_name, 0, sizeof(tl_modex->tl_name)); + strncpy(tl_modex->tl_name, tl->uct_tl_name, sizeof(tl_modex->tl_name)); + + uint8_t *tl_modex_data = (uint8_t *) tl_modex->data; + + /* NTH: only the first context is available. i assume the device addresses of the + * contexts will be the same but they will have different iface addresses. i also + * am assuming that it doesn't really matter if all remote contexts connect to + * the same endpoint since we are only doing RDMA. if any of these assumptions are + * wrong then we can't delay creating the other contexts and must include their + * information in the modex. 
*/ + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) tl_modex_data); + tl_modex_data += tl->uct_iface_attr.iface_addr_len; + } + + uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) tl_modex_data); + tl_modex_data += tl->uct_iface_attr.device_addr_len; + + return modex_data + tl_modex->size; +} + +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_md_t *md, uint8_t *modex_data) +{ + mca_btl_uct_module_t *module = NULL; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i]->md == md) { + module = mca_btl_uct_component.modules[i]; + break; + } + } + + mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data; + modex_data = md_modex->data; + + md_modex->size = mca_btl_uct_md_modex_size(md); + md_modex->module_index = module ? module->module_index : (uint16_t) -1; + + memset(md_modex->md_name, 0, sizeof(md_modex->md_name)); + strncpy(md_modex->md_name, md->md_name, sizeof(md_modex->md_name)); + + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { + modex_data = mca_btl_uct_tl_modex_pack(module, tl, modex_data); + } + + return modex_data; +} + +int mca_btl_uct_component_modex_send(void) +{ + size_t modex_size = sizeof(mca_btl_uct_modex_t); + mca_btl_uct_modex_t *modex; + uint8_t *modex_data; + int rc; + + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + modex_size += mca_btl_uct_md_modex_size(md); + } + + modex = alloca(modex_size); + modex_data = modex->data; + + modex->module_count = opal_list_get_size(&mca_btl_uct_component.md_list); + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { + modex_data = mca_btl_uct_modex_pack(md, modex_data); + } + + OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); + return rc; +} + +static uint8_t 
*mca_btl_uct_find_tl_modex(mca_btl_uct_md_modex_t *md_modex, mca_btl_uct_tl_t *tl) +{ + uint8_t *modex_data = md_modex->data; + + for (uint16_t modex_offset = 0 ; modex_offset < md_modex->size ; ){ + mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)(modex_data + modex_offset); + + BTL_VERBOSE(("found modex for tl %s searching for %s", tl_modex->tl_name, tl->uct_tl_name)); + + if (0 == strcmp(tl->uct_tl_name, tl_modex->tl_name)) { + return tl_modex->data; + } + + BTL_VERBOSE(("no match, continuing")); + + modex_offset += tl_modex->size; + } + + return NULL; +} + +uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data; + + BTL_VERBOSE(("found modex for md %s (remote module index %hu), searching for %s", + md_modex->md_name, md_modex->module_index, tl->uct_md->md_name)); + + if (0 != strcmp(tl->uct_md->md_name, md_modex->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += md_modex->size; + continue; + } + + uint8_t *tl_modex = mca_btl_uct_find_tl_modex(md_modex, tl); + if (NULL == tl_modex) { + break; + } + + if (NULL != remote_module_index) { + *remote_module_index = md_modex->module_index; + } + + BTL_VERBOSE(("finished processing modex for %s", tl->uct_md->md_name)); + + return tl_modex; + } + + BTL_ERROR(("could not find modex for %s::%s", tl->uct_md->md_name, tl->uct_tl_name)); + + return NULL; +} diff --git a/opal/mca/btl/uct/btl_uct_modex.h b/opal/mca/btl/uct/btl_uct_modex.h new file mode 100644 index 00000000000..e202bc8113f --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_modex.h @@ -0,0 +1,20 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Google, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UCT_MODEX_H) +#define MCA_BTL_UCT_MODEX_H + +#include "btl_uct.h" + +int mca_btl_uct_component_modex_send(void); + +uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index); + +#endif /* !defined(MCA_BTL_UCT_MODEX_H) */ diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 9577d615b92..9914c5e8f99 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. + * Copyright (c) 2020-2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -90,7 +90,7 @@ static int mca_btl_uct_add_procs(mca_btl_base_module_t *btl, size_t nprocs, if (am_tl) { rc = opal_free_list_init(&uct_module->short_frags, sizeof(mca_btl_uct_base_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - MCA_BTL_UCT_TL_ATTR(am_tl, 0).cap.am.max_short, + am_tl->uct_iface_attr.cap.am.max_short, opal_cache_line_size, 0, 1024, 64, NULL, 0, NULL, NULL, NULL); rc = opal_free_list_init(&uct_module->eager_frags, sizeof(mca_btl_uct_base_frag_t), @@ -264,6 +264,35 @@ int mca_btl_uct_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg) return OPAL_SUCCESS; } +mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, + size_t registration_size) +{ + mca_btl_uct_module_t *module; + + module = malloc(sizeof(*module)); + if (NULL == module) { + return NULL; + } + + /* copy the module template */ + *module = mca_btl_uct_module_template; + + OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); + OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->eager_frags, 
opal_free_list_t); + OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); + OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&module->allowed_transport_list, mca_btl_uct_include_list_t); + + module->md = md; + OBJ_RETAIN(md); + module->super.btl_registration_handle_size = registration_size; + + return module; +} + /* * Cleanup/release module resources. */ @@ -284,31 +313,32 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) OBJ_DESTRUCT(&uct_module->max_frags); OBJ_DESTRUCT(&uct_module->pending_frags); OBJ_DESTRUCT(&uct_module->lock); - OBJ_DESTRUCT(&uct_module->pending_connection_reqs); + OBJ_DESTRUCT(&uct_module->allowed_transport_list); if (uct_module->rcache) { mca_rcache_base_module_destroy(uct_module->rcache); } - if (NULL != uct_module->am_tl) { - OBJ_RELEASE(uct_module->am_tl); - } - - if (NULL != uct_module->conn_tl) { - OBJ_RELEASE(uct_module->conn_tl); - } + OBJ_DESTRUCT(&uct_module->endpoint_lock); - if (NULL != uct_module->rdma_tl) { - OBJ_RELEASE(uct_module->rdma_tl); + char *tmp; + asprintf(&tmp, "uct_%s", uct_module->md->md_name); + int rc = mca_base_var_group_find("opal", "btl", tmp); + free(tmp); + if (rc >= 0) { + mca_base_var_group_deregister(rc); } - ucs_async_context_destroy(uct_module->ucs_async); - - OBJ_DESTRUCT(&uct_module->endpoint_lock); - - free(uct_module->md_name); + OBJ_RELEASE(uct_module->md); free(uct_module); + for (int i = 0 ; i < MCA_BTL_UCT_MAX_MODULES ; ++i) { + if (mca_btl_uct_component.modules[i] == uct_module) { + mca_btl_uct_component.modules[i] = NULL; + break; + } + } + return OPAL_SUCCESS; } @@ -338,9 +368,11 @@ mca_btl_uct_module_t mca_btl_uct_module_template = { /* set the default flags for this btl. 
uct provides us with rdma and both * fetching and non-fetching atomics (though limited to add and cswap) */ .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS - | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, - .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP - | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT, + | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION | MCA_BTL_FLAGS_SEND, + .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_AND + | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR + | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_SWAP + | MCA_BTL_ATOMIC_SUPPORTS_32BIT, /* set the default limits on put and get */ .btl_put_limit = 1 << 23, @@ -353,22 +385,30 @@ mca_btl_uct_module_t mca_btl_uct_module_template = { .btl_rdma_pipeline_send_length = 8192, .btl_eager_limit = 8192, .btl_max_send_size = 65536, + /* for now we want this component to lose to btl/ugni and btl/vader */ + .btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1, }}; OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); static void mca_btl_uct_md_construct(mca_btl_uct_md_t *md) { + md->uct_component = NULL; md->uct_md = NULL; + md->md_name = NULL; + OBJ_CONSTRUCT(&md->tls, opal_list_t); } static void mca_btl_uct_md_destruct(mca_btl_uct_md_t *md) { + OPAL_LIST_DESTRUCT(&md->tls); + + free(md->md_name); if (md->uct_md) { uct_md_close(md->uct_md); md->uct_md = NULL; } } -OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_object_t, mca_btl_uct_md_construct, +OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_list_item_t, mca_btl_uct_md_construct, mca_btl_uct_md_destruct); diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index d4210e4631c..e1e8f4b91d9 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -126,7 +126,7 @@ int mca_btl_uct_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin mca_btl_uct_context_lock(context); - 
if (size <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.get.max_bcopy) { + if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.get.max_bcopy) { ucs_status = uct_ep_get_bcopy(ep_handle, mca_btl_uct_get_unpack, local_address, size, remote_address, rkey.rkey, &comp->uct_comp); } else { @@ -223,7 +223,7 @@ int mca_btl_uct_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin /* determine what UCT prototol should be used */ if (size <= uct_btl->super.btl_put_local_registration_threshold) { use_short = size - <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.put.max_short; + <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short; use_bcopy = !use_short; } diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h index 0438106b2c8..481be991b4d 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -53,7 +53,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, } # if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_rkey_unpack(module->uct_component, (void *) remote_handle, rkey); + ucs_status = uct_rkey_unpack(module->md->uct_component, (void *) remote_handle, rkey); # else ucs_status = uct_rkey_unpack((void *) remote_handle, rkey); # endif @@ -63,7 +63,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, static inline void mca_btl_uct_rkey_release(mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey) { # if UCT_API >= UCT_VERSION(1, 7) - uct_rkey_release(uct_btl->uct_component, rkey); + uct_rkey_release(uct_btl->md->uct_component, rkey); # else (void) uct_btl; uct_rkey_release(rkey); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c1ef4c6d727..f55754bc9d8 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -18,12 +18,7 @@ #include "btl_uct_device_context.h" #include "opal/util/argv.h" #include "opal/util/bit_ops.h" - -#if HAVE_DECL_UCT_CB_FLAG_SYNC -# define 
MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC -#else -# define MCA_BTL_UCT_CB_FLAG_SYNC 0 -#endif +#include "opal/util/minmax.h" /** * @brief Convert UCT capabilities to BTL flags @@ -70,13 +65,14 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { }, }; -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + mca_btl_uct_tl_t *tl = module->rdma_tl; + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; /* NTH: only use the fetching atomics for now */ - uint64_t atomic_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags; - uint64_t atomic_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags; + uint64_t atomic_flags32 = tl->uct_iface_attr.cap.atomic32.fop_flags; + uint64_t atomic_flags64 = tl->uct_iface_attr.cap.atomic64.fop_flags; uint64_t all_flags = atomic_flags64 | atomic_flags32; @@ -120,9 +116,10 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { * * @returns equivalent BTL atomic flags */ -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + mca_btl_uct_tl_t *tl = module->rdma_tl; + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; module->super.btl_atomic_flags = 0; @@ -144,6 +141,7 @@ static void mca_btl_uct_tl_constructor(mca_btl_uct_tl_t *tl) { memset((void *) ((uintptr_t) tl + sizeof(tl->super)), 0, sizeof(*tl) - sizeof(tl->super)); OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t); + OBJ_CONSTRUCT(&tl->pending_connection_reqs, opal_fifo_t); } static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) @@ -156,11 +154,10 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } } - if (tl->uct_md) { - OBJ_RELEASE(tl->uct_md); + if (tl->ucs_async) { + 
ucs_async_context_destroy(tl->ucs_async); } - free(tl->uct_dev_contexts); free(tl->uct_tl_name); free(tl->uct_dev_name); @@ -169,6 +166,7 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } OBJ_DESTRUCT(&tl->tl_lock); + OBJ_DESTRUCT(&tl->pending_connection_reqs); } OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, @@ -176,14 +174,14 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructo static ucs_status_t mca_btl_uct_conn_req_cb(void *arg, void *data, size_t length, unsigned flags) { - mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg; + mca_btl_uct_tl_t *tl = (mca_btl_uct_tl_t *) arg; mca_btl_uct_pending_connection_request_t *request = calloc(1, length + sizeof(request->super)); /* it is not safe to process the connection request from the callback so just save it for * later processing */ OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t); memcpy(&request->request_data, (void *) ((intptr_t) data + 8), length); - opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + opal_fifo_push_atomic(&tl->pending_connection_reqs, &request->super); return UCS_OK; } @@ -238,17 +236,21 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, return OPAL_SUCCESS; } -static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) +static int mca_btl_uct_setup_connection_tl(mca_btl_uct_tl_t *tl) { ucs_status_t ucs_status; - if (NULL == module->conn_tl) { + if (NULL == tl) { return OPAL_ERR_NOT_SUPPORTED; } - ucs_status = uct_iface_set_am_handler(module->conn_tl->uct_dev_contexts[0]->uct_iface, - MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, module, - UCT_CB_FLAG_ASYNC); + mca_btl_uct_device_context_t *context = + mca_btl_uct_module_get_tl_context_specific(/*module=*/NULL, tl, + /*context_id=*/0); + + ucs_status = uct_iface_set_am_handler(context->uct_iface, + MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, + tl, 
UCT_CB_FLAG_ASYNC); if (UCS_OK != ucs_status) { BTL_ERROR(("could not set active message handler for uct tl")); } @@ -256,23 +258,7 @@ static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) return UCS_OK == ucs_status ? OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) -{ - if (!context->progress_enabled) { -#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE - uct_iface_progress_enable(context->uct_iface, - UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#else - uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#endif - context->progress_enabled = true; - } -} - -mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, - mca_btl_uct_tl_t *tl, int context_id, - bool enable_progress) -{ +static int mca_btl_uct_populate_tl_attr(mca_btl_uct_tl_t *tl) { #if UCT_API >= UCT_VERSION(1, 6) uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_DEVICE, @@ -288,96 +274,38 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m .mode = {.device = {.tl_name = tl->uct_tl_name, .dev_name = tl->uct_dev_name}}}; #endif - mca_btl_uct_device_context_t *context; ucs_status_t ucs_status; - int rc; - context = calloc(1, sizeof(*context)); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - - context->context_id = context_id; - context->uct_btl = module; - OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); - OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t); - - rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), - opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, - opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_context_destroy(context); - return 
NULL; - } - - /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to - * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their - * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the - * various UCT calls. */ - ucs_status = uct_worker_create(module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + /* do the bare minimum to get tl attributes */ + uct_worker_h uct_worker; + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &uct_worker); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not create a UCT worker")); - mca_btl_uct_context_destroy(context); - return NULL; + return OPAL_ERROR; } - ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, - tl->uct_tl_config, &context->uct_iface); + uct_iface_h uct_iface; + ucs_status = uct_iface_open(tl->uct_md->uct_md, uct_worker, &iface_params, + tl->uct_tl_config, &uct_iface); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); - mca_btl_uct_context_destroy(context); - return NULL; + uct_worker_destroy(uct_worker); + return OPAL_ERROR; } - /* only need to query one of the interfaces to get the attributes */ - ucs_status = uct_iface_query(context->uct_iface, &context->uct_iface_attr); + int rc = OPAL_SUCCESS; + ucs_status = uct_iface_query(uct_iface, &tl->uct_iface_attr); if (UCS_OK != ucs_status) { BTL_VERBOSE(("Error querying UCT interface")); - mca_btl_uct_context_destroy(context); - return NULL; + rc = OPAL_ERROR; } - if (context_id > 0 && tl == module->am_tl) { - BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); - uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, - context, MCA_BTL_UCT_CB_FLAG_SYNC); - } - - if (enable_progress) { - BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id)); - mca_btl_uct_context_enable_progress(context); - } - - return context; -} - -void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) -{ - if (context->uct_iface) { - uct_iface_close(context->uct_iface); - context->uct_iface = NULL; - } - - if (context->uct_worker) { - uct_worker_destroy(context->uct_worker); - context->uct_worker = NULL; - } - - OBJ_DESTRUCT(&context->completion_fifo); - OBJ_DESTRUCT(&context->rdma_completions); - free(context); -} - -static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) -{ - mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; - mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; - - return tl_a->priority - tl_b->priority; + uct_iface_close(uct_iface); + uct_worker_destroy(uct_worker); + return rc; } -static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, +static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) { mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t); @@ -388,30 +316,29 @@ static mca_btl_uct_tl_t 
*mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca /* initialize btl tl structure */ tl->uct_md = md; - OBJ_RETAIN(md); tl->uct_tl_name = strdup(tl_desc->tl_name); tl->uct_dev_name = strdup(tl_desc->dev_name); + tl->dev_type = tl_desc->dev_type; tl->priority = priority; - tl->uct_dev_contexts = calloc(MCA_BTL_UCT_MAX_WORKERS, sizeof(tl->uct_dev_contexts[0])); - if (NULL == tl->uct_dev_contexts) { + (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); + + ucs_status_t ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &tl->ucs_async); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Could not create a UCT async context")); OBJ_RELEASE(tl); return NULL; } - (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); - - /* always create a 0 context (needed to query) */ - tl->uct_dev_contexts[0] = mca_btl_uct_context_create(module, tl, 0, false); - if (NULL == tl->uct_dev_contexts[0]) { - BTL_VERBOSE(("could not create a uct device context")); + int rc = mca_btl_uct_populate_tl_attr(tl); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(tl); return NULL; } - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, - (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); + BTL_VERBOSE(("Interface CAPS for tl %s::%s::%s 0x%lx", md->md_name, tl_desc->tl_name, + tl_desc->dev_name, (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; } @@ -420,32 +347,32 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl { BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name)); - mca_btl_uct_module_set_atomic_flags(module, tl); + module->rdma_tl = tl; + + mca_btl_uct_module_set_atomic_flags(module); - module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy; - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) { + module->super.btl_get_limit = opal_min(tl->uct_iface_attr.cap.get.max_zcopy, + 
module->super.btl_get_limit); + if (tl->uct_iface_attr.cap.get.max_bcopy) { module->super.btl_get_alignment = 0; - module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_get_local_registration_threshold = tl->uct_iface_attr .cap.get.max_bcopy; } else { /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */ module->super.btl_get_alignment = opal_next_poweroftwo_inclusive( - MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy); + tl->uct_iface_attr.cap.get.min_zcopy); } - module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy; + module->super.btl_put_limit = opal_min(tl->uct_iface_attr.cap.put.max_zcopy, + module->super.btl_put_limit); module->super.btl_put_alignment = 0; /* no registration needed when using short/bcopy put */ - module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_put_local_registration_threshold = tl->uct_iface_attr .cap.put.max_bcopy; - module->rdma_tl = tl; - OBJ_RETAIN(tl); - tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } @@ -454,46 +381,37 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); - - if (module->rdma_tl == tl) { - module->shared_endpoints = true; - } module->am_tl = tl; - OBJ_RETAIN(tl); - - uct_iface_set_am_handler(tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_FRAG, - mca_btl_uct_am_handler, tl->uct_dev_contexts[0], UCT_CB_FLAG_ASYNC); tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 
1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } - module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy - - sizeof(mca_btl_uct_am_header_t); - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { - module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy - - sizeof(mca_btl_uct_am_header_t); - } else { - module->super.btl_max_send_size = module->super.btl_eager_limit; + size_t max_eager_limit = tl->uct_iface_attr.cap.am.max_bcopy + - sizeof(mca_btl_uct_am_header_t); + size_t max_send_size = max_eager_limit; + + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { + max_send_size = opal_max(max_send_size, tl->uct_iface_attr.cap.am.max_zcopy + - sizeof(mca_btl_uct_am_header_t)); } + + module->super.btl_eager_limit = opal_min(module->super.btl_eager_limit, max_eager_limit); + module->super.btl_max_send_size = opal_min(module->super.btl_max_send_size, max_send_size); } -static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl) { int rc; BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); - module->conn_tl = tl; - rc = mca_btl_uct_setup_connection_tl(module); + rc = mca_btl_uct_setup_connection_tl(tl); if (OPAL_SUCCESS != rc) { return rc; } - OBJ_RETAIN(tl); - if (!tl->max_device_contexts) { /* if a tl is only being used to create connections do not bother with multiple * contexts */ @@ -503,11 +421,9 @@ static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_ return OPAL_SUCCESS; } -static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { - int rc; - - BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name)); + BTL_VERBOSE(("evaluating tl %s::%s", 
tl->uct_md->md_name, tl->uct_tl_name)); if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma(tl)) { mca_btl_uct_set_tl_rdma(module, tl); } @@ -516,165 +432,48 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ mca_btl_uct_set_tl_am(module, tl); } - if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn(tl)) { - rc = mca_btl_uct_set_tl_conn(module, tl); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - if (tl == module->rdma_tl || tl == module->am_tl) { - BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); - module->super.btl_flags |= mca_btl_uct_module_flags(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags); + BTL_VERBOSE(("tl has flags 0x%" PRIx64, tl->uct_iface_attr.cap.flags)); + module->super.btl_flags |= mca_btl_uct_module_flags(tl->uct_iface_attr.cap.flags); + module->super.btl_flags |= MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION; /* the bandwidth and latency numbers relate to both rdma and active messages. need to * come up with a better estimate. 
*/ /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ #if UCT_API >= UCT_VERSION(1, 7) - module->super.btl_bandwidth = (uint32_t)((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated - + MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared + module->super.btl_bandwidth = (uint32_t)((tl->uct_iface_attr.bandwidth.dedicated + + tl->uct_iface_attr.bandwidth.shared / (opal_process_info.num_local_peers + 1)) / 1048576.0); #else - module->super.btl_bandwidth = (uint32_t)(MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0); + module->super.btl_bandwidth = (uint32_t)(tl->uct_iface_attr.bandwidth / 1048576.0); #endif /* TODO -- figure out how to translate UCT latency to us */ module->super.btl_latency = 1; } - if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) { - /* make sure progress is enabled on the default context now that we know this TL will be - * used */ - mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); - } - return OPAL_SUCCESS; } -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count) { - bool include = true, any = false; - mca_btl_uct_tl_t *tl; - opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; - - OBJ_CONSTRUCT(&tl_list, opal_list_t); - - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); - - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } + BTL_VERBOSE(("processing %u tls in memory domain %s", tl_count, md->md_name)); for 
(unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; - } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { - continue; - } - - if (0 == strcmp(tl_descs[i].tl_name, "ud")) { - /* ud looks like any normal transport but we do not want to use it for anything other - * than connection management so ensure it gets evaluated last */ - priority = INT_MAX; - } - - tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); + BTL_VERBOSE(("processing tl %s::%s::%s", md->md_name, tl_descs[i].tl_name, tl_descs[i].dev_name)); + /* the priority will be set during module creation */ + mca_btl_uct_tl_t *tl = mca_btl_uct_create_tl(md, tl_descs + i, /*priority=*/0); if (tl) { - opal_list_append(&tl_list, &tl->super); + opal_list_append(&md->tls, &tl->super); } } - opal_argv_free(tl_filter); - - if (0 == opal_list_get_size(&tl_list)) { + if (0 == opal_list_get_size(&md->tls)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); - OBJ_DESTRUCT(&tl_list); return OPAL_ERR_NOT_AVAILABLE; } - opal_list_sort(&tl_list, tl_compare); - - OPAL_LIST_FOREACH (tl, &tl_list, mca_btl_uct_tl_t) { - mca_btl_uct_evaluate_tl(module, tl); - if (NULL != module->am_tl && NULL != module->rdma_tl - && (NULL != module->conn_tl - || !(mca_btl_uct_tl_requires_connection_tl(module->am_tl) - || mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)))) { - /* all done */ - break; - } - } - - if (NULL == module->rdma_tl) { - /* no rdma tls */ - BTL_VERBOSE(("no rdma tl matched supplied filter. 
disabling RDMA support")); - - module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; - module->super.btl_put = NULL; - module->super.btl_get = NULL; - module->super.btl_atomic_fop = NULL; - module->super.btl_atomic_op = NULL; - } - - if (NULL == module->am_tl) { - /* no active message tls == no send/recv */ - BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); - - module->super.btl_send = NULL; - module->super.btl_sendi = NULL; - module->super.btl_alloc = NULL; - module->super.btl_free = NULL; - } - - OPAL_LIST_DESTRUCT(&tl_list); - - if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl)) - && !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) - && module->conn_tl) { - /* no connection tl needed for selected transports */ - OBJ_RELEASE(module->conn_tl); - module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; - } - return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index b2bac61be61..d1625fa9bef 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -10,17 +10,23 @@ * $HEADER$ */ +#include + #if !defined(BTL_UCT_TYPES_H) # define BTL_UCT_TYPES_H # include "opal/mca/btl/btl.h" +#include "opal/class/opal_fifo.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_object.h" #include "opal/mca/timer/base/base.h" /* forward declarations */ struct mca_btl_uct_module_t; struct mca_btl_base_endpoint_t; struct mca_btl_uct_base_frag_t; +struct mca_btl_uct_tl_t; /* TL endpoint flags */ /** connection data was received */ @@ -64,10 +70,27 @@ typedef struct mca_btl_uct_modex_t mca_btl_uct_modex_t; */ struct mca_btl_uct_md_t { /** make this an opal object */ - opal_object_t super; + opal_list_item_t super; + + /** if true none of the tls 
in this domain will be used + * for communication */ + bool connection_only_domain; + + /** name of the memory domain backing this module */ + char *md_name; + + /** list of mca_btl_uct_tl_t's for this memory domain */ + opal_list_t tls; /** UCT memory domain handle */ uct_md_h uct_md; + + /** memory domain attributes */ + uct_md_attr_t md_attr; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h uct_component; +#endif }; typedef struct mca_btl_uct_md_t mca_btl_uct_md_t; @@ -90,6 +113,9 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + /** module that is being connected (local index to the receiver) */ + int module_index; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -119,6 +145,8 @@ struct mca_btl_uct_connection_ep_t { /** opal base object */ opal_object_t super; + struct mca_btl_uct_tl_t *tl; + /** UCT endpoint used for connection */ uct_ep_h uct_ep; }; @@ -151,9 +179,6 @@ struct mca_btl_uct_device_context_t { /** UCT interface handle */ uct_iface_h uct_iface; - /** interface attributes */ - uct_iface_attr_t uct_iface_attr; - /** RDMA completions */ opal_free_list_t rdma_completions; @@ -285,7 +310,7 @@ struct mca_btl_base_endpoint_t { opal_proc_t *ep_proc; /** mutex to protect this structure */ - opal_recursive_mutex_t ep_lock; + opal_mutex_t ep_lock; /** cached connection endpoint */ mca_btl_uct_connection_ep_t *conn_ep; @@ -308,7 +333,7 @@ struct mca_btl_uct_tl_t { /** relative priority 0 == highest */ int priority; - /** memory domain associated with this tl */ + /** memory domain associated with this tl (no reference) */ mca_btl_uct_md_t *uct_md; /** lock protecting tl structures */ @@ -323,22 +348,32 @@ struct mca_btl_uct_tl_t { /** device name for this tl (used for creating device contexts) */ char *uct_dev_name; + /** UCT device type from the tl description */ + uct_device_type_t dev_type; + /** maximum number of device contexts that can be created */ int max_device_contexts; /** array of device 
contexts */ - mca_btl_uct_device_context_t **uct_dev_contexts; + mca_btl_uct_device_context_t *uct_dev_contexts[MCA_BTL_UCT_MAX_WORKERS]; /** tl index. this is used to differentiate (if there is any difference) * between rdma and am endpoints */ int tl_index; + + /** interface attributes */ + uct_iface_attr_t uct_iface_attr; + + /** async context */ + ucs_async_context_t *ucs_async; + + /** pending connection requests */ + opal_fifo_t pending_connection_reqs; }; typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; OBJ_CLASS_DECLARATION(mca_btl_uct_tl_t); -# define MCA_BTL_UCT_TL_ATTR(tl, context_id) (tl)->uct_dev_contexts[(context_id)]->uct_iface_attr - struct mca_btl_uct_pending_connection_request_t { opal_list_item_t super; uint8_t request_data[]; @@ -347,4 +382,36 @@ struct mca_btl_uct_pending_connection_request_t { typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + opal_object_t super; + + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; +OBJ_CLASS_DECLARATION(mca_btl_uct_include_list_t); + +struct mca_btl_uct_tl_modex_t { + /** total size of this modex */ + uint16_t size; + char tl_name[UCT_TL_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_tl_modex_t mca_btl_uct_tl_modex_t; + +struct mca_btl_uct_md_modex_t { + /** total size of this modex */ + uint16_t size; + uint16_t module_index; + char md_name[UCT_MD_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_md_modex_t mca_btl_uct_md_modex_t; + #endif /* !defined(BTL_UCT_TYPES_H) */