From 0586a28d9af6e969ae4ef2b4a92c5e5a27b3ad38 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 27 Feb 2025 13:59:40 -0700 Subject: [PATCH 01/13] btl/uct: allow connections to be formed using a separate memory domain It is possible that the current memory domain does not have an adequate transport for forming endpoint to endpoint connections. When this is the case the btl will fail to function. To support these situations this CL adds support for using an alternate transport (usually tcp) which can be used to make the endpoint connections. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 26 +- opal/mca/btl/uct/btl_uct_am.c | 1 - opal/mca/btl/uct/btl_uct_component.c | 358 ++++++++++++++++++++------- opal/mca/btl/uct/btl_uct_endpoint.c | 206 +++++++++------ opal/mca/btl/uct/btl_uct_tl.c | 77 ++---- opal/mca/btl/uct/btl_uct_types.h | 16 ++ 6 files changed, 467 insertions(+), 217 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 65bc69fddb2..97a5c28c561 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -64,6 +64,9 @@ struct mca_btl_uct_module_t { /** base BTL interface */ mca_btl_base_module_t super; + /** module index in the component module array */ + int module_index; + /** whether the module has been fully initialized or not */ bool initialized; @@ -141,9 +144,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; + + /** transports to consider for forming connections */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; /** number of worker contexts to create */ int num_contexts_per_module; @@ -158,6 +167,10 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; + + /** alternate connection-only module that can be 
used if no suitable + * connection tl is found. this is usually a tcp tl. */ + mca_btl_uct_module_t *conn_module; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -294,7 +307,8 @@ struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t opal_proc_t *proc); int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); @@ -341,5 +355,15 @@ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } +/** + * @brief Find the rank of `name` in the include list `list`. + * + * @param[in] name name to find + * @param[in] list list to search + * + * A negative result means the name is not present or the list is negated. 
+ */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); + END_C_DECLS #endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 1aae456842c..85d89d2d734 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -55,7 +55,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade { uint32_t iov_count = 1; struct iovec iov; - size_t length; if (header_size > 0) { assert(NULL != header); diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 5eec97ec487..43673625d68 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -27,6 +27,8 @@ * $HEADER$ */ +#include + #include "opal_config.h" #include "opal/mca/btl/base/base.h" @@ -67,6 +69,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. 
Special" + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -124,6 +135,54 @@ static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, b ucm_vm_munmap(buf, length); } +static void mca_btl_uct_component_parse_include_list (const char *value, mca_btl_uct_include_list_t *list) { + list->list = NULL; + list->include = true; + + if (value == NULL) { + return; + } + + if (value[0] == '^') { + list->include = false; + value++; + } + + list->list = opal_argv_split(value, ','); +} + +static void mca_btl_uct_include_list_free (mca_btl_uct_include_list_t *list) { + opal_argv_free (list->list); + list->list = NULL; +} + +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { + if (list->list == NULL) { + return -1; + } + + for (int i = 0; list->list[i]; ++i) { + regex_t preg; + + BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i])); + int rc = regcomp(&preg, list->list[i], REG_ICASE); + if (0 != rc) { + char errbuf[256]; + regerror(rc, &preg, errbuf, sizeof(errbuf)); + BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf)); + continue; + } + + int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0); + regfree(&preg); + if (0 == result) { + return list->include ? i + 1 : -(i + 1); + } + } + + return list->include ? 
-1 : 1; +} + static int mca_btl_uct_component_open(void) { if (0 == mca_btl_uct_component.num_contexts_per_module) { @@ -167,10 +226,19 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + if (NULL != mca_btl_uct_component.conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } + mca_btl_uct_include_list_free (&mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); + return OPAL_SUCCESS; } @@ -235,6 +303,34 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_dat return modex_size; } +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) +{ + size_t name_len = strlen(module->md_name); + + /* pack the size */ + *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); + + modex_data += 4; + + strcpy((char *) modex_data, module->md_name); + modex_data += name_len + 1; + + if (module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); + } + + if (module->conn_tl && module->conn_tl != module->rdma_tl + && module->conn_tl != module->am_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); + } + + return modex_data; +} + static int mca_btl_uct_modex_send(void) { size_t modex_size = sizeof(mca_btl_uct_modex_t); @@ -246,35 +342,22 @@ static int mca_btl_uct_modex_send(void) modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); } + if (mca_btl_uct_component.conn_module != NULL) { + modex_size += 
mca_btl_uct_module_modex_size(mca_btl_uct_component.conn_module); + } + modex = alloca(modex_size); modex_data = modex->data; modex->module_count = mca_btl_uct_component.module_count; for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.modules[i], modex_data); + } - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } + if (mca_btl_uct_component.conn_module != NULL) { + ++modex->module_count; + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.conn_module, modex_data); } OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); @@ -323,6 +406,10 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; + if (header->data.tag == 0xff) { + fprintf (stderr, "%d: got an invalid tag\n", getpid()); + while (true) {} + } mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + header->data.tag; mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof(*header))}, @@ -337,17 +424,16 @@ ucs_status_t 
mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign tl_context->in_am_callback = true; reg->cbfunc(&uct_btl->super, &desc); tl_context->in_am_callback = false; + header->data.tag = 0xff; return UCS_OK; } #if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) + uct_md_resource_desc_t *md_desc) #else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) #endif { mca_rcache_base_resources_t rcache_resources; @@ -356,29 +442,35 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, uct_md_config_t *uct_config; uct_md_attr_t md_attr; mca_btl_uct_md_t *md; - bool found = false; + int list_rank; unsigned num_tls; char *tmp; ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { BTL_VERBOSE(("created the maximum number of allowable modules")); return OPAL_ERR_NOT_AVAILABLE; } - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); - for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do 
*/ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; } - } - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; } md = OBJ_NEW(mca_btl_uct_md_t); @@ -425,7 +517,9 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, return OPAL_ERR_OUT_OF_RESOURCE; } - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls); + /* if this module is not to be used for communication check if it has a transport suitable + * for forming connections. */ + (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls, consider_for_connection_module); uct_release_tl_resource_list(tl_desc); @@ -433,7 +527,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, * remain open until those modules are finalized. */ OBJ_RELEASE(md); - if (NULL == module->am_tl && NULL == module->rdma_tl) { + if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); mca_btl_uct_finalize(&module->super); return OPAL_ERR_NOT_AVAILABLE; @@ -443,35 +537,45 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, module->uct_component = component; #endif - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. 
*/ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; + if (!consider_for_connection_module) { + module->module_index = mca_btl_uct_component.module_count; + + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. 
*/ + (void) opal_asprintf(&tmp, "uct.%s", module->md_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) + + module->super.btl_registration_handle_size; + rcache_resources.register_mem = mca_btl_uct_reg_mem; + rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + mca_btl_uct_finalize(&module->super); + return OPAL_ERROR; + } + } else { + if (NULL == mca_btl_uct_component.conn_module) { + BTL_VERBOSE(("memory domain %s may be used for connections", md_desc->md_name)); + mca_btl_uct_component.conn_module = module; + } else { + mca_btl_uct_finalize(&module->super); + } } return OPAL_SUCCESS; } #if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) { uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; @@ -493,7 +597,7 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -505,6 +609,63 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } #endif /* UCT_API >= UCT_VERSION(1, 7) */ +static void mca_btl_uct_component_validate_modules(void) { + if (mca_btl_uct_component.conn_module != NULL) { + /* verify that a connection-only module is required. 
this might be the case in some systems + * where rc verbs is available but ud is not. */ + bool need_conn_module = false; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (module->conn_tl != NULL) { + continue; + } + if ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) { + need_conn_module = true; + break; + } + } + + if (!need_conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + } else { + int usable_module_count = mca_btl_uct_component.module_count; + + /* check that all modules can be used */ + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (NULL != module->conn_tl) { + /* module has its own connection transport */ + continue; + } + + if (((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) + && NULL == module->conn_tl) { + /* module can not be used */ + BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", + module->md_name)); + mca_btl_uct_finalize (&mca_btl_uct_component.modules[i]->super); + mca_btl_uct_component.modules[i] = NULL; + } + } + + /* remove holes in the module array */ + if (usable_module_count < mca_btl_uct_component.module_count) { + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i] == NULL) { + for (int j = i ; j < mca_btl_uct_component.module_count ; ++j) { + mca_btl_uct_component.modules[i++] = mca_btl_uct_component.modules[j]; + } + } + } + mca_btl_uct_component.module_count = usable_module_count; + } + } +} + /* * UCT component initialization: * (1) read interface list from kernel and 
compare against component parameters @@ -521,7 +682,6 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, */ struct mca_btl_base_module_t **base_modules; ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -533,10 +693,12 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { - return NULL; - } + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.allowed_transports, + &mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); mca_btl_uct_component.module_count = 0; @@ -552,7 +714,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); + rc = mca_btl_uct_component_process_uct_component(components[i]); if (OPAL_SUCCESS != rc) { break; } @@ -568,7 +730,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -578,7 +740,9 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, #endif /* UCT_API >= UCT_VERSION(1, 7) */ - opal_argv_free(allowed_ifaces); + /* filter out unusable modules before sending the modex */ + mca_btl_uct_component_validate_modules(); + 
mca_btl_uct_modex_send(); /* pass module array back to caller */ @@ -644,6 +808,36 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *module) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (module->conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(module->conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &module->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + BTL_VERBOSE(("processing connection request....")); + if (conn_req->module_index >= mca_btl_uct_component.module_count) { + BTL_ERROR(("invalid connection request received")); + abort(); + } + int rc = mca_btl_uct_process_connection_request(mca_btl_uct_component.modules[conn_req->module_index], conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -665,27 +859,17 @@ static int mca_btl_uct_component_progress(void) ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); } - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - request->request_data); - OBJ_RELEASE(request); - } - } - + mca_btl_uct_component_progress_connections (module); + if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } + if (NULL != 
mca_btl_uct_component.conn_module) { + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 695fd754aa2..39f2979302f 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -256,50 +256,57 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *conn_tl_data, int request_type) +static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *conn_tl, uint8_t *conn_tl_data) { - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; - - assert(NULL != conn_tl); - - BTL_VERBOSE(("connecting endpoint to remote endpoint")); + if (NULL != endpoint->conn_ep) { + BTL_VERBOSE(("re-using existing connection endpoint")); + OBJ_RETAIN(endpoint->conn_ep); + return OPAL_SUCCESS; + } - if (NULL == endpoint->conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print(endpoint->ep_proc->proc_name))); - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t 
*) ((uintptr_t) conn_tl_data + + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); - endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, device_addr, iface_addr, &endpoint->conn_ep->uct_ep); }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(endpoint->conn_ep); + if (UCS_OK != ucs_status) { + BTL_VERBOSE( + ("could not create an endpoint for forming connection to remote peer. 
code = %d", + ucs_status)); + return OPAL_ERROR; } + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_send_connection_data( + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type, int remote_module_index) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + ucs_status_t ucs_status; + + assert(NULL != conn_tl); + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + size_t request_length = sizeof(mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); @@ -309,6 +316,7 @@ static int mca_btl_uct_endpoint_send_connection_data( request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = request_type; + request->module_index = remote_module_index; /* fill in connection request */ ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); @@ -337,9 +345,9 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr, int remote_module_index) { ucs_status_t ucs_status; @@ -378,11 +386,47 @@ static int mca_btl_uct_endpoint_connect_endpoint( : OPAL_ERR_OUT_OF_RESOURCE; } - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, - conn_tl_data, 
/*request_type=*/!!ep_addr); + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, + /*request_type=*/!!ep_addr, remote_module_index); return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; } +static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, + uint8_t **rdma_tl_data, uint8_t **am_tl_data, uint8_t **conn_tl_data, int *remote_module_index) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + uint32_t modex_size = *((uint32_t *) modex_data); + + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + + modex_data += 4; + + if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += modex_size - 4; + continue; + } + + modex_data += strlen((char *) modex_data) + 1; + + mca_btl_uct_process_modex(uct_btl, modex_data, rdma_tl_data, am_tl_data, conn_tl_data); + if (NULL != remote_module_index) { + *remote_module_index = i; + } + + BTL_VERBOSE(("finished processing modex for %s", uct_btl->md_name)); + + return OPAL_SUCCESS; + } + + BTL_ERROR(("could not find modex for %s", uct_btl->md_name)); + + return OPAL_ERR_NOT_FOUND; +} + + int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, void *ep_addr, int tl_index) { @@ -394,7 +438,6 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id); uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; mca_btl_uct_modex_t *modex; - uint8_t *modex_data; size_t msg_size; int rc; @@ -410,19 +453,20 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); 
opal_mutex_lock(&endpoint->ep_lock); - if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { - opal_mutex_unlock(&endpoint->ep_lock); - /* nothing more to do. someone else completed the connection */ - return OPAL_SUCCESS; - } - - /* dumpicate connection request. nothing to do until the endpoint data is received */ - if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } do { + if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { + /* nothing more to do. someone else completed the connection */ + rc = OPAL_SUCCESS; + break; + } + + /* duplicate connection request. nothing to do until the endpoint data is received */ + if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + break; + } + /* read the modex. this is done both to start the connection and to process endpoint data */ OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, &endpoint->ep_proc->proc_name, (void **) &modex, &msg_size); @@ -434,45 +478,51 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp BTL_VERBOSE(("received modex of size %lu for proc %s. 
module count %d", (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE( - ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - modex_data += strlen((char *) modex_data) + 1; - - mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data, - &conn_tl_data); + int remote_module_index; + rc = mca_btl_uct_find_modex (uct_btl, modex, &rdma_tl_data, &am_tl_data, &conn_tl_data, &remote_module_index); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { break; } tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data; - if (NULL == tl_data) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_UNREACH; + if (OPAL_UNLIKELY(NULL == tl_data)) { + BTL_ERROR(("could not find modex data for this transport")); + rc = OPAL_ERR_UNREACH; + break; } /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl(tl)) { - rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); + if (mca_btl_uct_tl_requires_connection_tl(tl)) { + mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; + if (NULL == conn_tl) { + rc = mca_btl_uct_find_modex (mca_btl_uct_component.conn_module, modex, + /*rdma_tl_data=*/NULL, /*am_tl_data=*/NULL, + &conn_tl_data, /*remote_module_index=*/NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("could not find modex for connection module")); + break; + } + + BTL_VERBOSE(("using separate connection module for tl")); + conn_tl = mca_btl_uct_component.conn_module->conn_tl; + } + + if (NULL == tl_endpoint->uct_ep) { + /* allocate or retain a connection 
endpoint */ + rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint, conn_tl, + conn_tl_data); + if (OPAL_SUCCESS != rc) { + break; + } + } + + rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, conn_tl, endpoint, tl, + tl_context, tl_endpoint, tl_data, ep_addr, remote_module_index); } else { - rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, - tl_endpoint, tl_data, conn_tl_data, ep_addr); + rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); } - } while (0); opal_mutex_unlock(&endpoint->ep_lock); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c1ef4c6d727..fd8061fa81c 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -553,57 +553,27 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ } int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only) { - bool include = true, any = false; mca_btl_uct_tl_t *tl; opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; OBJ_CONSTRUCT(&tl_list, opal_list_t); - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); - - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } - for (unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - 
priority = j; - break; + int priority = 0; + BTL_VERBOSE(("processing tl %s, evaluate_for_conn_only=%d", tl_descs[i].tl_name, evaluate_for_conn_only)); + + if (!evaluate_for_conn_only) { + priority = mca_btl_uct_include_list_rank (tl_descs[i].tl_name, &mca_btl_uct_component.allowed_transport_list); + BTL_VERBOSE(("tl filter: tl_name = %s, priority = %d", tl_descs[i].tl_name, + priority)); + if (priority < 0) { + continue; } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { + } else if (tl_descs[i].dev_type != UCT_DEVICE_TYPE_NET) { + /* only network types are suitable for forming connections */ continue; } @@ -616,12 +586,23 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); if (tl) { - opal_list_append(&tl_list, &tl->super); + if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { + BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); + int rc = mca_btl_uct_set_tl_conn(module, tl); + OBJ_RELEASE(tl); + + if (OPAL_SUCCESS == rc) { + mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("tl %s cannot be used for forming connections", tl_descs[i].tl_name)); + } else { + opal_list_append(&tl_list, &tl->super); + } } } - opal_argv_free(tl_filter); - if (0 == opal_list_get_size(&tl_list)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); OBJ_DESTRUCT(&tl_list); @@ -670,10 +651,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, /* no connection tl needed for selected transports */ OBJ_RELEASE(module->conn_tl); module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; } return 
OPAL_SUCCESS; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index b2bac61be61..6bdf8286a73 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -90,6 +90,9 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + /** module that is being connected (local index to the receiver) */ + int module_index; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -347,4 +350,17 @@ struct mca_btl_uct_pending_connection_request_t { typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; + + #endif /* !defined(BTL_UCT_TYPES_H) */ From 4189968e9f9fe5016e2af74b4e3e1935aef1d0da Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 22 Apr 2025 21:57:12 +0000 Subject: [PATCH 02/13] btl/uct: move tl attributes off of tl context structure In theory the tl attributes do not differ between contexts, so query them once when the tl is created rather than once per context. This removes the need to allocate the first context so that code has also been removed. 
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 8 +-- opal/mca/btl/uct/btl_uct_am.c | 17 ++--- opal/mca/btl/uct/btl_uct_component.c | 19 ++---- opal/mca/btl/uct/btl_uct_endpoint.c | 6 +- opal/mca/btl/uct/btl_uct_module.c | 2 +- opal/mca/btl/uct/btl_uct_rdma.c | 4 +- opal/mca/btl/uct/btl_uct_tl.c | 99 +++++++++++++++++++--------- opal/mca/btl/uct/btl_uct_types.h | 8 +-- 8 files changed, 97 insertions(+), 66 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 97a5c28c561..3535f493d42 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -319,7 +319,7 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, */ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); } @@ -329,7 +329,7 @@ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); } @@ -340,7 +340,7 @@ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) { - return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); } @@ -352,7 +352,7 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) { - return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); + return !(tl->uct_iface_attr.cap.flags & 
UCT_IFACE_FLAG_CONNECT_TO_IFACE); } /** diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 85d89d2d734..a0d50836e47 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +27,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl, mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; mca_btl_uct_base_frag_t *frag = NULL; - if (size <= (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + if (size <= (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { frag = mca_btl_uct_frag_alloc_short(uct_btl, endpoint); } else if (size <= uct_btl->super.btl_eager_limit) { frag = mca_btl_uct_frag_alloc_eager(uct_btl, endpoint); @@ -105,7 +106,7 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t frag->uct_iov.length = total_size; frag->base.order = order; frag->base.des_flags = flags; - if (total_size > (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + if (total_size > (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { frag->segments[0].seg_len = reserve; frag->segments[1].seg_len = *size; frag->segments[1].seg_addr.pval = data_ptr; @@ -181,7 +182,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t mca_btl_uct_context_lock(context); /* attempt to post the fragment */ if (NULL != frag->base.super.registration - && (context->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { + && (uct_btl->am_tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) { frag->comp.dev_context = context; ucs_status = uct_ep_am_zcopy(ep_handle, MCA_BTL_UCT_FRAG, &frag->header, sizeof(frag->header), &frag->uct_iov, 1, 0, @@ -196,7 +197,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t 
*uct_btl, mca_btl_uct_base_frag_t /* short message */ if (1 == frag->base.des_segment_count && (frag->uct_iov.length + 8) - < MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) { + < uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, frag->header.value, frag->uct_iov.buffer, frag->uct_iov.length); @@ -290,9 +291,9 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg) return args->header_size + args->payload_size + 8; } -static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl, int context_id) +static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl) { - return MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context_id).cap.am.max_bcopy; + return uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy; } int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, @@ -312,7 +313,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo rc = mca_btl_uct_endpoint_check_am(uct_btl, endpoint, context, &ep_handle); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc - || msg_size > mca_btl_uct_max_sendi(uct_btl, context->context_id))) { + || msg_size > mca_btl_uct_max_sendi(uct_btl))) { if (descriptor) { *descriptor = mca_btl_uct_alloc(btl, endpoint, order, total_size, flags); } @@ -326,7 +327,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo if (0 == payload_size) { ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header, header_size); - } else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id) + } else if (msg_size < (size_t) uct_btl->am_tl->uct_iface_attr .cap.am.max_short) { int8_t *data = alloca(total_size); size_t packed_payload_size = payload_size; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 43673625d68..171f07d7a2a 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ 
b/opal/mca/btl/uct/btl_uct_component.c @@ -246,14 +246,14 @@ static size_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) { const size_t size = strlen(tl->uct_tl_name) + 1; - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { /* pad out to a multiple of 4 bytes */ - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len - + MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len) + return (4 + 3 + size + tl->uct_iface_attr.device_addr_len + + tl->uct_iface_attr.iface_addr_len) & ~3; } - return (4 + 3 + size + MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len) & ~3; + return (4 + 3 + size + tl->uct_iface_attr.device_addr_len) & ~3; } static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) @@ -292,13 +292,13 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_dat * the same endpoint since we are only doing RDMA. if any of these assumptions are * wrong then we can't delay creating the other contexts and must include their * information in the modex. 
*/ - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).iface_addr_len; + modex_data += tl->uct_iface_attr.iface_addr_len; } uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) modex_data); - modex_data += MCA_BTL_UCT_TL_ATTR(tl, 0).device_addr_len; + modex_data += tl->uct_iface_attr.device_addr_len; return modex_size; } @@ -406,10 +406,6 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; - if (header->data.tag == 0xff) { - fprintf (stderr, "%d: got an invalid tag\n", getpid()); - while (true) {} - } mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + header->data.tag; mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof(*header))}, @@ -424,7 +420,6 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign tl_context->in_am_callback = true; reg->cbfunc(&uct_btl->super, &desc); tl_context->in_am_callback = false; - header->data.tag = 0xff; return UCS_OK; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 39f2979302f..5ff2405f5a4 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -150,7 +150,7 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca /* easy case. 
just connect to the interface */ iface_addr = (uct_iface_addr_t *) tl_data; device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id) + + tl->uct_iface_attr .iface_addr_len); BTL_VERBOSE(("connecting endpoint to interface")); @@ -270,7 +270,7 @@ static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_bt uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + + conn_tl->uct_iface_attr.iface_addr_len); endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { @@ -308,7 +308,7 @@ static int mca_btl_uct_endpoint_send_connection_data( BTL_VERBOSE(("connecting endpoint to remote endpoint")); size_t request_length = sizeof(mca_btl_uct_conn_req_t) - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; + + tl->uct_iface_attr.ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); /* fill in common request parameters */ diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 9577d615b92..f4ed0188320 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -90,7 +90,7 @@ static int mca_btl_uct_add_procs(mca_btl_base_module_t *btl, size_t nprocs, if (am_tl) { rc = opal_free_list_init(&uct_module->short_frags, sizeof(mca_btl_uct_base_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), - MCA_BTL_UCT_TL_ATTR(am_tl, 0).cap.am.max_short, + am_tl->uct_iface_attr.cap.am.max_short, opal_cache_line_size, 0, 1024, 64, NULL, 0, NULL, NULL, NULL); rc = opal_free_list_init(&uct_module->eager_frags, sizeof(mca_btl_uct_base_frag_t), diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index d4210e4631c..e1e8f4b91d9 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c 
@@ -126,7 +126,7 @@ int mca_btl_uct_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin mca_btl_uct_context_lock(context); - if (size <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.get.max_bcopy) { + if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.get.max_bcopy) { ucs_status = uct_ep_get_bcopy(ep_handle, mca_btl_uct_get_unpack, local_address, size, remote_address, rkey.rkey, &comp->uct_comp); } else { @@ -223,7 +223,7 @@ int mca_btl_uct_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin /* determine what UCT prototol should be used */ if (size <= uct_btl->super.btl_put_local_registration_threshold) { use_short = size - <= MCA_BTL_UCT_TL_ATTR(uct_btl->rdma_tl, context->context_id).cap.put.max_short; + <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short; use_bcopy = !use_short; } diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index fd8061fa81c..6417bc175df 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -72,11 +72,11 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; /* NTH: only use the fetching atomics for now */ - uint64_t atomic_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags; - uint64_t atomic_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags; + uint64_t atomic_flags32 = tl->uct_iface_attr.cap.atomic32.fop_flags; + uint64_t atomic_flags64 = tl->uct_iface_attr.cap.atomic64.fop_flags; uint64_t all_flags = atomic_flags64 | atomic_flags32; @@ -122,7 +122,7 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { */ static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { - uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags; + uint64_t cap_flags = 
tl->uct_iface_attr.cap.flags; module->super.btl_atomic_flags = 0; @@ -269,6 +269,53 @@ static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *co } } +static int mca_btl_uct_populate_tl_attr(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { +#if UCT_API >= UCT_VERSION(1, 6) + uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE + | UCT_IFACE_PARAM_FIELD_DEVICE, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#else + uct_iface_params_t iface_params = {.rndv_cb = NULL, + .eager_cb = NULL, + .stats_root = NULL, + .rx_headroom = 0, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#endif + ucs_status_t ucs_status; + + /* do the bare minimum to get tl attributes */ + uct_worker_h uct_worker; + ucs_status = uct_worker_create(module->ucs_async, UCS_THREAD_MODE_SINGLE, &uct_worker); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not create a UCT worker")); + return OPAL_ERROR; + } + + uct_iface_h uct_iface; + ucs_status = uct_iface_open(tl->uct_md->uct_md, uct_worker, &iface_params, + tl->uct_tl_config, &uct_iface); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); + uct_worker_destroy(uct_worker); + return OPAL_ERROR; + } + + /* only need to query one of the interfaces to get the attributes */ + ucs_status = uct_iface_query(uct_iface, &tl->uct_iface_attr); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Error querying UCT interface")); + uct_worker_destroy(uct_worker); + return OPAL_ERROR; + } + + uct_iface_close(uct_iface); + return OPAL_SUCCESS; +} + mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress) @@ -330,14 +377,6 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m return NULL; } - /* only need to query one of the interfaces to get the attributes */ - ucs_status = uct_iface_query(context->uct_iface, &context->uct_iface_attr); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Error querying UCT interface")); - mca_btl_uct_context_destroy(context); - return NULL; - } - if (context_id > 0 && tl == module->am_tl) { BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, @@ -402,16 +441,14 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); - /* always create a 0 context (needed to query) */ - tl->uct_dev_contexts[0] = mca_btl_uct_context_create(module, tl, 0, false); - if (NULL == tl->uct_dev_contexts[0]) { - BTL_VERBOSE(("could not create a uct device context")); + int rc = mca_btl_uct_populate_tl_attr(module, tl); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(tl); return NULL; } BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, - (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); + (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; } @@ -422,23 +459,23 @@ static void 
mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl mca_btl_uct_module_set_atomic_flags(module, tl); - module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy; - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) { + module->super.btl_get_limit = tl->uct_iface_attr.cap.get.max_zcopy; + if (tl->uct_iface_attr.cap.get.max_bcopy) { module->super.btl_get_alignment = 0; - module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_get_local_registration_threshold = tl->uct_iface_attr .cap.get.max_bcopy; } else { /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */ module->super.btl_get_alignment = opal_next_poweroftwo_inclusive( - MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy); + tl->uct_iface_attr.cap.get.min_zcopy); } - module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy; + module->super.btl_put_limit = tl->uct_iface_attr.cap.put.max_zcopy; module->super.btl_put_alignment = 0; /* no registration needed when using short/bcopy put */ - module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0) + module->super.btl_put_local_registration_threshold = tl->uct_iface_attr .cap.put.max_bcopy; module->rdma_tl = tl; @@ -470,10 +507,10 @@ static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } - module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy + module->super.btl_eager_limit = tl->uct_iface_attr.cap.am.max_bcopy - sizeof(mca_btl_uct_am_header_t); - if (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { - module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { + module->super.btl_max_send_size = tl->uct_iface_attr.cap.am.max_zcopy - sizeof(mca_btl_uct_am_header_t); } else { 
module->super.btl_max_send_size = module->super.btl_eager_limit; @@ -524,20 +561,20 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ } if (tl == module->rdma_tl || tl == module->am_tl) { - BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags)); - module->super.btl_flags |= mca_btl_uct_module_flags(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags); + BTL_VERBOSE(("tl has flags 0x%" PRIx64, tl->uct_iface_attr.cap.flags)); + module->super.btl_flags |= mca_btl_uct_module_flags(tl->uct_iface_attr.cap.flags); /* the bandwidth and latency numbers relate to both rdma and active messages. need to * come up with a better estimate. */ /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ #if UCT_API >= UCT_VERSION(1, 7) - module->super.btl_bandwidth = (uint32_t)((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated - + MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared + module->super.btl_bandwidth = (uint32_t)((tl->uct_iface_attr.bandwidth.dedicated + + tl->uct_iface_attr.bandwidth.shared / (opal_process_info.num_local_peers + 1)) / 1048576.0); #else - module->super.btl_bandwidth = (uint32_t)(MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0); + module->super.btl_bandwidth = (uint32_t)(tl->uct_iface_attr.bandwidth / 1048576.0); #endif /* TODO -- figure out how to translate UCT latency to us */ module->super.btl_latency = 1; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 6bdf8286a73..8305049b791 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -154,9 +154,6 @@ struct mca_btl_uct_device_context_t { /** UCT interface handle */ uct_iface_h uct_iface; - /** interface attributes */ - uct_iface_attr_t uct_iface_attr; - /** RDMA completions */ opal_free_list_t rdma_completions; @@ -335,13 +332,14 @@ struct mca_btl_uct_tl_t { /** tl index. 
this is used to differentiate (if there is any difference) * between rdma and am endpoints */ int tl_index; + + /** interface attributes */ + uct_iface_attr_t uct_iface_attr; }; typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; OBJ_CLASS_DECLARATION(mca_btl_uct_tl_t); -# define MCA_BTL_UCT_TL_ATTR(tl, context_id) (tl)->uct_dev_contexts[(context_id)]->uct_iface_attr - struct mca_btl_uct_pending_connection_request_t { opal_list_item_t super; uint8_t request_data[]; From 3cfd9ee7371c2bbf79e8aa074cf37c9c88227099 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 22 Apr 2025 22:23:09 +0000 Subject: [PATCH 03/13] btl/uct: change mca_btl_uct_tl_t uct_dev_contexts member to be an array The btl always allocates the maximum number of contexts. This is not a significant amount of memory. Rather than reduce it to be based on the configured maximum number of contexts it makes sense to just make it an array and remove the extra indirection when accessing the contexts. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_component.c | 12 +++++++----- opal/mca/btl/uct/btl_uct_tl.c | 28 +++++----------------------- opal/mca/btl/uct/btl_uct_types.h | 2 +- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 171f07d7a2a..6fba2d81d41 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -275,9 +275,11 @@ static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) return modex_size; } -static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_data) +static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, + uint8_t *modex_data) { - mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0]; + mca_btl_uct_device_context_t *dev_context = + mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0); size_t modex_size = mca_btl_uct_tl_modex_size(tl); 
*((uint32_t *) modex_data) = (uint32_t) modex_size; @@ -316,16 +318,16 @@ static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *mo modex_data += name_len + 1; if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); + modex_data += mca_btl_uct_tl_modex_pack(module, module->rdma_tl, modex_data); } if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); + modex_data += mca_btl_uct_tl_modex_pack(module, module->am_tl, modex_data); } if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); + modex_data += mca_btl_uct_tl_modex_pack(module, module->conn_tl, modex_data); } return modex_data; diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 6417bc175df..8de7c45c97b 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -160,7 +160,6 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) OBJ_RELEASE(tl->uct_md); } - free(tl->uct_dev_contexts); free(tl->uct_tl_name); free(tl->uct_dev_name); @@ -246,7 +245,10 @@ static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) return OPAL_ERR_NOT_SUPPORTED; } - ucs_status = uct_iface_set_am_handler(module->conn_tl->uct_dev_contexts[0]->uct_iface, + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_tl_context_specific(module, module->conn_tl, + /*context_id=*/0); + + ucs_status = uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, module, UCT_CB_FLAG_ASYNC); if (UCS_OK != ucs_status) { @@ -377,7 +379,7 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m return NULL; } - if (context_id > 0 && tl == module->am_tl) { + if (tl == module->am_tl) { BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, 
context_id)); uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, context, MCA_BTL_UCT_CB_FLAG_SYNC); @@ -433,12 +435,6 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca tl->uct_dev_name = strdup(tl_desc->dev_name); tl->priority = priority; - tl->uct_dev_contexts = calloc(MCA_BTL_UCT_MAX_WORKERS, sizeof(tl->uct_dev_contexts[0])); - if (NULL == tl->uct_dev_contexts) { - OBJ_RELEASE(tl); - return NULL; - } - (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); int rc = mca_btl_uct_populate_tl_attr(module, tl); @@ -491,16 +487,9 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); - - if (module->rdma_tl == tl) { - module->shared_endpoints = true; - } module->am_tl = tl; OBJ_RETAIN(tl); - uct_iface_set_am_handler(tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_FRAG, - mca_btl_uct_am_handler, tl->uct_dev_contexts[0], UCT_CB_FLAG_ASYNC); - tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 
1 : 0; module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { @@ -580,12 +569,6 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ module->super.btl_latency = 1; } - if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) { - /* make sure progress is enabled on the default context now that we know this TL will be - * used */ - mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); - } - return OPAL_SUCCESS; } @@ -629,7 +612,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, OBJ_RELEASE(tl); if (OPAL_SUCCESS == rc) { - mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 8305049b791..1ae8a9c8073 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -327,7 +327,7 @@ struct mca_btl_uct_tl_t { int max_device_contexts; /** array of device contexts */ - mca_btl_uct_device_context_t **uct_dev_contexts; + mca_btl_uct_device_context_t *uct_dev_contexts[MCA_BTL_UCT_MAX_WORKERS]; /** tl index. this is used to differentiate (if there is any difference) * between rdma and am endpoints */ From 72fc031828cc29691584c4a2e6bcec932a5f2e6c Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 17:24:06 +0000 Subject: [PATCH 04/13] btl/uct: fix some errors around connection endpoint handling This commit fixes a couple of bugs discovered using rma-mt: - Do not call mca_btl_uct_endpoint_set_flag before sending a message on the connection endpoint. This method may cause the release of the connection endpoint (cached on the BTL endpoint). If this happens it would lead to a SEGV. - Flush the endpoint only when it is being released. There is no need to do so on every send. Releasing the endpoint without flushing it may lead to it being destroyed while still processing data. 
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_endpoint.c | 115 +++++++++++++++------------- opal/mca/btl/uct/btl_uct_types.h | 3 + 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 5ff2405f5a4..eaf086493d6 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -164,22 +164,6 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) -{ - ep->uct_ep = NULL; -} - -static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) -{ - if (ep->uct_ep) { - uct_ep_destroy(ep->uct_ep); - ep->uct_ep = NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, - mca_btl_uct_connection_ep_destruct); - struct mca_btl_uct_conn_completion_t { uct_completion_t super; volatile bool complete; @@ -203,24 +187,62 @@ static void mca_btl_uct_endpoint_flush_complete(uct_completion_t *self, ucs_stat } #endif +static void mca_btl_uct_flush_conn_endpoint(mca_btl_uct_connection_ep_t *conn_ep) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_ep->tl->uct_dev_contexts[0]; + mca_btl_uct_conn_completion_t completion + = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; + ucs_status_t ucs_status; + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, &completion.super); + }); + if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { + /* NTH: I don't know if this path is needed. For some networks we must use a completion. 
*/ + do { + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, NULL); + }); + mca_btl_uct_context_progress(conn_tl_context); + } while (UCS_INPROGRESS == ucs_status); + } else { + do { + mca_btl_uct_context_progress(conn_tl_context); + } while (!completion.complete); + } +} + +static void mca_btl_uct_connection_ep_construct(mca_btl_uct_connection_ep_t *ep) +{ + ep->uct_ep = NULL; + ep->tl = NULL; +} + +static void mca_btl_uct_connection_ep_destruct(mca_btl_uct_connection_ep_t *ep) +{ + if (ep->uct_ep) { + mca_btl_uct_flush_conn_endpoint(ep); + uct_ep_destroy(ep->uct_ep); + ep->uct_ep = NULL; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, + mca_btl_uct_connection_ep_destruct); + static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_device_context_t *conn_tl_context, + mca_btl_uct_tl_t *conn_tl, mca_btl_uct_conn_req_t *request, size_t request_length) { - mca_btl_uct_conn_completion_t completion - = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; - ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; BTL_VERBOSE( ("sending connection request to peer. 
context id: %d, type: %d, length: %" PRIsize_t, request->context_id, request->type, request_length)); - /* need to drop the lock to avoid hold-and-wait */ - opal_mutex_unlock(&endpoint->ep_lock); - do { + ucs_status_t ucs_status; MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = uct_ep_am_short(endpoint->conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type, request, request_length); @@ -233,26 +255,13 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_ERROR; } + /* need to drop the lock to avoid hold-and-wait */ + opal_mutex_unlock(&endpoint->ep_lock); /* some TLs (UD for example) need to be progressed to get resources */ mca_btl_uct_context_progress(conn_tl_context); + opal_mutex_lock(&endpoint->ep_lock); } while (1); - /* for now we just wait for the connection request to complete before continuing */ - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, &completion.super); - if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) { - /* NTH: I don't know if this path is needed. For some networks we must use a completion. 
*/ - do { - ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, NULL); - mca_btl_uct_context_progress(conn_tl_context); - } while (UCS_INPROGRESS == ucs_status); - } else { - do { - mca_btl_uct_context_progress(conn_tl_context); - } while (!completion.complete); - } - - opal_mutex_lock(&endpoint->ep_lock); - return OPAL_SUCCESS; } @@ -277,6 +286,8 @@ static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_bt return OPAL_ERR_OUT_OF_RESOURCE; } + endpoint->conn_ep->tl = conn_tl; + ucs_status_t ucs_status; mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; /* create a temporary endpoint for setting up the rdma endpoint */ @@ -300,11 +311,8 @@ static int mca_btl_uct_endpoint_send_connection_data( mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type, int remote_module_index) { - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; ucs_status_t ucs_status; - assert(NULL != conn_tl); - BTL_VERBOSE(("connecting endpoint to remote endpoint")); size_t request_length = sizeof(mca_btl_uct_conn_req_t) @@ -330,7 +338,7 @@ static int mca_btl_uct_endpoint_send_connection_data( /* let the remote side know that the connection has been established and * wait for the message to be sent */ - int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl, request, request_length); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(endpoint->conn_ep); @@ -375,20 +383,23 @@ static int mca_btl_uct_endpoint_connect_endpoint( if (UCS_OK != ucs_status) { return OPAL_ERROR; } - - mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint, - MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED); } opal_timer_t now = opal_timer_base_get_usec(); - if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && 
!ep_addr) { - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS - : OPAL_ERR_OUT_OF_RESOURCE; + if ((now - tl_endpoint->last_connection_req) > mca_btl_uct_component.connection_retry_timeout || ep_addr) { + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, + /*request_type=*/!!ep_addr, remote_module_index); + if (OPAL_SUCCESS != rc) { + return rc; + } + } + + if (ep_addr) { + mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint, + MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED); } - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, - /*request_type=*/!!ep_addr, remote_module_index); - return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; + return OPAL_ERR_OUT_OF_RESOURCE; } static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 1ae8a9c8073..e4a49e18183 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -21,6 +21,7 @@ struct mca_btl_uct_module_t; struct mca_btl_base_endpoint_t; struct mca_btl_uct_base_frag_t; +struct mca_btl_uct_tl_t; /* TL endpoint flags */ /** connection data was received */ @@ -122,6 +123,8 @@ struct mca_btl_uct_connection_ep_t { /** opal base object */ opal_object_t super; + struct mca_btl_uct_tl_t *tl; + /** UCT endpoint used for connection */ uct_ep_h uct_ep; }; From d93556d0cfe1f4d8b73c2c6d9e1422944f3b2a9d Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 17:28:06 +0000 Subject: [PATCH 05/13] btl/uct: downgrade endpoint lock from recursive Recursive locks are not needed for the endpoint lock. This commit modifies the lock to be opal_mutex_t.
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_endpoint.c | 2 +- opal/mca/btl/uct/btl_uct_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index eaf086493d6..843dccec523 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -24,7 +24,7 @@ static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint) memset(endpoint->uct_eps, 0, sizeof(endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module); endpoint->conn_ep = NULL; - OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t); } static void mca_btl_uct_endpoint_destruct(mca_btl_uct_endpoint_t *endpoint) diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index e4a49e18183..00444a1e69b 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -288,7 +288,7 @@ struct mca_btl_base_endpoint_t { opal_proc_t *ep_proc; /** mutex to protect this structure */ - opal_recursive_mutex_t ep_lock; + opal_mutex_t ep_lock; /** cached connection endpoint */ mca_btl_uct_connection_ep_t *conn_ep; From 2f3ffe39157213983b98a75f427a6fa91237a828 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 17:29:21 +0000 Subject: [PATCH 06/13] btl/uct: move the async context from the module to the tl There is no real benefit from sharing the async context between tls. Given this and some other changes that will be made it makes sense to move it from the module to the tl. 
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 3 --- opal/mca/btl/uct/btl_uct_component.c | 7 ------- opal/mca/btl/uct/btl_uct_module.c | 2 -- opal/mca/btl/uct/btl_uct_tl.c | 15 +++++++++++++-- opal/mca/btl/uct/btl_uct_types.h | 3 +++ 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 3535f493d42..1a2f5eee68e 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -79,9 +79,6 @@ struct mca_btl_uct_module_t { /** mutex to protect the module */ opal_recursive_mutex_t lock; - /** async context */ - ucs_async_context_t *ucs_async; - /** transport for active messaging */ mca_btl_uct_tl_t *am_tl; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 6fba2d81d41..0cc529c3108 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -393,13 +393,6 @@ static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_b module->md_name = strdup(md_name); module->super.btl_registration_handle_size = registration_size; - ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &module->ucs_async); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("Could not create a UCT async context")); - mca_btl_uct_finalize(&module->super); - return NULL; - } - return module; } diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index f4ed0188320..2fb769294f7 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -302,8 +302,6 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) OBJ_RELEASE(uct_module->rdma_tl); } - ucs_async_context_destroy(uct_module->ucs_async); - OBJ_DESTRUCT(&uct_module->endpoint_lock); free(uct_module->md_name); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 8de7c45c97b..cad397625fb 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -156,6 
+156,10 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } } + if (tl->ucs_async) { + ucs_async_context_destroy(tl->ucs_async); + } + if (tl->uct_md) { OBJ_RELEASE(tl->uct_md); } @@ -291,7 +295,7 @@ static int mca_btl_uct_populate_tl_attr(mca_btl_uct_module_t *module, mca_btl_uc /* do the bare minimum to get tl attributes */ uct_worker_h uct_worker; - ucs_status = uct_worker_create(module->ucs_async, UCS_THREAD_MODE_SINGLE, &uct_worker); + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &uct_worker); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not create a UCT worker")); return OPAL_ERROR; @@ -364,7 +368,7 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the * various UCT calls. */ - ucs_status = uct_worker_create(module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { BTL_VERBOSE(("could not create a UCT worker")); mca_btl_uct_context_destroy(context); @@ -437,6 +441,13 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); + ucs_status_t ucs_status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD, &tl->ucs_async); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Could not create a UCT async context")); + OBJ_RELEASE(tl); + return NULL; + } + int rc = mca_btl_uct_populate_tl_attr(module, tl); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(tl); diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 00444a1e69b..44f0810b590 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ 
-338,6 +338,9 @@ struct mca_btl_uct_tl_t { /** interface attributes */ uct_iface_attr_t uct_iface_attr; + + /** async context */ + ucs_async_context_t *ucs_async; }; typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; From 16140a3fe1d6cef39c0384f5f762739a7c7d56c2 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 18:28:10 +0000 Subject: [PATCH 07/13] btl/uct: remove the need for a BTL module for connection TLs Connection TLs are only used to form connections for connect-to-endpoint TLs. They do not need to belong to the same memory domain as the one they are used with so there is no need to rely on a BTL module. This commit moves the pending_connection_reqs to the tl and changes the code to support a NULL module for the connection tl. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 3 --- opal/mca/btl/uct/btl_uct_component.c | 15 +++++++-------- opal/mca/btl/uct/btl_uct_device_context.h | 4 ++-- opal/mca/btl/uct/btl_uct_module.c | 3 +-- opal/mca/btl/uct/btl_uct_tl.c | 23 +++++++++++++---------- opal/mca/btl/uct/btl_uct_types.h | 3 +++ 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 1a2f5eee68e..cf799cd9b02 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -118,9 +118,6 @@ struct mca_btl_uct_module_t { /** frags that were waiting on connections that are now ready to send */ opal_list_t pending_frags; - - /** pending connection requests */ - opal_fifo_t pending_connection_reqs; }; typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 0cc529c3108..87d398d80c8 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -387,7 +387,6 @@ static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_b OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); OBJ_CONSTRUCT(&module->pending_frags, 
opal_list_t); OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&module->pending_connection_reqs, opal_fifo_t); module->md = md; module->md_name = strdup(md_name); @@ -798,19 +797,19 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } -static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *module) { +static int mca_btl_uct_component_progress_connections (mca_btl_uct_tl_t *conn_tl) { mca_btl_uct_pending_connection_request_t *request; int ret; - if (module->conn_tl == NULL) { + if (conn_tl == NULL) { return 0; } - ret = mca_btl_uct_tl_progress(module->conn_tl, 0); + ret = mca_btl_uct_tl_progress(conn_tl, 0); while (NULL != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { + &conn_tl->pending_connection_reqs))) { mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; BTL_VERBOSE(("processing connection request....")); if (conn_req->module_index >= mca_btl_uct_component.module_count) { @@ -819,7 +818,7 @@ static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *mod } int rc = mca_btl_uct_process_connection_request(mca_btl_uct_component.modules[conn_req->module_index], conn_req); if (rc != OPAL_SUCCESS) { - opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + opal_fifo_push_atomic(&conn_tl->pending_connection_reqs, &request->super); break; } OBJ_RELEASE(request); @@ -849,7 +848,7 @@ static int mca_btl_uct_component_progress(void) ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); } - mca_btl_uct_component_progress_connections (module); + mca_btl_uct_component_progress_connections (module->conn_tl); if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); @@ -857,7 +856,7 @@ static int mca_btl_uct_component_progress(void) } if (NULL != mca_btl_uct_component.conn_module) { - ret += 
mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module); + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module->conn_tl); } return (int) ret; diff --git a/opal/mca/btl/uct/btl_uct_device_context.h b/opal/mca/btl/uct/btl_uct_device_context.h index 7e25e0bef19..d264cc40610 100644 --- a/opal/mca/btl/uct/btl_uct_device_context.h +++ b/opal/mca/btl/uct/btl_uct_device_context.h @@ -94,14 +94,14 @@ mca_btl_uct_module_get_tl_context_specific(mca_btl_uct_module_t *module, mca_btl mca_btl_uct_device_context_t *context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { - OPAL_THREAD_LOCK(&module->lock); + OPAL_THREAD_LOCK(&tl->tl_lock); context = tl->uct_dev_contexts[context_id]; if (OPAL_UNLIKELY(NULL == context)) { context = tl->uct_dev_contexts[context_id] = mca_btl_uct_context_create(module, tl, context_id, true); } - OPAL_THREAD_UNLOCK(&module->lock); + OPAL_THREAD_UNLOCK(&tl->tl_lock); } return context; diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 2fb769294f7..be8648ab951 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. + * Copyright (c) 2020-2025 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -284,7 +284,6 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) OBJ_DESTRUCT(&uct_module->max_frags); OBJ_DESTRUCT(&uct_module->pending_frags); OBJ_DESTRUCT(&uct_module->lock); - OBJ_DESTRUCT(&uct_module->pending_connection_reqs); if (uct_module->rcache) { mca_rcache_base_module_destroy(uct_module->rcache); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index cad397625fb..4a01bd6b996 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -144,6 +144,7 @@ static void mca_btl_uct_tl_constructor(mca_btl_uct_tl_t *tl) { memset((void *) ((uintptr_t) tl + sizeof(tl->super)), 0, sizeof(*tl) - sizeof(tl->super)); OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t); + OBJ_CONSTRUCT(&tl->pending_connection_reqs, opal_fifo_t); } static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) @@ -172,6 +173,7 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) } OBJ_DESTRUCT(&tl->tl_lock); + OBJ_DESTRUCT(&tl->pending_connection_reqs); } OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, @@ -179,14 +181,14 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructo static ucs_status_t mca_btl_uct_conn_req_cb(void *arg, void *data, size_t length, unsigned flags) { - mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg; + mca_btl_uct_tl_t *tl = (mca_btl_uct_tl_t *) arg; mca_btl_uct_pending_connection_request_t *request = calloc(1, length + sizeof(request->super)); /* it is not safe to process the connection request from the callback so just save it for * later processing */ OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t); memcpy(&request->request_data, (void *) ((intptr_t) data + 8), length); - opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + opal_fifo_push_atomic(&tl->pending_connection_reqs, &request->super); return UCS_OK; } @@ -241,20 +243,21 @@ int 
mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, return OPAL_SUCCESS; } -static int mca_btl_uct_setup_connection_tl(mca_btl_uct_module_t *module) +static int mca_btl_uct_setup_connection_tl(mca_btl_uct_tl_t *tl) { ucs_status_t ucs_status; - if (NULL == module->conn_tl) { + if (NULL == tl) { return OPAL_ERR_NOT_SUPPORTED; } - mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_tl_context_specific(module, module->conn_tl, - /*context_id=*/0); + mca_btl_uct_device_context_t *context = + mca_btl_uct_module_get_tl_context_specific(/*module=*/NULL, tl, + /*context_id=*/0); ucs_status = uct_iface_set_am_handler(context->uct_iface, - MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, module, - UCT_CB_FLAG_ASYNC); + MCA_BTL_UCT_CONNECT_RDMA, mca_btl_uct_conn_req_cb, + tl, UCT_CB_FLAG_ASYNC); if (UCS_OK != ucs_status) { BTL_ERROR(("could not set active message handler for uct tl")); } @@ -383,7 +386,7 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m return NULL; } - if (tl == module->am_tl) { + if (module != NULL && tl == module->am_tl) { BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, context, MCA_BTL_UCT_CB_FLAG_SYNC); @@ -524,7 +527,7 @@ static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_ BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); module->conn_tl = tl; - rc = mca_btl_uct_setup_connection_tl(module); + rc = mca_btl_uct_setup_connection_tl(tl); if (OPAL_SUCCESS != rc) { return rc; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 44f0810b590..6bec24d86d0 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -341,6 +341,9 @@ struct mca_btl_uct_tl_t { /** async context */ ucs_async_context_t *ucs_async; + + /** pending connection requests */ + 
opal_fifo_t pending_connection_reqs; }; typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t; From ecdac957a0e7b0b6845110726ea016e1ebd9c495 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 19:51:39 +0000 Subject: [PATCH 08/13] btl/uct: move md_name to mca_btl_uct_md_t More cleanup preparing to separate connection-only tls from their module. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 3 --- opal/mca/btl/uct/btl_uct_component.c | 26 +++++++++++++------------- opal/mca/btl/uct/btl_uct_endpoint.c | 8 ++++---- opal/mca/btl/uct/btl_uct_module.c | 3 ++- opal/mca/btl/uct/btl_uct_tl.c | 2 +- opal/mca/btl/uct/btl_uct_types.h | 3 +++ 6 files changed, 23 insertions(+), 22 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index cf799cd9b02..2fd386b5106 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -98,9 +98,6 @@ struct mca_btl_uct_module_t { /** registration cache */ mca_rcache_base_module_t *rcache; - /** name of the memory domain backing this module */ - char *md_name; - /** am and rdma share endpoints */ bool shared_endpoints; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 87d398d80c8..093d1c396f2 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -258,7 +258,7 @@ static size_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) { - size_t modex_size = 4 + strlen(module->md_name) + 1; + size_t modex_size = 4 + strlen(module->md->md_name) + 1; if (module->rdma_tl) { modex_size += mca_btl_uct_tl_modex_size(module->rdma_tl); @@ -307,14 +307,14 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uc static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) { - size_t name_len = strlen(module->md_name); + size_t name_len = strlen(module->md->md_name); /* pack the size
*/ *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); modex_data += 4; - strcpy((char *) modex_data, module->md_name); + strcpy((char *) modex_data, module->md->md_name); modex_data += name_len + 1; if (module->rdma_tl) { @@ -366,7 +366,7 @@ static int mca_btl_uct_modex_send(void) return rc; } -static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_btl_uct_md_t *md, +static mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, size_t registration_size) { mca_btl_uct_module_t *module; @@ -389,7 +389,6 @@ static mca_btl_uct_module_t *mca_btl_uct_alloc_module(const char *md_name, mca_b OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); module->md = md; - module->md_name = strdup(md_name); module->super.btl_registration_handle_size = registration_size; return module; @@ -463,6 +462,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) } md = OBJ_NEW(mca_btl_uct_md_t); + md->md_name = strdup(md_desc->md_name); #if UCT_API >= UCT_VERSION(1, 7) ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); @@ -470,18 +470,18 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); return OPAL_ERR_NOT_AVAILABLE; } - ucs_status = uct_md_open(component, md_desc->md_name, uct_config, &md->uct_md); + ucs_status = uct_md_open(component, md->md_name, uct_config, &md->uct_md); if (UCS_OK != ucs_status) { BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); return OPAL_ERR_NOT_AVAILABLE; } #else - ucs_status = uct_md_config_read(md_desc->md_name, NULL, NULL, &uct_config); + ucs_status = uct_md_config_read(md->md_name, NULL, NULL, &uct_config); if (UCS_OK != ucs_status) { BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); return OPAL_ERR_NOT_AVAILABLE; } - ucs_status = 
uct_md_open(md_desc->md_name, uct_config, &md->uct_md); + ucs_status = uct_md_open(md->md_name, uct_config, &md->uct_md); if (UCS_OK != ucs_status) { BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); return OPAL_ERR_NOT_AVAILABLE; @@ -500,7 +500,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) return OPAL_ERR_NOT_AVAILABLE; } - module = mca_btl_uct_alloc_module(md_desc->md_name, md, md_attr.rkey_packed_size); + module = mca_btl_uct_alloc_module(md, md_attr.rkey_packed_size); if (NULL == module) { uct_release_tl_resource_list(tl_desc); return OPAL_ERR_OUT_OF_RESOURCE; @@ -517,7 +517,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) OBJ_RELEASE(md); if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { - BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); + BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md->md_name)); mca_btl_uct_finalize(&module->super); return OPAL_ERR_NOT_AVAILABLE; } @@ -534,7 +534,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable * performance benefits to using rcache/grdma instead of assuming UCT will do the right * thing. 
*/ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); + (void) opal_asprintf(&tmp, "uct.%s", md->md_name); rcache_resources.cache_name = tmp; rcache_resources.reg_data = (void *) module; @@ -553,7 +553,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) } } else { if (NULL == mca_btl_uct_component.conn_module) { - BTL_VERBOSE(("memory domain %s may be used for connections", md_desc->md_name)); + BTL_VERBOSE(("memory domain %s may be used for connections", md->md_name)); mca_btl_uct_component.conn_module = module; } else { mca_btl_uct_finalize(&module->super); @@ -635,7 +635,7 @@ static void mca_btl_uct_component_validate_modules(void) { && NULL == module->conn_tl) { /* module can not be used */ BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", - module->md_name)); + module->md->md_name)); mca_btl_uct_finalize (&mca_btl_uct_component.modules[i]->super); mca_btl_uct_component.modules[i] = NULL; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 843dccec523..b9ae99eb687 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -410,11 +410,11 @@ static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_mod for (int i = 0; i < modex->module_count; ++i) { uint32_t modex_size = *((uint32_t *) modex_data); - BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md->md_name)); modex_data += 4; - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { + if (0 != strcmp((char *) modex_data, uct_btl->md->md_name)) { /* modex belongs to a different module, skip it and continue */ modex_data += modex_size - 4; continue; @@ -427,12 +427,12 @@ static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_mod *remote_module_index = i; } - BTL_VERBOSE(("finished 
processing modex for %s", uct_btl->md_name)); + BTL_VERBOSE(("finished processing modex for %s", uct_btl->md->md_name)); return OPAL_SUCCESS; } - BTL_ERROR(("could not find modex for %s", uct_btl->md_name)); + BTL_ERROR(("could not find modex for %s", uct_btl->md->md_name)); return OPAL_ERR_NOT_FOUND; } diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index be8648ab951..ac84ed9b49a 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -303,7 +303,6 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) OBJ_DESTRUCT(&uct_module->endpoint_lock); - free(uct_module->md_name); free(uct_module); return OPAL_SUCCESS; @@ -357,10 +356,12 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); static void mca_btl_uct_md_construct(mca_btl_uct_md_t *md) { md->uct_md = NULL; + md->md_name = NULL; } static void mca_btl_uct_md_destruct(mca_btl_uct_md_t *md) { + free(md->md_name); if (md->uct_md) { uct_md_close(md->uct_md); md->uct_md = NULL; diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 4a01bd6b996..05926076e0a 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -457,7 +457,7 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca return NULL; } - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, + BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md->md_name, tl_desc->tl_name, (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 6bec24d86d0..f592ff0a16a 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -67,6 +67,9 @@ struct mca_btl_uct_md_t { /** make this an opal object */ opal_object_t super; + /** name of the memory domain backing this module */ + char *md_name; + /** UCT memory domain handle */ uct_md_h uct_md; }; From 
83768b784bd6e8f7e6e51888b4ee6d4466c2e006 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 19:52:43 +0000 Subject: [PATCH 09/13] btl/uct: put active tls in a list on the mca_btl_uct_md_t structure This simplifies the code a bit by moving mca_btl_uct_tl_t ownership to the mca_btl_uct_md_t class. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_component.c | 13 ++++------ opal/mca/btl/uct/btl_uct_module.c | 16 +++--------- opal/mca/btl/uct/btl_uct_tl.c | 37 +++++++++++----------------- opal/mca/btl/uct/btl_uct_types.h | 5 +++- 4 files changed, 28 insertions(+), 43 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 093d1c396f2..5eca31352f1 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -500,8 +500,10 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) return OPAL_ERR_NOT_AVAILABLE; } + /* module will take ownership of the md */ module = mca_btl_uct_alloc_module(md, md_attr.rkey_packed_size); if (NULL == module) { + OBJ_RELEASE(md); uct_release_tl_resource_list(tl_desc); return OPAL_ERR_OUT_OF_RESOURCE; } @@ -512,10 +514,6 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) uct_release_tl_resource_list(tl_desc); - /* release the initial reference to the md object. if any modules were created the UCT md will - * remain open until those modules are finalized. */ - OBJ_RELEASE(md); - if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md->md_name)); mca_btl_uct_finalize(&module->super); @@ -842,10 +840,9 @@ static int mca_btl_uct_component_progress(void) /* unlike ucp, uct actually tells us something useful! its almost like it was "inspired" * by the btl progress functions.... 
*/ - ret += mca_btl_uct_tl_progress(module->rdma_tl, starting_index); - - if (module->am_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); + mca_btl_uct_tl_t *tl; + OPAL_LIST_FOREACH(tl, &module->md->tls, mca_btl_uct_tl_t) { + ret += mca_btl_uct_tl_progress(tl, starting_index); } mca_btl_uct_component_progress_connections (module->conn_tl); diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index ac84ed9b49a..999316e0418 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -289,19 +289,8 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl) mca_rcache_base_module_destroy(uct_module->rcache); } - if (NULL != uct_module->am_tl) { - OBJ_RELEASE(uct_module->am_tl); - } - - if (NULL != uct_module->conn_tl) { - OBJ_RELEASE(uct_module->conn_tl); - } - - if (NULL != uct_module->rdma_tl) { - OBJ_RELEASE(uct_module->rdma_tl); - } - OBJ_DESTRUCT(&uct_module->endpoint_lock); + OBJ_RELEASE(uct_module->md); free(uct_module); @@ -357,10 +346,13 @@ static void mca_btl_uct_md_construct(mca_btl_uct_md_t *md) { md->uct_md = NULL; md->md_name = NULL; + OBJ_CONSTRUCT(&md->tls, opal_list_t); } static void mca_btl_uct_md_destruct(mca_btl_uct_md_t *md) { + OPAL_LIST_DESTRUCT(&md->tls); + free(md->md_name); if (md->uct_md) { uct_md_close(md->uct_md); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 05926076e0a..d94f4decee2 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -161,10 +161,6 @@ static void mca_btl_uct_tl_destructor(mca_btl_uct_tl_t *tl) ucs_async_context_destroy(tl->ucs_async); } - if (tl->uct_md) { - OBJ_RELEASE(tl->uct_md); - } - free(tl->uct_tl_name); free(tl->uct_dev_name); @@ -436,7 +432,6 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca /* initialize btl tl structure */ tl->uct_md = md; - OBJ_RETAIN(md); tl->uct_tl_name = strdup(tl_desc->tl_name); tl->uct_dev_name = 
strdup(tl_desc->dev_name); @@ -457,7 +452,7 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca return NULL; } - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md->md_name, tl_desc->tl_name, + BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", md->md_name, tl_desc->tl_name, (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; @@ -489,7 +484,6 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl .cap.put.max_bcopy; module->rdma_tl = tl; - OBJ_RETAIN(tl); tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; module->comm_tls[tl->tl_index] = tl; @@ -502,7 +496,6 @@ static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t { BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); module->am_tl = tl; - OBJ_RETAIN(tl); tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 1 : 0; module->comm_tls[tl->tl_index] = tl; @@ -532,8 +525,6 @@ static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_ return rc; } - OBJ_RETAIN(tl); - if (!tl->max_device_contexts) { /* if a tl is only being used to create connections do not bother with multiple * contexts */ @@ -591,9 +582,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, bool evaluate_for_conn_only) { mca_btl_uct_tl_t *tl; - opal_list_t tl_list; - - OBJ_CONSTRUCT(&tl_list, opal_list_t); for (unsigned i = 0; i < tl_count; ++i) { int priority = 0; @@ -623,28 +611,27 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); int rc = mca_btl_uct_set_tl_conn(module, tl); - OBJ_RELEASE(tl); if (OPAL_SUCCESS == rc) { + opal_list_append(&md->tls, &tl->super); return OPAL_SUCCESS; } BTL_VERBOSE(("tl %s cannot be used for forming connections", tl_descs[i].tl_name)); } else { - 
opal_list_append(&tl_list, &tl->super); + opal_list_append(&md->tls, &tl->super); } } } - if (0 == opal_list_get_size(&tl_list)) { + if (0 == opal_list_get_size(&md->tls)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); - OBJ_DESTRUCT(&tl_list); return OPAL_ERR_NOT_AVAILABLE; } - opal_list_sort(&tl_list, tl_compare); + opal_list_sort(&md->tls, tl_compare); - OPAL_LIST_FOREACH (tl, &tl_list, mca_btl_uct_tl_t) { + OPAL_LIST_FOREACH (tl, &md->tls, mca_btl_uct_tl_t) { mca_btl_uct_evaluate_tl(module, tl); if (NULL != module->am_tl && NULL != module->rdma_tl && (NULL != module->conn_tl @@ -676,15 +663,21 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, module->super.btl_free = NULL; } - OPAL_LIST_DESTRUCT(&tl_list); - if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl)) && !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) && module->conn_tl) { /* no connection tl needed for selected transports */ - OBJ_RELEASE(module->conn_tl); module->conn_tl = NULL; } + /* clear out unused tls */ + mca_btl_uct_tl_t *next; + OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) { + if (tl != module->conn_tl && tl != module->rdma_tl && tl != module->am_tl) { + opal_list_remove_item(&md->tls, &tl->super); + OBJ_RELEASE(tl); + } + } + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index f592ff0a16a..534b063b280 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -70,6 +70,9 @@ struct mca_btl_uct_md_t { /** name of the memory domain backing this module */ char *md_name; + /** list of mca_btl_uct_tl_t's for this memory domain */ + opal_list_t tls; + /** UCT memory domain handle */ uct_md_h uct_md; }; @@ -314,7 +317,7 @@ struct mca_btl_uct_tl_t { /** relative priority 0 == highest */ int priority; - /** memory domain associated with this tl */ + /** 
memory domain associated with this tl (no reference) */ mca_btl_uct_md_t *uct_md; /** lock protecting tl structures */ From b593d42a27db88192deacc2e86c40dcd35fc9f4a Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 22:44:34 +0000 Subject: [PATCH 10/13] btl/uct: keep UCT component list around after initial scan From what I can tell it is not ok to be releasing this list while the components within the list are still in use. To be safe keep the list around until the uct component is closed. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 5 +++++ opal/mca/btl/uct/btl_uct_component.c | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 2fd386b5106..16bee1195ec 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -162,6 +162,11 @@ struct mca_btl_uct_component_t { /** alternate connection-only module that can be used if no suitable * connection tl is found. this is usually a tcp tl. 
*/ mca_btl_uct_module_t *conn_module; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h *uct_components; + unsigned num_uct_components; +#endif }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 5eca31352f1..d8ae0a2c492 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -239,6 +239,14 @@ static int mca_btl_uct_component_close(void) mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); +#if UCT_API >= UCT_VERSION(1, 7) + if (NULL != mca_btl_uct_component.uct_components) { + uct_release_component_list(mca_btl_uct_component.uct_components); + mca_btl_uct_component.uct_components = NULL; + mca_btl_uct_component.num_uct_components = 0; + } +#endif + return OPAL_SUCCESS; } @@ -690,25 +698,20 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, mca_btl_uct_component.module_count = 0; #if UCT_API >= UCT_VERSION(1, 7) - uct_component_h *components; - unsigned num_components; - - ucs_status = uct_query_components(&components, &num_components); + ucs_status = uct_query_components(&mca_btl_uct_component.uct_components, + &mca_btl_uct_component.num_uct_components); if (UCS_OK != ucs_status) { BTL_ERROR(("could not query UCT components")); return NULL; } /* generate all suitable btl modules */ - for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i]); + for (unsigned i = 0; i < mca_btl_uct_component.num_uct_components; ++i) { + rc = mca_btl_uct_component_process_uct_component(mca_btl_uct_component.uct_components[i]); if (OPAL_SUCCESS != rc) { break; } } - - uct_release_component_list(components); - #else /* UCT 1.6 and older */ uct_md_resource_desc_t *resources; unsigned resource_count; From 34ae4a8131cd48ee3751223aa25d69cb38ac6430 
Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 22:53:35 +0000 Subject: [PATCH 11/13] btl/uct: move uct_component from the module to mca_uct_md_t Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 4 ---- opal/mca/btl/uct/btl_uct_component.c | 7 +++---- opal/mca/btl/uct/btl_uct_module.c | 1 + opal/mca/btl/uct/btl_uct_rdma.h | 4 ++-- opal/mca/btl/uct/btl_uct_types.h | 4 ++++ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 16bee1195ec..f657a6fddbd 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -91,10 +91,6 @@ struct mca_btl_uct_module_t { /** array containing the am_tl and rdma_tl */ mca_btl_uct_tl_t *comm_tls[2]; -#if UCT_API >= UCT_VERSION(1, 7) - uct_component_h uct_component; -#endif - /** registration cache */ mca_rcache_base_module_t *rcache; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index d8ae0a2c492..98f56d0d05d 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -471,6 +471,9 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) md = OBJ_NEW(mca_btl_uct_md_t); md->md_name = strdup(md_desc->md_name); +#if UCT_API >= UCT_VERSION(1, 7) + md->uct_component = component; +#endif #if UCT_API >= UCT_VERSION(1, 7) ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); @@ -528,10 +531,6 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) return OPAL_ERR_NOT_AVAILABLE; } -#if UCT_API >= UCT_VERSION(1, 7) - module->uct_component = component; -#endif - if (!consider_for_connection_module) { module->module_index = mca_btl_uct_component.module_count; diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 999316e0418..4201a54b812 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -344,6 +344,7 @@ 
OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); static void mca_btl_uct_md_construct(mca_btl_uct_md_t *md) { + md->uct_component = NULL; md->uct_md = NULL; md->md_name = NULL; OBJ_CONSTRUCT(&md->tls, opal_list_t); diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h index 0438106b2c8..481be991b4d 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -53,7 +53,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, } # if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_rkey_unpack(module->uct_component, (void *) remote_handle, rkey); + ucs_status = uct_rkey_unpack(module->md->uct_component, (void *) remote_handle, rkey); # else ucs_status = uct_rkey_unpack((void *) remote_handle, rkey); # endif @@ -63,7 +63,7 @@ static inline int mca_btl_uct_get_rkey(mca_btl_uct_module_t *module, static inline void mca_btl_uct_rkey_release(mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey) { # if UCT_API >= UCT_VERSION(1, 7) - uct_rkey_release(uct_btl->uct_component, rkey); + uct_rkey_release(uct_btl->md->uct_component, rkey); # else (void) uct_btl; uct_rkey_release(rkey); diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 534b063b280..9b3f3cccbb3 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -75,6 +75,10 @@ struct mca_btl_uct_md_t { /** UCT memory domain handle */ uct_md_h uct_md; + +#if UCT_API >= UCT_VERSION(1, 7) + uct_component_h uct_component; +#endif }; typedef struct mca_btl_uct_md_t mca_btl_uct_md_t; From 5ed3871bde29322cbb0b6561122a7fe7dfb964e7 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 23 Apr 2025 23:03:19 +0000 Subject: [PATCH 12/13] btl/uct: complete rework of discovery and initialization code There is an issue with btl/uct which prevents the usage of the standard btl_uct_ MCA variables (eager limit, flags, etc). 
Because of the way the btl was written, these values are all determined directly from UCT and cannot be changed using the MCA variable interface. To address this issue, this commit breaks apart the initialization code and separates out the pieces that are necessary for discovery only. The discovery pieces now use a new set of variables that include the memory domain name and directly control the behavior for BTLs on that memory domain, as well as enabling the usage of the btl_uct variable to control the defaults for these variables. For example, using memory domain irdma0 will create the variables btl_uct_irdma0_eager_limit, btl_uct_irdma0_max_send_size, etc. The defaults will be based on what is reported by UCT, and the user can set the values to a subset of what UCT reports. For example, if the max send size for the hardware is 8192B, then it can be set to anything up to and including that value. The same is true for feature flags: if the hardware supports only some btl atomics or operations, the user can specify a subset of them (others will be ignored). 
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/Makefile.am | 29 +- opal/mca/btl/uct/btl_uct.h | 48 +- opal/mca/btl/uct/btl_uct_am.c | 6 +- opal/mca/btl/uct/btl_uct_component.c | 582 +++--------------------- opal/mca/btl/uct/btl_uct_discover.c | 508 +++++++++++++++++++++ opal/mca/btl/uct/btl_uct_discover.h | 43 ++ opal/mca/btl/uct/btl_uct_endpoint.c | 133 +----- opal/mca/btl/uct/btl_uct_include_list.c | 78 ++++ opal/mca/btl/uct/btl_uct_include_list.h | 34 ++ opal/mca/btl/uct/btl_uct_modex.c | 198 ++++++++ opal/mca/btl/uct/btl_uct_modex.h | 20 + opal/mca/btl/uct/btl_uct_module.c | 59 ++- opal/mca/btl/uct/btl_uct_tl.c | 181 ++------ opal/mca/btl/uct/btl_uct_types.h | 36 +- 14 files changed, 1157 insertions(+), 798 deletions(-) create mode 100644 opal/mca/btl/uct/btl_uct_discover.c create mode 100644 opal/mca/btl/uct/btl_uct_discover.h create mode 100644 opal/mca/btl/uct/btl_uct_include_list.c create mode 100644 opal/mca/btl/uct/btl_uct_include_list.h create mode 100644 opal/mca/btl/uct/btl_uct_modex.c create mode 100644 opal/mca/btl/uct/btl_uct_modex.h diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am index df548cc66ff..92e5ab070d9 100644 --- a/opal/mca/btl/uct/Makefile.am +++ b/opal/mca/btl/uct/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2017 IBM Corporation. All rights reserved. # Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights # reserved. +# Copyright (c) 2025 Google, LLC. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -24,22 +25,30 @@ AM_CPPFLAGS = $(btl_uct_CPPFLAGS) amca_paramdir = $(AMCA_PARAM_SETS_DIR) -sources = \ +headers = \ btl_uct.h \ + btl_uct_rdma.h \ + btl_uct_endpoint.h \ + btl_uct_am.h \ + btl_uct_frag.h \ + btl_uct_types.h \ + btl_uct_device_context.h \ + btl_uct_discover.h \ + btl_uct_modex.h \ + btl_uct_include_list.h + +sources = \ btl_uct_module.c \ btl_uct_component.c \ - btl_uct_rdma.h \ btl_uct_rdma.c \ - btl_uct_endpoint.h \ btl_uct_endpoint.c \ btl_uct_amo.c \ - btl_uct_am.h \ btl_uct_am.c \ - btl_uct_frag.h \ btl_uct_frag.c \ btl_uct_tl.c \ - btl_uct_types.h \ - btl_uct_device_context.h + btl_uct_discover.c \ + btl_uct_modex.c \ + btl_uct_include_list.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la @@ -50,20 +59,22 @@ lib = lib_sources = component = mca_btl_uct.la component_sources = $(sources) +component_headers = $(headers) else lib = libmca_btl_uct.la lib_sources = $(sources) +lib_headers = ${headers} component = component_sources = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component) -mca_btl_uct_la_SOURCES = $(component_sources) +mca_btl_uct_la_SOURCES = $(component_sources) $(component_headers) mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la $(btl_uct_LIBS) noinst_LTLIBRARIES = $(lib) -libmca_btl_uct_la_SOURCES = $(lib_sources) +libmca_btl_uct_la_SOURCES = $(lib_sources) $(lib_headers) libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index f657a6fddbd..20b40783d46 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -85,12 +85,6 @@ struct mca_btl_uct_module_t { /** transport for RDMA/AMOs */ mca_btl_uct_tl_t *rdma_tl; - /** transport for forming connections (if needed) */ - 
mca_btl_uct_tl_t *conn_tl; - - /** array containing the am_tl and rdma_tl */ - mca_btl_uct_tl_t *comm_tls[2]; - /** registration cache */ mca_rcache_base_module_t *rcache; @@ -111,6 +105,10 @@ struct mca_btl_uct_module_t { /** frags that were waiting on connections that are now ready to send */ opal_list_t pending_frags; + + /** allowed transports */ + char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; }; typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; @@ -123,6 +121,9 @@ struct mca_btl_uct_component_t { /** base BTL component */ mca_btl_base_component_3_0_0_t super; + /** whether the component is initialized. controls cleanup. */ + bool initialized; + /** number of TL modules */ int module_count; @@ -135,7 +136,6 @@ struct mca_btl_uct_component_t { /** allowed transports */ char *allowed_transports; - mca_btl_uct_include_list_t allowed_transport_list; /** transports to consider for forming connections */ char *connection_domains; @@ -155,14 +155,16 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; - /** alternate connection-only module that can be used if no suitable - * connection tl is found. this is usually a tcp tl. */ - mca_btl_uct_module_t *conn_module; - #if UCT_API >= UCT_VERSION(1, 7) uct_component_h *uct_components; unsigned num_uct_components; #endif + + /** list of memory domains (btl_uct_md_t) */ + opal_list_t md_list; + + /** connection transport (if needed). 
reference is owned by conn_md */ + mca_btl_uct_tl_t *conn_tl; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -298,12 +300,16 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module, opal_proc_t *proc); -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count, - bool evaluate_for_conn_only); +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); +mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, + size_t registration_size); + +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl); +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl); + /** * @brief Checks if a tl is suitable for using for RDMA * @@ -344,18 +350,12 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) */ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) { + if (NULL == tl) { + return false; + } + return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } -/** - * @brief Find the rank of `name` in the include list `list`. - * - * @param[in] name name to find - * @param[in] list list to search - * - * A negative result means the name is not present or the list is negated. 
- */ -int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); - END_C_DECLS #endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index a0d50836e47..68e82329137 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -233,7 +233,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t } OPAL_THREAD_LOCK(&uct_btl->lock); - mca_btl_uct_append_pending_frag(uct_btl, frag, context, true); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/true); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; @@ -260,14 +260,14 @@ int mca_btl_uct_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi OPAL_THREAD_LOCK(&uct_btl->lock); /* check one more time in case another thread is completing the connection now */ if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am(uct_btl, endpoint, context, &ep_handle)) { - mca_btl_uct_append_pending_frag(uct_btl, frag, context, false); + mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/false); OPAL_THREAD_UNLOCK(&uct_btl->lock); return OPAL_SUCCESS; } OPAL_THREAD_UNLOCK(&uct_btl->lock); } - return mca_btl_uct_send_frag(uct_btl, frag, true); + return mca_btl_uct_send_frag(uct_btl, frag, /*append=*/true); } struct mca_btl_uct_sendi_pack_args_t { diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 98f56d0d05d..3392387a707 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -27,10 +27,10 @@ * $HEADER$ */ -#include - #include "opal_config.h" +#include "btl_uct_discover.h" +#include "btl_uct_modex.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/btl.h" #include "opal/mca/hwloc/base/base.h" @@ -46,21 +46,47 @@ #include "btl_uct_am.h" #include "btl_uct_device_context.h" -static int mca_btl_uct_component_register(void) +static void mca_btl_uct_cleanup(void) { - mca_btl_uct_module_t *module = 
&mca_btl_uct_module_template; + if (mca_btl_uct_component.initialized) { + return; + } + + BTL_VERBOSE(("in UCT btl cleanup")); + + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (NULL != mca_btl_uct_component.modules[i]) { + (void) mca_btl_uct_finalize(&mca_btl_uct_component.modules[i]->super); + } + } + + OBJ_DESTRUCT(&mca_btl_uct_component.memory_domain_list); + OBJ_DESTRUCT(&mca_btl_uct_component.connection_domain_list); + + OPAL_LIST_DESTRUCT(&mca_btl_uct_component.md_list); + +#if UCT_API >= UCT_VERSION(1, 7) + if (NULL != mca_btl_uct_component.uct_components) { + uct_release_component_list(mca_btl_uct_component.uct_components); + mca_btl_uct_component.uct_components = NULL; + mca_btl_uct_component.num_uct_components = 0; + } +#endif +} - mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4"; +static int mca_btl_uct_component_register(void) +{ + mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4,irdma0"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "memory_domains", "Comma-delimited list of memory domains of the form " "to use for communication. Memory domains MUST provide transports that " "support put, get, and amos. Special values: all (all available), none." 
- " (default: mlx5_0,mlx4_0,rocep0s4)", + " (default: mlx5_0,mlx4_0,rocep0s4,irdma0)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.memory_domains); - mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any"; + mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,rc_verbs,ud,ud_verbs,ugni_rdma,ugni_smsg,any"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "transports", "Comma-delimited list of transports to use sorted by increasing " @@ -124,63 +150,28 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout); - /* for now we want this component to lose to btl/ugni and btl/vader */ - module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1; - - return mca_btl_base_param_register(&mca_btl_uct_component.super.btl_version, &module->super); -} - -static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) -{ - ucm_vm_munmap(buf, length); -} - -static void mca_btl_uct_component_parse_include_list (const char *value, mca_btl_uct_include_list_t *list) { - list->list = NULL; - list->include = true; + OBJ_CONSTRUCT(&mca_btl_uct_component.md_list, opal_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.memory_domain_list, mca_btl_uct_include_list_t); + OBJ_CONSTRUCT(&mca_btl_uct_component.connection_domain_list, mca_btl_uct_include_list_t); - if (value == NULL) { - return; + int rc = mca_btl_uct_component_discover_mds(); + if (OPAL_SUCCESS != rc) { + return rc; } - if (value[0] == '^') { - list->include = false; - value++; + rc = mca_btl_uct_component_generate_modules(&mca_btl_uct_component.md_list); + if (OPAL_SUCCESS != rc) { + return rc; } - list->list = opal_argv_split(value, ','); -} + 
opal_finalize_register_cleanup(mca_btl_uct_cleanup); -static void mca_btl_uct_include_list_free (mca_btl_uct_include_list_t *list) { - opal_argv_free (list->list); - list->list = NULL; + return OPAL_SUCCESS; } -int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { - if (list->list == NULL) { - return -1; - } - - for (int i = 0; list->list[i]; ++i) { - regex_t preg; - - BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i])); - int rc = regcomp(&preg, list->list[i], REG_ICASE); - if (0 != rc) { - char errbuf[256]; - regerror(rc, &preg, errbuf, sizeof(errbuf)); - BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf)); - continue; - } - - int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0); - regfree(&preg); - if (0 == result) { - return list->include ? i + 1 : -(i + 1); - } - } - - return list->include ? -1 : 1; +static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) +{ + ucm_vm_munmap(buf, length); } static int mca_btl_uct_component_open(void) @@ -226,182 +217,19 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { - if (NULL != mca_btl_uct_component.conn_module) { - mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); - mca_btl_uct_component.conn_module = NULL; - } + mca_btl_uct_component.conn_tl = NULL; if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } - mca_btl_uct_include_list_free (&mca_btl_uct_component.memory_domain_list); - mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); - mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); - -#if UCT_API >= UCT_VERSION(1, 7) - if (NULL != mca_btl_uct_component.uct_components) { - 
uct_release_component_list(mca_btl_uct_component.uct_components); - mca_btl_uct_component.uct_components = NULL; - mca_btl_uct_component.num_uct_components = 0; - } -#endif + /* complete delayed cleanup */ + mca_btl_uct_component.initialized = false; + mca_btl_uct_cleanup(); return OPAL_SUCCESS; } -static size_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl) -{ - const size_t size = strlen(tl->uct_tl_name) + 1; - - if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - /* pad out to a multiple of 4 bytes */ - return (4 + 3 + size + tl->uct_iface_attr.device_addr_len - + tl->uct_iface_attr.iface_addr_len) - & ~3; - } - - return (4 + 3 + size + tl->uct_iface_attr.device_addr_len) & ~3; -} - -static size_t mca_btl_uct_module_modex_size(mca_btl_uct_module_t *module) -{ - size_t modex_size = 4 + strlen(module->md->md_name) + 1; - - if (module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->rdma_tl); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->am_tl); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { - modex_size += mca_btl_uct_tl_modex_size(module->conn_tl); - } - - return modex_size; -} - -static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, - uint8_t *modex_data) -{ - mca_btl_uct_device_context_t *dev_context = - mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0); - size_t modex_size = mca_btl_uct_tl_modex_size(tl); - - *((uint32_t *) modex_data) = (uint32_t) modex_size; - modex_data += 4; - - strcpy((char *) modex_data, tl->uct_tl_name); - modex_data += strlen(tl->uct_tl_name) + 1; - - /* NTH: only the first context is available. i assume the device addresses of the - * contexts will be the same but they will have different iface addresses. 
i also - * am assuming that it doesn't really matter if all remote contexts connect to - * the same endpoint since we are only doing RDMA. if any of these assumptions are - * wrong then we can't delay creating the other contexts and must include their - * information in the modex. */ - if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { - uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) modex_data); - modex_data += tl->uct_iface_attr.iface_addr_len; - } - - uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) modex_data); - modex_data += tl->uct_iface_attr.device_addr_len; - - return modex_size; -} - -static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) -{ - size_t name_len = strlen(module->md->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module, module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module, module->am_tl, modex_data); - } - - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module, module->conn_tl, modex_data); - } - - return modex_data; -} - -static int mca_btl_uct_modex_send(void) -{ - size_t modex_size = sizeof(mca_btl_uct_modex_t); - mca_btl_uct_modex_t *modex; - uint8_t *modex_data; - int rc; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); - } - - if (mca_btl_uct_component.conn_module != NULL) { - modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.conn_module); - } - - modex = alloca(modex_size); - modex_data = modex->data; - 
- modex->module_count = mca_btl_uct_component.module_count; - - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.modules[i], modex_data); - } - - if (mca_btl_uct_component.conn_module != NULL) { - ++modex->module_count; - modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.conn_module, modex_data); - } - - OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); - return rc; -} - -static mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md, - size_t registration_size) -{ - mca_btl_uct_module_t *module; - ucs_status_t ucs_status; - - module = malloc(sizeof(*module)); - if (NULL == module) { - return NULL; - } - - /* copy the module template */ - *module = mca_btl_uct_module_template; - - OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); - OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); - OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); - OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t); - - module->md = md; - module->super.btl_registration_handle_size = registration_size; - - return module; -} - ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsigned flags) { mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; @@ -425,241 +253,6 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign return UCS_OK; } -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc) -#else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) -#endif -{ - mca_rcache_base_resources_t rcache_resources; - uct_tl_resource_desc_t *tl_desc; - mca_btl_uct_module_t *module; - 
uct_md_config_t *uct_config; - uct_md_attr_t md_attr; - mca_btl_uct_md_t *md; - int list_rank; - unsigned num_tls; - char *tmp; - ucs_status_t ucs_status; - int connection_list_rank = -1; - bool consider_for_connection_module = false; - - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); - - if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { - BTL_VERBOSE(("created the maximum number of allowable modules")); - return OPAL_ERR_NOT_AVAILABLE; - } - - BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); - list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); - - if (list_rank < 0) { - BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); - connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); - - if (connection_list_rank < 0) { - /* nothing to do */ - BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); - return OPAL_SUCCESS; - } - - BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); - consider_for_connection_module = true; - } - - md = OBJ_NEW(mca_btl_uct_md_t); - md->md_name = strdup(md_desc->md_name); -#if UCT_API >= UCT_VERSION(1, 7) - md->uct_component = component; -#endif - -#if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_md_config_read(component, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(component, md->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#else - ucs_status = uct_md_config_read(md->md_name, NULL, NULL, &uct_config); - if (UCS_OK != ucs_status) { - 
BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_open(md->md_name, uct_config, &md->uct_md); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } -#endif - uct_config_release(uct_config); - - ucs_status = uct_md_query(md->uct_md, &md_attr); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - ucs_status = uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); - if (UCS_OK != ucs_status) { - BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); - return OPAL_ERR_NOT_AVAILABLE; - } - - /* module will take ownership of the md */ - module = mca_btl_uct_alloc_module(md, md_attr.rkey_packed_size); - if (NULL == module) { - OBJ_RELEASE(md); - uct_release_tl_resource_list(tl_desc); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* if this module is not to be used for communication check if it has a transport suitable - * for forming connections. 
*/ - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls, consider_for_connection_module); - - uct_release_tl_resource_list(tl_desc); - - if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { - BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md->md_name)); - mca_btl_uct_finalize(&module->super); - return OPAL_ERR_NOT_AVAILABLE; - } - - if (!consider_for_connection_module) { - module->module_index = mca_btl_uct_component.module_count; - - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. */ - (void) opal_asprintf(&tmp, "uct.%s", md->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; - } - } else { - if (NULL == mca_btl_uct_component.conn_module) { - BTL_VERBOSE(("memory domain %s may be used for connections", md->md_name)); - mca_btl_uct_component.conn_module = module; - } else { - mca_btl_uct_finalize(&module->super); - } - } - - return OPAL_SUCCESS; -} - -#if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_component(uct_component_h component) -{ - uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME - | 
UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; - ucs_status_t ucs_status; - int rc; - - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - BTL_VERBOSE(("processing uct component %s", attr.name)); - - attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); - attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; - ucs_status = uct_component_query(component, &attr); - if (UCS_OK != ucs_status) { - return OPAL_ERROR; - } - - for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); - if (OPAL_SUCCESS != rc) { - break; - } - } - - free(attr.md_resources); - - return OPAL_SUCCESS; -} -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - -static void mca_btl_uct_component_validate_modules(void) { - if (mca_btl_uct_component.conn_module != NULL) { - /* verify that a connection-only module is required. this might be the case in some systems - * where rc verbs is avaiable but ud is not. 
*/ - bool need_conn_module = false; - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - if (module->conn_tl != NULL) { - continue; - } - if ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || - (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) { - need_conn_module = true; - break; - } - } - - if (!need_conn_module) { - mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); - mca_btl_uct_component.conn_module = NULL; - } - } else { - int usable_module_count = mca_btl_uct_component.module_count; - - /* check that all modules can be used */ - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - if (NULL != module->conn_tl) { - /* module has its own connection transport */ - continue; - } - - if (((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || - (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) - && NULL == module->conn_tl) { - /* module can not be used */ - BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", - module->md->md_name)); - mca_btl_uct_finalize (&mca_btl_uct_component.modules[i]->super); - mca_btl_uct_component.modules[i] = NULL; - } - } - - /* remove holes in the module array */ - if (usable_module_count < mca_btl_uct_component.module_count) { - for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { - if (mca_btl_uct_component.modules[i] == NULL) { - for (int j = i ; j < mca_btl_uct_component.module_count ; ++j) { - mca_btl_uct_component.modules[i++] = mca_btl_uct_component.modules[j]; - } - } - } - mca_btl_uct_component.module_count = usable_module_count; - } - } -} - /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -675,7 +268,6 @@ static 
mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ struct mca_btl_base_module_t **base_modules; - ucs_status_t ucs_status; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -687,52 +279,25 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - mca_btl_uct_component_parse_include_list(mca_btl_uct_component.memory_domains, - &mca_btl_uct_component.memory_domain_list); - mca_btl_uct_component_parse_include_list(mca_btl_uct_component.allowed_transports, - &mca_btl_uct_component.allowed_transport_list); - mca_btl_uct_component_parse_include_list(mca_btl_uct_component.connection_domains, - &mca_btl_uct_component.connection_domain_list); - - mca_btl_uct_component.module_count = 0; - -#if UCT_API >= UCT_VERSION(1, 7) - ucs_status = uct_query_components(&mca_btl_uct_component.uct_components, - &mca_btl_uct_component.num_uct_components); - if (UCS_OK != ucs_status) { - BTL_ERROR(("could not query UCT components")); + rc = mca_btl_uct_enable_modules(mca_btl_uct_component.modules, mca_btl_uct_component.module_count); + if (OPAL_SUCCESS != rc) { return NULL; } - /* generate all suitable btl modules */ - for (unsigned i = 0; i < mca_btl_uct_component.num_uct_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(mca_btl_uct_component.uct_components[i]); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_maybe_setup_conn_tl(); + if (OPAL_SUCCESS != rc && OPAL_ERR_NOT_FOUND != rc) { + return NULL; } -#else /* UCT 1.6 and older */ - uct_md_resource_desc_t *resources; - unsigned resource_count; - - uct_query_md_resources(&resources, &resource_count); - /* generate all suitable btl modules */ - for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i); - if (OPAL_SUCCESS != rc) { - break; - } + rc = mca_btl_uct_component_filter_mds(); + if 
(OPAL_SUCCESS != rc) { + return NULL; } - uct_release_md_resource_list(resources); - -#endif /* UCT_API >= UCT_VERSION(1, 7) */ - - /* filter out unusable modules before sending the modex */ - mca_btl_uct_component_validate_modules(); - - mca_btl_uct_modex_send(); + rc = mca_btl_uct_component_modex_send(); + if (OPAL_SUCCESS != rc) { + return NULL; + } /* pass module array back to caller */ base_modules = calloc(mca_btl_uct_component.module_count, sizeof(*base_modules)); @@ -743,6 +308,8 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, memcpy(base_modules, mca_btl_uct_component.modules, mca_btl_uct_component.module_count * sizeof(mca_btl_uct_component.modules[0])); + mca_btl_uct_component.initialized = true; + *num_btl_modules = mca_btl_uct_component.module_count; BTL_VERBOSE(("uct btl initialization complete. found %d suitable memory domains", @@ -786,7 +353,7 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) opal_list_remove_item(&uct_btl->pending_frags, (opal_list_item_t *) frag); - if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, false)) { + if (OPAL_SUCCESS > mca_btl_uct_send_frag(uct_btl, frag, /*append=*/false)) { opal_list_prepend(&uct_btl->pending_frags, (opal_list_item_t *) frag); } else { completed++; @@ -837,25 +404,26 @@ static int mca_btl_uct_component_progress(void) int starting_index = mca_btl_uct_get_context_index(); unsigned ret = 0; - for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - + mca_btl_uct_md_t *md; + OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) { /* unlike ucp, uct actually tells us something useful! its almost like it was "inspired" * by the btl progress functions.... 
*/ mca_btl_uct_tl_t *tl; - OPAL_LIST_FOREACH(tl, &module->md->tls, mca_btl_uct_tl_t) { + OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) { ret += mca_btl_uct_tl_progress(tl, starting_index); } - - mca_btl_uct_component_progress_connections (module->conn_tl); + } + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } - if (NULL != mca_btl_uct_component.conn_module) { - ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module->conn_tl); + if (NULL != mca_btl_uct_component.conn_tl) { + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_tl); } return (int) ret; diff --git a/opal/mca/btl/uct/btl_uct_discover.c b/opal/mca/btl/uct/btl_uct_discover.c new file mode 100644 index 00000000000..7bb13db9837 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_discover.c @@ -0,0 +1,508 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2019-2025 Google, LLC. All rights reserved. + * Copyright (c) 2019 Intel, Inc. All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "btl_uct_discover.h" +#include "btl_uct_include_list.h" + +#include "btl_uct.h" +#include "opal/class/opal_list.h" +#include "opal/util/printf.h" + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_md(uct_component_h component, + uct_md_resource_desc_t *md_desc) +#else +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) +#endif +{ + uct_tl_resource_desc_t *tl_desc; + uct_md_config_t *uct_config; + mca_btl_uct_md_t *md; + int list_rank; + unsigned num_tls; + ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); + + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; + } + + md = OBJ_NEW(mca_btl_uct_md_t); + md->md_name = strdup(md_desc->md_name); +#if UCT_API >= UCT_VERSION(1, 7) + md->uct_component = component; +#endif + md->connection_only_domain = consider_for_connection_module; + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status = 
uct_md_config_read(component, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(component, md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#else + ucs_status = uct_md_config_read(md->md_name, NULL, NULL, &uct_config); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_config_read failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_open(md->md_name, uct_config, &md->uct_md); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_md_open failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } +#endif + uct_config_release(uct_config); + + ucs_status = uct_md_query(md->uct_md, &md->md_attr); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + ucs_status = uct_md_query_tl_resources(md->uct_md, &tl_desc, &num_tls); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("uct_config_release failed %d (%s)", ucs_status, ucs_status_string(ucs_status))); + return OPAL_ERR_NOT_AVAILABLE; + } + + (void) mca_btl_uct_populate_tls(md, tl_desc, num_tls); + + uct_release_tl_resource_list(tl_desc); + opal_list_append(&mca_btl_uct_component.md_list, &md->super); + + return OPAL_SUCCESS; +} + +#if UCT_API >= UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) +{ + uct_component_attr_t attr = { + .field_mask = UCT_COMPONENT_ATTR_FIELD_NAME + | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT, + }; + ucs_status_t ucs_status; + int rc; + + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + 
return OPAL_ERROR; + } + + BTL_VERBOSE(("processing uct component %s", attr.name)); + + attr.md_resources = calloc(attr.md_resource_count, sizeof(*attr.md_resources)); + attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + ucs_status = uct_component_query(component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + for (unsigned i = 0; i < attr.md_resource_count; ++i) { + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + free(attr.md_resources); + + return OPAL_SUCCESS; +} +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + +int mca_btl_uct_component_discover_mds(void) +{ + mca_btl_uct_include_list_parse(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_parse(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); + +#if UCT_API >= UCT_VERSION(1, 7) + ucs_status_t ucs_status = uct_query_components(&mca_btl_uct_component.uct_components, + &mca_btl_uct_component.num_uct_components); + if (UCS_OK != ucs_status) { + BTL_ERROR(("could not query UCT components")); + return OPAL_ERROR; + } + + /* generate list of memory domains */ + for (unsigned i = 0; i < mca_btl_uct_component.num_uct_components; ++i) { + int rc = mca_btl_uct_component_process_uct_component(mca_btl_uct_component.uct_components[i]); + if (OPAL_SUCCESS != rc) { + break; + } + } +#else /* UCT 1.6 and older */ + uct_md_resource_desc_t *resources; + unsigned resource_count; + + uct_query_md_resources(&resources, &resource_count); + + /* generate all suitable btl modules */ + for (unsigned i = 0; i < resource_count; ++i) { + int rc = mca_btl_uct_component_process_uct_md(resources + i); + if (OPAL_SUCCESS != rc) { + break; + } + } + + uct_release_md_resource_list(resources); + +#endif /* UCT_API >= UCT_VERSION(1, 7) */ + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_module_register_mca_var(mca_btl_uct_module_t 
*module) +{ + mca_base_component_t dummy_component; + /* mca_btl_uct_component starts with an mca_base_component_t structure */ + memcpy(&dummy_component, &mca_btl_uct_component, sizeof(dummy_component)); + snprintf(dummy_component.mca_component_name, sizeof(dummy_component.mca_component_name), + "uct_%s", module->md->md_name); + + BTL_VERBOSE(("registering MCA parameters for module uct_%s", module->md->md_name)); + + module->allowed_transports = mca_btl_uct_component.allowed_transports; + (void) mca_base_component_var_register( + &dummy_component, "transports", + "Comma-delimited list of transports to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. Special" + "values: any (any available) (default: dc_mlx5,rc_mlx5,ud,any)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &module->allowed_transports); + + return mca_btl_base_param_register(&dummy_component, &module->super); +} + +static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) +{ + mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; + mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; + + return tl_a->priority - tl_b->priority; +} + +static int mca_btl_uct_generate_module(mca_btl_uct_md_t *md) +{ + mca_btl_uct_tl_t *tl; + mca_btl_uct_module_t *module = mca_btl_uct_alloc_module(md, md->md_attr.rkey_packed_size); + + BTL_VERBOSE(("attempting to create a BTL module for memory domain: %s", md->md_name)); + + int rc = mca_btl_uct_module_register_mca_var(module); + if (OPAL_SUCCESS != rc) { + mca_btl_uct_finalize(&module->super); + return rc; + } + + mca_btl_uct_include_list_parse(module->allowed_transports, + &module->allowed_transport_list); + mca_btl_uct_tl_t *next; + OPAL_LIST_FOREACH_SAFE (tl, next, &md->tls, mca_btl_uct_tl_t) { + int rank = mca_btl_uct_include_list_rank(tl->uct_tl_name, &module->allowed_transport_list); + if (rank < 0) { + opal_list_remove_item(&md->tls, &tl->super); + 
OBJ_RELEASE(tl); + continue; + } + tl->priority = rank; + } + + opal_list_sort(&md->tls, tl_compare); + + /* Treat the flags specified by the user as a mask. */ + uint32_t btl_flags = module->super.btl_flags; + uint32_t btl_atomic_flags = module->super.btl_atomic_flags; + + module->super.btl_flags = 0; + module->super.btl_atomic_flags = 0; + + OPAL_LIST_FOREACH (tl, &md->tls, mca_btl_uct_tl_t) { + mca_btl_uct_evaluate_tl(module, tl); + if (NULL != module->am_tl && NULL != module->rdma_tl) { + /* all done */ + break; + } + } + + module->super.btl_flags &= btl_flags; + module->super.btl_atomic_flags &= btl_atomic_flags; + + if (NULL == module->rdma_tl) { + /* no rdma tls */ + BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); + + module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; + module->super.btl_put = NULL; + module->super.btl_get = NULL; + module->super.btl_atomic_fop = NULL; + module->super.btl_atomic_op = NULL; + } + + if (NULL == module->am_tl) { + /* no active message tls == no send/recv */ + BTL_VERBOSE(("no active message tl matched supplied filter. 
disabling send/recv support")); + + module->super.btl_send = NULL; + module->super.btl_sendi = NULL; + module->super.btl_alloc = NULL; + module->super.btl_free = NULL; + } + + if (NULL == module->am_tl && NULL == module->rdma_tl) { + mca_btl_uct_finalize(&module->super); + return OPAL_ERR_NOT_AVAILABLE; + } + + module->module_index = mca_btl_uct_component.module_count; + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + return OPAL_SUCCESS; +} + +static void mca_btl_uct_enable_tl(mca_btl_uct_tl_t *tl) { + if (NULL == tl) { + return; + } + + if (tl->max_device_contexts < 1) { + tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; + } +} + +static int mca_btl_uct_enable_module(mca_btl_uct_module_t *module) +{ + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. */ + char *tmp = NULL; + (void) opal_asprintf(&tmp, "uct.%s", module->md->md_name); + + mca_rcache_base_resources_t rcache_resources = { + .cache_name = tmp, + .reg_data = (void *) module, + .sizeof_reg = sizeof(mca_btl_uct_reg_t) + module->super.btl_registration_handle_size, + .register_mem = mca_btl_uct_reg_mem, + .deregister_mem = mca_btl_uct_dereg_mem, + }; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + return OPAL_ERROR; + } + + mca_btl_uct_enable_tl(module->rdma_tl); + mca_btl_uct_enable_tl(module->am_tl); + + return OPAL_SUCCESS; +} + +int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count) +{ + for (int i = 0 ; i < module_count ; ++i) { + int rc = mca_btl_uct_enable_module(modules[i]); + if (OPAL_SUCCESS != rc) { + BTL_VERBOSE(("could not enable module for memory domain %s", 
modules[i]->md->md_name));
+            mca_btl_uct_finalize(&modules[i]->super);
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief Create a BTL module for each usable memory domain in md_list.
+ *
+ * The component module count is reset, then md_list is walked and
+ * mca_btl_uct_generate_module() is called for every domain that is not marked
+ * connection-only. The walk stops once the component module count reaches
+ * MCA_BTL_UCT_MAX_MODULES. Failing to build one module is logged and does not
+ * abort the walk.
+ *
+ * @param[in] md_list list of mca_btl_uct_md_t memory domains
+ *
+ * @returns OPAL_SUCCESS always
+ */
+int mca_btl_uct_component_generate_modules(opal_list_t *md_list)
+{
+    mca_btl_uct_component.module_count = 0;
+
+    mca_btl_uct_md_t *md;
+    OPAL_LIST_FOREACH(md, md_list, mca_btl_uct_md_t) {
+        if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) {
+            BTL_VERBOSE(("created the maximum number of allowable modules"));
+            break;
+        }
+
+        if (md->connection_only_domain) {
+            /* will not build a module for this domain */
+            continue;
+        }
+
+        int rc = mca_btl_uct_generate_module(md);
+        if (OPAL_SUCCESS != rc) {
+            BTL_VERBOSE(("could not create a module for memory domain %s", md->md_name));
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief Select and enable a transport for forming endpoint connections, if any
+ *        module needs one.
+ *
+ * Nothing is done when no module's active-message or RDMA transport reports that
+ * it requires a connection transport. Otherwise every memory domain is scanned
+ * for its first transport that supports connections. A transport found in a
+ * connection-only domain is preferred and ends the search; failing that, the
+ * first capable transport found is kept.
+ *
+ * @returns OPAL_SUCCESS if no connection transport is needed,
+ *          OPAL_ERR_NOT_FOUND if one is needed but none is available, or the
+ *          result of mca_btl_uct_enable_tl_conn() otherwise.
+ */
+int mca_btl_uct_component_maybe_setup_conn_tl(void)
+{
+    bool connection_tl_required = false;
+    for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+        connection_tl_required |=
+            mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->am_tl);
+        connection_tl_required |=
+            mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_component.modules[i]->rdma_tl);
+        if (connection_tl_required) {
+            break;
+        }
+    }
+
+    if (!connection_tl_required) {
+        return OPAL_SUCCESS;
+    }
+
+    mca_btl_uct_md_t *md;
+    OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        /* Record the match in its own variable. When a list iteration macro runs to
+         * completion the cursor is left pointing at the list sentinel, not NULL, so
+         * the cursor can not be used to tell whether anything matched. */
+        mca_btl_uct_tl_t *conn_tl = NULL;
+        mca_btl_uct_tl_t *tl;
+        OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) {
+            if (mca_btl_uct_tl_supports_conn(tl)) {
+                conn_tl = tl;
+                break;
+            }
+        }
+
+        if (NULL == conn_tl) {
+            /* nothing in this domain can form connections */
+            continue;
+        }
+
+        if (md->connection_only_domain) {
+            /* not going to do better */
+            mca_btl_uct_component.conn_tl = conn_tl;
+            break;
+        }
+
+        if (NULL == mca_btl_uct_component.conn_tl) {
+            mca_btl_uct_component.conn_tl = conn_tl;
+        }
+    }
+
+    if (NULL == mca_btl_uct_component.conn_tl) {
+        /* no connection tl found, will need to disable all connect-to-endpoint modules */
+        BTL_VERBOSE(("could not find a suitable transport to support forming connections"));
+        return OPAL_ERR_NOT_FOUND;
+    }
+
+    BTL_VERBOSE(("using transport %s::%s for connection management",
+                 mca_btl_uct_component.conn_tl->uct_md->md_name,
+                 mca_btl_uct_component.conn_tl->uct_tl_name));
+
+    return mca_btl_uct_enable_tl_conn(mca_btl_uct_component.conn_tl);
+}
+
+/**
+ * @brief Drop modules, transports and memory domains that will not be used.
+ *
+ * If no connection transport was selected, every module whose active-message or
+ * RDMA transport requires one is finalized. Then each transport that is neither
+ * the connection transport nor a module's RDMA/AM transport is released, memory
+ * domains left without transports are released, and the module array is
+ * compacted so that it holds no NULL entries below module_count.
+ *
+ * @returns OPAL_SUCCESS always
+ */
+int mca_btl_uct_component_filter_mds(void)
+{
+    int usable_module_count = mca_btl_uct_component.module_count;
+    /* clean out all unused mds, tls, and unusable modules */
+    if (NULL == mca_btl_uct_component.conn_tl) {
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i];
+            if (!(mca_btl_uct_tl_requires_connection_tl(module->am_tl) ||
+                  mca_btl_uct_tl_requires_connection_tl(module->rdma_tl))) {
+                continue;
+            }
+
+            /* module is unusable */
+            mca_btl_uct_finalize(&module->super);
+            mca_btl_uct_component.modules[i] = NULL;
+            --usable_module_count;
+        }
+    }
+
+    mca_btl_uct_md_t *md, *md_next;
+    OPAL_LIST_FOREACH_SAFE(md, md_next, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        /* find the module (if any) that was built on this memory domain */
+        mca_btl_uct_module_t *module = NULL;
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            module = mca_btl_uct_component.modules[i];
+            if (NULL != module && module->md == md) {
+                break;
+            }
+            module = NULL;
+        }
+
+        mca_btl_uct_tl_t *tl, *next;
+        OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) {
+            if (tl == mca_btl_uct_component.conn_tl || (NULL != module &&
+                                                        (tl == module->rdma_tl ||
+                                                         tl == module->am_tl))) {
+                /* tl is in use */
+                continue;
+            }
+            opal_list_remove_item(&md->tls, &tl->super);
+            OBJ_RELEASE(tl);
+        }
+
+        if (opal_list_get_size(&md->tls) == 0) {
+            opal_list_remove_item(&mca_btl_uct_component.md_list, &md->super);
+            OBJ_RELEASE(md);
+        }
+    }
+
+    /* remove holes in the module array */
+    if (usable_module_count < mca_btl_uct_component.module_count) {
+        int next_slot = 0;
+        for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+            mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i];
+            if (NULL == module) {
+                continue;
+            }
+
+            /* slide the module down into the first free slot. module_index is
+             * documented as the position in this array so it is refreshed too.
+             * NOTE(review): confirm the old index has not been published yet. */
+            module->module_index = next_slot;
+            mca_btl_uct_component.modules[next_slot++] = module;
+        }
+
+        /* clear the now-unused tail of the array */
+        for (int i = next_slot ; i < mca_btl_uct_component.module_count ; ++i) {
+            mca_btl_uct_component.modules[i] = NULL;
+        }
+        mca_btl_uct_component.module_count = usable_module_count;
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/uct/btl_uct_discover.h b/opal/mca/btl/uct/btl_uct_discover.h
new file mode 100644
index 00000000000..08b03899fc4
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_discover.h
@@ -0,0 +1,44 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(MCA_BTL_UCT_DISCOVER_H)
+#define MCA_BTL_UCT_DISCOVER_H
+
+#include "btl_uct.h"
+#include "opal/class/opal_list.h"
+
+/**
+ * @brief Query UCT for the available memory domains. This list will be limited by
+ *        the component's memory domain filters (TODO confirm: sentence was unfinished).
+ */
+int mca_btl_uct_component_discover_mds(void);
+
+/**
+ * @brief Create BTL modules from the memory domain list.
+ *
+ * The modules are registered with MCA and must be shut down using
+ * mca_btl_module_finalize.
+ */
+int mca_btl_uct_component_generate_modules(opal_list_t *md_list);
+
+int mca_btl_uct_enable_modules(mca_btl_uct_module_t **modules, int module_count);
+
+/**
+ * @brief Scan detected transports and find a connection transport (if needed).
+ */
+int mca_btl_uct_component_maybe_setup_conn_tl(void);
+
+/**
+ * @brief Clean out unused memory domains and transport layers.
+ */ +int mca_btl_uct_component_filter_mds(void); + + +#endif /* !defined(MCA_BTL_UCT_DISCOVER_H) */ diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index b9ae99eb687..fe4217035e6 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -16,6 +16,7 @@ #include "btl_uct.h" #include "btl_uct_am.h" #include "btl_uct_device_context.h" +#include "btl_uct_modex.h" #include "opal/mca/timer/base/base.h" #include "opal/util/proc.h" @@ -63,53 +64,6 @@ mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create(opal_proc_t *proc) return (mca_btl_base_endpoint_t *) endpoint; } -static unsigned char *mca_btl_uct_process_modex_tl(unsigned char *modex_data) -{ - BTL_VERBOSE( - ("processing modex for tl %s. size: %u", modex_data + 4, *((uint32_t *) modex_data))); - - /* skip size and name */ - return modex_data + 4 + strlen((char *) modex_data + 4) + 1; -} - -static void mca_btl_uct_process_modex(mca_btl_uct_module_t *uct_btl, unsigned char *modex_data, - unsigned char **rdma_tl_data, unsigned char **am_tl_data, - unsigned char **conn_tl_data) -{ - BTL_VERBOSE(("processing remote modex data")); - - if (uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains RDMA data")); - if (rdma_tl_data) { - *rdma_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (rdma_tl_data) { - *rdma_tl_data = NULL; - } - - if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) { - BTL_VERBOSE(("modex contains active message data")); - if (am_tl_data) { - *am_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) modex_data); - } else if (am_tl_data) { - *am_tl_data = NULL; - } - - if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl - && uct_btl->conn_tl != uct_btl->am_tl) { - BTL_VERBOSE(("modex contains connection data")); - if (conn_tl_data) { - *conn_tl_data = mca_btl_uct_process_modex_tl(modex_data); - } - modex_data += *((uint32_t *) 
modex_data); - } else if (conn_tl_data) { - *conn_tl_data = NULL; - } -} - static inline ucs_status_t mca_btl_uct_ep_create_connected_compat(uct_iface_h iface, uct_device_addr_t *device_addr, uct_iface_addr_t *iface_addr, @@ -231,11 +185,10 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_conne static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_tl_t *conn_tl, mca_btl_uct_conn_req_t *request, size_t request_length) { - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + mca_btl_uct_device_context_t *conn_tl_context = mca_btl_uct_component.conn_tl->uct_dev_contexts[0]; BTL_VERBOSE( ("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t, @@ -266,7 +219,7 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, } static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_tl_t *conn_tl, uint8_t *conn_tl_data) + uint8_t *conn_tl_data) { if (NULL != endpoint->conn_ep) { BTL_VERBOSE(("re-using existing connection endpoint")); @@ -274,6 +227,8 @@ static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_bt return OPAL_SUCCESS; } + mca_btl_uct_tl_t *conn_tl = mca_btl_uct_component.conn_tl; + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", opal_process_name_print(endpoint->ep_proc->proc_name))); @@ -307,7 +262,7 @@ static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_bt } static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type, int remote_module_index) { @@ -338,7 +293,7 @@ 
static int mca_btl_uct_endpoint_send_connection_data( /* let the remote side know that the connection has been established and * wait for the message to be sent */ - int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl, request, + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, request, request_length); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(endpoint->conn_ep); @@ -353,7 +308,7 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr, int remote_module_index) { @@ -387,7 +342,7 @@ static int mca_btl_uct_endpoint_connect_endpoint( opal_timer_t now = opal_timer_base_get_usec(); if ((now - tl_endpoint->last_connection_req) > mca_btl_uct_component.connection_retry_timeout || ep_addr) { - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, /*request_type=*/!!ep_addr, remote_module_index); if (OPAL_SUCCESS != rc) { return rc; @@ -402,42 +357,6 @@ static int mca_btl_uct_endpoint_connect_endpoint( return OPAL_ERR_OUT_OF_RESOURCE; } -static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, - uint8_t **rdma_tl_data, uint8_t **am_tl_data, uint8_t **conn_tl_data, int *remote_module_index) { - uint8_t *modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md->md_name)); - - modex_data += 4; - 
- if (0 != strcmp((char *) modex_data, uct_btl->md->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - - modex_data += strlen((char *) modex_data) + 1; - - mca_btl_uct_process_modex(uct_btl, modex_data, rdma_tl_data, am_tl_data, conn_tl_data); - if (NULL != remote_module_index) { - *remote_module_index = i; - } - - BTL_VERBOSE(("finished processing modex for %s", uct_btl->md->md_name)); - - return OPAL_SUCCESS; - } - - BTL_ERROR(("could not find modex for %s", uct_btl->md->md_name)); - - return OPAL_ERR_NOT_FOUND; -} - - int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, void *ep_addr, int tl_index) { @@ -447,7 +366,7 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp : uct_btl->am_tl; mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id); - uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; + uint8_t *conn_tl_data, *tl_data = NULL; mca_btl_uct_modex_t *modex; size_t msg_size; int rc; @@ -491,13 +410,7 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp modex->module_count)); int remote_module_index; - rc = mca_btl_uct_find_modex (uct_btl, modex, &rdma_tl_data, &am_tl_data, &conn_tl_data, &remote_module_index); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - break; - } - - tl_data = (tl == uct_btl->rdma_tl) ? 
rdma_tl_data : am_tl_data;
-
+        tl_data = mca_btl_uct_find_modex(modex, tl, &remote_module_index);
         if (OPAL_UNLIKELY(NULL == tl_data)) {
             BTL_ERROR(("could not find modex data for this transport"));
             rc = OPAL_ERR_UNREACH;
@@ -506,31 +419,26 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp
 
         /* connect the endpoint */
         if (mca_btl_uct_tl_requires_connection_tl(tl)) {
-            mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
-            if (NULL == conn_tl) {
-                rc = mca_btl_uct_find_modex (mca_btl_uct_component.conn_module, modex,
-                                             /*rdma_tl_data=*/NULL, /*am_tl_data=*/NULL,
-                                             &conn_tl_data, /*remote_module_index=*/NULL);
-                if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
-                    BTL_ERROR(("could not find modex for connection module"));
-                    break;
-                }
-
-                BTL_VERBOSE(("using separate connection module for tl"));
-                conn_tl = mca_btl_uct_component.conn_module->conn_tl;
+            conn_tl_data = mca_btl_uct_find_modex(modex, mca_btl_uct_component.conn_tl,
+                                                  /*remote_module_index=*/NULL);
+            /* the lookup reports failure through its return value, not through rc */
+            if (OPAL_UNLIKELY(NULL == conn_tl_data)) {
+                BTL_ERROR(("could not find modex for connection module"));
+                rc = OPAL_ERR_UNREACH;
+                break;
             }
 
             if (NULL == tl_endpoint->uct_ep) {
                 /* allocate or retain a connection endpoint */
-                rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint, conn_tl,
+                rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint,
                                                               conn_tl_data);
                 if (OPAL_SUCCESS != rc) {
                     break;
                 }
             }
 
-            rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, conn_tl, endpoint, tl,
-                                                       tl_context, tl_endpoint, tl_data, ep_addr, remote_module_index);
+            rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, tl_endpoint,
+                                                       tl_data, ep_addr, remote_module_index);
         } else {
             rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data);
         }
diff --git a/opal/mca/btl/uct/btl_uct_include_list.c b/opal/mca/btl/uct/btl_uct_include_list.c
new file mode 100644
index 00000000000..5e989581612
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_include_list.c
@@ -0,0 +1,98 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024-2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <regex.h>
+
+#include "opal_config.h"
+
+#include "btl_uct_include_list.h"
+#include "btl_uct_types.h"
+#include "opal/class/opal_object.h"
+#include "opal/mca/btl/base/btl_base_error.h"
+#include "opal/util/argv.h"
+
+/**
+ * Fill `list` from a comma-delimited string.
+ *
+ * A leading '^' marks the list as an exclude list. A NULL `value` leaves the
+ * list empty and marked as an include list. The previous contents of `list`
+ * are overwritten without being freed.
+ */
+void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list) {
+    list->list = NULL;
+    list->include = true;
+
+    if (value == NULL) {
+        return;
+    }
+
+    if (value[0] == '^') {
+        list->include = false;
+        value++;
+    }
+
+    list->list = opal_argv_split(value, ',');
+}
+
+/**
+ * Rank `name` against the entries of `list`.
+ *
+ * Each entry is compiled as a case-insensitive POSIX regular expression and
+ * searched for in `name`; entries that fail to compile are reported and skipped.
+ *
+ * @returns for an include list: i + 1 for the first matching entry i, or -1 if
+ *          no entry matches; for an exclude list: -(i + 1) for the first
+ *          matching entry i, or 1 if no entry matches; -1 if the list is NULL.
+ */
+int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) {
+    if (list->list == NULL) {
+        return -1;
+    }
+
+    for (int i = 0; list->list[i]; ++i) {
+        regex_t preg;
+
+        BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i]));
+        /* only a match/no-match answer is needed, so skip sub-match tracking */
+        int rc = regcomp(&preg, list->list[i], REG_ICASE | REG_NOSUB);
+        if (0 != rc) {
+            char errbuf[256];
+            regerror(rc, &preg, errbuf, sizeof(errbuf));
+            BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf));
+            continue;
+        }
+
+        int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0);
+        regfree(&preg);
+        if (0 == result) {
+            return list->include ? i + 1 : -(i + 1);
+        }
+    }
+
+    return list->include ? -1 : 1;
+}
+
+/* object constructor: start with an empty list */
+static void mca_btl_uct_include_list_construct (mca_btl_uct_include_list_t *list)
+{
+    list->list = NULL;
+}
+
+/* object destructor: release the parsed entries */
+static void mca_btl_uct_include_list_destruct (mca_btl_uct_include_list_t *list)
+{
+    opal_argv_free (list->list);
+    list->list = NULL;
+}
+
+OBJ_CLASS_INSTANCE(mca_btl_uct_include_list_t, opal_object_t, mca_btl_uct_include_list_construct,
+                   mca_btl_uct_include_list_destruct);
+
+
diff --git a/opal/mca/btl/uct/btl_uct_include_list.h b/opal/mca/btl/uct/btl_uct_include_list.h
new file mode 100644
index 00000000000..69fba979d8d
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_include_list.h
@@ -0,0 +1,35 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2024-2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_uct_types.h"
+
+#if !defined(BTL_UCT_INCLUDE_LIST_H)
+#define BTL_UCT_INCLUDE_LIST_H
+
+/**
+ * @brief Parse `value` to create an include list.
+ *
+ * @param[in] value Comma-delimited string to parse.
+ * @param[in,out] list Include list object, must already be constructed.
+ */
+void mca_btl_uct_include_list_parse (const char *value, mca_btl_uct_include_list_t *list);
+
+/**
+ * @brief Find the rank of `name` in the include list `list`.
+ *
+ * @param[in] name name to find
+ * @param[in] list list to search
+ *
+ * A negative result means the name is not allowed: it is absent from an include
+ * list or it matches an entry of a negated (exclude) list.
+ */
+int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list);
+
+#endif /* !defined(BTL_UCT_INCLUDE_LIST_H) */
diff --git a/opal/mca/btl/uct/btl_uct_modex.c b/opal/mca/btl/uct/btl_uct_modex.c
new file mode 100644
index 00000000000..7d6aa2f5450
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_modex.c
@@ -0,0 +1,263 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2018 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
+ * Copyright (c) 2018-2024 Triad National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2019-2025 Google, LLC. All rights reserved.
+ * Copyright (c) 2019 Intel, Inc. All rights reserved.
+ * Copyright (c) 2022 IBM Corporation. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include "btl_uct_modex.h"
+#include "btl_uct_types.h"
+#include "btl_uct_device_context.h"
+#include "opal/class/opal_list.h"
+#include "opal/mca/pmix/pmix-internal.h"
+
+/**
+ * Size in bytes of the modex entry for `tl`: the fixed entry header, the iface
+ * address (connect-to-iface transports only) and the device address, rounded
+ * up to a multiple of 4.
+ */
+static uint16_t mca_btl_uct_tl_modex_size(mca_btl_uct_tl_t *tl)
+{
+    uint16_t size = sizeof(mca_btl_uct_tl_modex_t);
+
+    if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
+        size += (uint16_t)tl->uct_iface_attr.iface_addr_len;
+    }
+
+    /* pad out to a multiple of 4 bytes */
+    return (3 + size + (uint16_t)tl->uct_iface_attr.device_addr_len) & ~3;
+}
+
+/**
+ * Size in bytes of the modex entry for `md`: the fixed entry header plus the
+ * entries of every transport on the domain's transport list.
+ */
+static uint16_t mca_btl_uct_md_modex_size(mca_btl_uct_md_t *md)
+{
+    uint16_t modex_size = sizeof(mca_btl_uct_md_modex_t);
+
+    mca_btl_uct_tl_t *tl;
+    OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) {
+        modex_size += mca_btl_uct_tl_modex_size(tl);
+    }
+
+    return modex_size;
+}
+
+/**
+ * Write the modex entry for `tl` at `modex_data`.
+ *
+ * @param[in]  module     module the transport belongs to (NULL if its domain has none)
+ * @param[in]  tl         transport to describe
+ * @param[out] modex_data destination; needs room for mca_btl_uct_tl_modex_size(tl) bytes
+ *
+ * @returns pointer to the first byte after the (padded) entry
+ */
+static uint8_t *mca_btl_uct_tl_modex_pack(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl,
+                                          uint8_t *modex_data)
+{
+    mca_btl_uct_device_context_t *dev_context =
+        mca_btl_uct_module_get_tl_context_specific(module, tl, /*context_id=*/0);
+
+    mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)modex_data;
+    tl_modex->size = mca_btl_uct_tl_modex_size(tl);
+
+    /* copy at most size - 1 bytes into the zeroed field so that the name is always
+     * NUL terminated; the reader compares it with strcmp() */
+    memset(tl_modex->tl_name, 0, sizeof(tl_modex->tl_name));
+    strncpy(tl_modex->tl_name, tl->uct_tl_name, sizeof(tl_modex->tl_name) - 1);
+
+    uint8_t *tl_modex_data = (uint8_t *) tl_modex->data;
+
+    /* NTH: only the first context is available. i assume the device addresses of the
+     * contexts will be the same but they will have different iface addresses. i also
+     * am assuming that it doesn't really matter if all remote contexts connect to
+     * the same endpoint since we are only doing RDMA. if any of these assumptions are
+     * wrong then we can't delay creating the other contexts and must include their
+     * information in the modex. */
+    if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
+        uct_iface_get_address(dev_context->uct_iface, (uct_iface_addr_t *) tl_modex_data);
+        tl_modex_data += tl->uct_iface_attr.iface_addr_len;
+    }
+
+    uct_iface_get_device_address(dev_context->uct_iface, (uct_device_addr_t *) tl_modex_data);
+    tl_modex_data += tl->uct_iface_attr.device_addr_len;
+
+    return modex_data + tl_modex->size;
+}
+
+/**
+ * Write the modex entry for `md`, including the entries of all of its
+ * transports, at `modex_data`.
+ *
+ * @returns pointer to the first byte after the entry
+ */
+static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_md_t *md, uint8_t *modex_data)
+{
+    mca_btl_uct_module_t *module = NULL;
+    for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) {
+        mca_btl_uct_module_t *candidate = mca_btl_uct_component.modules[i];
+        /* tolerate empty slots left behind by finalized modules */
+        if (NULL != candidate && candidate->md == md) {
+            module = candidate;
+            break;
+        }
+    }
+
+    mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data;
+    modex_data = md_modex->data;
+
+    md_modex->size = mca_btl_uct_md_modex_size(md);
+    /* domains without a module publish an invalid index */
+    md_modex->module_index = module ? module->module_index : (uint16_t) -1;
+
+    memset(md_modex->md_name, 0, sizeof(md_modex->md_name));
+    strncpy(md_modex->md_name, md->md_name, sizeof(md_modex->md_name) - 1);
+
+    mca_btl_uct_tl_t *tl;
+    OPAL_LIST_FOREACH(tl, &md->tls, mca_btl_uct_tl_t) {
+        modex_data = mca_btl_uct_tl_modex_pack(module, tl, modex_data);
+    }
+
+    return modex_data;
+}
+
+/**
+ * Pack the modex entries of every memory domain on the component list and
+ * publish them with OPAL_MODEX_SEND.
+ *
+ * @returns the status reported by OPAL_MODEX_SEND
+ */
+int mca_btl_uct_component_modex_send(void)
+{
+    size_t modex_size = sizeof(mca_btl_uct_modex_t);
+    mca_btl_uct_modex_t *modex;
+    uint8_t *modex_data;
+    int rc;
+
+    mca_btl_uct_md_t *md;
+    OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        modex_size += mca_btl_uct_md_modex_size(md);
+    }
+
+    modex = alloca(modex_size);
+    /* zero the buffer so that alignment padding is not published uninitialized */
+    memset(modex, 0, modex_size);
+    modex_data = modex->data;
+
+    modex->module_count = opal_list_get_size(&mca_btl_uct_component.md_list);
+    OPAL_LIST_FOREACH(md, &mca_btl_uct_component.md_list, mca_btl_uct_md_t) {
+        modex_data = mca_btl_uct_modex_pack(md, modex_data);
+    }
+
+    OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size);
+    return rc;
+}
+
+/**
+ * Find the data of the entry for `tl` within the memory domain entry `md_modex`.
+ *
+ * @returns pointer to the entry's address data, or NULL if `tl` is not present
+ */
+static uint8_t *mca_btl_uct_find_tl_modex(mca_btl_uct_md_modex_t *md_modex, mca_btl_uct_tl_t *tl)
+{
+    uint8_t *modex_data = md_modex->data;
+
+    if (md_modex->size < sizeof(*md_modex)) {
+        /* malformed entry: smaller than its own header */
+        return NULL;
+    }
+
+    /* md_modex->size includes the md entry header (see mca_btl_uct_md_modex_size) but
+     * the offsets below are relative to md_modex->data, so exclude it from the bound */
+    const uint16_t tl_data_size = (uint16_t) (md_modex->size - sizeof(*md_modex));
+
+    for (uint16_t modex_offset = 0 ; modex_offset < tl_data_size ; ) {
+        mca_btl_uct_tl_modex_t *tl_modex = (mca_btl_uct_tl_modex_t *)(modex_data + modex_offset);
+
+        BTL_VERBOSE(("found modex for tl %s searching for %s", tl_modex->tl_name, tl->uct_tl_name));
+
+        if (0 == strcmp(tl->uct_tl_name, tl_modex->tl_name)) {
+            return tl_modex->data;
+        }
+
+        BTL_VERBOSE(("no match, continuing"));
+
+        if (0 == tl_modex->size) {
+            /* a zero-sized entry would never advance the offset */
+            break;
+        }
+        modex_offset += tl_modex->size;
+    }
+
+    return NULL;
+}
+
+/**
+ * Find the published data for transport `tl` in a modex blob.
+ *
+ * @param[in]  modex               modex to search (layout as written by the pack functions above)
+ * @param[in]  tl                  local transport to look up (matched by md and tl name)
+ * @param[out] remote_module_index if not NULL, receives the module index published
+ *                                 for the matching memory domain
+ *
+ * @returns pointer to the transport's address data, or NULL if it was not found
+ */
+uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index) {
+    uint8_t *modex_data = modex->data;
+
+    /* look for matching transport in the modex */
+    for (int i = 0; i < modex->module_count; ++i) {
+        mca_btl_uct_md_modex_t *md_modex = (mca_btl_uct_md_modex_t *)modex_data;
+
+        BTL_VERBOSE(("found modex for md %s (remote module index %hu), searching for %s",
+                     md_modex->md_name, md_modex->module_index, tl->uct_md->md_name));
+
+        if (0 != strcmp(tl->uct_md->md_name, md_modex->md_name)) {
+            /* modex belongs to a different module, skip it and continue */
+            modex_data += md_modex->size;
+            continue;
+        }
+
+        uint8_t *tl_modex = mca_btl_uct_find_tl_modex(md_modex, tl);
+        if (NULL == tl_modex) {
+            break;
+        }
+
+        if (NULL != remote_module_index) {
+            *remote_module_index = md_modex->module_index;
+        }
+
+        BTL_VERBOSE(("finished processing modex for %s", tl->uct_md->md_name));
+
+        return tl_modex;
+    }
+
+    BTL_ERROR(("could not find modex for %s::%s", tl->uct_md->md_name, tl->uct_tl_name));
+
+    return NULL;
+}
diff --git a/opal/mca/btl/uct/btl_uct_modex.h b/opal/mca/btl/uct/btl_uct_modex.h
new file mode 100644
index 00000000000..e202bc8113f
--- /dev/null
+++ b/opal/mca/btl/uct/btl_uct_modex.h
@@ -0,0 +1,20 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Google, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(MCA_BTL_UCT_MODEX_H)
+#define MCA_BTL_UCT_MODEX_H
+
+#include "btl_uct.h"
+
+int mca_btl_uct_component_modex_send(void);
+
+uint8_t *mca_btl_uct_find_modex(mca_btl_uct_modex_t *modex, mca_btl_uct_tl_t *tl, int *remote_module_index);
+
+#endif /* !defined(MCA_BTL_UCT_MODEX_H) */
diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c
index 4201a54b812..9914c5e8f99 100644
--- a/opal/mca/btl/uct/btl_uct_module.c
+++ b/opal/mca/btl/uct/btl_uct_module.c
@@ -264,6 +264,35 @@ int mca_btl_uct_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg)
     return OPAL_SUCCESS;
 }
 
+mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md,
+                                               size_t registration_size)
+{
+    mca_btl_uct_module_t *module;
+
+    module = malloc(sizeof(*module));
+    if (NULL == module) {
+        return NULL;
+    }
+
+    /* copy the module template */
+    *module = mca_btl_uct_module_template;
+
+    OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t);
+    OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t);
+    OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t);
+    OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t);
+    OBJ_CONSTRUCT(&module->pending_frags, opal_list_t);
+    OBJ_CONSTRUCT(&module->lock, opal_recursive_mutex_t);
+    OBJ_CONSTRUCT(&module->allowed_transport_list, mca_btl_uct_include_list_t);
+
+    module->md = md;
+    OBJ_RETAIN(md);
+    module->super.btl_registration_handle_size = registration_size;
+
+    return module;
+}
+
 /*
  * Cleanup/release module resources.
  */
@@ -284,16 +313,32 @@ int mca_btl_uct_finalize(mca_btl_base_module_t *btl)
     OBJ_DESTRUCT(&uct_module->max_frags);
     OBJ_DESTRUCT(&uct_module->pending_frags);
     OBJ_DESTRUCT(&uct_module->lock);
+    OBJ_DESTRUCT(&uct_module->allowed_transport_list);
 
     if (uct_module->rcache) {
         mca_rcache_base_module_destroy(uct_module->rcache);
     }
 
     OBJ_DESTRUCT(&uct_module->endpoint_lock);
 
-    OBJ_RELEASE(uct_module->md);
+    char *tmp = NULL;
+    if (asprintf(&tmp, "uct_%s", uct_module->md->md_name) >= 0) {
+        int rc = mca_base_var_group_find("opal", "btl", tmp);
+        free(tmp);
+        if (rc >= 0) {
+            mca_base_var_group_deregister(rc);
+        }
+    }
+    /* clear the component slot while the module pointer is still valid */
+    for (int i = 0 ; i < MCA_BTL_UCT_MAX_MODULES ; ++i) {
+        if (mca_btl_uct_component.modules[i] == uct_module) {
+            mca_btl_uct_component.modules[i] = NULL;
+            break;
+        }
+    }
+    OBJ_RELEASE(uct_module->md);
     free(uct_module);
 
     return OPAL_SUCCESS;
 }
 
@@ -323,9 +368,11 @@ mca_btl_uct_module_t mca_btl_uct_module_template = {
         /* set the default flags for this btl. uct provides us with rdma and both
          * fetching and non-fetching atomics (though limited to add and cswap) */
         .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS
-                     | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION,
-        .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP
-                            | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT,
+                     | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION | MCA_BTL_FLAGS_SEND,
+        .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_AND
+                            | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR
+                            | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_SWAP
+                            | MCA_BTL_ATOMIC_SUPPORTS_32BIT,
 
         /* set the default limits on put and get */
         .btl_put_limit = 1 << 23,
@@ -338,6 +385,8 @@ mca_btl_uct_module_t mca_btl_uct_module_template = {
         .btl_rdma_pipeline_send_length = 8192,
         .btl_eager_limit = 8192,
         .btl_max_send_size = 65536,
+        /* for now we want this component to lose to btl/ugni and btl/vader */
+        .btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1,
}}; OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); @@ -361,5 +410,5 @@ static void mca_btl_uct_md_destruct(mca_btl_uct_md_t *md) } } -OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_object_t, mca_btl_uct_md_construct, +OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_list_item_t, mca_btl_uct_md_construct, mca_btl_uct_md_destruct); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index d94f4decee2..a82c5bc4f89 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -18,6 +18,7 @@ #include "btl_uct_device_context.h" #include "opal/util/argv.h" #include "opal/util/bit_ops.h" +#include "opal/util/minmax.h" #if HAVE_DECL_UCT_CB_FLAG_SYNC # define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC @@ -70,8 +71,9 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { }, }; -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { + mca_btl_uct_tl_t *tl = module->rdma_tl; uint64_t cap_flags = tl->uct_iface_attr.cap.flags; /* NTH: only use the fetching atomics for now */ @@ -120,8 +122,9 @@ static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { * * @returns equivalent BTL atomic flags */ -static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module) { + mca_btl_uct_tl_t *tl = module->rdma_tl; uint64_t cap_flags = tl->uct_iface_attr.cap.flags; module->super.btl_atomic_flags = 0; @@ -274,7 +277,7 @@ static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *co } } -static int mca_btl_uct_populate_tl_attr(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { +static int mca_btl_uct_populate_tl_attr(mca_btl_uct_tl_t *tl) { #if UCT_API >= UCT_VERSION(1, 6) uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | 
UCT_IFACE_PARAM_FIELD_DEVICE, @@ -309,16 +312,16 @@ static int mca_btl_uct_populate_tl_attr(mca_btl_uct_module_t *module, mca_btl_uc return OPAL_ERROR; } - /* only need to query one of the interfaces to get the attributes */ + int rc = OPAL_SUCCESS; ucs_status = uct_iface_query(uct_iface, &tl->uct_iface_attr); if (UCS_OK != ucs_status) { BTL_VERBOSE(("Error querying UCT interface")); - uct_worker_destroy(uct_worker); - return OPAL_ERROR; + rc = OPAL_ERROR; } uct_iface_close(uct_iface); - return OPAL_SUCCESS; + uct_worker_destroy(uct_worker); + return rc; } mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, @@ -383,13 +386,15 @@ mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *m } if (module != NULL && tl == module->am_tl) { - BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id)); + BTL_VERBOSE(("installing AM handler for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, context, MCA_BTL_UCT_CB_FLAG_SYNC); } if (enable_progress) { - BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id)); + BTL_VERBOSE(("enabling progress for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); mca_btl_uct_context_enable_progress(context); } @@ -413,15 +418,7 @@ void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) free(context); } -static int tl_compare(opal_list_item_t **a, opal_list_item_t **b) -{ - mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; - mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; - - return tl_a->priority - tl_b->priority; -} - -static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, +static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) { mca_btl_uct_tl_t *tl = 
OBJ_NEW(mca_btl_uct_tl_t); @@ -435,6 +432,7 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca tl->uct_tl_name = strdup(tl_desc->tl_name); tl->uct_dev_name = strdup(tl_desc->dev_name); + tl->dev_type = tl_desc->dev_type; tl->priority = priority; (void) uct_md_iface_config_read(md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); @@ -446,14 +444,14 @@ static mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_module_t *module, mca return NULL; } - int rc = mca_btl_uct_populate_tl_attr(module, tl); + int rc = mca_btl_uct_populate_tl_attr(tl); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OBJ_RELEASE(tl); return NULL; } - BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", md->md_name, tl_desc->tl_name, - (unsigned long) tl->uct_iface_attr.cap.flags)); + BTL_VERBOSE(("Interface CAPS for tl %s::%s::%s 0x%lx", md->md_name, tl_desc->tl_name, + tl_desc->dev_name, (unsigned long) tl->uct_iface_attr.cap.flags)); return tl; } @@ -462,9 +460,12 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl { BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name)); - mca_btl_uct_module_set_atomic_flags(module, tl); + module->rdma_tl = tl; - module->super.btl_get_limit = tl->uct_iface_attr.cap.get.max_zcopy; + mca_btl_uct_module_set_atomic_flags(module); + + module->super.btl_get_limit = opal_min(tl->uct_iface_attr.cap.get.max_zcopy, + module->super.btl_get_limit); if (tl->uct_iface_attr.cap.get.max_bcopy) { module->super.btl_get_alignment = 0; module->super.btl_get_local_registration_threshold = tl->uct_iface_attr @@ -476,17 +477,15 @@ static void mca_btl_uct_set_tl_rdma(mca_btl_uct_module_t *module, mca_btl_uct_tl tl->uct_iface_attr.cap.get.min_zcopy); } - module->super.btl_put_limit = tl->uct_iface_attr.cap.put.max_zcopy; + module->super.btl_put_limit = opal_min(tl->uct_iface_attr.cap.put.max_zcopy, + module->super.btl_put_limit); module->super.btl_put_alignment = 0; /* no registration needed when using 
short/bcopy put */ module->super.btl_put_local_registration_threshold = tl->uct_iface_attr .cap.put.max_bcopy; - module->rdma_tl = tl; - tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } @@ -498,28 +497,29 @@ static void mca_btl_uct_set_tl_am(mca_btl_uct_module_t *module, mca_btl_uct_tl_t module->am_tl = tl; tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 1 : 0; - module->comm_tls[tl->tl_index] = tl; if (tl->max_device_contexts <= 1) { tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; } - module->super.btl_eager_limit = tl->uct_iface_attr.cap.am.max_bcopy - - sizeof(mca_btl_uct_am_header_t); + size_t max_eager_limit = tl->uct_iface_attr.cap.am.max_bcopy + - sizeof(mca_btl_uct_am_header_t); + size_t max_send_size = max_eager_limit; + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { - module->super.btl_max_send_size = tl->uct_iface_attr.cap.am.max_zcopy - - sizeof(mca_btl_uct_am_header_t); - } else { - module->super.btl_max_send_size = module->super.btl_eager_limit; + max_send_size = opal_max(max_send_size, tl->uct_iface_attr.cap.am.max_zcopy + - sizeof(mca_btl_uct_am_header_t)); } + + module->super.btl_eager_limit = opal_min(module->super.btl_eager_limit, max_eager_limit); + module->super.btl_max_send_size = opal_min(module->super.btl_max_send_size, max_send_size); } -static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl) { int rc; BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); - module->conn_tl = tl; rc = mca_btl_uct_setup_connection_tl(tl); if (OPAL_SUCCESS != rc) { return rc; @@ -534,11 +534,9 @@ static int mca_btl_uct_set_tl_conn(mca_btl_uct_module_t *module, mca_btl_uct_tl_ return OPAL_SUCCESS; } -static int 
mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) { - int rc; - - BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name)); + BTL_VERBOSE(("evaluating tl %s::%s", tl->uct_md->md_name, tl->uct_tl_name)); if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma(tl)) { mca_btl_uct_set_tl_rdma(module, tl); } @@ -547,16 +545,10 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ mca_btl_uct_set_tl_am(module, tl); } - if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn(tl)) { - rc = mca_btl_uct_set_tl_conn(module, tl); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - if (tl == module->rdma_tl || tl == module->am_tl) { BTL_VERBOSE(("tl has flags 0x%" PRIx64, tl->uct_iface_attr.cap.flags)); module->super.btl_flags |= mca_btl_uct_module_flags(tl->uct_iface_attr.cap.flags); + module->super.btl_flags |= MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION; /* the bandwidth and latency numbers relate to both rdma and active messages. need to * come up with a better estimate. 
*/ @@ -577,50 +569,17 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ return OPAL_SUCCESS; } -int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count, - bool evaluate_for_conn_only) +int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count) { - mca_btl_uct_tl_t *tl; + BTL_VERBOSE(("processing %u tls in memory domain %s", tl_count, md->md_name)); for (unsigned i = 0; i < tl_count; ++i) { - int priority = 0; - BTL_VERBOSE(("processing tl %s, evaluate_for_conn_only=%d", tl_descs[i].tl_name, evaluate_for_conn_only)); - - if (!evaluate_for_conn_only) { - priority = mca_btl_uct_include_list_rank (tl_descs[i].tl_name, &mca_btl_uct_component.allowed_transport_list); - BTL_VERBOSE(("tl filter: tl_name = %s, priority = %d", tl_descs[i].tl_name, - priority)); - if (priority < 0) { - continue; - } - } else if (tl_descs[i].dev_type != UCT_DEVICE_TYPE_NET) { - /* only network types are suitable for forming connections */ - continue; - } - - if (0 == strcmp(tl_descs[i].tl_name, "ud")) { - /* ud looks like any normal transport but we do not want to use it for anything other - * than connection management so ensure it gets evaluated last */ - priority = INT_MAX; - } - - tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); + BTL_VERBOSE(("processing tl %s::%s::%s", md->md_name, tl_descs[i].tl_name, tl_descs[i].dev_name)); + /* the priority will be set during module creation */ + mca_btl_uct_tl_t *tl = mca_btl_uct_create_tl(md, tl_descs + i, /*priority=*/0); if (tl) { - if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { - BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); - int rc = mca_btl_uct_set_tl_conn(module, tl); - - if (OPAL_SUCCESS == rc) { - opal_list_append(&md->tls, &tl->super); - return OPAL_SUCCESS; - } - - BTL_VERBOSE(("tl %s cannot be used for forming 
connections", tl_descs[i].tl_name)); - } else { - opal_list_append(&md->tls, &tl->super); - } + opal_list_append(&md->tls, &tl->super); } } @@ -629,55 +588,5 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, return OPAL_ERR_NOT_AVAILABLE; } - opal_list_sort(&md->tls, tl_compare); - - OPAL_LIST_FOREACH (tl, &md->tls, mca_btl_uct_tl_t) { - mca_btl_uct_evaluate_tl(module, tl); - if (NULL != module->am_tl && NULL != module->rdma_tl - && (NULL != module->conn_tl - || !(mca_btl_uct_tl_requires_connection_tl(module->am_tl) - || mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)))) { - /* all done */ - break; - } - } - - if (NULL == module->rdma_tl) { - /* no rdma tls */ - BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); - - module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA; - module->super.btl_put = NULL; - module->super.btl_get = NULL; - module->super.btl_atomic_fop = NULL; - module->super.btl_atomic_op = NULL; - } - - if (NULL == module->am_tl) { - /* no active message tls == no send/recv */ - BTL_VERBOSE(("no active message tl matched supplied filter. 
disabling send/recv support")); - - module->super.btl_send = NULL; - module->super.btl_sendi = NULL; - module->super.btl_alloc = NULL; - module->super.btl_free = NULL; - } - - if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl)) - && !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) - && module->conn_tl) { - /* no connection tl needed for selected transports */ - module->conn_tl = NULL; - } - - /* clear out unused tls */ - mca_btl_uct_tl_t *next; - OPAL_LIST_FOREACH_SAFE(tl, next, &md->tls, mca_btl_uct_tl_t) { - if (tl != module->conn_tl && tl != module->rdma_tl && tl != module->am_tl) { - opal_list_remove_item(&md->tls, &tl->super); - OBJ_RELEASE(tl); - } - } - return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 9b3f3cccbb3..d1625fa9bef 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -10,11 +10,16 @@ * $HEADER$ */ +#include + #if !defined(BTL_UCT_TYPES_H) # define BTL_UCT_TYPES_H # include "opal/mca/btl/btl.h" +#include "opal/class/opal_fifo.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_object.h" #include "opal/mca/timer/base/base.h" /* forward declarations */ @@ -65,7 +70,11 @@ typedef struct mca_btl_uct_modex_t mca_btl_uct_modex_t; */ struct mca_btl_uct_md_t { /** make this an opal object */ - opal_object_t super; + opal_list_item_t super; + + /** if true none of the tls in this domain will be used + * for communication */ + bool connection_only_domain; /** name of the memory domain backing this module */ char *md_name; @@ -76,6 +85,9 @@ struct mca_btl_uct_md_t { /** UCT memory domain handle */ uct_md_h uct_md; + /** memory domain attributes */ + uct_md_attr_t md_attr; + #if UCT_API >= UCT_VERSION(1, 7) uct_component_h uct_component; #endif @@ -336,6 +348,9 @@ struct mca_btl_uct_tl_t { /** device name for this tl (used for creating device contexts) */ char *uct_dev_name; + /** UCT 
device type from the tl description */ + uct_device_type_t dev_type; + /** maximum number of device contexts that can be created */ int max_device_contexts; @@ -372,12 +387,31 @@ OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); * */ struct mca_btl_uct_include_list_t { + opal_object_t super; + /** argv-style (NULL terminated) array of strings */ char **list; /** is an inclusive list (vs exclusive) */ bool include; }; typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; +OBJ_CLASS_DECLARATION(mca_btl_uct_include_list_t); +struct mca_btl_uct_tl_modex_t { + /** total size of this modex */ + uint16_t size; + char tl_name[UCT_TL_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_tl_modex_t mca_btl_uct_tl_modex_t; + +struct mca_btl_uct_md_modex_t { + /** total size of this modex */ + uint16_t size; + uint16_t module_index; + char md_name[UCT_MD_NAME_MAX]; + uint8_t data[]; +} __opal_attribute_packed__; +typedef struct mca_btl_uct_md_modex_t mca_btl_uct_md_modex_t; #endif /* !defined(BTL_UCT_TYPES_H) */ From 349db3bb8e755bef99d60c07ab27fca86bea49df Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 1 May 2025 17:14:47 +0000 Subject: [PATCH 13/13] btl/uct: move device context code to a new file There is a specific header for device contexts so it makes sense to move the context-specific code to a matching C file. No changes in this other than moving code around. 
Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/Makefile.am | 3 +- opal/mca/btl/uct/btl_uct_device_context.c | 143 ++++++++++++++++++++++ opal/mca/btl/uct/btl_uct_tl.c | 113 ----------------- 3 files changed, 145 insertions(+), 114 deletions(-) create mode 100644 opal/mca/btl/uct/btl_uct_device_context.c diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am index 92e5ab070d9..11799cfe3fe 100644 --- a/opal/mca/btl/uct/Makefile.am +++ b/opal/mca/btl/uct/Makefile.am @@ -48,7 +48,8 @@ sources = \ btl_uct_tl.c \ btl_uct_discover.c \ btl_uct_modex.c \ - btl_uct_include_list.c + btl_uct_include_list.c \ + btl_uct_device_context.c # Make the output library in this directory, and name it either # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la diff --git a/opal/mca/btl/uct/btl_uct_device_context.c b/opal/mca/btl/uct/btl_uct_device_context.c new file mode 100644 index 00000000000..0e6c284ecd9 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_device_context.c @@ -0,0 +1,143 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2018 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include +#include +#include + +#include "btl_uct.h" +#include "btl_uct_device_context.h" +#include "btl_uct_types.h" + +#include "opal/class/opal_free_list.h" +#include "opal/class/opal_object.h" + +#if HAVE_DECL_UCT_CB_FLAG_SYNC +# define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC +#else +# define MCA_BTL_UCT_CB_FLAG_SYNC 0 +#endif + +static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) +{ + if (!context->progress_enabled) { +#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE + uct_iface_progress_enable(context->uct_iface, + UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#else + uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); +#endif + context->progress_enabled = true; + } +} + +mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, + mca_btl_uct_tl_t *tl, int context_id, + bool enable_progress) +{ +#if UCT_API >= UCT_VERSION(1, 6) + uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE + | UCT_IFACE_PARAM_FIELD_DEVICE, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#else + uct_iface_params_t iface_params = {.rndv_cb = NULL, + .eager_cb = NULL, + .stats_root = NULL, + .rx_headroom = 0, + .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; +#endif + mca_btl_uct_device_context_t *context; + ucs_status_t ucs_status; + int rc; + + context = calloc(1, sizeof(*context)); + if (OPAL_UNLIKELY(NULL == context)) { + return NULL; + } + + context->context_id = context_id; + context->uct_btl = module; + OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); + OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); + OBJ_CONSTRUCT(&context->rdma_completions, 
opal_free_list_t); + + rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, + opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_context_destroy(context); + return NULL; + } + + /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to + * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their + * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the + * various UCT calls. */ + ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not create a UCT worker")); + mca_btl_uct_context_destroy(context); + return NULL; + } + + ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, + tl->uct_tl_config, &context->uct_iface); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); + mca_btl_uct_context_destroy(context); + return NULL; + } + + if (module != NULL && tl == module->am_tl) { + BTL_VERBOSE(("installing AM handler for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); + uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, + context, MCA_BTL_UCT_CB_FLAG_SYNC); + } + + if (enable_progress) { + BTL_VERBOSE(("enabling progress for tl %s::%s context id %d", + tl->uct_md->md_name, tl->uct_tl_name, context_id)); + mca_btl_uct_context_enable_progress(context); + } + + return context; +} + +void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) +{ + if (context->uct_iface) { + uct_iface_close(context->uct_iface); + context->uct_iface = NULL; + } + + if (context->uct_worker) { + uct_worker_destroy(context->uct_worker); + context->uct_worker = NULL; + } + + OBJ_DESTRUCT(&context->completion_fifo); + OBJ_DESTRUCT(&context->rdma_completions); + free(context); +} + diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index a82c5bc4f89..f55754bc9d8 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -20,12 +20,6 @@ #include "opal/util/bit_ops.h" #include "opal/util/minmax.h" -#if HAVE_DECL_UCT_CB_FLAG_SYNC -# define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC -#else -# define MCA_BTL_UCT_CB_FLAG_SYNC 0 -#endif - /** * @brief Convert UCT capabilities to BTL flags */ @@ -264,19 +258,6 @@ static int mca_btl_uct_setup_connection_tl(mca_btl_uct_tl_t *tl) return UCS_OK == ucs_status ? 
OPAL_SUCCESS : OPAL_ERROR; } -static void mca_btl_uct_context_enable_progress(mca_btl_uct_device_context_t *context) -{ - if (!context->progress_enabled) { -#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE - uct_iface_progress_enable(context->uct_iface, - UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#else - uct_iface_progress_enable(context->uct_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); -#endif - context->progress_enabled = true; - } -} - static int mca_btl_uct_populate_tl_attr(mca_btl_uct_tl_t *tl) { #if UCT_API >= UCT_VERSION(1, 6) uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE @@ -324,100 +305,6 @@ static int mca_btl_uct_populate_tl_attr(mca_btl_uct_tl_t *tl) { return rc; } -mca_btl_uct_device_context_t *mca_btl_uct_context_create(mca_btl_uct_module_t *module, - mca_btl_uct_tl_t *tl, int context_id, - bool enable_progress) -{ -#if UCT_API >= UCT_VERSION(1, 6) - uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE - | UCT_IFACE_PARAM_FIELD_DEVICE, - .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, - .mode = {.device = {.tl_name = tl->uct_tl_name, - .dev_name = tl->uct_dev_name}}}; -#else - uct_iface_params_t iface_params = {.rndv_cb = NULL, - .eager_cb = NULL, - .stats_root = NULL, - .rx_headroom = 0, - .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, - .mode = {.device = {.tl_name = tl->uct_tl_name, - .dev_name = tl->uct_dev_name}}}; -#endif - mca_btl_uct_device_context_t *context; - ucs_status_t ucs_status; - int rc; - - context = calloc(1, sizeof(*context)); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - - context->context_id = context_id; - context->uct_btl = module; - OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); - OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t); - - rc = opal_free_list_init(&context->rdma_completions, sizeof(mca_btl_uct_uct_completion_t), - opal_cache_line_size, 
OBJ_CLASS(mca_btl_uct_uct_completion_t), 0, - opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_uct_context_destroy(context); - return NULL; - } - - /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to - * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their - * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the - * various UCT calls. */ - ucs_status = uct_worker_create(tl->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - BTL_VERBOSE(("could not create a UCT worker")); - mca_btl_uct_context_destroy(context); - return NULL; - } - - ucs_status = uct_iface_open(tl->uct_md->uct_md, context->uct_worker, &iface_params, - tl->uct_tl_config, &context->uct_iface); - if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { - BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status)); - mca_btl_uct_context_destroy(context); - return NULL; - } - - if (module != NULL && tl == module->am_tl) { - BTL_VERBOSE(("installing AM handler for tl %s::%s context id %d", - tl->uct_md->md_name, tl->uct_tl_name, context_id)); - uct_iface_set_am_handler(context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, - context, MCA_BTL_UCT_CB_FLAG_SYNC); - } - - if (enable_progress) { - BTL_VERBOSE(("enabling progress for tl %s::%s context id %d", - tl->uct_md->md_name, tl->uct_tl_name, context_id)); - mca_btl_uct_context_enable_progress(context); - } - - return context; -} - -void mca_btl_uct_context_destroy(mca_btl_uct_device_context_t *context) -{ - if (context->uct_iface) { - uct_iface_close(context->uct_iface); - context->uct_iface = NULL; - } - - if (context->uct_worker) { - uct_worker_destroy(context->uct_worker); - context->uct_worker = NULL; - } - - OBJ_DESTRUCT(&context->completion_fifo); - OBJ_DESTRUCT(&context->rdma_completions); - free(context); -} - static 
mca_btl_uct_tl_t *mca_btl_uct_create_tl(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) {