
Commit 37f9aff

mtl/ofi: Add MCA variables to enable SEP and to request number of OFI contexts
Moving to a model where users actively _enable_ the SEP feature rather than opening SEPs by default whenever the provider supports them. This avoids regressing (either functionally or for performance reasons) any apps that were working correctly on regular endpoints.

Also provides an MCA variable to specify the number of OFI contexts to create, defaulting to 1. (Since btl/ofi also creates one context by default, this reduces the incidence of a scenario where we allocate all available contexts by default and the provider then breaks when btl/ofi asks for one more, as it cannot support it.)

While at it, spruce up the README's SEP content.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
1 parent d1fd1f4 commit 37f9aff

File tree: 5 files changed, +203 −121 lines


ompi/mca/mtl/ofi/README

Lines changed: 44 additions & 8 deletions
@@ -111,11 +111,22 @@ favours only creating as many contexts as needed.
 }
 }
 
-2. MCA variable:
+2. MCA variables:
 To utilize the feature, the following MCA variable needs to be set:
+mtl_ofi_enable_sep:
+  This MCA variable needs to be set to enable the use of Scalable Endpoints
+  feature in the OFI MTL. The underlying provider is also checked to ensure the
+  feature is supported. If the provider chosen does not support it, user needs
+  to either set this variable to 0 or select different provider which supports
+  the feature.
+
+  Default: 0
+
+  Command-line syntax:
+  "-mca mtl_ofi_enable_sep 1"
+
 mtl_ofi_thread_grouping:
-  This MCA variable is at the OFI MTL level and needs to be set to switch
-  the feature on.
+  This MCA variable needs to be set to switch Thread Grouping feature on.
 
   Default: 0
 
@@ -124,21 +135,46 @@ To utilize the feature, the following MCA variable needs to be set:
 - Applications that have multiple threads using a single communicator as
   it may degrade performance.
 
-Command-line syntax to set the MCA variable:
-"-mca mtl_ofi_thread_grouping 1"
+  Command-line syntax:
+  "-mca mtl_ofi_thread_grouping 1"
+
+mtl_ofi_num_ctxts:
+  MCA variable allows user to set the number of OFI contexts the applications
+  expects to use. For multi-threaded applications using Thread Grouping
+  feature, this number should be set to the number of user threads that will
+  call into MPI. For single-threaded applications one OFI context is
+  sufficient.
+
+  Default: 1
+
+  Command-line syntax:
+  "-mca mtl_ofi_num_ctxts N" [ N: number of OFI contexts required by
+                               application ]
 
 3. Notes on performance:
-- OFI MTL will create as many TX/RX contexts as allowed by an underlying
-  provider (each provider may have different thresholds). Once the threshold
+- OFI MTL will create as many TX/RX contexts as set by MCA mtl_ofi_num_ctxts.
+  The number of contexts that can be created is also limited by the underlying
+  provider as each provider may have different thresholds. Once the threshold
   is exceeded, contexts are used in a round-robin fashion which leads to
   resource sharing among threads. Therefore locks are required to guard
   against race conditions. For performance, it is recommended to have
 
-  Number of communicators = Number of contexts
+  Number of threads = Number of communicators = Number of contexts
 
   For example, when using PSM2 provider, the number of contexts is dictated
   by the Intel Omni-Path HFI1 driver module.
 
+- OPAL layer allows for multiple threads to enter progress simultaneously. To
+  enable this feature, user needs to set MCA variable
+  "max_thread_in_progress". When using Thread Grouping feature, it is
+  recommended to set this MCA parameter to the number of threads expected to
+  call into MPI as it provides performance benefits.
+
+  Command-line syntax:
+  "-mca opal_max_thread_in_progress N" [ N: number of threads expected to
+                                         make MPI calls ]
+  Default: 1
+
 - For applications using a single thread with multiple communicators and MCA
   variable "mtl_ofi_thread_grouping" set to 1, the MTL will use multiple
   contexts, but the benefits may be negligible as only one thread is driving

ompi/mca/mtl/ofi/help-mtl-ofi.txt

Lines changed: 32 additions & 7 deletions
@@ -26,17 +26,42 @@ fi_info -v -p %s
   Location: %s:%d
 
 [SEP unavailable]
-Scalable Endpoint feature is required for Thread Grouping feature to work
-but it is not supported by %s provider. Try disabling this feature.
+Scalable Endpoint feature is enabled by the user but it is not supported by
+%s provider. Try disabling this feature or use a different provider that
+supports it using mtl_ofi_provider_include.
 
   Local host: %s
   Location: %s:%d
 
-[SEP ctxt limit]
-Reached limit (%d) for number of OFI contexts that can be opened with the
-provider. Creating new communicators beyond this limit is possible but
-they will re-use existing contexts in round-robin fashion.
-Using new communicators beyond the limit will impact performance.
+[SEP required]
+Scalable Endpoint feature is required for Thread Grouping feature to work.
+Please try enabling Scalable Endpoints using mtl_ofi_enable_sep.
+
+  Local host: %s
+  Location: %s:%d
+
+[SEP thread grouping ctxt limit]
+Reached limit (%d) for number of OFI contexts set by mtl_ofi_num_ctxts.
+Please set mtl_ofi_num_ctxts to a larger value if you need more contexts.
+If an MPI application creates more communicators than mtl_ofi_num_ctxts,
+OFI MTL will make the new communicators re-use existing contexts in
+round-robin fashion which will impact performance.
+
+  Local host: %s
+  Location: %s:%d
+
+[Local ranks exceed ofi contexts]
+Number of local ranks exceed the number of available OFI contexts in %s
+provider and we cannot provision enough contexts for each rank. Try disabling
+Scalable Endpoint feature.
+
+  Local host: %s
+  Location: %s:%d
+
+[Ctxts exceeded available]
+User requested for more than available contexts from provider. Limiting
+to max allowed (%d). Contexts will be re used in round-robin fashion if there
+are more threads than the available contexts.
 
   Local host: %s
   Location: %s:%d

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 36 additions & 43 deletions
@@ -327,16 +327,7 @@ ompi_mtl_ofi_isend_callback(struct fi_cq_tagged_entry *wc,
 
 #define MTL_OFI_MAP_COMM_TO_CONTEXT(comm_id, ctxt_id) \
     do { \
-        if (ompi_mtl_ofi.thread_grouping && \
-            (!ompi_mtl_ofi.threshold_comm_context_id || \
-             ((uint32_t) ompi_mtl_ofi.threshold_comm_context_id > comm_id))) { \
-            ctxt_id = ompi_mtl_ofi.comm_to_context[comm_id]; \
-        } else if (ompi_mtl_ofi.thread_grouping) { \
-            /* Round-robin assignment of contexts if threshold is reached */ \
-            ctxt_id = comm_id % ompi_mtl_ofi.total_ctxts_used; \
-        } else { \
-            ctxt_id = 0; \
-        } \
+        ctxt_id = ompi_mtl_ofi.comm_to_context[comm_id]; \
     } while (0);
 
 __opal_attribute_always_inline__ static inline int
@@ -348,40 +339,40 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
                         uint64_t *match_bits,
                         int tag)
 {
-     ssize_t ret = OMPI_SUCCESS;
-     int ctxt_id = 0;
+    ssize_t ret = OMPI_SUCCESS;
+    int ctxt_id = 0;
 
-     MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
-     set_thread_context(ctxt_id);
+    MTL_OFI_MAP_COMM_TO_CONTEXT(comm->c_contextid, ctxt_id);
+    set_thread_context(ctxt_id);
 
-     ack_req = malloc(sizeof(ompi_mtl_ofi_request_t));
-     assert(ack_req);
+    ack_req = malloc(sizeof(ompi_mtl_ofi_request_t));
+    assert(ack_req);
 
-     ack_req->parent = ofi_req;
-     ack_req->event_callback = ompi_mtl_ofi_send_ack_callback;
-     ack_req->error_callback = ompi_mtl_ofi_send_ack_error_callback;
+    ack_req->parent = ofi_req;
+    ack_req->event_callback = ompi_mtl_ofi_send_ack_callback;
+    ack_req->error_callback = ompi_mtl_ofi_send_ack_error_callback;
 
-     ofi_req->completion_count += 1;
+    ofi_req->completion_count += 1;
 
-     MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
-                                       NULL,
-                                       0,
-                                       NULL,
-                                       *src_addr,
-                                       *match_bits | ompi_mtl_ofi.sync_send_ack,
-                                       0, /* Exact match, no ignore bits */
-                                       (void *) &ack_req->ctx), ret);
-     if (OPAL_UNLIKELY(0 > ret)) {
-         opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
-                             "%s:%d: fi_trecv failed: %s(%zd)",
-                             __FILE__, __LINE__, fi_strerror(-ret), ret);
-         free(ack_req);
-         return ompi_mtl_ofi_get_error(ret);
-     }
+    MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
+                                      NULL,
+                                      0,
+                                      NULL,
+                                      *src_addr,
+                                      *match_bits | ompi_mtl_ofi.sync_send_ack,
+                                      0, /* Exact match, no ignore bits */
+                                      (void *) &ack_req->ctx), ret);
+    if (OPAL_UNLIKELY(0 > ret)) {
+        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
+                            "%s:%d: fi_trecv failed: %s(%zd)",
+                            __FILE__, __LINE__, fi_strerror(-ret), ret);
+        free(ack_req);
+        return ompi_mtl_ofi_get_error(ret);
+    }
 
-     /* The SYNC_SEND tag bit is set for the send operation only.*/
-     MTL_OFI_SET_SYNC_SEND(*match_bits);
-     return OMPI_SUCCESS;
+    /* The SYNC_SEND tag bit is set for the send operation only.*/
+    MTL_OFI_SET_SYNC_SEND(*match_bits);
+    return OMPI_SUCCESS;
 }
 
 __opal_attribute_always_inline__ static inline int
@@ -1242,13 +1233,15 @@ static int ompi_mtl_ofi_init_contexts(struct mca_mtl_base_module_t *mtl,
     }
 
     /*
-     * We only create upto Max number of contexts allowed by provider.
+     * We only create upto Max number of contexts asked for by the user.
      * If user enables thread grouping feature and creates more number of
-     * communicators than we have contexts, then we set the threshold
-     * context_id so we know to use context 0 for operations involving these
-     * "extra" communicators.
+     * communicators than available contexts, then we set the threshold
+     * context_id so that new communicators created beyond the threshold
+     * will be assigned to contexts in a round-robin fashion.
      */
-    if (ompi_mtl_ofi.max_ctx_cnt <= ctxt_id) {
+    if (ompi_mtl_ofi.num_ofi_contexts <= ompi_mtl_ofi.total_ctxts_used) {
+        ompi_mtl_ofi.comm_to_context[comm->c_contextid] = comm->c_contextid %
+                                                          ompi_mtl_ofi.total_ctxts_used;
         if (!ompi_mtl_ofi.threshold_comm_context_id) {
             ompi_mtl_ofi.threshold_comm_context_id = comm->c_contextid;
