Skip to content

Commit d586292

Browse files
hjelmnjsquyres
authored andcommitted
btl/uct: correctly set the completion status before using uct completions
According to UCT documentation the status field must be preset to UCS_OK before the completion is used. My assumption here is that the field is not filled in in all cases leaving the value as potentially garbage. This CL addresses the issue but setting the status in the constructor for both RDMA completions and frags. The status is then reset on completion callback. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 0bccfcd commit d586292

File tree

2 files changed

+16
-29
lines changed

2 files changed

+16
-29
lines changed

opal/mca/btl/uct/btl_uct_frag.c

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -11,29 +12,6 @@
1112

1213
#include "btl_uct_frag.h"
1314

14-
static void mca_btl_uct_frag_completion_compat(uct_completion_t *uct_comp, ucs_status_t status)
15-
{
16-
mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t
17-
*) ((uintptr_t) uct_comp
18-
- offsetof(mca_btl_uct_uct_completion_t,
19-
uct_comp));
20-
21-
BTL_VERBOSE(("frag operation complete. frag = %p. status = %d", (void *) comp->frag, status));
22-
23-
comp->status = status;
24-
opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super);
25-
}
26-
27-
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
28-
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp) {
29-
mca_btl_uct_frag_completion_compat(uct_comp, uct_comp->status);
30-
}
31-
#else
32-
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp, ucs_status_t status) {
33-
mca_btl_uct_frag_completion_compat(uct_comp, status);
34-
}
35-
#endif
36-
3715
static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag)
3816
{
3917
mca_btl_uct_reg_t *reg = (mca_btl_uct_reg_t *) frag->base.super.registration;
@@ -42,14 +20,11 @@ static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag)
4220
memset((char *) frag + sizeof(frag->base), 0, sizeof(*frag) - sizeof(frag->base));
4321

4422
OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t);
23+
frag->comp.frag = frag;
4524

4625
frag->base.des_segments = frag->segments;
4726
frag->base.des_segment_count = 1;
4827

49-
frag->comp.uct_comp.func = mca_btl_uct_frag_completion;
50-
frag->comp.uct_comp.count = 1;
51-
frag->comp.frag = frag;
52-
5328
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
5429
frag->uct_iov.buffer = frag->base.super.ptr;
5530
frag->uct_iov.stride = 0;

opal/mca/btl/uct/btl_uct_rdma.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -18,15 +19,21 @@ static void mca_btl_uct_uct_completion_compat(uct_completion_t *uct_comp, ucs_st
1819
- offsetof(mca_btl_uct_uct_completion_t,
1920
uct_comp));
2021

21-
BTL_VERBOSE(("network operation complete. status = %d", status));
22+
BTL_VERBOSE(("network operation complete. frag = %p, status = %d", (void *) comp->frag, status));
2223

2324
comp->status = status;
2425
opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super);
2526
}
2627

2728
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
2829
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp) {
29-
mca_btl_uct_uct_completion_compat(uct_comp, uct_comp->status);
30+
ucs_status_t status = uct_comp->status;
31+
/* reset the status now as the completion can not be accessed after calling
32+
* mca_btl_uct_uct_completion_compat (may get returned) */
33+
uct_comp->status = UCS_OK;
34+
35+
mca_btl_uct_uct_completion_compat(uct_comp, status);
36+
/* do not access the completion struture here */
3037
}
3138
#else
3239
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp, ucs_status_t status) {
@@ -38,6 +45,11 @@ static void mca_btl_uct_uct_completion_construct(mca_btl_uct_uct_completion_t *c
3845
{
3946
comp->frag = NULL;
4047
comp->uct_comp.func = mca_btl_uct_uct_completion;
48+
comp.uct_comp.count = 1;
49+
50+
#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
51+
comp->uct_comp->status = UCS_OK;
52+
#endif
4153
}
4254

4355
OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t,

0 commit comments

Comments
 (0)