Skip to content

Commit c69c53b

Browse files
author
valentin petrov
authored
SCHEDULE: Multithreading fixes (#463) (#470)
* SCHEDULE: fix race in n_completed_tasks * SCHEDULE: fix race in pipelined sched finalize * SCHEDULE: fix race in sched task cleanup (cherry picked from commit b07ebd9)
1 parent e38647f commit c69c53b

File tree

4 files changed

+52
-13
lines changed

4 files changed

+52
-13
lines changed

src/schedule/ucc_schedule.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "utils/ucc_compiler_def.h"
77
#include "components/base/ucc_base_iface.h"
88
#include "coll_score/ucc_coll_score.h"
9+
#include "core/ucc_context.h"
910

1011
ucc_status_t ucc_event_manager_init(ucc_event_manager_t *em)
1112
{
@@ -43,6 +44,7 @@ ucc_status_t ucc_coll_task_init(ucc_coll_task_t *task,
4344
task->n_deps = 0;
4445
task->n_deps_satisfied = 0;
4546
task->bargs.args.mask = 0;
47+
task->schedule = NULL;
4648
if (bargs) {
4749
memcpy(&task->bargs, bargs, sizeof(*bargs));
4850
}
@@ -101,11 +103,11 @@ ucc_schedule_completed_handler(ucc_coll_task_t *parent_task, //NOLINT
101103
ucc_coll_task_t *task)
102104
{
103105
ucc_schedule_t *self = ucc_container_of(task, ucc_schedule_t, super);
106+
uint32_t n_completed_tasks;
104107

105-
// TODO: do we need lock here?
106-
// if tasks in schedule are independet and completes concurently
107-
self->n_completed_tasks += 1;
108-
if (self->n_completed_tasks == self->n_tasks) {
108+
n_completed_tasks = ucc_atomic_fadd32(&self->n_completed_tasks, 1);
109+
110+
if (n_completed_tasks + 1 == self->n_tasks) {
109111
self->super.status = UCC_OK;
110112
ucc_task_complete(&self->super);
111113
}
@@ -125,8 +127,9 @@ ucc_status_t ucc_schedule_init(ucc_schedule_t *schedule, ucc_base_coll_args_t *b
125127

126128
void ucc_schedule_add_task(ucc_schedule_t *schedule, ucc_coll_task_t *task)
127129
{
128-
ucc_event_manager_subscribe(&task->em, UCC_EVENT_COMPLETED,
129-
&schedule->super, ucc_schedule_completed_handler);
130+
ucc_event_manager_subscribe(&task->em, UCC_EVENT_COMPLETED_SCHEDULE,
131+
&schedule->super,
132+
ucc_schedule_completed_handler);
130133
task->schedule = schedule;
131134
schedule->tasks[schedule->n_tasks++] = task;
132135
}

src/schedule/ucc_schedule.h

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ typedef enum {
1818
UCC_EVENT_COMPLETED = 0,
1919
UCC_EVENT_SCHEDULE_STARTED,
2020
UCC_EVENT_TASK_STARTED,
21+
UCC_EVENT_COMPLETED_SCHEDULE, /*< Event is used to notify SCHEDULE that
22+
one of its task has completed */
2123
UCC_EVENT_ERROR,
2224
UCC_EVENT_LAST
2325
} ucc_event_t;
@@ -86,8 +88,8 @@ typedef struct ucc_context ucc_context_t;
8688

8789
typedef struct ucc_schedule {
8890
ucc_coll_task_t super;
89-
int n_completed_tasks;
90-
int n_tasks;
91+
uint32_t n_completed_tasks;
92+
uint32_t n_tasks;
9193
ucc_context_t *ctx;
9294
ucc_coll_task_t *tasks[UCC_SCHEDULE_MAX_TASKS];
9395
} ucc_schedule_t;
@@ -123,11 +125,23 @@ ucc_status_t ucc_triggered_post(ucc_ee_h ee, ucc_ev_t *ev,
123125

124126
static inline ucc_status_t ucc_task_complete(ucc_coll_task_t *task)
125127
{
126-
ucc_status_t status = task->status;
127-
ucc_coll_callback_t cb = task->cb;
128-
int has_cb = task->flags & UCC_COLL_TASK_FLAG_CB;
128+
ucc_status_t status = task->status;
129+
ucc_coll_callback_t cb = task->cb;
130+
int has_cb = task->flags & UCC_COLL_TASK_FLAG_CB;
131+
int has_sched = task->schedule != NULL;
129132

130133
ucc_assert((status == UCC_OK) || (status < 0));
134+
135+
/* If task is part of a schedule then it can be
136+
released during ucc_event_manager_notify(EVENT_COMPLETED_SCHEDULE) below.
137+
Sequence: notify => schedule->n_completed_tasks++ =>
138+
shedule->super.status = UCC_OK => user releases schedule from another
139+
thread => schedule_finalize => schedule finalizes all the tasks.
140+
After that the task ptr should not be accessed.
141+
This is why notification of schedule is done separatly in the end of
142+
this function. Internal implementation must make sure that tasks
143+
with schedules are not released during a callabck (if set). */
144+
131145
if (ucc_likely(status == UCC_OK)) {
132146
status = ucc_event_manager_notify(task, UCC_EVENT_COMPLETED);
133147
} else {
@@ -148,6 +162,10 @@ static inline ucc_status_t ucc_task_complete(ucc_coll_task_t *task)
148162
if (has_cb) {
149163
cb.cb(cb.data, status);
150164
}
165+
if (has_sched && status == UCC_OK) {
166+
status = ucc_event_manager_notify(task, UCC_EVENT_COMPLETED_SCHEDULE);
167+
}
168+
151169
return status;
152170
}
153171

@@ -164,4 +182,5 @@ static inline void ucc_task_subscribe_dep(ucc_coll_task_t *target,
164182
#define UCC_TASK_CORE_CTX(_task) \
165183
(((ucc_coll_task_t *)_task)->team->context->ucc_context)
166184

185+
#define UCC_TASK_THREAD_MODE(_task) (UCC_TASK_CORE_CTX(_task)->thread_mode)
167186
#endif

src/schedule/ucc_schedule_pipelined.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "ucc_schedule.h"
66
#include "ucc_schedule_pipelined.h"
77
#include "coll_score/ucc_coll_score.h"
8+
#include "core/ucc_context.h"
89

910
static ucc_status_t ucc_frag_start_handler(ucc_coll_task_t *parent,
1011
ucc_coll_task_t *task)
@@ -43,6 +44,10 @@ ucc_schedule_pipelined_completed_handler(ucc_coll_task_t *parent_task,
4344
ucc_schedule_t *frag = ucc_derived_of(parent_task, ucc_schedule_t);
4445
int i;
4546

47+
if (UCC_TASK_THREAD_MODE(task) == UCC_THREAD_MULTIPLE) {
48+
ucc_recursive_spin_lock(&schedule->lock);
49+
}
50+
4651
schedule->super.n_completed_tasks += 1;
4752
schedule->n_frags_in_pipeline--;
4853
ucc_trace_req(
@@ -52,6 +57,9 @@ ucc_schedule_pipelined_completed_handler(ucc_coll_task_t *parent_task,
5257
ucc_assert(frag->super.status == UCC_OK);
5358
if (schedule->super.n_completed_tasks == schedule->super.n_tasks) {
5459
schedule->super.super.status = UCC_OK;
60+
if (UCC_TASK_THREAD_MODE(task) == UCC_THREAD_MULTIPLE) {
61+
ucc_recursive_spin_unlock(&schedule->lock);
62+
}
5563
ucc_task_complete(task);
5664
return UCC_OK;
5765
}
@@ -75,6 +83,9 @@ ucc_schedule_pipelined_completed_handler(ucc_coll_task_t *parent_task,
7583
break;
7684
}
7785
}
86+
if (UCC_TASK_THREAD_MODE(task) == UCC_THREAD_MULTIPLE) {
87+
ucc_recursive_spin_unlock(&schedule->lock);
88+
}
7889
return UCC_OK;
7990
}
8091

@@ -89,6 +100,7 @@ ucc_status_t ucc_schedule_pipelined_finalize(ucc_coll_task_t *task)
89100
for (i = 0; i < schedule_p->n_frags; i++) {
90101
schedule_p->frags[i]->super.finalize(&frags[i]->super);
91102
}
103+
ucc_recursive_spinlock_destroy(&schedule_p->lock);
92104
return UCC_OK;
93105
}
94106

@@ -143,6 +155,8 @@ ucc_status_t ucc_schedule_pipelined_init(
143155
return status;
144156
}
145157

158+
ucc_recursive_spinlock_init(&schedule->lock, 0);
159+
146160
schedule->super.n_tasks = n_frags_total;
147161
schedule->n_frags = n_frags;
148162
schedule->sequential = sequential;
@@ -158,6 +172,7 @@ ucc_status_t ucc_schedule_pipelined_init(
158172
ucc_error("failed to initialize fragment for pipeline");
159173
goto err;
160174
}
175+
frags[i]->super.schedule = &schedule->super;
161176
frags[i]->super.status = UCC_OPERATION_INITIALIZED;
162177
frags[i]->super.super.status = UCC_OPERATION_INITIALIZED;
163178
}
@@ -176,8 +191,9 @@ ucc_status_t ucc_schedule_pipelined_init(
176191
UCC_EVENT_SCHEDULE_STARTED,
177192
&frags[i]->super, ucc_frag_start_handler);
178193
ucc_event_manager_subscribe(
179-
&frags[i]->super.em, UCC_EVENT_COMPLETED, &schedule->super.super,
180-
ucc_schedule_pipelined_completed_handler);
194+
&frags[i]->super.em, UCC_EVENT_COMPLETED_SCHEDULE,
195+
&schedule->super.super,
196+
ucc_schedule_pipelined_completed_handler);
181197
}
182198
return UCC_OK;
183199
err:

src/schedule/ucc_schedule_pipelined.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ typedef struct ucc_schedule_pipelined {
4646
int sequential;
4747
int next_frag_to_post;
4848
ucc_schedule_frag_setup_fn_t frag_setup;
49+
ucc_recursive_spinlock_t lock;
4950
} ucc_schedule_pipelined_t;
5051

5152
/* Creates a pipelined schedule for the algorithm defined by "frag_init".

0 commit comments

Comments
 (0)