
Commit 4241d4c

d-netto authored and KristofferC committed
GC scheduler refinements (#52294)
Supersedes #51061 and #51414. Still needs more perf analysis.
(cherry picked from commit e26c257)
1 parent 6627b91, commit 4241d4c

File tree

2 files changed: +101 -29 lines changed


src/gc.c

Lines changed: 101 additions & 16 deletions
@@ -27,6 +27,8 @@ uv_mutex_t gc_threads_lock;
 uv_cond_t gc_threads_cond;
 // To indicate whether concurrent sweeping should run
 uv_sem_t gc_sweep_assists_needed;
+// Mutex used to coordinate entry of GC threads in the mark loop
+uv_mutex_t gc_queue_observer_lock;
 
 // Linked list of callback functions

@@ -2706,8 +2708,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
     jl_gc_markqueue_t *mq_master = NULL;
     int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
     void *new_obj;
     jl_gc_chunk_t c;
     pop : {
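
Aside: the per-thread `mark_queue` that `gc_mark_and_steal` pops from and steals out of is a Chase-Lev work-stealing deque, and the termination argument in the next hunk leans on its semantics. As a rough, self-contained sketch of the thief side (simplified for illustration; `ws_queue_t` and its fields are hypothetical stand-ins, not Julia's actual queue, which also handles chunked work items and buffer growth):

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical, simplified Chase-Lev deque: the owner pushes/pops at
 * `bottom`; thieves take from `top`. */
typedef struct {
    _Atomic int64_t top;    /* next index a thief would take */
    _Atomic int64_t bottom; /* next index the owner would push to */
    void *_Atomic *buffer;  /* ring buffer of work items */
    int64_t mask;           /* capacity - 1, capacity a power of two */
} ws_queue_t;

/* Returns an item, or NULL if the deque appeared empty or another thread
 * won the race for slot `t`. The property the termination proof relies on
 * is that a steal fails on a non-empty deque only because another thread
 * took the contested item, never spuriously. */
static void *ws_steal(ws_queue_t *q)
{
    int64_t t = atomic_load(&q->top);
    int64_t b = atomic_load(&q->bottom);
    if (t >= b)
        return NULL; /* empty */
    void *item = atomic_load(&q->buffer[t & q->mask]);
    /* claim slot `t`; fails iff `top` already moved past `t` */
    if (!atomic_compare_exchange_strong(&q->top, &t, t + 1))
        return NULL;
    return item;
}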
@@ -2782,28 +2786,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     }
 }
 
+size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // assume each chunk is worth 256 units of work and each pointer
+    // is worth 1 unit of work
+    size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
+                         jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
+    return work;
+}
+
+/**
+ * Correctness argument for the mark-loop termination protocol.
+ *
+ * Safety properties:
+ * - No work items shall be in any thread's queues when `gc_mark_loop_barrier`
+ *   observes that `gc_n_threads_marking` is zero.
+ *
+ * - No work item shall be stolen from the master thread (i.e. the mutator thread
+ *   which started GC and which helped the `jl_n_markthreads` - 1 threads to mark)
+ *   after `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero.
+ *   This property is necessary because we call `gc_mark_loop_serial` after marking
+ *   the finalizer list in `_jl_gc_collect`, and want to ensure that we have the
+ *   serial mark-loop semantics there, and that no work is stolen from us at that
+ *   point.
+ *
+ * Proof:
+ * - Suppose the master thread observes that `gc_n_threads_marking` is zero in
+ *   `gc_mark_loop_barrier` and there is a work item left in one thread's queue at
+ *   that point. Since threads try to steal from all threads' queues, this implies
+ *   that all threads must have tried to steal from the queue which still has a
+ *   work item left, but failed to do so, which violates the semantics of
+ *   Chase-Lev's work-stealing queue.
+ *
+ * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the
+ *   event "master thread observes that `gc_n_threads_marking` is zero". Since
+ *   we're using sequentially consistent atomics, E1 => E2. Now suppose one thread
+ *   which is spinning in `gc_should_mark` tries to enter the mark-loop after E2.
+ *   In order to do so, it must increment `gc_n_threads_marking` to 1 in an event
+ *   E3, and then read `gc_master_tid` in an event E4. Since we're using
+ *   sequentially consistent atomics, E3 => E4. Since we observed
+ *   `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude
+ *   E1 => E4, so that the thread which is spinning in `gc_should_mark` must
+ *   observe that `gc_master_tid` is -1 and therefore won't enter the mark-loop.
+ */
+
+int gc_should_mark(jl_ptls_t ptls)
+{
+    int should_mark = 0;
+    int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+    // fast path
+    if (n_threads_marking == 0) {
+        return 0;
+    }
+    uv_mutex_lock(&gc_queue_observer_lock);
+    while (1) {
+        int tid = jl_atomic_load(&gc_master_tid);
+        // fast path
+        if (tid == -1) {
+            break;
+        }
+        n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+        // fast path
+        if (n_threads_marking == 0) {
+            break;
+        }
+        size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
+        for (tid = gc_first_tid; tid < gc_first_tid + jl_n_markthreads; tid++) {
+            work += gc_count_work_in_queue(gc_all_tls_states[tid]);
+        }
+        // if there is a lot of work left, enter the mark loop
+        if (work >= 16 * n_threads_marking) {
+            jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+            should_mark = 1;
+            break;
+        }
+        jl_cpu_pause();
+    }
+    uv_mutex_unlock(&gc_queue_observer_lock);
+    return should_mark;
+}
+
+void gc_wake_all_for_marking(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+}
+
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
 {
-    int backoff = GC_BACKOFF_MIN;
     if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
+        gc_wake_all_for_marking(ptls);
         gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
     }
-    while (jl_atomic_load(&gc_n_threads_marking) > 0) {
-        // Try to become a thief while other threads are marking
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
+    while (1) {
+        int should_mark = gc_should_mark(ptls);
+        if (!should_mark) {
+            break;
         }
+        gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
     }
 }
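
Taken together, `gc_wake_all_for_marking`, `gc_should_mark`, and the barrier implement the E1 => E4 chain from the comment above. Below is a minimal standalone model of that handshake, assuming C11 atomics and pthreads; every name in it (`observer`, `master_tid`, `n_marking`, `observer_lock`) is invented for illustration, and the marking work itself is stubbed out. It sketches the protocol's shape, not Julia's code:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Toy model: `master_tid` plays gc_master_tid and `n_marking` plays
 * gc_n_threads_marking; both use sequentially consistent operations. */
static _Atomic int master_tid = -1;
static _Atomic int n_marking = 0;
static pthread_mutex_t observer_lock = PTHREAD_MUTEX_INITIALIZER;

/* Mirrors gc_should_mark's spin: only one observer holds the lock, and it
 * leaves only via the two "fast path" exits (the work estimate that would
 * let it join the mark loop is stubbed out here). */
static void *observer(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&observer_lock);
    while (1) {
        if (atomic_load(&master_tid) == -1)
            break; /* E1 already happened: marking has terminated */
        if (atomic_load(&n_marking) == 0)
            break; /* all markers drained their queues */
        /* real code estimates remaining work here and may increment
         * n_marking and enter the mark loop */
    }
    pthread_mutex_unlock(&observer_lock);
    return NULL;
}

int main(void)
{
    /* master enters the mark loop ... */
    atomic_store(&master_tid, 0);
    atomic_fetch_add(&n_marking, 1);
    pthread_t t;
    pthread_create(&t, NULL, observer, NULL);
    /* ... and retires: E1 (reset master_tid), then drop out of marking */
    atomic_store(&master_tid, -1);
    atomic_fetch_add(&n_marking, -1);
    pthread_join(&t, NULL);
    /* by E1 => E4, the observer cannot have joined after seeing -1 */
    assert(atomic_load(&n_marking) == 0);
    puts("observer exited without entering the mark loop");
    return 0;
}

Because every access is sequentially consistent, an observer that increments `n_marking` after the master's final decrement must subsequently read `master_tid == -1` and back out, which is the E3/E4 ordering the proof spells out.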

@@ -3543,6 +3627,7 @@ void jl_gc_init(void)
     uv_mutex_init(&gc_threads_lock);
     uv_cond_init(&gc_threads_cond);
     uv_sem_init(&gc_sweep_assists_needed, 0);
+    uv_mutex_init(&gc_queue_observer_lock);
 
     jl_gc_init_page();
     jl_gc_debug_init();
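
For intuition on the entry threshold in `gc_should_mark` above (join only when `work >= 16 * n_threads_marking`, with work weighted per `gc_count_work_in_queue`): a single queued chunk (256 units) clears the bar for up to 16 active markers, while a handful of stray pointers does not. A back-of-the-envelope check (weights taken from the diff; the scenario itself is made up):

#include <stdio.h>

/* Work estimate per gc_count_work_in_queue: a chunk counts as 256 units,
 * a pointer as 1. */
static size_t estimate(size_t chunks, size_t ptrs)
{
    return 256 * chunks + ptrs;
}

int main(void)
{
    int n_threads_marking = 8;                          /* say, 8 threads marking */
    size_t threshold = 16 * (size_t)n_threads_marking;  /* 128 units */
    printf("1 chunk, 0 ptrs:   %zu >= %zu -> %s\n", estimate(1, 0), threshold,
           estimate(1, 0) >= threshold ? "join" : "spin");   /* 256: join */
    printf("0 chunks, 40 ptrs: %zu >= %zu -> %s\n", estimate(0, 40), threshold,
           estimate(0, 40) >= threshold ? "join" : "spin");  /* 40: spin */
    return 0;
}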

src/gc.h

Lines changed: 0 additions & 13 deletions
@@ -187,19 +187,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
 extern jl_gc_global_page_pool_t global_page_pool_clean;
 extern jl_gc_global_page_pool_t global_page_pool_freed;
 
-#define GC_BACKOFF_MIN 4
-#define GC_BACKOFF_MAX 12
-
-STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
-{
-    if (*i < GC_BACKOFF_MAX) {
-        (*i)++;
-    }
-    for (int j = 0; j < (1 << *i); j++) {
-        jl_cpu_pause();
-    }
-}
-
 // Lock-free stack implementation taken
 // from Herlihy's "The Art of Multiprocessor Programming"
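
The deleted `gc_backoff` is what the observer-lock scheme replaces. Each failed steal round doubled the spin: since the exponent is bumped before pausing, the first failure costs 32 `jl_cpu_pause()` calls, rising to a cap of 4096, while the thread still re-polled every queue once per round. A quick tally of that cost (a sketch mirroring the removed code, with the pause replaced by a counter):

#include <stdio.h>

#define GC_BACKOFF_MIN 4
#define GC_BACKOFF_MAX 12

/* Mirrors the removed gc_backoff: bump the exponent (capped), then spin
 * for 1 << exponent pauses; here we just count them. */
static long backoff_pauses(int *i)
{
    if (*i < GC_BACKOFF_MAX)
        (*i)++;
    return 1L << *i;
}

int main(void)
{
    int i = GC_BACKOFF_MIN;
    long total = 0;
    for (int round = 1; round <= 10; round++) {
        long p = backoff_pauses(&i); /* 32, 64, ..., 4096, then 4096 */
        total += p;
        printf("round %2d: %4ld pauses (cumulative %5ld)\n", round, p, total);
    }
    return 0;
}

Under the new scheme in gc.c, at most one idle thread spins while holding `gc_queue_observer_lock` and the rest block on the mutex, so the queues are polled by a single observer instead of by every idle thread at once.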
