Skip to content

Commit 1b1b5d5

Browse files
authored
Implement PaddedSpinLock, which avoids false sharing. (#55944)
1 parent 051be13 commit 1b1b5d5

File tree

4 files changed

+75
-10
lines changed

4 files changed

+75
-10
lines changed

NEWS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ Command-line option changes
2424
Multi-threading changes
2525
-----------------------
2626

27+
* A new `AbstractSpinLock` is defined with `SpinLock <: AbstractSpinLock` ([#55944]).
28+
* A new `PaddedSpinLock <: AbstractSpinLock` is defined. It has extra padding to avoid false sharing ([#55944]).
29+
* New types are defined to handle the pattern of code that must run once per process:
30+
a `OncePerProcess{T}` type, which allows defining a function that should be run exactly once
31+
the first time it is called, and then always return the same result value of type `T`
32+
every subsequent time afterwards. There are also `OncePerThread{T}` and `OncePerTask{T}` types for
33+
similar usage with threads or tasks. ([#TBD])
34+
2735
Build system changes
2836
--------------------
2937

base/locks-mt.jl

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import .Base: unsafe_convert, lock, trylock, unlock, islocked, wait, notify, AbstractLock
44

55
export SpinLock
6-
6+
public PaddedSpinLock
77
# Important Note: these low-level primitives defined here
88
# are typically not for general usage
99

@@ -12,33 +12,68 @@ export SpinLock
1212
##########################################
1313

1414
"""
15-
SpinLock()
15+
abstract type AbstractSpinLock <: AbstractLock end
1616
17-
Create a non-reentrant, test-and-test-and-set spin lock.
17+
A non-reentrant, test-and-test-and-set spin lock.
1818
Recursive use will result in a deadlock.
1919
This kind of lock should only be used around code that takes little time
2020
to execute and does not block (e.g. perform I/O).
2121
In general, [`ReentrantLock`](@ref) should be used instead.
2222
2323
Each [`lock`](@ref) must be matched with an [`unlock`](@ref).
24-
If [`!islocked(lck::SpinLock)`](@ref islocked) holds, [`trylock(lck)`](@ref trylock)
24+
If [`!islocked(lck::AbstractSpinLock)`](@ref islocked) holds, [`trylock(lck)`](@ref trylock)
2525
succeeds unless there are other tasks attempting to hold the lock "at the same time."
2626
2727
Test-and-test-and-set spin locks are quickest up to about 30ish
2828
contending threads. If you have more contention than that, different
2929
synchronization approaches should be considered.
3030
"""
31-
mutable struct SpinLock <: AbstractLock
31+
abstract type AbstractSpinLock <: AbstractLock end
32+
33+
"""
34+
SpinLock() <: AbstractSpinLock
35+
36+
Spinlocks are not padded, and so may suffer from false sharing.
37+
See also [`PaddedSpinLock`](@ref).
38+
39+
See the documentation for [`AbstractSpinLock`](@ref) regarding correct usage.
40+
"""
41+
mutable struct SpinLock <: AbstractSpinLock
3242
# the lock state is a full Int (larger than a Bool strictly needs); note this struct is
# intentionally unpadded and may suffer from false sharing — see PaddedSpinLock
3343
@atomic owned::Int
3444
SpinLock() = new(0)
3545
end
3646

47+
# TODO: Determine the cache line size using, e.g., CPUID. Meanwhile, 64 bytes is correct for most
48+
# processors (NOTE: some ARM64 parts, e.g. Apple Silicon, use 128-byte cache lines — verify).
49+
const CACHE_LINE_SIZE = 64
50+
51+
"""
52+
PaddedSpinLock() <: AbstractSpinLock
53+
54+
PaddedSpinLocks are padded so that each is guaranteed to be on its own cache line, to avoid
55+
false sharing.
56+
See also [`SpinLock`](@ref).
57+
58+
See the documentation for [`AbstractSpinLock`](@ref) regarding correct usage.
59+
"""
60+
mutable struct PaddedSpinLock <: AbstractSpinLock
61+
# the padding fields before and after `owned` keep the lock word on its own cache line,
# avoiding false sharing with neighboring data
62+
_padding_before::NTuple{max(0, CACHE_LINE_SIZE - sizeof(Int)), UInt8}
63+
@atomic owned::Int
64+
_padding_after::NTuple{max(0, CACHE_LINE_SIZE - sizeof(Int)), UInt8}
65+
function PaddedSpinLock()
66+
l = new()
67+
@atomic l.owned = 0
68+
return l
69+
end
70+
end
71+
3772
# Note: this cannot assert that the lock is held by the correct thread, because we do not
3873
# track which thread locked it. Users beware.
39-
Base.assert_havelock(l::SpinLock) = islocked(l) ? nothing : Base.concurrency_violation()
74+
Base.assert_havelock(l::AbstractSpinLock) = islocked(l) ? nothing : Base.concurrency_violation()
4075

41-
function lock(l::SpinLock)
76+
function lock(l::AbstractSpinLock)
4277
while true
4378
if @inline trylock(l)
4479
return
@@ -49,7 +84,7 @@ function lock(l::SpinLock)
4984
end
5085
end
5186

52-
function trylock(l::SpinLock)
87+
function trylock(l::AbstractSpinLock)
5388
if l.owned == 0
5489
GC.disable_finalizers()
5590
p = @atomicswap :acquire l.owned = 1
@@ -61,7 +96,7 @@ function trylock(l::SpinLock)
6196
return false
6297
end
6398

64-
function unlock(l::SpinLock)
99+
function unlock(l::AbstractSpinLock)
65100
if (@atomicswap :release l.owned = 0) == 0
66101
error("unlock count must match lock count")
67102
end
@@ -70,6 +105,6 @@ function unlock(l::SpinLock)
70105
return
71106
end
72107

73-
function islocked(l::SpinLock)
108+
function islocked(l::AbstractSpinLock)
74109
return (@atomic :monotonic l.owned) != 0
75110
end

doc/src/base/multi-threading.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ Base.@threadcall
6363
These building blocks are used to create the regular synchronization objects.
6464

6565
```@docs
66+
Base.Threads.AbstractSpinLock
6667
Base.Threads.SpinLock
68+
Base.Threads.PaddedSpinLock
6769
```
6870

6971
## Task metrics (Experimental)

test/threads_exec.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,26 @@ if threadpoolsize(:default) > 1
110110
end
111111
end
112112

113+
if threadpoolsize() > 1
114+
let lk = Base.Threads.PaddedSpinLock()
115+
c1 = Base.Event()
116+
c2 = Base.Event()
117+
@test trylock(lk)
118+
@test !trylock(lk)
119+
t1 = Threads.@spawn (notify(c1); lock(lk); unlock(lk); trylock(lk))
120+
t2 = Threads.@spawn (notify(c2); trylock(lk))
121+
Libc.systemsleep(0.1) # block our thread from scheduling for a bit
122+
wait(c1)
123+
wait(c2)
124+
@test !fetch(t2)
125+
@test istaskdone(t2)
126+
@test !istaskdone(t1)
127+
unlock(lk)
128+
@test fetch(t1)
129+
@test istaskdone(t1)
130+
end
131+
end
132+
113133
# threading constructs
114134

115135
@testset "@threads and @spawn threadpools" begin

0 commit comments

Comments
 (0)