
8361099: Shenandoah: Improve heap lock contention by using CAS for memory allocation #26171

Draft · wants to merge 45 commits into base: master

Changes from all commits · 45 commits
a063a1c
Add allocate_atomic using CAS to ShenandoahHeapRegion
pengxiaolong May 21, 2025
66f3919
Duplicate Z's CPUAffinity in gc shared
pengxiaolong May 21, 2025
90f21c7
Touch up
pengxiaolong May 21, 2025
cd19779
cas_alloc
pengxiaolong Jun 24, 2025
5a6bc1c
CAS allocation for mutators
pengxiaolong Jul 1, 2025
2da4821
Update allocation bias
pengxiaolong Jul 1, 2025
5d0d37f
Humongous allocation and GC shall not use regions reserved for direct…
pengxiaolong Jul 1, 2025
c7ef2ec
Bug fix
pengxiaolong Jul 1, 2025
8237eb6
Bug fix
pengxiaolong Jul 1, 2025
854ba37
Merge branch 'openjdk:master' into cas-alloc
pengxiaolong Jul 1, 2025
11da608
increase_used needs to be called with heap lock
pengxiaolong Jul 1, 2025
60e75f2
Fix errors under race conditions
pengxiaolong Jul 1, 2025
d3cebfc
Fixes
pengxiaolong Jul 2, 2025
4caa801
Fix humongous allocation failure
pengxiaolong Jul 3, 2025
64015b3
Fix more asserts
pengxiaolong Jul 3, 2025
b9c9926
Merge branch 'openjdk:master' into cas-alloc
pengxiaolong Jul 3, 2025
94e538c
Fix build error
pengxiaolong Jul 3, 2025
37cee1f
Remove use of heap lock when update used
pengxiaolong Jul 3, 2025
977bebf
Adjust alloc logic
pengxiaolong Jul 3, 2025
4faf618
Fix build error
pengxiaolong Jul 3, 2025
d509856
More refactors
pengxiaolong Jul 3, 2025
970f3dd
Add todo comments
pengxiaolong Jul 3, 2025
bc5e72a
Revert "Duplicate Z's CPUAffinity in gc shared"
pengxiaolong Jul 3, 2025
103e42f
Steal alloc from other shared regions
pengxiaolong Jul 3, 2025
2f5d818
Use current thread id for hash
pengxiaolong Jul 4, 2025
6aa2dba
Fix build error for Windows
pengxiaolong Jul 4, 2025
ce5616c
Not reserve a region if it is ready for promotion
pengxiaolong Jul 4, 2025
d1d71bc
Only reserve empty region for direct allocation, also take the chance…
pengxiaolong Jul 4, 2025
96db619
reserve region when non-empty region has enough capacity
pengxiaolong Jul 4, 2025
d4dcb28
touch up
pengxiaolong Jul 4, 2025
2ea822c
Allocate new obj before storing the new reserved shared region
pengxiaolong Jul 7, 2025
e4ddfdc
Fix typo
pengxiaolong Jul 7, 2025
dceff3c
Not repeat allocation on the regions already tried before stealing al…
pengxiaolong Jul 7, 2025
138acb7
Fix typo
pengxiaolong Jul 7, 2025
fccbd0d
Fix improper order when release a region from direct allocation
pengxiaolong Jul 7, 2025
3452995
Fix improper order when release a region from direct allocation
pengxiaolong Jul 7, 2025
c93dc01
Fix improper order when release a region from direct allocation
pengxiaolong Jul 7, 2025
3e80fdc
Fix a bug
pengxiaolong Jul 7, 2025
b3d3592
Not update allocation bias
pengxiaolong Jul 7, 2025
9340e6e
Add CPU afinity support and use CPU process id instead thread id for …
pengxiaolong Jul 8, 2025
1557472
Fix wrong include
pengxiaolong Jul 8, 2025
926462f
Fix wrong include
pengxiaolong Jul 8, 2025
c640e68
Use random to decide the start index where mutator starts allocating …
pengxiaolong Jul 8, 2025
ca04034
Delete ShenandoahCPU
pengxiaolong Jul 8, 2025
f4c8e55
Comments to explain ShenandoahDirectlyAllocatableRegionAffinity
pengxiaolong Jul 8, 2025
@@ -27,6 +27,7 @@
#include "gc/shenandoah/shenandoahCollectionSet.hpp"
#include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
#include "gc/shenandoah/shenandoahEvacInfo.hpp"
#include "gc/shenandoah/shenandoahFreeSet.hpp"
#include "gc/shenandoah/shenandoahGeneration.hpp"
#include "gc/shenandoah/shenandoahGenerationalHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.inline.hpp"
@@ -94,6 +95,9 @@ void ShenandoahGenerationalHeuristics::choose_collection_set(ShenandoahCollectio
immediate_regions++;
immediate_garbage += garbage;
region->make_trash_immediate();
if (region->reserved_for_direct_allocation()) {
heap->free_set()->release_directly_allocatable_region(region);
}
} else {
bool is_candidate;
// This is our candidate for later consideration.
@@ -27,6 +27,7 @@
#include "gc/shared/gcCause.hpp"
#include "gc/shenandoah/heuristics/shenandoahHeuristics.hpp"
#include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
#include "gc/shenandoah/shenandoahFreeSet.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.inline.hpp"
#include "gc/shenandoah/shenandoahMarkingContext.inline.hpp"
#include "logging/log.hpp"
@@ -111,6 +112,9 @@ void ShenandoahHeuristics::choose_collection_set(ShenandoahCollectionSet* collec
immediate_regions++;
immediate_garbage += garbage;
region->make_trash_immediate();
if (region->reserved_for_direct_allocation()) {
heap->free_set()->release_directly_allocatable_region(region);
}
} else {
// This is our candidate for later consideration.
candidates[cand_idx].set_region_and_garbage(region, garbage);
4 changes: 4 additions & 0 deletions src/hotspot/share/gc/shenandoah/shenandoahCollectionSet.cpp
@@ -27,6 +27,7 @@

#include "gc/shenandoah/shenandoahAgeCensus.hpp"
#include "gc/shenandoah/shenandoahCollectionSet.hpp"
#include "gc/shenandoah/shenandoahFreeSet.hpp"
#include "gc/shenandoah/shenandoahHeap.inline.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.inline.hpp"
#include "gc/shenandoah/shenandoahHeapRegionSet.hpp"
@@ -101,6 +102,9 @@ void ShenandoahCollectionSet::add_region(ShenandoahHeapRegion* r) {
if (ShenandoahHeap::heap()->mode()->is_generational() && r->age() >= ShenandoahGenerationalHeap::heap()->age_census()->tenuring_threshold()) {
_young_bytes_to_promote += live;
}
if (r->reserved_for_direct_allocation()) {
_heap->free_set()->release_directly_allocatable_region(r);
}
} else if (r->is_old()) {
_old_bytes_to_evacuate += live;
_old_garbage += garbage;
388 changes: 353 additions & 35 deletions src/hotspot/share/gc/shenandoah/shenandoahFreeSet.cpp

Large diffs are not rendered by default.
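
This collapsed file holds the heart of the change: mutator allocations bump a region's top with a CAS instead of taking the heap lock (commit a063a1c adds allocate_atomic to ShenandoahHeapRegion). A minimal sketch of such a CAS bump-pointer fast path, assuming _top is raced only by these CASes and is otherwise updated under the heap lock (details may differ from the actual patch):

    // Sketch only: lock-free bump-pointer allocation within a single region.
    HeapWord* ShenandoahHeapRegion::allocate_atomic(size_t size) {
      while (true) {
        HeapWord* obj = Atomic::load(&_top);
        HeapWord* new_top = obj + size;
        if (new_top > end()) {
          return nullptr;  // Not enough space left; caller tries another region.
        }
        // Publish the new top; if another thread won the race, retry.
        if (Atomic::cmpxchg(&_top, obj, new_top) == obj) {
          return obj;      // [obj, new_top) now belongs to this thread.
        }
      }
    }

Each successful CAS hands the winner a disjoint interval, so the heap lock is needed only on the slow path that retires and replaces regions.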

90 changes: 60 additions & 30 deletions src/hotspot/share/gc/shenandoah/shenandoahFreeSet.hpp
@@ -29,6 +29,7 @@
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegionSet.hpp"
#include "gc/shenandoah/shenandoahSimpleBitMap.hpp"
#include "memory/padded.inline.hpp"

// Each ShenandoahHeapRegion is associated with a ShenandoahFreeSetPartitionId.
enum class ShenandoahFreeSetPartitionId : uint8_t {
@@ -78,10 +79,9 @@ class ShenandoahRegionPartitions {
// are denoted in bytes. Note that some regions that had been assigned to a particular partition at rebuild time
// may have been retired following the rebuild. The tallies for these regions are still reflected in _capacity[p]
// and _used[p], even though the region may have been removed from the free set.
size_t _capacity[UIntNumPartitions];
size_t _used[UIntNumPartitions];
size_t _available[UIntNumPartitions];
size_t _region_counts[UIntNumPartitions];
size_t volatile _capacity[UIntNumPartitions];
size_t volatile _used[UIntNumPartitions];
size_t volatile _region_counts[UIntNumPartitions];

// For each partition p, _left_to_right_bias is true iff allocations are normally made from lower indexed regions
// before higher indexed regions.
@@ -213,56 +213,40 @@ class ShenandoahRegionPartitions {

inline size_t capacity_of(ShenandoahFreeSetPartitionId which_partition) const {
assert (which_partition < NumPartitions, "selected free set must be valid");
return _capacity[int(which_partition)];
return Atomic::load(_capacity + int(which_partition));
}

inline size_t used_by(ShenandoahFreeSetPartitionId which_partition) const {
assert (which_partition < NumPartitions, "selected free set must be valid");
return _used[int(which_partition)];
return Atomic::load(_used + int(which_partition));
}

inline size_t available_in(ShenandoahFreeSetPartitionId which_partition) const {
assert (which_partition < NumPartitions, "selected free set must be valid");
shenandoah_assert_heaplocked();
assert(_available[int(which_partition)] == _capacity[int(which_partition)] - _used[int(which_partition)],
"Expect available (%zu) equals capacity (%zu) - used (%zu) for partition %s",
_available[int(which_partition)], _capacity[int(which_partition)], _used[int(which_partition)],
partition_membership_name(ssize_t(which_partition)));
return _available[int(which_partition)];
return capacity_of(which_partition) - used_by(which_partition);
}

// Return the result of available_in for callers that do not hold the heap lock.
inline size_t available_in_not_locked(ShenandoahFreeSetPartitionId which_partition) const {
assert (which_partition < NumPartitions, "selected free set must be valid");
shenandoah_assert_not_heaplocked();
#ifdef ASSERT
ShenandoahHeapLocker locker(ShenandoahHeap::heap()->lock());
assert((_available[int(which_partition)] == FreeSetUnderConstruction) ||
(_available[int(which_partition)] == _capacity[int(which_partition)] - _used[int(which_partition)]),
"Expect available (%zu) equals capacity (%zu) - used (%zu) for partition %s",
_available[int(which_partition)], _capacity[int(which_partition)], _used[int(which_partition)],
partition_membership_name(ssize_t(which_partition)));
#endif
return _available[int(which_partition)];
return available_in(which_partition);
}

inline void set_capacity_of(ShenandoahFreeSetPartitionId which_partition, size_t value) {
shenandoah_assert_heaplocked();
assert (which_partition < NumPartitions, "selected free set must be valid");
_capacity[int(which_partition)] = value;
_available[int(which_partition)] = value - _used[int(which_partition)];
Atomic::store(_capacity + int(which_partition), value);
}

inline void set_used_by(ShenandoahFreeSetPartitionId which_partition, size_t value) {
shenandoah_assert_heaplocked();
assert (which_partition < NumPartitions, "selected free set must be valid");
_used[int(which_partition)] = value;
_available[int(which_partition)] = _capacity[int(which_partition)] - value;
Atomic::store(_used + int(which_partition), value);
}

inline size_t count(ShenandoahFreeSetPartitionId which_partition) const { return _region_counts[int(which_partition)]; }
inline size_t count(ShenandoahFreeSetPartitionId which_partition) const {
assert (which_partition < NumPartitions, "selected free set must be valid");
return Atomic::load(_region_counts + int(which_partition));
}

// Assure leftmost, rightmost, leftmost_empty, and rightmost_empty bounds are valid for all free sets.
// Valid bounds honor all of the following (where max is the number of heap regions):
@@ -287,6 +271,28 @@ class ShenandoahRegionPartitions {
void assert_bounds() NOT_DEBUG_RETURN;
};

#define DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_AFFINITY ((Thread*)-1)
#define DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_SELF ((Thread*)-2)
// When mutator threads allocate from directly allocatable regions, the allocations should ideally be evenly
// distributed across all such regions; random selection is the most portable way to achieve that. A purely
// random distribution, however, can hurt memory locality, e.g. two consecutive allocations from the same
// thread may land in different regions. ShenandoahDirectlyAllocatableRegionAffinity mitigates this
// locality issue.
// The idea and code are borrowed from ZGC's CPU affinity, but use a random number instead of the CPU id.
class ShenandoahDirectlyAllocatableRegionAffinity : public AllStatic {
struct Affinity {
Thread* _thread;
};

static PaddedEnd<Affinity>* _affinity;
static THREAD_LOCAL Thread* _self;
static THREAD_LOCAL uint _index;
static uint index_slow();
public:
static void initialize();
static uint index();
};
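
A plausible shape for the lookup, following ZGC's cached-affinity pattern (a sketch under assumed details; the random source and slot count are illustrative, not the patch's exact code):

    // Sketch: each thread caches a randomly assigned slot; index() falls back
    // to index_slow() only on a thread's first call.
    inline uint ShenandoahDirectlyAllocatableRegionAffinity::index() {
      if (_self == Thread::current()) {
        return _index;       // Fast path: thread-local cache is valid.
      }
      return index_slow();   // First call on this thread: pick a slot.
    }

    uint ShenandoahDirectlyAllocatableRegionAffinity::index_slow() {
      _self = Thread::current();
      // num_slots is hypothetical: how many directly allocatable regions exist.
      _index = static_cast<uint>(os::random()) % num_slots;
      return _index;
    }

The random pick keeps the scheme portable across platforms, while the thread-local cache keeps consecutive allocations from one thread on the same region.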

// Publicly, ShenandoahFreeSet represents memory that is available to mutator threads. The public capacity(), used(),
// and available() methods represent this public notion of memory that is under control of the mutator. Separately,
// ShenandoahFreeSet also represents memory available to garbage collection activities for compaction purposes.
@@ -315,6 +321,7 @@ class ShenandoahFreeSet : public CHeapObj<mtGC> {
private:
ShenandoahHeap* const _heap;
ShenandoahRegionPartitions _partitions;
ShenandoahHeapRegion** _directly_allocatable_regions;

HeapWord* allocate_aligned_plab(size_t size, ShenandoahAllocRequest& req, ShenandoahHeapRegion* r);

@@ -410,6 +417,21 @@ class ShenandoahFreeSet : public CHeapObj<mtGC> {
// log status, assuming lock has already been acquired by the caller.
void log_status();

template<bool IS_TLAB>
HeapWord* par_allocate_in_for_mutator(ShenandoahHeapRegion* region, ShenandoahAllocRequest &req, bool &in_new_region);

bool try_allocate_directly_allocatable_regions(ShenandoahHeapRegion** shared_region_address[],
ShenandoahHeapRegion* original_shared_regions[],
uint region_count,
ShenandoahAllocRequest &req,
HeapWord* &obj,
bool &in_new_region);
template<bool IS_MUTATOR, bool IS_OLD>
uint iterate_regions_for_alloc(ShenandoahHeapRegionBreakableIterClosure* cl, bool use_empty);

template<typename Iter>
uint iterate_regions_for_alloc(Iter& iterator, ShenandoahHeapRegionBreakableIterClosure* cl);

public:
static const size_t FreeSetUnderConstruction = ShenandoahRegionPartitions::FreeSetUnderConstruction;

@@ -484,6 +506,14 @@ class ShenandoahFreeSet : public CHeapObj<mtGC> {

HeapWord* allocate(ShenandoahAllocRequest& req, bool& in_new_region);

HeapWord* allocate_humongous(ShenandoahAllocRequest &req);

void release_all_directly_allocatable_regions();

void release_directly_allocatable_region(ShenandoahHeapRegion *region);

template<bool IS_TLAB>
HeapWord* par_allocate_single_for_mutator(ShenandoahAllocRequest &req, bool &in_new_region);
/*
* Internal fragmentation metric: describes how fragmented the heap regions are.
*
2 changes: 2 additions & 0 deletions src/hotspot/share/gc/shenandoah/shenandoahFullGC.cpp
@@ -219,6 +219,8 @@ void ShenandoahFullGC::do_it(GCCause::Cause gc_cause) {
heap->tlabs_retire(ResizeTLAB);
}

heap->free_set()->release_all_directly_allocatable_regions();

OrderAccess::fence();

phase1_mark_heap();
2 changes: 1 addition & 1 deletion src/hotspot/share/gc/shenandoah/shenandoahGeneration.cpp
@@ -565,7 +565,7 @@ size_t ShenandoahGeneration::select_aged_regions(size_t old_available) {
// old generation.
HeapWord* tams = ctx->top_at_mark_start(r);
HeapWord* original_top = r->top();
if (!heap->is_concurrent_old_mark_in_progress() && tams == original_top) {
if (!heap->is_concurrent_old_mark_in_progress() && tams == original_top && !r->reserved_for_direct_allocation()) {
// No allocations from this region have been made during concurrent mark. It meets all the criteria
// for in-place-promotion. Though we only need the value of top when we fill the end of the region,
// we use this field to indicate that this region should be promoted in place during the evacuation
20 changes: 18 additions & 2 deletions src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
@@ -976,7 +976,7 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
}

if (!ShenandoahAllocFailureALot || !should_inject_alloc_failure()) {
result = allocate_memory_under_lock(req, in_new_region);
result = allocate_memory_for_mutator(req, in_new_region);
}

// Check that gc overhead is not exceeded.
@@ -1008,7 +1008,7 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
const size_t original_count = shenandoah_policy()->full_gc_count();
while (result == nullptr && should_retry_allocation(original_count)) {
control_thread()->handle_alloc_failure(req, true);
result = allocate_memory_under_lock(req, in_new_region);
result = allocate_memory_for_mutator(req, in_new_region);
}
if (result != nullptr) {
// If our allocation request has been satisfied after it initially failed, we count this as good gc progress
@@ -1062,6 +1062,22 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
return result;
}

HeapWord* ShenandoahHeap::allocate_memory_for_mutator(ShenandoahAllocRequest& req, bool& in_new_region) {
assert(req.is_mutator_alloc(), "Sanity");
assert(!req.is_old(), "Sanity");
shenandoah_assert_not_heaplocked();
ShenandoahFreeSet* free_set = ShenandoahHeap::free_set();
if (ShenandoahHeapRegion::requires_humongous(req.size())) {
in_new_region = true;
return free_set->allocate_humongous(req);
}
if (req.is_lab_alloc()) {
return free_set->par_allocate_single_for_mutator<true>(req, in_new_region);
} else {
return free_set->par_allocate_single_for_mutator<false>(req, in_new_region);
}
}

inline bool ShenandoahHeap::should_retry_allocation(size_t original_full_gc_count) const {
return shenandoah_policy()->full_gc_count() == original_full_gc_count
&& !shenandoah_policy()->is_at_shutdown();
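allocate_memory_for_mutator above routes every non-humongous mutator request to the lock-free free-set path. The body of par_allocate_single_for_mutator lives in the collapsed shenandoahFreeSet.cpp diff; judging from the declarations and the commit history (affinity index, then stealing from other shared regions), its control flow is roughly the following sketch, where _directly_allocatable_region_count is a hypothetical field for the number of reserved regions:

    template<bool IS_TLAB>
    HeapWord* ShenandoahFreeSet::par_allocate_single_for_mutator(ShenandoahAllocRequest& req, bool& in_new_region) {
      const uint count = _directly_allocatable_region_count;  // hypothetical
      const uint start = ShenandoahDirectlyAllocatableRegionAffinity::index() % count;
      // Probe the affine region first, then its neighbors ("stealing"),
      // all without taking the heap lock.
      for (uint i = 0; i < count; i++) {
        ShenandoahHeapRegion* r = Atomic::load(&_directly_allocatable_regions[(start + i) % count]);
        if (r != nullptr) {
          HeapWord* obj = par_allocate_in_for_mutator<IS_TLAB>(r, req, in_new_region);
          if (obj != nullptr) {
            return obj;
          }
        }
      }
      // All reserved regions are full: fall back to the heap-locked path,
      // which can also install fresh directly allocatable regions.
      return allocate(req, in_new_region);
    }
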
7 changes: 7 additions & 0 deletions src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
@@ -118,6 +118,12 @@ class ShenandoahHeapRegionClosure : public StackObj {
virtual bool is_thread_safe() { return false; }
};

class ShenandoahHeapRegionBreakableIterClosure : public StackObj {
public:
// Return true to break the iteration loop.
virtual bool heap_region_do(ShenandoahHeapRegion* r) { return false; };
};
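
The breakable closure lets region walks stop as soon as the caller's condition is met instead of visiting every region (the free set's iterate_regions_for_alloc takes one). A hypothetical usage, not taken from the patch:

    // Sketch: find the first region with at least min_free bytes available.
    class ShenandoahFindFreeRegionClosure : public ShenandoahHeapRegionBreakableIterClosure {
      const size_t _min_free;
      ShenandoahHeapRegion* _result;
    public:
      explicit ShenandoahFindFreeRegionClosure(size_t min_free)
        : _min_free(min_free), _result(nullptr) {}
      bool heap_region_do(ShenandoahHeapRegion* r) override {
        if (r->free() >= _min_free) {
          _result = r;
          return true;   // Break the iteration.
        }
        return false;    // Keep scanning.
      }
      ShenandoahHeapRegion* result() const { return _result; }
    };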

typedef ShenandoahLock ShenandoahHeapLock;
typedef ShenandoahLocker ShenandoahHeapLocker;
typedef Stack<oop, mtGC> ShenandoahScanObjectStack;
@@ -691,6 +697,7 @@ class ShenandoahHeap : public CollectedHeap {

private:
HeapWord* allocate_memory_under_lock(ShenandoahAllocRequest& request, bool& in_new_region);
HeapWord* allocate_memory_for_mutator(ShenandoahAllocRequest& request, bool& in_new_region);
HeapWord* allocate_from_gclab_slow(Thread* thread, size_t size);
HeapWord* allocate_new_gclab(size_t min_size, size_t word_size, size_t* actual_size);

17 changes: 10 additions & 7 deletions src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp
@@ -89,6 +89,7 @@ ShenandoahHeapRegion::ShenandoahHeapRegion(HeapWord* start, size_t index, bool c
SpaceMangler::mangle_region(MemRegion(_bottom, _end));
}
_recycling.unset();
_direct_alloc_reserved.unset();
}

void ShenandoahHeapRegion::report_illegal_transition(const char *method) {
@@ -370,25 +371,25 @@ void ShenandoahHeapRegion::make_committed_bypass() {
}

void ShenandoahHeapRegion::reset_alloc_metadata() {
_tlab_allocs = 0;
_gclab_allocs = 0;
_plab_allocs = 0;
Atomic::store(&_tlab_allocs, size_t(0));
Atomic::store(&_gclab_allocs, size_t(0));
Atomic::store(&_plab_allocs, size_t(0));
}

size_t ShenandoahHeapRegion::get_shared_allocs() const {
return used() - (_tlab_allocs + _gclab_allocs + _plab_allocs) * HeapWordSize;
return used() - (Atomic::load(&_tlab_allocs) + Atomic::load(&_gclab_allocs) + Atomic::load(&_plab_allocs)) * HeapWordSize;
}

size_t ShenandoahHeapRegion::get_tlab_allocs() const {
return _tlab_allocs * HeapWordSize;
return Atomic::load(&_tlab_allocs) * HeapWordSize;
}

size_t ShenandoahHeapRegion::get_gclab_allocs() const {
return _gclab_allocs * HeapWordSize;
return Atomic::load(&_gclab_allocs) * HeapWordSize;
}

size_t ShenandoahHeapRegion::get_plab_allocs() const {
return _plab_allocs * HeapWordSize;
return Atomic::load(&_plab_allocs) * HeapWordSize;
}

void ShenandoahHeapRegion::set_live_data(size_t s) {
@@ -854,6 +855,8 @@ size_t ShenandoahHeapRegion::pin_count() const {
}

void ShenandoahHeapRegion::set_affiliation(ShenandoahAffiliation new_affiliation) {
assert(new_affiliation != OLD_GENERATION || !reserved_for_direct_allocation(), "Reserved region can't move to old");

ShenandoahHeap* heap = ShenandoahHeap::heap();

ShenandoahAffiliation region_affiliation = heap->region_affiliation(this);