diff --git a/src/hotspot/share/gc/shenandoah/heuristics/shenandoahGenerationalHeuristics.cpp b/src/hotspot/share/gc/shenandoah/heuristics/shenandoahGenerationalHeuristics.cpp index 08fd45993462b..71fff8689ad0e 100644 --- a/src/hotspot/share/gc/shenandoah/heuristics/shenandoahGenerationalHeuristics.cpp +++ b/src/hotspot/share/gc/shenandoah/heuristics/shenandoahGenerationalHeuristics.cpp @@ -27,6 +27,7 @@ #include "gc/shenandoah/shenandoahCollectionSet.hpp" #include "gc/shenandoah/shenandoahCollectorPolicy.hpp" #include "gc/shenandoah/shenandoahEvacInfo.hpp" +#include "gc/shenandoah/shenandoahFreeSet.hpp" #include "gc/shenandoah/shenandoahGeneration.hpp" #include "gc/shenandoah/shenandoahGenerationalHeap.hpp" #include "gc/shenandoah/shenandoahHeapRegion.inline.hpp" @@ -94,6 +95,9 @@ void ShenandoahGenerationalHeuristics::choose_collection_set(ShenandoahCollectio immediate_regions++; immediate_garbage += garbage; region->make_trash_immediate(); + if (region->reserved_for_direct_allocation()) { + heap->free_set()->release_directly_allocatable_region(region); + } } else { bool is_candidate; // This is our candidate for later consideration. diff --git a/src/hotspot/share/gc/shenandoah/heuristics/shenandoahHeuristics.cpp b/src/hotspot/share/gc/shenandoah/heuristics/shenandoahHeuristics.cpp index b151a75e6e7e5..f85ef76e8e27a 100644 --- a/src/hotspot/share/gc/shenandoah/heuristics/shenandoahHeuristics.cpp +++ b/src/hotspot/share/gc/shenandoah/heuristics/shenandoahHeuristics.cpp @@ -27,6 +27,7 @@ #include "gc/shared/gcCause.hpp" #include "gc/shenandoah/heuristics/shenandoahHeuristics.hpp" #include "gc/shenandoah/shenandoahCollectorPolicy.hpp" +#include "gc/shenandoah/shenandoahFreeSet.hpp" #include "gc/shenandoah/shenandoahHeapRegion.inline.hpp" #include "gc/shenandoah/shenandoahMarkingContext.inline.hpp" #include "logging/log.hpp" @@ -111,6 +112,9 @@ void ShenandoahHeuristics::choose_collection_set(ShenandoahCollectionSet* collec immediate_regions++; immediate_garbage += garbage; region->make_trash_immediate(); + if (region->reserved_for_direct_allocation()) { + heap->free_set()->release_directly_allocatable_region(region); + } } else { // This is our candidate for later consideration. 
candidates[cand_idx].set_region_and_garbage(region, garbage); diff --git a/src/hotspot/share/gc/shenandoah/shenandoahCollectionSet.cpp b/src/hotspot/share/gc/shenandoah/shenandoahCollectionSet.cpp index 25b900f8d7772..60acaf349da96 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahCollectionSet.cpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahCollectionSet.cpp @@ -27,6 +27,7 @@ #include "gc/shenandoah/shenandoahAgeCensus.hpp" #include "gc/shenandoah/shenandoahCollectionSet.hpp" +#include "gc/shenandoah/shenandoahFreeSet.hpp" #include "gc/shenandoah/shenandoahHeap.inline.hpp" #include "gc/shenandoah/shenandoahHeapRegion.inline.hpp" #include "gc/shenandoah/shenandoahHeapRegionSet.hpp" @@ -101,6 +102,9 @@ void ShenandoahCollectionSet::add_region(ShenandoahHeapRegion* r) { if (ShenandoahHeap::heap()->mode()->is_generational() && r->age() >= ShenandoahGenerationalHeap::heap()->age_census()->tenuring_threshold()) { _young_bytes_to_promote += live; } + if (r->reserved_for_direct_allocation()) { + _heap->free_set()->release_directly_allocatable_region(r); + } } else if (r->is_old()) { _old_bytes_to_evacuate += live; _old_garbage += garbage; diff --git a/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.cpp b/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.cpp index 1acb6a23e7a4c..4687f91a77896 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.cpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.cpp @@ -33,6 +33,7 @@ #include "gc/shenandoah/shenandoahOldGeneration.hpp" #include "gc/shenandoah/shenandoahSimpleBitMap.hpp" #include "gc/shenandoah/shenandoahSimpleBitMap.inline.hpp" +#include "gc/shenandoah/shenandoahUtils.hpp" #include "gc/shenandoah/shenandoahYoungGeneration.hpp" #include "logging/logStream.hpp" #include "memory/resourceArea.hpp" @@ -229,7 +230,6 @@ void ShenandoahRegionPartitions::make_all_regions_unavailable() { _rightmosts_empty[partition_id] = -1;; _capacity[partition_id] = 0; _used[partition_id] = 0; - _available[partition_id] = FreeSetUnderConstruction; } _region_counts[int(ShenandoahFreeSetPartitionId::Mutator)] = _region_counts[int(ShenandoahFreeSetPartitionId::Collector)] = 0; } @@ -244,21 +244,18 @@ void ShenandoahRegionPartitions::establish_mutator_intervals(idx_t mutator_leftm _leftmosts_empty[int(ShenandoahFreeSetPartitionId::Mutator)] = mutator_leftmost_empty; _rightmosts_empty[int(ShenandoahFreeSetPartitionId::Mutator)] = mutator_rightmost_empty; - _region_counts[int(ShenandoahFreeSetPartitionId::Mutator)] = mutator_region_count; - _used[int(ShenandoahFreeSetPartitionId::Mutator)] = mutator_used; - _capacity[int(ShenandoahFreeSetPartitionId::Mutator)] = mutator_region_count * _region_size_bytes; - _available[int(ShenandoahFreeSetPartitionId::Mutator)] = - _capacity[int(ShenandoahFreeSetPartitionId::Mutator)] - _used[int(ShenandoahFreeSetPartitionId::Mutator)]; + Atomic::store(_region_counts + int(ShenandoahFreeSetPartitionId::Mutator), mutator_region_count); + Atomic::store(_used + int(ShenandoahFreeSetPartitionId::Mutator), mutator_used); + Atomic::store(_capacity + int(ShenandoahFreeSetPartitionId::Mutator), mutator_region_count * _region_size_bytes); _leftmosts[int(ShenandoahFreeSetPartitionId::Collector)] = _max; _rightmosts[int(ShenandoahFreeSetPartitionId::Collector)] = -1; _leftmosts_empty[int(ShenandoahFreeSetPartitionId::Collector)] = _max; _rightmosts_empty[int(ShenandoahFreeSetPartitionId::Collector)] = -1; - _region_counts[int(ShenandoahFreeSetPartitionId::Collector)] = 0; - _used[int(ShenandoahFreeSetPartitionId::Collector)] = 
0;
-  _capacity[int(ShenandoahFreeSetPartitionId::Collector)] = 0;
-  _available[int(ShenandoahFreeSetPartitionId::Collector)] = 0;
+  Atomic::store(_region_counts + int(ShenandoahFreeSetPartitionId::Collector), size_t(0));
+  Atomic::store(_used + int(ShenandoahFreeSetPartitionId::Collector), size_t(0));
+  Atomic::store(_capacity + int(ShenandoahFreeSetPartitionId::Collector), size_t(0));
 }
 
 void ShenandoahRegionPartitions::establish_old_collector_intervals(idx_t old_collector_leftmost, idx_t old_collector_rightmost,
@@ -272,22 +269,14 @@ void ShenandoahRegionPartitions::establish_old_collector_intervals(idx_t old_col
   _leftmosts_empty[int(ShenandoahFreeSetPartitionId::OldCollector)] = old_collector_leftmost_empty;
   _rightmosts_empty[int(ShenandoahFreeSetPartitionId::OldCollector)] = old_collector_rightmost_empty;
 
-  _region_counts[int(ShenandoahFreeSetPartitionId::OldCollector)] = old_collector_region_count;
-  _used[int(ShenandoahFreeSetPartitionId::OldCollector)] = old_collector_used;
-  _capacity[int(ShenandoahFreeSetPartitionId::OldCollector)] = old_collector_region_count * _region_size_bytes;
-  _available[int(ShenandoahFreeSetPartitionId::OldCollector)] =
-    _capacity[int(ShenandoahFreeSetPartitionId::OldCollector)] - _used[int(ShenandoahFreeSetPartitionId::OldCollector)];
+  Atomic::store(_region_counts + int(ShenandoahFreeSetPartitionId::OldCollector), old_collector_region_count);
+  Atomic::store(_used + int(ShenandoahFreeSetPartitionId::OldCollector), old_collector_used);
+  Atomic::store(_capacity + int(ShenandoahFreeSetPartitionId::OldCollector), old_collector_region_count * _region_size_bytes);
 }
 
 void ShenandoahRegionPartitions::increase_used(ShenandoahFreeSetPartitionId which_partition, size_t bytes) {
-  shenandoah_assert_heaplocked();
   assert (which_partition < NumPartitions, "Partition must be valid");
-
-  _used[int(which_partition)] += bytes;
-  _available[int(which_partition)] -= bytes;
-  assert (_used[int(which_partition)] <= _capacity[int(which_partition)],
-          "Must not use (%zu) more than capacity (%zu) after increase by %zu",
-          _used[int(which_partition)], _capacity[int(which_partition)], bytes);
+  Atomic::add(_used + int(which_partition), bytes);
 }
 
 inline void ShenandoahRegionPartitions::shrink_interval_if_range_modifies_either_boundary(
@@ -389,7 +378,6 @@ void ShenandoahRegionPartitions::make_free(idx_t idx, ShenandoahFreeSetPartition
   _membership[int(which_partition)].set_bit(idx);
   _capacity[int(which_partition)] += _region_size_bytes;
   _used[int(which_partition)] += _region_size_bytes - available;
-  _available[int(which_partition)] += available;
   expand_interval_if_boundary_modified(which_partition, idx, available);
   _region_counts[int(which_partition)]++;
 }
@@ -448,17 +436,19 @@ void ShenandoahRegionPartitions::move_from_partition_to_partition(idx_t idx, She
          "Orig partition used: %zu must exceed moved used: %zu within region %zd",
          _used[int(orig_partition)], used, idx);
 
+  if (orig_partition == ShenandoahFreeSetPartitionId::Mutator && r->reserved_for_direct_allocation()) {
+    ShenandoahHeap::heap()->free_set()->release_directly_allocatable_region(r);
+  }
+
   _membership[int(orig_partition)].clear_bit(idx);
   _membership[int(new_partition)].set_bit(idx);
   _capacity[int(orig_partition)] -= _region_size_bytes;
   _used[int(orig_partition)] -= used;
-  _available[int(orig_partition)] -= available;
   shrink_interval_if_boundary_modified(orig_partition, idx);
 
   _capacity[int(new_partition)] += _region_size_bytes;;
   _used[int(new_partition)] += used;
-  _available[int(new_partition)] += available;
   expand_interval_if_boundary_modified(new_partition, idx, available);
 
   _region_counts[int(orig_partition)]--;
@@ -601,6 +591,7 @@ void ShenandoahRegionPartitions::assert_bounds() {
   idx_t rightmosts[UIntNumPartitions];
   idx_t empty_leftmosts[UIntNumPartitions];
   idx_t empty_rightmosts[UIntNumPartitions];
+  ShenandoahHeap* heap = ShenandoahHeap::heap();
 
   for (uint i = 0; i < UIntNumPartitions; i++) {
     leftmosts[i] = _max;
@@ -621,18 +612,31 @@
       {
         size_t capacity = _free_set->alloc_capacity(i);
         bool is_empty = (capacity == _region_size_bytes);
-        assert(capacity > 0, "free regions must have allocation capacity");
+        // TODO: remove this assert; it cannot hold once mutators are allowed to allocate without the heap lock.
+        //assert(capacity > 0, "free regions must have allocation capacity");
         if (i < leftmosts[int(partition)]) {
           leftmosts[int(partition)] = i;
         }
         if (is_empty && (i < empty_leftmosts[int(partition)])) {
-          empty_leftmosts[int(partition)] = i;
+          if (partition == ShenandoahFreeSetPartitionId::Mutator) {
+            if (!heap->get_region(i)->reserved_for_direct_allocation()) {
+              empty_leftmosts[int(partition)] = i;
+            }
+          } else {
+            empty_leftmosts[int(partition)] = i;
+          }
         }
         if (i > rightmosts[int(partition)]) {
           rightmosts[int(partition)] = i;
         }
         if (is_empty && (i > empty_rightmosts[int(partition)])) {
-          empty_rightmosts[int(partition)] = i;
+          if (partition == ShenandoahFreeSetPartitionId::Mutator) {
+            if (!heap->get_region(i)->reserved_for_direct_allocation()) {
+              empty_rightmosts[int(partition)] = i;
+            }
+          } else {
+            empty_rightmosts[int(partition)] = i;
+          }
         }
         break;
       }
@@ -745,12 +749,55 @@
 }
 #endif
 
+PaddedEnd<ShenandoahDirectlyAllocatableRegionAffinity::Affinity>* ShenandoahDirectlyAllocatableRegionAffinity::_affinity = nullptr;
+THREAD_LOCAL Thread* ShenandoahDirectlyAllocatableRegionAffinity::_self = DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_SELF;
+THREAD_LOCAL uint ShenandoahDirectlyAllocatableRegionAffinity::_index = 0;
+
+uint ShenandoahDirectlyAllocatableRegionAffinity::index_slow() {
+  // Set current thread
+  if (_self == DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_SELF) {
+    _self = Thread::current();
+  }
+
+  // Create a new random index where the thread will start allocation
+  _index = static_cast<uint>(os::random()) % ShenandoahDirectlyAllocatableRegionCount;
+
+  // Update affinity table
+  _affinity[_index]._thread = _self;
+
+  return _index;
+}
+
+void ShenandoahDirectlyAllocatableRegionAffinity::initialize() {
+  assert(_affinity == nullptr, "Already initialized");
+  _affinity = PaddedArray<Affinity, mtGC>::create_unfreeable(ShenandoahDirectlyAllocatableRegionCount);
+  for (uint32_t i = 0; i < ShenandoahDirectlyAllocatableRegionCount; i++) {
+    _affinity[i]._thread = DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_AFFINITY;
+  }
+}
+
+uint ShenandoahDirectlyAllocatableRegionAffinity::index() {
+  assert(_affinity != nullptr, "Not initialized");
+  // Fast path
+  if (_affinity[_index]._thread == _self) {
+    return _index;
+  }
+
+  // Slow path
+  return index_slow();
+}
+
 ShenandoahFreeSet::ShenandoahFreeSet(ShenandoahHeap* heap, size_t max_regions) :
   _heap(heap),
   _partitions(max_regions, this),
   _alloc_bias_weight(0)
 {
   clear_internal();
+  _directly_allocatable_regions = PaddedArray<ShenandoahHeapRegionAddress, mtGC>::create_unfreeable(ShenandoahDirectlyAllocatableRegionCount);
+  for (uint i = 0; i < ShenandoahDirectlyAllocatableRegionCount; i++) {
+    _directly_allocatable_regions[i].address = nullptr;
+  }
+  ShenandoahDirectlyAllocatableRegionAffinity::initialize();
 }
 
 void ShenandoahFreeSet::add_promoted_in_place_region_to_old_collector(ShenandoahHeapRegion*
region) { @@ -784,7 +831,7 @@ template HeapWord* ShenandoahFreeSet::allocate_with_affiliation(Iter& iterator, ShenandoahAffiliation affiliation, ShenandoahAllocRequest& req, bool& in_new_region) { for (idx_t idx = iterator.current(); iterator.has_next(); idx = iterator.next()) { ShenandoahHeapRegion* r = _heap->get_region(idx); - if (r->affiliation() == affiliation) { + if (r->affiliation() == affiliation && !r->reserved_for_direct_allocation()) { HeapWord* result = try_allocate_in(r, req, in_new_region); if (result != nullptr) { return result; @@ -828,7 +875,7 @@ HeapWord* ShenandoahFreeSet::allocate_single(ShenandoahAllocRequest& req, bool& } HeapWord* ShenandoahFreeSet::allocate_for_mutator(ShenandoahAllocRequest &req, bool &in_new_region) { - update_allocation_bias(); + //update_allocation_bias(); if (_partitions.is_empty(ShenandoahFreeSetPartitionId::Mutator)) { // There is no recovery. Mutator does not touch collector view at all. @@ -880,7 +927,7 @@ HeapWord* ShenandoahFreeSet::allocate_from_regions(Iter& iterator, ShenandoahAll for (idx_t idx = iterator.current(); iterator.has_next(); idx = iterator.next()) { ShenandoahHeapRegion* r = _heap->get_region(idx); size_t min_size = (req.type() == ShenandoahAllocRequest::_alloc_tlab) ? req.min_size() : req.size(); - if (alloc_capacity(r) >= min_size) { + if (!r->reserved_for_direct_allocation() && alloc_capacity(r) >= min_size) { HeapWord* result = try_allocate_in(r, req, in_new_region); if (result != nullptr) { return result; @@ -948,7 +995,7 @@ HeapWord* ShenandoahFreeSet::try_allocate_from_mutator(ShenandoahAllocRequest& r ShenandoahRightLeftIterator iterator(&_partitions, ShenandoahFreeSetPartitionId::Mutator, true); for (idx_t idx = iterator.current(); iterator.has_next(); idx = iterator.next()) { ShenandoahHeapRegion* r = _heap->get_region(idx); - if (can_allocate_from(r)) { + if (can_allocate_from(r) && !r->reserved_for_direct_allocation()) { if (req.is_old()) { if (!flip_to_old_gc(r)) { continue; @@ -1202,7 +1249,8 @@ HeapWord* ShenandoahFreeSet::allocate_contiguous(ShenandoahAllocRequest& req) { // We've confirmed num contiguous regions belonging to Mutator partition, so no need to confirm membership. // If region is not completely free, the current [beg; end] is useless, and we may fast-forward. If we can extend // the existing range, we can exploit that certain regions are already known to be in the Mutator free set. - while (!can_allocate_from(_heap->get_region(end))) { + ShenandoahHeapRegion* region = _heap->get_region(end); + while (!can_allocate_from(region) || region->reserved_for_direct_allocation()) { // region[end] is not empty, so we restart our search after region[end] idx_t slide_delta = end + 1 - beg; if (beg + slide_delta > last_possible_start) { @@ -1225,6 +1273,7 @@ HeapWord* ShenandoahFreeSet::allocate_contiguous(ShenandoahAllocRequest& req) { return nullptr; } end = beg; + region = _heap->get_region(end); } if ((end - beg + 1) == num) { @@ -1979,12 +2028,12 @@ void ShenandoahFreeSet::log_status() { } size_t max_humongous = max_contig * ShenandoahHeapRegion::region_size_bytes(); - size_t free = capacity() - used(); // Since certain regions that belonged to the Mutator free partition at the time of most recent rebuild may have been // retired, the sum of used and capacities within regions that are still in the Mutator free partition may not match // my internally tracked values of used() and free(). 
-  assert(free == total_free, "Free memory should match");
+  // TODO: remove this assert; the values can no longer be expected to match, since mutators may allocate in a region without acquiring the heap lock.
+  //assert(free == total_free, "Free memory should match");
   ls.print("Free: %zu%s, Max: %zu%s regular, %zu%s humongous, ",
            byte_size_in_proper_unit(total_free), proper_unit_for_byte_size(total_free),
            byte_size_in_proper_unit(max), proper_unit_for_byte_size(max),
@@ -2080,6 +2129,274 @@ HeapWord* ShenandoahFreeSet::allocate(ShenandoahAllocRequest& req, bool& in_new_
   }
 }
 
+HeapWord* ShenandoahFreeSet::allocate_humongous(ShenandoahAllocRequest& req) {
+  assert(ShenandoahHeapRegion::requires_humongous(req.size()), "Must be humongous alloc");
+  ShenandoahHeapLocker locker(_heap->lock(), req.is_mutator_alloc());
+  return allocate_contiguous(req);
+}
+
+void ShenandoahFreeSet::release_all_directly_allocatable_regions() {
+  for (uint i = 0; i < ShenandoahDirectlyAllocatableRegionCount; i++) {
+    ShenandoahHeapRegion* volatile* address = &_directly_allocatable_regions[i].address;
+    ShenandoahHeapRegion* r = Atomic::load_acquire(address);
+    if (r != nullptr) {
+      assert(r->reserved_for_direct_allocation(), "Must be");
+      Atomic::release_store_fence(address, static_cast<ShenandoahHeapRegion*>(nullptr));
+      r->release_from_direct_allocation();
+    }
+  }
+}
+
+template <bool IS_TLAB>
+HeapWord* ShenandoahFreeSet::par_allocate_single_for_mutator(ShenandoahAllocRequest &req, bool &in_new_region) {
+  shenandoah_assert_not_heaplocked();
+  assert(req.is_mutator_alloc(), "Must be mutator allocation");
+  assert(req.is_young(), "Mutator allocations always come from young generation.");
+  assert(!ShenandoahHeapRegion::requires_humongous(req.size()), "Must not");
+  assert(req.type() == ShenandoahAllocRequest::_alloc_tlab || req.type() == ShenandoahAllocRequest::_alloc_shared, "Must be");
+
+  const uint start_idx = ShenandoahDirectlyAllocatableRegionAffinity::index();
+  for (;;) {
+    constexpr uint max_probes = 3;
+    uint idx = start_idx;
+    ShenandoahHeapRegion* retirable_regions[max_probes];
+    ShenandoahHeapRegion* volatile * retirable_shared_regions_addresses[max_probes];
+    HeapWord* obj = nullptr;
+    uint count = 0u;
+    for (uint i = 0u; i < max_probes; i++) {
+      ShenandoahHeapRegion* volatile * shared_region_address = &_directly_allocatable_regions[idx].address;
+      ShenandoahHeapRegion* r = Atomic::load_acquire(shared_region_address);
+      if (r != nullptr && r->reserved_for_direct_allocation()) {
+        obj = par_allocate_in_for_mutator<IS_TLAB>(r, req, in_new_region);
+        if (obj != nullptr) {
+          return obj;
+        }
+      }
+
+      if (r == nullptr || r->free() < PLAB::min_size()) {
+        // Region is ready to retire
+        retirable_regions[count] = r;
+        retirable_shared_regions_addresses[count] = shared_region_address;
+        count++;
+      }
+      idx = (idx + 1) % ShenandoahDirectlyAllocatableRegionCount;
+    }
+    // Allocation failed in all of the probed directly allocatable regions, and none of them is ready to be
+    // retired and replaced, so fall back to allocating from other regions while holding the heap lock.
+    if (count == 0u) {
+      ShenandoahHeapLocker locker(ShenandoahHeap::heap()->lock(), true);
+      return allocate_for_mutator(req, in_new_region);
+    }
+    // At least one of the probed directly allocatable regions is ready to be retired and replaced, so
+    // grab the heap lock and try to retire all of the ready-to-retire shared regions.
+    if (!try_allocate_directly_allocatable_regions(retirable_shared_regions_addresses, retirable_regions, count, req, obj, in_new_region)) {
+      if (obj == nullptr) {
+        // Only the probed shared regions have been tried so far; try to steal from the other shared regions before reporting OOM.
+        do {
+          ShenandoahHeapRegion* r = Atomic::load_acquire(&_directly_allocatable_regions[idx].address);
+          if (r != nullptr && r->reserved_for_direct_allocation()) {
+            obj = par_allocate_in_for_mutator<IS_TLAB>(r, req, in_new_region);
+            if (obj != nullptr) break;
+          }
+          idx = (idx + 1) % ShenandoahDirectlyAllocatableRegionCount;
+        } while (idx != start_idx);
+        return obj;
+      }
+    }
+    // Regardless of the outcome of installing new directly allocatable regions, obj may have been allocated along the way.
+    if (obj != nullptr) {
+      _partitions.increase_used(ShenandoahFreeSetPartitionId::Mutator, req.actual_size() * HeapWordSize);
+      return obj;
+    }
+  }
+}
+
+// Explicit specializations
+template HeapWord* ShenandoahFreeSet::par_allocate_single_for_mutator<true>(ShenandoahAllocRequest &req, bool &in_new_region);
+template HeapWord* ShenandoahFreeSet::par_allocate_single_for_mutator<false>(ShenandoahAllocRequest &req, bool &in_new_region);
+
+template <bool IS_TLAB>
+HeapWord* ShenandoahFreeSet::par_allocate_in_for_mutator(ShenandoahHeapRegion* region, ShenandoahAllocRequest &req, bool &in_new_region) {
+  HeapWord* obj = nullptr;
+  size_t actual_size = req.size();
+  if (IS_TLAB) {
+    obj = region->allocate_lab_atomic(req, actual_size);
+  } else {
+    obj = region->allocate_atomic(actual_size, req);
+  }
+  if (obj != nullptr) {
+    assert(actual_size > 0, "Must be");
+    req.set_actual_size(actual_size);
+    if (obj == region->bottom()) {
+      // Set to true if it is the first object/tlab allocated in the region.
+      in_new_region = true;
+    }
+    _partitions.increase_used(ShenandoahFreeSetPartitionId::Mutator, req.actual_size() * HeapWordSize);
+  }
+  return obj;
+}
+
+class DirectlyAllocatableRegionAllocationClosure : public ShenandoahHeapRegionBreakableIterClosure {
+public:
+  ShenandoahHeapRegion* volatile ** _shared_region_addresses;
+  const uint _shared_region_address_count;
+  uint _current_index = 0u;
+  const uint _request_count;
+  uint _fulfilled_count = 0u;
+  ShenandoahAllocRequest &_req;
+  HeapWord* &_obj;
+  bool &_in_new_region;
+  const size_t _min_req_byte_size;
+
+  DirectlyAllocatableRegionAllocationClosure(
+    ShenandoahHeapRegion* volatile * shared_region_addresses[], const uint shared_region_address_count, const uint request_count,
+    ShenandoahAllocRequest &req, HeapWord* &obj, bool &in_new_region)
+  : _shared_region_addresses(shared_region_addresses), _shared_region_address_count(shared_region_address_count), _request_count(request_count),
+    _req(req), _obj(obj), _in_new_region(in_new_region),
+    _min_req_byte_size((req.type() == ShenandoahAllocRequest::_alloc_tlab ?
req.min_size() : req.size()) * HeapWordSize) { + skip_invalid_address(); + } + + void skip_invalid_address() { + while (_current_index < _shared_region_address_count && _shared_region_addresses[_current_index] == nullptr) { + _current_index++; + } + } + + bool heap_region_do(ShenandoahHeapRegion *r) override { + if (r->reserved_for_direct_allocation()) return false; + if (r->is_empty()) { + if (ShenandoahHeap::heap()->is_concurrent_weak_root_in_progress() && r->is_trash()) { + return false; + } + r->try_recycle_under_lock(); + + r->reserve_for_direct_allocation(); + r->set_affiliation(YOUNG_GENERATION); + r->make_regular_allocation(YOUNG_GENERATION); + ShenandoahHeap::heap()->generation_for(r->affiliation())->increment_affiliated_region_count(); + if (_obj == nullptr) { + size_t actual_size = _req.size(); + _obj = _req.is_lab_alloc() ? r ->allocate_lab(_req, actual_size) : r->allocate(actual_size, _req); + _req.set_actual_size(actual_size); + _in_new_region = true; + } + OrderAccess::fence(); + Atomic::store(_shared_region_addresses[_current_index++], r); + skip_invalid_address(); + _fulfilled_count++; + } else if (r->affiliation() == YOUNG_GENERATION && r->is_regular() && + r->get_top_before_promote() != nullptr && r->free() >= _min_req_byte_size) { + if (_obj == nullptr) { + size_t actual_size = _req.size(); + _obj = _req.is_lab_alloc() ? r ->allocate_lab(_req, actual_size) : r->allocate(actual_size, _req); + _req.set_actual_size(actual_size); + _in_new_region = false; + } else { + r->reserve_for_direct_allocation(); + Atomic::store(_shared_region_addresses[_current_index++], r); + skip_invalid_address(); + _fulfilled_count++; + } + } + return _fulfilled_count == _request_count || _current_index == _shared_region_address_count; + } +}; + +bool ShenandoahFreeSet::try_allocate_directly_allocatable_regions(ShenandoahHeapRegion* volatile * shared_region_address[], + ShenandoahHeapRegion* original_shared_regions[], + const uint region_count, + ShenandoahAllocRequest &req, + HeapWord* &obj, + bool &in_new_region) { + assert(Thread::current()->is_Java_thread(), "Must be mutator"); + assert(region_count > 0u && region_count <= ShenandoahDirectlyAllocatableRegionCount, "Must be"); + shenandoah_assert_not_heaplocked(); + + ShenandoahHeapLocker locker(ShenandoahHeap::heap()->lock(), true); + uint request_count = 0u; + uint fulfilled_by_others = 0u; + for (uint i = 0u; i < region_count; i++) { + ShenandoahHeapRegion* r = Atomic::load_acquire(shared_region_address[i]); + if (r != original_shared_regions[i]) { + fulfilled_by_others++; + shared_region_address[i] = nullptr; + original_shared_regions[i] = nullptr; + } else { + request_count++; + if (r != nullptr) { + if (r->free() < PLAB::min_size()) { + Atomic::release_store_fence(shared_region_address[i], static_cast(nullptr)); + // TODO confirm when&why the region is moved out of Mutator partition? + if (_partitions.in_free_set(ShenandoahFreeSetPartitionId::Mutator, r->index())) { + _partitions.retire_from_partition(ShenandoahFreeSetPartitionId::Mutator, r->index(), r->used()); + } + r->release_from_direct_allocation(); + } else { + // Although r is same as original one when tried CAS allocation, but it has more free space. 
+ fulfilled_by_others++; + shared_region_address[i] = nullptr; + original_shared_regions[i] = nullptr; + request_count--; + } + } + } + } + + DirectlyAllocatableRegionAllocationClosure cl(shared_region_address, region_count, request_count, req, obj, in_new_region); + if (request_count > 0u) { + iterate_regions_for_alloc(&cl, true); + } + return cl._fulfilled_count > 0u || fulfilled_by_others > 0u; +} + +void ShenandoahFreeSet::release_directly_allocatable_region(ShenandoahHeapRegion* region) { + shenandoah_assert_heaplocked(); + for (uint i = 0u; i < ShenandoahDirectlyAllocatableRegionCount; i++) { + if (_directly_allocatable_regions[i].address == region) { + Atomic::release_store(&_directly_allocatable_regions[i].address, static_cast(nullptr)); + break; + } + } + OrderAccess::fence(); + region->release_from_direct_allocation(); +} + +template +uint ShenandoahFreeSet::iterate_regions_for_alloc(ShenandoahHeapRegionBreakableIterClosure* cl, bool use_empty) { + assert((IS_MUTATOR && !IS_OLD) || !IS_MUTATOR, "Sanity check"); + ShenandoahFreeSetPartitionId partition = IS_MUTATOR ? ShenandoahFreeSetPartitionId::Mutator : + (IS_OLD ? ShenandoahFreeSetPartitionId::OldCollector : ShenandoahFreeSetPartitionId::Mutator); + if (_partitions.is_empty(partition)) { + return 0u; + } + /* + if (IS_MUTATOR) { + update_allocation_bias(); + } + */ + if (_partitions.alloc_from_left_bias(partition)) { + ShenandoahLeftRightIterator iterator(&_partitions, partition, use_empty); + return iterate_regions_for_alloc(iterator, cl); + } else { + ShenandoahRightLeftIterator iterator(&_partitions, partition, use_empty); + return iterate_regions_for_alloc(iterator, cl); + } +} + +template +uint ShenandoahFreeSet::iterate_regions_for_alloc(Iter& iterator, ShenandoahHeapRegionBreakableIterClosure* cl) { + uint regions_iterated = 0u; + for (idx_t idx = iterator.current(); iterator.has_next(); idx = iterator.next()) { + regions_iterated++; + ShenandoahHeapRegion* r = _heap->get_region(idx); + if (cl->heap_region_do(r)) { + break; + } + } + return regions_iterated; +} + void ShenandoahFreeSet::print_on(outputStream* out) const { out->print_cr("Mutator Free Set: %zu", _partitions.count(ShenandoahFreeSetPartitionId::Mutator)); ShenandoahLeftRightIterator mutator(const_cast(&_partitions), ShenandoahFreeSetPartitionId::Mutator); diff --git a/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.hpp b/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.hpp index 55f23480618b9..94af6241b4190 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.hpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahFreeSet.hpp @@ -29,6 +29,7 @@ #include "gc/shenandoah/shenandoahHeap.hpp" #include "gc/shenandoah/shenandoahHeapRegionSet.hpp" #include "gc/shenandoah/shenandoahSimpleBitMap.hpp" +#include "memory/padded.inline.hpp" // Each ShenandoahHeapRegion is associated with a ShenandoahFreeSetPartitionId. enum class ShenandoahFreeSetPartitionId : uint8_t { @@ -78,10 +79,9 @@ class ShenandoahRegionPartitions { // are denoted in bytes. Note that some regions that had been assigned to a particular partition at rebuild time // may have been retired following the rebuild. The tallies for these regions are still reflected in _capacity[p] // and _used[p], even though the region may have been removed from the free set. 
- size_t _capacity[UIntNumPartitions]; - size_t _used[UIntNumPartitions]; - size_t _available[UIntNumPartitions]; - size_t _region_counts[UIntNumPartitions]; + size_t volatile _capacity[UIntNumPartitions]; + size_t volatile _used[UIntNumPartitions]; + size_t volatile _region_counts[UIntNumPartitions]; // For each partition p, _left_to_right_bias is true iff allocations are normally made from lower indexed regions // before higher indexed regions. @@ -213,56 +213,40 @@ class ShenandoahRegionPartitions { inline size_t capacity_of(ShenandoahFreeSetPartitionId which_partition) const { assert (which_partition < NumPartitions, "selected free set must be valid"); - return _capacity[int(which_partition)]; + return Atomic::load(_capacity + int(which_partition)); } inline size_t used_by(ShenandoahFreeSetPartitionId which_partition) const { assert (which_partition < NumPartitions, "selected free set must be valid"); - return _used[int(which_partition)]; + return Atomic::load(_used + int(which_partition)); } inline size_t available_in(ShenandoahFreeSetPartitionId which_partition) const { assert (which_partition < NumPartitions, "selected free set must be valid"); - shenandoah_assert_heaplocked(); - assert(_available[int(which_partition)] == _capacity[int(which_partition)] - _used[int(which_partition)], - "Expect available (%zu) equals capacity (%zu) - used (%zu) for partition %s", - _available[int(which_partition)], _capacity[int(which_partition)], _used[int(which_partition)], - partition_membership_name(ssize_t(which_partition))); - return _available[int(which_partition)]; + return capacity_of(which_partition) - used_by(which_partition); } // Return available_in assuming caller does not hold the heap lock. In production builds, available is // returned without acquiring the lock. In debug builds, the global heap lock is acquired in order to // enforce a consistency assert. 
inline size_t available_in_not_locked(ShenandoahFreeSetPartitionId which_partition) const { - assert (which_partition < NumPartitions, "selected free set must be valid"); - shenandoah_assert_not_heaplocked(); -#ifdef ASSERT - ShenandoahHeapLocker locker(ShenandoahHeap::heap()->lock()); - assert((_available[int(which_partition)] == FreeSetUnderConstruction) || - (_available[int(which_partition)] == _capacity[int(which_partition)] - _used[int(which_partition)]), - "Expect available (%zu) equals capacity (%zu) - used (%zu) for partition %s", - _available[int(which_partition)], _capacity[int(which_partition)], _used[int(which_partition)], - partition_membership_name(ssize_t(which_partition))); -#endif - return _available[int(which_partition)]; + return available_in(which_partition); } inline void set_capacity_of(ShenandoahFreeSetPartitionId which_partition, size_t value) { - shenandoah_assert_heaplocked(); assert (which_partition < NumPartitions, "selected free set must be valid"); - _capacity[int(which_partition)] = value; - _available[int(which_partition)] = value - _used[int(which_partition)]; + Atomic::store(_capacity + int(which_partition), value); } inline void set_used_by(ShenandoahFreeSetPartitionId which_partition, size_t value) { - shenandoah_assert_heaplocked(); assert (which_partition < NumPartitions, "selected free set must be valid"); - _used[int(which_partition)] = value; - _available[int(which_partition)] = _capacity[int(which_partition)] - value; + Atomic::store(_used + int(which_partition), value); } - inline size_t count(ShenandoahFreeSetPartitionId which_partition) const { return _region_counts[int(which_partition)]; } + inline size_t count(ShenandoahFreeSetPartitionId which_partition) const { + assert (which_partition < NumPartitions, "selected free set must be valid"); + return Atomic::load(_region_counts + int(which_partition)); + } // Assure leftmost, rightmost, leftmost_empty, and rightmost_empty bounds are valid for all free sets. // Valid bounds honor all of the following (where max is the number of heap regions): @@ -287,6 +271,28 @@ class ShenandoahRegionPartitions { void assert_bounds() NOT_DEBUG_RETURN; }; +#define DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_AFFINITY ((Thread*)-1) +#define DIRECTLY_ALLOCATABLE_REGION_UNKNOWN_SELF ((Thread*)-2) +// When mutator threads allocate from directly allocatable regions, ideally the allocation should be evenly +// distributed to all the directly allocatable regions, random is the best portable option for this, but with random +// distribution it may worsen memory locality, e.g. two consecutive allocation from same thread are randomly +// distributed to different allocatable regions. ShenandoahDirectlyAllocatableRegionAffinity solves/mitigates +// the memory locality issue. +// The idea and code is borrowed from ZGC's CPU affinity, but with random number instead of CPU id. +class ShenandoahDirectlyAllocatableRegionAffinity : public AllStatic { + struct Affinity { + Thread* _thread; + }; + + static PaddedEnd* _affinity; + static THREAD_LOCAL Thread* _self; + static THREAD_LOCAL uint _index; + static uint index_slow(); +public: + static void initialize(); + static uint index(); +}; + // Publicly, ShenandoahFreeSet represents memory that is available to mutator threads. The public capacity(), used(), // and available() methods represent this public notion of memory that is under control of the mutator. Separately, // ShenandoahFreeSet also represents memory available to garbage collection activities for compaction purposes. 
@@ -313,8 +319,12 @@ class ShenandoahRegionPartitions { class ShenandoahFreeSet : public CHeapObj { private: + struct ShenandoahHeapRegionAddress { + ShenandoahHeapRegion* volatile address; + }; ShenandoahHeap* const _heap; ShenandoahRegionPartitions _partitions; + PaddedEnd* _directly_allocatable_regions; HeapWord* allocate_aligned_plab(size_t size, ShenandoahAllocRequest& req, ShenandoahHeapRegion* r); @@ -410,6 +420,21 @@ class ShenandoahFreeSet : public CHeapObj { // log status, assuming lock has already been acquired by the caller. void log_status(); + template + HeapWord* par_allocate_in_for_mutator(ShenandoahHeapRegion* region, ShenandoahAllocRequest &req, bool &in_new_region); + + bool try_allocate_directly_allocatable_regions(ShenandoahHeapRegion* volatile * shared_region_address[], + ShenandoahHeapRegion* original_shared_regions[], + uint region_count, + ShenandoahAllocRequest &req, + HeapWord* &obj, + bool &in_new_region); + template + uint iterate_regions_for_alloc(ShenandoahHeapRegionBreakableIterClosure* cl, bool use_empty); + + template + uint iterate_regions_for_alloc(Iter& iterator, ShenandoahHeapRegionBreakableIterClosure* cl); + public: static const size_t FreeSetUnderConstruction = ShenandoahRegionPartitions::FreeSetUnderConstruction; @@ -484,6 +509,14 @@ class ShenandoahFreeSet : public CHeapObj { HeapWord* allocate(ShenandoahAllocRequest& req, bool& in_new_region); + HeapWord* allocate_humongous(ShenandoahAllocRequest &req); + + void release_all_directly_allocatable_regions(); + + void release_directly_allocatable_region(ShenandoahHeapRegion *region); + + template + HeapWord* par_allocate_single_for_mutator(ShenandoahAllocRequest &req, bool &in_new_region); /* * Internal fragmentation metric: describes how fragmented the heap regions are. * diff --git a/src/hotspot/share/gc/shenandoah/shenandoahFullGC.cpp b/src/hotspot/share/gc/shenandoah/shenandoahFullGC.cpp index 27ff45e67de19..8d5eaac4c3d66 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahFullGC.cpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahFullGC.cpp @@ -219,6 +219,8 @@ void ShenandoahFullGC::do_it(GCCause::Cause gc_cause) { heap->tlabs_retire(ResizeTLAB); } + heap->free_set()->release_all_directly_allocatable_regions(); + OrderAccess::fence(); phase1_mark_heap(); diff --git a/src/hotspot/share/gc/shenandoah/shenandoahGeneration.cpp b/src/hotspot/share/gc/shenandoah/shenandoahGeneration.cpp index 9a511de939ccb..01de1cfb3fea8 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahGeneration.cpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahGeneration.cpp @@ -565,7 +565,7 @@ size_t ShenandoahGeneration::select_aged_regions(size_t old_available) { // old generation. HeapWord* tams = ctx->top_at_mark_start(r); HeapWord* original_top = r->top(); - if (!heap->is_concurrent_old_mark_in_progress() && tams == original_top) { + if (!heap->is_concurrent_old_mark_in_progress() && tams == original_top && !r->reserved_for_direct_allocation()) { // No allocations from this region have been made during concurrent mark. It meets all the criteria // for in-place-promotion. 
Though we only need the value of top when we fill the end of the region,
       // we use this field to indicate that this region should be promoted in place during the evacuation
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
index 50881a5077833..0fd230153d70b 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
@@ -976,7 +976,7 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
   }
 
   if (!ShenandoahAllocFailureALot || !should_inject_alloc_failure()) {
-    result = allocate_memory_under_lock(req, in_new_region);
+    result = allocate_memory_for_mutator(req, in_new_region);
   }
 
   // Check that gc overhead is not exceeded.
@@ -1008,7 +1008,7 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
     const size_t original_count = shenandoah_policy()->full_gc_count();
     while (result == nullptr && should_retry_allocation(original_count)) {
       control_thread()->handle_alloc_failure(req, true);
-      result = allocate_memory_under_lock(req, in_new_region);
+      result = allocate_memory_for_mutator(req, in_new_region);
     }
     if (result != nullptr) {
       // If our allocation request has been satisfied after it initially failed, we count this as good gc progress
@@ -1062,6 +1062,22 @@ HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
   return result;
 }
 
+HeapWord* ShenandoahHeap::allocate_memory_for_mutator(ShenandoahAllocRequest& req, bool& in_new_region) {
+  assert(req.is_mutator_alloc(), "Sanity");
+  assert(!req.is_old(), "Sanity");
+  shenandoah_assert_not_heaplocked();
+  ShenandoahFreeSet* free_set = ShenandoahHeap::free_set();
+  if (ShenandoahHeapRegion::requires_humongous(req.size())) {
+    in_new_region = true;
+    return free_set->allocate_humongous(req);
+  }
+  if (req.is_lab_alloc()) {
+    return free_set->par_allocate_single_for_mutator<true>(req, in_new_region);
+  } else {
+    return free_set->par_allocate_single_for_mutator<false>(req, in_new_region);
+  }
+}
+
 inline bool ShenandoahHeap::should_retry_allocation(size_t original_full_gc_count) const {
   return shenandoah_policy()->full_gc_count() == original_full_gc_count
       && !shenandoah_policy()->is_at_shutdown();
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp b/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
index 4124bf8be7f5a..d50c76cab12e5 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
@@ -118,6 +118,12 @@ class ShenandoahHeapRegionClosure : public StackObj {
   virtual bool is_thread_safe() { return false; }
 };
 
+class ShenandoahHeapRegionBreakableIterClosure : public StackObj {
+public:
+  // Return true to break the iteration loop.
+ virtual bool heap_region_do(ShenandoahHeapRegion* r) { return false; }; +}; + typedef ShenandoahLock ShenandoahHeapLock; typedef ShenandoahLocker ShenandoahHeapLocker; typedef Stack ShenandoahScanObjectStack; @@ -691,6 +697,7 @@ class ShenandoahHeap : public CollectedHeap { private: HeapWord* allocate_memory_under_lock(ShenandoahAllocRequest& request, bool& in_new_region); + HeapWord* allocate_memory_for_mutator(ShenandoahAllocRequest& request, bool& in_new_region); HeapWord* allocate_from_gclab_slow(Thread* thread, size_t size); HeapWord* allocate_new_gclab(size_t min_size, size_t word_size, size_t* actual_size); diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp index 05eb0c299a5ea..6ebfbb6a3d60f 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp @@ -89,6 +89,7 @@ ShenandoahHeapRegion::ShenandoahHeapRegion(HeapWord* start, size_t index, bool c SpaceMangler::mangle_region(MemRegion(_bottom, _end)); } _recycling.unset(); + _direct_alloc_reserved.unset(); } void ShenandoahHeapRegion::report_illegal_transition(const char *method) { @@ -370,25 +371,25 @@ void ShenandoahHeapRegion::make_committed_bypass() { } void ShenandoahHeapRegion::reset_alloc_metadata() { - _tlab_allocs = 0; - _gclab_allocs = 0; - _plab_allocs = 0; + Atomic::store(&_tlab_allocs, size_t(0)); + Atomic::store(&_gclab_allocs, size_t(0)); + Atomic::store(&_plab_allocs, size_t(0)); } size_t ShenandoahHeapRegion::get_shared_allocs() const { - return used() - (_tlab_allocs + _gclab_allocs + _plab_allocs) * HeapWordSize; + return used() - (Atomic::load(&_tlab_allocs) + Atomic::load(&_gclab_allocs) + Atomic::load(&_plab_allocs)) * HeapWordSize; } size_t ShenandoahHeapRegion::get_tlab_allocs() const { - return _tlab_allocs * HeapWordSize; + return Atomic::load(&_tlab_allocs) * HeapWordSize; } size_t ShenandoahHeapRegion::get_gclab_allocs() const { - return _gclab_allocs * HeapWordSize; + return Atomic::load(&_gclab_allocs) * HeapWordSize; } size_t ShenandoahHeapRegion::get_plab_allocs() const { - return _plab_allocs * HeapWordSize; + return Atomic::load(&_plab_allocs) * HeapWordSize; } void ShenandoahHeapRegion::set_live_data(size_t s) { @@ -854,6 +855,8 @@ size_t ShenandoahHeapRegion::pin_count() const { } void ShenandoahHeapRegion::set_affiliation(ShenandoahAffiliation new_affiliation) { + assert(new_affiliation != OLD_GENERATION || !reserved_for_direct_allocation(), "Reserved region can't move to old"); + ShenandoahHeap* heap = ShenandoahHeap::heap(); ShenandoahAffiliation region_affiliation = heap->region_affiliation(this); diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.hpp b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.hpp index 4c99364bc6ed4..4cde038bc1764 100644 --- a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.hpp +++ b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.hpp @@ -250,11 +250,11 @@ class ShenandoahHeapRegion { HeapWord* _coalesce_and_fill_boundary; // for old regions not selected as collection set candidates. 
   // Frequently updated fields
-  HeapWord* _top;
+  HeapWord* volatile _top;
 
-  size_t _tlab_allocs;
-  size_t _gclab_allocs;
-  size_t _plab_allocs;
+  size_t volatile _tlab_allocs;
+  size_t volatile _gclab_allocs;
+  size_t volatile _plab_allocs;
 
   volatile size_t _live_data;
   volatile size_t _critical_pins;
@@ -268,6 +268,8 @@ class ShenandoahHeapRegion {
 
   bool _needs_bitmap_reset;
 
+  ShenandoahSharedFlag _direct_alloc_reserved; // Flag to indicate whether the region is reserved for lock-free direct allocation
+
 public:
   ShenandoahHeapRegion(HeapWord* start, size_t index, bool committed);
 
@@ -366,6 +368,15 @@ class ShenandoahHeapRegion {
   // Allocation (return nullptr if full)
   inline HeapWord* allocate(size_t word_size, const ShenandoahAllocRequest& req);
 
+  inline HeapWord* allocate_lab(const ShenandoahAllocRequest &req, size_t &actual_size);
+
+  // Atomic allocation using CAS; returns nullptr if the region is full or does not have enough space for the request
+  inline HeapWord* allocate_atomic(size_t word_size, const ShenandoahAllocRequest &req);
+
+  inline HeapWord* allocate_lab_atomic(const ShenandoahAllocRequest &req, size_t &actual_size);
+
+  inline bool try_allocate(HeapWord* const obj, size_t const size);
+
   inline void clear_live_data();
   void set_live_data(size_t s);
 
@@ -425,8 +436,12 @@ class ShenandoahHeapRegion {
   // Find humongous start region that this region belongs to
   ShenandoahHeapRegion* humongous_start_region() const;
 
-  HeapWord* top() const         { return _top;     }
-  void set_top(HeapWord* v)     { _top = v;        }
+  HeapWord* top() const {
+    return Atomic::load(&_top);
+  }
+  void set_top(HeapWord* v) {
+    Atomic::store(&_top, v);
+  }
 
   HeapWord* new_top() const     { return _new_top; }
   void set_new_top(HeapWord* v) { _new_top = v;    }
@@ -491,6 +506,20 @@ class ShenandoahHeapRegion {
     _needs_bitmap_reset = false;
   }
 
+  inline void reserve_for_direct_allocation() {
+    assert(_direct_alloc_reserved.is_unset(), "Must be");
+    _direct_alloc_reserved.set();
+  }
+
+  inline void release_from_direct_allocation() {
+    assert(_direct_alloc_reserved.is_set(), "Must be");
+    _direct_alloc_reserved.unset();
+  }
+
+  inline bool reserved_for_direct_allocation() const {
+    return _direct_alloc_reserved.is_set();
+  }
+
 private:
   void decrement_humongous_waste() const;
   void do_commit();
diff --git a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.inline.hpp b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.inline.hpp
index 0df482c1e2dab..bbfd325edb36d 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.inline.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.inline.hpp
@@ -109,6 +109,85 @@ HeapWord* ShenandoahHeapRegion::allocate(size_t size, const ShenandoahAllocReque
   }
 }
 
+HeapWord* ShenandoahHeapRegion::allocate_lab(const ShenandoahAllocRequest& req, size_t &actual_size) {
+  shenandoah_assert_heaplocked_or_safepoint();
+  assert(req.is_lab_alloc(), "Only lab alloc");
+  assert(this->affiliation() == req.affiliation(), "Region affiliation should already be established");
+
+  size_t adjusted_size = req.size();
+  HeapWord* obj = nullptr;
+  HeapWord* old_top = top();
+  size_t free_words = align_down(byte_size(old_top, end()) >> LogHeapWordSize, MinObjAlignment);
+  if (adjusted_size > free_words) {
+    adjusted_size = free_words;
+  }
+  if (adjusted_size >= req.min_size()) {
+    obj = allocate(adjusted_size, req);
+    actual_size = adjusted_size;
+    assert(obj == old_top, "Must be");
+  }
+  return obj;
+}
+
+HeapWord* ShenandoahHeapRegion::allocate_atomic(size_t size, const ShenandoahAllocRequest& req) {
+  assert(is_object_aligned(size), "alloc size breaks alignment: %zu", size);
+  assert(this->affiliation() == req.affiliation(), "Region affiliation should already be established");
+  assert(this->is_regular() || this->is_regular_pinned(), "must be a regular region");
+
+  for (;;) {
+    if (!reserved_for_direct_allocation()) {
+      return nullptr;
+    }
+    HeapWord* obj = top();
+    if (pointer_delta(end(), obj) >= size) {
+      if (try_allocate(obj, size)) {
+        adjust_alloc_metadata(req.type(), size);
+        return obj;
+      }
+    } else {
+      return nullptr;
+    }
+  }
+}
+
+HeapWord* ShenandoahHeapRegion::allocate_lab_atomic(const ShenandoahAllocRequest& req, size_t &actual_size) {
+  assert(req.is_lab_alloc(), "Only lab alloc");
+  assert(this->affiliation() == req.affiliation(), "Region affiliation should already be established");
+  assert(this->is_regular() || this->is_regular_pinned(), "must be a regular region");
+  size_t adjusted_size = req.size();
+  for (;;) {
+    if (!reserved_for_direct_allocation()) {
+      return nullptr;
+    }
+    HeapWord* obj = top();
+    size_t free_words = align_down(byte_size(obj, end()) >> LogHeapWordSize, MinObjAlignment);
+    if (adjusted_size > free_words) {
+      adjusted_size = free_words;
+    }
+    if (adjusted_size >= req.min_size()) {
+      if (try_allocate(obj, adjusted_size)) {
+        actual_size = adjusted_size;
+        adjust_alloc_metadata(req.type(), adjusted_size);
+        return obj;
+      }
+    } else {
+      log_trace(gc, free)("Failed to shrink TLAB or GCLAB request (%zu) in region %zu to %zu"
+                          " because min_size() is %zu", req.size(), index(), adjusted_size, req.min_size());
+      return nullptr;
+    }
+  }
+}
+
+bool ShenandoahHeapRegion::try_allocate(HeapWord* const obj, size_t const size) {
+  HeapWord* new_top = obj + size;
+  if (Atomic::cmpxchg(&_top, obj, new_top) == obj) {
+    assert(is_object_aligned(new_top), "new top breaks alignment: " PTR_FORMAT, p2i(new_top));
+    assert(is_object_aligned(obj), "obj is not aligned: " PTR_FORMAT, p2i(obj));
+    return true;
+  }
+  return false;
+}
+
 inline void ShenandoahHeapRegion::adjust_alloc_metadata(ShenandoahAllocRequest::Type type, size_t size) {
   switch (type) {
   case ShenandoahAllocRequest::_alloc_shared:
@@ -116,13 +195,13 @@ inline void ShenandoahHeapRegion::adjust_alloc_metadata(ShenandoahAllocRequest::
     // Counted implicitly by tlab/gclab allocs
     break;
   case ShenandoahAllocRequest::_alloc_tlab:
-    _tlab_allocs += size;
+    Atomic::add(&_tlab_allocs, size);
     break;
   case ShenandoahAllocRequest::_alloc_gclab:
-    _gclab_allocs += size;
+    Atomic::add(&_gclab_allocs, size);
    break;
   case ShenandoahAllocRequest::_alloc_plab:
-    _plab_allocs += size;
+    Atomic::add(&_plab_allocs, size);
     break;
   default:
     ShouldNotReachHere();
diff --git a/src/hotspot/share/gc/shenandoah/shenandoah_globals.hpp b/src/hotspot/share/gc/shenandoah/shenandoah_globals.hpp
index ad0beeafed79e..df638796bd8ef 100644
--- a/src/hotspot/share/gc/shenandoah/shenandoah_globals.hpp
+++ b/src/hotspot/share/gc/shenandoah/shenandoah_globals.hpp
@@ -564,6 +564,10 @@
           "to prevent starvation of the old collector. Setting this to "     \
           "0 will allow back to back young collections to run during old "   \
           "collections.")                                                    \
+  product(uintx, ShenandoahDirectlyAllocatableRegionCount, 13, EXPERIMENTAL, \
+          "Number of regions Shenandoah will pre-allocate for "              \
+          "direct allocation with CAS. The value should be less than the "   \
+          "number of CPU cores; ideally it should be a prime number.")       \
   // end of GC_SHENANDOAH_FLAGS

#endif // SHARE_GC_SHENANDOAH_SHENANDOAH_GLOBALS_HPP
diff --git a/src/hotspot/share/gc/shenandoah/vmStructs_shenandoah.hpp b/src/hotspot/share/gc/shenandoah/vmStructs_shenandoah.hpp
index a245f91fa71e9..069daa4990ee6 100644
--- a/src/hotspot/share/gc/shenandoah/vmStructs_shenandoah.hpp
+++ b/src/hotspot/share/gc/shenandoah/vmStructs_shenandoah.hpp
@@ -42,7 +42,7 @@
   volatile_nonstatic_field(ShenandoahHeapRegion, _state, ShenandoahHeapRegion::RegionState) \
   nonstatic_field(ShenandoahHeapRegion, _index, size_t const)                               \
   nonstatic_field(ShenandoahHeapRegion, _bottom, HeapWord* const)                           \
-  nonstatic_field(ShenandoahHeapRegion, _top, HeapWord*)                                    \
+  volatile_nonstatic_field(ShenandoahHeapRegion, _top, HeapWord*)                           \
   nonstatic_field(ShenandoahHeapRegion, _end, HeapWord* const)                              \
 
 #define VM_INT_CONSTANTS_SHENANDOAH(declare_constant, declare_constant_with_value) \