From 3e790cfe66bbc6738d9873d6a75d31880c87b32d Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Thu, 5 Oct 2023 17:14:02 -0400 Subject: [PATCH 1/8] Fix support for arm64 and aarch64 --- Makefile | 3 ++- bazel | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a21b18a1..7651b226 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,8 @@ # Version 2.0, that can be found in the LICENSE file. PREFIX = /usr -BAZEL_CONFIG = --config=modern-amd64 +# TODO: put the arch flag back for x86 +BAZEL_CONFIG = LIB_SUFFIX = UNAME_S = $(shell uname -s) diff --git a/bazel b/bazel index d45b7c95..fe16de1d 100755 --- a/bazel +++ b/bazel @@ -324,7 +324,7 @@ def determine_bazel_filename(version): machine = normalized_machine_arch_name() if machine != "x86_64" and machine != 'arm64': raise Exception( - 'Unsupported machine architecture "{}". Bazel currently only supports x86_64.'.format( + 'Unsupported machine architecture "{}". Bazel currently only supports x86_64 and arm64.'.format( machine ) ) @@ -342,6 +342,8 @@ def normalized_machine_arch_name(): machine = platform.machine().lower() if machine == "amd64": machine = "x86_64" + if machine == "aarch64": + machine = "arm64" return machine From 02bdb8c68f1d6891596947a8377b790346e0be59 Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Thu, 19 Oct 2023 16:21:26 -0400 Subject: [PATCH 2/8] Make ShuffleVector et al. work for 16K pages I also temporarily pointed heaplayers to a local copy since it needs changes for Apple Silicon too. 
--- WORKSPACE | 17 +++++++++++------ src/common.h | 20 ++++++++++++++++++-- src/shuffle_vector.h | 11 ++++++----- src/thread_local_heap.cc | 3 ++- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 6eea4954..a77682c9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -40,11 +40,16 @@ http_archive( ], ) -http_archive( +# FIXME: temporary +local_repository( name = "org_heaplayers", - sha256 = "c8a9f7589e13112515ba1ac8647b4e80462f18a6773f7f5f132a7d7602fe2aec", - strip_prefix = "Heap-Layers-{}".format(commit["heap_layers"]), - urls = [ - "https://github.com/emeryberger/Heap-Layers/archive/{}.zip".format(commit["heap_layers"]), - ], + path = "../Heap-Layers", ) +# http_archive( +# name = "org_heaplayers", +# sha256 = "c8a9f7589e13112515ba1ac8647b4e80462f18a6773f7f5f132a7d7602fe2aec", +# strip_prefix = "Heap-Layers-{}".format(commit["heap_layers"]), +# urls = [ +# "https://github.com/emeryberger/Heap-Layers/archive/{}.zip".format(commit["heap_layers"]), +# ], +# ) diff --git a/src/common.h b/src/common.h index 83310511..752ece7a 100644 --- a/src/common.h +++ b/src/common.h @@ -58,6 +58,13 @@ #define MAP_NORESERVE 0 #endif +#if __APPLE__ +#include <TargetConditionals.h> +#if TARGET_CPU_ARM64 +#define MESH_APPLE_SILICON +#endif +#endif + namespace mesh { static constexpr bool kMeshingEnabled = MESHING_ENABLED == 1; @@ -79,7 +86,11 @@ static constexpr size_t kMaxSize = 16384; static constexpr size_t kClassSizesMax = 25; static constexpr size_t kAlignment = 8; static constexpr int kMinAlign = 16; +#ifdef MESH_APPLE_SILICON +static constexpr uint64_t kPageSize = 16384; +#else static constexpr uint64_t kPageSize = 4096; +#endif static constexpr size_t kMaxFastLargeSize = 256 * 1024; // 256Kb static constexpr size_t kMaxSplitListSize = 16384; @@ -113,10 +124,15 @@ static constexpr size_t kMinArenaExpansion = 4096; // 16 MB in pages // ensures we amortize the cost of going to the global heap enough static constexpr uint64_t kMinStringLen = 8; static constexpr size_t 
kMiniheapRefillGoalSize = 4 * 1024; +// this must be kept below 2^6 because it's used as the max value in a bitfield; see sv::Entry static constexpr size_t kMaxMiniheapsPerShuffleVector = 24; // shuffle vector features -static constexpr int16_t kMaxShuffleVectorLength = 256; // sizeof(uint8_t) << 8 +#ifdef MESH_APPLE_SILICON +static constexpr int16_t kMaxShuffleVectorLength = 1024; +#else +static constexpr int16_t kMaxShuffleVectorLength = 256; +#endif static constexpr bool kEnableShuffleOnInit = SHUFFLE_ON_INIT == 1; static constexpr bool kEnableShuffleOnFree = SHUFFLE_ON_FREE == 1; @@ -182,7 +198,7 @@ using std::unique_lock; #define ATTRIBUTE_ALIGNED(s) __attribute__((aligned(s))) #define ATTRIBUTE_MALLOC __attribute__((malloc)) #define ATTRIBUTE_ALLOC_SIZE(x) __attribute__((alloc_size(x))) -#define ATTRIBUTE_ALLOC_SIZE2(x,y) __attribute__((alloc_size(x, y))) +#define ATTRIBUTE_ALLOC_SIZE2(x, y) __attribute__((alloc_size(x, y))) #define CACHELINE_SIZE 64 #define CACHELINE_ALIGNED ATTRIBUTE_ALIGNED(CACHELINE_SIZE) #define CACHELINE_ALIGNED_FN CACHELINE_ALIGNED diff --git a/src/shuffle_vector.h b/src/shuffle_vector.h index 5a8cd6cd..eb6698da 100644 --- a/src/shuffle_vector.h +++ b/src/shuffle_vector.h @@ -27,7 +27,7 @@ class Entry { Entry() noexcept : _mhOffset{0}, _bitOffset{0} { } - explicit Entry(uint8_t mhOff, uint8_t bitOff) : _mhOffset{mhOff}, _bitOffset{bitOff} { + explicit Entry(uint16_t mhOff, uint16_t bitOff) : _mhOffset{mhOff}, _bitOffset{bitOff} { } Entry(const Entry &rhs) = default; @@ -40,17 +40,18 @@ class Entry { return _mhOffset == rhs._mhOffset && _bitOffset == rhs._bitOffset; } - inline uint8_t ATTRIBUTE_ALWAYS_INLINE miniheapOffset() const { + // FIXME: should this cast to uint8_t? 
+ inline uint16_t ATTRIBUTE_ALWAYS_INLINE miniheapOffset() const { return _mhOffset; } - inline uint8_t ATTRIBUTE_ALWAYS_INLINE bit() const { + inline uint16_t ATTRIBUTE_ALWAYS_INLINE bit() const { return _bitOffset; } private: - uint8_t _mhOffset; - uint8_t _bitOffset; + uint16_t _mhOffset : 6; + uint16_t _bitOffset : 10; }; static_assert(sizeof(Entry) == 2, "Entry too big!"); } // namespace sv diff --git a/src/thread_local_heap.cc b/src/thread_local_heap.cc index 8a11ea15..384d7f76 100644 --- a/src/thread_local_heap.cc +++ b/src/thread_local_heap.cc @@ -44,7 +44,8 @@ void ThreadLocalHeap::InitTLH() { ThreadLocalHeap *ThreadLocalHeap::NewHeap(pthread_t current) { // we just allocate out of our internal heap void *buf = mesh::internal::Heap().malloc(sizeof(ThreadLocalHeap)); - static_assert(sizeof(ThreadLocalHeap) < 4096 * 8, "tlh should have a reasonable size"); + // FIXME: is this size too big? + static_assert(sizeof(ThreadLocalHeap) < 4096 * 16, "tlh should have a reasonable size"); hard_assert(buf != nullptr); hard_assert(reinterpret_cast<uintptr_t>(buf) % CACHELINE_SIZE == 0); From 9a38eef210a629f47caca9f4db3fc8b6b451e785 Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Thu, 19 Oct 2023 16:46:02 -0400 Subject: [PATCH 3/8] Adjust Flags for 16K pages --- Makefile | 5 +++-- src/mini_heap.h | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 7651b226..418a811d 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,9 @@ # Version 2.0, that can be found in the LICENSE file. 
PREFIX = /usr -# TODO: put the arch flag back for x86 -BAZEL_CONFIG = +# FIXME: put the arch flag back for x86 +# FIXME: revert temp flags +BAZEL_CONFIG = --config=disable-meshing --config=debugsymbols LIB_SUFFIX = UNAME_S = $(shell uname -s) diff --git a/src/mini_heap.h b/src/mini_heap.h index a480b22e..d71df4c3 100644 --- a/src/mini_heap.h +++ b/src/mini_heap.h @@ -31,10 +31,16 @@ class Flags { static inline constexpr uint32_t ATTRIBUTE_ALWAYS_INLINE getSingleBitMask(uint32_t pos) { return 1UL << pos; } + // FIXME: these need to be updated so offset has enough space (10 bits) static constexpr uint32_t SizeClassShift = 0; static constexpr uint32_t FreelistIdShift = 6; + // max value is (16K / 16 - 1) - 1 = 1022 + // so needs 10 bits static constexpr uint32_t ShuffleVectorOffsetShift = 8; - static constexpr uint32_t MaxCountShift = 16; + // max value is 16K / 16 = 1024 = 2^10 (for 16K pages) + // so needs 11 bits (10 for 0-1023 + 1) + // we give it one more in case we need more later + static constexpr uint32_t MaxCountShift = 18; static constexpr uint32_t MeshedOffset = 30; inline void ATTRIBUTE_ALWAYS_INLINE setMasked(uint32_t mask, uint32_t newVal) { @@ -53,9 +59,9 @@ class Flags { (freelistId << FreelistIdShift)} { d_assert((freelistId & 0x3) == freelistId); d_assert((sizeClass & ((1 << FreelistIdShift) - 1)) == sizeClass); - d_assert(svOffset < 255); + d_assert(svOffset < (kPageSize / kMinObjectSize - 1)); d_assert_msg(sizeClass < 255, "sizeClass: %u", sizeClass); - d_assert(maxCount <= 256); + d_assert(maxCount <= (kPageSize / kMinObjectSize)); d_assert(this->maxCount() == maxCount); } From e5d3d896c8e723f48cdf938d4a01432a113e173b Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Tue, 24 Oct 2023 16:20:19 -0400 Subject: [PATCH 4/8] Migrate bitmaps to page size independence --- src/bitmap.h | 39 ++++++++++++++++++--------------------- src/common.h | 17 ++++++++++++----- src/mini_heap.h | 19 ++++++++++--------- 3 files changed, 40 insertions(+), 35 deletions(-) 
diff --git a/src/bitmap.h b/src/bitmap.h index bb9df24d..5af335d4 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -86,14 +86,10 @@ class AtomicBitmapBase { AtomicBitmapBase(size_t bitCount) { d_assert_msg(bitCount <= maxBits, "max bits (%zu) exceeded: %zu", maxBits, bitCount); - static_assert(wordCount(representationSize(maxBits)) == 4, "unexpected representation size"); - // for (size_t i = 0; i < wordCount(representationSize(maxBits)); i++) { - // _bits[i].store(0, std::memory_order_relaxed); - // } - _bits[0].store(0, std::memory_order_relaxed); - _bits[1].store(0, std::memory_order_relaxed); - _bits[2].store(0, std::memory_order_relaxed); - _bits[3].store(0, std::memory_order_relaxed); + // FIXME: this used to be manually unrolled. hopefully clang can unroll it for us? + for (size_t i = 0; i < wordCount(representationSize(maxBits)); i++) { + _bits[i].store(0, std::memory_order_relaxed); + } std::atomic_thread_fence(std::memory_order_release); } @@ -101,13 +97,10 @@ class AtomicBitmapBase { } inline void ATTRIBUTE_ALWAYS_INLINE setAndExchangeAll(size_t *oldBits, const size_t *newBits) { - // for (size_t i = 0; i < wordCount(representationSize(maxBits)); i++) { - // oldBits[i] = _bits[i].exchange(newBits[i]); - // } - oldBits[0] = _bits[0].exchange(newBits[0], std::memory_order_acq_rel); - oldBits[1] = _bits[1].exchange(newBits[1], std::memory_order_acq_rel); - oldBits[2] = _bits[2].exchange(newBits[2], std::memory_order_acq_rel); - oldBits[3] = _bits[3].exchange(newBits[3], std::memory_order_acq_rel); + // FIXME: this used to be manually unrolled. hopefully clang can unroll it for us? 
+ for (size_t i = 0; i < wordCount(representationSize(maxBits)); i++) { + oldBits[i] = _bits[i].exchange(newBits[i], std::memory_order_acq_rel); + } } public: @@ -140,8 +133,12 @@ class AtomicBitmapBase { } inline uint32_t ATTRIBUTE_ALWAYS_INLINE inUseCount() const { - return __builtin_popcountl(_bits[0]) + __builtin_popcountl(_bits[1]) + __builtin_popcountl(_bits[2]) + - __builtin_popcountl(_bits[3]); + // FIXME: this used to be manually unrolled. hopefully clang can unroll it for us? + uint32_t sum = 0; + for (size_t i = 0; i < wordCount(representationSize(maxBits)); i++) { + sum += __builtin_popcountl(_bits[i]); + } + return sum; } protected: @@ -577,12 +574,12 @@ class BitmapBase : public Super { } // namespace bitmap namespace internal { -typedef bitmap::BitmapBase> Bitmap; -typedef bitmap::BitmapBase> RelaxedFixedBitmap; +typedef bitmap::BitmapBase> Bitmap; +typedef bitmap::BitmapBase> RelaxedFixedBitmap; typedef bitmap::BitmapBase RelaxedBitmap; -static_assert(sizeof(Bitmap) == sizeof(size_t) * 4, "Bitmap unexpected size"); -static_assert(sizeof(RelaxedFixedBitmap) == sizeof(size_t) * 4, "Bitmap unexpected size"); +static_assert(sizeof(Bitmap) <= sizeof(size_t) * 16, "Bitmap unexpected size"); +static_assert(sizeof(RelaxedFixedBitmap) <= sizeof(size_t) * 16, "Bitmap unexpected size"); static_assert(sizeof(RelaxedBitmap) == sizeof(size_t) * 2, "Bitmap unexpected size"); } // namespace internal } // namespace mesh diff --git a/src/common.h b/src/common.h index 752ece7a..0797044c 100644 --- a/src/common.h +++ b/src/common.h @@ -76,21 +76,28 @@ static constexpr int kMapShared = 1; static constexpr int kMapShared = kMeshingEnabled ? MAP_SHARED : MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; #endif -// we have to define this here for use in meshable_arena's CheapHeap we allocate -// MiniHeaps out of. 
We validate (and fail compilation) if this gets out of date -// with a static_assert at the bottom of mini_heap.h -static constexpr size_t kMiniHeapSize = 64; - static constexpr size_t kMinObjectSize = 16; static constexpr size_t kMaxSize = 16384; static constexpr size_t kClassSizesMax = 25; static constexpr size_t kAlignment = 8; static constexpr int kMinAlign = 16; + #ifdef MESH_APPLE_SILICON + static constexpr uint64_t kPageSize = 16384; +// we have to define this here for use in meshable_arena's CheapHeap we allocate +// MiniHeaps out of. We validate (and fail compilation) if this gets out of date +// with a static_assert at the bottom of mini_heap.h +static constexpr size_t kMiniHeapSize = 160; + #else + static constexpr uint64_t kPageSize = 4096; +// see comment in other branch +static constexpr size_t kMiniHeapSize = 64; + #endif + static constexpr size_t kMaxFastLargeSize = 256 * 1024; // 256Kb static constexpr size_t kMaxSplitListSize = 16384; diff --git a/src/mini_heap.h b/src/mini_heap.h index d71df4c3..74709ed7 100644 --- a/src/mini_heap.h +++ b/src/mini_heap.h @@ -518,20 +518,21 @@ class MiniHeap { return spanptr; } - internal::Bitmap _bitmap; // 32 bytes 32 - const Span _span; // 8 40 - MiniHeapListEntry _freelist{}; // 8 48 - atomic _current{0}; // 4 52 - Flags _flags; // 4 56 - const float _objectSizeReciprocal; // 4 60 - MiniHeapID _nextMeshed{}; // 4 64 + // The comment are for the max size, since it is architecture-dependent. 
+ internal::Bitmap _bitmap; // 128 128 bytes + const Span _span; // 8 136 + MiniHeapListEntry _freelist{}; // 8 144 + atomic<pid_t> _current{0}; // 4 148 + Flags _flags; // 4 152 + const float _objectSizeReciprocal; // 4 156 + MiniHeapID _nextMeshed{}; // 4 160 }; typedef FixedArray MiniHeapArray; static_assert(sizeof(pid_t) == 4, "pid_t not 32-bits!"); -static_assert(sizeof(mesh::internal::Bitmap) == 32, "Bitmap too big!"); -static_assert(sizeof(MiniHeap) == 64, "MiniHeap too big!"); +static_assert(sizeof(mesh::internal::Bitmap) == kMaxShuffleVectorLength / 8, "Bitmap too big!"); +static_assert(sizeof(MiniHeap) <= 160, "MiniHeap too big!"); static_assert(sizeof(MiniHeap) == kMiniHeapSize, "MiniHeap size mismatch"); static_assert(sizeof(MiniHeapArray) == 64 * sizeof(void *), "MiniHeapArray too big!"); } // namespace mesh From 9e0a5bfb24da1b072c6aefa7d40563cb0735843b Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Tue, 24 Oct 2023 17:03:36 -0400 Subject: [PATCH 5/8] Migrate some more things --- src/common.h | 2 +- src/global_heap.cc | 2 +- src/mac_wrapper.cc | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index 0797044c..c2e35a22 100644 --- a/src/common.h +++ b/src/common.h @@ -150,7 +150,7 @@ static constexpr std::chrono::milliseconds kZeroMs{0}; static constexpr std::chrono::milliseconds kMeshPeriodMs{100}; // 100 ms // controls aspects of miniheaps -static constexpr size_t kMaxMeshes = 256; // 1 per bit +static constexpr size_t kMaxMeshes = kMaxShuffleVectorLength; // 1 per bit #ifdef __APPLE__ static constexpr size_t kArenaSize = 32ULL * 1024ULL * 1024ULL * 1024ULL; // 16 GB #else diff --git a/src/global_heap.cc b/src/global_heap.cc index ae6be6ad..146b45bd 100644 --- a/src/global_heap.cc +++ b/src/global_heap.cc @@ -505,7 +505,7 @@ shiftedSplitting(MWC &prng, MiniHeapListEntry *miniheaps, SplitArray &left, Spli return; } - constexpr size_t nBytes = 32; + constexpr size_t nBytes = kMaxShuffleVectorLength / 8; const size_t 
limit = rightSize < t ? rightSize : t; d_assert(nBytes == left[0]->bitmap().byteCount()); diff --git a/src/mac_wrapper.cc b/src/mac_wrapper.cc index 4183c6c8..137bb289 100644 --- a/src/mac_wrapper.cc +++ b/src/mac_wrapper.cc @@ -306,6 +306,7 @@ MESH_EXPORT void replace_malloc_destroy_zone(malloc_zone_t *) { MESH_EXPORT kern_return_t replace_malloc_get_all_zones(task_t, memory_reader_t, vm_address_t **addresses, unsigned *count) { *addresses = 0; + // FIXME: this produces a warning by clang, so maybe this should have a *? count = 0; return KERN_SUCCESS; } From e4fd57b7c993eb84e6c70b06872f84d69e7f8936 Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Thu, 26 Oct 2023 17:14:37 -0400 Subject: [PATCH 6/8] Fix typo --- src/mini_heap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mini_heap.h b/src/mini_heap.h index 74709ed7..4d753cce 100644 --- a/src/mini_heap.h +++ b/src/mini_heap.h @@ -518,7 +518,7 @@ class MiniHeap { return spanptr; } - // The comment are for the max size, since it is architecture-dependent. + // The comments are for the max size, since it is architecture-dependent. internal::Bitmap _bitmap; // 128 128 bytes const Span _span; // 8 136 MiniHeapListEntry _freelist{}; // 8 144 From c4636522fb12b9affb20796625638e2ac794deeb Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Wed, 17 Apr 2024 19:04:32 -0400 Subject: [PATCH 7/8] Apply bugfix from Bobby maxCount bit width changed but load wasn't updated. 
--- src/mini_heap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mini_heap.h b/src/mini_heap.h index 4d753cce..b44ab38c 100644 --- a/src/mini_heap.h +++ b/src/mini_heap.h @@ -62,7 +62,7 @@ class Flags { d_assert(svOffset < (kPageSize / kMinObjectSize - 1)); d_assert_msg(sizeClass < 255, "sizeClass: %u", sizeClass); d_assert(maxCount <= (kPageSize / kMinObjectSize)); - d_assert(this->maxCount() == maxCount); + d_assert_msg(this->maxCount() == maxCount, "maxCount() (%u) != maxCount (%u)", this->maxCount(), maxCount); } inline uint32_t freelistId() const { @@ -79,7 +79,7 @@ class Flags { inline uint32_t maxCount() const { // XXX: does this assume little endian? - return (_flags.load(std::memory_order_seq_cst) >> MaxCountShift) & 0x1ff; + return (_flags.load(std::memory_order_seq_cst) >> MaxCountShift) & 0x7ff; } inline uint32_t sizeClass() const { From 4597b6a191855e8ffbcbe86750a349739253995e Mon Sep 17 00:00:00 2001 From: Noah Lev Date: Thu, 2 May 2024 17:30:49 -0400 Subject: [PATCH 8/8] Apply remaining fixes from Bobby --- src/runtime.cc | 2 +- src/testing/unit/concurrent_mesh_test.cc | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/runtime.cc b/src/runtime.cc index 528604cf..adfdc617 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -358,7 +358,7 @@ void Runtime::segfaultHandler(int sig, siginfo_t *siginfo, void *context) { // okToProceed is a barrier that ensures any in-progress meshing has // completed, and the reason for the fault was 'just' a meshing - if (siginfo->si_code == SEGV_ACCERR && runtime().heap().okToProceed(siginfo->si_addr)) { + if ((siginfo->si_code == SEGV_ACCERR || siginfo->si_code == SEGV_MAPERR) && runtime().heap().okToProceed(siginfo->si_addr)) { // debug("TODO: trapped access violation from meshing, log stat\n"); return; } diff --git a/src/testing/unit/concurrent_mesh_test.cc b/src/testing/unit/concurrent_mesh_test.cc index 58c8e04e..9f1b92cd 100644 --- 
a/src/testing/unit/concurrent_mesh_test.cc +++ b/src/testing/unit/concurrent_mesh_test.cc @@ -11,6 +11,7 @@ #include "gtest/gtest.h" +#include "common.h" #include "internal.h" #include "meshing.h" #include "runtime.h" @@ -20,7 +21,11 @@ using namespace std; using namespace mesh; static constexpr uint32_t StrLen = 128; +#ifdef MESH_APPLE_SILICON +static constexpr uint32_t ObjCount = 128; +#else static constexpr uint32_t ObjCount = 32; +#endif static char *s1; static char *s2;