diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..d9fa1439 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "files.associations": { + "mutex": "c", + "shared_mutex": "c", + "condition_variable": "c", + "chrono": "cpp", + "random": "c", + "algorithm": "c" + } +} \ No newline at end of file diff --git a/P-CLHT/CMakeLists.txt b/P-CLHT/CMakeLists.txt index 1697d1cd..2950e814 100644 --- a/P-CLHT/CMakeLists.txt +++ b/P-CLHT/CMakeLists.txt @@ -42,5 +42,12 @@ set(P_CLHT_TEST example.cpp src/clht_lb_res.c src/clht_gc.c external/sspfd/sspfd.c external/ssmem/src/ssmem.c) add_executable(example ${P_CLHT_TEST}) -target_link_libraries(example ${TbbLib} ${JemallocLib} boost_system +target_link_libraries(example ${TbbLib} ${JemallocLib} pmemobj pmem boost_system + boost_thread pthread) + +set(P_CLHT_TEST test.cpp src/clht_lb_res.c src/clht_gc.c + external/sspfd/sspfd.c external/ssmem/src/ssmem.c) +add_executable(test ${P_CLHT_TEST}) + +target_link_libraries(test ${TbbLib} ${JemallocLib} pmemobj pmem boost_system boost_thread pthread) diff --git a/P-CLHT/Makefile b/P-CLHT/Makefile index daa01641..49fc7352 100644 --- a/P-CLHT/Makefile +++ b/P-CLHT/Makefile @@ -151,7 +151,7 @@ CFLAGS += $(PLATFORM) CFLAGS += $(OPTIMIZE) CFLAGS += $(DEBUG_FLAGS) -INCLUDES := -I$(MAININCLUDE) -I$(TOP)/external/include +INCLUDES := -I$(MAININCLUDE) -I$(TOP)/external/include OBJ_FILES := clht_gc.o SRC := src diff --git a/P-CLHT/README.md b/P-CLHT/README.md index ad788e60..009a2ee8 100644 --- a/P-CLHT/README.md +++ b/P-CLHT/README.md @@ -1,11 +1,10 @@ -## P-CLHT: Persistent Cache-Line Hash Table +## P-CLHT: Persistent Cache-Line Hash Table - PMDK -`P-CLHT` is a crash consistent version of [Cache-Line Hash Table](https://dl.acm.org/citation.cfm?id=2694359) (CLHT). +`P-CLHT` is a recoverable and crash-consistent version of [Cache-Line Hash Table](https://dl.acm.org/citation.cfm?id=2694359) (CLHT). 
CLHT is a cache-friendly hash table which restricts each bucket to be of the size of a cache line. CLHT is an unordered index only supporting point queries. -**Conversion**. `CLHT-LB` using lock-based writes for concurrency is converted into `P-CLHT` by adding cache -line flushes and memory fences after each critical volatile store. +**Conversion**. `CLHT-LB` using lock-based writes for concurrency is converted into `P-CLHT` by adding cache line flushes and memory fences after each critical volatile store. **Performance**. Compared with [CCEH](https://www.usenix.org/conference/fast19/presentation/nam) that is a state-of-the-art unordered index, `P-CLHT` shows **2.38x**, **1.35x**, and **1.25x** better performance in @@ -16,7 +15,55 @@ YCSB workload A, B, C respectively using random integer keys while **0.37x** wor **Use Case**. `P-CLHT` provides the superior performance of insertion and point lookup, even if not supporting range scans. Therefore, it would be appropriate to be used for the applications only consisting of point queries. +This branch of P-CLHT also uses PMDK to ensure the persistence and recoverability of the cache-line hash table. All other details of this data structure are the same (cache line flushing, alignment, etc) except for the backend library used to ensure persistence. + +**Motivation** The published implementation does not have a way of recovering permanent memory leaks during a crash. The PMDK library, specifically `libpmemobj`, gives us useful internal structures such as `pmemobj_root`, which is a stored offset within the persistent memory pool that can be used to recover any data that was left in a partial state, etc. + +**How We Used PMDK** The entire conversion required us to replace any data structure pointers to point to the persistent memory pool using the non-transactional, atomic allocation functions such as `pmemobj_alloc`. 
Since the `PMEMoid` structs (which store the pool offset and id) were 16 bytes, some code manipulation was required to ensure the cache-line alignment of the data structure. Finally, transactions were used for major hashtable operations such as insertion, resizing, and deletion. This part is still being tested and is a work-in-progress. If you look through the code and compare it with the `master` branch, you can see that the changes follow a logical pattern, and the modifications are relatively minor. + +**How to test recoverability?** The best way to recover your hashtable is following the paradigm presented in `clht_open` where all the user has to do is use `pmemobj_root` to recover the root (a clht_t object basically) of the persistent memory pool. Please make sure that you are opening the same pool with the correct pool layout! +``` +... +PMEMoid my_root = pmemobj_root(pop, sizeof(clht_t)); +if (pmemobj_direct(my_root) == NULL) +{ + perror("root pointer is null\n"); +} +... +clht_t* w = pmemobj_direct(my_root); +... +``` + ## Build & Run +### How to enable PM? +1. Install PMDK +```$ git clone https://github.com/pmem/pmdk.git +$ cd pmdk +$ git checkout tags/1.6 +$ make -j +$ cd .. +``` +2. Emulate PM with Ext4-DAX mount +```$ sudo mkfs.ext4 -b 4096 -E stride=512 -F /dev/pmem0 +$ sudo mount -o dax /dev/pmem0 /mnt/pmem +``` + +3. Set pool_size and pool name appropriately using `pmemobj_create`. For example: +``` +// Size of the memory pool +size_t pool_size = 2*1024*1024*1024UL; +if( access("/mnt/pmem/pool", F_OK ) != -1 ) +{ + // If the pool already exists, open it + pop = pmemobj_open("/mnt/pmem/pool", POBJ_LAYOUT_NAME(clht)); +} else +{ + // If the pool does not exist, create it + pop = pmemobj_create("/mnt/pmem/pool", POBJ_LAYOUT_NAME(clht), pool_size, 0666); +} +``` + +4. Make accordingly and run the example. 
#### Build @@ -35,4 +82,4 @@ $ ./example 10000 4 usage: ./example [n] [nthreads] n: number of keys (integer) nthreads: number of threads (integer) -``` +``` \ No newline at end of file diff --git a/P-CLHT/example.cpp b/P-CLHT/example.cpp index e76b5bb6..46012bb4 100644 --- a/P-CLHT/example.cpp +++ b/P-CLHT/example.cpp @@ -6,9 +6,9 @@ #include #include "tbb/tbb.h" -using namespace std; +#include -#include "clht.h" +#include "clht_lb_res.h" #include "ssmem.h" typedef struct thread_data { @@ -96,7 +96,7 @@ void run(char **argv) { thread_group[i].join(); auto duration = std::chrono::duration_cast( std::chrono::system_clock::now() - starttime); - printf("Throughput: load, %f ,ops/us\n", (n * 1.0) / duration.count()); + printf("Throughput: load, %f ,ops/s\n", (n * 1.0) / (duration.count()/1000000.0)); } barrier.crossing = 0; @@ -117,11 +117,12 @@ void run(char **argv) { barrier_cross(&barrier); for (uint64_t i = start_key; i < end_key; i++) { - uintptr_t val = clht_get(tds[thread_id].ht->ht, keys[i]); - if (val != keys[i]) { - std::cout << "[CLHT] wrong key read: " << val << "expected: " << keys[i] << std::endl; - exit(1); - } + clht_hashtable_t *ht = (clht_hashtable_t*)clht_ptr_from_off((tds[thread_id].ht)->ht_off); + uintptr_t val = clht_get(ht, keys[i]); + if (val != keys[i]) { + std::cout << "[CLHT] wrong key read: " << val << " expected: " << keys[i] << std::endl; + exit(1); + } } }; @@ -134,9 +135,9 @@ void run(char **argv) { thread_group[i].join(); auto duration = std::chrono::duration_cast( std::chrono::system_clock::now() - starttime); - printf("Throughput: run, %f ,ops/us\n", (n * 1.0) / duration.count()); + printf("Throughput: run, %f ,ops/s\n", (n * 1.0) / (duration.count()/1000000.0)); } - clht_gc_destroy(hashtable); + // clht_gc_destroy(hashtable); delete[] keys; } diff --git a/P-CLHT/include/clht_lb_res.h b/P-CLHT/include/clht_lb_res.h index 94a1108d..494dcdb4 100644 --- a/P-CLHT/include/clht_lb_res.h +++ b/P-CLHT/include/clht_lb_res.h @@ -36,6 +36,7 @@ 
#include #include "atomic_ops.h" #include "utils.h" +#include #ifdef __cplusplus extern "C" { @@ -160,7 +161,9 @@ typedef struct ALIGNED(CACHE_LINE_SIZE) bucket_s volatile uint32_t hops; clht_addr_t key[ENTRIES_PER_BUCKET]; clht_val_t val[ENTRIES_PER_BUCKET]; - volatile struct bucket_s* next; + // volatile struct bucket_s* next; + // PMEMoid next; + uint64_t next_off; } bucket_t; //#if __GNUC__ > 4 && __GNUC_MINOR__ > 4 @@ -173,7 +176,9 @@ typedef struct ALIGNED(CACHE_LINE_SIZE) clht { struct { - struct clht_hashtable_s* ht; + // PMEMoid ht; + uint64_t ht_off; + // struct clht_hashtable_s* ht; uint8_t next_cache_line[CACHE_LINE_SIZE - (sizeof(void*))]; struct clht_hashtable_s* ht_oldest; struct ht_ts* version_list; @@ -193,7 +198,9 @@ typedef struct ALIGNED(CACHE_LINE_SIZE) clht_hashtable_s struct { size_t num_buckets; - bucket_t* table; + // PMEMoid table; + uint64_t table_off; + //bucket_t* table; size_t hash; size_t version; uint8_t next_cache_line[CACHE_LINE_SIZE - (3 * sizeof(size_t)) - (sizeof(void*))]; @@ -441,6 +448,18 @@ int ht_resize_pes(clht_t* hashtable, int is_increase, int by); const char* clht_type_desc(); void clht_lock_initialization(clht_t *h); + +// Initialize the persistent memory pool +POBJ_LAYOUT_BEGIN(clht); +POBJ_LAYOUT_ROOT(clht, clht_t); +POBJ_LAYOUT_TOID(clht, clht_hashtable_t); +POBJ_LAYOUT_TOID(clht, bucket_t); +POBJ_LAYOUT_END(clht); + +/* Global pool uuid */ +uint64_t pool_uuid; +void* clht_ptr_from_off(uint64_t offset); + #ifdef __cplusplus } #endif diff --git a/P-CLHT/src/clht_gc.c b/P-CLHT/src/clht_gc.c index 797dbfe8..55246862 100644 --- a/P-CLHT/src/clht_gc.c +++ b/P-CLHT/src/clht_gc.c @@ -47,7 +47,8 @@ clht_gc_thread_init(clht_t* h, int id) ht_ts_t* ts = (ht_ts_t*) memalign(CACHE_LINE_SIZE, sizeof(ht_ts_t)); assert(ts != NULL); - ts->version = h->ht->version; + clht_hashtable_t* ht_ptr = clht_ptr_from_off(h->ht_off); + ts->version = ht_ptr->version; ts->id = id; do @@ -128,7 +129,8 @@ clht_gc_min_version_used(clht_t* h) { 
volatile ht_ts_t* cur = h->version_list; - size_t min = h->ht->version; + clht_hashtable_t* ht_ptr = clht_ptr_from_off(h->ht_off); + size_t min = ht_ptr->version; while (cur != NULL) { if (cur->version < min) @@ -149,8 +151,9 @@ clht_gc_min_version_used(clht_t* h) static int clht_gc_collect_cond(clht_t* hashtable, int collect_not_referenced_only) { + clht_hashtable_t* ht_ptr = clht_ptr_from_off(hashtable->ht_off); /* if version_min >= current version there is nothing to collect! */ - if ((hashtable->version_min >= hashtable->ht->version) || TRYLOCK_ACQ(&hashtable->gc_lock)) + if ((hashtable->version_min >= ht_ptr->version) || TRYLOCK_ACQ(&hashtable->gc_lock)) { /* printf("** someone else is performing gc\n"); */ return 0; @@ -160,7 +163,7 @@ clht_gc_collect_cond(clht_t* hashtable, int collect_not_referenced_only) /* printf("[GCOLLE-%02d] LOCK : %zu\n", GET_ID(collect_not_referenced_only), hashtable->version); */ - size_t version_min = hashtable->ht->version; + size_t version_min = ht_ptr->version; if (collect_not_referenced_only) { version_min = clht_gc_min_version_used(hashtable); @@ -221,20 +224,24 @@ clht_gc_free(clht_hashtable_t* hashtable) uint64_t bin; for (bin = 0; bin < num_buckets; bin++) { - bucket = hashtable->table + bin; - bucket = bucket->next; + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; + bucket = clht_ptr_from_off(bucket->next_off); + while (bucket != NULL) - { - volatile bucket_t* cur = bucket; - bucket = bucket->next; - free((void*) cur); - } + { + volatile bucket_t* cur = bucket; + bucket = clht_ptr_from_off(bucket->next_off); + PMEMoid cur_oid = pmemobj_oid((void*) cur); + pmemobj_free(&cur_oid); + } } #endif - free(hashtable->table); - free(hashtable); - + PMEMoid table_oid = {pool_uuid, hashtable->table_off}; + pmemobj_free(&(table_oid)); + PMEMoid ht_oid = pmemobj_oid((void*) hashtable); + pmemobj_free(&ht_oid); + return 1; } @@ -246,12 +253,13 @@ clht_gc_destroy(clht_t* hashtable) { #if !defined(CLHT_LINKED) 
clht_gc_collect_all(hashtable); - clht_gc_free(hashtable->ht); - free(hashtable); + clht_gc_free(clht_ptr_from_off(hashtable->ht_off)); + // PMEMoid ht_oid = pmemobj_oid((void*) hashtable); + // pmemobj_free(&ht_oid); #endif - // ssmem_alloc_term(clht_alloc); - free(clht_alloc); + // ssmem_alloc_term(clht_alloc); + //free(clht_alloc); } /* @@ -269,20 +277,26 @@ clht_gc_release(clht_hashtable_t* hashtable) uint64_t bin; for (bin = 0; bin < num_buckets; bin++) - { - bucket = hashtable->table + bin; - bucket = bucket->next; + { + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; + bucket = clht_ptr_from_off(bucket->next_off); + while (bucket != NULL) { volatile bucket_t* cur = bucket; - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); ssmem_release(clht_alloc, (void*) cur); + // PMEMoid cur_oid = pmemobj_oid((void*) cur); + // pmemobj_free(&cur_oid); } - } + } #endif - ssmem_release(clht_alloc, hashtable->table); + ssmem_release(clht_alloc, clht_ptr_from_off(hashtable->table_off)); ssmem_release(clht_alloc, hashtable); + // pmemobj_free(&(hashtable->table)); + // PMEMoid ht_oid = pmemobj_oid((void*) hashtable); + // pmemobj_free(&ht_oid); return 1; } diff --git a/P-CLHT/src/clht_lb_res.c b/P-CLHT/src/clht_lb_res.c index 9f86c3c2..0db89d7b 100644 --- a/P-CLHT/src/clht_lb_res.c +++ b/P-CLHT/src/clht_lb_res.c @@ -36,8 +36,17 @@ #include #include +#include + #include "clht_lb_res.h" +/* Global pool pointer */ +PMEMobjpool* pop; + +struct root { + struct clht_t *ht; +}; + //#define CLHTDEBUG //#define CRASH_AFTER_SWAP_CLHT //#define CRASH_BEFORE_SWAP_CLHT @@ -72,6 +81,8 @@ __thread size_t check_ht_status_steps = CLHT_STATUS_INVOK_IN; #endif */ +#define PMDK_TRANSACTION 0 + const char* clht_type_desc() { @@ -162,34 +173,67 @@ static inline void clflush_next_check(char *data, int len, bool fence) #elif CLWB asm volatile(".byte 0x66; xsaveopt %0" : "+m" (*(volatile char *)(ptr))); #endif - if (((bucket_t *)data)->next) - 
clflush_next_check((char *)(((bucket_t *)data)->next), sizeof(bucket_t), false); + if (clht_ptr_from_off( ((bucket_t *)data)->next_off ) ) + clflush_next_check((char *)clht_ptr_from_off( ((bucket_t *)data)->next_off ), sizeof(bucket_t), false); while(read_tsc() < etsc) cpu_pause(); } if (fence) mfence(); } +void* clht_ptr_from_off(uint64_t offset) +{ + PMEMoid oid = {pool_uuid, offset}; + return pmemobj_direct(oid); +} + +static int bucket_init(PMEMobjpool *pop_arg, void *ptr, void *arg) +{ + bucket_t* bucket = ptr; + bucket->lock = 0; + + uint32_t j; + for (j = 0; j < ENTRIES_PER_BUCKET; j++) + { + bucket->key[j] = 0; + } + bucket->next_off = OID_NULL.off; + return 0; +} + /* Create a new bucket. */ bucket_t* clht_bucket_create() { bucket_t* bucket = NULL; - bucket = (bucket_t *) memalign(CACHE_LINE_SIZE, sizeof(bucket_t)); - if (bucket == NULL) - { - return NULL; - } - + PMEMoid bucket_oid; +#if PMDK_TRANSACTION +TX_BEGIN(pop) { + bucket_oid = pmemobj_tx_alloc(sizeof(bucket_t), TOID_TYPE_NUM(bucket_t)); + bucket = pmemobj_direct(bucket_oid); bucket->lock = 0; - + uint32_t j; for (j = 0; j < ENTRIES_PER_BUCKET; j++) { bucket->key[j] = 0; } - bucket->next = NULL; - + bucket->next_off = OID_NULL.off; +} TX_END + #else + if (pmemobj_alloc(pop, &bucket_oid, sizeof(bucket_t), TOID_TYPE_NUM(bucket_t), bucket_init, 0)) + { + fprintf(stderr, "pmemobj_alloc failed for clht_bucket_create\n"); + assert(0); + } + bucket = pmemobj_direct(bucket_oid); +#endif + // bucket = (bucket_t *) memalign(CACHE_LINE_SIZE, sizeof(bucket_t)); + + if (bucket == NULL) + { + return NULL; + } return bucket; } @@ -206,34 +250,76 @@ clht_bucket_create_stats(clht_hashtable_t* h, int* resize) return b; } + clht_hashtable_t* clht_hashtable_create(uint64_t num_buckets); +// clht_hashtable_t* g_ptr; + clht_t* clht_create(uint64_t num_buckets) { - clht_t* w = (clht_t*) memalign(CACHE_LINE_SIZE, sizeof(clht_t)); - if (w == NULL) + // Open the PMEMpool if it exists, otherwise create it. 
+ size_t pool_size = 2*1024*1024*1024UL; + if( access("/mnt/pmem/pool", F_OK ) != -1 ) { - printf("** malloc @ hatshtalbe\n"); - return NULL; + pop = pmemobj_open("/mnt/pmem/pool", POBJ_LAYOUT_NAME(clht)); + // reload the root + } else + { + pop = pmemobj_create("/mnt/pmem/pool", POBJ_LAYOUT_NAME(clht), pool_size, 0666); + } + + if (pop == NULL) + { + perror("failed to open the pool\n"); } + + // Create the root pointer + PMEMoid my_root = pmemobj_root(pop, sizeof(clht_t)); + if (pmemobj_direct(my_root) == NULL) + { + perror("root pointer is null\n"); + } + pool_uuid = my_root.pool_uuid_lo; - w->ht = clht_hashtable_create(num_buckets); - if (w->ht == NULL) + clht_t* w = pmemobj_direct(my_root); + printf("my_root.off: %lu\n", my_root.off); + + if (w == NULL) { - free(w); + printf("** malloc @ hashtable\n"); return NULL; } - w->resize_lock = LOCK_FREE; - w->gc_lock = LOCK_FREE; - w->status_lock = LOCK_FREE; - w->version_list = NULL; - w->version_min = 0; - w->ht_oldest = w->ht; - clflush((char *)w->ht->table, num_buckets * sizeof(bucket_t), true); - clflush((char *)w->ht, sizeof(clht_hashtable_t), true); - clflush((char *)w, sizeof(clht_t), true); + if (w->ht_off == 0) { + clht_hashtable_t* ht_ptr; + + // Transactional allocation + ht_ptr = clht_hashtable_create(num_buckets); + // printf("g_ptr after abort: %p\n", g_ptr); + // PMEMoid temp = pmemobj_oid(g_ptr); + // printf("temp.offset: %d, temp.pool: %d\n", temp.off, temp.pool_uuid_lo); + if (ht_ptr == NULL) + { + /* w is the pmemobj root object, not heap memory: do not free() it */ + return NULL; + } + + printf("clht_create ht_ptr->table.off: %lu\n", ht_ptr->table_off); + w->ht_off = pmemobj_oid(ht_ptr).off; + + w->resize_lock = LOCK_FREE; + w->gc_lock = LOCK_FREE; + w->status_lock = LOCK_FREE; + w->version_list = NULL; + w->version_min = 0; + w->ht_oldest = ht_ptr; + + // This should flush everything to persistent memory + clflush((char *)clht_ptr_from_off(ht_ptr->table_off), num_buckets * sizeof(bucket_t), true); + clflush((char *)ht_ptr, sizeof(clht_hashtable_t), true); +
clflush((char *)w, sizeof(clht_t), true); + } return w; } @@ -249,35 +335,65 @@ clht_hashtable_create(uint64_t num_buckets) } /* Allocate the table itself. */ - hashtable = (clht_hashtable_t*) memalign(CACHE_LINE_SIZE, sizeof(clht_hashtable_t)); + // hashtable = (clht_hashtable_t*) memalign(CACHE_LINE_SIZE, sizeof(clht_hashtable_t)); + // Allocate the table in persistent memory + PMEMoid ht_oid; +#if PMDK_TRANSACTION +TX_BEGIN(pop) { + ht_oid = pmemobj_tx_alloc(sizeof(clht_hashtable_t), TOID_TYPE_NUM(clht_hashtable_t)); +#else + if (pmemobj_alloc(pop, &ht_oid, sizeof(clht_hashtable_t), TOID_TYPE_NUM(clht_hashtable_t), 0, 0)) + { + fprintf(stderr, "pmemobj_alloc failed for clht_hashtable_create\n"); + assert(0); + } +#endif + hashtable = pmemobj_direct(ht_oid); + // g_ptr = hashtable; + // printf("g_ptr: %p\n", g_ptr); + if (hashtable == NULL) { - printf("** malloc @ hatshtalbe\n"); + printf("** malloc @ hashtable\n"); return NULL; } /* hashtable->table = calloc(num_buckets, (sizeof(bucket_t))); */ - hashtable->table = (bucket_t*) memalign(CACHE_LINE_SIZE, num_buckets * (sizeof(bucket_t))); - if (hashtable->table == NULL) + // hashtable->table = (bucket_t*) memalign(CACHE_LINE_SIZE, num_buckets * (sizeof(bucket_t))); + PMEMoid table_oid; +#if PMDK_TRANSACTION + table_oid = pmemobj_tx_zalloc(num_buckets * sizeof(bucket_t), TOID_TYPE_NUM(bucket_t)); +#else + if (pmemobj_alloc(pop, &table_oid, num_buckets * sizeof(bucket_t), TOID_TYPE_NUM(bucket_t), 0, 0)) + { + fprintf(stderr, "pmemobj_alloc failed for table_oid in clht_hashtable_create\n"); + assert(0); + } +#endif + hashtable->table_off = table_oid.off; + + bucket_t* bucket_ptr = clht_ptr_from_off(hashtable->table_off); + + if (bucket_ptr == NULL) { printf("** alloc: hashtable->table\n"); fflush(stdout); free(hashtable); return NULL; } - memset(hashtable->table, 0, num_buckets * (sizeof(bucket_t))); + //memset(bucket_ptr, 0, num_buckets * (sizeof(bucket_t))); uint64_t i; for (i = 0; i < num_buckets; i++) { - 
hashtable->table[i].lock = LOCK_FREE; + bucket_ptr[i].lock = LOCK_FREE; uint32_t j; for (j = 0; j < ENTRIES_PER_BUCKET; j++) { - hashtable->table[i].key[j] = 0; + bucket_ptr[i].key[j] = 0; } } - + hashtable->num_buckets = num_buckets; hashtable->hash = num_buckets - 1; hashtable->version = 0; @@ -293,6 +409,9 @@ clht_hashtable_create(uint64_t num_buckets) hashtable->is_helper = 1; hashtable->helper_done = 0; +#if PMDK_TRANSACTION +} TX_END +#endif return hashtable; } @@ -316,8 +435,8 @@ clht_get(clht_hashtable_t* hashtable, clht_addr_t key) { size_t bin = clht_hash(hashtable, key); CLHT_GC_HT_VERSION_USED(hashtable); - volatile bucket_t* bucket = hashtable->table + bin; - + volatile bucket_t* bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; + uint32_t j; do { @@ -339,8 +458,7 @@ clht_get(clht_hashtable_t* hashtable, clht_addr_t key) } } } - - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (unlikely(bucket != NULL)); return 0; @@ -359,7 +477,7 @@ bucket_exists(volatile bucket_t* bucket, clht_addr_t key) return true; } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (unlikely(bucket != NULL)); return false; @@ -369,9 +487,10 @@ bucket_exists(volatile bucket_t* bucket, clht_addr_t key) int clht_put(clht_t* h, clht_addr_t key, clht_val_t val) { - clht_hashtable_t* hashtable = h->ht; + clht_hashtable_t* hashtable = clht_ptr_from_off(h->ht_off); size_t bin = clht_hash(hashtable, key); - volatile bucket_t* bucket = hashtable->table + bin; + volatile bucket_t* bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; + #if CLHT_READ_ONLY_FAIL == 1 if (bucket_exists(bucket, key)) { @@ -382,10 +501,10 @@ clht_put(clht_t* h, clht_addr_t key, clht_val_t val) clht_lock_t* lock = &bucket->lock; while (!LOCK_ACQ(lock, hashtable)) { - hashtable = h->ht; + hashtable = clht_ptr_from_off(h->ht_off); size_t bin = clht_hash(hashtable, key); - bucket = hashtable->table + bin; + bucket = 
((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; lock = &bucket->lock; } @@ -410,15 +529,39 @@ clht_put(clht_t* h, clht_addr_t key, clht_val_t val) empty_v = &bucket->val[j]; } } - + int resize = 0; - if (likely(bucket->next == NULL)) + if (likely(clht_ptr_from_off(bucket->next_off) == NULL)) { if (unlikely(empty == NULL)) { DPP(put_num_failed_expand); - - bucket_t* b = clht_bucket_create_stats(hashtable, &resize); + bucket_t* b; +#if PMDK_TRANSACTION + TX_BEGIN(pop) { + pmemobj_tx_add_range_direct((const void*)&(bucket->next_off), sizeof(uint64_t)); + // printf("made it here at least\n"); + b = clht_bucket_create_stats(hashtable, &resize); + b->val[0] = val; +#ifdef __tile__ + /* keep the writes in order */ + _mm_sfence(); +#endif + b->key[0] = key; +#ifdef __tile__ + /* make sure they are visible */ + _mm_sfence(); +#endif + } TX_FINALLY { + clflush((char *)b, sizeof(bucket_t), true); + bucket->next_off = pmemobj_oid(b).off; + /* flush the durable next_off field itself, not a stack-local copy */ + clflush((char *)&bucket->next_off, sizeof(uint64_t), true); + } TX_ONABORT { + printf("Failed clht_put, rolling back\n"); + } TX_END +#else + b = clht_bucket_create_stats(hashtable, &resize); b->val[0] = val; #ifdef __tile__ /* keep the writes in order */ @@ -427,11 +570,13 @@ clht_put(clht_t* h, clht_addr_t key, clht_val_t val) b->key[0] = key; #ifdef __tile__ /* make sure they are visible */ - _mm_sfence(); + _mm_sfence(); #endif clflush((char *)b, sizeof(bucket_t), true); - bucket->next = b; - clflush((char *)&bucket->next, sizeof(uintptr_t), true); + bucket->next_off = pmemobj_oid(b).off; + /* flush the durable next_off field itself, not a stack-local copy */ + clflush((char *)&bucket->next_off, sizeof(uint64_t), true); +#endif } else { @@ -450,16 +595,16 @@ clht_put(clht_t* h, clht_addr_t key, clht_val_t val) /* ht_resize_pes(h, 1); */ DEBUG_PRINT("Calling ht_status for key %ld\n", (long)key); int ret = ht_status(h, 1, 0); - // if crash, return true, because the insert anyway succeeded if
(ret == 0) return true; } return true; } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (true); + } @@ -467,9 +612,9 @@ clht_put(clht_t* h, clht_addr_t key, clht_val_t val) clht_val_t clht_remove(clht_t* h, clht_addr_t key) { - clht_hashtable_t* hashtable = h->ht; + clht_hashtable_t* hashtable = clht_ptr_from_off(h->ht_off); size_t bin = clht_hash(hashtable, key); - volatile bucket_t* bucket = hashtable->table + bin; + volatile bucket_t* bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; #if CLHT_READ_ONLY_FAIL == 1 if (!bucket_exists(bucket, key)) @@ -481,10 +626,10 @@ clht_remove(clht_t* h, clht_addr_t key) clht_lock_t* lock = &bucket->lock; while (!LOCK_ACQ(lock, hashtable)) { - hashtable = h->ht; + hashtable = clht_ptr_from_off(h->ht_off); size_t bin = clht_hash(hashtable, key); - bucket = hashtable->table + bin; + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; lock = &bucket->lock; } @@ -499,13 +644,21 @@ clht_remove(clht_t* h, clht_addr_t key) if (bucket->key[j] == key) { clht_val_t val = bucket->val[j]; +#if PMDK_TRANSACTION + TX_BEGIN(pop) { + pmemobj_tx_add_range_direct((const void*)&(bucket->key[j]), sizeof(clht_addr_t)); + bucket->key[j] = 0; + clflush((char *)&bucket->key[j], sizeof(uintptr_t), true); + } TX_END +#else bucket->key[j] = 0; clflush((char *)&bucket->key[j], sizeof(uintptr_t), true); +#endif LOCK_RLS(lock); return val; } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (unlikely(bucket != NULL)); LOCK_RLS(lock); @@ -515,7 +668,7 @@ clht_remove(clht_t* h, clht_addr_t key) static uint32_t clht_put_seq(clht_hashtable_t* hashtable, clht_addr_t key, clht_val_t val, uint64_t bin) { - volatile bucket_t* bucket = hashtable->table + bin; + volatile bucket_t* bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; uint32_t j; do @@ -524,23 +677,44 @@ clht_put_seq(clht_hashtable_t* hashtable, clht_addr_t key, clht_val_t val, uint6 { 
if (bucket->key[j] == 0) { +#if PMDK_TRANSACTION + TX_BEGIN(pop) { + pmemobj_tx_add_range_direct((const void*)&(bucket->key[j]), sizeof(clht_addr_t)); + pmemobj_tx_add_range_direct((const void*)&(bucket->val[j]), sizeof(clht_val_t)); + bucket->val[j] = val; + bucket->key[j] = key; + } TX_END +#else bucket->val[j] = val; bucket->key[j] = key; +#endif return true; } } - if (bucket->next == NULL) + if (clht_ptr_from_off(bucket->next_off) == NULL) { DPP(put_num_failed_expand); int null; - bucket->next = clht_bucket_create_stats(hashtable, &null); - bucket->next->val[0] = val; - bucket->next->key[0] = key; + bucket->next_off = pmemobj_oid(clht_bucket_create()).off; + bucket_t* bucket_ptr = clht_ptr_from_off(bucket->next_off); +#if PMDK_TRANSACTION + TX_BEGIN(pop) { + pmemobj_tx_add_range_direct((const void*)&(bucket->key[0]), sizeof(clht_addr_t)); + pmemobj_tx_add_range_direct((const void*)&(bucket->val[0]), sizeof(clht_val_t)); + bucket_ptr->val[0] = val; + bucket_ptr->key[0] = key; + } TX_ONABORT { + printf("Failed clht_put_seq, rolling back\n"); + } TX_END +#else + bucket_ptr->val[0] = val; + bucket_ptr->key[0] = key; +#endif return true; } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (true); } @@ -594,7 +768,7 @@ bucket_cpy(clht_t* h, volatile bucket_t* bucket, clht_hashtable_t* ht_new) clht_put_seq(ht_new, key, bucket->val[j], bin); } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (bucket != NULL); @@ -614,7 +788,7 @@ ht_resize_help(clht_hashtable_t* h) /* hash = num_buckets - 1 */ for (b = h->hash; b >= 0; b--) { - bucket_t* bu_cur = h->table + b; + bucket_t* bu_cur = ((bucket_t*)clht_ptr_from_off(h->table_off)) + b; if (!bucket_cpy((clht_t *)h, bu_cur, h->table_tmp)) { /* reached a point where the resizer is handling */ /* printf("[GC-%02d] helped #buckets: %10zu = %5.1f%%\n", */ @@ -631,12 +805,13 @@ ht_resize_help(clht_hashtable_t* h) int ht_resize_pes(clht_t* h, int is_increase, int by) { 
-// ticks s = getticks(); - check_ht_status_steps = CLHT_STATUS_INVOK; + ticks s = getticks(); - clht_hashtable_t* ht_old = h->ht; + check_ht_status_steps = CLHT_STATUS_INVOK; + clht_hashtable_t* ht_old = clht_ptr_from_off(h->ht_off); + if (TRYLOCK_ACQ(&h->resize_lock)) { return 0; @@ -656,8 +831,11 @@ ht_resize_pes(clht_t* h, int is_increase, int by) num_buckets_new = ht_old->num_buckets / CLHT_RATIO_HALVE; } - /* printf("// resizing: from %8zu to %8zu buckets\n", ht_old->num_buckets, num_buckets_new); */ + printf("// resizing: from %8zu to %8zu buckets\n", ht_old->num_buckets, num_buckets_new); +#if PMDK_TRANSACTION + TX_BEGIN(pop) { +#endif clht_hashtable_t* ht_new = clht_hashtable_create(num_buckets_new); ht_new->version = ht_old->version + 1; @@ -667,7 +845,7 @@ ht_resize_pes(clht_t* h, int is_increase, int by) size_t b; for (b = 0; b < ht_old->num_buckets; b++) { - bucket_t* bu_cur = ht_old->table + b; + bucket_t* bu_cur = (bucket_t*)(clht_ptr_from_off(ht_old->table_off)) + b; int ret = bucket_cpy(h, bu_cur, ht_new); /* reached a point where the helper is handling */ if (ret == -1) return -1; @@ -688,8 +866,8 @@ ht_resize_pes(clht_t* h, int is_increase, int by) size_t b; for (b = 0; b < ht_old->num_buckets; b++) - { - bucket_t* bu_cur = ht_old->table + b; + { + bucket_t* bu_cur = (bucket_t*)(clht_ptr_from_off(ht_old->table_off)) + b; int ret = bucket_cpy(h, bu_cur, ht_new); if (ret == -1) return -1; @@ -715,7 +893,7 @@ ht_resize_pes(clht_t* h, int is_increase, int by) mfence(); clflush((char *)ht_new, sizeof(clht_hashtable_t), false); - clflush_next_check((char *)ht_new->table, num_buckets_new * sizeof(bucket_t), false); + clflush_next_check((char *)clht_ptr_from_off(ht_new->table_off), num_buckets_new * sizeof(bucket_t), false); mfence(); #if defined(CRASH_BEFORE_SWAP_CLHT) @@ -749,7 +927,17 @@ ht_resize_pes(clht_t* h, int is_increase, int by) #endif // atomically swap the root pointer - SWAP_U64((uint64_t*) h, (uint64_t) ht_new); + // are there any race 
conditions? + PMEMoid ht_new_oid = pmemobj_oid(ht_new); + uint64_t ht_new_oid_off = ht_new_oid.off; + // uint64_t h_new = (uint64_t)h + sizeof(uint64_t); + uint64_t* h_new = &(h->ht_off); +#if PMDK_TRANSACTION + pmemobj_tx_add_range_direct(&(h->ht_off), sizeof(uint64_t)); +#endif + //SWAP_U64((uint64_t*) h, (uint64_t) ht_new); + SWAP_U64((uint64_t*)h_new, ht_new_oid_off); + clflush((char *)h, sizeof(uint64_t), true); #if defined(CRASH_AFTER_SWAP_CLHT) @@ -785,10 +973,10 @@ ht_resize_pes(clht_t* h, int is_increase, int by) ht_old->table_new = ht_new; TRYLOCK_RLS(h->resize_lock); -// ticks e = getticks() - s; -// double mba = (ht_new->num_buckets * 64) / (1024.0 * 1024); -// printf("[RESIZE-%02d] to #bu %7zu = MB: %7.2f | took: %13llu ti = %8.6f s\n", -// clht_gc_get_id(), ht_new->num_buckets, mba, (unsigned long long) e, e / 2.1e9); + ticks e = getticks() - s; + double mba = (ht_new->num_buckets * 64) / (1024.0 * 1024); + printf("[RESIZE-%02d] to #bu %7zu = MB: %7.2f | took: %13llu ti = %8.6f s\n", + clht_gc_get_id(), ht_new->num_buckets, mba, (unsigned long long) e, e / 2.1e9); #if defined(CLHTDEBUG) DEBUG_PRINT("-------------ht old------------\n"); @@ -810,6 +998,9 @@ ht_resize_pes(clht_t* h, int is_increase, int by) { ht_status(h, 1, 0); } +#if PMDK_TRANSACTION + } TX_END +#endif return 1; } @@ -823,7 +1014,7 @@ clht_size(clht_hashtable_t* hashtable) uint64_t bin; for (bin = 0; bin < num_buckets; bin++) { - bucket = hashtable->table + bin; + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; uint32_t j; do @@ -836,7 +1027,7 @@ clht_size(clht_hashtable_t* hashtable) } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (bucket != NULL); } @@ -852,8 +1043,7 @@ ht_status(clht_t* h, int resize_increase, int just_print) { return 0; } - - clht_hashtable_t* hashtable = h->ht; + clht_hashtable_t* hashtable = clht_ptr_from_off(h->ht_off); uint64_t num_buckets = hashtable->num_buckets; volatile bucket_t* bucket = NULL; 
size_t size = 0; @@ -863,7 +1053,7 @@ ht_status(clht_t* h, int resize_increase, int just_print) uint64_t bin; for (bin = 0; bin < num_buckets; bin++) { - bucket = hashtable->table + bin; + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; int expands_cont = -1; expands--; @@ -880,7 +1070,7 @@ ht_status(clht_t* h, int resize_increase, int just_print) } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); } while (bucket != NULL); @@ -918,7 +1108,8 @@ ht_status(clht_t* h, int resize_increase, int just_print) inc_by_pow2 = 2; } DEBUG_PRINT("Callig ht_resize_pes\n"); - int ret = ht_resize_pes(h, 1, inc_by_pow2); + int ret = 0; + ret = ht_resize_pes(h, 1, inc_by_pow2); // return if crashed if (ret == -1) return 0; @@ -979,7 +1170,7 @@ clht_print(clht_hashtable_t* hashtable) uint64_t bin; for (bin = 0; bin < num_buckets; bin++) { - bucket = hashtable->table + bin; + bucket = ((bucket_t*)clht_ptr_from_off(hashtable->table_off)) + bin; printf("[[%05zu]] ", bin); @@ -994,7 +1185,7 @@ clht_print(clht_hashtable_t* hashtable) } } - bucket = bucket->next; + bucket = clht_ptr_from_off(bucket->next_off); printf(" ** -> "); } while (bucket != NULL); @@ -1006,7 +1197,7 @@ clht_print(clht_hashtable_t* hashtable) void clht_lock_initialization(clht_t *h) { DEBUG_PRINT("Performing Lock initialization\n"); - clht_hashtable_t *ht = h->ht; + clht_hashtable_t *ht = clht_ptr_from_off(h->ht_off); volatile bucket_t *next; h->resize_lock = LOCK_FREE; @@ -1015,8 +1206,9 @@ void clht_lock_initialization(clht_t *h) int i; for (i = 0; i < ht->num_buckets; i++) { - ht->table[i].lock = LOCK_FREE; - for (next = ht->table[i].next; next != NULL; next = next->next) { + bucket_t* temp = clht_ptr_from_off(ht->table_off); + temp[i].lock = LOCK_FREE; + for (next = clht_ptr_from_off(temp[i].next_off); next != NULL; next = clht_ptr_from_off(next->next_off)) { next->lock = LOCK_FREE; } } diff --git a/P-CLHT/test.cpp b/P-CLHT/test.cpp new file mode 100644 index 
00000000..fd0947f5 --- /dev/null +++ b/P-CLHT/test.cpp @@ -0,0 +1,154 @@ +#include <iostream> +#include <chrono> +#include <random> +#include <cstdlib> +#include <vector> +#include <thread> +#include "tbb/tbb.h" + +#include <atomic> + +#include "clht_lb_res.h" +#include "ssmem.h" + +typedef struct thread_data { + uint32_t id; + clht_t *ht; +} thread_data_t; + +typedef struct barrier { + pthread_cond_t complete; + pthread_mutex_t mutex; + int count; + int crossing; +} barrier_t; + +void barrier_init(barrier_t *b, int n) { + pthread_cond_init(&b->complete, NULL); + pthread_mutex_init(&b->mutex, NULL); + b->count = n; + b->crossing = 0; +} + +void barrier_cross(barrier_t *b) { + pthread_mutex_lock(&b->mutex); + b->crossing++; + if (b->crossing < b->count) { + pthread_cond_wait(&b->complete, &b->mutex); + } else { + pthread_cond_broadcast(&b->complete); + b->crossing = 0; + } + pthread_mutex_unlock(&b->mutex); +} + +barrier_t barrier; + +void run(char **argv) { + std::cout << "Simple Example of P-CLHT" << std::endl; + + uint64_t n = std::atoll(argv[1]); + uint64_t *keys = new uint64_t[n]; + + // Generate keys + for (uint64_t i = 0; i < n; i++) { + keys[i] = i + 1; + } + + int num_thread = atoi(argv[2]); + + printf("operation,n,ops/s\n"); + + clht_t *hashtable = clht_create(512); + + barrier_init(&barrier, num_thread); + + thread_data_t *tds = (thread_data_t *) malloc(num_thread * sizeof(thread_data_t)); + + std::atomic<int> next_thread_id; + + { + // Load + auto starttime = std::chrono::system_clock::now(); + next_thread_id.store(0); + auto func = [&]() { + int thread_id = next_thread_id.fetch_add(1); + tds[thread_id].id = thread_id; + tds[thread_id].ht = hashtable; + + uint64_t start_key = n / num_thread * (uint64_t)thread_id; + uint64_t end_key = start_key + n / num_thread; + + clht_gc_thread_init(tds[thread_id].ht, tds[thread_id].id); + barrier_cross(&barrier); + + for (uint64_t i = start_key; i < end_key; i++) { + // Simulate a crash while inserting some key + clht_put(tds[thread_id].ht, keys[i], keys[i]); + } + }; + + std::vector<std::thread>
thread_group; + + for (int i = 0; i < num_thread; i++) + thread_group.push_back(std::thread{func}); + + for (int i = 0; i < num_thread; i++) + thread_group[i].join(); + auto duration = std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now() - starttime); + printf("Throughput: load, %f ,ops/us\n", (n * 1.0) / duration.count()); + } + + barrier.crossing = 0; + + { + // Run + auto starttime = std::chrono::system_clock::now(); + next_thread_id.store(0); + auto func = [&]() { + int thread_id = next_thread_id.fetch_add(1); + tds[thread_id].id = thread_id; + tds[thread_id].ht = hashtable; + + uint64_t start_key = n / num_thread * (uint64_t)thread_id; + uint64_t end_key = start_key + n / num_thread; + + clht_gc_thread_init(tds[thread_id].ht, tds[thread_id].id); + barrier_cross(&barrier); + + for (uint64_t i = start_key; i < end_key; i++) { + clht_hashtable_t *ht = (clht_hashtable_t*)clht_ptr_from_off((tds[thread_id].ht)->ht_off); + uintptr_t val = clht_get(ht, keys[i]); + if (val != keys[i]) { + std::cout << "[CLHT] wrong key read: " << val << "expected: " << keys[i] << std::endl; + exit(1); + } + } + }; + + std::vector<std::thread> thread_group; + + for (int i = 0; i < num_thread; i++) + thread_group.push_back(std::thread{func}); + + for (int i = 0; i < num_thread; i++) + thread_group[i].join(); + auto duration = std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now() - starttime); + printf("Throughput: run, %f ,ops/us\n", (n * 1.0) / duration.count()); + } + clht_gc_destroy(hashtable); + + delete[] keys; +} + +int main(int argc, char **argv) { + if (argc != 3) { + printf("usage: %s [n] [nthreads]\nn: number of keys (integer)\nnthreads: number of threads (integer)\n", argv[0]); + return 1; + } + + run(argv); + return 0; +} \ No newline at end of file diff --git a/P-Masstree/CMakeLists.txt b/P-Masstree/CMakeLists.txt index 4883fb1d..1b1a1039 100644 --- a/P-Masstree/CMakeLists.txt +++ b/P-Masstree/CMakeLists.txt @@ -36,4 +36,4 @@ find_library(TbbLib tbb) set(P_MASS_TEST 
example.cpp masstree.cpp) add_executable(example ${P_MASS_TEST}) -target_link_libraries(example ${JemallocLib} ${TbbLib} atomic boost_system boost_thread) +target_link_libraries(example ${JemallocLib} ${TbbLib} pmemobj pmem atomic boost_system boost_thread) diff --git a/P-Masstree/Epoche.cpp b/P-Masstree/Epoche.cpp index a1df89c9..f78e22cf 100644 --- a/P-Masstree/Epoche.cpp +++ b/P-Masstree/Epoche.cpp @@ -8,6 +8,8 @@ #include #include "Epoche.h" +#include <libpmemobj.h> + using namespace MASS; inline DeletionList::~DeletionList() { @@ -98,12 +100,14 @@ inline void Epoche::exitEpocheAndCleanup(ThreadInfo &epocheInfo) { } LabelDelete *cur = deletionList.head(), *next, *prev = nullptr; + PMEMoid free_objs; while (cur != nullptr) { next = cur->next; if (cur->epoche < oldestEpoche) { for (std::size_t i = 0; i < cur->nodesCount; ++i) { - free(cur->nodes[i]); + free_objs = pmemobj_oid(cur->nodes[i]); + pmemobj_free(&free_objs); } deletionList.remove(cur, prev); } else { @@ -125,12 +129,14 @@ inline Epoche::~Epoche() { } for (auto &d : deletionLists) { LabelDelete *cur = d.head(), *next, *prev = nullptr; + PMEMoid free_objs; while (cur != nullptr) { next = cur->next; assert(cur->epoche < oldestEpoche); for (std::size_t i = 0; i < cur->nodesCount; ++i) { - free(cur->nodes[i]); + free_objs = pmemobj_oid(cur->nodes[i]); + pmemobj_free(&free_objs); } d.remove(cur, prev); cur = next; diff --git a/P-Masstree/masstree.cpp b/P-Masstree/masstree.cpp index cbc8f12d..427f9285 100644 --- a/P-Masstree/masstree.cpp +++ b/P-Masstree/masstree.cpp @@ -5,6 +5,10 @@ using namespace MASS; namespace masstree { +/* Global pool pointer */ +PMEMobjpool *pop; +uint64_t pool_uuid; + static constexpr uint64_t CACHE_LINE_SIZE = 64; static uint64_t CPU_FREQ_MHZ = 2100; static unsigned long write_latency = 0; @@ -59,6 +63,12 @@ static inline void prefetch_(const void *ptr) asm volatile("prefetcht0 %0" : : "m" (*(const cacheline_t *)ptr)); } +static inline void *ptr_from_off(uint64_t offset) +{ + PMEMoid oid = {pool_uuid, 
offset}; + return pmemobj_direct(oid); +} + #ifdef LOCK_INIT static tbb::concurrent_vector lock_initializer; void lock_initialization() @@ -70,15 +80,37 @@ void lock_initialization() } #endif -masstree::masstree() { - leafnode *init_root = new leafnode(0); - root_ = init_root; - clflush((char *)root_, sizeof(leafnode), true); +masstree::masstree () { + if (root_ == 0) { + leafnode *init_root = new leafnode(0); + root_ = pmemobj_oid(init_root).off; + clflush((char *)ptr_from_off(root_), sizeof(leafnode), true); + } } masstree::masstree (void *new_root) { - root_ = new_root; - clflush((char *)root_, sizeof(leafnode), true); // 304 is the leafnode size of masstree + root_ = pmemobj_oid(new_root).off; + clflush((char *)ptr_from_off(root_), sizeof(leafnode), true); +} + +void *masstree::operator new(size_t size) { + // Open the PMEMpool if it exists, otherwise create it. + size_t pool_size = 8*1024*1024*1024UL; + if (access("/mnt/pmem/pool", F_OK) != -1) + pop = pmemobj_open("/mnt/pmem/pool", POBJ_LAYOUT_NAME(p_masstree)); + else + pop = pmemobj_create("/mnt/pmem/pool", "p_masstree", pool_size, 0666); + + if (pop == NULL) + perror("failed to open the pool\n"); + + // Create the root pointer + PMEMoid my_root = pmemobj_root(pop, size); + if (pmemobj_direct(my_root) == NULL) + perror("root pointer is null\n"); + pool_uuid = my_root.pool_uuid_lo; + + return pmemobj_direct(my_root); } ThreadInfo masstree::getThreadInfo() { @@ -88,9 +120,14 @@ ThreadInfo masstree::getThreadInfo() { leafnode::leafnode(uint32_t level) : permutation(permuter::make_empty()) { level_ = level; version_ = 0; - wlock = new std::mutex(); - next = NULL; - leftmost_ptr = NULL; + PMEMoid ret; + if (pmemobj_zalloc(pop, &ret, sizeof(PMEMmutex), 0)) { + fprintf(stderr, "pmemobj_zalloc failed for lock allocation\n"); + assert(0); + } + wlock = ret.off; + next = 0; + leftmost_ptr = 0; highest = 0; #ifdef LOCK_INIT lock_initializer.push_back(wlock); @@ -100,13 +137,18 @@ leafnode::leafnode(uint32_t level) : 
permutation(permuter::make_empty()) { leafnode::leafnode(void *left, uint64_t key, void *right, uint32_t level = 1) : permutation(permuter::make_empty()) { level_ = level; version_ = 0; - wlock = new std::mutex(); - next = NULL; + PMEMoid ret; + if (pmemobj_zalloc(pop, &ret, sizeof(PMEMmutex), 0)) { + fprintf(stderr, "pmemobj_zalloc failed for lock allocation\n"); + assert(0); + } + wlock = ret.off; + next = 0; highest = 0; - leftmost_ptr = reinterpret_cast (left); + leftmost_ptr = pmemobj_oid(left).off; entry[0].key = key; - entry[0].value = right; + entry[0].value = pmemobj_oid(right).off; permutation = permuter::make_sorted(1); #ifdef LOCK_INIT @@ -114,21 +156,30 @@ leafnode::leafnode(void *left, uint64_t key, void *right, uint32_t level = 1) : #endif } +leafnode::~leafnode() { + PMEMoid free_lock = {pool_uuid, wlock}; + pmemobj_free(&free_lock); +} + void *leafnode::operator new(size_t size) { - void *ret; - posix_memalign(&ret, CACHE_LINE_SIZE, size); - return ret; + PMEMoid ret; + if (pmemobj_alloc(pop, &ret, size, 0, 0, 0)) { + fprintf(stderr, "pmemobj_alloc failed for leaf allocation\n"); + assert(0); + } + return pmemobj_direct(ret); } void leafnode::operator delete(void *addr) { - free(addr); + PMEMoid leaf_oid = pmemobj_oid(addr); + pmemobj_free(&leaf_oid); } -void leafnode::lock() {wlock->lock();} +void leafnode::lock() {pmemobj_mutex_lock(pop, (PMEMmutex *)ptr_from_off(wlock));} -void leafnode::unlock() {wlock->unlock();} +void leafnode::unlock() {pmemobj_mutex_unlock(pop, (PMEMmutex *)ptr_from_off(wlock));} -bool leafnode::trylock() {return wlock->try_lock();} +int leafnode::trylock() {return pmemobj_mutex_trylock(pop, (PMEMmutex *)ptr_from_off(wlock));} int leafnode::compare_key(const uint64_t a, const uint64_t b) { @@ -180,7 +231,8 @@ leafnode *leafnode::advance_to_key(const uint64_t& key, bool checker) const leafnode *n = this; leafnode *next; - if ((next = n->next) && compare_key(key, next->highest) >= 0) { + if ((next = 
reinterpret_cast(ptr_from_off(n->next))) + && compare_key(key, next->highest) >= 0) { // if (!checker) { // printf("Reader must not come here\n"); // exit(0); @@ -215,11 +267,15 @@ void leafnode::prefetch() const leafvalue *masstree::make_leaf(char *key, size_t key_len, uint64_t value) { - void *aligned_alloc; size_t len = (key_len % sizeof(uint64_t)) == 0 ? key_len : (((key_len) / sizeof(uint64_t)) + 1) * sizeof(uint64_t); - posix_memalign(&aligned_alloc, CACHE_LINE_SIZE, sizeof(leafvalue) + len + sizeof(uint64_t)); - leafvalue *lv = reinterpret_cast (aligned_alloc); + PMEMoid ret; + if (pmemobj_alloc(pop, &ret, sizeof(leafvalue) + len + sizeof(uint64_t), 0, 0, 0)) { + fprintf(stderr, "pmemobj_alloc failed for leaf allocation\n"); + assert(0); + } + + leafvalue *lv = reinterpret_cast (pmemobj_direct(ret)); memset(lv, 0, sizeof(leafvalue) + len + sizeof(uint64_t)); lv->value = value; @@ -236,11 +292,15 @@ leafvalue *masstree::make_leaf(char *key, size_t key_len, uint64_t value) leafvalue *leafnode::smallest_leaf(size_t key_len, uint64_t value) { - void *aligned_alloc; size_t len = (key_len % sizeof(uint64_t)) == 0 ? 
key_len : (((key_len) / sizeof(uint64_t)) + 1) * sizeof(uint64_t); - posix_memalign(&aligned_alloc, CACHE_LINE_SIZE, sizeof(leafvalue) + len); - leafvalue *lv = reinterpret_cast (aligned_alloc); + PMEMoid ret; + if (pmemobj_alloc(pop, &ret, sizeof(leafvalue) + len, 0, 0, 0)) { + fprintf(stderr, "pmemobj_alloc failed for leaf allocation\n"); + assert(0); + } + + leafvalue *lv = reinterpret_cast (pmemobj_direct(ret)); memset(lv, 0, sizeof(leafvalue) + len); lv->value = value; @@ -266,7 +326,7 @@ void leafnode::make_new_layer(leafnode *l, key_indexed_position &kx_, leafvalue leafnode *nl = new leafnode(0); nl->assign_initialize_for_layer(0, olv->fkey[depth]); if (twig_head != l) - twig_tail->entry[0].value = nl; + twig_tail->entry[0].value = pmemobj_oid(nl).off; else twig_head = nl; nl->permutation = permuter::make_sorted(1); @@ -276,30 +336,30 @@ void leafnode::make_new_layer(leafnode *l, key_indexed_position &kx_, leafvalue } leafnode *nl = new leafnode(0); - nl->assign_initialize(0, kcmp < 0 ? olv->fkey[depth] : nlv->fkey[depth], kcmp < 0 ? SET_LV(olv) : SET_LV(nlv)); - nl->assign_initialize(1, kcmp < 0 ? nlv->fkey[depth] : olv->fkey[depth], kcmp < 0 ? SET_LV(nlv) : SET_LV(olv)); + nl->assign_initialize(0, kcmp < 0 ? olv->fkey[depth] : nlv->fkey[depth], kcmp < 0 ? SET_LV(pmemobj_oid(olv).off) : SET_LV(pmemobj_oid(nlv).off)); + nl->assign_initialize(1, kcmp < 0 ? nlv->fkey[depth] : olv->fkey[depth], kcmp < 0 ? 
SET_LV(pmemobj_oid(nlv).off) : SET_LV(pmemobj_oid(olv).off)); nl->permutation = permuter::make_sorted(2); fence(); if (twig_tail != l) - twig_tail->entry[0].value = nl; + twig_tail->entry[0].value = pmemobj_oid(nl).off; twig_tail = nl; if (twig_head != l) { leafnode *iter = twig_head; mfence(); - for ( ; iter != twig_tail && iter != NULL; iter = reinterpret_cast (iter->entry[0].value)) { + for ( ; iter != twig_tail && iter != NULL; iter = reinterpret_cast (ptr_from_off(iter->entry[0].value))) { clflush((char *)iter, sizeof(leafnode), false); } clflush((char *)twig_tail, sizeof(leafnode), false); mfence(); - l->entry[kx_.p].value = twig_head; + l->entry[kx_.p].value = pmemobj_oid(twig_head).off; clflush((char *)l->entry_addr(kx_.p) + 8, sizeof(uintptr_t), true); } else { clflush((char *)nl, sizeof(leafnode), true); - l->entry[kx_.p].value = nl; + l->entry[kx_.p].value = pmemobj_oid(nl).off; clflush((char *)l->entry_addr(kx_.p) + 8, sizeof(uintptr_t), true); } } @@ -324,10 +384,10 @@ void leafnode::check_for_recovery(masstree *t, leafnode *left, leafnode *right, if (depth > 0) { key_indexed_position pkx_; leafnode *p = correct_layer_root(root, lv, depth, pkx_); - if (p->value(pkx_.p) == left) { + if (p->value(pkx_.p) == pmemobj_oid(left).off) { leafnode *new_root = new leafnode(left, right->highest, right, left->level() + 1); clflush((char *) new_root, sizeof(leafnode), true); - p->entry[pkx_.p].value = new_root; + p->entry[pkx_.p].value = pmemobj_oid(new_root).off; clflush((char *) &p->entry[pkx_.p].value, sizeof(uintptr_t), true); p->unlock(); @@ -335,10 +395,10 @@ void leafnode::check_for_recovery(masstree *t, leafnode *left, leafnode *right, left->unlock(); } else { root = p; - t->split(p->entry[pkx_.p].value, root, depth, lv, right->highest, right, left->level() + 1, left); + t->split(ptr_from_off(p->entry[pkx_.p].value), root, depth, lv, right->highest, right, left->level() + 1, left); } } else { - if (t->root() == left) { + if (t->root() == 
pmemobj_oid(left).off) { leafnode *new_root = new leafnode(left, right->highest, right, left->level() + 1); clflush((char *) new_root, sizeof(leafnode), true); t->setNewRoot(new_root); @@ -359,14 +419,14 @@ void masstree::put(uint64_t key, void *value, ThreadInfo &threadEpocheInfo) leafnode *next = NULL, *p = NULL; from_root: - p = reinterpret_cast (this->root_); + p = reinterpret_cast (ptr_from_off(this->root_)); while (p->level() != 0) { inter_retry: next = p->advance_to_key(key, true); if (next != p) { // check for recovery - if (p->trylock()) { - if (next->trylock()) + if (p->trylock() == 0) { + if (next->trylock() == 0) p->check_for_recovery(this, p, next, NULL, 0, NULL); else p->unlock(); @@ -380,19 +440,19 @@ void masstree::put(uint64_t key, void *value, ThreadInfo &threadEpocheInfo) kx_ = p->key_lower_bound(key); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -401,8 +461,8 @@ void masstree::put(uint64_t key, void *value, ThreadInfo &threadEpocheInfo) next = l->advance_to_key(key, true); if (next != l) { //check for recovery - if (l->trylock()) { - if (next->trylock()) + if (l->trylock() == 0) { + if (next->trylock() == 0) l->check_for_recovery(this, l, next, NULL, 0, NULL); else l->unlock(); @@ -449,7 +509,7 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) leafvalue *lv = make_leaf(key, strlen(key), value); restart: - root = this->root_; + root = ptr_from_off(this->root_); depth = 0; p = reinterpret_cast (root); @@ -459,8 +519,8 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) next = p->advance_to_key(lv->fkey[depth], true); if (next != p) { // check for recovery - if (p->trylock()) 
{ - if (next->trylock()) + if (p->trylock() == 0) { + if (next->trylock() == 0) p->check_for_recovery(this, p, next, root, depth, lv); else p->unlock(); @@ -474,19 +534,19 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) kx_ = p->key_lower_bound(lv->fkey[depth]); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= lv->fkey[depth]) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -495,8 +555,8 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) next = l->advance_to_key(lv->fkey[depth], true); if (next != l) { //check for recovery - if (l->trylock()) { - if (next->trylock()) + if (l->trylock() == 0) { + if (next->trylock() == 0) l->check_for_recovery(this, l, next, root, depth, lv); else l->unlock(); @@ -526,27 +586,27 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) if (kx_.p >= 0) { // i) If there is additional layer, retry B+tree traversing from the next layer if (!IS_LV(l->value(kx_.p))) { - p = reinterpret_cast (l->value(kx_.p)); + p = reinterpret_cast (ptr_from_off(l->value(kx_.p))); root = l; depth++; l->unlock(); goto from_root; // ii) Atomically update value for the matching key - } else if (IS_LV(l->value(kx_.p)) && (LV_PTR(l->value(kx_.p)))->key_len == lv->key_len && - memcmp(lv->fkey, (LV_PTR(l->value(kx_.p)))->fkey, lv->key_len) == 0) { - (LV_PTR(l->value(kx_.p)))->value = value; - clflush((char *)&(LV_PTR(l->value(kx_.p)))->value, sizeof(void *), true); + } else if (IS_LV(l->value(kx_.p)) && ((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->key_len == lv->key_len && + memcmp(lv->fkey, ((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->fkey, lv->key_len) == 0) { + ((leafvalue 
*)ptr_from_off((LV_PTR(l->value(kx_.p)))))->value = value; + clflush((char *)&((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->value, sizeof(void *), true); l->unlock(); // iii) Allocate additional layers (B+tree's roots) up to // the number of common prefixes (8bytes unit). // Insert two keys to the leafnode in the last layer // During these processes, this leafnode must be locked } else { - l->make_new_layer(l, kx_, LV_PTR(l->value(kx_.p)), lv, ++depth); + l->make_new_layer(l, kx_, ((leafvalue *)ptr_from_off(LV_PTR(l->value(kx_.p)))), lv, ++depth); l->unlock(); } } else { - if (!(l->leaf_insert(this, root, depth, lv, lv->fkey[depth], SET_LV(lv), kx_, true, true, NULL))) { + if (!(l->leaf_insert(this, root, depth, lv, lv->fkey[depth], SET_LV(pmemobj_oid(lv).off), kx_, true, true, NULL))) { put(key, value, threadEpocheInfo); } } @@ -555,11 +615,11 @@ void masstree::put(char *key, uint64_t value, ThreadInfo &threadEpocheInfo) void masstree::del(uint64_t key, ThreadInfo &threadEpocheInfo) { EpocheGuard epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; uint32_t depth = 0; leafnode *next; - void *snapshot_v; + uint64_t snapshot_v; leafnode *p = reinterpret_cast (root); while (p->level() != 0) { @@ -567,8 +627,8 @@ void masstree::del(uint64_t key, ThreadInfo &threadEpocheInfo) next = p->advance_to_key(key, true); if (next != p) { // check for recovery - if (p->trylock()) { - if (next->trylock()) + if (p->trylock() == 0) { + if (next->trylock() == 0) p->check_for_recovery(this, p, next, NULL, 0, NULL); else p->unlock(); @@ -586,12 +646,12 @@ void masstree::del(uint64_t key, ThreadInfo &threadEpocheInfo) fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else goto inter_retry; } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -600,8 +660,8 @@ 
void masstree::del(uint64_t key, ThreadInfo &threadEpocheInfo) next = l->advance_to_key(key, true); if (next != l) { //check for recovery - if (l->trylock()) { - if (next->trylock()) + if (l->trylock() == 0) { + if (next->trylock() == 0) l->check_for_recovery(this, l, next, NULL, 0, NULL); else l->unlock(); @@ -634,7 +694,7 @@ void masstree::del(uint64_t key, ThreadInfo &threadEpocheInfo) void masstree::del(char *key, ThreadInfo &threadEpocheInfo) { EpocheGuard epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; uint32_t depth = 0; leafnode *next; @@ -648,8 +708,8 @@ void masstree::del(char *key, ThreadInfo &threadEpocheInfo) next = p->advance_to_key(lv->fkey[depth], true); if (next != p) { // check for recovery - if (p->trylock()) { - if (next->trylock()) + if (p->trylock() == 0) { + if (next->trylock() == 0) p->check_for_recovery(this, p, next, root, depth, lv); else p->unlock(); @@ -663,19 +723,19 @@ void masstree::del(char *key, ThreadInfo &threadEpocheInfo) kx_ = p->key_lower_bound(lv->fkey[depth]); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= lv->fkey[depth]) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -684,8 +744,8 @@ void masstree::del(char *key, ThreadInfo &threadEpocheInfo) next = l->advance_to_key(lv->fkey[depth], true); if (next != l) { //check for recovery - if (l->trylock()) { - if (next->trylock()) + if (l->trylock() == 0) { + if (next->trylock() == 0) l->check_for_recovery(this, l, next, root, depth, lv); else l->unlock(); @@ -710,14 +770,14 @@ void masstree::del(char *key, ThreadInfo &threadEpocheInfo) if (kx_.p >= 0) { // i) If there is additional layer, retry B+tree traversing from the next layer if 
(!IS_LV(l->value(kx_.p))) { - p = reinterpret_cast (l->value(kx_.p)); + p = reinterpret_cast (ptr_from_off(l->value(kx_.p))); root = l; depth++; l->unlock(); goto from_root; // ii) Checking false-positive result and starting to delete it - } else if (IS_LV(l->value(kx_.p)) && (LV_PTR(l->value(kx_.p)))->key_len == lv->key_len && - memcmp(lv->fkey, (LV_PTR(l->value(kx_.p)))->fkey, lv->key_len) == 0) { + } else if (IS_LV(l->value(kx_.p)) && ((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->key_len == lv->key_len && + memcmp(lv->fkey, ((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->fkey, lv->key_len) == 0) { if (!(l->leaf_delete(this, root, depth, lv, lv->fkey[depth], kx_, true, true, NULL, threadEpocheInfo))) { del(key, threadEpocheInfo); } @@ -734,7 +794,7 @@ void masstree::del(char *key, ThreadInfo &threadEpocheInfo) inline void leafnode::assign_initialize(int p, const uint64_t& key, void *value) { entry[p].key = key; - entry[p].value = value; + entry[p].value = (uint64_t) value; } inline void leafnode::assign_initialize(int p, leafnode *x, int xp) @@ -773,8 +833,8 @@ int leafnode::split_into(leafnode *nr, int p, const uint64_t& key, void *value, nr->highest = nr->entry[0].key; nr->next = this->next; clflush((char *)nr, sizeof(leafnode), true); - this->next = nr; - clflush((char *)(&this->next), sizeof(uintptr_t), true); + this->next = pmemobj_oid(nr).off; + clflush((char *)(&this->next), sizeof(uint64_t), true); split_key = nr->highest; return p >= mid ? 
1 + (mid == LEAF_WIDTH) : 0; @@ -796,12 +856,12 @@ void leafnode::split_into_inter(leafnode *nr, int p, const uint64_t& key, void * nr->permutation = permr.value(); //leafnode::link_split(this, nr); - nr->leftmost_ptr = reinterpret_cast(this->entry[perml[mid - 1]].value); + nr->leftmost_ptr = this->entry[perml[mid - 1]].value; nr->highest = this->entry[perml[mid - 1]].key; nr->next = this->next; clflush((char *)nr, sizeof(leafnode), true); - this->next = nr; - clflush((char *)(&this->next), sizeof(uintptr_t), true); + this->next = pmemobj_oid(nr).off; + clflush((char *)(&this->next), sizeof(uint64_t), true); split_key = nr->highest; //return p >= mid ? 1 + (mid == LEAF_WIDTH) : 0; @@ -811,13 +871,13 @@ void leafnode::assign(int p, const uint64_t& key, void *value) { entry[p].key = key; fence(); - entry[p].value = value; + entry[p].value = (uint64_t) value; } void leafnode::assign_value(int p, void *value) { - entry[p].value = value; - clflush((char *)&entry[p].value, sizeof(void *), true); + entry[p].value = (uint64_t) value; + clflush((char *)&entry[p].value, sizeof(uint64_t), true); } void *leafnode::entry_addr(int p) @@ -827,8 +887,8 @@ void *leafnode::entry_addr(int p) void masstree::setNewRoot(void *new_root) { - this->root_ = new_root; - clflush((char *)&this->root_, sizeof(void *), true); + this->root_ = pmemobj_oid(new_root).off; + clflush((char *)&this->root_, sizeof(uint64_t), true); } leafnode *leafnode::correct_layer_root(void *root, leafvalue *lv, uint32_t depth, key_indexed_position &pkx_) @@ -882,24 +942,24 @@ leafnode *leafnode::search_for_leftsibling(void *root, uint64_t key, uint32_t le kx_ = p->key_lower_bound(key); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = 
reinterpret_cast(ptr_from_off(p->leftmost())); } } leaf_retry: - if (p->trylock()) { + if (p->trylock() == 0) { next = p->advance_to_key(key, true); if (next != p) { p->unlock(); @@ -950,9 +1010,10 @@ void *leafnode::leaf_insert(masstree *t, void *root, uint32_t depth, leafvalue * // 2) replay the original split process from the third step that removes the half of // the entries from the left sibling. (this would be more reasonable in terms of // reusing the existing split mechanism) - if (this->next != NULL && this->key(this->permutation[this->permutation.size() - 1]) > this->next->highest) { - this->next = this->next->next; - clflush((char *)&this->next, sizeof(leafnode *), true); + if (this->next != 0 && this->key(this->permutation[this->permutation.size() - 1]) + > ((leafnode *)ptr_from_off(this->next))->highest) { + this->next = ((leafnode *)ptr_from_off(this->next))->next; + clflush((char *)&this->next, sizeof(uint64_t), true); } leafnode *new_sibling = new leafnode(this->level_); @@ -977,18 +1038,18 @@ void *leafnode::leaf_insert(masstree *t, void *root, uint32_t depth, leafvalue * if (depth > 0) { key_indexed_position pkx_; leafnode *p = correct_layer_root(root, lv, depth, pkx_); - if (p->value(pkx_.p) == this) { + if (p->value(pkx_.p) == pmemobj_oid(this).off) { leafnode *new_root = new leafnode(this, split_key, new_sibling, level_ + 1); clflush((char *) new_root, sizeof(leafnode), true); - p->entry[pkx_.p].value = new_root; + p->entry[pkx_.p].value = pmemobj_oid(new_root).off; clflush((char *) &p->entry[pkx_.p].value, sizeof(uintptr_t), true); p->unlock(); } else { root = p; - t->split(p->entry[pkx_.p].value, root, depth, lv, split_key, new_sibling, level_ + 1, NULL); + t->split(ptr_from_off(p->entry[pkx_.p].value), root, depth, lv, split_key, new_sibling, level_ + 1, NULL); } } else { - if (t->root() == this) { + if (t->root() == pmemobj_oid(this).off) { leafnode *new_root = new leafnode(this, split_key, new_sibling, level_ + 1); clflush((char *) 
new_root, sizeof(leafnode), true); t->setNewRoot(new_root); @@ -1053,7 +1114,7 @@ void *leafnode::leaf_delete(masstree *t, void *root, uint32_t depth, leafvalue * if (depth > 0) { key_indexed_position pkx_; leafnode *p = correct_layer_root(root, lv, depth, pkx_); - if (p->value(pkx_.p) == nr) { + if (p->value(pkx_.p) == pmemobj_oid(nr).off) { cp = nr->permutation.value(); cp = cp.make_empty(); fence(); @@ -1063,17 +1124,17 @@ void *leafnode::leaf_delete(masstree *t, void *root, uint32_t depth, leafvalue * nr->unlock(); return nr; } else { - nl = search_for_leftsibling(p->entry[pkx_.p].value, nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); - merge_state = t->merge(p->entry[pkx_.p].value, reinterpret_cast (p), depth, lv, nr->highest, nr->level_ + 1, NULL, threadInfo); + nl = search_for_leftsibling(ptr_from_off(p->entry[pkx_.p].value), nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); + merge_state = t->merge(ptr_from_off(p->entry[pkx_.p].value), reinterpret_cast (p), depth, lv, nr->highest, nr->level_ + 1, NULL, threadInfo); if (merge_state == 16) { p = correct_layer_root(root, lv, depth, pkx_); - p->entry[pkx_.p].value = nr; + p->entry[pkx_.p].value = pmemobj_oid(nr).off; clflush((char *)&p->entry[pkx_.p].value, sizeof(void *), true); p->unlock(); } } } else { - if (t->root() == nr) { + if (t->root() == pmemobj_oid(nr).off) { cp = nr->permutation.value(); cp = cp.make_empty(); fence(); @@ -1082,7 +1143,7 @@ void *leafnode::leaf_delete(masstree *t, void *root, uint32_t depth, leafvalue * nr->unlock(); return nr; } else { - nl = search_for_leftsibling(t->root(), nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); + nl = search_for_leftsibling(ptr_from_off(t->root()), nr->highest ? 
nr->highest - 1 : nr->highest, nr->level_, nr); merge_state = t->merge(NULL, NULL, 0, NULL, nr->highest, nr->level_ + 1, NULL, threadInfo); if (merge_state == 16) t->setNewRoot(nr); @@ -1123,7 +1184,7 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue // permutation based insert if (this->permutation.size() < LEAF_WIDTH) { kx_.p = this->permutation.back(); - this->assign(kx_.p, key, value); + this->assign(kx_.p, key, (void *)pmemobj_oid(value).off); clflush((char *)(&this->entry[kx_.p]), sizeof(kv), true); permuter cp = this->permutation.value(); @@ -1133,7 +1194,7 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue clflush((char *)(&this->permutation), sizeof(permuter), true); if (child != NULL) { - child->next->unlock(); + ((leafnode *)ptr_from_off(child->next))->unlock(); child->unlock(); } @@ -1155,9 +1216,10 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue // 2) replay the original split process from the third step that removes the half of // the entries from the left sibling. 
(this would be more reasonable in terms of // reusing the existing split mechanism) - if (this->next != NULL && this->key(this->permutation[this->permutation.size() - 1]) > this->next->highest) { - this->next = this->next->next; - clflush((char *)&this->next, sizeof(leafnode *), true); + if (this->next != 0 && this->key(this->permutation[this->permutation.size() - 1]) + > ((leafnode *)ptr_from_off(this->next))->highest) { + this->next = ((leafnode *)ptr_from_off(this->next))->next; + clflush((char *)&this->next, sizeof(uint64_t), true); } leafnode *new_sibling = new leafnode(this->level_); @@ -1182,7 +1244,7 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue if (key < split_key) { kx_.p = nl->permutation.back(); - nl->assign(kx_.p, key, value); + nl->assign(kx_.p, key, (void *)pmemobj_oid(value).off); clflush((char *)(&nl->entry[kx_.p]), sizeof(kv), true); permuter cp = nl->permutation.value(); @@ -1195,7 +1257,7 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue } else { kx_ = nr->key_lower_bound_by(key); kx_.p = nr->permutation.back(); - nr->assign(kx_.p, key, value); + nr->assign(kx_.p, key, (void *)pmemobj_oid(value).off); clflush((char *)(&nr->entry[kx_.p]), sizeof(kv), true); permuter cp = nr->permutation.value(); @@ -1209,33 +1271,33 @@ void *leafnode::inter_insert(masstree *t, void *root, uint32_t depth, leafvalue // lock coupling (hand-over-hand locking) if (child != NULL) { - child->next->unlock(); + ((leafnode *)ptr_from_off(child->next))->unlock(); child->unlock(); } if (depth > 0) { key_indexed_position pkx_; leafnode *p = correct_layer_root(root, lv, depth, pkx_); - if (p->value(pkx_.p) == this) { + if (p->value(pkx_.p) == pmemobj_oid(this).off) { leafnode *new_root = new leafnode(this, split_key, new_sibling, level_ + 1); clflush((char *) new_root, sizeof(leafnode), true); - p->entry[pkx_.p].value = new_root; - clflush((char *) &p->entry[pkx_.p].value, sizeof(uintptr_t), true); + 
p->entry[pkx_.p].value = pmemobj_oid(new_root).off; + clflush((char *) &p->entry[pkx_.p].value, sizeof(uint64_t), true); p->unlock(); - this->next->unlock(); + ((leafnode *)ptr_from_off(this->next))->unlock(); this->unlock(); } else { root = p; - t->split(p->entry[pkx_.p].value, root, depth, lv, split_key, new_sibling, level_ + 1, this); + t->split(ptr_from_off(p->entry[pkx_.p].value), root, depth, lv, split_key, new_sibling, level_ + 1, this); } } else { - if (t->root() == this) { + if (t->root() == pmemobj_oid(this).off) { leafnode *new_root = new leafnode(this, split_key, new_sibling, level_ + 1); clflush((char *) new_root, sizeof(leafnode), true); t->setNewRoot(new_root); - this->next->unlock(); + ((leafnode *)ptr_from_off(this->next))->unlock(); this->unlock(); } else { t->split(NULL, NULL, 0, NULL, split_key, new_sibling, level_ + 1, this); @@ -1273,22 +1335,22 @@ int leafnode::inter_delete(masstree *t, void *root, uint32_t depth, leafvalue *l if (depth > 0) { key_indexed_position pkx_; leafnode *p = correct_layer_root(root, lv, depth, pkx_); - if (p->value(pkx_.p) == nr) { + if (p->value(pkx_.p) == pmemobj_oid(nr).off) { kx_.i = 16; p->unlock(); nr->unlock(); return (ret = kx_.i); } else { - nl = search_for_leftsibling(p->entry[pkx_.p].value, nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); - merge_state = t->merge(p->entry[pkx_.p].value, root, depth, lv, nr->highest, nr->level_ + 1, nl, threadInfo); + nl = search_for_leftsibling(ptr_from_off(p->entry[pkx_.p].value), nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); + merge_state = t->merge(ptr_from_off(p->entry[pkx_.p].value), root, depth, lv, nr->highest, nr->level_ + 1, nl, threadInfo); } } else { - if (t->root() == nr) { + if (t->root() == pmemobj_oid(nr).off) { kx_.i = 16; nr->unlock(); return (ret = kx_.i); } else { - nl = search_for_leftsibling(t->root(), nr->highest ? 
nr->highest - 1 : nr->highest, nr->level_, nr); + nl = search_for_leftsibling(ptr_from_off(t->root()), nr->highest ? nr->highest - 1 : nr->highest, nr->level_, nr); merge_state = t->merge(NULL, NULL, 0, NULL, nr->highest, nr->level_ + 1, nl, threadInfo); } } @@ -1327,9 +1389,9 @@ void masstree::split(void *left, void *root, uint32_t depth, leafvalue *lv, p = reinterpret_cast (left); reinterpret_cast (root)->unlock(); } else { - if (level > reinterpret_cast(root_)->level()) + if (level > reinterpret_cast(ptr_from_off(root_))->level()) return ; - p = reinterpret_cast (root_); + p = reinterpret_cast (ptr_from_off(root_)); } while (p->level() > level) { @@ -1345,19 +1407,19 @@ void masstree::split(void *left, void *root, uint32_t depth, leafvalue *lv, kx_ = p->key_lower_bound(key); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1394,7 +1456,7 @@ int masstree::merge(void *left, void *root, uint32_t depth, leafvalue *lv, key_indexed_position kx_; uint64_t oldv; leafnode *next; - void *snapshot_v; + uint64_t snapshot_v; if (depth > 0) { //if (level > reinterpret_cast(left)->level()) @@ -1404,7 +1466,7 @@ int masstree::merge(void *left, void *root, uint32_t depth, leafvalue *lv, } else { //if (level > reinterpret_cast(this->root_)->level()) // return ; - p = reinterpret_cast (this->root_); + p = reinterpret_cast (ptr_from_off(this->root_)); } while (p->level() > level) { @@ -1424,13 +1486,13 @@ int masstree::merge(void *left, void *root, uint32_t depth, leafvalue *lv, fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } 
else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1455,7 +1517,7 @@ int masstree::merge(void *left, void *root, uint32_t depth, leafvalue *lv, void *masstree::get(uint64_t key, ThreadInfo &threadEpocheInfo) { EpocheGuard epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; leafnode *next; @@ -1473,19 +1535,19 @@ void *masstree::get(uint64_t key, ThreadInfo &threadEpocheInfo) kx_ = p->key_lower_bound(key); - void *snapshot_v; + uint64_t snapshot_v; if (kx_.i >= 0) { snapshot_v = p->value(kx_.p); fence(); if (p->key(kx_.p) <= key) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1496,11 +1558,11 @@ void *masstree::get(uint64_t key, ThreadInfo &threadEpocheInfo) kx_ = l->key_lower_bound_by(key); - void *snapshot_v = l->value(kx_.p); + uint64_t snapshot_v = l->value(kx_.p); fence(); if (kx_.p >= 0 && l->key(kx_.p) == key) { if (snapshot_v == l->value(kx_.p)) - return snapshot_v; + return (void *)snapshot_v; else { l = l->advance_to_key(key, false); goto leaf_retry; @@ -1520,10 +1582,10 @@ void *masstree::get(uint64_t key, ThreadInfo &threadEpocheInfo) } if (l->next_()) { - cp = l->next_()->permute(); - printf("next high key = %lu\n", l->next_()->highest_()); + cp = ((leafnode *)ptr_from_off(l->next_()))->permute(); + printf("next high key = %lu\n", ((leafnode *)ptr_from_off(l->next_()))->highest_()); for (int i = 0; i < cp.size(); i++) { - printf("next key = %lu\n", l->next_()->key(cp[i])); + printf("next key = %lu\n", ((leafnode *)ptr_from_off(l->next_()))->key(cp[i])); } } exit(0); @@ -1534,11 +1596,11 @@ void *masstree::get(uint64_t key, ThreadInfo &threadEpocheInfo) void *masstree::get(char *key, ThreadInfo &threadEpocheInfo) { EpocheGuard 
epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; uint32_t depth = 0; leafnode *next; - void *snapshot_v; + uint64_t snapshot_v; leafvalue *lv = make_leaf(key, strlen(key), 0); @@ -1562,13 +1624,13 @@ void *masstree::get(char *key, ThreadInfo &threadEpocheInfo) fence(); if (p->key(kx_.p) <= lv->fkey[depth]) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1583,16 +1645,16 @@ void *masstree::get(char *key, ThreadInfo &threadEpocheInfo) snapshot_v = l->value(kx_.p); if (!IS_LV(l->value(kx_.p))) { if (l->key(kx_.p) == lv->fkey[depth] && snapshot_v == l->value(kx_.p)) { - p = reinterpret_cast (snapshot_v); + p = reinterpret_cast (ptr_from_off(snapshot_v)); depth++; goto from_root; } } else { - snapshot_v = &((LV_PTR(l->value(kx_.p)))->value); - if (l->key(kx_.p) == lv->fkey[depth] && (LV_PTR(l->value(kx_.p)))->key_len == lv->key_len - && memcmp((LV_PTR(l->value(kx_.p)))->fkey, lv->fkey, lv->key_len) == 0) { - if (snapshot_v == &((LV_PTR(l->value(kx_.p)))->value)) - return snapshot_v; + snapshot_v = (uint64_t) &(((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->value); + if (l->key(kx_.p) == lv->fkey[depth] && ((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->key_len == lv->key_len + && memcmp(((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->fkey, lv->fkey, lv->key_len) == 0) { + if (snapshot_v == (uint64_t) &(((leafvalue *)ptr_from_off((LV_PTR(l->value(kx_.p)))))->value)) + return (void *) snapshot_v; } else { return NULL; } @@ -1614,13 +1676,13 @@ void *masstree::get(char *key, ThreadInfo &threadEpocheInfo) permuter cp = l->permute(); for (int i = 0; i < cp.size(); i++) { printf("key = %lu\n", l->key(cp[i])); - printf("fkey = %s\n", (char *)((LV_PTR(l->value(cp[i])))->fkey)); + 
printf("fkey = %s\n", (char *)(((leafvalue *)ptr_from_off((LV_PTR(l->value(cp[i])))))->fkey)); } if (l->next_()) { - cp = l->next_()->permute(); + cp = ((leafnode *)ptr_from_off(l->next_()))->permute(); for (int i = 0; i < cp.size(); i++) { - printf("next key = %lu\n", l->next_()->key(cp[i])); + printf("next key = %lu\n", ((leafnode *)ptr_from_off(l->next_()))->key(cp[i])); } } exit(0); @@ -1632,7 +1694,7 @@ void leafnode::get_range(leafvalue * &lv, int num, int &count, leafvalue *buf[], { key_indexed_position kx_; leafnode *next; - void *snapshot_v, *snapshot_n; + uint64_t snapshot_v, snapshot_n; permuter perm; int backup; @@ -1656,13 +1718,13 @@ void leafnode::get_range(leafvalue * &lv, int num, int &count, leafvalue *buf[], fence(); if (p->key(kx_.p) <= lv->fkey[depth]) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1682,25 +1744,25 @@ void leafnode::get_range(leafvalue * &lv, int num, int &count, leafvalue *buf[], fence(); if (!IS_LV(l->value(perm[i]))) { if (l->key(perm[i]) > lv->fkey[depth] && snapshot_v == l->value(perm[i])) { - p = reinterpret_cast (snapshot_v); + p = reinterpret_cast (ptr_from_off(snapshot_v)); leafvalue *smallest = p->smallest_leaf(lv->key_len, lv->value); p->get_range(smallest, num, count, buf, p, depth + 1); } else if (l->key(perm[i]) == lv->fkey[depth] && snapshot_v == l->value(perm[i])) { - p = reinterpret_cast (snapshot_v); + p = reinterpret_cast (ptr_from_off(snapshot_v)); p->get_range(lv, num, count, buf, p, depth + 1); } } else { snapshot_v = (LV_PTR(snapshot_v)); if (l->key(perm[i]) > lv->fkey[depth]) { if (snapshot_v == (LV_PTR(l->value(perm[i])))) - buf[count++] = reinterpret_cast (snapshot_v); + buf[count++] = reinterpret_cast (ptr_from_off(snapshot_v)); else { count = backup; goto leaf_retry; } - } else if (l->key(perm[i]) == 
lv->fkey[depth] && memcmp((LV_PTR(l->value(perm[i])))->fkey, lv->fkey, lv->key_len) >= 0) { + } else if (l->key(perm[i]) == lv->fkey[depth] && memcmp(((leafvalue *)ptr_from_off((LV_PTR(l->value(perm[i])))))->fkey, lv->fkey, lv->key_len) >= 0) { if (snapshot_v == (LV_PTR(l->value(perm[i])))) - buf[count++] = reinterpret_cast (snapshot_v); + buf[count++] = reinterpret_cast (ptr_from_off(snapshot_v)); else { count = backup; goto leaf_retry; @@ -1713,10 +1775,10 @@ void leafnode::get_range(leafvalue * &lv, int num, int &count, leafvalue *buf[], count = backup; continue; } else { - if (snapshot_n == NULL) + if (snapshot_n == 0) break; else - l = reinterpret_cast (snapshot_n); + l = reinterpret_cast (ptr_from_off(snapshot_n)); } } } @@ -1724,11 +1786,11 @@ void leafnode::get_range(leafvalue * &lv, int num, int &count, leafvalue *buf[], int masstree::scan(char *min, int num, leafvalue *buf[], ThreadInfo &threadEpocheInfo) { EpocheGuard epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; uint32_t depth = 0; leafnode *next; - void *snapshot_v, *snapshot_n; + uint64_t snapshot_v, snapshot_n; permuter perm; int count, backup; @@ -1753,13 +1815,13 @@ int masstree::scan(char *min, int num, leafvalue *buf[], ThreadInfo &threadEpoch fence(); if (p->key(kx_.p) <= lv->fkey[depth]) { if (snapshot_v == p->value(kx_.p)) - p = reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1779,25 +1841,25 @@ int masstree::scan(char *min, int num, leafvalue *buf[], ThreadInfo &threadEpoch mfence(); if (!IS_LV(l->value(perm[i]))) { if (l->key(perm[i]) > lv->fkey[depth] && snapshot_v == l->value(perm[i])) { - p = reinterpret_cast (snapshot_v); + p = reinterpret_cast (ptr_from_off(snapshot_v)); leafvalue *smallest = p->smallest_leaf(lv->key_len, lv->value); p->get_range(smallest, 
num, count, buf, p, depth + 1); } else if (l->key(perm[i]) == lv->fkey[depth] && snapshot_v == l->value(perm[i])) { - p = reinterpret_cast (snapshot_v); + p = reinterpret_cast (ptr_from_off(snapshot_v)); p->get_range(lv, num, count, buf, p, depth + 1); } } else { snapshot_v = (LV_PTR(snapshot_v)); if (l->key(perm[i]) > lv->fkey[depth]) { if (snapshot_v == (LV_PTR(l->value(perm[i])))) - buf[count++] = reinterpret_cast (snapshot_v); + buf[count++] = reinterpret_cast (ptr_from_off(snapshot_v)); else { count = backup; goto leaf_retry; } - } else if (l->key(perm[i]) == lv->fkey[depth] && memcmp((LV_PTR(l->value(perm[i])))->fkey, lv->fkey, lv->key_len) >= 0) { + } else if (l->key(perm[i]) == lv->fkey[depth] && memcmp(((leafvalue *)ptr_from_off((LV_PTR(l->value(perm[i])))))->fkey, lv->fkey, lv->key_len) >= 0) { if (snapshot_v == (LV_PTR(l->value(perm[i])))) - buf[count++] = reinterpret_cast (snapshot_v); + buf[count++] = reinterpret_cast (ptr_from_off(snapshot_v)); else { count = backup; goto leaf_retry; @@ -1810,10 +1872,10 @@ int masstree::scan(char *min, int num, leafvalue *buf[], ThreadInfo &threadEpoch count = backup; continue; } else { - if (snapshot_n == NULL) + if (snapshot_n == 0) break; else - l = reinterpret_cast (snapshot_n); + l = reinterpret_cast (ptr_from_off(snapshot_n)); } } @@ -1823,12 +1885,12 @@ int masstree::scan(char *min, int num, leafvalue *buf[], ThreadInfo &threadEpoch int masstree::scan(uint64_t min, int num, uint64_t *buf, ThreadInfo &threadEpocheInfo) { EpocheGuard epocheGuard(threadEpocheInfo); - void *root = this->root_; + void *root = ptr_from_off(this->root_); key_indexed_position kx_; uint32_t depth = 0; leafnode *next; - void *snapshot_v; - leafnode *snapshot_n; + uint64_t snapshot_v; + uint64_t snapshot_n; permuter perm; int count, backup; @@ -1851,13 +1913,13 @@ int masstree::scan(uint64_t min, int num, uint64_t *buf, ThreadInfo &threadEpoch fence(); if (p->key(kx_.p) <= min) { if (snapshot_v == p->value(kx_.p)) - p = 
reinterpret_cast(snapshot_v); + p = reinterpret_cast(ptr_from_off(snapshot_v)); else { goto inter_retry; } } } else { - p = p->leftmost(); + p = reinterpret_cast(ptr_from_off(p->leftmost())); } } @@ -1888,10 +1950,10 @@ int masstree::scan(uint64_t min, int num, uint64_t *buf, ThreadInfo &threadEpoch count = backup; continue; } else { - if (snapshot_n == NULL) + if (snapshot_n == 0) break; else - l = snapshot_n; + l = (leafnode *)ptr_from_off(snapshot_n); } } diff --git a/P-Masstree/masstree.h b/P-Masstree/masstree.h index ef48e71d..0201486c 100644 --- a/P-Masstree/masstree.h +++ b/P-Masstree/masstree.h @@ -13,6 +13,9 @@ #include "tbb/concurrent_vector.h" #endif +#include +#include + #include "Epoche.h" namespace masstree { @@ -25,17 +28,17 @@ namespace masstree { #define LV_BITS (1ULL << 0) #define IS_LV(x) ((uintptr_t)x & LV_BITS) -#define LV_PTR(x) (leafvalue*)((void*)((uintptr_t)x & ~LV_BITS)) +#define LV_PTR(x) ((uint64_t)((uintptr_t)x & ~LV_BITS)) #define SET_LV(x) ((void*)((uintptr_t)x | LV_BITS)) class kv { private: uint64_t key; - void *value; + uint64_t value; public: kv() { key = UINT64_MAX; - value = NULL; + value = 0; } friend class leafnode; @@ -59,20 +62,22 @@ typedef struct key_indexed_position { class masstree { private: - void *root_; + uint64_t root_; MASS::Epoche epoche{256}; public: - masstree(); + masstree (); masstree (void *new_root); ~masstree() { } - MASS::ThreadInfo getThreadInfo(); + void *operator new(size_t size); - void *root() {return root_;} + uint64_t root() {return root_;} + + MASS::ThreadInfo getThreadInfo(); void setNewRoot(void *new_root); @@ -325,9 +330,9 @@ class leafnode { private: uint32_t level_; // 4bytes uint32_t version_; // 4bytes - std::mutex *wlock; // 8bytes - leafnode *next; // 8bytes - leafnode *leftmost_ptr; // 8bytes + uint64_t wlock; // 8bytes + uint64_t next; // 8bytes + uint64_t leftmost_ptr; // 8bytes uint64_t highest; // 8bytes permuter permutation; // 8bytes uint64_t dummy[2]; // 16bytes @@ -338,7 +343,7 @@ 
class leafnode { leafnode(void *left, uint64_t key, void *right, uint32_t level); - ~leafnode () {delete wlock;} + ~leafnode (); void *operator new(size_t size); @@ -354,7 +359,7 @@ class leafnode { void unlock(); - bool trylock(); + int trylock(); int compare_key(const uint64_t a, const uint64_t b); @@ -396,11 +401,11 @@ class leafnode { uint64_t key(int i) {return entry[i].key;} - void *value(int i) {return entry[i].value;} + uint64_t value(int i) {return entry[i].value;} - leafnode *leftmost() {return leftmost_ptr;} + uint64_t leftmost() {return leftmost_ptr;} - leafnode *next_() {return next;} + uint64_t next_() {return next;} uint64_t highest_() {return highest;} @@ -419,5 +424,12 @@ class leafnode { leafnode *search_for_leftsibling(void *root, uint64_t key, uint32_t level, leafnode *right); }; +// Initialize the persistent memory pool +//POBJ_LAYOUT_BEGIN(p_masstree); +//POBJ_LAYOUT_ROOT(p_masstree, masstree); +//POBJ_LAYOUT_TOID(p_masstree, leafnode); +//POBJ_LAYOUT_TOID(p_masstree, leafvalue); +//POBJ_LAYOUT_END(p_masstree); + } #endif diff --git a/pmdk.md b/pmdk.md index 06b5bcfd..a666a2b1 100644 --- a/pmdk.md +++ b/pmdk.md @@ -22,12 +22,19 @@ clht_t* w = pmemobj_direct(my_root); **Limitations** Currently, CLHT is the only data structure that has been converted to PMDK. We plan on updating the other data structures in the near future. ## Build & Run +### System requirements +- Ubuntu 18.04.1 LTS +- P-HOT: x86-64 CPU supporting at least the AVX-2 and BMI-2 instruction sets (Haswell and newer) +- Linux kernel: v5.3 or later (The huge performance drop in PMDK was observed on the old kernel versions) +- Compiler: cmake, g++-7, gcc-7, c++17 + ### How to enable PM? 1. Install PMDK -```$ git clone https://github.com/pmem/pmdk.git +$ git clone https://github.com/pmem/pmdk.git $ cd pmdk -$ git checkout tags/1.6 +$ git checkout tags/1.8 $ make -j +$ sudo make install $ cd .. ``` 2. 
Emulate PM with Ext4-DAX mount @@ -50,4 +57,4 @@ if( access("/mnt/pmem/pool", F_OK ) != -1 ) } ``` -4. Make accordingly and run your code. \ No newline at end of file +4. Make accordingly and run your code.