From f1b563218f04ab58ec214a86479b3b4e923e0c5c Mon Sep 17 00:00:00 2001 From: Krzysztof Swiecicki Date: Wed, 22 May 2024 09:42:18 +0000 Subject: [PATCH 1/3] Add memspace "lowest latency" This memspace is analogous to the 'highest bandwidth' memspace in its composition, but it focuses on the NUMA nodes with best latency. --- include/umf/memspace.h | 5 ++ src/CMakeLists.txt | 3 +- src/libumf.c | 1 + src/libumf.map | 1 + src/memory_target.c | 12 +++ src/memory_target.h | 4 + src/memory_target_ops.h | 2 + src/memory_targets/memory_target_numa.c | 90 ++++++++++++++++++--- src/memspace_internal.h | 1 + src/memspaces/memspace_lowest_latency.c | 103 ++++++++++++++++++++++++ 10 files changed, 209 insertions(+), 13 deletions(-) create mode 100644 src/memspaces/memspace_lowest_latency.c diff --git a/include/umf/memspace.h b/include/umf/memspace.h index 9467fd308..8acfa9325 100644 --- a/include/umf/memspace.h +++ b/include/umf/memspace.h @@ -61,6 +61,11 @@ umf_memspace_handle_t umfMemspaceHighestCapacityGet(void); /// umf_memspace_handle_t umfMemspaceHighestBandwidthGet(void); +/// \brief Retrieves predefined lowest latency memspace. +/// \return lowest latency memspace handle on success or NULL on failure. +/// +umf_memspace_handle_t umfMemspaceLowestLatencyGet(void); + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2b1318baf..2e256e911 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,7 +85,8 @@ set(UMF_SOURCES_COMMON_LINUX_MACOSX memspaces/memspace_numa.c memspaces/memspace_host_all.c memspaces/memspace_highest_capacity.c - memspaces/memspace_highest_bandwidth.c) + memspaces/memspace_highest_bandwidth.c + memspaces/memspace_lowest_latency.c) set(UMF_SOURCES_LINUX ${UMF_SOURCES_LINUX} ${UMF_SOURCES_COMMON_LINUX_MACOSX} provider/provider_os_memory_linux.c) diff --git a/src/libumf.c b/src/libumf.c index 87a8afef5..161baa387 100644 --- a/src/libumf.c +++ b/src/libumf.c @@ -34,6 +34,7 @@ void umfTearDown(void) { umfMemspaceHostAllDestroy(); umfMemspaceHighestCapacityDestroy(); umfMemspaceHighestBandwidthDestroy(); + umfMemspaceLowestLatencyDestroy(); umfDestroyTopology(); #endif // make sure TRACKER is not used after being destroyed diff --git a/src/libumf.map b/src/libumf.map index 23fd8867b..293a8c66d 100644 --- a/src/libumf.map +++ b/src/libumf.map @@ -38,6 +38,7 @@ UMF_1.0 { umfMemspaceHighestBandwidthGet; umfMemspaceHighestCapacityGet; umfMemspaceHostAllGet; + umfMemspaceLowestLatencyGet; umfOpenIPCHandle; umfOsMemoryProviderOps; umfPoolAlignedMalloc; diff --git a/src/memory_target.c b/src/memory_target.c index 6ec08eab8..3cbdb09d9 100644 --- a/src/memory_target.c +++ b/src/memory_target.c @@ -97,3 +97,15 @@ umfMemoryTargetGetBandwidth(umf_memory_target_handle_t srcMemoryTarget, return srcMemoryTarget->ops->get_bandwidth( srcMemoryTarget->priv, dstMemoryTarget->priv, bandwidth); } + +umf_result_t +umfMemoryTargetGetLatency(umf_memory_target_handle_t srcMemoryTarget, + umf_memory_target_handle_t dstMemoryTarget, + size_t *latency) { + if (!srcMemoryTarget || !dstMemoryTarget || !latency) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + return srcMemoryTarget->ops->get_latency(srcMemoryTarget->priv, + dstMemoryTarget->priv, latency); +} diff --git a/src/memory_target.h b/src/memory_target.h index cb394ddff..c522cce24 100644 --- a/src/memory_target.h +++ b/src/memory_target.h @@ -39,6 +39,10 @@ umf_result_t umfMemoryTargetGetBandwidth(umf_memory_target_handle_t srcMemoryTarget, umf_memory_target_handle_t dstMemoryTarget, size_t *bandwidth); 
+umf_result_t +umfMemoryTargetGetLatency(umf_memory_target_handle_t srcMemoryTarget, + umf_memory_target_handle_t dstMemoryTarget, + size_t *latency); #ifdef __cplusplus } diff --git a/src/memory_target_ops.h b/src/memory_target_ops.h index 767e939cc..b62b5545d 100644 --- a/src/memory_target_ops.h +++ b/src/memory_target_ops.h @@ -41,6 +41,8 @@ typedef struct umf_memory_target_ops_t { umf_result_t (*get_capacity)(void *memoryTarget, size_t *capacity); umf_result_t (*get_bandwidth)(void *srcMemoryTarget, void *dstMemoryTarget, size_t *bandwidth); + umf_result_t (*get_latency)(void *srcMemoryTarget, void *dstMemoryTarget, + size_t *latency); } umf_memory_target_ops_t; #ifdef __cplusplus diff --git a/src/memory_targets/memory_target_numa.c b/src/memory_targets/memory_target_numa.c index dc1bf435e..4b830ae9e 100644 --- a/src/memory_targets/memory_target_numa.c +++ b/src/memory_targets/memory_target_numa.c @@ -167,13 +167,26 @@ static umf_result_t numa_get_capacity(void *memTarget, size_t *capacity) { return UMF_RESULT_SUCCESS; } -static umf_result_t numa_get_bandwidth(void *srcMemoryTarget, - void *dstMemoryTarget, - size_t *bandwidth) { - if (!srcMemoryTarget || !dstMemoryTarget || !bandwidth) { - return UMF_RESULT_ERROR_INVALID_ARGUMENT; +typedef enum memattr_type_t { + MEMATTR_TYPE_BANDWIDTH, + MEMATTR_TYPE_LATENCY +} memattr_type_t; + +static size_t memattr_get_worst_value(memattr_type_t type) { + switch (type) { + case MEMATTR_TYPE_BANDWIDTH: + return 0; + case MEMATTR_TYPE_LATENCY: + return SIZE_MAX; + default: + assert(0); // Should not be reachable + return 0; } +} +static umf_result_t query_attribute_value(void *srcMemoryTarget, + void *dstMemoryTarget, size_t *value, + memattr_type_t type) { hwloc_topology_t topology = umfGetTopology(); if (!topology) { return UMF_RESULT_ERROR_NOT_SUPPORTED; @@ -195,23 +208,75 @@ static umf_result_t numa_get_bandwidth(void *srcMemoryTarget, // Given NUMA nodes aren't local, HWLOC returns an error in such case. if (!hwloc_bitmap_intersects(srcNumaNode->cpuset, dstNumaNode->cpuset)) { - *bandwidth = 0; + // Since we want to skip such query, we return the worst possible + // value for given memory attribute. + *value = memattr_get_worst_value(type); return UMF_RESULT_SUCCESS; } + enum hwloc_memattr_id_e hwlocMemAttrType = INT_MAX; + switch (type) { + case MEMATTR_TYPE_BANDWIDTH: + hwlocMemAttrType = HWLOC_MEMATTR_ID_BANDWIDTH; + break; + case MEMATTR_TYPE_LATENCY: + hwlocMemAttrType = HWLOC_MEMATTR_ID_LATENCY; + break; + default: + assert(0); // Shouldn't be reachable. + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + struct hwloc_location initiator = {.location.cpuset = srcNumaNode->cpuset, .type = HWLOC_LOCATION_TYPE_CPUSET}; - hwloc_uint64_t value = 0; - int ret = hwloc_memattr_get_value(topology, HWLOC_MEMATTR_ID_BANDWIDTH, - dstNumaNode, &initiator, 0, &value); + + hwloc_uint64_t memAttrValue = 0; + int ret = hwloc_memattr_get_value(topology, hwlocMemAttrType, dstNumaNode, + &initiator, 0, &memAttrValue); if (ret) { - LOG_ERR("Retrieving bandwidth for initiator node %u to node %u failed.", - srcNumaNode->os_index, dstNumaNode->os_index); return (errno == EINVAL) ? 
UMF_RESULT_ERROR_NOT_SUPPORTED : UMF_RESULT_ERROR_UNKNOWN; } - *bandwidth = value; + *value = memAttrValue; + + return UMF_RESULT_SUCCESS; +} + +static umf_result_t numa_get_bandwidth(void *srcMemoryTarget, + void *dstMemoryTarget, + size_t *bandwidth) { + if (!srcMemoryTarget || !dstMemoryTarget || !bandwidth) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + umf_result_t ret = query_attribute_value(srcMemoryTarget, dstMemoryTarget, + bandwidth, MEMATTR_TYPE_BANDWIDTH); + if (ret) { + LOG_ERR("Retrieving bandwidth for initiator node %u to node %u failed.", + ((struct numa_memory_target_t *)srcMemoryTarget)->physical_id, + ((struct numa_memory_target_t *)dstMemoryTarget)->physical_id); + return ret; + } + + return UMF_RESULT_SUCCESS; +} + +static umf_result_t numa_get_latency(void *srcMemoryTarget, + void *dstMemoryTarget, size_t *latency) { + if (!srcMemoryTarget || !dstMemoryTarget || !latency) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + umf_result_t ret = query_attribute_value(srcMemoryTarget, dstMemoryTarget, + latency, MEMATTR_TYPE_LATENCY); + if (ret) { + LOG_ERR("Retrieving latency for initiator node %u to node %u failed.", + ((struct numa_memory_target_t *)srcMemoryTarget)->physical_id, + ((struct numa_memory_target_t *)dstMemoryTarget)->physical_id); + return ret; + } + return UMF_RESULT_SUCCESS; } @@ -223,5 +288,6 @@ struct umf_memory_target_ops_t UMF_MEMORY_TARGET_NUMA_OPS = { .clone = numa_clone, .get_capacity = numa_get_capacity, .get_bandwidth = numa_get_bandwidth, + .get_latency = numa_get_latency, .memory_provider_create_from_memspace = numa_memory_provider_create_from_memspace}; diff --git a/src/memspace_internal.h b/src/memspace_internal.h index 6ced67303..857049392 100644 --- a/src/memspace_internal.h +++ b/src/memspace_internal.h @@ -60,6 +60,7 @@ void umfMemspaceDestroy(umf_memspace_handle_t hMemspace); void umfMemspaceHostAllDestroy(void); void umfMemspaceHighestCapacityDestroy(void); void umfMemspaceHighestBandwidthDestroy(void); +void umfMemspaceLowestLatencyDestroy(void); #ifdef __cplusplus } diff --git a/src/memspaces/memspace_lowest_latency.c b/src/memspaces/memspace_lowest_latency.c new file mode 100644 index 000000000..fa6985362 --- /dev/null +++ b/src/memspaces/memspace_lowest_latency.c @@ -0,0 +1,103 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include +#include +#include + +#include "base_alloc_global.h" +#include "memory_target_numa.h" +#include "memspace_internal.h" +#include "memspace_numa.h" +#include "topology.h" +#include "utils_common.h" +#include "utils_concurrency.h" +#include "utils_log.h" + +static umf_result_t getBestLatencyTarget(umf_memory_target_handle_t initiator, + umf_memory_target_handle_t *nodes, + size_t numNodes, + umf_memory_target_handle_t *target) { + size_t bestNodeIdx = 0; + size_t bestLatency = SIZE_MAX; + for (size_t nodeIdx = 0; nodeIdx < numNodes; nodeIdx++) { + size_t latency = SIZE_MAX; + umf_result_t ret = + umfMemoryTargetGetLatency(initiator, nodes[nodeIdx], &latency); + if (ret) { + return ret; + } + + if (latency < bestLatency) { + bestNodeIdx = nodeIdx; + bestLatency = latency; + } + } + + *target = nodes[bestNodeIdx]; + + return UMF_RESULT_SUCCESS; +} + +static umf_result_t +umfMemspaceLowestLatencyCreate(umf_memspace_handle_t *hMemspace) { + if (!hMemspace) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + umf_memspace_handle_t hostAllMemspace = umfMemspaceHostAllGet(); + if (!hostAllMemspace) { + return UMF_RESULT_ERROR_UNKNOWN; + } + + umf_memspace_handle_t lowLatencyMemspace = NULL; + umf_result_t ret = umfMemspaceFilter(hostAllMemspace, getBestLatencyTarget, + &lowLatencyMemspace); + if (ret != UMF_RESULT_SUCCESS) { + // HWLOC could possibly return an 'EINVAL' error, which in this context + // means that the HMAT is unavailable and we can't obtain the + // 'latency' value of any NUMA node. + return ret; + } + + *hMemspace = lowLatencyMemspace; + return UMF_RESULT_SUCCESS; +} + +static umf_memspace_handle_t UMF_MEMSPACE_LOWEST_LATENCY = NULL; +static UTIL_ONCE_FLAG UMF_MEMSPACE_LOWEST_LATENCY_INITIALIZED = + UTIL_ONCE_FLAG_INIT; + +void umfMemspaceLowestLatencyDestroy(void) { + if (UMF_MEMSPACE_LOWEST_LATENCY) { + umfMemspaceDestroy(UMF_MEMSPACE_LOWEST_LATENCY); + UMF_MEMSPACE_LOWEST_LATENCY = NULL; + } +} + +static void umfMemspaceLowestLatencyInit(void) { + umf_result_t ret = + umfMemspaceLowestLatencyCreate(&UMF_MEMSPACE_LOWEST_LATENCY); + if (ret != UMF_RESULT_SUCCESS) { + LOG_ERR("Creating the lowest latency memspace failed with a %u error\n", + ret); + assert(ret == UMF_RESULT_ERROR_NOT_SUPPORTED); + } + +#if defined(_WIN32) && !defined(UMF_SHARED_LIBRARY) + atexit(umfMemspaceLowestLatencyDestroy); +#endif +} + +umf_memspace_handle_t umfMemspaceLowestLatencyGet(void) { + util_init_once(&UMF_MEMSPACE_LOWEST_LATENCY_INITIALIZED, + umfMemspaceLowestLatencyInit); + return UMF_MEMSPACE_LOWEST_LATENCY; +} From 2e3c0090d6e63ba1e9079ab3450e58bdfd82a706 Mon Sep 17 00:00:00 2001 From: Krzysztof Swiecicki Date: Wed, 22 May 2024 10:06:33 +0000 Subject: [PATCH 2/3] Add tests for memspace "lowest latency" Those tests are skipped with GTEST_SKIP() when latency property can't be queried (HMAT is not supported on the platform). 
--- test/CMakeLists.txt | 12 +- test/memspaces/memspace_fixtures.hpp | 221 ++++++++++++++++++ test/memspaces/memspace_helpers.hpp | 25 -- test/memspaces/memspace_highest_bandwidth.cpp | 177 +------------- test/memspaces/memspace_highest_capacity.cpp | 1 + test/memspaces/memspace_host_all.cpp | 1 + test/memspaces/memspace_lowest_latency.cpp | 44 ++++ test/memspaces/memspace_numa.cpp | 1 + test/test_valgrind.sh | 3 + 9 files changed, 289 insertions(+), 196 deletions(-) create mode 100644 test/memspaces/memspace_fixtures.hpp create mode 100644 test/memspaces/memspace_lowest_latency.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f88fa1789..f8a4982f4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -162,23 +162,27 @@ if(LINUX) # OS-specific functions are implemented only for Linux now add_umf_test( NAME memspace_numa SRCS memspaces/memspace_numa.cpp - LIBS ${LIBNUMA_LIBRARIES}) + LIBS ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) add_umf_test( NAME provider_os_memory_config SRCS provider_os_memory_config.cpp - LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES}) + LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) add_umf_test( NAME memspace_host_all SRCS memspaces/memspace_host_all.cpp - LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES}) + LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) add_umf_test( NAME memspace_highest_capacity SRCS memspaces/memspace_highest_capacity.cpp - LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES}) + LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) add_umf_test( NAME memspace_highest_bandwidth SRCS memspaces/memspace_highest_bandwidth.cpp LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) + add_umf_test( + NAME memspace_lowest_latency + SRCS memspaces/memspace_lowest_latency.cpp + LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES}) add_umf_test( NAME mempolicy SRCS memspaces/mempolicy.cpp diff --git a/test/memspaces/memspace_fixtures.hpp b/test/memspaces/memspace_fixtures.hpp new file mode 100644 index 000000000..de01e9ae6 --- /dev/null +++ b/test/memspaces/memspace_fixtures.hpp @@ -0,0 +1,221 @@ +// Copyright (C) 2024 Intel Corporation +// Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef UMF_MEMSPACE_FIXTURES_HPP +#define UMF_MEMSPACE_FIXTURES_HPP + +#include "base.hpp" +#include "memspace_helpers.hpp" +#include "test_helpers.h" + +#include +#include +#include +#include +#include + +#define SIZE_4K (4096UL) +#define SIZE_4M (SIZE_4K * 1024UL) + +// In HWLOC v2.3.0, the 'hwloc_location_type_e' enum is defined inside an +// 'hwloc_location' struct. In newer versions, this enum is defined globally. +// To prevent compile errors in C++ tests related this scope change +// 'hwloc_location_type_e' has been aliased. 
+using hwloc_location_type_alias = decltype(hwloc_location::type);
+
+struct numaNodesTest : ::umf_test::test {
+    void SetUp() override {
+        ::umf_test::test::SetUp();
+
+        if (numa_available() == -1 || numa_all_nodes_ptr == nullptr) {
+            GTEST_FAIL() << "Failed to initialize libnuma";
+        }
+
+        int maxNode = numa_max_node();
+        if (maxNode < 0) {
+            GTEST_FAIL() << "No available numa nodes";
+        }
+
+        for (int i = 0; i <= maxNode; i++) {
+            if (numa_bitmask_isbitset(numa_all_nodes_ptr, i)) {
+                nodeIds.emplace_back(i);
+                maxNodeId = i;
+            }
+        }
+    }
+
+    std::vector<size_t> nodeIds;
+    unsigned long maxNodeId = 0;
+};
+
+using isQuerySupportedFunc = bool (*)(size_t);
+using memspaceGetFunc = umf_memspace_handle_t (*)();
+using memspaceGetParams = std::tuple<isQuerySupportedFunc, memspaceGetFunc>;
+
+struct memspaceGetTest : ::numaNodesTest,
+                         ::testing::WithParamInterface<memspaceGetParams> {
+    void SetUp() override {
+        ::numaNodesTest::SetUp();
+
+        auto [isQuerySupported, memspaceGet] = this->GetParam();
+
+        if (!isQuerySupported(nodeIds.front())) {
+            GTEST_SKIP();
+        }
+
+        hMemspace = memspaceGet();
+        ASSERT_NE(hMemspace, nullptr);
+    }
+
+    umf_memspace_handle_t hMemspace = nullptr;
+};
+
+struct memspaceProviderTest : ::memspaceGetTest {
+    void SetUp() override {
+        ::memspaceGetTest::SetUp();
+
+        if (::memspaceGetTest::IsSkipped()) {
+            GTEST_SKIP();
+        }
+
+        umf_result_t ret =
+            umfMemoryProviderCreateFromMemspace(hMemspace, nullptr, &hProvider);
+        ASSERT_EQ(ret, UMF_RESULT_SUCCESS);
+        ASSERT_NE(hProvider, nullptr);
+    }
+
+    void TearDown() override {
+        ::memspaceGetTest::TearDown();
+
+        if (hProvider) {
+            umfMemoryProviderDestroy(hProvider);
+        }
+    }
+
+    umf_memory_provider_handle_t hProvider = nullptr;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(memspaceGetTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(memspaceProviderTest);
+
+TEST_P(memspaceGetTest, providerFromMemspace) {
+    umf_memory_provider_handle_t hProvider = nullptr;
+    umf_result_t ret =
+        umfMemoryProviderCreateFromMemspace(hMemspace, nullptr, &hProvider);
+    UT_ASSERTeq(ret, UMF_RESULT_SUCCESS);
+    UT_ASSERTne(hProvider, nullptr);
+
+    umfMemoryProviderDestroy(hProvider);
+}
+
+TEST_P(memspaceProviderTest, allocFree) {
+    void *ptr = nullptr;
+    size_t size = SIZE_4K;
+    size_t alignment = 0;
+
+    umf_result_t ret = umfMemoryProviderAlloc(hProvider, size, alignment, &ptr);
+    UT_ASSERTeq(ret, UMF_RESULT_SUCCESS);
+    UT_ASSERTne(ptr, nullptr);
+
+    // Access the allocation, so that all the pages associated with it are
+    // allocated on some NUMA node.
+    memset(ptr, 0xFF, size);
+
+    ret = umfMemoryProviderFree(hProvider, ptr, size);
+    UT_ASSERTeq(ret, UMF_RESULT_SUCCESS);
+}
+
+static std::vector<int> getAllCpus() {
+    std::vector<int> allCpus;
+    for (int i = 0; i < numa_num_possible_cpus(); ++i) {
+        if (numa_bitmask_isbitset(numa_all_cpus_ptr, i)) {
+            allCpus.push_back(i);
+        }
+    }
+
+    return allCpus;
+}
+
+#define MAX_NODES 512
+
+TEST_P(memspaceProviderTest, allocLocalMt) {
+    auto pinAllocValidate = [&](umf_memory_provider_handle_t hProvider,
+                                int cpu) {
+        hwloc_topology_t topology = NULL;
+        UT_ASSERTeq(hwloc_topology_init(&topology), 0);
+        UT_ASSERTeq(hwloc_topology_load(topology), 0);
+
+        // Pin current thread to the provided CPU.
+        hwloc_cpuset_t pinCpuset = hwloc_bitmap_alloc();
+        UT_ASSERTeq(hwloc_bitmap_set(pinCpuset, cpu), 0);
+        UT_ASSERTeq(
+            hwloc_set_cpubind(topology, pinCpuset, HWLOC_CPUBIND_THREAD), 0);
+
+        // Confirm that the thread is pinned to the provided CPU.
+        hwloc_cpuset_t curCpuset = hwloc_bitmap_alloc();
+        UT_ASSERTeq(
+            hwloc_get_cpubind(topology, curCpuset, HWLOC_CPUBIND_THREAD), 0);
+        UT_ASSERT(hwloc_bitmap_isequal(curCpuset, pinCpuset));
+        hwloc_bitmap_free(curCpuset);
+        hwloc_bitmap_free(pinCpuset);
+
+        // Allocate some memory.
+        const size_t size = SIZE_4K;
+        const size_t alignment = 0;
+        void *ptr = nullptr;
+
+        umf_result_t ret =
+            umfMemoryProviderAlloc(hProvider, size, alignment, &ptr);
+        UT_ASSERTeq(ret, UMF_RESULT_SUCCESS);
+        UT_ASSERTne(ptr, nullptr);
+
+        // Access the allocation, so that all the pages associated with it are
+        // allocated on some NUMA node.
+        memset(ptr, 0xFF, size);
+
+        // Get the NUMA node responsible for this allocation.
+        int mode = -1;
+        std::vector<size_t> boundNodeIds;
+        size_t allocNodeId = SIZE_MAX;
+        getAllocationPolicy(ptr, maxNodeId, mode, boundNodeIds, allocNodeId);
+
+        // Get the CPUs associated with the specified NUMA node.
+        hwloc_obj_t allocNodeObj =
+            hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, allocNodeId);
+
+        unsigned nNodes = MAX_NODES;
+        std::vector<hwloc_obj_t> localNodes(MAX_NODES);
+        hwloc_location loc;
+        loc.location.object = allocNodeObj,
+        loc.type = hwloc_location_type_alias::HWLOC_LOCATION_TYPE_OBJECT;
+        UT_ASSERTeq(hwloc_get_local_numanode_objs(topology, &loc, &nNodes,
+                                                  localNodes.data(), 0),
+                    0);
+        UT_ASSERT(nNodes <= MAX_NODES);
+
+        // Confirm that the allocation from this thread was made to a local
+        // NUMA node.
+        UT_ASSERT(std::any_of(localNodes.begin(), localNodes.end(),
+                              [&allocNodeObj](hwloc_obj_t node) {
+                                  return node == allocNodeObj;
+                              }));
+
+        ret = umfMemoryProviderFree(hProvider, ptr, size);
+        UT_ASSERTeq(ret, UMF_RESULT_SUCCESS);
+
+        hwloc_topology_destroy(topology);
+    };
+
+    const auto cpus = getAllCpus();
+    std::vector<std::thread> threads;
+    for (auto cpu : cpus) {
+        threads.emplace_back(pinAllocValidate, hProvider, cpu);
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+}
+
+#endif /* UMF_MEMSPACE_FIXTURES_HPP */
diff --git a/test/memspaces/memspace_helpers.hpp b/test/memspaces/memspace_helpers.hpp
index b92beba63..9ce9a004e 100644
--- a/test/memspaces/memspace_helpers.hpp
+++ b/test/memspaces/memspace_helpers.hpp
@@ -17,31 +17,6 @@
 #define SIZE_4K (4096UL)
 #define SIZE_4M (SIZE_4K * 1024UL)
 
-struct numaNodesTest : ::umf_test::test {
-    void SetUp() override {
-        ::umf_test::test::SetUp();
-
-        if (numa_available() == -1 || numa_all_nodes_ptr == nullptr) {
-            GTEST_FAIL() << "Failed to initialize libnuma";
-        }
-
-        int maxNode = numa_max_node();
-        if (maxNode < 0) {
-            GTEST_FAIL() << "No available numa nodes";
-        }
-
-        for (int i = 0; i <= maxNode; i++) {
-            if (numa_bitmask_isbitset(numa_all_nodes_ptr, i)) {
-                nodeIds.emplace_back(i);
-                maxNodeId = i;
-            }
-        }
-    }
-
-    std::vector<size_t> nodeIds;
-    unsigned long maxNodeId = 0;
-};
-
 ///
 /// @brief Retrieves the memory policy information for \p ptr.
 /// @param ptr allocation pointer.
diff --git a/test/memspaces/memspace_highest_bandwidth.cpp b/test/memspaces/memspace_highest_bandwidth.cpp
index 7a56eeb26..2b5330751 100644
--- a/test/memspaces/memspace_highest_bandwidth.cpp
+++ b/test/memspaces/memspace_highest_bandwidth.cpp
@@ -2,23 +2,14 @@
 // Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "memory_target_numa.h" +#include "memspace_fixtures.hpp" #include "memspace_helpers.hpp" #include "memspace_internal.h" #include "test_helpers.h" #include -#include #include -using umf_test::test; - -// In HWLOC v2.3.0, the 'hwloc_location_type_e' enum is defined inside an -// 'hwloc_location' struct. In newer versions, this enum is defined globally. -// To prevent compile errors in C++ tests related this scope change -// 'hwloc_location_type_e' has been aliased. -using hwloc_location_type_alias = decltype(hwloc_location::type); - static bool canQueryBandwidth(size_t nodeId) { hwloc_topology_t topology = nullptr; int ret = hwloc_topology_init(&topology); @@ -43,161 +34,13 @@ static bool canQueryBandwidth(size_t nodeId) { return (ret == 0); } -struct memspaceHighestBandwidthTest : ::numaNodesTest { - void SetUp() override { - ::numaNodesTest::SetUp(); - - if (!canQueryBandwidth(nodeIds.front())) { - GTEST_SKIP(); - } - - hMemspace = umfMemspaceHighestBandwidthGet(); - ASSERT_NE(hMemspace, nullptr); - } - - umf_memspace_handle_t hMemspace = nullptr; -}; - -struct memspaceHighestBandwidthProviderTest : ::memspaceHighestBandwidthTest { - void SetUp() override { - ::memspaceHighestBandwidthTest::SetUp(); - - if (!canQueryBandwidth(nodeIds.front())) { - GTEST_SKIP(); - } - - umf_result_t ret = - umfMemoryProviderCreateFromMemspace(hMemspace, nullptr, &hProvider); - ASSERT_EQ(ret, UMF_RESULT_SUCCESS); - ASSERT_NE(hProvider, nullptr); - } - - void TearDown() override { - ::memspaceHighestBandwidthTest::TearDown(); - - if (hProvider) { - umfMemoryProviderDestroy(hProvider); - } - } - - umf_memory_provider_handle_t hProvider = nullptr; -}; - -TEST_F(memspaceHighestBandwidthTest, providerFromMemspace) { - umf_memory_provider_handle_t hProvider = nullptr; - umf_result_t ret = - umfMemoryProviderCreateFromMemspace(hMemspace, nullptr, &hProvider); - UT_ASSERTeq(ret, UMF_RESULT_SUCCESS); - UT_ASSERTne(hProvider, nullptr); - - umfMemoryProviderDestroy(hProvider); -} - -TEST_F(memspaceHighestBandwidthProviderTest, allocFree) { - void *ptr = nullptr; - size_t size = SIZE_4K; - size_t alignment = 0; - - umf_result_t ret = umfMemoryProviderAlloc(hProvider, size, alignment, &ptr); - UT_ASSERTeq(ret, UMF_RESULT_SUCCESS); - UT_ASSERTne(ptr, nullptr); - - // Access the allocation, so that all the pages associated with it are - // allocated on some NUMA node. - memset(ptr, 0xFF, size); - - ret = umfMemoryProviderFree(hProvider, ptr, size); - UT_ASSERTeq(ret, UMF_RESULT_SUCCESS); -} +INSTANTIATE_TEST_SUITE_P(memspaceLowestLatencyTest, memspaceGetTest, + ::testing::Values(memspaceGetParams{ + canQueryBandwidth, + umfMemspaceHighestBandwidthGet})); -static std::vector getAllCpus() { - std::vector allCpus; - for (int i = 0; i < numa_num_possible_cpus(); ++i) { - if (numa_bitmask_isbitset(numa_all_cpus_ptr, i)) { - allCpus.push_back(i); - } - } - - return allCpus; -} - -#define MAX_NODES 512 - -TEST_F(memspaceHighestBandwidthProviderTest, allocLocalMt) { - auto pinAllocValidate = [&](umf_memory_provider_handle_t hProvider, - int cpu) { - hwloc_topology_t topology = NULL; - UT_ASSERTeq(hwloc_topology_init(&topology), 0); - UT_ASSERTeq(hwloc_topology_load(topology), 0); - - // Pin current thread to the provided CPU. 
- hwloc_cpuset_t pinCpuset = hwloc_bitmap_alloc(); - UT_ASSERTeq(hwloc_bitmap_set(pinCpuset, cpu), 0); - UT_ASSERTeq( - hwloc_set_cpubind(topology, pinCpuset, HWLOC_CPUBIND_THREAD), 0); - - // Confirm that the thread is pinned to the provided CPU. - hwloc_cpuset_t curCpuset = hwloc_bitmap_alloc(); - UT_ASSERTeq( - hwloc_get_cpubind(topology, curCpuset, HWLOC_CPUBIND_THREAD), 0); - UT_ASSERT(hwloc_bitmap_isequal(curCpuset, pinCpuset)); - hwloc_bitmap_free(curCpuset); - hwloc_bitmap_free(pinCpuset); - - // Allocate some memory. - const size_t size = SIZE_4K; - const size_t alignment = 0; - void *ptr = nullptr; - - umf_result_t ret = - umfMemoryProviderAlloc(hProvider, size, alignment, &ptr); - UT_ASSERTeq(ret, UMF_RESULT_SUCCESS); - UT_ASSERTne(ptr, nullptr); - - // Access the allocation, so that all the pages associated with it are - // allocated on some NUMA node. - memset(ptr, 0xFF, size); - - // Get the NUMA node responsible for this allocation. - int mode = -1; - std::vector boundNodeIds; - size_t allocNodeId = SIZE_MAX; - getAllocationPolicy(ptr, maxNodeId, mode, boundNodeIds, allocNodeId); - - // Get the CPUs associated with the specified NUMA node. - hwloc_obj_t allocNodeObj = - hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, allocNodeId); - - unsigned nNodes = MAX_NODES; - std::vector localNodes(MAX_NODES); - hwloc_location loc; - loc.location.object = allocNodeObj, - loc.type = hwloc_location_type_alias::HWLOC_LOCATION_TYPE_OBJECT; - UT_ASSERTeq(hwloc_get_local_numanode_objs(topology, &loc, &nNodes, - localNodes.data(), 0), - 0); - UT_ASSERT(nNodes <= MAX_NODES); - - // Confirm that the allocation from this thread was made to a local - // NUMA node. - UT_ASSERT(std::any_of(localNodes.begin(), localNodes.end(), - [&allocNodeObj](hwloc_obj_t node) { - return node == allocNodeObj; - })); - - ret = umfMemoryProviderFree(hProvider, ptr, size); - UT_ASSERTeq(ret, UMF_RESULT_SUCCESS); - - hwloc_topology_destroy(topology); - }; - - const auto cpus = getAllCpus(); - std::vector threads; - for (auto cpu : cpus) { - threads.emplace_back(pinAllocValidate, hProvider, cpu); - } - - for (auto &thread : threads) { - thread.join(); - } -} +INSTANTIATE_TEST_SUITE_P(memspaceLowestLatencyProviderTest, + memspaceProviderTest, + ::testing::Values(memspaceGetParams{ + canQueryBandwidth, + umfMemspaceHighestBandwidthGet})); diff --git a/test/memspaces/memspace_highest_capacity.cpp b/test/memspaces/memspace_highest_capacity.cpp index 162aa18f7..b16f3f50d 100644 --- a/test/memspaces/memspace_highest_capacity.cpp +++ b/test/memspaces/memspace_highest_capacity.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "memory_target_numa.h" +#include "memspace_fixtures.hpp" #include "memspace_helpers.hpp" #include "memspace_internal.h" #include "numa_helpers.h" diff --git a/test/memspaces/memspace_host_all.cpp b/test/memspaces/memspace_host_all.cpp index e0326709b..588b24f15 100644 --- a/test/memspaces/memspace_host_all.cpp +++ b/test/memspaces/memspace_host_all.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "memory_target_numa.h" +#include "memspace_fixtures.hpp" #include "memspace_helpers.hpp" #include "memspace_internal.h" #include "numa_helpers.h" diff --git a/test/memspaces/memspace_lowest_latency.cpp b/test/memspaces/memspace_lowest_latency.cpp new file mode 100644 index 000000000..f0eabcf3e --- /dev/null +++ b/test/memspaces/memspace_lowest_latency.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2024 Intel Corporation +// Under the Apache 
License v2.0 with LLVM Exceptions. See LICENSE.TXT. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "memspace_fixtures.hpp" +#include "memspace_helpers.hpp" +#include "memspace_internal.h" +#include "test_helpers.h" + +#include +#include + +static bool canQueryLatency(size_t nodeId) { + hwloc_topology_t topology = nullptr; + int ret = hwloc_topology_init(&topology); + UT_ASSERTeq(ret, 0); + ret = hwloc_topology_load(topology); + UT_ASSERTeq(ret, 0); + + hwloc_obj_t numaNode = + hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, nodeId); + UT_ASSERTne(numaNode, nullptr); + + // Setup initiator structure. + struct hwloc_location initiator; + initiator.location.cpuset = numaNode->cpuset; + initiator.type = hwloc_location_type_alias::HWLOC_LOCATION_TYPE_CPUSET; + + hwloc_uint64_t value = 0; + ret = hwloc_memattr_get_value(topology, HWLOC_MEMATTR_ID_LATENCY, numaNode, + &initiator, 0, &value); + + hwloc_topology_destroy(topology); + return (ret == 0); +} + +INSTANTIATE_TEST_SUITE_P(memspaceLowestLatencyTest, memspaceGetTest, + ::testing::Values(memspaceGetParams{ + canQueryLatency, umfMemspaceLowestLatencyGet})); + +INSTANTIATE_TEST_SUITE_P(memspaceLowestLatencyProviderTest, + memspaceProviderTest, + ::testing::Values(memspaceGetParams{ + canQueryLatency, umfMemspaceLowestLatencyGet})); diff --git a/test/memspaces/memspace_numa.cpp b/test/memspaces/memspace_numa.cpp index c214ef189..225cccd9f 100644 --- a/test/memspaces/memspace_numa.cpp +++ b/test/memspaces/memspace_numa.cpp @@ -4,6 +4,7 @@ #include "memspaces/memspace_numa.h" #include "base.hpp" +#include "memspace_fixtures.hpp" #include "memspace_helpers.hpp" #include "memspace_internal.h" diff --git a/test/test_valgrind.sh b/test/test_valgrind.sh index 5680fefef..62252fcdb 100755 --- a/test/test_valgrind.sh +++ b/test/test_valgrind.sh @@ -103,6 +103,9 @@ for test in $(ls -1 umf_test-*); do umf_test-memspace_highest_bandwidth) FILTER='--gtest_filter="-*allocLocalMt*"' ;; + umf_test-memspace_lowest_latency) + FILTER='--gtest_filter="-*allocLocalMt*"' + ;; esac [ "$FILTER" != "" ] && echo -n "($FILTER) " From 8f394d8afc4e5273886d2b1a8baf99db5a969663 Mon Sep 17 00:00:00 2001 From: Krzysztof Swiecicki Date: Wed, 22 May 2024 10:08:25 +0000 Subject: [PATCH 3/3] Add brief introduction of lowest latency memspace --- README.md | 5 +++++ include/umf/memspace.h | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index de833eeab..9204ab7c9 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,11 @@ Can be retrieved using umfMemspaceHighestCapacityGet. Memspace backed by an aggregated list of NUMA nodes identified as highest bandwidth after selecting each available NUMA node as the initiator. Querying the bandwidth value requires HMAT support on the platform. Calling `umfMemspaceHighestBandwidthGet()` will return NULL if it's not supported. +#### Lowest latency memspace + +Memspace backed by an aggregated list of NUMA nodes identified as lowest latency after selecting each available NUMA node as the initiator. +Querying the latency value requires HMAT support on the platform. Calling `umfMemspaceLowestLatencyGet()` will return NULL if it's not supported. 
+ ### Proxy library UMF provides the UMF proxy library (`umf_proxy`) that makes it possible diff --git a/include/umf/memspace.h b/include/umf/memspace.h index 8acfa9325..694675974 100644 --- a/include/umf/memspace.h +++ b/include/umf/memspace.h @@ -57,12 +57,14 @@ umf_memspace_handle_t umfMemspaceHostAllGet(void); umf_memspace_handle_t umfMemspaceHighestCapacityGet(void); /// \brief Retrieves predefined highest bandwidth memspace. -/// \return highest bandwidth memspace handle on success or NULL on failure. +/// \return highest bandwidth memspace handle on success or NULL on +/// failure (no HMAT support). /// umf_memspace_handle_t umfMemspaceHighestBandwidthGet(void); /// \brief Retrieves predefined lowest latency memspace. -/// \return lowest latency memspace handle on success or NULL on failure. +/// \return lowest latency memspace handle on success or NULL on +/// failure (no HMAT support). /// umf_memspace_handle_t umfMemspaceLowestLatencyGet(void);
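For reference, the sketch below shows how the entry point added in this series is meant to be consumed, using only the calls exercised by the tests above (umfMemspaceLowestLatencyGet, umfMemoryProviderCreateFromMemspace, umfMemoryProviderAlloc/Free/Destroy). It is an illustrative example, not part of the patch; the <umf/memory_provider.h> include and the main() wrapper are assumptions about the surrounding application, not something this series introduces.

    /* Illustrative consumer of the new "lowest latency" memspace. */
    #include <stdio.h>
    #include <string.h>

    #include <umf/memory_provider.h> /* assumed header for the provider API */
    #include <umf/memspace.h>

    int main(void) {
        /* NULL means the latency query is unsupported (e.g. no HMAT). */
        umf_memspace_handle_t memspace = umfMemspaceLowestLatencyGet();
        if (!memspace) {
            fprintf(stderr, "lowest latency memspace not available\n");
            return 1;
        }

        umf_memory_provider_handle_t provider = NULL;
        umf_result_t ret =
            umfMemoryProviderCreateFromMemspace(memspace, NULL, &provider);
        if (ret != UMF_RESULT_SUCCESS) {
            return 1;
        }

        void *ptr = NULL;
        ret = umfMemoryProviderAlloc(provider, 4096, 0, &ptr);
        if (ret == UMF_RESULT_SUCCESS) {
            memset(ptr, 0, 4096); /* touch the pages */
            umfMemoryProviderFree(provider, ptr, 4096);
        }

        umfMemoryProviderDestroy(provider);
        /* The predefined memspace is owned by the library (torn down in
         * umfTearDown), so it is not destroyed here. */
        return 0;
    }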