Skip to content

Add memspace "lowest latency" #509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,11 @@ Can be retrieved using umfMemspaceHighestCapacityGet.
Memspace backed by an aggregated list of NUMA nodes identified as highest bandwidth after selecting each available NUMA node as the initiator.
Querying the bandwidth value requires HMAT support on the platform. Calling `umfMemspaceHighestBandwidthGet()` will return NULL if it's not supported.

#### Lowest latency memspace

Memspace backed by an aggregated list of NUMA nodes identified as lowest latency after selecting each available NUMA node as the initiator.
Querying the latency value requires HMAT support on the platform. Calling `umfMemspaceLowestLatencyGet()` will return NULL if it's not supported.

### Proxy library

UMF provides the UMF proxy library (`umf_proxy`) that makes it possible
Expand Down
9 changes: 8 additions & 1 deletion include/umf/memspace.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,17 @@ umf_memspace_handle_t umfMemspaceHostAllGet(void);
umf_memspace_handle_t umfMemspaceHighestCapacityGet(void);

/// \brief Retrieves predefined highest bandwidth memspace.
/// \return highest bandwidth memspace handle on success or NULL on failure.
/// \return highest bandwidth memspace handle on success or NULL on
/// failure (no HMAT support).
///
umf_memspace_handle_t umfMemspaceHighestBandwidthGet(void);

/// \brief Retrieves predefined lowest latency memspace.
/// \return lowest latency memspace handle on success or NULL on
/// failure (no HMAT support).
///
umf_memspace_handle_t umfMemspaceLowestLatencyGet(void);

#ifdef __cplusplus
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ set(UMF_SOURCES_COMMON_LINUX_MACOSX
memspaces/memspace_numa.c
memspaces/memspace_host_all.c
memspaces/memspace_highest_capacity.c
memspaces/memspace_highest_bandwidth.c)
memspaces/memspace_highest_bandwidth.c
memspaces/memspace_lowest_latency.c)

set(UMF_SOURCES_LINUX ${UMF_SOURCES_LINUX} ${UMF_SOURCES_COMMON_LINUX_MACOSX}
provider/provider_os_memory_linux.c)
Expand Down
1 change: 1 addition & 0 deletions src/libumf.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void umfTearDown(void) {
umfMemspaceHostAllDestroy();
umfMemspaceHighestCapacityDestroy();
umfMemspaceHighestBandwidthDestroy();
umfMemspaceLowestLatencyDestroy();
umfDestroyTopology();
#endif
// make sure TRACKER is not used after being destroyed
Expand Down
1 change: 1 addition & 0 deletions src/libumf.map
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ UMF_1.0 {
umfMemspaceHighestBandwidthGet;
umfMemspaceHighestCapacityGet;
umfMemspaceHostAllGet;
umfMemspaceLowestLatencyGet;
umfOpenIPCHandle;
umfOsMemoryProviderOps;
umfPoolAlignedMalloc;
Expand Down
12 changes: 12 additions & 0 deletions src/memory_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,15 @@ umfMemoryTargetGetBandwidth(umf_memory_target_handle_t srcMemoryTarget,
return srcMemoryTarget->ops->get_bandwidth(
srcMemoryTarget->priv, dstMemoryTarget->priv, bandwidth);
}

/// \brief Queries the latency between two memory targets.
///
/// Validates the arguments and then delegates to the source target's
/// ops vtable (get_latency), mirroring umfMemoryTargetGetBandwidth.
///
/// \param srcMemoryTarget initiator memory target
/// \param dstMemoryTarget destination memory target
/// \param latency [out] receives the latency value on success
/// \return UMF_RESULT_ERROR_INVALID_ARGUMENT when any argument is NULL,
///         otherwise whatever the ops implementation returns.
umf_result_t
umfMemoryTargetGetLatency(umf_memory_target_handle_t srcMemoryTarget,
                          umf_memory_target_handle_t dstMemoryTarget,
                          size_t *latency) {
    if (srcMemoryTarget == NULL || dstMemoryTarget == NULL ||
        latency == NULL) {
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }

    // The source target's implementation decides how latency is measured.
    const umf_memory_target_ops_t *ops = srcMemoryTarget->ops;
    return ops->get_latency(srcMemoryTarget->priv, dstMemoryTarget->priv,
                            latency);
}
4 changes: 4 additions & 0 deletions src/memory_target.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ umf_result_t
umfMemoryTargetGetBandwidth(umf_memory_target_handle_t srcMemoryTarget,
umf_memory_target_handle_t dstMemoryTarget,
size_t *bandwidth);
umf_result_t
umfMemoryTargetGetLatency(umf_memory_target_handle_t srcMemoryTarget,
umf_memory_target_handle_t dstMemoryTarget,
size_t *latency);

#ifdef __cplusplus
}
Expand Down
2 changes: 2 additions & 0 deletions src/memory_target_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ typedef struct umf_memory_target_ops_t {
umf_result_t (*get_capacity)(void *memoryTarget, size_t *capacity);
umf_result_t (*get_bandwidth)(void *srcMemoryTarget, void *dstMemoryTarget,
size_t *bandwidth);
umf_result_t (*get_latency)(void *srcMemoryTarget, void *dstMemoryTarget,
size_t *latency);
} umf_memory_target_ops_t;

#ifdef __cplusplus
Expand Down
90 changes: 78 additions & 12 deletions src/memory_targets/memory_target_numa.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,26 @@ static umf_result_t numa_get_capacity(void *memTarget, size_t *capacity) {
return UMF_RESULT_SUCCESS;
}

static umf_result_t numa_get_bandwidth(void *srcMemoryTarget,
void *dstMemoryTarget,
size_t *bandwidth) {
if (!srcMemoryTarget || !dstMemoryTarget || !bandwidth) {
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
// Memory attribute kinds that can be queried for a pair of NUMA nodes.
typedef enum memattr_type_t {
    MEMATTR_TYPE_BANDWIDTH,
    MEMATTR_TYPE_LATENCY
} memattr_type_t;

// Returns the "worst possible" value for the given attribute, used for
// node pairs that cannot be queried so they are never preferred:
// bandwidth is better when higher (worst = 0), latency is better when
// lower (worst = SIZE_MAX).
static size_t memattr_get_worst_value(memattr_type_t type) {
    if (type == MEMATTR_TYPE_BANDWIDTH) {
        return 0;
    }
    if (type == MEMATTR_TYPE_LATENCY) {
        return SIZE_MAX;
    }
    assert(0); // Should not be reachable
    return 0;
}

static umf_result_t query_attribute_value(void *srcMemoryTarget,
void *dstMemoryTarget, size_t *value,
memattr_type_t type) {
hwloc_topology_t topology = umfGetTopology();
if (!topology) {
return UMF_RESULT_ERROR_NOT_SUPPORTED;
Expand All @@ -195,23 +208,75 @@ static umf_result_t numa_get_bandwidth(void *srcMemoryTarget,

// Given NUMA nodes aren't local, HWLOC returns an error in such case.
if (!hwloc_bitmap_intersects(srcNumaNode->cpuset, dstNumaNode->cpuset)) {
*bandwidth = 0;
// Since we want to skip such query, we return the worst possible
// value for given memory attribute.
*value = memattr_get_worst_value(type);
return UMF_RESULT_SUCCESS;
}

enum hwloc_memattr_id_e hwlocMemAttrType = INT_MAX;
switch (type) {
case MEMATTR_TYPE_BANDWIDTH:
hwlocMemAttrType = HWLOC_MEMATTR_ID_BANDWIDTH;
break;
case MEMATTR_TYPE_LATENCY:
hwlocMemAttrType = HWLOC_MEMATTR_ID_LATENCY;
break;
default:
assert(0); // Shouldn't be reachable.
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
}

struct hwloc_location initiator = {.location.cpuset = srcNumaNode->cpuset,
.type = HWLOC_LOCATION_TYPE_CPUSET};
hwloc_uint64_t value = 0;
int ret = hwloc_memattr_get_value(topology, HWLOC_MEMATTR_ID_BANDWIDTH,
dstNumaNode, &initiator, 0, &value);

hwloc_uint64_t memAttrValue = 0;
int ret = hwloc_memattr_get_value(topology, hwlocMemAttrType, dstNumaNode,
&initiator, 0, &memAttrValue);
if (ret) {
LOG_ERR("Retrieving bandwidth for initiator node %u to node %u failed.",
srcNumaNode->os_index, dstNumaNode->os_index);
return (errno == EINVAL) ? UMF_RESULT_ERROR_NOT_SUPPORTED
: UMF_RESULT_ERROR_UNKNOWN;
}

*bandwidth = value;
*value = memAttrValue;

return UMF_RESULT_SUCCESS;
}

// Fills *bandwidth with the bandwidth value between the two NUMA memory
// targets (via query_attribute_value). Returns
// UMF_RESULT_ERROR_INVALID_ARGUMENT on NULL arguments; any query failure
// is logged with the physical node ids and propagated unchanged.
static umf_result_t numa_get_bandwidth(void *srcMemoryTarget,
                                       void *dstMemoryTarget,
                                       size_t *bandwidth) {
    if (srcMemoryTarget == NULL || dstMemoryTarget == NULL ||
        bandwidth == NULL) {
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }

    umf_result_t ret = query_attribute_value(srcMemoryTarget, dstMemoryTarget,
                                             bandwidth, MEMATTR_TYPE_BANDWIDTH);
    if (ret) {
        struct numa_memory_target_t *src =
            (struct numa_memory_target_t *)srcMemoryTarget;
        struct numa_memory_target_t *dst =
            (struct numa_memory_target_t *)dstMemoryTarget;
        LOG_ERR("Retrieving bandwidth for initiator node %u to node %u failed.",
                src->physical_id, dst->physical_id);
        return ret;
    }

    return UMF_RESULT_SUCCESS;
}

// Fills *latency with the latency value between the two NUMA memory
// targets (via query_attribute_value). Returns
// UMF_RESULT_ERROR_INVALID_ARGUMENT on NULL arguments; any query failure
// is logged with the physical node ids and propagated unchanged.
static umf_result_t numa_get_latency(void *srcMemoryTarget,
                                     void *dstMemoryTarget, size_t *latency) {
    if (srcMemoryTarget == NULL || dstMemoryTarget == NULL ||
        latency == NULL) {
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }

    umf_result_t ret = query_attribute_value(srcMemoryTarget, dstMemoryTarget,
                                             latency, MEMATTR_TYPE_LATENCY);
    if (ret) {
        struct numa_memory_target_t *src =
            (struct numa_memory_target_t *)srcMemoryTarget;
        struct numa_memory_target_t *dst =
            (struct numa_memory_target_t *)dstMemoryTarget;
        LOG_ERR("Retrieving latency for initiator node %u to node %u failed.",
                src->physical_id, dst->physical_id);
        return ret;
    }

    return UMF_RESULT_SUCCESS;
}

Expand All @@ -223,5 +288,6 @@ struct umf_memory_target_ops_t UMF_MEMORY_TARGET_NUMA_OPS = {
.clone = numa_clone,
.get_capacity = numa_get_capacity,
.get_bandwidth = numa_get_bandwidth,
.get_latency = numa_get_latency,
.memory_provider_create_from_memspace =
numa_memory_provider_create_from_memspace};
1 change: 1 addition & 0 deletions src/memspace_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ void umfMemspaceDestroy(umf_memspace_handle_t hMemspace);
void umfMemspaceHostAllDestroy(void);
void umfMemspaceHighestCapacityDestroy(void);
void umfMemspaceHighestBandwidthDestroy(void);
void umfMemspaceLowestLatencyDestroy(void);

#ifdef __cplusplus
}
Expand Down
103 changes: 103 additions & 0 deletions src/memspaces/memspace_lowest_latency.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
*
* Copyright (C) 2024 Intel Corporation
*
* Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*/

#include <assert.h>
#include <ctype.h>
#include <hwloc.h>
#include <stdlib.h>

#include "base_alloc_global.h"
#include "memory_target_numa.h"
#include "memspace_internal.h"
#include "memspace_numa.h"
#include "topology.h"
#include "utils_common.h"
#include "utils_concurrency.h"
#include "utils_log.h"

// Selection callback: out of `nodes`, picks the target with the lowest
// latency measured from `initiator` and stores it in *target.
//
// Any error from umfMemoryTargetGetLatency (e.g. no HMAT support) is
// propagated unchanged.
//
// Fix: an empty node list (or NULL pointers) previously fell through to
// `*target = nodes[0]`, reading past the end of the array — reject such
// input explicitly instead.
static umf_result_t getBestLatencyTarget(umf_memory_target_handle_t initiator,
                                         umf_memory_target_handle_t *nodes,
                                         size_t numNodes,
                                         umf_memory_target_handle_t *target) {
    if (!initiator || !nodes || !target || numNodes == 0) {
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }

    size_t bestNodeIdx = 0;
    size_t bestLatency = SIZE_MAX;
    for (size_t nodeIdx = 0; nodeIdx < numNodes; nodeIdx++) {
        size_t latency = SIZE_MAX;
        umf_result_t ret =
            umfMemoryTargetGetLatency(initiator, nodes[nodeIdx], &latency);
        if (ret) {
            return ret;
        }

        if (latency < bestLatency) {
            bestNodeIdx = nodeIdx;
            bestLatency = latency;
        }
    }

    *target = nodes[bestNodeIdx];

    return UMF_RESULT_SUCCESS;
}

// Builds the lowest-latency memspace by filtering the host-all memspace
// with the getBestLatencyTarget selector. On success the new memspace is
// stored in *hMemspace; on failure *hMemspace is left untouched and the
// error is propagated.
static umf_result_t
umfMemspaceLowestLatencyCreate(umf_memspace_handle_t *hMemspace) {
    if (hMemspace == NULL) {
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }

    umf_memspace_handle_t hostAll = umfMemspaceHostAllGet();
    if (hostAll == NULL) {
        return UMF_RESULT_ERROR_UNKNOWN;
    }

    umf_memspace_handle_t filtered = NULL;
    umf_result_t ret =
        umfMemspaceFilter(hostAll, getBestLatencyTarget, &filtered);
    if (ret == UMF_RESULT_SUCCESS) {
        *hMemspace = filtered;
    }
    // A failure here can come from HWLOC returning 'EINVAL', which in this
    // context means the HMAT is unavailable and no NUMA node's 'latency'
    // value can be obtained.
    return ret;
}

static umf_memspace_handle_t UMF_MEMSPACE_LOWEST_LATENCY = NULL;
static UTIL_ONCE_FLAG UMF_MEMSPACE_LOWEST_LATENCY_INITIALIZED =
UTIL_ONCE_FLAG_INIT;

// Tears down the cached lowest-latency memspace. Idempotent: safe to call
// when the memspace was never created or was already destroyed.
void umfMemspaceLowestLatencyDestroy(void) {
    if (UMF_MEMSPACE_LOWEST_LATENCY == NULL) {
        return;
    }
    umfMemspaceDestroy(UMF_MEMSPACE_LOWEST_LATENCY);
    UMF_MEMSPACE_LOWEST_LATENCY = NULL;
}

// One-time initializer (run through util_init_once from the Get accessor)
// that creates the cached lowest-latency memspace. On failure the cached
// handle stays NULL; the assert documents that the only expected failure
// is missing HMAT support (UMF_RESULT_ERROR_NOT_SUPPORTED).
static void umfMemspaceLowestLatencyInit(void) {
    umf_result_t ret =
        umfMemspaceLowestLatencyCreate(&UMF_MEMSPACE_LOWEST_LATENCY);
    if (ret != UMF_RESULT_SUCCESS) {
        LOG_ERR("Creating the lowest latency memspace failed with a %u error\n",
                ret);
        assert(ret == UMF_RESULT_ERROR_NOT_SUPPORTED);
    }

// NOTE(review): cleanup is registered via atexit only for Windows
// static-library builds — presumably other configurations release this in
// umfTearDown(); confirm against libumf.c.
#if defined(_WIN32) && !defined(UMF_SHARED_LIBRARY)
    atexit(umfMemspaceLowestLatencyDestroy);
#endif
}

// Public accessor: runs the initializer at most once (util_init_once with
// the once-flag above) and returns the cached lowest-latency memspace, or
// NULL when creation failed (e.g. no HMAT support on the platform).
umf_memspace_handle_t umfMemspaceLowestLatencyGet(void) {
    util_init_once(&UMF_MEMSPACE_LOWEST_LATENCY_INITIALIZED,
                   umfMemspaceLowestLatencyInit);
    return UMF_MEMSPACE_LOWEST_LATENCY;
}
12 changes: 8 additions & 4 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,23 +162,27 @@ if(LINUX) # OS-specific functions are implemented only for Linux now
add_umf_test(
NAME memspace_numa
SRCS memspaces/memspace_numa.cpp
LIBS ${LIBNUMA_LIBRARIES})
LIBS ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME provider_os_memory_config
SRCS provider_os_memory_config.cpp
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES})
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME memspace_host_all
SRCS memspaces/memspace_host_all.cpp
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES})
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME memspace_highest_capacity
SRCS memspaces/memspace_highest_capacity.cpp
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES})
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME memspace_highest_bandwidth
SRCS memspaces/memspace_highest_bandwidth.cpp
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME memspace_lowest_latency
SRCS memspaces/memspace_lowest_latency.cpp
LIBS ${UMF_UTILS_FOR_TEST} ${LIBNUMA_LIBRARIES} ${LIBHWLOC_LIBRARIES})
add_umf_test(
NAME mempolicy
SRCS memspaces/mempolicy.cpp
Expand Down
Loading