Skip to content

Commit ca878c5

Browse files
authored
Merge branch 'main' into fabio/immediate_append_exp
2 parents 66c80c9 + f98289c commit ca878c5

File tree

20 files changed

+219
-85
lines changed

20 files changed

+219
-85
lines changed

.github/workflows/build-fuzz-reusable.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ jobs:
4747
cmake --build build -j $(nproc)
4848
4949
- name: Configure CMake
50+
# CFI sanitization (or the -flto it requires?) seems to cause linking to fail
51+
# https://github.com/oneapi-src/unified-runtime/issues/2323
5052
run: >
5153
cmake
5254
-B${{github.workspace}}/build
@@ -58,6 +60,7 @@ jobs:
5860
-DUR_USE_ASAN=ON
5961
-DUR_USE_UBSAN=ON
6062
-DUR_BUILD_ADAPTER_L0=ON
63+
-DUR_USE_CFI=OFF
6164
-DUR_LEVEL_ZERO_LOADER_LIBRARY=${{github.workspace}}/level-zero/build/lib/libze_loader.so
6265
-DUR_LEVEL_ZERO_INCLUDE_DIR=${{github.workspace}}/level-zero/include/
6366
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++

.github/workflows/build-hw-reusable.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ jobs:
8282
tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler
8383
8484
- name: Configure CMake
85+
# CFI sanitization seems to fail on our CUDA nodes
86+
# https://github.com/oneapi-src/unified-runtime/issues/2309
8587
run: >
8688
cmake
8789
-B${{github.workspace}}/build
@@ -94,6 +96,7 @@ jobs:
9496
-DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON
9597
-DUR_CONFORMANCE_TEST_LOADER=${{ matrix.adapter.other_name != '' && 'ON' || 'OFF' }}
9698
${{ matrix.adapter.other_name != '' && format('-DUR_BUILD_ADAPTER_{0}=ON', matrix.adapter.other_name) || '' }}
99+
-DUR_USE_CFI=${{ matrix.adapter.name == 'CUDA' && 'OFF' || 'ON' }}
97100
-DUR_STATIC_LOADER=${{matrix.adapter.static_Loader}}
98101
-DUR_STATIC_ADAPTER_${{matrix.adapter.name}}=${{matrix.adapter.static_adapter}}
99102
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++

.github/workflows/cmake.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,12 @@ jobs:
256256
compiler: {c: clang-cl, cxx: clang-cl}
257257

258258
build_type: [Debug, Release]
259-
compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}]
259+
# TODO: clang-cl seems to be fully broken (https://github.com/oneapi-src/unified-runtime/issues/2348)
260+
#compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}]
261+
compiler: [{c: cl, cxx: cl}]
260262
include:
261-
- compiler: {c: clang-cl, cxx: clang-cl}
262-
toolset: "-T ClangCL"
263+
#- compiler: {c: clang-cl, cxx: clang-cl}
264+
# toolset: "-T ClangCL"
263265
- os: 'windows-2022'
264266
adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'}
265267
build_type: 'Release'

.github/workflows/coverity.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
coverity:
1515
name: Coverity
1616
# run only on upstream; forks don't have token for upstream's cov project
17-
if: github.repository == 'oneapi-src/unified-memory-framework'
17+
if: github.repository == 'oneapi-src/unified-runtime'
1818
runs-on: ubuntu-latest
1919

2020
steps:
@@ -64,18 +64,18 @@ jobs:
6464
if [ -n "$COVERITY_DIR" ]; then
6565
export PATH="$PATH:$COVERITY_DIR/bin"
6666
fi
67-
cov-build --dir ${{github.workspace}}/coverity-files cmake --build ${{github.workspace}}/build --config Release -j$(nproc)
67+
cov-build --dir ${{github.workspace}}/cov-int cmake --build ${{github.workspace}}/build --config Release -j$(nproc)
6868
6969
- name: Create tarball to analyze
70-
run: tar czvf ur-coverity-files.tgz coverity-files
70+
run: tar czvf cov-int_ur.tgz cov-int
7171

7272
- name: Push tarball to scan
7373
run: |
7474
BRANCH_NAME=$(echo ${GITHUB_REF_NAME})
7575
COMMIT_ID=$(echo $GITHUB_SHA)
7676
curl --form token=${{ secrets.COVERITY_SCAN_TOKEN }} \
7777
--form email=bb-ur@intel.com \
78-
--form file=@ur-coverity-files.tgz \
78+
--form file=@cov-int_ur.tgz \
7979
--form version="$COMMIT_ID" \
8080
--form description="$BRANCH_NAME:$COMMIT_ID" \
8181
https://scan.coverity.com/builds\?project\=oneapi-src%2Funified-runtime

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ option(UR_USE_ASAN "enable AddressSanitizer" OFF)
4040
option(UR_USE_UBSAN "enable UndefinedBehaviorSanitizer" OFF)
4141
option(UR_USE_MSAN "enable MemorySanitizer" OFF)
4242
option(UR_USE_TSAN "enable ThreadSanitizer" OFF)
43+
option(UR_USE_CFI "enable Control Flow Integrity checks (requires clang and implies -flto)" ON)
4344
option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF)
4445
option(UR_ENABLE_SANITIZER "enable device sanitizer" ON)
4546
option(UR_ENABLE_SYMBOLIZER "enable symoblizer for sanitizer" OFF)

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ List of options provided by CMake:
130130
| UR_USE_TSAN | Enable ThreadSanitizer | ON/OFF | OFF |
131131
| UR_USE_UBSAN | Enable UndefinedBehavior Sanitizer | ON/OFF | OFF |
132132
| UR_USE_MSAN | Enable MemorySanitizer (clang only) | ON/OFF | OFF |
133+
| UR_USE_CFI | Enable Control Flow Integrity checks (clang only, also enables lto) | ON/OFF | ON |
133134
| UR_ENABLE_TRACING | Enable XPTI-based tracing layer | ON/OFF | OFF |
134135
| UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON |
135136
| UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 |

cmake/helpers.cmake

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ if(CMAKE_SYSTEM_NAME STREQUAL Linux)
6363
check_cxx_compiler_flag("-fstack-clash-protection" CXX_HAS_FSTACK_CLASH_PROTECTION)
6464
endif()
6565

66+
if (UR_USE_CFI)
67+
set(SAVED_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
68+
set(CMAKE_REQUIRED_FLAGS "-flto -fvisibility=hidden")
69+
check_cxx_compiler_flag("-fsanitize=cfi" CXX_HAS_CFI_SANITIZE)
70+
set(CMAKE_REQUIRED_FLAGS ${SAVED_CMAKE_REQUIRED_FLAGS})
71+
else()
72+
# If CFI checking is disabled, pretend we don't support it
73+
set(CXX_HAS_CFI_SANITIZE OFF)
74+
endif()
75+
6676
function(add_ur_target_compile_options name)
6777
if(NOT MSVC)
6878
target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2)
@@ -78,11 +88,10 @@ function(add_ur_target_compile_options name)
7888
# Hardening options
7989
-fPIC
8090
-fstack-protector-strong
81-
-fvisibility=hidden # Required for -fsanitize=cfi
82-
# -fsanitize=cfi requires -flto, which breaks a lot of things
83-
# See: https://github.com/oneapi-src/unified-runtime/issues/2120
84-
# -flto
85-
# $<$<CXX_COMPILER_ID:Clang,AppleClang>:-fsanitize=cfi>
91+
-fvisibility=hidden
92+
# cfi-icall requires called functions in shared libraries to also be built with cfi-icall, which we can't
93+
# guarantee. -fsanitize=cfi depends on -flto
94+
$<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
8695
$<$<BOOL:${CXX_HAS_FCF_PROTECTION_FULL}>:-fcf-protection=full>
8796
$<$<BOOL:${CXX_HAS_FSTACK_CLASH_PROTECTION}>:-fstack-clash-protection>
8897

@@ -119,7 +128,10 @@ endfunction()
119128
function(add_ur_target_link_options name)
120129
if(NOT MSVC)
121130
if (NOT APPLE)
122-
target_link_options(${name} PRIVATE "LINKER:-z,relro,-z,now,-z,noexecstack")
131+
target_link_options(${name} PRIVATE
132+
$<$<BOOL:${CXX_HAS_CFI_SANITIZE}>:-flto -fsanitize=cfi -fno-sanitize=cfi-icall>
133+
"LINKER:-z,relro,-z,now,-z,noexecstack"
134+
)
123135
if (UR_DEVELOPER_MODE)
124136
target_link_options(${name} PRIVATE -Werror -Wextra)
125137
endif()

scripts/templates/ur_api.hpp.mako

Whitespace-only changes.

source/adapters/cuda/enqueue.cpp

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -961,35 +961,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
961961

962962
// CUDA has no memset functions that allow setting values more than 4 bytes. UR
963963
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
964-
// more than 4 bytes. We must break up the pattern into 1 byte values, and set
965-
// the buffer using multiple strided calls. The first 4 patterns are set using
966-
// cuMemsetD32Async then all subsequent 1 byte patterns are set using
967-
// cuMemset2DAsync which is called for each pattern.
964+
// more than 4 bytes. We must break up the pattern into 1, 2 or 4-byte values
965+
// and set the buffer using multiple strided calls.
968966
ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize,
969967
size_t Size, const void *pPattern,
970968
CUdeviceptr Ptr) {
971-
// Calculate the number of patterns, stride, number of times the pattern
972-
// needs to be applied, and the number of times the first 32 bit pattern
973-
// needs to be applied.
974-
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
975-
auto Pitch = NumberOfSteps * sizeof(uint8_t);
976-
auto Height = Size / NumberOfSteps;
977-
auto Count32 = Size / sizeof(uint32_t);
978-
979-
// Get 4-byte chunk of the pattern and call cuMemsetD32Async
980-
auto Value = *(static_cast<const uint32_t *>(pPattern));
981-
UR_CHECK_ERROR(cuMemsetD32Async(Ptr, Value, Count32, Stream));
982-
for (auto step = 4u; step < NumberOfSteps; ++step) {
983-
// take 1 byte of the pattern
984-
Value = *(static_cast<const uint8_t *>(pPattern) + step);
985-
986-
// offset the pointer to the part of the buffer we want to write to
987-
auto OffsetPtr = Ptr + (step * sizeof(uint8_t));
988-
989-
// set all of the pattern chunks
990-
UR_CHECK_ERROR(cuMemsetD2D8Async(OffsetPtr, Pitch, Value, sizeof(uint8_t),
991-
Height, Stream));
969+
// Find the largest supported word size into which the pattern can be divided
970+
auto BackendWordSize = PatternSize % 4u == 0u ? 4u
971+
: PatternSize % 2u == 0u ? 2u
972+
: 1u;
973+
974+
// Calculate the number of words in the pattern, the stride, and the number of
975+
// times the pattern needs to be applied
976+
auto NumberOfSteps = PatternSize / BackendWordSize;
977+
auto Pitch = NumberOfSteps * BackendWordSize;
978+
auto Height = Size / PatternSize;
979+
980+
// Same implementation works for any pattern word type (uint8_t, uint16_t,
981+
// uint32_t)
982+
auto memsetImpl = [BackendWordSize, NumberOfSteps, Pitch, Height, Size, Ptr,
983+
&Stream](const auto *pPatternWords,
984+
auto &&continuousMemset, auto &&stridedMemset) {
985+
// If the pattern is 1 word or the first word is repeated throughout, a fast
986+
// continuous fill can be used without the need for slower strided fills
987+
bool UseOnlyFirstValue{true};
988+
for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
989+
if (*(pPatternWords + Step) != *pPatternWords) {
990+
UseOnlyFirstValue = false;
991+
}
992+
}
993+
auto OptimizedNumberOfSteps{UseOnlyFirstValue ? 1u : NumberOfSteps};
994+
995+
// Fill the pattern in steps of BackendWordSize bytes. Use a continuous
996+
// fill in the first step because it's faster than a strided fill. Then,
997+
// overwrite the other values in subsequent steps.
998+
for (auto Step{0u}; Step < OptimizedNumberOfSteps; ++Step) {
999+
if (Step == 0) {
1000+
UR_CHECK_ERROR(continuousMemset(Ptr, *(pPatternWords),
1001+
Size / BackendWordSize, Stream));
1002+
} else {
1003+
UR_CHECK_ERROR(stridedMemset(Ptr + Step * BackendWordSize, Pitch,
1004+
*(pPatternWords + Step), 1u, Height,
1005+
Stream));
1006+
}
1007+
}
1008+
};
1009+
1010+
// Apply the implementation to the chosen pattern word type
1011+
switch (BackendWordSize) {
1012+
case 4u: {
1013+
memsetImpl(static_cast<const uint32_t *>(pPattern), cuMemsetD32Async,
1014+
cuMemsetD2D32Async);
1015+
break;
1016+
}
1017+
case 2u: {
1018+
memsetImpl(static_cast<const uint16_t *>(pPattern), cuMemsetD16Async,
1019+
cuMemsetD2D16Async);
1020+
break;
1021+
}
1022+
default: {
1023+
memsetImpl(static_cast<const uint8_t *>(pPattern), cuMemsetD8Async,
1024+
cuMemsetD2D8Async);
1025+
break;
9921026
}
1027+
}
1028+
9931029
return UR_RESULT_SUCCESS;
9941030
}
9951031

source/adapters/hip/enqueue.cpp

Lines changed: 62 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -712,25 +712,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
712712

713713
static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
714714
size_t Size, const void *pPattern,
715-
hipDeviceptr_t Ptr) {
715+
hipDeviceptr_t Ptr,
716+
uint32_t StartOffset) {
717+
// Calculate the number of times the pattern needs to be applied
718+
auto Height = Size / PatternSize;
716719

717-
// Calculate the number of patterns, stride and the number of times the
718-
// pattern needs to be applied.
719-
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
720-
auto Pitch = NumberOfSteps * sizeof(uint8_t);
721-
auto Height = Size / NumberOfSteps;
722-
723-
for (auto step = 4u; step < NumberOfSteps; ++step) {
720+
for (auto step = StartOffset; step < PatternSize; ++step) {
724721
// take 1 byte of the pattern
725722
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);
726723

727724
// offset the pointer to the part of the buffer we want to write to
728-
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
729-
(step * sizeof(uint8_t)));
725+
auto OffsetPtr =
726+
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) + step);
730727

731728
// set all of the pattern chunks
732-
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
733-
Height, Stream));
729+
UR_CHECK_ERROR(
730+
hipMemset2DAsync(OffsetPtr, PatternSize, Value, 1u, Height, Stream));
734731
}
735732
}
736733

@@ -743,11 +740,55 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
743740
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
744741
size_t Size, const void *pPattern,
745742
hipDeviceptr_t Ptr) {
743+
// Find the largest supported word size into which the pattern can be divided
744+
auto BackendWordSize = PatternSize % 4u == 0u ? 4u
745+
: PatternSize % 2u == 0u ? 2u
746+
: 1u;
747+
748+
// Calculate the number of words in the pattern
749+
auto NumberOfSteps = PatternSize / BackendWordSize;
750+
751+
// If the pattern is 1 word or the first word is repeated throughout, a fast
752+
// continuous fill can be used without the need for slower strided fills
753+
bool UseOnlyFirstValue{true};
754+
auto checkIfFirstWordRepeats = [&UseOnlyFirstValue,
755+
NumberOfSteps](const auto *pPatternWords) {
756+
for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
757+
if (*(pPatternWords + Step) != *pPatternWords) {
758+
UseOnlyFirstValue = false;
759+
}
760+
}
761+
};
746762

747-
// Get 4-byte chunk of the pattern and call hipMemsetD32Async
748-
auto Count32 = Size / sizeof(uint32_t);
749-
auto Value = *(static_cast<const uint32_t *>(pPattern));
750-
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
763+
// Use a continuous fill for the first word in the pattern because it's faster
764+
// than a strided fill. Then, overwrite the other values in subsequent steps.
765+
switch (BackendWordSize) {
766+
case 4u: {
767+
auto *pPatternWords = static_cast<const uint32_t *>(pPattern);
768+
checkIfFirstWordRepeats(pPatternWords);
769+
UR_CHECK_ERROR(
770+
hipMemsetD32Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
771+
break;
772+
}
773+
case 2u: {
774+
auto *pPatternWords = static_cast<const uint16_t *>(pPattern);
775+
checkIfFirstWordRepeats(pPatternWords);
776+
UR_CHECK_ERROR(
777+
hipMemsetD16Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
778+
break;
779+
}
780+
default: {
781+
auto *pPatternWords = static_cast<const uint8_t *>(pPattern);
782+
checkIfFirstWordRepeats(pPatternWords);
783+
UR_CHECK_ERROR(
784+
hipMemsetD8Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
785+
break;
786+
}
787+
}
788+
789+
if (UseOnlyFirstValue) {
790+
return UR_RESULT_SUCCESS;
791+
}
751792

752793
// There is a bug in ROCm versions prior to 6.0.0 which causes hipMemset2D
753794
// to behave incorrectly when acting on host pinned memory.
@@ -761,7 +802,7 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
761802
// we need to check that isManaged attribute is false.
762803
if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
763804
const auto NumOfCopySteps = Size / PatternSize;
764-
const auto Offset = sizeof(uint32_t);
805+
const auto Offset = BackendWordSize;
765806
const auto LeftPatternSize = PatternSize - Offset;
766807
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
767808
reinterpret_cast<const uint8_t *>(pPattern) + Offset);
@@ -776,10 +817,12 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
776817
Stream));
777818
}
778819
} else {
779-
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
820+
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
821+
BackendWordSize);
780822
}
781823
#else
782-
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
824+
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
825+
BackendWordSize);
783826
#endif
784827
return UR_RESULT_SUCCESS;
785828
}

0 commit comments

Comments
 (0)