Skip to content

Commit 2cdbf13

Browse files
committed
Merge branch 'release-2.8.0'
2 parents 0f7c1fb + 985f189 commit 2cdbf13

File tree

93 files changed

+830
-5439
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

93 files changed

+830
-5439
lines changed

.ci/daint.cscs.ch/ocl.build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
3535
fi
3636
cd "${HOME}/libxsmm"
3737
git fetch
38-
git checkout d009b33e8742a93c9e1549323587fb6197451294
38+
git checkout 488aa88f2a9825e9f92a0cfc773c1aedf019f88a
3939
make -j
4040
cd ..
4141

.github/workflows/testing-linux.yml

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,16 @@ jobs:
7474
mv build/coverage.info build/coverage-Linux-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-cpu.info
7575
7676
- name: Upload coverage data
77-
uses: actions/upload-artifact@v3
77+
uses: actions/upload-artifact@v4
7878
with:
79-
name: coverage-data
79+
name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}
8080
path: build/coverage-*.info
8181

8282
- name: Upload coverage data (generated files)
83-
uses: actions/upload-artifact@v3
83+
uses: actions/upload-artifact@v4
8484
if: matrix.use_mpi == 'MPI=ON' && matrix.use_openmp == 'OPENMP=ON' && matrix.use_smm == 'SMM=blas' && matrix.mpi_suffix == 'openmpi'
8585
with:
86-
name: coverage-data
86+
name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}-generated-files
8787
path: |
8888
build/src/dbcsr.h
8989
build/src/tensors/dbcsr_tensor.h
@@ -200,9 +200,10 @@ jobs:
200200
- uses: actions/checkout@v4
201201

202202
- name: Download coverage data
203-
uses: actions/download-artifact@v3
203+
uses: actions/download-artifact@v4.1.7
204204
with:
205-
name: coverage-data
205+
pattern: coverage-data-*
206+
merge-multiple: true
206207

207208
- name: Combine coverage
208209
run: |
@@ -213,7 +214,7 @@ jobs:
213214
lcov --summary merged.info
214215
215216
- name: Upload merged HTML report
216-
uses: actions/upload-artifact@v3
217+
uses: actions/upload-artifact@v4
217218
with:
218219
name: html-report
219220
path: htmlcov

.pre-commit-config.yaml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ fail_fast: false
66
minimum_pre_commit_version: 3.2.0
77
repos:
88
- repo: https://github.com/astral-sh/ruff-pre-commit
9-
rev: 'v0.5.4'
9+
rev: 'v0.8.2'
1010
hooks:
1111
- id: ruff
1212
args: [ --fix, --exit-non-zero-on-fix ]
@@ -15,19 +15,19 @@ repos:
1515
.cp2k/.*|
1616
)$
1717
- repo: https://github.com/psf/black
18-
rev: 24.4.2
18+
rev: 24.10.0
1919
hooks:
2020
- id: black
2121
name: Reformat Python files with the black code formatter
2222
files: '^.*(/PACKAGE)|(\.py)$'
2323
- repo: https://github.com/pre-commit/pre-commit-hooks
24-
rev: v4.6.0
24+
rev: v5.0.0
2525
hooks:
2626
- id: check-ast
2727
- id: check-yaml
2828
- id: check-symlinks
2929
- id: trailing-whitespace
30-
- repo: https://github.com/pseewald/fprettify
30+
- repo: https://github.com/fortran-lang/fprettify
3131
rev: v0.3.7
3232
hooks:
3333
- id: fprettify
@@ -65,4 +65,5 @@ repos:
6565
language: python
6666
files: \.(c|cc|cxx|cpp|cl|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|mm|proto|textproto|vert)$
6767
args: ['-i', '-fallback-style=none', '--style=file']
68-
additional_dependencies: ['clang-format']
68+
# pin the version, since clang-format output is not stable from one version to the next
69+
additional_dependencies: ['clang-format~=19.1.0']

CMakeLists.txt

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,13 +120,7 @@ set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES}
120120

121121
option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF)
122122
option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF)
123-
option(WITH_G2G "Enable GPU aware MPI within CUDA/HIP backends" OFF)
124123

125-
if (WITH_G2G AND ((NOT USE_ACCEL) OR ((NOT USE_ACCEL MATCHES "cuda")
126-
AND (NOT USE_ACCEL MATCHES "hip"))))
127-
message(
128-
FATAL_ERROR "GPU aware MPI can only be enabled for HIP/CUDA GPU backends")
129-
endif ()
130124
# =================================================================================================
131125
# LANGUAGES AND TESTING
132126
enable_language(Fortran)
@@ -274,7 +268,6 @@ if (USE_ACCEL MATCHES "cuda")
274268
message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
275269
message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
276270
message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING})
277-
message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})
278271
endif ()
279272

280273
if (USE_ACCEL MATCHES "hip")
@@ -319,7 +312,6 @@ if (USE_ACCEL MATCHES "hip")
319312
message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
320313
message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
321314
message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING})
322-
message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})
323315

324316
# =================================== BLAS on GPU backend
325317
find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH})

VERSION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
MAJOR = 2
2-
MINOR = 7
2+
MINOR = 8
33
PATCH = 0
44
# A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
55
# it is considered Development version.
6-
DATE = 2024-07-29
6+
DATE = 2024-12-11
77

88

cmake/CompilerConfiguration.cmake

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
5151
if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")))
5252
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=leak")
5353
endif ()
54-
if (USE_ACCEL MATCHES "hip" AND hip_VERSION GREATER_EQUAL 6.0.0) # Remove deprecated function error with ROCm v6+
55-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
56-
endif ()
5754
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
5855
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops")
5956
set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage")

docs/guide/2-user-guide/1-installation/index.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ make
7070
-DUSE_ACCEL=<opencl|cuda|hip>
7171
-DWITH_CUDA_PROFILING=<OFF|ON>
7272
-DWITH_HIP_PROFILING=<OFF|ON>
73-
-DWITH_G2G=<OFF|ON>
7473
-DWITH_C_API=<ON|OFF>
7574
-DWITH_EXAMPLES=<ON|OFF>
7675
-DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50|Mi100|Mi250>

docs/guide/3-developer-guide/3-programming/1-overview/index.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,3 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid
5555
| `__CUDA_PROFILING` | Turns on the NVIDIA Tools Extension (NVTX); requires linking with `-lnvToolsExt` | Fortran, C, C++ |
5656
| `__CUDA` | Enable CUDA acceleration | C, C++ |
5757
| `__HIP` | Enable HIP acceleration | C, C++ |
58-
| `__DBCSR_ACC_G2G` | Enable GPU Aware MPI in CUDA and HIP backends | Fortran, C, C++ |

docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,3 @@ The batched matrix-matrix multiplication kernels are templated on:
1414
The batched transpose kernels are templated on:
1515

1616
* the characteristic dimensions of the transpose: `m, n`
17-
18-
## Predictive parameters
19-
20-
The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations).
21-
22-
![libsmm_acc_predictive_modeling_features](../../../../../media/images/libsmm_acc_predictive_modeling_features.png)

docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md

Lines changed: 0 additions & 3 deletions
This file was deleted.

docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md

Lines changed: 0 additions & 3 deletions
This file was deleted.

docs/media/images/README.md

Lines changed: 0 additions & 3 deletions
This file was deleted.
Binary file not shown.

docs/media/images/libsmm_acc_predictive_modeling_features.xml

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/CMakeLists.txt

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -284,18 +284,6 @@ if (USE_ACCEL)
284284
$<$<BOOL:${WITH_HIP_PROFILING}>:roctx64>
285285
$<$<BOOL:${WITH_HIP_PROFILING}>:roctracer64>
286286
$<$<STREQUAL:${USE_ACCEL},opencl>:OpenCL::OpenCL>)
287-
288-
if (WITH_G2G)
289-
target_compile_definitions(
290-
dbcsr
291-
PRIVATE __DBCSR_ACC_G2G
292-
$<$<STREQUAL:${USE_ACCEL},cuda>:__CUDA>
293-
$<$<STREQUAL:${USE_ACCEL},cuda>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
294-
$<$<STREQUAL:${USE_ACCEL},hip>:__HIP>
295-
$<$<STREQUAL:${USE_ACCEL},hip>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
296-
$<$<BOOL:${WITH_CUDA_PROFILING}>:__CUDA_PROFILING>
297-
$<$<BOOL:${WITH_HIP_PROFILING}>:__HIP_PROFILING>)
298-
endif ()
299287
endif ()
300288

301289
# =================================================================================================

src/acc/acc_bench_smm.c

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -222,21 +222,25 @@ int main(int argc, char* argv[]) {
222222
#endif
223223
CHECK(libsmm_acc_init(), &result, check); /* note: libsmm_acc_init() may imply acc_init() */
224224
if (EXIT_SUCCESS == result) {
225-
const char* const env_device = getenv("DEVICE");
226-
const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
227225
int ndevices = 0;
228226
result = c_dbcsr_acc_get_ndevices(&ndevices);
229-
if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
230-
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
231-
}
232-
else {
233-
if (0 >= ndevices) {
234-
fprintf(stderr, "ERROR: No ACC-device found!\n");
227+
if (EXIT_SUCCESS == result && 0 < ndevices) {
228+
const char* const env_device = getenv("DEVICE");
229+
const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
230+
const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
231+
int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
232+
device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
233+
result = c_dbcsr_acc_set_active_device(device);
234+
if (EXIT_SUCCESS == result) {
235+
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
235236
}
236237
else {
237-
fprintf(stderr, "ERROR: Failed to activate device %i of %i!\n", device, ndevices);
238+
fprintf(stderr, "ERROR: Failed to activate device!\n");
238239
}
239-
result = EXIT_FAILURE;
240+
}
241+
else {
242+
fprintf(stderr, "ERROR: No ACC-device found!\n");
243+
if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
240244
}
241245
if (EXIT_SUCCESS == result) {
242246
rnd = (int*)malloc(sizeof(int) * NRAND);
@@ -280,7 +284,7 @@ int main(int argc, char* argv[]) {
280284
#if defined(USE_LIBXSMM)
281285
libxsmm_timer_tickint start;
282286
int print_offset = 0;
283-
char print_buffer[1024];
287+
char print_buffer[1024] = "";
284288
# if defined(__OPENCL)
285289
const char* const env_smm_repeat = getenv("SMM_NREPEAT");
286290
const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
@@ -497,7 +501,7 @@ int main(int argc, char* argv[]) {
497501
if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
498502
if (0 < epsilon) {
499503
if (LIBXSMM_NOTNAN(diff.v_tst)) {
500-
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst));
504+
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs);
501505
}
502506
else {
503507
PRINTF(" (%g)\n", diff.v_tst);
@@ -508,6 +512,7 @@ int main(int argc, char* argv[]) {
508512
}
509513
if (0 < check && check < epsilon) result = EXIT_FAILURE;
510514
}
515+
else fprintf(stderr, "ERROR: failed to validate!\n");
511516
}
512517
# endif
513518
}

src/acc/acc_bench_trans.c

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -106,52 +106,48 @@ int main(int argc, char* argv[]) {
106106
#else
107107
const int warmup = 0;
108108
#endif
109-
const char* const env_device = getenv("DEVICE");
110-
const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
111109
int *stack_hst = NULL, *stack_dev = NULL;
112110
ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
113-
int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
111+
int result = EXIT_SUCCESS, mm = m, nn = n, r, i;
114112
void* stream = NULL;
115113
#if defined(USE_LIBXSMM)
116114
libxsmm_timer_tickint start;
117115
double duration;
118116
#endif
119117
assert(m <= (mn / n) && 0 == (mn % n));
118+
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
119+
fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
120+
result = EXIT_FAILURE;
121+
}
120122
CHECK(c_dbcsr_acc_init(), &result);
121123
/* note: libsmm_acc_init() may imply acc_init() */
122124
CHECK(libsmm_acc_init(), &result);
123125
if (EXIT_SUCCESS == result) {
126+
int ndevices = 0;
124127
result = c_dbcsr_acc_get_ndevices(&ndevices);
125-
if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
126-
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
127-
}
128-
else {
129-
if (0 >= ndevices) {
130-
fprintf(stderr, "No ACC-device found!\n");
128+
if (EXIT_SUCCESS == result && 0 < ndevices) {
129+
const char* const env_device = getenv("DEVICE");
130+
const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
131+
const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
132+
int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
133+
device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
134+
result = c_dbcsr_acc_set_active_device(device);
135+
if (EXIT_SUCCESS == result) {
136+
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
137+
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
138+
printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
131139
}
132140
else {
133-
fprintf(stderr, "Failed to activate device %i of %i!\n", device, ndevices);
141+
fprintf(stderr, "ERROR: Failed to activate device!\n");
134142
}
135-
#if !defined(__CUDA)
136-
CHECK(libsmm_acc_finalize(), NULL);
137-
#endif
138-
CHECK(c_dbcsr_acc_finalize(), NULL);
139-
return result;
143+
}
144+
else {
145+
fprintf(stderr, "ERROR: No ACC-device found!\n");
146+
if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
140147
}
141148
}
142149
else {
143150
fprintf(stderr, "ACC initialization failed!\n");
144-
#if !defined(__CUDA)
145-
CHECK(libsmm_acc_finalize(), NULL);
146-
#endif
147-
CHECK(c_dbcsr_acc_finalize(), NULL);
148-
return result;
149-
}
150-
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
151-
printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
152-
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
153-
fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
154-
result = EXIT_FAILURE;
155151
}
156152
#if defined(PRIORITY)
157153
CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax), &result);
@@ -259,7 +255,7 @@ int main(int argc, char* argv[]) {
259255
CHECK(c_dbcsr_acc_finalize(), NULL);
260256
if (EXIT_SUCCESS != result) {
261257
if (-1 != result) {
262-
fprintf(stderr, "FAILED\n");
258+
fprintf(stderr, "\nFAILED\n\n");
263259
}
264260
else {
265261
fprintf(stderr, "Kernel not suitable!\n");

0 commit comments

Comments
 (0)