Skip to content

Commit 5f4fdd5

Browse files
authored
Merge pull request #10874 from edgargabriel/topic/accelerator-rocm-updates
accelerator/rocm: updates to the component
2 parents e498938 + 86bc10a commit 5f4fdd5

File tree

6 files changed

+45
-304
lines changed

6 files changed

+45
-304
lines changed

config/opal_check_rocm.m4

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ dnl
2020
#
2121
AC_DEFUN([OPAL_CHECK_ROCM],[
2222
23-
OPAL_VAR_SCOPE_PUSH([opal_check_rocm_happy rocm_save_CPPFLAGS rocm_save_LDFLAGS rocm_CPPFLAGS rocm_LDFLAGS])
23+
OPAL_VAR_SCOPE_PUSH([opal_check_rocm_happy rocm_save_CPPFLAGS rocm_save_LDFLAGS rocm_save_LIBS rocm_CPPFLAGS rocm_LDFLAGS rocm_LIBS])
2424
2525
rocm_save_CPPFLAGS="$CPPFLAGS"
2626
rocm_save_LDFLAGS="$LDFLAGS"
27-
27+
rocm_save_LIBS="$LIBS"
28+
2829
# Get some configuration information
2930
AC_ARG_WITH([rocm],
3031
[AS_HELP_STRING([--with-rocm(=DIR)],
@@ -51,7 +52,8 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
5152
5253
LDFLAGS="$rocm_save_LDFLAGS"
5354
OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] )
54-
55+
LIBS="$rocm_save_LIBS"
56+
5557
AS_IF([ test "$opal_check_rocm_happy" = "no" ],
5658
[ CPPFLAGS="$rocm_save_CPPFLAGS"])
5759
@@ -70,13 +72,3 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
7072
AM_CONDITIONAL([OPAL_rocm_support], [test "$opal_check_rocm_happy" = "yes"])
7173
OPAL_VAR_SCOPE_POP
7274
])
73-
74-
AC_DEFUN([OPAL_CHECK_ROCM_AFTER_OPAL_DL],[
75-
# We cannot have ROCm support without OPAL DL support. Error out
76-
# if the user wants ROCm but we do not have OPAL DL support.
77-
AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && test "$opal_check_rocm_happy" = "yes"],
78-
[AC_MSG_WARN([--with-rocm was specified, but dlopen support is disabled.])
79-
AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
80-
AC_MSG_ERROR([Cannot continue.])])
81-
82-
])

configure.ac

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,18 +1222,6 @@ fi
12221222
# checkpoint results
12231223
AC_CACHE_SAVE
12241224

1225-
##################################
1226-
# CUDA: part two
1227-
##################################
1228-
1229-
# This is somewhat gross to have a configure check for a common MCA
1230-
# component outside of the normal MCA checks, but this check must come
1231-
# after the opal DL MCA checks have been done. Someday this could perhaps
1232-
# be done better by having some kind of "run this check at the end of
1233-
# all other MCA checks" hook...?
1234-
1235-
OPAL_CHECK_ROCM_AFTER_OPAL_DL
1236-
12371225
##################################
12381226
# MPI Extended Interfaces
12391227
##################################

opal/mca/accelerator/rocm/Makefile.am

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ endif
3535
mcacomponentdir = $(opallibdir)
3636
mcacomponent_LTLIBRARIES = $(component_install)
3737
mca_accelerator_rocm_la_SOURCES = $(sources)
38-
mca_accelerator_rocm_la_LDFLAGS = -module -avoid-version
39-
mca_accelerator_rocm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la
38+
mca_accelerator_rocm_la_LDFLAGS = -module -avoid-version $(opal_rocm_LDFLAGS)
39+
mca_accelerator_rocm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
40+
$(opal_rocm_LIBS)
4041

4142
noinst_LTLIBRARIES = $(component_noinst)
4243
libmca_accelerator_rocm_la_SOURCES =$(sources)
43-
libmca_accelerator_rocm_la_LDFLAGS = -module -avoid-version
44+
libmca_accelerator_rocm_la_LDFLAGS = -module -avoid-version $(opal_rocm_LDFLAGS)
45+
libmca_accelerator_rocm_la_LIBADD = $(opal_rocm_LIBS)

opal/mca/accelerator/rocm/accelerator_rocm_component.c

Lines changed: 6 additions & 248 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,12 @@
2323
#include "opal/runtime/opal_params.h"
2424
#include "accelerator_rocm.h"
2525

26-
static struct opal_dl_handle_t* hip_handle = NULL;
27-
opal_accelerator_rocm_hipFunctionTable_t opal_accelerator_hip_funcs={};
28-
2926
int opal_accelerator_rocm_memcpy_async = 1;
3027
int opal_accelerator_rocm_verbose = 0;
3128
size_t opal_accelerator_rocm_memcpyD2H_limit=1024;
3229
size_t opal_accelerator_rocm_memcpyH2D_limit=1048576;
3330

3431
hipStream_t opal_accelerator_rocm_MemcpyStream = NULL;
35-
static opal_mutex_t opal_accelerator_rocm_init_lock = OPAL_MUTEX_STATIC_INIT;
36-
3732

3833
/*
3934
* Public string showing the accelerator rocm component version number
@@ -46,7 +41,7 @@ const char *opal_accelerator_rocm_component_version_string
4641
{ \
4742
hipError_t error = condition; \
4843
if (hipSuccess != error){ \
49-
const char* msg = HIP_FUNCS.hipGetErrorString(error); \
44+
const char* msg = hipGetErrorString(error); \
5045
opal_output(0, "HIP error: %d %s file: %s line: %d\n", error, msg, __FILE__, __LINE__); \
5146
return error; \
5247
} \
@@ -56,7 +51,7 @@ const char *opal_accelerator_rocm_component_version_string
5651
{ \
5752
hipError_t error = condition; \
5853
if (hipSuccess != error){ \
59-
const char* msg = HIP_FUNCS.hipGetErrorString(error); \
54+
const char* msg = hipGetErrorString(error); \
6055
opal_output(0, "HIP error: %d %s file: %s line: %d\n", error, msg, __FILE__, __LINE__); \
6156
return NULL; \
6257
} \
@@ -159,221 +154,6 @@ static int accelerator_rocm_component_register(void)
159154
return OPAL_SUCCESS;
160155
}
161156

162-
163-
static int hip_dl_init(void)
164-
{
165-
char *str;
166-
void *ptr;
167-
int ret = opal_dl_open("libamdhip64.so", false, false, &hip_handle, &str);
168-
if (OPAL_SUCCESS != ret) {
169-
opal_output(0, "Unable to open libamdhip64.so\n");
170-
return OPAL_ERROR;
171-
}
172-
173-
ret = opal_dl_lookup(hip_handle, "hipMalloc", &ptr, &str);
174-
if (OPAL_SUCCESS != ret) {
175-
opal_output(0, "Failed to find hipMalloc\n");
176-
dlclose(hip_handle);
177-
return OPAL_ERROR;
178-
}
179-
HIP_FUNCS.hipMalloc = (hipMalloc_t)ptr;
180-
181-
ret = opal_dl_lookup(hip_handle, "hipFree", &ptr, &str);
182-
if (OPAL_SUCCESS != ret) {
183-
opal_output_verbose(10, 0, "Failed to find hipFree\n");
184-
dlclose(hip_handle);
185-
return OPAL_ERROR;
186-
}
187-
HIP_FUNCS.hipFree = (hipFree_t)ptr;
188-
189-
ret = opal_dl_lookup(hip_handle, "hipMemcpy", &ptr, &str);
190-
if (OPAL_SUCCESS != ret) {
191-
opal_output_verbose(10, 0, "Failed to find hipMemcpy\n");
192-
dlclose(hip_handle);
193-
return OPAL_ERROR;
194-
}
195-
HIP_FUNCS.hipMemcpy = (hipMemcpy_t)ptr;
196-
197-
ret = opal_dl_lookup(hip_handle, "hipMemcpyAsync", &ptr, &str);
198-
if (OPAL_SUCCESS != ret) {
199-
opal_output_verbose(10, 0, "Failed to find hipMemcpyAsync\n");
200-
dlclose(hip_handle);
201-
return OPAL_ERROR;
202-
}
203-
HIP_FUNCS.hipMemcpyAsync = (hipMemcpyAsync_t)ptr;
204-
205-
ret = opal_dl_lookup(hip_handle, "hipMemcpy2D", &ptr, &str);
206-
if (OPAL_SUCCESS != ret) {
207-
opal_output_verbose(10, 0, "Failed to find hipMemcpy2D\n");
208-
dlclose(hip_handle);
209-
return OPAL_ERROR;
210-
}
211-
HIP_FUNCS.hipMemcpy2D = (hipMemcpy2D_t)ptr;
212-
213-
ret = opal_dl_lookup(hip_handle, "hipMemcpy2DAsync", &ptr, &str);
214-
if (OPAL_SUCCESS != ret) {
215-
opal_output_verbose(10, 0, "Failed to find hipMemcpy2DAsync\n");
216-
dlclose(hip_handle);
217-
return OPAL_ERROR;
218-
}
219-
HIP_FUNCS.hipMemcpy2DAsync = (hipMemcpy2DAsync_t)ptr;
220-
221-
ret = opal_dl_lookup(hip_handle, "hipMemGetAddressRange", &ptr, &str);
222-
if (OPAL_SUCCESS != ret) {
223-
opal_output_verbose(10, 0, "Failed to find hipMemGetAddressRange\n");
224-
dlclose(hip_handle);
225-
return OPAL_ERROR;
226-
}
227-
HIP_FUNCS.hipMemGetAddressRange = (hipMemGetAddressRange_t)ptr;
228-
229-
ret = opal_dl_lookup(hip_handle, "hipHostRegister", &ptr, &str);
230-
if (OPAL_SUCCESS != ret) {
231-
opal_output_verbose(10, 0, "Failed to find hipHostRegister\n");
232-
dlclose(hip_handle);
233-
return OPAL_ERROR;
234-
}
235-
HIP_FUNCS.hipHostRegister = (hipHostRegister_t)ptr;
236-
237-
ret = opal_dl_lookup(hip_handle, "hipHostUnregister", &ptr, &str);
238-
if (OPAL_SUCCESS != ret) {
239-
opal_output_verbose(10, 0, "Failed to find hipHostUnregister\n");
240-
dlclose(hip_handle);
241-
return OPAL_ERROR;
242-
}
243-
HIP_FUNCS.hipHostUnregister = (hipHostUnregister_t)ptr;
244-
245-
ret = opal_dl_lookup(hip_handle, "hipStreamCreate", &ptr, &str);
246-
if (OPAL_SUCCESS != ret) {
247-
opal_output_verbose(10, 0, "Failed to find hipStreamCreate\n");
248-
dlclose(hip_handle);
249-
return OPAL_ERROR;
250-
}
251-
HIP_FUNCS.hipStreamCreate = (hipStreamCreate_t)ptr;
252-
253-
ret = opal_dl_lookup(hip_handle, "hipStreamDestroy", &ptr, &str);
254-
if (OPAL_SUCCESS != ret) {
255-
opal_output_verbose(10, 0, "Failed to find hipStreamDestroy\n");
256-
dlclose(hip_handle);
257-
return OPAL_ERROR;
258-
}
259-
HIP_FUNCS.hipStreamDestroy = (hipStreamDestroy_t)ptr;
260-
261-
ret = opal_dl_lookup(hip_handle, "hipStreamSynchronize", &ptr, &str);
262-
if (OPAL_SUCCESS != ret) {
263-
opal_output_verbose(10, 0, "Failed to find hipStreamSynchronize\n");
264-
dlclose(hip_handle);
265-
return OPAL_ERROR;
266-
}
267-
HIP_FUNCS.hipStreamSynchronize = (hipStreamSynchronize_t)ptr;
268-
269-
ret = opal_dl_lookup(hip_handle, "hipGetErrorString", &ptr, &str);
270-
if (OPAL_SUCCESS != ret) {
271-
opal_output_verbose(10, 0, "Failed to find hipGetErrorString\n");
272-
dlclose(hip_handle);
273-
return OPAL_ERROR;
274-
}
275-
HIP_FUNCS.hipGetErrorString = (hipGetErrorString_t)ptr;
276-
277-
ret = opal_dl_lookup(hip_handle, "hipPointerGetAttributes", &ptr, &str);
278-
if (OPAL_SUCCESS != ret) {
279-
opal_output_verbose(10, 0, "Failed to find hipPointerGetAttributes\n");
280-
dlclose(hip_handle);
281-
return OPAL_ERROR;
282-
}
283-
HIP_FUNCS.hipPointerGetAttributes = (hipPointerGetAttributes_t)ptr;
284-
285-
ret = opal_dl_lookup(hip_handle, "hipEventCreateWithFlags", &ptr, &str);
286-
if (OPAL_SUCCESS != ret) {
287-
opal_output_verbose(10, 0, "Failed to find hipEventCreateWithFlags\n");
288-
dlclose(hip_handle);
289-
return OPAL_ERROR;
290-
}
291-
HIP_FUNCS.hipEventCreateWithFlags = (hipEventCreateWithFlags_t)ptr;
292-
293-
ret = opal_dl_lookup(hip_handle, "hipEventDestroy", &ptr, &str);
294-
if (OPAL_SUCCESS != ret) {
295-
opal_output_verbose(10, 0, "Failed to find hipEventDestroy\n");
296-
dlclose(hip_handle);
297-
return OPAL_ERROR;
298-
}
299-
HIP_FUNCS.hipEventDestroy = (hipEventDestroy_t)ptr;
300-
301-
ret = opal_dl_lookup(hip_handle, "hipEventRecord", &ptr, &str);
302-
if (OPAL_SUCCESS != ret) {
303-
opal_output_verbose(10, 0, "Failed to find hipEventRecord\n");
304-
dlclose(hip_handle);
305-
return OPAL_ERROR;
306-
}
307-
HIP_FUNCS.hipEventRecord = (hipEventRecord_t)ptr;
308-
309-
ret = opal_dl_lookup(hip_handle, "hipEventQuery", &ptr, &str);
310-
if (OPAL_SUCCESS != ret) {
311-
opal_output_verbose(10, 0, "Failed to find hipEventQuery\n");
312-
dlclose(hip_handle);
313-
return OPAL_ERROR;
314-
}
315-
HIP_FUNCS.hipEventQuery = (hipEventQuery_t)ptr;
316-
317-
ret = opal_dl_lookup(hip_handle, "hipEventSynchronize", &ptr, &str);
318-
if (OPAL_SUCCESS != ret) {
319-
opal_output_verbose(10, 0, "Failed to find hipEventSynchronize\n");
320-
dlclose(hip_handle);
321-
return OPAL_ERROR;
322-
}
323-
HIP_FUNCS.hipEventSynchronize = (hipEventSynchronize_t)ptr;
324-
325-
ret = opal_dl_lookup(hip_handle, "hipIpcGetMemHandle", &ptr, &str);
326-
if (OPAL_SUCCESS != ret) {
327-
opal_output_verbose(10, 0, "Failed to find hipIpcGetMemHandle\n");
328-
dlclose(hip_handle);
329-
return OPAL_ERROR;
330-
}
331-
HIP_FUNCS.hipIpcGetMemHandle = (hipIpcGetMemHandle_t)ptr;
332-
333-
ret = opal_dl_lookup(hip_handle, "hipIpcOpenMemHandle", &ptr, &str);
334-
if (OPAL_SUCCESS != ret) {
335-
opal_output_verbose(10, 0, "Failed to find hipIpcOpenMemHandle\n");
336-
dlclose(hip_handle);
337-
return OPAL_ERROR;
338-
}
339-
HIP_FUNCS.hipIpcOpenMemHandle = (hipIpcOpenMemHandle_t)ptr;
340-
341-
ret = opal_dl_lookup(hip_handle, "hipIpcCloseMemHandle", &ptr, &str);
342-
if (OPAL_SUCCESS != ret) {
343-
opal_output_verbose(10, 0, "Failed to find hipIpcCloseMemHandle\n");
344-
dlclose(hip_handle);
345-
return OPAL_ERROR;
346-
}
347-
HIP_FUNCS.hipIpcCloseMemHandle = (hipIpcCloseMemHandle_t)ptr;
348-
349-
ret = opal_dl_lookup(hip_handle, "hipGetDevice", &ptr, &str);
350-
if (OPAL_SUCCESS != ret) {
351-
opal_output_verbose(10, 0, "Failed to find hipGetDevice\n");
352-
dlclose(hip_handle);
353-
return OPAL_ERROR;
354-
}
355-
HIP_FUNCS.hipGetDevice = (hipGetDevice_t)ptr;
356-
357-
ret = opal_dl_lookup(hip_handle, "hipGetDeviceCount", &ptr, &str);
358-
if (OPAL_SUCCESS != ret) {
359-
opal_output_verbose(10, 0, "Failed to find hipGetDeviceCount\n");
360-
dlclose(hip_handle);
361-
return OPAL_ERROR;
362-
}
363-
HIP_FUNCS.hipGetDeviceCount = (hipGetDeviceCount_t)ptr;
364-
365-
ret = opal_dl_lookup(hip_handle, "hipDeviceCanAccessPeer", &ptr, &str);
366-
if (OPAL_SUCCESS != ret) {
367-
opal_output_verbose(10, 0, "Failed to find hipDeviceCanAccessPeer\n");
368-
dlclose(hip_handle);
369-
return OPAL_ERROR;
370-
}
371-
HIP_FUNCS.hipDeviceCanAccessPeer = (hipDeviceCanAccessPeer_t)ptr;
372-
373-
return OPAL_SUCCESS;
374-
}
375-
376-
377157
static opal_accelerator_base_module_t* accelerator_rocm_init(void)
378158
{
379159
hipError_t err;
@@ -382,57 +162,35 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void)
382162
return NULL;
383163
}
384164

385-
OPAL_THREAD_LOCK(&opal_accelerator_rocm_init_lock);
386-
387-
if (opal_rocm_runtime_initialized) {
388-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
389-
return NULL;
390-
}
391-
392-
if (OPAL_SUCCESS != hip_dl_init()) {
393-
opal_output(0, "Could not open libamdhip64.so. Please check your LD_LIBRARY_PATH\n");
394-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
395-
return NULL;
396-
}
397-
398165
int count=0;
399-
err = HIP_FUNCS.hipGetDeviceCount(&count);
166+
err = hipGetDeviceCount(&count);
400167
if (hipSuccess != err || 0 == count) {
401168
opal_output(0, "No HIP capabale device found. Disabling component.\n");
402-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
403169
return NULL;
404170
}
405171

406-
err = HIP_FUNCS.hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
172+
err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
407173
if (hipSuccess != err) {
408174
opal_output(0, "Could not create hipStream, err=%d %s\n",
409-
err, HIP_FUNCS.hipGetErrorString(err));
410-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
175+
err, hipGetErrorString(err));
411176
return NULL;
412177
}
413178

414179
opal_atomic_mb();
415180
opal_rocm_runtime_initialized = true;
416-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
417181

418182
return &opal_accelerator_rocm_module;
419183
}
420184

421185
static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module)
422186
{
423-
OPAL_THREAD_LOCK(&opal_accelerator_rocm_init_lock);
424187
if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) {
425-
hipError_t err = HIP_FUNCS.hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
188+
hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
426189
if (hipSuccess != err) {
427190
opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n");
428191
}
429192
opal_accelerator_rocm_MemcpyStream = NULL;
430193
}
431-
if (NULL != hip_handle) {
432-
opal_dl_close(hip_handle);
433-
hip_handle = NULL;
434-
}
435-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
436194

437195
return;
438196
}

0 commit comments

Comments
 (0)