Skip to content

Commit 86bc10a

Browse files
committed
accelerator/rocm: updates to the component
This PR removes the function table from the rocm component (and hence the dlopen functionality), as well as the lock used during initialization and shutdown. Some minor changes to the configure and Makefile logic are also required. Signed-off-by: Edgar Gabriel <Edgar.Gabriel@amd.com>
1 parent b4355c2 commit 86bc10a

File tree

6 files changed

+45
-304
lines changed

6 files changed

+45
-304
lines changed

config/opal_check_rocm.m4

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ dnl
2020
#
2121
AC_DEFUN([OPAL_CHECK_ROCM],[
2222
23-
OPAL_VAR_SCOPE_PUSH([opal_check_rocm_happy rocm_save_CPPFLAGS rocm_save_LDFLAGS rocm_CPPFLAGS rocm_LDFLAGS])
23+
OPAL_VAR_SCOPE_PUSH([opal_check_rocm_happy rocm_save_CPPFLAGS rocm_save_LDFLAGS rocm_save_LIBS rocm_CPPFLAGS rocm_LDFLAGS rocm_LIBS])
2424
2525
rocm_save_CPPFLAGS="$CPPFLAGS"
2626
rocm_save_LDFLAGS="$LDFLAGS"
27-
27+
rocm_save_LIBS="$LIBS"
28+
2829
# Get some configuration information
2930
AC_ARG_WITH([rocm],
3031
[AS_HELP_STRING([--with-rocm(=DIR)],
@@ -51,7 +52,8 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
5152
5253
LDFLAGS="$rocm_save_LDFLAGS"
5354
OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] )
54-
55+
LIBS="$rocm_save_LIBS"
56+
5557
AS_IF([ test "$opal_check_rocm_happy" = "no" ],
5658
[ CPPFLAGS="$rocm_save_CPPFLAGS"])
5759
@@ -70,13 +72,3 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
7072
AM_CONDITIONAL([OPAL_rocm_support], [test "$opal_check_rocm_happy" = "yes"])
7173
OPAL_VAR_SCOPE_POP
7274
])
73-
74-
AC_DEFUN([OPAL_CHECK_ROCM_AFTER_OPAL_DL],[
75-
# We cannot have ROCm support without OPAL DL support. Error out
76-
# if the user wants Rocm but we do not have OPAL DL support.
77-
AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && test "$opal_check_rocm_happy" = "yes"],
78-
[AC_MSG_WARN([--with-rocm was specified, but dlopen support is disabled.])
79-
AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
80-
AC_MSG_ERROR([Cannot continue.])])
81-
82-
])

configure.ac

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,18 +1222,6 @@ fi
12221222
# checkpoint results
12231223
AC_CACHE_SAVE
12241224

1225-
##################################
1226-
# CUDA: part two
1227-
##################################
1228-
1229-
# This is somewhat gross to have a configure check for a common MCA
1230-
# component outside of the normal MCA checks, but this check must come
1231-
# after the opal DL MCA checks have done. Someday this could perhaps
1232-
# be done better by having some kind of "run this check at the end of
1233-
# all other MCA checks" hook...?
1234-
1235-
OPAL_CHECK_ROCM_AFTER_OPAL_DL
1236-
12371225
##################################
12381226
# MPI Extended Interfaces
12391227
##################################

opal/mca/accelerator/rocm/Makefile.am

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ endif
3535
mcacomponentdir = $(opallibdir)
3636
mcacomponent_LTLIBRARIES = $(component_install)
3737
mca_accelerator_rocm_la_SOURCES = $(sources)
38-
mca_accelerator_rocm_la_LDFLAGS = -module -avoid-version
39-
mca_accelerator_rocm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la
38+
mca_accelerator_rocm_la_LDFLAGS = -module -avoid-version $(opal_rocm_LDFLAGS)
39+
mca_accelerator_rocm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
40+
$(opal_rocm_LIBS)
4041

4142
noinst_LTLIBRARIES = $(component_noinst)
4243
libmca_accelerator_rocm_la_SOURCES =$(sources)
43-
libmca_accelerator_rocm_la_LDFLAGS = -module -avoid-version
44+
libmca_accelerator_rocm_la_LDFLAGS = -module -avoid-version $(opal_rocm_LDFLAGS)
45+
libmca_accelerator_rocm_la_LIBADD = $(opal_rocm_LIBS)

opal/mca/accelerator/rocm/accelerator_rocm_component.c

Lines changed: 6 additions & 248 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,12 @@
2323
#include "opal/runtime/opal_params.h"
2424
#include "accelerator_rocm.h"
2525

26-
static struct opal_dl_handle_t* hip_handle = NULL;
27-
opal_accelerator_rocm_hipFunctionTable_t opal_accelerator_hip_funcs={};
28-
2926
int opal_accelerator_rocm_memcpy_async = 1;
3027
int opal_accelerator_rocm_verbose = 0;
3128
size_t opal_accelerator_rocm_memcpyD2H_limit=1024;
3229
size_t opal_accelerator_rocm_memcpyH2D_limit=1048576;
3330

3431
hipStream_t opal_accelerator_rocm_MemcpyStream = NULL;
35-
static opal_mutex_t opal_accelerator_rocm_init_lock = OPAL_MUTEX_STATIC_INIT;
36-
3732

3833
/*
3934
* Public string showing the accelerator rocm component version number
@@ -46,7 +41,7 @@ const char *opal_accelerator_rocm_component_version_string
4641
{ \
4742
hipError_t error = condition; \
4843
if (hipSuccess != error){ \
49-
const char* msg = HIP_FUNCS.hipGetErrorString(error); \
44+
const char* msg = hipGetErrorString(error); \
5045
opal_output(0, "HIP error: %d %s file: %s line: %d\n", error, msg, __FILE__, __LINE__); \
5146
return error; \
5247
} \
@@ -56,7 +51,7 @@ const char *opal_accelerator_rocm_component_version_string
5651
{ \
5752
hipError_t error = condition; \
5853
if (hipSuccess != error){ \
59-
const char* msg = HIP_FUNCS.hipGetErrorString(error); \
54+
const char* msg = hipGetErrorString(error); \
6055
opal_output(0, "HIP error: %d %s file: %s line: %d\n", error, msg, __FILE__, __LINE__); \
6156
return NULL; \
6257
} \
@@ -159,221 +154,6 @@ static int accelerator_rocm_component_register(void)
159154
return OPAL_SUCCESS;
160155
}
161156

162-
163-
static int hip_dl_init(void)
164-
{
165-
char *str;
166-
void *ptr;
167-
int ret = opal_dl_open("libamdhip64.so", false, false, &hip_handle, &str);
168-
if (OPAL_SUCCESS != ret) {
169-
opal_output(0, "Unable to open libamdhip64.so\n");
170-
return OPAL_ERROR;
171-
}
172-
173-
ret = opal_dl_lookup(hip_handle, "hipMalloc", &ptr, &str);
174-
if (OPAL_SUCCESS != ret) {
175-
opal_output(0, "Failed to find hipMalloc\n");
176-
dlclose(hip_handle);
177-
return OPAL_ERROR;
178-
}
179-
HIP_FUNCS.hipMalloc = (hipMalloc_t)ptr;
180-
181-
ret = opal_dl_lookup(hip_handle, "hipFree", &ptr, &str);
182-
if (OPAL_SUCCESS != ret) {
183-
opal_output_verbose(10, 0, "Failed to find hipFree\n");
184-
dlclose(hip_handle);
185-
return OPAL_ERROR;
186-
}
187-
HIP_FUNCS.hipFree = (hipFree_t)ptr;
188-
189-
ret = opal_dl_lookup(hip_handle, "hipMemcpy", &ptr, &str);
190-
if (OPAL_SUCCESS != ret) {
191-
opal_output_verbose(10, 0, "Failed to find hipMemcpy\n");
192-
dlclose(hip_handle);
193-
return OPAL_ERROR;
194-
}
195-
HIP_FUNCS.hipMemcpy = (hipMemcpy_t)ptr;
196-
197-
ret = opal_dl_lookup(hip_handle, "hipMemcpyAsync", &ptr, &str);
198-
if (OPAL_SUCCESS != ret) {
199-
opal_output_verbose(10, 0, "Failed to find hipMemcpyAsync\n");
200-
dlclose(hip_handle);
201-
return OPAL_ERROR;
202-
}
203-
HIP_FUNCS.hipMemcpyAsync = (hipMemcpyAsync_t)ptr;
204-
205-
ret = opal_dl_lookup(hip_handle, "hipMemcpy2D", &ptr, &str);
206-
if (OPAL_SUCCESS != ret) {
207-
opal_output_verbose(10, 0, "Failed to find hipMemcpy2D\n");
208-
dlclose(hip_handle);
209-
return OPAL_ERROR;
210-
}
211-
HIP_FUNCS.hipMemcpy2D = (hipMemcpy2D_t)ptr;
212-
213-
ret = opal_dl_lookup(hip_handle, "hipMemcpy2DAsync", &ptr, &str);
214-
if (OPAL_SUCCESS != ret) {
215-
opal_output_verbose(10, 0, "Failed to find hipMemcpy2DAsync\n");
216-
dlclose(hip_handle);
217-
return OPAL_ERROR;
218-
}
219-
HIP_FUNCS.hipMemcpy2DAsync = (hipMemcpy2DAsync_t)ptr;
220-
221-
ret = opal_dl_lookup(hip_handle, "hipMemGetAddressRange", &ptr, &str);
222-
if (OPAL_SUCCESS != ret) {
223-
opal_output_verbose(10, 0, "Failed to find hipMemGetAddressRange\n");
224-
dlclose(hip_handle);
225-
return OPAL_ERROR;
226-
}
227-
HIP_FUNCS.hipMemGetAddressRange = (hipMemGetAddressRange_t)ptr;
228-
229-
ret = opal_dl_lookup(hip_handle, "hipHostRegister", &ptr, &str);
230-
if (OPAL_SUCCESS != ret) {
231-
opal_output_verbose(10, 0, "Failed to find hipHostRegister\n");
232-
dlclose(hip_handle);
233-
return OPAL_ERROR;
234-
}
235-
HIP_FUNCS.hipHostRegister = (hipHostRegister_t)ptr;
236-
237-
ret = opal_dl_lookup(hip_handle, "hipHostUnregister", &ptr, &str);
238-
if (OPAL_SUCCESS != ret) {
239-
opal_output_verbose(10, 0, "Failed to find hipHostUnregister\n");
240-
dlclose(hip_handle);
241-
return OPAL_ERROR;
242-
}
243-
HIP_FUNCS.hipHostUnregister = (hipHostUnregister_t)ptr;
244-
245-
ret = opal_dl_lookup(hip_handle, "hipStreamCreate", &ptr, &str);
246-
if (OPAL_SUCCESS != ret) {
247-
opal_output_verbose(10, 0, "Failed to find hipStreamCreate\n");
248-
dlclose(hip_handle);
249-
return OPAL_ERROR;
250-
}
251-
HIP_FUNCS.hipStreamCreate = (hipStreamCreate_t)ptr;
252-
253-
ret = opal_dl_lookup(hip_handle, "hipStreamDestroy", &ptr, &str);
254-
if (OPAL_SUCCESS != ret) {
255-
opal_output_verbose(10, 0, "Failed to find hipStreamDestroy\n");
256-
dlclose(hip_handle);
257-
return OPAL_ERROR;
258-
}
259-
HIP_FUNCS.hipStreamDestroy = (hipStreamDestroy_t)ptr;
260-
261-
ret = opal_dl_lookup(hip_handle, "hipStreamSynchronize", &ptr, &str);
262-
if (OPAL_SUCCESS != ret) {
263-
opal_output_verbose(10, 0, "Failed to find hipStreamSynchronize\n");
264-
dlclose(hip_handle);
265-
return OPAL_ERROR;
266-
}
267-
HIP_FUNCS.hipStreamSynchronize = (hipStreamSynchronize_t)ptr;
268-
269-
ret = opal_dl_lookup(hip_handle, "hipGetErrorString", &ptr, &str);
270-
if (OPAL_SUCCESS != ret) {
271-
opal_output_verbose(10, 0, "Failed to find hipGetErrorString\n");
272-
dlclose(hip_handle);
273-
return OPAL_ERROR;
274-
}
275-
HIP_FUNCS.hipGetErrorString = (hipGetErrorString_t)ptr;
276-
277-
ret = opal_dl_lookup(hip_handle, "hipPointerGetAttributes", &ptr, &str);
278-
if (OPAL_SUCCESS != ret) {
279-
opal_output_verbose(10, 0, "Failed to find hipPointerGetAttributes\n");
280-
dlclose(hip_handle);
281-
return OPAL_ERROR;
282-
}
283-
HIP_FUNCS.hipPointerGetAttributes = (hipPointerGetAttributes_t)ptr;
284-
285-
ret = opal_dl_lookup(hip_handle, "hipEventCreateWithFlags", &ptr, &str);
286-
if (OPAL_SUCCESS != ret) {
287-
opal_output_verbose(10, 0, "Failed to find hipEventCreateWithFlags\n");
288-
dlclose(hip_handle);
289-
return OPAL_ERROR;
290-
}
291-
HIP_FUNCS.hipEventCreateWithFlags = (hipEventCreateWithFlags_t)ptr;
292-
293-
ret = opal_dl_lookup(hip_handle, "hipEventDestroy", &ptr, &str);
294-
if (OPAL_SUCCESS != ret) {
295-
opal_output_verbose(10, 0, "Failed to find hipEventDestroy\n");
296-
dlclose(hip_handle);
297-
return OPAL_ERROR;
298-
}
299-
HIP_FUNCS.hipEventDestroy = (hipEventDestroy_t)ptr;
300-
301-
ret = opal_dl_lookup(hip_handle, "hipEventRecord", &ptr, &str);
302-
if (OPAL_SUCCESS != ret) {
303-
opal_output_verbose(10, 0, "Failed to find hipEventRecord\n");
304-
dlclose(hip_handle);
305-
return OPAL_ERROR;
306-
}
307-
HIP_FUNCS.hipEventRecord = (hipEventRecord_t)ptr;
308-
309-
ret = opal_dl_lookup(hip_handle, "hipEventQuery", &ptr, &str);
310-
if (OPAL_SUCCESS != ret) {
311-
opal_output_verbose(10, 0, "Failed to find hipEventQuery\n");
312-
dlclose(hip_handle);
313-
return OPAL_ERROR;
314-
}
315-
HIP_FUNCS.hipEventQuery = (hipEventQuery_t)ptr;
316-
317-
ret = opal_dl_lookup(hip_handle, "hipEventSynchronize", &ptr, &str);
318-
if (OPAL_SUCCESS != ret) {
319-
opal_output_verbose(10, 0, "Failed to find hipEventSynchronize\n");
320-
dlclose(hip_handle);
321-
return OPAL_ERROR;
322-
}
323-
HIP_FUNCS.hipEventSynchronize = (hipEventSynchronize_t)ptr;
324-
325-
ret = opal_dl_lookup(hip_handle, "hipIpcGetMemHandle", &ptr, &str);
326-
if (OPAL_SUCCESS != ret) {
327-
opal_output_verbose(10, 0, "Failed to find hipIpcGetMemHandle\n");
328-
dlclose(hip_handle);
329-
return OPAL_ERROR;
330-
}
331-
HIP_FUNCS.hipIpcGetMemHandle = (hipIpcGetMemHandle_t)ptr;
332-
333-
ret = opal_dl_lookup(hip_handle, "hipIpcOpenMemHandle", &ptr, &str);
334-
if (OPAL_SUCCESS != ret) {
335-
opal_output_verbose(10, 0, "Failed to find hipIpcOpenMemHandle\n");
336-
dlclose(hip_handle);
337-
return OPAL_ERROR;
338-
}
339-
HIP_FUNCS.hipIpcOpenMemHandle = (hipIpcOpenMemHandle_t)ptr;
340-
341-
ret = opal_dl_lookup(hip_handle, "hipIpcCloseMemHandle", &ptr, &str);
342-
if (OPAL_SUCCESS != ret) {
343-
opal_output_verbose(10, 0, "Failed to find hipIpcCloseMemHandle\n");
344-
dlclose(hip_handle);
345-
return OPAL_ERROR;
346-
}
347-
HIP_FUNCS.hipIpcCloseMemHandle = (hipIpcCloseMemHandle_t)ptr;
348-
349-
ret = opal_dl_lookup(hip_handle, "hipGetDevice", &ptr, &str);
350-
if (OPAL_SUCCESS != ret) {
351-
opal_output_verbose(10, 0, "Failed to find hipGetDevice\n");
352-
dlclose(hip_handle);
353-
return OPAL_ERROR;
354-
}
355-
HIP_FUNCS.hipGetDevice = (hipGetDevice_t)ptr;
356-
357-
ret = opal_dl_lookup(hip_handle, "hipGetDeviceCount", &ptr, &str);
358-
if (OPAL_SUCCESS != ret) {
359-
opal_output_verbose(10, 0, "Failed to find hipGetDeviceCount\n");
360-
dlclose(hip_handle);
361-
return OPAL_ERROR;
362-
}
363-
HIP_FUNCS.hipGetDeviceCount = (hipGetDeviceCount_t)ptr;
364-
365-
ret = opal_dl_lookup(hip_handle, "hipDeviceCanAccessPeer", &ptr, &str);
366-
if (OPAL_SUCCESS != ret) {
367-
opal_output_verbose(10, 0, "Failed to find hipDeviceCanAccessPeer\n");
368-
dlclose(hip_handle);
369-
return OPAL_ERROR;
370-
}
371-
HIP_FUNCS.hipDeviceCanAccessPeer = (hipDeviceCanAccessPeer_t)ptr;
372-
373-
return OPAL_SUCCESS;
374-
}
375-
376-
377157
static opal_accelerator_base_module_t* accelerator_rocm_init(void)
378158
{
379159
hipError_t err;
@@ -382,57 +162,35 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void)
382162
return NULL;
383163
}
384164

385-
OPAL_THREAD_LOCK(&opal_accelerator_rocm_init_lock);
386-
387-
if (opal_rocm_runtime_initialized) {
388-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
389-
return NULL;
390-
}
391-
392-
if (OPAL_SUCCESS != hip_dl_init()) {
393-
opal_output(0, "Could not open libamdhip64.so. Please check your LD_LIBRARY_PATH\n");
394-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
395-
return NULL;
396-
}
397-
398165
int count=0;
399-
err = HIP_FUNCS.hipGetDeviceCount(&count);
166+
err = hipGetDeviceCount(&count);
400167
if (hipSuccess != err || 0 == count) {
401168
opal_output(0, "No HIP capabale device found. Disabling component.\n");
402-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
403169
return NULL;
404170
}
405171

406-
err = HIP_FUNCS.hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
172+
err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
407173
if (hipSuccess != err) {
408174
opal_output(0, "Could not create hipStream, err=%d %s\n",
409-
err, HIP_FUNCS.hipGetErrorString(err));
410-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
175+
err, hipGetErrorString(err));
411176
return NULL;
412177
}
413178

414179
opal_atomic_mb();
415180
opal_rocm_runtime_initialized = true;
416-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
417181

418182
return &opal_accelerator_rocm_module;
419183
}
420184

421185
static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module)
422186
{
423-
OPAL_THREAD_LOCK(&opal_accelerator_rocm_init_lock);
424187
if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) {
425-
hipError_t err = HIP_FUNCS.hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
188+
hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
426189
if (hipSuccess != err) {
427190
opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n");
428191
}
429192
opal_accelerator_rocm_MemcpyStream = NULL;
430193
}
431-
if (NULL != hip_handle) {
432-
opal_dl_close(hip_handle);
433-
hip_handle = NULL;
434-
}
435-
OPAL_THREAD_UNLOCK(&opal_accelerator_rocm_init_lock);
436194

437195
return;
438196
}

0 commit comments

Comments
 (0)