Skip to content

Commit f8ddbd0

Browse files
authored
Merge pull request llvm#567 from AMD-Lightning-Internal/amd/dev/rlieberm/jhuber-targ-gpuintr
[OpenMP] Replace use of target address space with <gpuintrin.h> local…
2 parents da2d0ed + 2b935c9 commit f8ddbd0

File tree

12 files changed

+51
-52
lines changed

12 files changed

+51
-52
lines changed

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 6 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,15 @@
1313
#define OMPTARGET_TYPES_H
1414

1515
#include "DevRTLExtras.h"
16+
#include <gpuintrin.h>
1617
#include <stddef.h>
1718
#include <stdint.h>
1819

20+
/// Alias for `T` qualified with the `__gpu_private` address-space macro
/// (presumably provided by <gpuintrin.h>, included above).
template <typename T> using Private = __gpu_private T;
21+
/// Alias for `T` qualified with the `__gpu_constant` address-space macro
/// (presumably provided by <gpuintrin.h>, included above).
template <typename T> using Constant = __gpu_constant T;
22+
/// Alias for `T` qualified with the `__gpu_local` address-space macro
/// (presumably provided by <gpuintrin.h>, included above). Used in place of
/// the former `[[clang::address_space(3)]]` spelling.
template <typename T> using Local = __gpu_local T;
23+
template <typename T> using Global = __gpu_local T;
24+
1925
enum omp_proc_bind_t {
2026
omp_proc_bind_false = 0,
2127
omp_proc_bind_true = 1,
@@ -156,19 +162,6 @@ typedef enum omp_allocator_handle_t {
156162
#define __PRAGMA(STR) _Pragma(#STR)
157163
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
158164

159-
#define SHARED(NAME) \
160-
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
161-
162-
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
163-
// now that's not the case.
164-
#define THREAD_LOCAL(NAME) \
165-
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
166-
167-
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
168-
// does?
169-
#define CONSTANT(NAME) \
170-
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
171-
172165
///}
173166

174167
#endif

offload/DeviceRTL/include/State.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,7 @@ struct TeamStateTy {
8787
ParallelRegionFnTy ParallelRegionFnVar;
8888
};
8989

90-
extern TeamStateTy [[clang::address_space(3)]] TeamState;
90+
extern Local<TeamStateTy> TeamState;
9191

9292
struct ThreadStateTy {
9393

@@ -113,7 +113,7 @@ struct ThreadStateTy {
113113
}
114114
};
115115

116-
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
116+
extern Local<ThreadStateTy **> ThreadStates;
117117

118118
/// Initialize the state machinery. Must be called by all threads.
119119
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,

offload/DeviceRTL/include/Xteamr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#define _UI unsigned int
2525
#define _UL unsigned long
2626
#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
27-
#define _RF_LDS volatile __attribute__((address_space(3)))
27+
// Qualifier combining `volatile` with the `__gpu_local` address-space macro.
// NOTE(review): `__gpu_local` is expected to come from <gpuintrin.h>; confirm
// this header reaches it through its includes.
#define _RF_LDS volatile __gpu_local
2828

2929
extern "C" {
3030
/// External cross team reduction (xteamr) helper functions

offload/DeviceRTL/include/Xteams.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
#define _UI unsigned int
2626
#define _UL unsigned long
2727
#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
28-
#define _RF_LDS volatile __attribute__((address_space(3)))
28+
// Qualifier combining `volatile` with the `__gpu_local` address-space macro.
// NOTE(review): `__gpu_local` is expected to come from <gpuintrin.h>; confirm
// this header reaches it through its includes.
#define _RF_LDS volatile __gpu_local
2929

3030
extern "C" {
3131
/// External cross team scan (xteams) helper functions

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ using namespace ompx;
2828
// This variable should be visible to the plugin so we override the default
2929
// hidden visibility.
3030
[[gnu::used, gnu::retain, gnu::weak,
31-
gnu::visibility("protected")]] DeviceEnvironmentTy
32-
CONSTANT(__omp_rtl_device_environment);
31+
gnu::visibility(
32+
"protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
3333

3434
uint32_t config::getAssumeTeamsOversubscription() {
3535
return __omp_rtl_assume_teams_oversubscription;

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -314,7 +314,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
314314

315315
// TODO: This is a workaround for initialization coming from kernels outside of
316316
// the TU. We will need to solve this more correctly in the future.
317-
[[gnu::weak]] int SHARED(IsSPMDMode);
317+
[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
318318

319319
void mapping::init(bool IsSPMD) {
320320
if (mapping::isInitialThreadInLevel0(IsSPMD))

offload/DeviceRTL/src/Reduction.cpp

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
7171
if (NumThreads == 1)
7272
return 1;
7373

74-
//
75-
// This reduce function handles reduction within a team. It handles
76-
// parallel regions in both L1 and L2 parallelism levels. It also
77-
// supports Generic, SPMD, and NoOMP modes.
78-
//
79-
// 1. Reduce within a warp.
80-
// 2. Warp master copies value to warp 0 via shared memory.
81-
// 3. Warp 0 reduces to a single value.
82-
// 4. The reduced value is available in the thread that returns 1.
83-
//
74+
//
75+
// This reduce function handles reduction within a team. It handles
76+
// parallel regions in both L1 and L2 parallelism levels. It also
77+
// supports Generic, SPMD, and NoOMP modes.
78+
//
79+
// 1. Reduce within a warp.
80+
// 2. Warp master copies value to warp 0 via shared memory.
81+
// 3. Warp 0 reduces to a single value.
82+
// 4. The reduced value is available in the thread that returns 1.
83+
//
8484

8585
#if __has_builtin(__nvvm_reflect)
8686
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
@@ -197,7 +197,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
197197
uint32_t NumThreads = omp_get_num_threads();
198198
uint32_t TeamId = omp_get_team_num();
199199
uint32_t NumTeams = omp_get_num_teams();
200-
static unsigned SHARED(ChunkTeamCount);
200+
[[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
201201

202202
// Block progress for teams greater than the current upper
203203
// limit. We always only allow a number of teams less or equal
@@ -284,8 +284,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
284284
uint32_t NumThreads = omp_get_num_threads();
285285
uint32_t TeamId = omp_get_team_num();
286286
uint32_t NumTeams = omp_get_num_teams();
287-
static unsigned SHARED(Bound);
288-
static unsigned SHARED(ChunkTeamCount);
287+
[[clang::loader_uninitialized]] static Local<unsigned> Bound;
288+
[[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
289289

290290
// Block progress for teams greater than the current upper
291291
// limit. We always only allow a number of teams less or equal

offload/DeviceRTL/src/State.cpp

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,17 @@ __attribute__((noinline)) void internal_free(void *Ptr);
3131
///{
3232

3333
/// External symbol to access dynamic shared memory.
34-
[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
35-
[[clang::address_space(3)]] DynamicSharedBuffer[];
34+
[[gnu::aligned(
35+
allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
3636

3737
/// The kernel environment passed to the init method by the compiler.
38-
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
38+
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
39+
KernelEnvironmentPtr;
3940

4041
/// The kernel launch environment passed as argument to the kernel by the
4142
/// runtime.
42-
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
43+
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
44+
KernelLaunchEnvironmentPtr;
4345

4446
///}
4547

@@ -165,7 +167,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
165167
"Shared scratchpad of this size not supported yet.");
166168

167169
/// The allocation of a single shared memory scratchpad.
168-
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
170+
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
171+
SharedMemorySmartStack;
169172

170173
void SharedMemorySmartStackTy::init(bool IsSPMD) {
171174
Usage[mapping::getThreadIdInBlock()] = 0;
@@ -277,8 +280,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
277280
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
278281
}
279282

280-
state::TeamStateTy SHARED(ompx::state::TeamState);
281-
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
283+
[[clang::loader_uninitialized]] Local<state::TeamStateTy>
284+
ompx::state::TeamState;
285+
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
286+
ompx::state::ThreadStates;
282287

283288
namespace {
284289

@@ -507,10 +512,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
507512
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
508513
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
509514

510-
[[clang::loader_uninitialized]] static void *[[clang::address_space(
511-
3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
512-
[[clang::loader_uninitialized]] static void **[[clang::address_space(
513-
3)]] SharedMemVariableSharingSpacePtr;
515+
[[clang::loader_uninitialized]] static Local<void *>
516+
SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
517+
[[clang::loader_uninitialized]] static Local<void **>
518+
SharedMemVariableSharingSpacePtr;
514519

515520
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
516521
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {

offload/DeviceRTL/src/Synchronization.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
9090
}
9191
}
9292

93-
uint32_t SHARED(namedBarrierTracker);
93+
[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
9494

9595
void namedBarrierInit() {
9696
// Don't have global ctors, and shared memory is not zero init

offload/DeviceRTL/src/Workshare.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
4545
#define LAST_CHUNK 2
4646

4747
// TODO: This variable is a hack inherited from the old runtime.
48-
static uint64_t SHARED(Cnt);
48+
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
4949

5050
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
5151
////////////////////////////////////////////////////////////////////////////////
@@ -475,7 +475,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
475475
//
476476
// __kmpc_dispatch_deinit
477477
//
478-
static DynamicScheduleTracker **SHARED(ThreadDST);
478+
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
479+
ThreadDST;
479480

480481
// Create a new DST, link the current one, and define the new as current.
481482
static DynamicScheduleTracker *pushDST() {

0 commit comments

Comments
 (0)