Skip to content

Commit 6511333

Browse files
authored
[SYCLomatic][ASM] Support migration of PTX instruction prmt.b32 (#2708)
Signed-off-by: chenwei.sun <chenwei.sun@intel.com>
1 parent 0ae1059 commit 6511333

File tree

6 files changed

+127
-3
lines changed

6 files changed

+127
-3
lines changed

clang/lib/DPCT/RulesAsm/AsmMigration.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2858,6 +2858,45 @@ class SYCLGen : public SYCLGenBase {
28582858
"rewrite the code.");
28592859
return SYCLGenSuccess();
28602860
}
2861+
2862+
bool handle_prmt(const InlineAsmInstruction *Inst) override {
2863+
if (Inst->getNumInputOperands() != 3 || Inst->getNumTypes() != 1)
2864+
return SYCLGenError();
2865+
2866+
if (emitStmt(Inst->getOutputOperand()))
2867+
return SYCLGenError();
2868+
OS() << " = " << MapNames::getDpctNamespace()
2869+
<< "byte_level_permute_custom(";
2870+
2871+
llvm::SaveAndRestore<const InlineAsmInstruction *> Save(CurrInst);
2872+
CurrInst = Inst;
2873+
std::string Op[3];
2874+
if (tryEmitAllInputOperands(Op, Inst))
2875+
return SYCLGenError();
2876+
2877+
OS() << Op[0] << ", ";
2878+
OS() << Op[1] << ", ";
2879+
OS() << Op[2] << ", ";
2880+
if (Inst->hasAttr(InstAttr::f4e)) {
2881+
OS() << "1";
2882+
} else if (Inst->hasAttr(InstAttr::b4e)) {
2883+
OS() << "2";
2884+
} else if (Inst->hasAttr(InstAttr::rc8)) {
2885+
OS() << "3";
2886+
} else if (Inst->hasAttr(InstAttr::ecl)) {
2887+
OS() << "4";
2888+
} else if (Inst->hasAttr(InstAttr::ecr)) {
2889+
OS() << "5";
2890+
} else if (Inst->hasAttr(InstAttr::rc16)) {
2891+
OS() << "6";
2892+
} else {
2893+
OS() << "0";
2894+
}
2895+
OS() << ")";
2896+
2897+
endstmt();
2898+
return SYCLGenSuccess();
2899+
}
28612900
};
28622901

28632902
/// Clean the special character in identifier.

clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,12 @@ MODIFIER(sc, ".sc")
412412
MODIFIER(gl, ".gl")
413413
MODIFIER(L1, ".L1")
414414
MODIFIER(L2, ".L2")
415-
415+
MODIFIER(f4e, ".f4e")
416+
MODIFIER(b4e, ".b4e")
417+
MODIFIER(rc8, ".rc8")
418+
MODIFIER(ecl, ".ecl")
419+
MODIFIER(ecr, ".ecr")
420+
MODIFIER(rc16, ".rc16")
416421

417422
#undef LINKAGE
418423
#undef TARGET

clang/lib/DPCT/SrcAPI/APINames_ASM.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ ENTRY("or", "or", true, NO_FLAG, P1, "Successful")
101101
ENTRY("pmevent", "pmevent", false, NO_FLAG, P1, "Comment")
102102
ENTRY("popc", "popc", true, NO_FLAG, P1, "Successful")
103103
ENTRY("prefetch", "prefetch", true, NO_FLAG, P1, "Partial")
104-
ENTRY("prmt", "prmt", false, NO_FLAG, P1, "Comment")
104+
ENTRY("prmt", "prmt", true, NO_FLAG, P1, "Successful")
105105
ENTRY("rcp", "rcp", true, NO_FLAG, P1, "Successful")
106106
ENTRY("red", "red", true, NO_FLAG, P1, "Partial")
107107
ENTRY("redux", "redux", false, NO_FLAG, P1, "Comment")

clang/runtime/dpct-rt/include/dpct/util.hpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,48 @@ inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
187187
return ret;
188188
}
189189

190+
/// \param [in] low32 The 4 bytes to construct the 8 bytes value as low 32 bits.
191+
/// \param [in] high32 The 4 bytes to construct the 8 bytes value as high 32
192+
/// bits.
193+
/// \param [in] sel The selector value. It is used to generate selectors which are
194+
/// used to fetch byte value from \p low32 and \p high32 to construct result
195+
/// value.
196+
/// \param [in] mode The mode of permutation, together with \p sel, it
197+
/// further defines the behavior of data selection. \p mode and \p sel define 4
198+
/// selectors (s[i], i=0, 1, 2, 3), which are used as index to fetch 4 bytes from
199+
/// the 8 bytes value constructed by \p low32 and \p high32; the byte selected by s[i]
200+
/// is used to fill the i-th byte of the result.
201+
/// The available mode values are:
202+
/// mode value 0: s[i] = sel[i * 4 + 3 : i * 4]
203+
/// mode value 1: s[i] = sel[1 : 0] + i
204+
/// mode value 2: s[i] = (sel[1 : 0] + 7) % 8
205+
/// mode value 3: s[i] = sel[1 : 0]
206+
/// mode value 4: s[i] = max(sel[1 : 0], i)
207+
/// mode value 5: s[i] = min(sel[1 : 0], i)
208+
/// mode value 6: s[0] = sel[0 : 0] * 2
209+
/// s[1] = sel[0 : 0] * 2 + 1
210+
/// s[2] = s[0]
211+
/// s[3] = s[1]
212+
/// other value: illegal, undefined behavior, return 0.
213+
inline uint32_t byte_level_permute_custom(uint32_t low32, uint32_t high32,
214+
uint32_t sel, int mode = 0) {
215+
constexpr uint16_t lookup[6][4] = {
216+
{0x3210, 0x4321, 0x5432, 0x6543}, // Forward 4-byte extract
217+
{0x5670, 0x6701, 0x7012, 0x0123}, // Backward 4-byte extract
218+
{0x0000, 0x1111, 0x2222, 0x3333}, // Replicate 8-bit values
219+
{0x3210, 0x3211, 0x3222, 0x3333}, // Edge clamp left
220+
{0x0000, 0x1110, 0x2210, 0x3210}, // Edge clamp right
221+
{0x1010, 0x3232, 0x1010, 0x3232} // Replicate 16-bit values
222+
};
223+
224+
if (mode >= 1 && mode <= 6) {
225+
return byte_level_permute(low32, high32, lookup[mode - 1][sel & 0x3]);
226+
} else if (!mode) {
227+
return byte_level_permute(low32, high32, sel);
228+
}
229+
return 0;
230+
}
231+
190232
/// Find position of first least significant set bit in an integer.
191233
/// ffs(0) returns 0.
192234
///

clang/test/dpct/asm/prmt.cu

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct --format-range=none -out-root %T/prmt %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck %s --match-full-lines --input-file %T/prmt/prmt.dp.cpp
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/prmt/prmt.dp.cpp -o %T/prmt/prmt.dp.o %}
6+
7+
// clang-format off
8+
#include <cstdint>
9+
#include <cuda_runtime.h>
10+
11+
__global__ void testKernel1(uint32_t *d_result, uint32_t a) {
12+
static constexpr uint32_t sel = 0x3210;
13+
static constexpr uint32_t b = 0;
14+
uint32_t d;
15+
16+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 0);
17+
asm volatile("prmt.b32 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
18+
19+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 1);
20+
asm volatile("prmt.b32.f4e %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
21+
22+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 2);
23+
asm volatile("prmt.b32.b4e %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
24+
25+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 3);
26+
asm volatile("prmt.b32.rc8 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
27+
28+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 4);
29+
asm volatile("prmt.b32.ecl %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
30+
31+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 5);
32+
asm volatile("prmt.b32.ecr %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
33+
34+
// CHECK: d = dpct::byte_level_permute_custom(a, b, sel, 6);
35+
asm volatile("prmt.b32.rc16 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "n"(b), "n"(sel));
36+
}
37+
38+
// clang-format on

docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ or,YES,
6767
pmevent,NO,
6868
popc,YES,
6969
prefetch,YES, Partial
70-
prmt,NO,
70+
prmt,YES,
7171
rcp,YES,
7272
red,NO,
7373
redux,NO,

0 commit comments

Comments
 (0)