Skip to content

Commit 68d2fc0

Browse files
Cuda testing (#28)
1 parent 17853c9 commit 68d2fc0

File tree

10 files changed

+157
-40
lines changed

10 files changed

+157
-40
lines changed

dub.json

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,26 @@
11
{
22
"name": "dcompute",
33
"description": "Native Heterogeneous Computing for D",
4-
"copyright": "Copyright © 2016, Nicholas Wilson",
4+
"copyright": "Copyright © 2017, Nicholas Wilson",
55
"authors": ["Nicholas Wilson"],
66
"license": "BSL-1.0",
77
"dependencies": {
88
"derelict-cl" : "~>2.0.0",
99
"derelict-cuda": "~>2.0.1",
1010
"taggedalgebraic": "~>0.10.7"
1111
},
12-
"dflags" : ["-mdcompute-targets=cuda-350" , "-oq", "-betterC"]
12+
"dflags" : ["-mdcompute-targets=cuda-210" ,"-oq", "-betterC"],
13+
"configurations": [
14+
{
15+
"name": "library",
16+
"targetType": "library",
17+
"excludedSourceFiles": ["./source/dcompute/tests/*"]
18+
},
19+
{
20+
"name": "unittest",
21+
"targetType": "executable",
22+
"versions": ["DComputeTesting"]
23+
24+
}
25+
]
1326
}

source/dcompute/driver/cuda/buffer.d

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,40 @@ struct Buffer(T)
66
{
77
size_t raw;
88

9+
// Host memory associated with this buffer
10+
T[] hostMemory;
11+
912
/// Allocates room for `elems` values of `T` with `cuMemAlloc`, storing the
/// handle in `raw`. No host array is attached to the buffer.
this(size_t elems)
{
    immutable size_t byteCount = elems * T.sizeof;
    status = cast(Status) cuMemAlloc(&raw, byteCount);
    checkErrors();
    hostMemory = null; // nothing on the host side backs this buffer
}
18+
19+
/// Allocates `arr.length` elements with `cuMemAlloc` and remembers `arr` as
/// the host array that `copy` reads from / writes to.
/// The contents of `arr` are not transferred here.
this(T[] arr)
{
    immutable size_t byteCount = arr.length * T.sizeof;
    status = cast(Status) cuMemAlloc(&raw, byteCount);
    checkErrors();
    hostMemory = arr; // the slice is stored, not duplicated
}
25+
/**
 * Transfers `hostMemory.length` elements between `hostMemory` and the
 * allocation behind `raw`, in the direction chosen at compile time.
 *
 * Params:
 *     c = `Copy.hostToDevice` calls `cuMemcpyHtoD`, `Copy.deviceToHost`
 *         calls `cuMemcpyDtoH`. Any other value compiles to a no-op.
 *
 * The driver's return code is stored in `status` and run through
 * `checkErrors()`, the same way the constructors handle `cuMemAlloc`.
 */
void copy(Copy c)()
{
    // A buffer built from an element count has no host array: there is
    // nothing to transfer, so do not issue a zero-byte driver call.
    if (hostMemory.length == 0)
        return;

    immutable size_t byteCount = hostMemory.length * T.sizeof;

    static if (c == Copy.hostToDevice)
    {
        status = cast(Status) cuMemcpyHtoD(raw, hostMemory.ptr, byteCount);
        checkErrors();
    }
    else static if (c == Copy.deviceToHost)
    {
        status = cast(Status) cuMemcpyDtoH(hostMemory.ptr, raw, byteCount);
        checkErrors();
    }
}
36+
// For a kernel parameter type `U` that is `GlobalPointer!T`, the host-side
// argument is this buffer's `raw` handle.
// NOTE(review): presumably consulted when kernel launch arguments are
// marshalled — confirm against Queue.
alias hostArgOf(U : GlobalPointer!T) = raw;
37+
/**
 * Frees the allocation behind `raw` and clears both fields.
 *
 * When `raw` is already 0 no driver call is made, so calling `release()`
 * twice is harmless. The result of `cuMemFree` is recorded in `status` and
 * checked only after the fields have been reset, so the buffer never keeps
 * a stale handle even if the error handler throws.
 */
void release()
{
    if (raw == 0)
    {
        hostMemory = null;
        return;
    }

    immutable freeResult = cast(Status) cuMemFree(raw);
    raw = 0;
    hostMemory = null;

    status = freeResult;
    checkErrors();
}
1443
}
1544

1645
alias bf = Buffer!float;

source/dcompute/driver/cuda/context.d

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import dcompute.driver.cuda;
55
struct Context
66
{
77
void* raw;
8-
this(Device dev, uint flags)
8+
this(Device dev, uint flags = 0)
99
{
1010
status = cast(Status)cuCtxCreate(&raw, flags,dev.raw);
1111
checkErrors();

source/dcompute/driver/cuda/package.d

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module dcompute.driver.cuda;
22

3+
public import ldc.dcompute;
34
public import derelict.cuda.driverapi;
45

56
public import dcompute.driver.error;
@@ -14,10 +15,10 @@ public import dcompute.driver.cuda.platform;
1415
public import dcompute.driver.cuda.program;
1516
public import dcompute.driver.cuda.queue;
1617

17-
enum CopySource
18+
enum Copy
1819
{
19-
host,
20-
device,
20+
hostToDevice,
21+
deviceToHost,
2122
array,
2223
}
2324

@@ -27,3 +28,16 @@ enum MemoryBankConfig : int
2728
fourBytes,
2829
eightBytes,
2930
}
31+
/// Host-side parameter list for the kernel type `F`: `Parameters!F` with
/// every type matching `Pointer!(...)` mapped through `ReplaceTemplate` to
/// the corresponding `Buffer` instantiation.
template HostArgsOf(F)
{
    import std.meta : staticMap;
    import std.traits : Parameters;

    alias toHostArg = ReplaceTemplate!(Pointer, Buffer);
    alias HostArgsOf = staticMap!(toHostArg, Parameters!F);
}
35+
/**
 * Builds a type-mapping template. Applied to a type `T`, the result is
 * `replacement!(Params[1])` when `T` matches `needle!(Params...)` — i.e. the
 * second template argument of the match is forwarded — and `T` itself
 * otherwise.
 */
private template ReplaceTemplate(alias needle, alias replacement)
{
    template ReplaceTemplate(T)
    {
        static if (is(T : needle!Params, Params...))
            alias ReplaceTemplate = replacement!(Params[1]);
        else
            alias ReplaceTemplate = T;
    }
}

source/dcompute/driver/cuda/platform.d

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,21 @@ struct Platform
88
{
99
/// Loads the CUDA driver bindings through Derelict, then calls
/// `cuInit(flags)` and checks the result.
static void initialise(uint flags =0)
{
    // Bind the driver symbols before the first cu* call below.
    DerelictCUDADriver.load();

    immutable initResult = cuInit(flags);
    status = cast(Status) initResult;
    checkErrors();
}
1415

15-
Device[] devices(A)(A a)
16+
static Device[] getDevices(A)(A a)
1617
{
1718
int len;
1819
TypedAllocator!(A) allocator;
1920
status = cast(Status)cuDeviceGetCount(&len);
2021
checkErrors();
21-
22-
Device[] ret = allocator.makeArray!(Device)(len);
22+
23+
//TODO: allocate through the supplied allocator instead of the GC:
24+
//Device[] ret = allocator.makeArray!(Device)(len);
25+
Device[] ret = new Device[len];
2326
foreach(int i; 0 .. len)
2427
{
2528
status = cast(Status)cuDeviceGet(&ret[i].raw,i);

source/dcompute/driver/cuda/program.d

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ struct Program
3939
return ret;
4040
}
4141

42+
// Single Program instance for the whole process: `__gshared` puts it in
// global storage instead of the thread-local storage a plain `static` gets.
// NOTE(review): presumably the module kernels are looked up in at launch —
// confirm against Queue.enqueue.
__gshared static Program globalProgram;
4243
//cuModuleLoadDataEx
4344
//cuModuleLoadFatBinary
4445

@@ -47,7 +48,7 @@ struct Program
4748
status = cast(Status)cuModuleUnload(raw);
4849
checkErrors();
4950
}
50-
static Program globalProgram;
51+
5152
//TODO: linkstate
5253
}
5354

source/dcompute/driver/cuda/queue.d

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,21 @@ struct Queue
5757
cuStreamAddCallback
5858
}*/
5959

60-
auto enqueue(alias k)(uint[3] _grid, uint[3] _block, uint _sharedMem)
60+
auto enqueue(alias k)(uint[3] _grid, uint[3] _block, uint _sharedMem = 0)
6161
{
6262
static struct Call
6363
{
6464
Queue q;
6565
uint[3] grid, block;
6666
uint sharedMem;
6767

68+
/// Captures the queue and the launch configuration (grid, block, shared
/// memory size) so that `opCall` can use them later.
this(Queue _q,uint[3] _grid, uint[3] _block, uint _sharedMem)
{
    this.q         = _q;
    this.grid      = _grid;
    this.block     = _block;
    this.sharedMem = _sharedMem;
}
6875
//TODO integrate events into this.
6976
void opCall(HostArgsOf!(typeof(k)) args)
7077
{
@@ -76,7 +83,7 @@ struct Queue
7683
}
7784

7885
status = cast(Status)
79-
cuLaunchKernel(kernel,
86+
cuLaunchKernel(kernel.raw,
8087
grid[0], grid[1], grid[2],
8188
block[0],block[1],block[2],
8289
sharedMem,

source/dcompute/driver/error.d

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,14 @@ version (D_BetterC)
133133
{
134134
void delegate (Status) nothrow @nogc onDriverError;
135135
immutable void delegate (Status) nothrow @nogc defaultOnDriverError;
136-
static this()
137-
{
138-
defaultOnDriverError = (Status _status)
139-
{
140-
import core.stdc.stdio : stderr;
141-
import std.conv : to;
142-
fprintf(stderr,"*** DCompute driver error:%s\n",
143-
_status.to!(string).toStringz);
136+
// Installs the default handler for the betterC build: print the name of the
// failing status to stderr.
static this()
{
    defaultOnDriverError = (Status _status)
    {
        // Import every name the call below uses, so the handler does not
        // rely on module-level imports (`fprintf` and `toStringz` were used
        // while only `stderr` was imported here).
        import core.stdc.stdio : fprintf, stderr;
        import std.conv : to;
        import std.string : toStringz;

        // NOTE(review): `to!string` / `toStringz` may allocate and throw,
        // which conflicts with the `nothrow @nogc` delegate type declared
        // above — confirm this branch actually builds.
        fprintf(stderr,"*** DCompute driver error:%s\n",
                _status.to!(string).toStringz);
    };
}
146146
}
@@ -161,13 +161,13 @@ else
161161
}
162162
void delegate(Status) onDriverError;
163163
immutable void delegate(Status) defaultOnDriverError;
164-
static this()
164+
// Default handler for regular builds: wrap the failing status in a
// DComputeDriverException and throw it.
static this()
{
    defaultOnDriverError = (Status failure) { throw new DComputeDriverException(failure); };
}
171171
}
172172
// Module constructor: point `onDriverError` at a forwarder to the default handler.
static this()
{
    onDriverError = (Status code)
    {
        defaultOnDriverError(code);
    };
}
173173
// Thread local status

source/dcompute/tests/dummykernels.d

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@ pragma(LDC_no_moduleinfo);
55
import ldc.dcompute;
66
import dcompute.std.index;
77

8-
@kernel void saxpy(GlobalPointer!(float) a, GlobalPointer!(float) b, float c )
8+
/**
 * SAXPY kernel: `res[i] = alpha * x[i] + y[i]`, where `i` is `GlobalIndex.x`.
 *
 * Each invocation handles at most one element. Invocations whose index is
 * `N` or greater write nothing, so a launch may cover more than `N` indices.
 * `res`, `x` and `y` are indexed up to `N - 1`.
 */
@kernel void saxpy(GlobalPointer!(float) res,
                   float alpha,
                   GlobalPointer!(float) x,
                   GlobalPointer!(float) y,
                   size_t N)
{
    auto idx = GlobalIndex.x;
    if (idx < N)
        res[idx] = alpha * x[idx] + y[idx];
}
1317

1418
alias aagf = AutoIndexed!(GlobalPointer!(float));

source/dcompute/tests/main.d

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
1+
version = DComputeTestCUDA;
2+
13
import dcompute.tests.test;
24

35
import std.stdio;
46
import std.traits;
57
import std.meta;
6-
8+
import std.exception : enforce;
79
import std.experimental.allocator;
10+
import std.array;
811

912
import dcompute.tests.dummykernels : saxpy;
1013

11-
import dcompute.driver.ocl120;
14+
version(DComputeTestOpenCL)
15+
import dcompute.driver.ocl120;
16+
version(DComputeTestCUDA)
17+
import dcompute.driver.cuda;
1218

1319
ubyte[] readBinaryFromDisk()
1420
{
@@ -18,16 +24,56 @@ ubyte[] readBinaryFromDisk()
1824
/// Sample input for the OpenCL test path: the five floats 1 through 5.
float[] somedata()
{
    float[] sample = [1, 2, 3, 4, 5];
    return sample;
}
1925
/**
 * Driver smoke test: runs the `saxpy` kernel through whichever backend
 * section is enabled by the `DComputeTestOpenCL` / `DComputeTestCUDA`
 * version identifiers (this file sets `DComputeTestCUDA`).
 *
 * The CUDA section uploads two 128-element arrays, launches `saxpy`,
 * downloads the result and compares every element against the same
 * expression evaluated on the host.
 *
 * Returns: 0 when the run completes; a failed comparison makes `enforce` throw.
 */
int main(string[] args)
{
    version(DComputeTestOpenCL)
    {
        auto platform = Platform.getPlatforms(theAllocator)[0];
        auto devices  = platform.getDevices(theAllocator);
        auto plist    = propertyList!(Context.Properties)(Context.Properties.platform);
        auto ctx      = Context(devices,plist);

        // The context is named `ctx`; these calls used to reference an
        // undeclared `context`.
        Program.globalProgram = ctx.createProgramFromSPIR(theAllocator,devices,readBinaryFromDisk());
        auto queue = ctx.createQueue(devices[0],Queue.Properties.outOfOrderExecution);
        auto data  = somedata();
        auto buf   = ctx.createBuffer!(float)(Memory.Flags.none,data);

        // saxpy takes (res, alpha, x, y, N); the same buffer serves as
        // result and both inputs.
        // NOTE(review): argument types follow the kernel signature — confirm
        // against the OpenCL driver's host-argument mapping.
        Event e = queue.enqueue!(saxpy)([5])(buf,42.0,buf,buf,data.length);
        e.wait();
    }
    version(DComputeTestCUDA)
    {
        Platform.initialise();

        auto devs = Platform.getDevices(theAllocator);
        auto dev  = devs[0];
        auto ctx  = Context(dev); scope(exit) ctx.detach();

        // Change the file path to match your GPU.
        Program.globalProgram = Program.fromFile("./kernels_cuda210_64.ptx");
        auto q = Queue(false);

        enum size_t N = 128;
        float alpha = 5.0;
        float[N] res, x,y;
        // Small integers only, so the host and device results are exactly
        // representable and `==` is a valid comparison below.
        foreach (i; 0 .. N)
        {
            x[i] = N - i;
            y[i] = i * i;
        }

        Buffer!(float) b_res, b_x, b_y;
        b_res = Buffer!(float)(res[]); scope(exit) b_res.release();
        b_x   = Buffer!(float)(x[]);   scope(exit) b_x.release();
        b_y   = Buffer!(float)(y[]);   scope(exit) b_y.release();

        b_x.copy!(Copy.hostToDevice);
        b_y.copy!(Copy.hostToDevice);

        // Grid of N x 1 x 1 with 1 x 1 x 1 blocks: one kernel index per element.
        q.enqueue!(saxpy)
            ([N,1,1],[1,1,1])
            (b_res,alpha,b_x,b_y, N);

        // NOTE(review): assumes the download below is ordered after the
        // kernel queued on `q` — confirm for the stream `Queue(false)` creates.
        b_res.copy!(Copy.deviceToHost);

        foreach(i; 0 .. N)
            enforce(res[i] == alpha * x[i] + y[i], "saxpy result mismatch");
        writeln(res[]);
    }

    return 0;
}
3379

0 commit comments

Comments
 (0)