Skip to content

Commit 1ece450

Browse files
committed
Automatically use zero-copy buffers on CPUs/iGPUs, bandwidth kernels now write non-zero data
1 parent 7b264f9 commit 1ece450

File tree

3 files changed

+107
-68
lines changed

3 files changed

+107
-68
lines changed

src/kernel.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ kernel void kernel_char(global float* data) {
8181

8282
kernel void kernel_coalesced_write(global float* data) {
8383
const uint n = get_global_id(0);
84-
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = 0.0f; // coalesced write
84+
for(uint i=0u; i<def_M; i++) data[i*def_N+n] = (float)n; // coalesced write
8585
}
8686
kernel void kernel_coalesced_read(global float* data) {
8787
const uint n = get_global_id(0);
@@ -91,7 +91,7 @@ kernel void kernel_coalesced_read(global float* data) {
9191
}
9292
kernel void kernel_misaligned_write(global float* data) {
9393
const uint n = get_global_id(0);
94-
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = 0.0f; // misaligned write
94+
for(uint i=0u; i<def_M; i++) data[n*def_M+i] = (float)n; // misaligned write
9595
}
9696
kernel void kernel_misaligned_read(global float* data) {
9797
const uint n = get_global_id(0);

src/main.cpp

Lines changed: 28 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -146,33 +146,35 @@ void benchmark_device(const Device_Info& device_info) {
146146
println("\r| Memory Bandwidth (misaligned read ) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float)(time_mr-time_cw/(double)M)*1E-9f, 2u))+" GB/s |");
147147
println("\r| Memory Bandwidth (misaligned write) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float) time_mw *1E-9f, 2u))+" GB/s |");
148148

149-
print("| Benchmarking ... |");
150-
for(uint i=0u; i<N_memory; i++) {
151-
clock.start();
152-
buffer.write_to_device();
153-
time_send = fmin(clock.stop(), time_send);
154-
}
155-
const float bw_send = 4.0f*M*N/(float)time_send*1E-9f;
156-
println("\r| PCIe Bandwidth (send ) "+alignr(29u, to_string(bw_send, 2u))+" GB/s |");
157-
print("| Benchmarking ... |");
158-
for(uint i=0u; i<N_memory; i++) {
159-
clock.start();
160-
buffer.read_from_device();
161-
time_receive = fmin(clock.stop(), time_receive);
162-
}
163-
const float bw_receive = 4.0f*M*N/(float)time_receive*1E-9f;
164-
println("\r| PCIe Bandwidth ( receive ) "+alignr(29u, to_string(bw_receive, 2u))+" GB/s |");
165-
print("| Benchmarking ... |");
166-
for(uint i=0u; i<N_memory; i++) {
167-
clock.start();
168-
buffer.read_from_device(N*M/2u, N*M, false);
169-
buffer.write_to_device(0u, N*M/2u, false);
170-
buffer.finish_queue();
171-
time_bidirectional = fmin(clock.stop(), time_bidirectional);
149+
if(!device.info.uses_ram) {
150+
print("| Benchmarking ... |");
151+
for(uint i=0u; i<N_memory; i++) {
152+
clock.start();
153+
buffer.write_to_device();
154+
time_send = fmin(clock.stop(), time_send);
155+
}
156+
const float bw_send = 4.0f*M*N/(float)time_send*1E-9f;
157+
println("\r| PCIe Bandwidth (send ) "+alignr(29u, to_string(bw_send, 2u))+" GB/s |");
158+
print("| Benchmarking ... |");
159+
for(uint i=0u; i<N_memory; i++) {
160+
clock.start();
161+
buffer.read_from_device();
162+
time_receive = fmin(clock.stop(), time_receive);
163+
}
164+
const float bw_receive = 4.0f*M*N/(float)time_receive*1E-9f;
165+
println("\r| PCIe Bandwidth ( receive ) "+alignr(29u, to_string(bw_receive, 2u))+" GB/s |");
166+
print("| Benchmarking ... |");
167+
for(uint i=0u; i<N_memory; i++) {
168+
clock.start();
169+
buffer.read_from_device(N*M/2u, N*M, false);
170+
buffer.write_to_device(0u, N*M/2u, false);
171+
buffer.finish_queue();
172+
time_bidirectional = fmin(clock.stop(), time_bidirectional);
173+
}
174+
const float bw_bidirectional = 4.0f*M*N/(float)time_bidirectional*1E-9f;
175+
const float bw_max = fmax(2.0f*fmax(bw_send, bw_receive), bw_bidirectional);
176+
println("\r| PCIe Bandwidth ( bidirectional) (Gen"+to_string(bw_max>17.6f?4:bw_max>8.8f?3:bw_max>4.4f?2:1)+" x16)"+alignr(8u, to_string(bw_bidirectional, 2u))+" GB/s |");
172177
}
173-
const float bw_bidirectional = 4.0f*M*N/(float)time_bidirectional*1E-9f;
174-
const float bw_max = fmax(2.0f*fmax(bw_send, bw_receive), bw_bidirectional);
175-
println("\r| PCIe Bandwidth ( bidirectional) (Gen"+to_string(bw_max>17.6f?4:bw_max>8.8f?3:bw_max>4.4f?2:1)+" x16)"+alignr(8u, to_string(bw_bidirectional, 2u))+" GB/s |");
176178

177179
println("|-----------------------------------------------------------------------------|");
178180
}

0 commit comments

Comments (0)