@@ -270,15 +270,15 @@ int main(int argc, char *argv[])
270
270
if (use_cold_cache) {
271
271
clock_gettime (MYCLOCK, &beg);
272
272
for (iter = 0 ; iter < num_write_iters; ++iter) {
273
- // Simulate GPU reading the data written by CPU. When cache
273
+ // Simulate GPU writing the data written by CPU. When cache
274
274
// mapping is used, the cache lines will be moved to GPU.
275
275
// The next access by CPU will cause the cache lines to
276
276
// move back to CPU (cold cache). gdr_copy_to_mapping will
277
277
// pay this cost.
278
278
279
- // We use sync memops. The copy is done when cuMemcpy
280
- // returns.
281
- cuMemcpy ((CUdeviceptr)h_buf, d_A , copy_size);
279
+ // We use sync memops. The memset is considered done by the
280
+ // time cuMemsetD8 returns.
281
+ cuMemsetD8 (d_A, 0 , copy_size);
282
282
283
283
gdr_copy_to_mapping (mh, buf_ptr, init_buf, copy_size);
284
284
SB ();
@@ -287,11 +287,11 @@ int main(int argc, char *argv[])
287
287
288
288
lat_us += time_diff (beg, end);
289
289
290
- // Measure the cost of cuMemcpy. Remove that from the total
291
- // latency.
290
+ // Measure the cost of cuMemsetD8 and remove that from the
291
+ // total latency.
292
292
clock_gettime (MYCLOCK, &beg);
293
293
for (iter = 0 ; iter < num_write_iters; ++iter) {
294
- cuMemcpy ((CUdeviceptr)h_buf, d_A , copy_size);
294
+ cuMemsetD8 (d_A, 0 , copy_size);
295
295
}
296
296
clock_gettime (MYCLOCK, &end);
297
297
@@ -301,7 +301,6 @@ int main(int argc, char *argv[])
301
301
clock_gettime (MYCLOCK, &beg);
302
302
for (iter = 0 ; iter < num_write_iters; ++iter) {
303
303
gdr_copy_to_mapping (mh, buf_ptr, init_buf, copy_size);
304
- SB ();
305
304
}
306
305
clock_gettime (MYCLOCK, &end);
307
306
@@ -329,8 +328,8 @@ int main(int argc, char *argv[])
329
328
// to move back to CPU (cold cache). gdr_copy_from_mapping
330
329
// will pay this cost.
331
330
332
- // We use sync memops. The memset is done when cuMemsetD8
333
- // returns.
331
+ // We use sync memops. The memset is considered done by the
332
+ // time cuMemsetD8 returns.
334
333
cuMemsetD8 (d_A, 0 , copy_size);
335
334
336
335
gdr_copy_from_mapping (mh, h_buf, buf_ptr, copy_size);
@@ -354,7 +353,6 @@ int main(int argc, char *argv[])
354
353
clock_gettime (MYCLOCK, &beg);
355
354
for (iter = 0 ; iter < num_read_iters; ++iter) {
356
355
gdr_copy_from_mapping (mh, h_buf, buf_ptr, copy_size);
357
- LB ();
358
356
}
359
357
clock_gettime (MYCLOCK, &end);
360
358
0 commit comments