Skip to content

Commit bda1f60

Browse files
committed
Merge branch 'pmarkthub/nvbug/5176653' into 'main'
[NVBUG 5176653] Fix the performance regression in copylat. See merge request gpudirect/gdrcopy!61
2 parents 919a4f5 + 9240af0 commit bda1f60

File tree

1 file changed

+9
-11
lines changed

1 file changed

+9
-11
lines changed

tests/copylat.cpp

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -270,15 +270,15 @@ int main(int argc, char *argv[])
270270
if (use_cold_cache) {
271271
clock_gettime(MYCLOCK, &beg);
272272
for (iter = 0; iter < num_write_iters; ++iter) {
273-
// Simulate GPU reading the data written by CPU. When cache
273+
// Simulate GPU writing the data written by CPU. When cache
274274
// mapping is used, the cache lines will be moved to GPU.
275275
// The next access by CPU will cause the cache lines to
276276
// move back to CPU (cold cache). gdr_copy_to_mapping will
277277
// pay this cost.
278278

279-
// We use sync memops. The copy is done when cuMemcpy
280-
// returns.
281-
cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size);
279+
// We use sync memops. The memset is considered done by the
280+
// time cuMemsetD8 returns.
281+
cuMemsetD8(d_A, 0, copy_size);
282282

283283
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
284284
SB();
@@ -287,11 +287,11 @@ int main(int argc, char *argv[])
287287

288288
lat_us += time_diff(beg, end);
289289

290-
// Measure the cost of cuMemcpy. Remove that from the total
291-
// latency.
290+
// Measure the cost of cuMemsetD8 and remove that from the
291+
// total latency.
292292
clock_gettime(MYCLOCK, &beg);
293293
for (iter = 0; iter < num_write_iters; ++iter) {
294-
cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size);
294+
cuMemsetD8(d_A, 0, copy_size);
295295
}
296296
clock_gettime(MYCLOCK, &end);
297297

@@ -301,7 +301,6 @@ int main(int argc, char *argv[])
301301
clock_gettime(MYCLOCK, &beg);
302302
for (iter = 0; iter < num_write_iters; ++iter) {
303303
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
304-
SB();
305304
}
306305
clock_gettime(MYCLOCK, &end);
307306

@@ -329,8 +328,8 @@ int main(int argc, char *argv[])
329328
// to move back to CPU (cold cache). gdr_copy_from_mapping
330329
// will pay this cost.
331330

332-
// We use sync memops. The memset is done when cuMemsetD8
333-
// returns.
331+
// We use sync memops. The memset is considered done by the
332+
// time cuMemsetD8 returns.
334333
cuMemsetD8(d_A, 0, copy_size);
335334

336335
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
@@ -354,7 +353,6 @@ int main(int argc, char *argv[])
354353
clock_gettime(MYCLOCK, &beg);
355354
for (iter = 0; iter < num_read_iters; ++iter) {
356355
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
357-
LB();
358356
}
359357
clock_gettime(MYCLOCK, &end);
360358

0 commit comments

Comments
 (0)