Skip to content

Commit b0951b3

Browse files
c/parallel should be built with CUB_DISABLE_CDP (NVIDIA#4422)
* c/parallel should be built with CUB_DISABLE_CDP

  Library functions launch kernels from the host only. Defining this macro causes CUB_RUNTIME_FUNCTION to be _CCCL_HOST, and hence the compiler won't generate CUB dispatch functions for the device, which speeds up compilation.

* Add CUB_DISABLE_CDP to jit compiler arguments

* [pre-commit.ci] auto code formatting

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c796a7f commit b0951b3

File tree

8 files changed

+29
-15
lines changed

8 files changed

+29
-15
lines changed

c/parallel/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ target_link_libraries(cccl.c.parallel PRIVATE
4141
Thrust::Thrust
4242
)
4343
target_compile_definitions(cccl.c.parallel PUBLIC CCCL_C_EXPERIMENTAL=1 _CUB_HAS_TRANSFORM_UBLKCP=0)
44-
target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1)
44+
target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1 CUB_DISABLE_CDP=1)
4545
target_compile_options(cccl.c.parallel PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
4646

4747
target_include_directories(cccl.c.parallel PUBLIC "include")

c/parallel/src/for.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,9 @@ CUresult cccl_device_for_build(
9494

9595
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
9696

97-
constexpr size_t num_args = 7;
98-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
97+
constexpr size_t num_args = 8;
98+
const char* args[num_args] = {
99+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
99100

100101
constexpr size_t num_lto_args = 2;
101102
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/merge_sort.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -408,8 +408,9 @@ struct device_merge_sort_vsmem_helper {{
408408

409409
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
410410

411-
constexpr size_t num_args = 7;
412-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
411+
constexpr size_t num_args = 8;
412+
const char* args[num_args] = {
413+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
413414

414415
constexpr size_t num_lto_args = 2;
415416
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/reduce.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,8 +332,9 @@ struct device_reduce_policy {{
332332

333333
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
334334

335-
constexpr size_t num_args = 7;
336-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
335+
constexpr size_t num_args = 8;
336+
const char* args[num_args] = {
337+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
337338

338339
constexpr size_t num_lto_args = 2;
339340
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/scan.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,8 +295,9 @@ struct device_scan_policy {{
295295

296296
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
297297

298-
constexpr size_t num_args = 7;
299-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
298+
constexpr size_t num_args = 8;
299+
const char* args[num_args] = {
300+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
300301

301302
constexpr size_t num_lto_args = 2;
302303
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/segmented_reduce.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,8 +332,9 @@ struct device_segmented_reduce_policy {{
332332

333333
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
334334

335-
constexpr size_t num_args = 7;
336-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
335+
constexpr size_t num_args = 8;
336+
const char* args[num_args] = {
337+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
337338

338339
constexpr size_t num_lto_args = 2;
339340
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/transform.cu

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,17 @@ struct device_transform_policy {{
270270
// Note: `-default-device` is needed because of the use of lambdas
271271
// in the transform kernel code. Qualifying those explicitly with
272272
// `__device__` seems not to be supported by NVRTC.
273-
constexpr size_t num_args = 8;
273+
constexpr size_t num_args = 9;
274274
const char* args[num_args] = {
275-
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-default-device"};
275+
arch.c_str(),
276+
cub_path,
277+
thrust_path,
278+
libcudacxx_path,
279+
ctk_path,
280+
"-rdc=true",
281+
"-dlto",
282+
"-default-device",
283+
"-DCUB_DISABLE_CDP"};
276284

277285
constexpr size_t num_lto_args = 2;
278286
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

c/parallel/src/unique_by_key.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,9 @@ struct device_unique_by_key_vsmem_helper {{
363363

364364
const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
365365

366-
constexpr size_t num_args = 7;
367-
const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
366+
constexpr size_t num_args = 8;
367+
const char* args[num_args] = {
368+
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP"};
368369

369370
constexpr size_t num_lto_args = 2;
370371
const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

0 commit comments

Comments
 (0)