Skip to content

Commit 6d50e29

Browse files
authored
Add nvtext substring duplication APIs (Part 1) (#18585)
Adds a new set of nvtext substring deduplication APIs ```cpp std::unique_ptr<cudf::column> nvtext::substring_duplicates( cudf::strings_column_view const& input, cudf::size_type min_width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` This finds and returns any duplicate substrings of at least `min_width` bytes within the `input` column. ```cpp std::unique_ptr<rmm::device_uvector<int32_t>> build_suffix_array( cudf::strings_column_view const& input, int32_t min_width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` This just builds the suffix array. The `build_suffix_array` function returns int32 values per input byte, so the output is 4x the input size. Additionally, the internal sort routine requires an extra 4x of temporary memory. This means the memory footprint for `build_suffix_array` will be about 8x the size of the input. Also, this function accounts for about 95% of the runtime, so care should be taken not to call it repeatedly on the same input data. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Shruti Shivakumar (https://github.com/shrshi) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: #18585
1 parent 2f5e9d4 commit 6d50e29

File tree

16 files changed

+786
-9
lines changed

16 files changed

+786
-9
lines changed

cpp/CMakeLists.txt

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ project(
2929
VERSION "${RAPIDS_VERSION}"
3030
LANGUAGES C CXX CUDA
3131
)
32+
3233
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
3334
message(
3435
FATAL_ERROR
@@ -45,7 +46,6 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
4546

4647
# ##################################################################################################
4748
# * build options ---------------------------------------------------------------------------------
48-
4949
option(USE_NVTX "Build with NVTX support" ON)
5050
option(BUILD_TESTS "Configure CMake to build tests" ON)
5151
option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF)
@@ -63,6 +63,7 @@ option(
6363
stream to external libraries."
6464
OFF
6565
)
66+
6667
# Option to add all symbols to the dynamic symbol table in the library file, allowing to retrieve
6768
# human-readable stacktrace for debugging.
6869
option(
@@ -71,19 +72,23 @@ option(
7172
OFF
7273
)
7374
option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF)
75+
7476
# Option to enable line info in CUDA device compilation to allow introspection when profiling /
7577
# memchecking
7678
option(CUDA_ENABLE_LINEINFO
7779
"Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
7880
)
7981
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
82+
8083
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
8184
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
8285

8386
set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
87+
8488
if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
8589
set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL OFF)
8690
endif()
91+
8792
option(
8893
CUDF_BUILD_STREAMS_TEST_UTIL
8994
"Whether to build the utilities for stream testing contained in libcudf"
@@ -127,6 +132,7 @@ message(VERBOSE
127132
rapids_cmake_build_type("Release")
128133
set(CUDF_BUILD_TESTS ${BUILD_TESTS})
129134
set(CUDF_BUILD_BENCHMARKS ${BUILD_BENCHMARKS})
135+
130136
if(BUILD_TESTS AND NOT CUDF_BUILD_TESTUTIL)
131137
message(
132138
FATAL_ERROR
@@ -175,12 +181,14 @@ if(CUDF_CLANG_TIDY)
175181
string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH
176182
"${CLANG_TIDY_OUTPUT}"
177183
)
184+
178185
# Discard the patch version and allow it to float. Empirically, results between patch versions are
179186
# mostly stable, and looking at available packages on some package managers sometimes patch
180187
# versions are skipped so we don't want to constrain to a patch version that the user can't
181188
# install.
182189
set(LLVM_VERSION "${CMAKE_MATCH_1}")
183190
set(expected_clang_tidy_version 19.1)
191+
184192
if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION)
185193
message(
186194
FATAL_ERROR
@@ -216,6 +224,7 @@ function(enable_static_checkers target)
216224
)
217225
endif()
218226
endif()
227+
219228
if(_LINT_IWYU)
220229
# A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not
221230
# relevant since they don't show up in any other build so it's better to suppress them until we
@@ -226,16 +235,19 @@ function(enable_static_checkers target)
226235
# on real builds.
227236
foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration)
228237
list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index)
238+
229239
if(_flag_index EQUAL -1)
230240
list(APPEND CUDF_CXX_FLAGS ${_flag})
231241
endif()
232242
endforeach()
243+
233244
set(CUDF_CXX_FLAGS
234245
"${CUDF_CXX_FLAGS}"
235246
PARENT_SCOPE
236247
)
237248
set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}")
238249
endif()
250+
239251
foreach(file IN LISTS _LINT_SKIPPED_FILES)
240252
set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON)
241253
endforeach()
@@ -279,32 +291,45 @@ create_logger_macros(CUDF "cudf::default_logger()" include/cudf)
279291

280292
# find jitify
281293
include(cmake/thirdparty/get_jitify.cmake)
294+
282295
# find NVTX
283296
include(cmake/thirdparty/get_nvtx.cmake)
297+
284298
# find nvCOMP
285299
include(cmake/thirdparty/get_nvcomp.cmake)
300+
286301
# find CCCL before rmm so that we get cudf's patched version of CCCL
287302
include(cmake/thirdparty/get_cccl.cmake)
303+
288304
# find rmm
289305
include(cmake/thirdparty/get_rmm.cmake)
306+
290307
# find flatbuffers
291308
include(cmake/thirdparty/get_flatbuffers.cmake)
309+
292310
# find dlpack
293311
include(cmake/thirdparty/get_dlpack.cmake)
312+
294313
# find cuCollections, should come after including CCCL
295314
include(cmake/thirdparty/get_cucollections.cmake)
315+
296316
# find or install GoogleTest
297317
if(CUDF_BUILD_TESTUTIL)
298318
include(cmake/thirdparty/get_gtest.cmake)
299319
endif()
320+
300321
# preprocess jitify-able kernels
301322
include(cmake/Modules/JitifyPreprocessKernels.cmake)
323+
302324
# find KvikIO
303325
include(cmake/thirdparty/get_kvikio.cmake)
326+
304327
# find nanoarrow
305328
include(cmake/thirdparty/get_nanoarrow.cmake)
329+
306330
# find thread_pool
307331
include(cmake/thirdparty/get_thread_pool.cmake)
332+
308333
# find zstd
309334
include(cmake/thirdparty/get_zstd.cmake)
310335

@@ -327,7 +352,6 @@ endif()
327352

328353
# ##################################################################################################
329354
# * library targets -------------------------------------------------------------------------------
330-
331355
add_library(
332356
cudf
333357
src/aggregation/aggregation.cpp
@@ -754,6 +778,7 @@ add_library(
754778
src/table/table.cpp
755779
src/table/table_device_view.cu
756780
src/table/table_view.cpp
781+
src/text/deduplicate.cu
757782
src/text/detokenize.cu
758783
src/text/edit_distance.cu
759784
src/text/generate_ngrams.cu
@@ -857,16 +882,20 @@ set_target_properties(
857882
# flags if necessary.
858883
if(CUDF_CLANG_TIDY OR CUDF_IWYU)
859884
set(linters)
885+
860886
if(CUDF_CLANG_TIDY)
861887
list(APPEND linters CLANG_TIDY)
862888
endif()
889+
863890
if(CUDF_IWYU)
864891
list(APPEND linters IWYU)
865892
endif()
893+
866894
enable_static_checkers(
867895
cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp ${linters}
868896
)
869897
endif()
898+
870899
target_compile_options(
871900
cudf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
872901
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
@@ -997,7 +1026,6 @@ add_library(cudf::cudf ALIAS cudf)
9971026

9981027
# ##################################################################################################
9991028
# * build cudftestutil ----------------------------------------------------------------------------
1000-
10011029
if(CUDF_BUILD_TESTUTIL)
10021030
add_library(
10031031
cudftest_default_stream
@@ -1096,11 +1124,9 @@ if(CUDF_BUILD_TESTUTIL)
10961124
tests/utilities/tdigest_utilities.cu
10971125
DESTINATION src/cudftestutil/utilities
10981126
)
1099-
11001127
endif()
11011128

11021129
# * build cudf_identify_stream_usage --------------------------------------------------------------
1103-
11041130
if(CUDF_BUILD_STREAMS_TEST_UTIL)
11051131
if(CUDA_STATIC_RUNTIME)
11061132
message(
@@ -1117,6 +1143,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
11171143
add_library(
11181144
${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp
11191145
)
1146+
11201147
if(CUDF_USE_PER_THREAD_DEFAULT_STREAM)
11211148
target_compile_definitions(
11221149
${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM
@@ -1135,9 +1162,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
11351162
)
11361163
target_include_directories(${_tgt} PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>")
11371164
target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm)
1165+
11381166
if(CUDF_BUILD_STACKTRACE_DEBUG)
11391167
target_link_libraries(${_tgt} PRIVATE cudf_backtrace)
11401168
endif()
1169+
11411170
rapids_cuda_set_runtime(${_tgt} USE_STATIC ${CUDA_STATIC_RUNTIME})
11421171
add_library(cudf::${_tgt} ALIAS ${_tgt})
11431172

@@ -1149,7 +1178,6 @@ endif()
11491178

11501179
# ##################################################################################################
11511180
# * add tests -------------------------------------------------------------------------------------
1152-
11531181
if(CUDF_BUILD_TESTS)
11541182
# include CTest module -- automatically calls enable_testing()
11551183
include(CTest)
@@ -1167,7 +1195,6 @@ endif()
11671195

11681196
# ##################################################################################################
11691197
# * add benchmarks --------------------------------------------------------------------------------
1170-
11711198
if(CUDF_BUILD_BENCHMARKS)
11721199
# Find or install GoogleBench
11731200
include(${rapids-cmake-dir}/cpm/gbench.cmake)
@@ -1198,6 +1225,7 @@ install(FILES ${CUDF_BINARY_DIR}/include/cudf/version_config.hpp
11981225
)
11991226

12001227
set(_components_export_string)
1228+
12011229
if(TARGET cudftestutil)
12021230
install(
12031231
TARGETS cudftest_default_stream cudftestutil cudftestutil_impl

cpp/include/doxygen_groups.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -175,6 +175,7 @@
175175
* @defgroup nvtext_replace Replacing
176176
* @defgroup nvtext_minhash MinHashing
177177
* @defgroup nvtext_jaccard Jaccard Index
178+
* @defgroup nvtext_dedup Deduplication
178179
* @}
179180
* @defgroup utility_apis Utilities
180181
* @{

cpp/include/nvtext/deduplicate.hpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cudf/column/column.hpp>
19+
#include <cudf/strings/strings_column_view.hpp>
20+
#include <cudf/utilities/export.hpp>
21+
#include <cudf/utilities/memory_resource.hpp>
22+
23+
#include <rmm/cuda_stream_view.hpp>
24+
#include <rmm/device_uvector.hpp>
25+
26+
//! NVText APIs
27+
namespace CUDF_EXPORT nvtext {
28+
/**
29+
* @addtogroup nvtext_dedup
30+
* @{
31+
* @file
32+
*/
33+
34+
/**
35+
* @brief Returns duplicate substrings found in the given input
36+
*
37+
* The internal implementation creates a suffix array of the input which
38+
* requires ~10x the input size for temporary memory.
39+
*
40+
* The output includes any strings of at least `min_width` bytes that
41+
* appear more than once in the entire input.
42+
*
43+
* @throw std::invalid_argument If `min_width` <= 8
44+
* @throw std::invalid_argument If `min_width` is greater than the input chars size
45+
* @throw std::invalid_argument If the `input` chars size is greater than 2GB
46+
*
47+
* @param input Strings column to identify duplicates
48+
* @param min_width Minimum number of bytes that must match to identify a duplicate
49+
* @param stream CUDA stream used for device memory operations and kernel launches
50+
* @param mr Device memory resource used to allocate the returned column's device memory
51+
* @return New strings column containing the duplicate substrings found
52+
*/
53+
std::unique_ptr<cudf::column> substring_duplicates(
54+
cudf::strings_column_view const& input,
55+
cudf::size_type min_width,
56+
rmm::cuda_stream_view stream = cudf::get_default_stream(),
57+
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
58+
59+
/**
60+
* @brief Builds a suffix array for the input strings column
61+
*
62+
* The internal implementation creates a suffix array of the input which
63+
* requires ~4x the input size for temporary memory. The output is an additional
64+
* 4x of the input size.
65+
*
66+
* @throw std::invalid_argument If `min_width` is greater than the input chars size
67+
* @throw std::invalid_argument If the `input` chars size is greater than 2GB
68+
*
69+
* @param input Strings column to build suffix array for
70+
* @param min_width Minimum number of bytes that must match to identify a duplicate
71+
* @param stream CUDA stream used for device memory operations and kernel launches
72+
* @param mr Device memory resource used to allocate the returned vector's device memory
73+
* @return Sorted suffix array of the input
74+
*/
75+
std::unique_ptr<rmm::device_uvector<cudf::size_type>> build_suffix_array(
76+
cudf::strings_column_view const& input,
77+
cudf::size_type min_width,
78+
rmm::cuda_stream_view stream = cudf::get_default_stream(),
79+
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
80+
81+
/** @} */ // end of group
82+
} // namespace CUDF_EXPORT nvtext

0 commit comments

Comments
 (0)