Skip to content

Commit 96b0f94

Browse files
authored
String Transform Examples (#18616)
Depends on: #18490 Follows up on #18023 Authors: - Basit Ayantunde (https://github.com/lamarrr) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: #18616
1 parent 5019fa8 commit 96b0f94

File tree

8 files changed

+415
-1
lines changed

8 files changed

+415
-1
lines changed

cpp/examples/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
3+
# Copyright (c) 2021-2025, NVIDIA CORPORATION.
44

55
# libcudf examples build script
66

@@ -58,6 +58,7 @@ build_example() {
5858

5959
build_example basic
6060
build_example strings
61+
build_example string_transforms
6162
build_example nested_types
6263
build_example parquet_io
6364
build_example billion_rows
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION.

cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)

include(../set_cuda_architecture.cmake)

# initialize cuda architecture
rapids_cuda_init_architectures(string_transforms_examples)

project(
  string_transforms_examples
  VERSION 0.0.1
  LANGUAGES CXX CUDA
)

include(../fetch_dependencies.cmake)

include(rapids-cmake)
rapids_cmake_build_type("Release")

list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

# Every example is a single-source executable named after its .cpp file. Declaring them in one
# loop keeps the compile features, compile options, link libraries and install destination
# identical across all targets.
foreach(example IN ITEMS int_output output preallocated)
  add_executable(${example} ${example}.cpp)
  target_compile_features(${example} PRIVATE cxx_std_17)
  target_compile_options(${example} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
  target_link_libraries(${example} PRIVATE cudf::cudf nvToolsExt)
  install(TARGETS ${example} DESTINATION bin/examples/libcudf)
endforeach()

# sample input consumed by the examples
install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv DESTINATION bin/examples/libcudf)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# libcudf C++ examples using string transforms
2+
3+
These C++ examples demonstrate using the libcudf transform API to access and create
4+
strings columns.
5+
6+
Each example loads a CSV file, produces a transformed column from the values in the loaded table, and writes that column to an output CSV file.
7+
8+
The following examples are included:
9+
1. Using a transform to perform a fused checksum on two columns
10+
2. Using a transform to get a substring from a kernel
11+
3. Using a transform kernel to output a string to a pre-allocated buffer
12+
13+
14+
## Compile and execute
15+
16+
```bash
17+
# Configure project
18+
cmake -S . -B build/
19+
# Build
20+
cmake --build build/ --parallel $PARALLEL_LEVEL
21+
# Execute
22+
build/output info.csv output.csv
23+
--OR--
24+
build/preallocated info.csv output.csv
25+
```
26+
27+
If your machine does not come with a pre-built libcudf binary, expect the
28+
first build to take some time, as it would build libcudf on the host machine.
29+
It may be sped up by configuring the proper `PARALLEL_LEVEL` number.
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cudf/column/column.hpp>
19+
#include <cudf/column/column_view.hpp>
20+
#include <cudf/io/csv.hpp>
21+
#include <cudf/io/datasource.hpp>
22+
#include <cudf/strings/strings_column_view.hpp>
23+
#include <cudf/table/table.hpp>
24+
#include <cudf/table/table_view.hpp>
25+
26+
#include <rmm/cuda_device.hpp>
27+
#include <rmm/mr/device/cuda_memory_resource.hpp>
28+
#include <rmm/mr/device/device_memory_resource.hpp>
29+
#include <rmm/mr/device/owning_wrapper.hpp>
30+
#include <rmm/mr/device/pool_memory_resource.hpp>
31+
32+
#include <chrono>
33+
#include <iostream>
34+
#include <memory>
35+
#include <string>
36+
#include <vector>
37+
38+
/**
39+
* @brief Main example function; returns the column produced by transforming the table.
40+
*
41+
* @param table Table to be transformed
42+
* @return Transformed result
43+
*/
44+
std::unique_ptr<cudf::column> transform(cudf::table_view const& table);
45+
46+
/**
47+
* @brief Create CUDA memory resource
48+
*/
49+
auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
50+
51+
/**
52+
* @brief Create a pool device memory resource
53+
*/
54+
auto make_pool_mr()
55+
{
56+
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
57+
make_cuda_mr(), rmm::percent_of_free_device_memory(50));
58+
}
59+
60+
/**
61+
* @brief Create memory resource for libcudf functions
62+
*/
63+
std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string const& name)
64+
{
65+
if (name == "pool") { return make_pool_mr(); }
66+
return make_cuda_mr();
67+
}
68+
69+
/**
 * @brief Write a table to a CSV file without a header row
 *
 * @param tbl_view Table to be written
 * @param file_path Path of the CSV file to write
 */
void write_csv(cudf::table_view const& tbl_view, std::string const& file_path)
{
  auto destination = cudf::io::sink_info(file_path);
  auto writer_options =
    cudf::io::csv_writer_options::builder(destination, tbl_view).include_header(false).build();
  cudf::io::write_csv(writer_options);
}
76+
77+
/**
78+
* @brief Main for string transform examples
79+
*
80+
* Command line parameters:
81+
* 1. CSV file name/path
82+
* 2. Out file name/path
83+
* 3. Memory resource (optional): 'pool' or 'cuda'
84+
*
85+
* The stdout includes the number of rows and columns in the input and the wall time of the transform.
86+
*/
87+
int main(int argc, char const** argv)
88+
{
89+
if (argc < 3) {
90+
std::cout << "required parameters: csv-file-path out-file-path\n";
91+
return 1;
92+
}
93+
94+
auto const mr_name = std::string{argc >= 4 ? std::string(argv[3]) : std::string("cuda")};
95+
auto const out_csv = std::string{argv[2]};
96+
auto const in_csv = std::string{argv[1]};
97+
auto resource = create_memory_resource(mr_name);
98+
cudf::set_current_device_resource(resource.get());
99+
100+
auto const csv_result = [in_csv] {
101+
cudf::io::csv_reader_options in_opts =
102+
cudf::io::csv_reader_options::builder(cudf::io::source_info{in_csv}).header(0);
103+
return cudf::io::read_csv(in_opts).tbl;
104+
}();
105+
auto const csv_table = csv_result->view();
106+
107+
std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns()
108+
<< " columns\n";
109+
110+
auto st = std::chrono::steady_clock::now();
111+
auto result = transform(csv_table);
112+
113+
std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st;
114+
std::cout << "Wall time: " << elapsed.count() << " seconds\n";
115+
116+
std::vector<std::unique_ptr<cudf::column>> table_columns;
117+
table_columns.push_back(std::move(result));
118+
119+
auto out_table = cudf::table(std::move(table_columns));
120+
121+
write_csv(out_table, out_csv);
122+
123+
return 0;
124+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Name,Email,Country Code,Area Code,Phone
2+
John Doe,John.Doe@gmail.com,"1","415","839-2847"
3+
Jane Doe,Jane.Doe@gmail.com,"1","312","492-7365"
4+
Billy Joe,Billy.Joe@gmail.com,"1","617","203-9584"
5+
James James,James.James@yahoo.com,"44","20","7946-1298"
6+
Michael Frederick,Michael.Frederick@yahoo.com,"44","121","238-9432"
7+
Christopher Cheryl,Christopher.Cheryl@yahoo.com,"44","131","657-4821"
8+
Jessica Autumn,Jessica.Autumn@yahoo.com,"33","1","4567-9832"
9+
Matthew Tyrone,Matthew.Tyrone@yahoo.com,"33","4","7892-3321"
10+
Ashley Martha,Ashley.Martha@gmail.com,"33","3","5643-1987"
11+
Jennifer Omar,Jennifer.Omar@outlook.com,"55","11","99876-5432"
12+
Joshua Lydia,Joshua.Lydia@yahoo.com,"55","21","87654-3210"
13+
Amanda Jerome,Amanda.Jerome@yahoo.com,"55","31","93456-7890"
14+
Daniel Theodore,Daniel.Theodore@gmail.com,"261","20","654-7890"
15+
David Abby,David.Abby@yahoo.com,"261","32","123-4567"
16+
James Neil,James.Neil@yahoo.com,"261","34","987-6543"
17+
Robert Shawna,Robert.Shawna@gmail.com,"212","5","3245-9876"
18+
John Sierra,John.Sierra@gmail.com,"212","6","4567-1234"
19+
Joseph Nina,Joseph.Nina@gmail.com,"212","7","8912-3456"
20+
Andrew Tammy,Andrew.Tammy@gmail.com,"44","113","204-8392"
21+
Ryan Nikki,Ryan.Nikki@yahoo.com,"33","2","6789-1234"
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "common.hpp"
18+
19+
#include <cudf/table/table_view.hpp>
20+
#include <cudf/transform.hpp>
21+
22+
#include <rmm/cuda_stream_view.hpp>
23+
24+
/**
 * @brief Computes a 16-bit checksum per row from the first two columns of the table.
 *
 * The UDF below is handed to `cudf::transform` as source text. For every row it computes a
 * Fletcher-16 checksum over the bytes of each of its two string inputs and XORs the two
 * checksums, so both strings are consumed by a single transform call.
 *
 * @param table Input table; columns 0 and 1 are passed to the UDF as its `name` and `email`
 * string inputs
 * @return Column of type UINT16 produced by the transform
 */
std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
{
  auto stream = rmm::cuda_stream_default;
  auto mr     = cudf::get_current_device_resource_ref();

  // Each byte is widened through `unsigned char` before it is summed. `str.data()[i]` is a
  // plain `char`; where `char` is signed, bytes >= 0x80 (any non-ASCII UTF-8 byte) would be
  // negative, drive the `% 255` result negative and corrupt the checksum.
  auto udf = R"***(
__device__ void checksum(uint16_t* out,
                         cudf::string_view const name,
                         cudf::string_view const email)
{
  auto fletcher16 = [](cudf::string_view str) -> uint16_t {
    uint16_t sum1 = 0;
    uint16_t sum2 = 0;
    for (cudf::size_type i = 0; i < str.size_bytes(); ++i) {
      sum1 = (sum1 + static_cast<unsigned char>(str.data()[i])) % 255;
      sum2 = (sum2 + sum1) % 255;
    }
    return (sum2 << 8) | sum1;
  };
  *out = fletcher16(name) ^ fletcher16(email);
}
)***";

  // NOTE(review): `false` and `std::nullopt` are presumably the is_ptx / user_data arguments
  // of cudf::transform -- confirm against the cudf::transform declaration in use.
  return cudf::transform({table.column(0), table.column(1)},
                         udf,
                         cudf::data_type{cudf::type_id::UINT16},
                         false,
                         std::nullopt,
                         stream,
                         mr);
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "common.hpp"
18+
19+
#include <cudf/column/column_factories.hpp>
20+
#include <cudf/table/table_view.hpp>
21+
#include <cudf/transform.hpp>
22+
23+
#include <rmm/cuda_stream_view.hpp>
24+
25+
/**
 * @brief Produces a strings column holding the part of each email that follows the '@'.
 *
 * The UDF below is handed to `cudf::transform` as source text. For every row it looks for '@'
 * in the email input; when it is found the output is the substring after it, otherwise the
 * output is the `alt` placeholder string.
 *
 * @param table Input table; column 1 is passed to the UDF as its `email` input
 * @return Column of type STRING produced by the transform
 */
std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
{
  auto stream = rmm::cuda_stream_default;
  auto mr     = cudf::get_current_device_resource_ref();

  // a column with size 1 is considered a scalar
  auto placeholder = cudf::make_column_from_scalar(
    cudf::string_scalar(cudf::string_view{"(unknown)", 9}, true, stream, mr), 1, stream, mr);

  auto kernel_source = R"***(
__device__ void email_provider(cudf::string_view* out,
                               cudf::string_view const email,
                               cudf::string_view const alt)
{
  auto pos = email.find('@');

  if (pos == cudf::string_view::npos) {
    *out = alt;
    return;
  }

  auto provider_begin = pos + 1;
  auto provider       = email.substr(provider_begin, email.length() - provider_begin);

  *out = provider;
}
)***";

  // UDF inputs, in order: the email column followed by the one-row placeholder column
  std::vector<cudf::column_view> const inputs{table.column(1), placeholder->view()};
  auto const output_type = cudf::data_type{cudf::type_id::STRING};

  return cudf::transform(inputs, kernel_source, output_type, false, std::nullopt, stream, mr);
}

0 commit comments

Comments
 (0)