Skip to content

String Transform Examples #18616

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 98 commits into
base: branch-25.06
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 93 commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
211e9a2
initial changes
lamarrr Mar 24, 2025
4b02b11
removed jit::column_device_view and renamed CUDF_JIT_UDF to CUDF_RUNT…
lamarrr Mar 26, 2025
12f4a46
updated tests
lamarrr Mar 26, 2025
9c0ab88
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Mar 26, 2025
88e5c87
added todo
lamarrr Mar 26, 2025
2025110
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Mar 26, 2025
9964234
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Mar 31, 2025
53062b0
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Mar 31, 2025
c00444d
added string find test
lamarrr Apr 2, 2025
61dcd4d
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 2, 2025
8ed0029
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 3, 2025
2c59065
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 3, 2025
f1a11d8
initial prototype
lamarrr Apr 4, 2025
ae4a47a
update
lamarrr Apr 10, 2025
4a4ef09
added raw_device_column_view
lamarrr Apr 10, 2025
4b2c18c
update
lamarrr Apr 10, 2025
0eff98c
update
lamarrr Apr 10, 2025
8008826
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 10, 2025
aa5684f
updated doc
lamarrr Apr 10, 2025
7f02f32
Merge branch 'column-device-view-refactor' of https://github.com/lama…
lamarrr Apr 10, 2025
f6d764b
Merge branch 'column-device-view-refactor' into string-output
lamarrr Apr 11, 2025
f9dc7ce
update
lamarrr Apr 14, 2025
315545d
moved and renamed raw_column_device_view
lamarrr Apr 14, 2025
1163d7c
formatting and doc update
lamarrr Apr 14, 2025
785a415
moved get_mask_offset_word
lamarrr Apr 15, 2025
fb01e05
Merge branch 'column-device-view-refactor' into string-output
lamarrr Apr 15, 2025
065d19e
update
lamarrr Apr 15, 2025
3539622
renamed column offsets index
lamarrr Apr 15, 2025
e7b5585
Merge remote-tracking branch 'upstream/branch-25.06' into column-devi…
lamarrr Apr 15, 2025
046d8be
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 15, 2025
90f9e74
code review changes
lamarrr Apr 17, 2025
9fef854
Merge branch 'column-device-view-refactor' of https://github.com/lama…
lamarrr Apr 17, 2025
d0fe4b0
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 17, 2025
e0683cd
updated API
lamarrr Apr 17, 2025
75d76f0
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 18, 2025
f80ccc2
Merge branch 'column-device-view-refactor' into string-output
lamarrr Apr 18, 2025
7c0f63c
refactoring + added buffer_string abstraction
lamarrr Apr 18, 2025
114ebd7
Merge branch 'branch-25.06' into column-device-view-refactor
lamarrr Apr 19, 2025
03cc9b7
Merge branch 'column-device-view-refactor' into string-output
lamarrr Apr 19, 2025
ff620f1
Merge remote-tracking branch 'upstream/branch-25.06' into string-output
lamarrr Apr 19, 2025
e1f4061
update
lamarrr Apr 20, 2025
23c513e
removed buffer_string
lamarrr Apr 22, 2025
a6a6b27
removed buffer_string
lamarrr Apr 22, 2025
68f6266
fixed warnings
lamarrr Apr 22, 2025
4c8d1c9
fixed tests
lamarrr Apr 22, 2025
987ae98
code review changes
lamarrr Apr 29, 2025
05caf59
code review changes
lamarrr Apr 30, 2025
977b4f6
Merge remote-tracking branch 'upstream/branch-25.06' into string-output
lamarrr Apr 30, 2025
0cd1152
initial commit
lamarrr May 1, 2025
e698ba5
update
lamarrr May 1, 2025
b6ddf7b
bug fixes
lamarrr May 1, 2025
4ebcb53
removed indexing from user_data_accessor
lamarrr May 6, 2025
edf95da
Merge branch 'branch-25.06' into string-output
lamarrr May 6, 2025
3200792
fixed lint error
lamarrr May 6, 2025
a58631d
Merge branch 'string-output' of https://github.com/lamarrr/cudf into …
lamarrr May 6, 2025
3761302
Merge branch 'branch-25.06' into string-output
lamarrr May 7, 2025
48c60fc
Merge branch 'branch-25.06' into string-output
lamarrr May 7, 2025
a714483
Merge branch 'branch-25.06' into string-output
lamarrr May 7, 2025
cf52484
removed user_data_accessor
lamarrr May 8, 2025
493de47
Merge branch 'string-output' of https://github.com/lamarrr/cudf into …
lamarrr May 8, 2025
d7a4217
Merge branch 'branch-25.06' into string-output
lamarrr May 12, 2025
5ac9998
added row indices for user_data
lamarrr May 12, 2025
d354792
Update cpp/src/transform/transform.cpp
lamarrr May 12, 2025
a186a80
Update cpp/src/transform/transform.cpp
lamarrr May 12, 2025
2532323
Update cpp/src/transform/transform.cpp
lamarrr May 12, 2025
328779c
disabled null input support + formatting
lamarrr May 12, 2025
f8ab0ec
removed nullmask copying
lamarrr May 12, 2025
19dc7ab
doc update + added assertion tests + added back nullmask copying
lamarrr May 12, 2025
a840126
updated doxygen and test
lamarrr May 12, 2025
7b70b4c
Merge branch 'branch-25.06' into string-output
lamarrr May 12, 2025
41e28ae
updated test
lamarrr May 12, 2025
a41d7db
Merge branch 'string-output' of https://github.com/lamarrr/cudf into …
lamarrr May 12, 2025
9d8d230
Merge branch 'branch-25.06' into string-output
lamarrr May 13, 2025
17ab71b
Merge branch 'branch-25.06' into string-output
lamarrr May 14, 2025
231e0d3
Merge branch 'string-output' into string-jit-examples
lamarrr May 14, 2025
764daf3
deleted checksum example
lamarrr May 14, 2025
c4932fc
removed int_output and updated examples
lamarrr May 14, 2025
04fe77f
added utf-8 text to stringoutput test
lamarrr May 14, 2025
2c21db9
Merge branch 'branch-25.06' into string-output
lamarrr May 14, 2025
05250bf
removed null tests in java transforms test
lamarrr May 14, 2025
649863e
Merge branch 'string-output' of https://github.com/lamarrr/cudf into …
lamarrr May 14, 2025
0ba2305
Merge branch 'string-output' into string-jit-examples
lamarrr May 15, 2025
92fc387
Merge branch 'branch-25.06' into string-jit-examples
lamarrr May 15, 2025
5c676a8
removed int_output and updated examples
lamarrr May 15, 2025
7ac0022
Merge branch 'string-jit-examples' of https://github.com/lamarrr/cudf…
lamarrr May 15, 2025
faf5573
Merge branch 'branch-25.06' into string-jit-examples
lamarrr May 15, 2025
46d1d25
formatting
lamarrr May 15, 2025
221443b
Merge branch 'string-jit-examples' of https://github.com/lamarrr/cudf…
lamarrr May 15, 2025
d5a351a
updated examples
lamarrr May 17, 2025
2ea0bf7
pre-commit
lamarrr May 17, 2025
4541851
Update cpp/examples/string_transforms/README.md
lamarrr May 19, 2025
668b25b
Merge branch 'branch-25.06' into string-jit-examples
lamarrr May 19, 2025
2b1b4cb
Update cpp/examples/string_transforms/common.hpp
lamarrr May 19, 2025
1110062
Update cpp/examples/string_transforms/preallocated.cpp
lamarrr May 19, 2025
47472cb
update
lamarrr May 19, 2025
fc01e2d
update
lamarrr May 19, 2025
a87e3a3
update
lamarrr May 19, 2025
2381d60
Merge branch 'branch-25.06' into string-jit-examples
lamarrr May 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/examples/build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.

# libcudf examples build script

Expand Down Expand Up @@ -58,6 +58,7 @@ build_example() {

build_example basic
build_example strings
build_example string_transforms
build_example nested_types
build_example parquet_io
build_example billion_rows
Expand Down
41 changes: 41 additions & 0 deletions cpp/examples/string_transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2025, NVIDIA CORPORATION.

cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)

include(../set_cuda_architecture.cmake)

# initialize cuda architecture
rapids_cuda_init_architectures(string_transforms_examples)

project(
  string_transforms_examples
  VERSION 0.0.1
  LANGUAGES CXX CUDA
)

include(../fetch_dependencies.cmake)

include(rapids-cmake)
rapids_cmake_build_type("Release")

# Extra flags applied only to CUDA sources of the example targets below
list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

# Every example links the same cudf and NVTX3 targets so the three executables build identically.

# Example: transform producing an integer (checksum) column from string inputs
add_executable(int_output int_output.cpp)
target_compile_features(int_output PRIVATE cxx_std_17)
target_compile_options(int_output PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
target_link_libraries(int_output PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
install(TARGETS int_output DESTINATION bin/examples/libcudf)

# Example: transform producing a strings column
add_executable(output output.cpp)
target_compile_features(output PRIVATE cxx_std_17)
target_compile_options(output PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
target_link_libraries(output PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
install(TARGETS output DESTINATION bin/examples/libcudf)

# Example: transform writing strings into a pre-allocated buffer
add_executable(preallocated preallocated.cpp)
target_compile_features(preallocated PRIVATE cxx_std_17)
target_compile_options(preallocated PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
target_link_libraries(preallocated PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
install(TARGETS preallocated DESTINATION bin/examples/libcudf)

# Sample input consumed by the examples
install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv DESTINATION bin/examples/libcudf)
29 changes: 29 additions & 0 deletions cpp/examples/string_transforms/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# libcudf C++ examples using string transforms

These C++ examples demonstrate using the libcudf transform API to access and create
strings columns.

Each example loads a CSV file into a table, produces a transformed column from the values in that table, and writes the result to an output CSV file.

The following examples are included:
1. Using a transform to perform a fused checksum on two columns
2. Using a transform to get a substring from a kernel
3. Using a transform kernel to output a string to a pre-allocated buffer


## Compile and execute

```bash
# Configure project
cmake -S . -B build/
# Build
cmake --build build/ --parallel $PARALLEL_LEVEL
# Execute
build/output info.csv output.csv
--OR--
build/preallocated info.csv output.csv
```

If your machine does not come with a pre-built libcudf binary, expect the
first build to take some time, as it will build libcudf on the host machine.
The build may be sped up by setting `PARALLEL_LEVEL` to an appropriate value.
124 changes: 124 additions & 0 deletions cpp/examples/string_transforms/common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/io/datasource.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <chrono>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

/**
* @brief Main example function returns transformed string table.
*
* @param table Table to be transformed
* @return Transformed string column
*/
std::unique_ptr<cudf::column> transform(cudf::table_view const& table);

/**
* @brief Create CUDA memory resource
*/
auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

/**
 * @brief Build a pool memory resource on top of a CUDA memory resource
 *
 * The pool is constructed with `rmm::percent_of_free_device_memory(50)` as its size argument.
 *
 * @return Owning wrapper that keeps the upstream CUDA resource alive together with the pool
 */
auto make_pool_mr()
{
  auto const cuda_upstream = make_cuda_mr();
  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
    cuda_upstream, rmm::percent_of_free_device_memory(50));
}

/**
 * @brief Select the memory resource used by libcudf functions
 *
 * @param name "pool" selects the pool resource; any other value selects the plain CUDA resource
 * @return Shared pointer to the chosen device memory resource
 */
std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string const& name)
{
  bool const wants_pool = (name == "pool");
  if (!wants_pool) { return make_cuda_mr(); }
  return make_pool_mr();
}

/**
 * @brief Write a table to a CSV file without a header row
 *
 * @param tbl_view Table to write
 * @param file_path Path of the CSV file to write to
 */
void write_csv(cudf::table_view const& tbl_view, std::string const& file_path)
{
  auto const destination = cudf::io::sink_info(file_path);
  auto const writer_opts =
    cudf::io::csv_writer_options::builder(destination, tbl_view).include_header(false).build();
  cudf::io::write_csv(writer_opts);
}

/**
 * @brief Entry point shared by the string transform examples
 *
 * Command line parameters:
 *  1. CSV file name/path (input)
 *  2. Out file name/path (the transformed column is written here as CSV)
 *  3. Memory resource (optional): 'pool' or 'cuda' (defaults to 'cuda')
 *
 * The stdout includes the number of rows and columns in the input and the wall time
 * measured around the call to `transform()`.
 *
 * @return 0 on success, 1 when the required parameters are missing
 */
int main(int argc, char const** argv)
{
  if (argc < 3) {
    // Usage errors go to stderr so they do not mix with the regular report on stdout.
    std::cerr << "required parameters: csv-file-path out-file-path\n";
    return 1;
  }

  auto const in_csv  = std::string{argv[1]};
  auto const out_csv = std::string{argv[2]};
  auto const mr_name = std::string{argc >= 4 ? argv[3] : "cuda"};

  // Declared before any table/column so that it is destroyed after the data allocated from it.
  auto resource = create_memory_resource(mr_name);
  cudf::set_current_device_resource(resource.get());

  // Read the input CSV, treating its first row as the header.
  auto const csv_result = [&in_csv] {
    cudf::io::csv_reader_options in_opts =
      cudf::io::csv_reader_options::builder(cudf::io::source_info{in_csv}).header(0);
    return cudf::io::read_csv(in_opts).tbl;
  }();
  auto const csv_table = csv_result->view();

  std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns()
            << " columns\n";

  // NOTE(review): no explicit stream synchronization is done before the clock is read, so
  // any still-pending asynchronous device work would not be included in the timing - confirm.
  auto const start = std::chrono::steady_clock::now();
  auto result      = transform(csv_table);

  std::chrono::duration<double> const elapsed = std::chrono::steady_clock::now() - start;
  std::cout << "Wall time: " << elapsed.count() << " seconds\n";

  // Wrap the single result column in a table so it can be handed to the CSV writer.
  std::vector<std::unique_ptr<cudf::column>> table_columns;
  table_columns.push_back(std::move(result));

  auto out_table = cudf::table(std::move(table_columns));

  write_csv(out_table, out_csv);

  return 0;
}
21 changes: 21 additions & 0 deletions cpp/examples/string_transforms/info.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Name,Email,Country Code,Area Code,Phone
John Doe,John.Doe@gmail.com,"1","415","839-2847"
Jane Doe,Jane.Doe@gmail.com,"1","312","492-7365"
Billy Joe,Billy.Joe@gmail.com,"1","617","203-9584"
James James,James.James@yahoo.com,"44","20","7946-1298"
Michael Frederick,Michael.Frederick@yahoo.com,"44","121","238-9432"
Christopher Cheryl,Christopher.Cheryl@yahoo.com,"44","131","657-4821"
Jessica Autumn,Jessica.Autumn@yahoo.com,"33","1","4567-9832"
Matthew Tyrone,Matthew.Tyrone@yahoo.com,"33","4","7892-3321"
Ashley Martha,Ashley.Martha@gmail.com,"33","3","5643-1987"
Jennifer Omar,Jennifer.Omar@outlook.com,"55","11","99876-5432"
Joshua Lydia,Joshua.Lydia@yahoo.com,"55","21","87654-3210"
Amanda Jerome,Amanda.Jerome@yahoo.com,"55","31","93456-7890"
Daniel Theodore,Daniel.Theodore@gmail.com,"261","20","654-7890"
David Abby,David.Abby@yahoo.com,"261","32","123-4567"
James Neil,James.Neil@yahoo.com,"261","34","987-6543"
Robert Shawna,Robert.Shawna@gmail.com,"212","5","3245-9876"
John Sierra,John.Sierra@gmail.com,"212","6","4567-1234"
Joseph Nina,Joseph.Nina@gmail.com,"212","7","8912-3456"
Andrew Tammy,Andrew.Tammy@gmail.com,"44","113","204-8392"
Ryan Nikki,Ryan.Nikki@yahoo.com,"33","2","6789-1234"
54 changes: 54 additions & 0 deletions cpp/examples/string_transforms/int_output.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "common.hpp"

#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <rmm/cuda_stream_view.hpp>

/**
 * @brief Computes a fused 16-bit checksum from the first two columns of the table
 *
 * The UDF computes a Fletcher-16 checksum over the bytes of each input string and XORs the
 * checksums of the two inputs of a row into a single `uint16_t` output value.
 *
 * @param table Table whose columns 0 and 1 are passed to the UDF as strings
 * @return Column of type UINT16 produced by `cudf::transform`
 */
std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
{
  auto stream = rmm::cuda_stream_default;
  auto mr     = cudf::get_current_device_resource_ref();

  // Bytes are read through `unsigned char`: where plain `char` is signed, bytes >= 0x80
  // (any non-ASCII UTF-8 data) would otherwise be added as negative values and corrupt the sums.
  auto udf = R"***(
__device__ void checksum(uint16_t* out,
                         cudf::string_view const name,
                         cudf::string_view const email)
{
  auto fletcher16 = [](cudf::string_view str) -> uint16_t {
    uint16_t sum1 = 0;
    uint16_t sum2 = 0;
    for (cudf::size_type i = 0; i < str.size_bytes(); ++i) {
      sum1 = (sum1 + static_cast<unsigned char>(str.data()[i])) % 255;
      sum2 = (sum2 + sum1) % 255;
    }
    return (sum2 << 8) | sum1;
  };
  *out = fletcher16(name) ^ fletcher16(email);
}
)***";

  return cudf::transform({table.column(0), table.column(1)},
                         udf,
                         cudf::data_type{cudf::type_id::UINT16},
                         false,
                         std::nullopt,
                         stream,
                         mr);
}
59 changes: 59 additions & 0 deletions cpp/examples/string_transforms/output.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "common.hpp"

#include <cudf/column/column_factories.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <rmm/cuda_stream_view.hpp>

/**
 * @brief Extracts the e-mail provider (the text after '@') from column 1 of the table
 *
 * For each row the UDF searches the e-mail string for '@'. When found, the output is the
 * substring following it; otherwise the output is the fallback string "(unknown)".
 *
 * @param table Table whose column 1 is passed to the UDF as the e-mail string
 * @return Column of type STRING produced by `cudf::transform`
 */
std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
{
  auto stream = rmm::cuda_stream_default;
  auto mr     = cudf::get_current_device_resource_ref();

  auto udf = R"***(
__device__ void email_provider(cudf::string_view* out,
                               cudf::string_view const email,
                               cudf::string_view const alt)
{
  auto pos = email.find('@');

  if (pos == cudf::string_view::npos) {
    *out = alt;
    return;
  }

  auto provider_begin = pos + 1;
  auto provider       = email.substr(provider_begin, email.length() - provider_begin);

  *out = provider;
}
)***";

  // Single-row column holding the fallback value passed as the UDF's `alt` argument.
  // Built from a std::string so the byte length is derived from the literal itself
  // instead of being hand-counted next to it.
  auto alt = cudf::make_column_from_scalar(
    cudf::string_scalar(std::string{"(unknown)"}, true, stream), 1, stream);

  return cudf::transform({table.column(1), *alt},
                         udf,
                         cudf::data_type{cudf::type_id::STRING},
                         false,
                         std::nullopt,
                         stream,
                         mr);
}
Loading
Loading