diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 25984df1b60..b96e7997871 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 # libcudf examples build script
 
@@ -58,6 +58,7 @@ build_example() {
 
 build_example basic
 build_example strings
+build_example string_transforms
 build_example nested_types
 build_example parquet_io
 build_example billion_rows
diff --git a/cpp/examples/string_transforms/CMakeLists.txt b/cpp/examples/string_transforms/CMakeLists.txt
new file mode 100644
index 00000000000..97d454ab3aa
--- /dev/null
+++ b/cpp/examples/string_transforms/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
+
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(string_transforms_examples)
+
+project(
+  string_transforms_examples
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
+list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
+
+# One executable per example; all three are compiled, linked and installed the same way.
+add_executable(int_output int_output.cpp)
+target_compile_features(int_output PRIVATE cxx_std_17)
+target_compile_options(int_output PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(int_output PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
+install(TARGETS int_output DESTINATION bin/examples/libcudf)
+
+add_executable(output output.cpp)
+target_compile_features(output PRIVATE cxx_std_17)
+target_compile_options(output PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(output PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
+install(TARGETS output DESTINATION bin/examples/libcudf)
+
+add_executable(preallocated preallocated.cpp)
+target_compile_features(preallocated PRIVATE cxx_std_17)
+target_compile_options(preallocated PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(preallocated PRIVATE cudf::cudf nvtx3::nvtx3-cpp)
+install(TARGETS preallocated DESTINATION bin/examples/libcudf)
+
+# Sample input consumed by the examples.
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/string_transforms/README.md b/cpp/examples/string_transforms/README.md
new file mode 100644
index 00000000000..349614fd188
--- /dev/null
+++ b/cpp/examples/string_transforms/README.md
@@ -0,0 +1,32 @@
+# libcudf C++ examples using string transforms
+
+This C++ example demonstrates using the libcudf transform API to access and create
+strings columns.
+
+Each example loads a CSV file, produces a single transformed column from the values in the
+table, and writes that column to an output CSV file.
+
+The following examples are included:
+1. `int_output`: using a transform to perform a fused checksum on two columns
+2. `output`: using a transform to get a substring from a kernel
+3. `preallocated`: using a transform kernel to output a string to a pre-allocated buffer
+
+
+## Compile and execute
+
+```bash
+# Configure project
+cmake -S . -B build/
+# Build
+cmake --build build/ --parallel $PARALLEL_LEVEL
+# Execute: <example> <input csv> <output csv> [cuda|pool]
+build/int_output info.csv int_output.csv
+# --OR--
+build/output info.csv output.csv
+# --OR--
+build/preallocated info.csv preallocated.csv
+```
+
+If your machine does not come with a pre-built libcudf binary, expect the
+first build to take some time, as it would build libcudf on the host machine.
+It may be sped up by configuring the proper `PARALLEL_LEVEL` number.
diff --git a/cpp/examples/string_transforms/common.hpp b/cpp/examples/string_transforms/common.hpp
new file mode 100644
index 00000000000..1486e8f4467
--- /dev/null
+++ b/cpp/examples/string_transforms/common.hpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+/**
+ * @brief Main example function returns transformed string table.
+ *
+ * Only declared here; each example source file that includes this header supplies the
+ * definition.
+ *
+ * @param table Table to be transformed
+ * @return Transformed result
+ */
+std::unique_ptr<cudf::column> transform(cudf::table_view const& table);
+
+// The helpers below are defined in a header, so they are marked `inline` to stay
+// ODR-safe should the header ever be included from more than one translation unit.
+
+/**
+ * @brief Create CUDA memory resource
+ *
+ * @return Shared owner of a new `rmm::mr::cuda_memory_resource`
+ */
+inline auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+/**
+ * @brief Create a pool device memory resource
+ *
+ * The pool is layered on top of a CUDA memory resource and is sized from 50% of the
+ * device memory that is free at the time of the call.
+ *
+ * @return Shared owner of the pool, which also keeps its upstream resource alive
+ */
+inline auto make_pool_mr()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_cuda_mr(), rmm::percent_of_free_device_memory(50));
+}
+
+/**
+ * @brief Create memory resource for libcudf functions
+ *
+ * @param name "pool" selects the pool resource; any other value selects the CUDA resource
+ * @return Shared owner of the selected memory resource
+ */
+inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+  std::string const& name)
+{
+  if (name == "pool") { return make_pool_mr(); }
+  return make_cuda_mr();
+}
+
+/**
+ * @brief Write a table to a CSV file without a header row
+ *
+ * @param tbl_view Table to write
+ * @param file_path Path of the CSV file to create
+ */
+inline void write_csv(cudf::table_view const& tbl_view, std::string const& file_path)
+{
+  auto sink_info = cudf::io::sink_info(file_path);
+  auto builder = cudf::io::csv_writer_options::builder(sink_info, tbl_view).include_header(false);
+  auto options = builder.build();
+  cudf::io::write_csv(options);
+}
+
+/**
+ * @brief Main for string transform examples
+ *
+ * Command line parameters:
+ * 1. CSV file name/path
+ * 2. Out file name/path
+ * 3. Memory resource (optional): 'pool' or 'cuda'
+ *
+ * The stdout includes the number of rows and columns in the input and the wall time
+ * taken by the transform. The transformed column is written to the out file.
+ */
+int main(int argc, char const** argv)
+{
+  // Both the input and the output path are mandatory.
+  if (argc < 3) {
+    std::cout << "required parameters: csv-file-path out-file-path\n";
+    return 1;
+  }
+
+  auto const in_csv  = std::string{argv[1]};
+  auto const out_csv = std::string{argv[2]};
+  // The memory resource argument is optional and defaults to "cuda".
+  auto const mr_name = std::string{argc >= 4 ? argv[3] : "cuda"};
+
+  // Only the raw pointer is handed to libcudf, so `resource` has to stay alive (it is
+  // declared first, hence destroyed last) for as long as device memory is in use.
+  auto resource = create_memory_resource(mr_name);
+  cudf::set_current_device_resource(resource.get());
+
+  // Read the whole CSV file, treating the first row as the header.
+  auto const csv_result = [&in_csv] {
+    cudf::io::csv_reader_options in_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{in_csv}).header(0);
+    return cudf::io::read_csv(in_opts).tbl;
+  }();
+  auto const csv_table = csv_result->view();
+
+  std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns()
+            << " columns\n";
+
+  // Time only the example-specific transform.
+  auto st     = std::chrono::steady_clock::now();
+  auto result = transform(csv_table);
+
+  std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st;
+  std::cout << "Wall time: " << elapsed.count() << " seconds\n";
+
+  // Wrap the single result column in a table so it can be written as CSV.
+  std::vector<std::unique_ptr<cudf::column>> table_columns;
+  table_columns.push_back(std::move(result));
+
+  auto out_table = cudf::table(std::move(table_columns));
+
+  write_csv(out_table, out_csv);
+
+  return 0;
+}
diff --git a/cpp/examples/string_transforms/info.csv b/cpp/examples/string_transforms/info.csv
new file mode 100644
index 00000000000..cea3d7d7e7a
--- /dev/null
+++ b/cpp/examples/string_transforms/info.csv
@@ -0,0 +1,21 @@
+Name,Email,Country Code,Area Code,Phone
+John Doe,John.Doe@gmail.com,"1","415","839-2847"
+Jane Doe,Jane.Doe@gmail.com,"1","312","492-7365"
+Billy Joe,Billy.Joe@gmail.com,"1","617","203-9584"
+James James,James.James@yahoo.com,"44","20","7946-1298"
+Michael Frederick,Michael.Frederick@yahoo.com,"44","121","238-9432"
+Christopher Cheryl,Christopher.Cheryl@yahoo.com,"44","131","657-4821"
+Jessica Autumn,Jessica.Autumn@yahoo.com,"33","1","4567-9832"
+Matthew Tyrone,Matthew.Tyrone@yahoo.com,"33","4","7892-3321"
+Ashley 
Martha,Ashley.Martha@gmail.com,"33","3","5643-1987" +Jennifer Omar,Jennifer.Omar@outlook.com,"55","11","99876-5432" +Joshua Lydia,Joshua.Lydia@yahoo.com,"55","21","87654-3210" +Amanda Jerome,Amanda.Jerome@yahoo.com,"55","31","93456-7890" +Daniel Theodore,Daniel.Theodore@gmail.com,"261","20","654-7890" +David Abby,David.Abby@yahoo.com,"261","32","123-4567" +James Neil,James.Neil@yahoo.com,"261","34","987-6543" +Robert Shawna,Robert.Shawna@gmail.com,"212","5","3245-9876" +John Sierra,John.Sierra@gmail.com,"212","6","4567-1234" +Joseph Nina,Joseph.Nina@gmail.com,"212","7","8912-3456" +Andrew Tammy,Andrew.Tammy@gmail.com,"44","113","204-8392" +Ryan Nikki,Ryan.Nikki@yahoo.com,"33","2","6789-1234" diff --git a/cpp/examples/string_transforms/int_output.cpp b/cpp/examples/string_transforms/int_output.cpp new file mode 100644 index 00000000000..680dc6a1a9b --- /dev/null +++ b/cpp/examples/string_transforms/int_output.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "common.hpp"
+
+#include <cudf/transform.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+/**
+ * @brief Compute a fused 16-bit checksum over two string columns.
+ *
+ * For every row, a Fletcher-16 checksum is computed for column 0 and for column 1 of
+ * `table` (the `name` and `email` arguments of the UDF) and the two checksums are XOR-ed
+ * into a single UINT16 value.
+ *
+ * @param table Table to be transformed; columns 0 and 1 are read
+ * @return UINT16 column with one checksum per row
+ */
+std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
+{
+  auto stream = rmm::cuda_stream_default;
+  auto mr     = cudf::get_current_device_resource_ref();
+
+  auto udf = R"***(
+    __device__ void checksum(uint16_t* out,
+                             cudf::string_view const name,
+                             cudf::string_view const email)
+    {
+      auto fletcher16 = [](cudf::string_view str) -> uint16_t {
+        uint16_t sum1 = 0;
+        uint16_t sum2 = 0;
+        for (cudf::size_type i = 0; i < str.size_bytes(); ++i) {
+          // Read each byte as unsigned: plain `char` may be signed, which would turn
+          // bytes >= 0x80 (any non-ASCII UTF-8) negative and corrupt the modular sums.
+          sum1 = (sum1 + static_cast<unsigned char>(str.data()[i])) % 255;
+          sum2 = (sum2 + sum1) % 255;
+        }
+        return (sum2 << 8) | sum1;
+      };
+      *out = fletcher16(name) ^ fletcher16(email);
+    }
+  )***";
+
+  // NOTE(review): `false` is presumably the `is_ptx` flag (UDF given as CUDA source) and
+  // `std::nullopt` means no user data is passed to the UDF; confirm against cudf::transform.
+  return cudf::transform({table.column(0), table.column(1)},
+                         udf,
+                         cudf::data_type{cudf::type_id::UINT16},
+                         false,
+                         std::nullopt,
+                         stream,
+                         mr);
+}
diff --git a/cpp/examples/string_transforms/output.cpp b/cpp/examples/string_transforms/output.cpp
new file mode 100644
index 00000000000..aef40c8b237
--- /dev/null
+++ b/cpp/examples/string_transforms/output.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/transform.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+/**
+ * @brief Extract the provider part of each e-mail address.
+ *
+ * For every row of column 1 of `table`, the output is the text that follows the first
+ * '@'. Rows that contain no '@' produce the fallback string "(unknown)".
+ *
+ * @param table Table to be transformed; column 1 is read
+ * @return STRING column with one provider (or the fallback) per row
+ */
+std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
+{
+  auto stream = rmm::cuda_stream_default;
+  auto mr     = cudf::get_current_device_resource_ref();
+
+  auto udf = R"***(
+    __device__ void email_provider(cudf::string_view* out,
+                                   cudf::string_view const email,
+                                   cudf::string_view const alt)
+    {
+      auto pos = email.find('@');
+
+      if (pos == cudf::string_view::npos) {
+        *out = alt;
+        return;
+      }
+
+      auto provider_begin = pos + 1;
+      auto provider       = email.substr(provider_begin, email.length() - provider_begin);
+
+      *out = provider;
+    }
+  )***";
+
+  // a column with size 1 is considered a scalar
+  // (9 is the byte length of the "(unknown)" literal)
+  auto alt = cudf::make_column_from_scalar(
+    cudf::string_scalar(cudf::string_view{"(unknown)", 9}, true, stream, mr), 1, stream, mr);
+
+  // NOTE(review): `false` is presumably the `is_ptx` flag (UDF given as CUDA source) and
+  // `std::nullopt` means no user data is passed to the UDF; confirm against cudf::transform.
+  return cudf::transform({table.column(1), *alt},
+                         udf,
+                         cudf::data_type{cudf::type_id::STRING},
+                         false,
+                         std::nullopt,
+                         stream,
+                         mr);
+}
diff --git a/cpp/examples/string_transforms/preallocated.cpp b/cpp/examples/string_transforms/preallocated.cpp
new file mode 100644
index 00000000000..22ced8d9170
--- /dev/null
+++ b/cpp/examples/string_transforms/preallocated.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <cstddef>
+
+/**
+ * @brief Format phone numbers into strings written to a pre-allocated scratch buffer.
+ *
+ * Columns 2, 3 and 4 of `table` (the UDF's `country_code`, `area_code` and
+ * `phone_number`) are joined as "+<country>-<area>-<phone>". Every row owns a fixed
+ * slice of `maximum_size` bytes in a scratch buffer; a component that does not fit in
+ * the bytes left of that slice is skipped entirely, never partially written.
+ *
+ * @param table Table to be transformed; columns 2, 3 and 4 are read
+ * @return STRING column with one formatted phone number per row
+ */
+std::unique_ptr<cudf::column> transform(cudf::table_view const& table)
+{
+  auto stream = rmm::cuda_stream_default;
+  auto mr     = cudf::get_current_device_resource_ref();
+
+  /// Convert a phone number to E.164 international phone format
+  /// (https://en.wikipedia.org/wiki/E.164)
+  auto udf = R"***(
+__device__ void e164_format(void* scratch,
+                            cudf::size_type row,
+                            cudf::string_view* out,
+                            cudf::string_view const country_code,
+                            cudf::string_view const area_code,
+                            cudf::string_view const phone_number,
+                            [[maybe_unused]] int32_t scratch_size)
+{
+  auto const begin = static_cast<char*>(scratch) +
+                     static_cast<size_t>(row) * static_cast<size_t>(scratch_size);
+  auto const end = begin + scratch_size;
+  auto it        = begin;
+
+  auto push = [&](cudf::string_view str) {
+    auto const size = str.size_bytes();
+
+    if ((it + size) > end) { return; }
+
+    memcpy(it, str.data(), size);
+    it += size;
+  };
+
+  push(cudf::string_view{"+", 1});
+  push(country_code);
+  push(cudf::string_view{"-", 1});
+  push(area_code);
+  push(cudf::string_view{"-", 1});
+  push(phone_number);
+
+  *out = cudf::string_view{begin, static_cast<cudf::size_type>(it - begin)};
+}
+  )***";
+
+  // Number of scratch bytes reserved for each output row.
+  constexpr cudf::size_type maximum_size = 20;
+  auto const num_rows                    = table.num_rows();
+
+  // Widen before multiplying: both operands are 32-bit `cudf::size_type`, so the product
+  // would overflow for large tables if it were computed in that type.
+  rmm::device_uvector<char> scratch(
+    static_cast<std::size_t>(maximum_size) * static_cast<std::size_t>(num_rows), stream, mr);
+
+  // a column with size 1 is considered a scalar
+  auto size = cudf::make_column_from_scalar(
+    cudf::numeric_scalar<int32_t>(maximum_size, true, stream, mr), 1, stream, mr);
+
+  // The scratch pointer is handed to the UDF as its leading `scratch` argument.
+  return cudf::transform({table.column(2), table.column(3), table.column(4), *size},
+                         udf,
+                         cudf::data_type{cudf::type_id::STRING},
+                         false,
+                         scratch.data(),
+                         stream,
+                         mr);
+}