diff --git a/.cspell.json b/.cspell.json index 76e570034d1f8..4d3eb28340131 100644 --- a/.cspell.json +++ b/.cspell.json @@ -4,5 +4,5 @@ "planning/behavior_velocity_planner/autoware_behavior_velocity_intersection_module/scripts/**" ], "ignoreRegExpList": [], - "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV"] + "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV", "CUDAH"] } diff --git a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py index 52a4fdd213ab6..7508214782cc3 100644 --- a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py +++ b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py @@ -48,6 +48,9 @@ def __init__(self, context): context ) self.use_time_series_filter = LaunchConfiguration("use_time_series_filter").perform(context) + self.use_cuda_ground_segmentation = ( + LaunchConfiguration("use_cuda_ground_segmentation").perform(context).lower() == "true" + ) # check if self.use_single_frame_filter is bool if isinstance(self.use_single_frame_filter, str): self.use_single_frame_filter = self.use_single_frame_filter.lower() == "true" @@ -176,7 +179,7 @@ def create_ransac_pipeline(self): name="short_height_obstacle_detection_area_filter", namespace="plane_fitting", remappings=[ - ("input", "concatenated/pointcloud"), + ("input", "concat/pointcloud"), ("output", "detection_area/pointcloud"), ], parameters=[ @@ -535,6 +538,46 @@ def launch_setup(context, *args, **kwargs): pipeline = GroundSegmentationPipeline(context) components = [] + if pipeline.use_cuda_ground_segmentation: + ground_segmentation_node_param = ParameterFile( + param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform( + context + ), + allow_substs=True, + ) + components.append( + ComposableNode( + package="autoware_ground_segmentation_cuda", + plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode", + name="cuda_scan_ground_segmentation_filter", + remappings=[ + ("~/input/pointcloud", "/sensing/lidar/concatenated/pointcloud"), + ("~/input/pointcloud/cuda", "/sensing/lidar/concatenated/pointcloud/cuda"), + ("~/output/pointcloud", "/perception/obstacle_segmentation/pointcloud"), + ( + "~/output/pointcloud/cuda", + "/perception/obstacle_segmentation/pointcloud/cuda", + ), + ( + "~/output/ground_pointcloud", + "/perception/obstacle_segmentation/ground_pointcloud", + ), + ( + "~/output/ground_pointcloud/cuda", + "/perception/obstacle_segmentation/ground_pointcloud/cuda", + ), + ], + parameters=[ground_segmentation_node_param], + extra_arguments=[], + ) + ) + return [ + LoadComposableNodes( + composable_node_descriptions=components, + target_container=LaunchConfiguration("pointcloud_container_name"), + ) + ] + components.extend( pipeline.create_single_frame_obstacle_segmentation_components( input_topic=LaunchConfiguration("input/pointcloud"), @@ -594,6 +637,7 @@ def add_launch_arg(name: str, default_value=None): add_launch_arg("use_intra_process", "True") add_launch_arg("pointcloud_container_name", "pointcloud_container") add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud") + add_launch_arg("use_cuda_ground_segmentation", "False") add_launch_arg( "ogm_outlier_filter_param_path", [ @@ -601,6 
+645,13 @@ def add_launch_arg(name: str, default_value=None): "/config/perception/obstacle_segmentation/occupancy_grid_based_outlier_filter/occupancy_grid_map_outlier_filter.param.yaml", ], ) + add_launch_arg( + "cuda_ground_segmentation_node_param_path", + [ + FindPackageShare("autoware_ground_segmentation_cuda"), + "/config/cuda_scan_ground_segmentation_filter.param.yaml", + ], + ) set_container_executable = SetLaunchConfiguration( "container_executable", diff --git a/launch/tier4_perception_launch/launch/perception.launch.xml b/launch/tier4_perception_launch/launch/perception.launch.xml index 52b00fc22bedc..cde5a75d46108 100644 --- a/launch/tier4_perception_launch/launch/perception.launch.xml +++ b/launch/tier4_perception_launch/launch/perception.launch.xml @@ -149,6 +149,7 @@ + @@ -225,6 +226,7 @@ + diff --git a/perception/autoware_ground_segmentation_cuda/CMakeLists.txt b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt new file mode 100644 index 0000000000000..68df65699c4d3 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt @@ -0,0 +1,131 @@ +cmake_minimum_required(VERSION 3.8) +project(autoware_ground_segmentation_cuda) + +find_package(ament_cmake_auto REQUIRED) +find_package(CUDA) +find_package(agnocastlib) + +if(NOT ${CUDA_FOUND}) + message(WARNING "cuda was not found, so the autoware_ground_segmentation_cuda package will not be built.") + return() +elseif(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -G") +endif() + +if(USE_AGNOCAST AND NOT agnocastlib_FOUND) + message(FATAL_ERROR "agnocastlib is required when USE_AGNOCAST is enabled") +endif() + +ament_auto_find_build_dependencies() + +# Default to C++17 +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function) +endif() + +if(USE_AGNOCAST) + add_definitions(-DUSE_AGNOCAST_ENABLED) +endif() + +if(BUILD_TESTING) + list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() +endif() + +include_directories( + include + SYSTEM + ${CUDA_INCLUDE_DIRS} +) + +# cSpell: ignore expt gencode +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012") +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_75,code=sm_75") +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_86,code=sm_86") +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_87,code=sm_87") +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=sm_89") +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=compute_89") + +#################### cuda_ground_segmentation ################## +cuda_add_library(cuda_ground_segmentation_lib SHARED + src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu +) + +target_link_libraries(cuda_ground_segmentation_lib + ${autoware_pointcloud_preprocessor_TARGETS} + ${autoware_cuda_pointcloud_preprocessor_TARGETS} +) + +target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE + ${autoware_pointcloud_preprocessor_INCLUDE_DIRS} + ${autoware_cuda_pointcloud_preprocessor_INCLUDE_DIRS} + ${autoware_point_types_INCLUDE_DIRS} + ${cuda_blackboard_INCLUDE_DIRS} + ${diagnostic_msgs_INCLUDE_DIRS} + ${geometry_msgs_INCLUDE_DIRS} + ${rclcpp_INCLUDE_DIRS} + ${rclcpp_components_INCLUDE_DIRS} + ${rcl_interfaces_INCLUDE_DIRS} + ${sensor_msgs_INCLUDE_DIRS} 
+ ${tf2_INCLUDE_DIRS} + ${tf2_msgs_INCLUDE_DIRS} + ${autoware_cuda_utils_INCLUDE_DIRS} +) + +if(USE_AGNOCAST) + target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE + ${autoware_agnocast_wrapper_INCLUDE_DIRS} + ${agnocastlib_INCLUDE_DIRS} + ) + target_link_libraries(cuda_ground_segmentation_lib + ${agnocastlib_LIBRARIES} + ) +endif() + + +# Targets +ament_auto_add_library(cuda_ground_segmentation SHARED + src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp +) + +target_link_libraries(cuda_ground_segmentation + ${CUDA_LIBRARIES} + ${diagnostic_msgs_LIBRARIES} + cuda_ground_segmentation_lib +) + +#=========== ScanGround Segmentation filter ======== +rclcpp_components_register_node(cuda_ground_segmentation + PLUGIN "autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode" + EXECUTABLE cuda_scan_ground_segmentation_filter_node) + +################################################################################ +# Install +install(DIRECTORY launch config + DESTINATION share/${PROJECT_NAME} +) + +install( + TARGETS cuda_ground_segmentation_lib + LIBRARY DESTINATION lib +) + +ament_auto_package() + +# Set ROS_DISTRO macros +set(ROS_DISTRO $ENV{ROS_DISTRO}) +if(${ROS_DISTRO} STREQUAL "rolling") + add_compile_definitions(ROS_DISTRO_ROLLING) +elseif(${ROS_DISTRO} STREQUAL "foxy") + add_compile_definitions(ROS_DISTRO_FOXY) +elseif(${ROS_DISTRO} STREQUAL "galactic") + add_compile_definitions(ROS_DISTRO_GALACTIC) +elseif(${ROS_DISTRO} STREQUAL "humble") + add_compile_definitions(ROS_DISTRO_HUMBLE) +endif() diff --git a/perception/autoware_ground_segmentation_cuda/README.md b/perception/autoware_ground_segmentation_cuda/README.md new file mode 100644 index 0000000000000..33218e12fa062 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/README.md @@ -0,0 +1,19 @@ +# autoware_ground_segmentation_cuda + +## Purpose + +The `autoware_ground_segmentation` algorithms have been thoroughly tested with Autoware. However, due to the latency and high computational cost of processing large point clouds, the input point cloud range has been limited by the `crop_box_filter` based on the ego-vehicle's `base_link`. This can cause unwanted object loss, especially ahead of sloped roads. + +![ground_segmentation_pipeline issue](./docs/image/ground_segmentation_issue.png) + +Recently, GPU- and CUDA-accelerated libraries such as [cuda_blackboard](https://github.com/autowarefoundation/cuda_blackboard/blob/1837689df2891f6223f07c178c21aed252566ede/README.md) and accelerated versions of [`autoware_pointcloud_preprocessor`](../../sensing/autoware_cuda_pointcloud_preprocessor/README.md) have been implemented. These can be leveraged to improve the performance of the ground segmentation filter algorithms using CUDA/GPU. + +This package reimplements the current `scan_ground_filter` of the `autoware_ground_segmentation` package to reduce latency and avoid the bottleneck caused by processing a large number of points. + +## Inner-workings / Algorithm + +The detailed algorithm is described in [scan-ground-filter.md](../autoware_ground_segmentation/docs/scan-ground-filter.md).
+ +## Parameters + +{{ json_to_markdown("perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json") }} diff --git a/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml new file mode 100644 index 0000000000000..20c8026798e8d --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml @@ -0,0 +1,19 @@ +/**: + ros__parameters: + global_slope_max_angle_deg: 10.0 + local_slope_max_angle_deg: 25.0 + non_ground_height_threshold: 0.20 + grid_size_m: 0.5 + detection_range_z_max: 3.2 + use_recheck_ground_cluster: true + recheck_start_distance: 20.0 + use_lowest_point: true + center_pcl_shift: 0.0 + sector_angle_deg: 1.0 + gnd_cell_buffer_size: 5 + min_x: -100.0 + max_x: 150.0 + min_y: -70.0 + max_y: 70.0 + max_z: 2.5 + min_z: -2.5 diff --git a/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png new file mode 100644 index 0000000000000..31586172329a7 Binary files /dev/null and b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png differ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp new file mode 100644 index 0000000000000..b22ab4cc9aefd --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp @@ -0,0 +1,38 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_ + +#include + +#include + +#ifndef CUDAH +#define CUDAH __forceinline__ __host__ __device__ +#endif + +#ifndef BLOCK_SIZE_X +#define BLOCK_SIZE_X (256) +#endif + +#ifndef WARP_SIZE +#define WARP_SIZE (32) +#endif + +#ifndef FULL_MASK +#define FULL_MASK (0xFFFFFFFF) +#endif + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp new file mode 100644 index 0000000000000..d58384f9070eb --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp @@ -0,0 +1,83 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
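A hedged sketch of how the WARP_SIZE and FULL_MASK macros defined in cuda_common.hpp above are typically used: a warp-level sum reduction with __shfl_down_sync, the same pattern the segmentCell() kernel later in this change relies on. The function name warp_reduce_sum is illustrative and not part of the package.

// Minimal sketch: sum the per-lane values of one warp using the macros above.
__device__ float warp_reduce_sum(float value)
{
  // Halve the shuffle offset each step: 16, 8, 4, 2, 1.
  for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(FULL_MASK, value, offset);
  }
  // After the loop, lane 0 holds the sum of all 32 lane values.
  return value;
}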
+// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_ + +#include + +#include + +namespace autoware +{ + +class CudaMempool +{ +public: + inline explicit CudaMempool(int device_id = 0, int64_t pool_release_threshold = 0) + { + cudaMemPoolProps pool_props = {}; + + // Check if the device is valid + CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); + + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.location.id = device_id; + pool_props.location.type = cudaMemLocationTypeDevice; + + CHECK_CUDA_ERROR(cudaMemPoolCreate(&pool_, &pool_props)); + + if (pool_release_threshold >= 0) { + CHECK_CUDA_ERROR( + cudaMemPoolSetAttribute(pool_, cudaMemPoolAttrReleaseThreshold, &pool_release_threshold)); + } + } + + CudaMempool(const CudaMempool &) = delete; + inline CudaMempool(CudaMempool && other) : pool_(std::move(other.pool_)) + { + other.pool_ = nullptr; + } + + CudaMempool & operator=(const CudaMempool &) = delete; + inline CudaMempool & operator=(CudaMempool && other) + { + if (this != &other) { + release(); + pool_ = std::move(other.pool_); + other.pool_ = nullptr; + } + + return *this; + } + + inline cudaMemPool_t get() { return pool_; } + + ~CudaMempool() { release(); } + +private: + inline void release() + { + if (pool_) { + CHECK_CUDA_ERROR(cudaMemPoolDestroy(pool_)); + } + + pool_ = nullptr; + } + + cudaMemPool_t pool_; +}; +} // namespace autoware + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp new file mode 100644 index 0000000000000..d7fc8875d61d1 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp @@ -0,0 +1,179 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
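The CudaMempool wrapper above is meant to be paired with the CUDA stream-ordered allocation API. The following is a minimal, hedged sketch of that usage, assuming the CudaStream wrapper from this package and the CHECK_CUDA_ERROR macro from autoware_cuda_utils; the function name and buffer size are illustrative.

#include <cuda_runtime.h>

// Assumed available, as in this package's headers:
//   autoware::CudaMempool, autoware::CudaStream, CHECK_CUDA_ERROR

void scratch_buffer_example(autoware::CudaMempool & pool, autoware::CudaStream & stream)
{
  float * scratch = nullptr;
  const size_t num_elements = 1 << 20;  // illustrative size

  // Stream-ordered allocation served from the wrapped pool.
  CHECK_CUDA_ERROR(cudaMallocFromPoolAsync(
    reinterpret_cast<void **>(&scratch), num_elements * sizeof(float), pool.get(), stream.get()));

  // ... enqueue kernels that use `scratch` on the same stream ...

  // Stream-ordered free: the memory returns to the pool for reuse.
  CHECK_CUDA_ERROR(cudaFreeAsync(scratch, stream.get()));
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream.get()));
}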
+ +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_ + +#include "cuda_common.hpp" +#include "cuda_mempool_wrapper.hpp" +#include "cuda_stream_wrapper.hpp" +#include "device_vector.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace autoware::cuda_ground_segmentation +{ + +enum SegmentationMode : uint8_t { UNINITIALIZED = 0, CONTINUOUS, DISCONTINUOUS, BREAK }; + +struct PointTypeStruct +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; +}; + +enum class PointType : uint8_t { + INIT = 0, + GROUND, + NON_GROUND, + POINT_FOLLOW, + UNKNOWN, + VIRTUAL_GROUND, + OUT_OF_RANGE +}; + +struct alignas(16) ClassifiedPointType +{ + float z; + PointType type; + float radius; + size_t origin_index; // index in the original point cloud + + CUDAH ClassifiedPointType() : z(0.0), type(PointType::INIT), radius(-1.0), origin_index(0) {} +}; + +struct alignas(16) Cell +{ + float gnd_radius_avg; + float gnd_height_avg; + float gnd_height_min; + uint32_t num_ground_points; + // initialize constructor + + CUDAH Cell() + : gnd_radius_avg(0.0f), gnd_height_avg(0.0f), gnd_height_min(0.0f), num_ground_points(0) + { + } +}; + +// structure to hold parameter values +struct FilterParameters +{ + float max_x; + float min_x; + float max_y; + float min_y; + float max_z; + float min_z; + float max_radius; + float non_ground_height_threshold; + // common thresholds + float global_slope_max_angle_rad; // radians + float local_slope_max_angle_rad; // radians + float global_slope_max_ratio; + float local_slope_max_ratio; + // common parameters + float sector_angle_rad; // radial sector angle in radians + float inv_sector_angle_rad; + + // cell mode parameters + float recheck_start_distance; // distance to start rechecking ground cluster + float detection_range_z_max; + // cell parameters + float cell_divider_size_m; + float center_x{0.0f}; + float center_y{0.0f}; + uint32_t max_num_cells; + uint32_t max_num_cells_per_sector; // number of cells per sector + uint32_t num_sectors; // number of radial sectors + uint32_t gnd_grid_continual_thresh{3}; // threshold for continual ground grid + uint32_t use_recheck_ground_cluster; // to enable recheck ground cluster + uint32_t gnd_cell_buffer_size{5}; +}; + +class CudaScanGroundSegmentationFilter +{ +public: + explicit CudaScanGroundSegmentationFilter( + const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte); + ~CudaScanGroundSegmentationFilter() = default; + + // Method to process the point cloud data and filter ground points + void classifyPointCloud( + const cuda_blackboard::CudaPointCloud2 & input_points, + cuda_blackboard::CudaPointCloud2 & ground, cuda_blackboard::CudaPointCloud2 & non_ground); + +private: + std::unique_ptr dev_input_points_, non_ground_, ground_; + std::unique_ptr> empty_cell_mark_; + + uint32_t number_input_points_; + uint32_t num_process_points_host_; + uint32_t input_pointcloud_step_; + uint32_t input_xyzi_offset_[4]; + float center_x_{0.0f}; + float center_y_{0.0f}; + const uint32_t gnd_grid_continual_thresh_{3}; // threshold for continual ground grid + const uint32_t continual_gnd_grid_thresh_{5}; // threshold for continual ground grid with recheck + // Parameters + FilterParameters filter_parameters_; + +private: + // Remove points too far from the origin + void removeOutliers(const 
cuda_blackboard::CudaPointCloud2 & input); + + /* + * This function calculates the cell_id for each point and + * assigns each point, with an initialized class, to temporary memory for classification. + * The memory size of each cell depends on the predefined number of points per cell. + * + */ + + void scanPerSectorGroundReference( + device_vector & cell_list, device_vector & starting_pid, + device_vector & classified_points); + + /* + * Extract obstacle points from classified_points_dev into the output point cloud + */ + template + void extractPoints( + device_vector & classified_points, + const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output); + + // Distribute points to cells + void sort_points( + device_vector & cell_list, device_vector & starting_pid, + device_vector & classified_points); + + std::shared_ptr stream_; + std::shared_ptr mempool_; +}; + +} // namespace autoware::cuda_ground_segmentation + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp new file mode 100644 index 0000000000000..f5e82b0ef7125 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp @@ -0,0 +1,64 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
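For orientation, here is a hedged sketch of how the CudaScanGroundSegmentationFilter interface declared above is presumably driven: populate FilterParameters (the derived fields, e.g. the tangent-based slope ratios and the sector count, are computed here under the assumption that the node derives them the same way from the .param.yaml values), construct the filter with a memory-pool budget, and call classifyPointCloud. The literal values, the pool size, and the cuda_blackboard header path are illustrative assumptions.

#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp"

#include <cuda_blackboard/cuda_pointcloud2.hpp>  // assumed header path

#include <cmath>
#include <cstdint>

void run_filter_example(const cuda_blackboard::CudaPointCloud2 & input)
{
  using autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilter;
  using autoware::cuda_ground_segmentation::FilterParameters;

  FilterParameters p{};
  p.global_slope_max_angle_rad = 10.0f * static_cast<float>(M_PI) / 180.0f;
  p.local_slope_max_angle_rad = 25.0f * static_cast<float>(M_PI) / 180.0f;
  p.global_slope_max_ratio = std::tan(p.global_slope_max_angle_rad);  // assumed derivation
  p.local_slope_max_ratio = std::tan(p.local_slope_max_angle_rad);    // assumed derivation
  p.sector_angle_rad = 1.0f * static_cast<float>(M_PI) / 180.0f;
  p.inv_sector_angle_rad = 1.0f / p.sector_angle_rad;
  p.num_sectors = 360;           // assumed: 2*pi / sector_angle_rad
  p.cell_divider_size_m = 0.5f;  // grid_size_m from the param file
  p.non_ground_height_threshold = 0.2f;
  p.detection_range_z_max = 3.2f;
  p.min_x = -100.0f; p.max_x = 150.0f;
  p.min_y = -70.0f;  p.max_y = 70.0f;
  p.min_z = -2.5f;   p.max_z = 2.5f;
  // ... remaining fields (max_radius, max_num_cells, recheck options, ...) as the node sets them

  // 512 MiB memory-pool budget, purely illustrative.
  CudaScanGroundSegmentationFilter filter(p, static_cast<int64_t>(512) * 1024 * 1024);

  cuda_blackboard::CudaPointCloud2 ground;
  cuda_blackboard::CudaPointCloud2 non_ground;
  filter.classifyPointCloud(input, ground, non_ground);
}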
+ +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_ + +#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp" + +#include +#include +#include +#include +#include + +// Autoware utils +#include +#include +#include +#include + +namespace autoware::cuda_ground_segmentation +{ +class CudaScanGroundSegmentationFilterNode : public rclcpp::Node +{ +public: + explicit CudaScanGroundSegmentationFilterNode(const rclcpp::NodeOptions & options); + +private: + void cudaPointCloudCallback(const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg); + // Cuda Ground Segmentation Filter + std::unique_ptr cuda_ground_segmentation_filter_{}; + // debugger + std::unique_ptr> stop_watch_ptr_{nullptr}; + std::unique_ptr debug_publisher_ptr_{nullptr}; + + // rclcpp::Subscription::SharedPtr sub_; + // rclcpp::Publisher::SharedPtr pub_; + // rclcpp::Publisher::SharedPtr pub_gnd_; + + // Cuda Sub + std::shared_ptr> + sub_{}; + // Cuda Pub + std::unique_ptr> + pub_{}; + + std::unique_ptr> + pub_gnd_{}; +}; + +} // namespace autoware::cuda_ground_segmentation + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp new file mode 100644 index 0000000000000..e33dd49f36eb6 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp @@ -0,0 +1,72 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_ + +#include + +#include + +#include + +namespace autoware +{ + +// A wrapper on the cudaStream_t, with a destructor to +class CudaStream +{ +public: + inline CudaStream(bool is_null = false) + { + // If is_null is true, we use the default stream + // Otherwise, we create a new stream + if (!is_null) { + CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); + } + } + + CudaStream(const CudaStream &) = delete; + CudaStream(CudaStream && other) : stream_(std::move(other.stream_)) { other.stream_ = nullptr; } + + CudaStream & operator=(const CudaStream &) = delete; + inline CudaStream & operator=(CudaStream && other) + { + if (this != &other) { + release(); + stream_ = std::move(other.stream_); + other.stream_ = nullptr; + } + + return *this; + } + + inline cudaStream_t & get() { return stream_; } + + ~CudaStream() { release(); } + +private: + inline void release() + { + if (stream_) { + CHECK_CUDA_ERROR(cudaStreamDestroy(stream_)); + } + } + + cudaStream_t stream_{nullptr}; +}; + +} // namespace autoware + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh new file mode 100644 index 0000000000000..db560b87b92d9 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh @@ -0,0 +1,147 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_ + +#include "cuda_common.hpp" +#include "cuda_mempool_wrapper.hpp" +#include "cuda_stream_wrapper.hpp" +#include "device_vector.hpp" + +#include + +#include + +namespace autoware::cuda +{ + +template +inline cudaError_t launchAsync( + int thread_num, int shared_size, cudaStream_t & stream, void (*f)(Args...), Args... args) +{ + int block_x = (thread_num > block_size) ? 
block_size : thread_num; + + if (block_x <= 0) { + return cudaErrorLaunchFailure; + } + + int grid_x = (thread_num + block_x - 1) / block_x; + + f<<>>(args...); + + return cudaGetLastError(); +} + +template +cudaError_t ExclusiveScan( + T * input, T * output, int ele_num, std::shared_ptr stream, + std::shared_ptr mempool) +{ + if (ele_num == 0) { + return cudaSuccess; + } + + if (ele_num < 0 || !stream) { + return cudaErrorInvalidValue; + } + + device_vector d_temp_storage(stream, mempool); + size_t temp_storage_bytes = 0; + + cub::DeviceScan::ExclusiveSum( + (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get()); + + int temp_ele_num = (temp_storage_bytes + sizeof(int) - 1) / sizeof(int); + d_temp_storage.resize(temp_ele_num); + + cub::DeviceScan::ExclusiveSum( + (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get()); + + return cudaGetLastError(); +} + +template +cudaError_t ExclusiveScan(device_vector & input, device_vector & output) +{ + if (input.empty()) { + return cudaSuccess; + } + + output.resize(input.size()); + + return ExclusiveScan( + input.data(), output.data(), (int)(input.size()), input.get_stream(), input.get_mempool()); +} + +template +cudaError_t ExclusiveScan(device_vector & input) +{ + return ExclusiveScan(input, input); +} + +template +__global__ void fillVector(T * vec, int ele_num, T init_val) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < ele_num; i += stride) { + vec[i] = init_val; + } +} + +template +cudaError_t fill(T * input, int ele_num, T val, std::shared_ptr stream) +{ + if (ele_num == 0) { + return cudaSuccess; + } + + if (ele_num < 0 || !stream) { + return cudaErrorInvalidValue; + } + + return launchAsync(ele_num, 0, stream->get(), fillVector, input, ele_num, val); +} + +template +cudaError_t fill(device_vector & input, T val) +{ + return fill(input.data(), (int)(input.size()), val, input.get_stream()); +} + +CUDAH void memcpy(uint8_t * dst, const uint8_t * src, int size) +{ + for (int i = 0; i < size; ++i) { + dst[i] = src[i]; + } +} + +inline void copyPointCloud2Metadata( + cuda_blackboard::CudaPointCloud2 & dst, const cuda_blackboard::CudaPointCloud2 & src) +{ + dst.header = src.header; + dst.height = src.height; + dst.width = src.width; + dst.fields = src.fields; + dst.is_bigendian = src.is_bigendian; + dst.point_step = src.point_step; + dst.row_step = src.row_step; + dst.is_dense = src.is_dense; +} + +} // namespace autoware::cuda + +#endif diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp new file mode 100644 index 0000000000000..20928e4c84321 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp @@ -0,0 +1,332 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
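Since the utilities above (fill, ExclusiveScan) operate on the device_vector container introduced in the next header, a small hedged sketch of how they compose is given here: an exclusive prefix sum over per-cell point counts yields the start offset of each cell, which is how starting_pid is presumably produced in this filter. The counts and sizes are illustrative, and the snippet assumes compilation with nvcc because the utilities live in a .cuh header.

#include <cuda_runtime.h>

#include <memory>
#include <vector>

// Assumed available from this package's headers:
//   autoware::device_vector, autoware::CudaStream, autoware::CudaMempool,
//   autoware::cuda::fill, autoware::cuda::ExclusiveScan, CHECK_CUDA_ERROR

void prefix_sum_example()
{
  auto stream = std::make_shared<autoware::CudaStream>();
  auto mempool = std::make_shared<autoware::CudaMempool>();

  // Pretend each of 8 cells holds exactly 3 points.
  autoware::device_vector<int> counts(8, stream, mempool);
  autoware::cuda::fill(counts, 3);

  // In-place exclusive prefix sum: counts becomes {0, 3, 6, 9, 12, 15, 18, 21},
  // i.e. the offset at which each cell's points start after the scatter step.
  autoware::cuda::ExclusiveScan(counts);

  std::vector<int> host;
  counts.to_vector(host);
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream->get()));
}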
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_ +#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_ + +#include "cuda_mempool_wrapper.hpp" +#include "cuda_stream_wrapper.hpp" + +#include + +#include + +#include +#include +#include +#include + +namespace autoware +{ + +template +class device_vector +{ +public: + device_vector( + std::shared_ptr stream = std::make_shared(true), + std::shared_ptr mem_pool = nullptr); + + device_vector( + const device_vector & other, + std::shared_ptr stream = std::make_shared(true), + std::shared_ptr mem_pool = nullptr, + bool copy_all = false // If true, use the stream and mem_pool of the other + ); + + device_vector(device_vector && other); + + device_vector( + size_t ele_num, std::shared_ptr stream = std::make_shared(true), + std::shared_ptr mem_pool = nullptr); + + device_vector( + const std::vector & other, + std::shared_ptr stream = std::make_shared(true), + std::shared_ptr mem_pool = nullptr); + + device_vector & operator=(const device_vector & other); + device_vector & operator=(device_vector && other); + device_vector & operator=(const std::vector & other); + + void to_vector(std::vector & output) const + { + output.resize(ele_num_); + copyDtoH(output.data(), data_, ele_num_); + } + + // Slow, do not use unless necessary + T operator[](int idx) const + { + if (idx < 0 || idx >= ele_num_) { + throw std::invalid_argument("Error: out-of-bound access at index " + std::to_string(idx)); + } + T val; + + copyDtoH(&val, data_ + idx, 1); + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_->get())); + + return val; + } + + T * data() { return data_; } + const T * data() const { return data_; } + size_t size() const { return ele_num_; } + + void resize(size_t new_size); + void reserve(size_t new_size); + bool empty() const { return (ele_num_ == 0); } + void clear(); + + std::shared_ptr get_stream() { return stream_; } + std::shared_ptr get_mempool() { return mempool_; } + + ~device_vector(); + +private: + // Return false if allocation fail + void allocate(size_t ele_num); + void release(); + void copyDtoD(T * dst, const T * src, size_t ele_num); + void copyHtoD(T * dst, const T * src, size_t ele_num); + void copyDtoH(T * dst, const T * src, size_t ele_num) const; + + T * data_; + size_t ele_num_; // The assumed size + size_t actual_ele_num_; // The actual size of the vector + std::shared_ptr stream_; // If null, use the default stream + std::shared_ptr mempool_; // If null, use the default non-pool allocation +}; + +template +device_vector::device_vector( + std::shared_ptr stream, std::shared_ptr mem_pool) +: stream_(stream), mempool_(mem_pool) +{ + data_ = nullptr; + ele_num_ = actual_ele_num_ = 0; +} + +template +device_vector::device_vector( + const device_vector & other, std::shared_ptr stream, + std::shared_ptr mem_pool, bool copy_all) +: stream_(stream), mempool_(mem_pool), data_(nullptr) +{ + ele_num_ = actual_ele_num_ = 0; + + if (copy_all) { + stream_ = other.stream_; + mempool_ = other.mempool_; + } + + if (other.empty()) { + return; + } + + // Allocate memory + resize(other.size()); + + // Copy + copyDtoD(data_, other.data_, other.size()); +} + +template +device_vector::device_vector(device_vector && other) +{ + data_ = other.data_; + ele_num_ = other.ele_num_; + actual_ele_num_ = other.actual_ele_num_; + stream_ = other.stream_; + mempool_ = other.mempool_; + + other.data_ = nullptr; + other.ele_num_ = 
other.actual_ele_num_ = 0; + other.stream_.reset(); + other.mempool_.reset(); +} + +template +device_vector::device_vector( + size_t ele_num, std::shared_ptr stream, std::shared_ptr mem_pool) +: stream_(stream), mempool_(mem_pool), data_(nullptr) +{ + ele_num_ = actual_ele_num_ = 0; + + if (ele_num > 0) { + resize(ele_num); + } +} + +template +device_vector::device_vector( + const std::vector & other, std::shared_ptr stream, + std::shared_ptr mem_pool) +: stream_(stream), mempool_(mem_pool), data_(nullptr) +{ + ele_num_ = actual_ele_num_ = 0; + + if (other.empty()) { + return; + } + + allocate(other.size()); + copyHtoD(data_, other.data(), other.size()); + ele_num_ = other.size(); +} + +template +device_vector & device_vector::operator=(const device_vector & other) +{ + size_t other_size = other.size(); + + resize(other_size); + + if (other_size == 0) { + copyDtoD(data_, other.data_, other_size); + } + + return *this; +} + +template +device_vector & device_vector::operator=(device_vector && other) +{ + release(); + + data_ = other.data_; + ele_num_ = other.ele_num_; + actual_ele_num_ = other.actual_ele_num_; + stream_ = other.stream_; + mempool_ = other.mempool_; + + // TODO: is stream_ = other.stream_ necessary? + other.data_ = nullptr; + other.ele_num_ = other.actual_ele_num_ = 0; + other.mempool_.reset(); + other.stream_.reset(); + + return *this; +} + +template +device_vector & device_vector::operator=(const std::vector & other) +{ + release(); + + if (other.empty()) { + return *this; + } + + copyHtoD(data_, other.data(), sizeof(T) * other.size()); + + return *this; +} + +template +void device_vector::resize(size_t new_ele_num) +{ + // If no significant change, just change the number of elements + if (new_ele_num >= actual_ele_num_ * 0.8 && new_ele_num <= actual_ele_num_) { + ele_num_ = new_ele_num; + return; + } + + // Otherwise, reallocate the data + release(); + allocate(new_ele_num); + ele_num_ = new_ele_num; +} + +template +void device_vector::reserve(size_t new_ele_num) +{ + if (new_ele_num <= actual_ele_num_ * 0.8) { + return; + } + + allocate(new_ele_num); +} + +template +void device_vector::clear() +{ + // Release the memory only + release(); +} + +template +device_vector::~device_vector() +{ + // Release the memory + release(); + // Release the stream and memory pool + stream_.reset(); + mempool_.reset(); +} + +template +void device_vector::allocate(size_t ele_num) +{ + if (ele_num > 0) { + if (mempool_) { + CHECK_CUDA_ERROR( + cudaMallocFromPoolAsync(&data_, sizeof(T) * ele_num, mempool_->get(), stream_->get())); + } else { + CHECK_CUDA_ERROR(cudaMallocAsync(&data_, sizeof(T) * ele_num, stream_->get())); + } + } + + actual_ele_num_ = ele_num; + // Currently no element yet + ele_num_ = 0; +} + +template +void device_vector::release() +{ + if (data_) { + CHECK_CUDA_ERROR(cudaFreeAsync(data_, stream_->get())); + } + + data_ = nullptr; + ele_num_ = 0; + actual_ele_num_ = 0; +} + +template +void device_vector::copyDtoD(T * dst, const T * src, size_t ele_num) +{ + CHECK_CUDA_ERROR( + cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToDevice, stream_->get())); +} + +template +void device_vector::copyHtoD(T * dst, const T * src, size_t ele_num) +{ + CHECK_CUDA_ERROR( + cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyHostToDevice, stream_->get())); +} + +template +void device_vector::copyDtoH(T * dst, const T * src, size_t ele_num) const +{ + CHECK_CUDA_ERROR( + cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToHost, stream_->get())); +} + +} // 
namespace autoware + +#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_ diff --git a/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py new file mode 100644 index 0000000000000..688b1743806fc --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py @@ -0,0 +1,94 @@ +# Copyright 2021 Tier IV, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import launch +from launch.actions import DeclareLaunchArgument +from launch.actions import GroupAction +from launch.actions import OpaqueFunction +from launch.conditions import LaunchConfigurationEquals +from launch.conditions import LaunchConfigurationNotEquals +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import ComposableNodeContainer +from launch_ros.actions import LoadComposableNodes +from launch_ros.descriptions import ComposableNode +from launch_ros.parameter_descriptions import ParameterFile +from launch_ros.substitutions import FindPackageShare + + +def launch_setup(context, *args, **kwargs): + + ground_segmentation_node_param = ParameterFile( + param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(context), + allow_substs=True, + ) + + nodes = [ + ComposableNode( + package="autoware_ground_segmentation_cuda", + plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode", + name="cuda_scan_ground_segmentation_filter", + remappings=[ + ("~/input/pointcloud", LaunchConfiguration("input/pointcloud")), + ("~/output/pointcloud", LaunchConfiguration("output/pointcloud")), + ], + parameters=[ground_segmentation_node_param], + extra_arguments=[], + ), + ] + + loader = LoadComposableNodes( + condition=LaunchConfigurationNotEquals("container", ""), + composable_node_descriptions=nodes, + target_container=LaunchConfiguration("container"), + ) + + container = ComposableNodeContainer( + name="scan_ground_filter_container", + namespace="", + package="rclcpp_components", + executable="component_container", + composable_node_descriptions=nodes, + output="screen", + condition=LaunchConfigurationEquals("container", ""), + ) + + group = GroupAction( + [ + container, + loader, + ] + ) + + return [group] + + +def generate_launch_description(): + def add_launch_arg(name: str, default_value=None): + return DeclareLaunchArgument(name, default_value=default_value) + + return launch.LaunchDescription( + [ + add_launch_arg("container", ""), + add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud"), + add_launch_arg("output/pointcloud", "/perception/obstacle_segmentation/pointcloud"), + add_launch_arg( + "cuda_ground_segmentation_node_param_path", + [ + FindPackageShare("autoware_ground_segmentation_cuda"), + "/config/cuda_scan_ground_segmentation_filter.param.yaml", + ], + ), + ] + + [OpaqueFunction(function=launch_setup)] + ) diff --git 
a/perception/autoware_ground_segmentation_cuda/package.xml b/perception/autoware_ground_segmentation_cuda/package.xml new file mode 100644 index 0000000000000..38d7e4f6931de --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/package.xml @@ -0,0 +1,39 @@ + + + + autoware_ground_segmentation_cuda + 0.0.0 + GPU-Accelerated ground segmentation + badai-nguyen + Anh Nguyen + badai-nguyen + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + + agnocastlib + autoware_agnocast_wrapper + autoware_cuda_dependency_meta + autoware_cuda_pointcloud_preprocessor + autoware_cuda_utils + autoware_point_types + autoware_pointcloud_preprocessor + autoware_utils + cuda_blackboard + diagnostic_msgs + rclcpp + rclcpp_components + sensor_msgs + tf2 + tf2_msgs + tier4_debug_msgs + + ament_clang_format + ament_lint_auto + ament_lint_common + + + ament_cmake + + diff --git a/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json new file mode 100644 index 0000000000000..f95891773c172 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json @@ -0,0 +1,126 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Cuda Scan Ground Segmentation Filter Node Params", + "type": "object", + "definitions": { + "cuda_scan_ground_segmentation_filter": { + "type": "object", + "properties": { + "global_slope_max_angle_deg": { + "type": "number", + "description": "Maximum global ground slope angle [deg]. Larger values reduce false negatives on steep roads but can increase false positives (classifying objects as ground).", + "default": 10.0 + }, + "local_slope_max_angle_deg": { + "type": "number", + "description": "Maximum local slope angle [deg] between adjacent points (used for fine discrimination of small objects or ramps).", + "default": 13.0 + }, + "non_ground_height_threshold": { + "type": "number", + "description": "Absolute vertical distance threshold [m] above estimated ground line to classify a point as non-ground.", + "default": 0.2 + }, + "grid_size_m": { + "type": "number", + "description": "Radial cell size in polar grid [m]; larger values smooth ground estimation but may miss small objects.", + "default": 0.1 + }, + "gnd_cell_buffer_size": { + "type": "integer", + "description": "Number of previous ground cells used to estimate local ground slope.", + "default": 4 + }, + "detection_range_z_max": { + "type": "number", + "description": "Maximum Z height (above base_link) considered in classification [m].", + "default": 2.5 + }, + "center_pcl_shift": { + "type": "number", + "description": "X-axis offset [m] applied to additional LiDAR point clouds relative to vehicle reference (positive forward).", + "default": 0.0 + }, + "sector_angle_deg": { + "type": "number", + "description": "Angular width of each polar sector [deg].", + "default": 1.0 + }, + "use_recheck_ground_cluster": { + "type": "boolean", + "description": "Enable secondary pass to re-evaluate ground clusters (may reduce false ground on elevated points).", + "default": true + }, + "recheck_start_distance": { + "type": "number", + "description": "Minimum radial distance [m] at which ground cluster recheck logic begins.", + "default": 20.0, + "minimum": 0.0 + }, + "use_lowest_point": { + "type": "boolean", + "description": "Use lowest point in cell as reference during ground recheck (otherwise median-like 
selection).", + "default": true + }, + "min_x": { + "type": "number", + "description": "Minimum X range relative to base_link [m] included in processing.", + "default": -100.0 + }, + "max_x": { + "type": "number", + "description": "Maximum X (forward) range relative to base_link [m] included in processing.", + "default": 150.0 + }, + "min_y": { + "type": "number", + "description": "Minimum Y range relative to base_link [m] included in processing.", + "default": -70.0 + }, + "max_y": { + "type": "number", + "description": "Maximum Y range relative to base_link [m] included in processing.", + "default": 70.0 + }, + "min_z": { + "type": "number", + "description": "Minimum Z range relative to base_link [m] included in processing.", + "default": -2.5 + }, + "max_z": { + "type": "number", + "description": "Maximum Z range relative to base_link [m] included in processing.", + "default": 2.5 + } + }, + "required": [ + "global_slope_max_angle_deg", + "local_slope_max_angle_deg", + "non_ground_height_threshold", + "grid_size_m", + "detection_range_z_max", + "use_recheck_ground_cluster", + "recheck_start_distance", + "use_lowest_point", + "center_pcl_shift", + "sector_angle_deg", + "gnd_cell_buffer_size" + ], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/cuda_scan_ground_segmentation_filter" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu new file mode 100644 index 0000000000000..4c09415dab4e2 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu @@ -0,0 +1,819 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp" + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace autoware::cuda_ground_segmentation +{ + +__device__ __forceinline__ float fastAtan2_0_2Pi(float y, float x) +{ + return fmodf(atan2(y, x) + 2.0f * M_PI, 2.0f * M_PI); +} + +// Initialize the cell list +__global__ void cellInit(Cell * __restrict__ cell_list, int max_num_cells) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + Cell new_cell; + + for (int i = index; i < max_num_cells; i += stride) { + new_cell.gnd_height_min = 1e6f; + + cell_list[i] = new_cell; + } +} + +/** + * @brief CUDA kernel to count the number of points in each grid cell for ground segmentation. 
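 *
 * Worked example (illustrative; assuming sector_angle_deg = 1.0, i.e.
 * inv_sector_angle_rad ~= 57.2958 and num_sectors = 360, and
 * cell_divider_size_m = grid_size_m = 0.5): a point at (dx, dy) = (10.0, 5.0) m
 * from the grid center gives
 *   radius            = hypotf(10, 5)          ~= 11.18 m
 *   angle             = fastAtan2_0_2Pi(5, 10) ~= 0.4636 rad
 *   sector_id         = int(0.4636 * 57.2958)   = 26
 *   cell_id_in_sector = int(11.18 / 0.5)        = 22
 *   point_cell_id     = 22 * 360 + 26           = 7946
 * so count[7946] is incremented atomically and the point is stored with
 * radius ~= 11.18 for later per-sector classification.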
+ * + * This kernel processes each input point, computes its polar coordinates (radius and angle) + * relative to a specified center, and determines which cell in a polar grid the point belongs to. + * It then atomically increments the point count for the corresponding cell. + * + * @param input_points Pointer to the array of input points (device memory). + * @param num_input_points Number of input points in the array. + * @param filter_parameters_dev Pointer to filter parameters structure (device memory), containing + * grid and segmentation configuration such as center coordinates, sector angle, and cell + * size. + * @param centroid_cells_list_dev Pointer to the array of cell centroid structures (device memory), + * where each cell maintains a count of points assigned to it. + * + * @note Each thread processes one point. Atomic operations are used to safely increment the + * point count in each cell when multiple threads access the same cell concurrently. + * @note Points that fall outside the defined grid (cell_id >= max_num_cells) are ignored. + */ +__global__ void computeCellId( + const PointTypeStruct * __restrict__ input_points, int point_num, FilterParameters param, + int * __restrict__ cell_id, int * __restrict__ count, + ClassifiedPointType * __restrict__ classified_points) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + ClassifiedPointType cp; + + cp.type = PointType::INIT; + + for (int i = index; i < point_num; i += stride) { + auto p = input_points[i]; + float dx = p.x - param.center_x; + float dy = p.y - param.center_y; + float radius = hypotf(dx, dy); + float angle = fastAtan2_0_2Pi(dy, dx); + int sector_id = static_cast(angle * param.inv_sector_angle_rad); + int cell_id_in_sector = static_cast(radius / param.cell_divider_size_m); + int point_cell_id = cell_id_in_sector * param.num_sectors + sector_id; + + // Save this so we don't have to recalculate later + cell_id[i] = point_cell_id; + + // ALso, initialize classified points + cp.z = p.z; + cp.radius = radius; + cp.origin_index = i; + + classified_points[i] = cp; + + // Also update the number of points in each cell atomically + atomicAdd(count + point_cell_id, 1); + } +} + +__global__ void distributePointsToCell( + ClassifiedPointType * input, ClassifiedPointType * output, int * cell_id, int point_num, + int * writing_loc) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < point_num; i += stride) { + int cid = cell_id[i]; + int wloc = atomicAdd(writing_loc + cid, 1); + + output[wloc] = input[i]; + } +} + +CudaScanGroundSegmentationFilter::CudaScanGroundSegmentationFilter( + const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte) +: filter_parameters_(filter_parameters) +{ + stream_.reset(new CudaStream()); + + int current_device_id = 0; + + CHECK_CUDA_ERROR(cudaGetDevice(¤t_device_id)); + mempool_.reset(new CudaMempool(current_device_id, max_mem_pool_size_in_byte)); + + // Pre-allocate the PointCloud2 objects + // The CudaPointCloud2 still use sync cudaMalloc to allocate memory + // Those calls are costly, so it should be done only once. 
+ dev_input_points_.reset(new cuda_blackboard::CudaPointCloud2); + + // Warm-up the memory pool a bit + empty_cell_mark_.reset(new device_vector(100000, stream_, mempool_)); +} + +__forceinline__ __device__ SegmentationMode checkSegmentationMode( + int cell_id, int closest_gnd_cell_id, int furthest_gnd_cell_id, int gnd_cell_buffer_size, + int gnd_cell_continual_threshold, int empty_cell_num) +{ + if (closest_gnd_cell_id < 0) { + return SegmentationMode::UNINITIALIZED; + } + + if (cell_id - closest_gnd_cell_id >= gnd_cell_continual_threshold) { + return SegmentationMode::BREAK; + } + + if (cell_id - furthest_gnd_cell_id - empty_cell_num <= gnd_cell_buffer_size) { + return SegmentationMode::CONTINUOUS; + } + + return SegmentationMode::DISCONTINUOUS; +} + +__forceinline__ __device__ void segmentUninitializedPoint( + ClassifiedPointType & p, const FilterParameters & param) +{ + if (p.z > param.detection_range_z_max || p.z < -param.non_ground_height_threshold) { + p.type = PointType::OUT_OF_RANGE; + return; + } + + float global_height_th = p.radius * param.global_slope_max_ratio; + + if (p.z > global_height_th && p.z > param.non_ground_height_threshold) { + p.type = PointType::NON_GROUND; + return; + } + + float abs_pz = abs(p.z); + + if (abs_pz < global_height_th && abs_pz < param.non_ground_height_threshold) { + p.type = PointType::GROUND; + } +} + +__forceinline__ __device__ void segmentContinuousPoint( + ClassifiedPointType & p, const FilterParameters & param, float slope, + float prev_cell_gnd_height_avg, float prev_cell_gnd_radius_avg) +{ + if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) { + p.type = PointType::OUT_OF_RANGE; + return; + } + + float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m; + float dz = p.z - prev_cell_gnd_height_avg; + + if (p.z > param.global_slope_max_ratio * p.radius) { + p.type = PointType::NON_GROUND; + return; + } + + if (dz > param.local_slope_max_ratio * d_radius) { + p.type = PointType::NON_GROUND; + return; + } + + float estimated_ground_z = prev_cell_gnd_height_avg + slope * param.cell_divider_size_m; + + if (p.z > estimated_ground_z + param.non_ground_height_threshold) { + p.type = PointType::NON_GROUND; + return; + } + + if ( + p.z < estimated_ground_z - + param.non_ground_height_threshold || // p.z < estimated_ground_z - + // param.non_ground_height_threshold + dz < -param.local_slope_max_ratio * d_radius || + p.z < -param.global_slope_max_ratio * p.radius) { + p.type = PointType::OUT_OF_RANGE; + + return; + } + + p.type = PointType::GROUND; +} + +__forceinline__ __device__ void segmentDiscontinuousPoint( + ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg, + float prev_cell_gnd_radius_avg) +{ + if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) { + p.type = PointType::OUT_OF_RANGE; + return; + } + + float dz = p.z - prev_cell_gnd_height_avg; + // float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m; + float d_radius = p.radius - prev_cell_gnd_radius_avg; + float global_height_threshold = p.radius * param.global_slope_max_ratio; + float local_height_threshold = param.local_slope_max_ratio * d_radius; + + if (p.z > global_height_threshold || dz > local_height_threshold) { + p.type = PointType::NON_GROUND; + return; + } + + if (dz < -local_height_threshold || p.z < -global_height_threshold) { + p.type = PointType::OUT_OF_RANGE; + return; + } + + p.type = PointType::GROUND; +} + +__forceinline__ __device__ void segmentBreakPoint( 
+ ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg, + float prev_cell_gnd_radius_avg) +{ + if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) { + p.type = PointType::OUT_OF_RANGE; + return; + } + + float dz = p.z - prev_cell_gnd_height_avg; + float d_radius = p.radius - prev_cell_gnd_radius_avg; + float global_height_threshold = d_radius * param.global_slope_max_ratio; + + p.type = (dz > global_height_threshold) + ? PointType::NON_GROUND + : ((dz < -global_height_threshold) ? PointType::OUT_OF_RANGE : PointType::GROUND); +} + +__forceinline__ __device__ void segmentCell( + const int wid, // Index of the thread in the warp + ClassifiedPointType * classified_points, const FilterParameters & param, Cell & cell, + float slope, // Slope of the line connect the previous ground cells + int start_pid, int end_pid, // Start and end indices of points in the current cell + float prev_cell_gnd_radius_avg, float prev_cell_gnd_height_avg, const SegmentationMode & mode) +{ + if (start_pid >= end_pid) { + return; + } + + // For computing the cell statistic + float t_gnd_radius_sum = 0; + float t_gnd_height_sum = 0; + float t_gnd_height_min = FLT_MAX; + int t_gnd_point_num = 0; + // For recheck + float minus_t_gnd_radius_sum = 0; + float minus_t_gnd_height_sum = 0; + float minus_t_gnd_point_num = 0; + ClassifiedPointType last_gnd_point, p; + int last_gnd_idx; + int local_gnd_point_num; + + // Fit line from the recent ground cells + for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) { + p = classified_points[j]; + + switch (mode) { + case (SegmentationMode::UNINITIALIZED): { + segmentUninitializedPoint(p, param); + break; + } + case (SegmentationMode::CONTINUOUS): { + segmentContinuousPoint(p, param, slope, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg); + break; + } + case (SegmentationMode::DISCONTINUOUS): { + segmentDiscontinuousPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg); + break; + } + case (SegmentationMode::BREAK): { + segmentBreakPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg); + break; + } + } + + if (p.type == PointType::GROUND) { + t_gnd_radius_sum += p.radius; + t_gnd_height_sum += p.z; + ++t_gnd_point_num; + t_gnd_height_min = (t_gnd_height_min > p.z) ? 
p.z : t_gnd_height_min; + } + + classified_points[j] = p; + } + + // Wait for all threads in the warp to finish + local_gnd_point_num = t_gnd_point_num; + __syncwarp(); + + // Find the min height and the number of ground points first + + // Use reduction to compute the cell's stat + for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) { + t_gnd_height_sum += __shfl_down_sync(FULL_MASK, t_gnd_height_sum, offset); + t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, t_gnd_radius_sum, offset); + t_gnd_point_num += __shfl_down_sync(FULL_MASK, t_gnd_point_num, offset); + + float other_height_min = __shfl_down_sync(FULL_MASK, t_gnd_height_min, offset); + t_gnd_height_min = min(t_gnd_height_min, other_height_min); + } + + // Now broadcast the min height and the number of ground points to all threads + float cell_gnd_height_min = __shfl_sync(FULL_MASK, t_gnd_height_min, 0); + int cell_gnd_point_num = __shfl_sync(FULL_MASK, t_gnd_point_num, 0); + float cell_gnd_radius_sum = __shfl_sync(FULL_MASK, t_gnd_radius_sum, 0); + + if ( + param.use_recheck_ground_cluster && cell_gnd_point_num > 1 && + cell_gnd_radius_sum / (float)(cell_gnd_point_num) > param.recheck_start_distance) { + // Now recheck the points using the height_min + for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) { + auto p = classified_points[j]; + + if ( + p.type == PointType::GROUND && + p.z > cell_gnd_height_min + param.non_ground_height_threshold && cell_gnd_point_num > 1) { + last_gnd_point = p; + minus_t_gnd_height_sum += p.z; + minus_t_gnd_radius_sum += p.radius; + ++minus_t_gnd_point_num; + p.type = PointType::NON_GROUND; + last_gnd_idx = j; + } + + classified_points[j] = p; + } + + __syncwarp(); + + // Now use ballot sync to see if there are any ground points remaining + uint32_t recheck_res = __ballot_sync(FULL_MASK, local_gnd_point_num > minus_t_gnd_point_num); + uint32_t backup_res = __ballot_sync(FULL_MASK, local_gnd_point_num > 0); + + // If no ground point remains, we have to keep the last ground point + if (recheck_res == 0 && backup_res > 0) { + // Get the index of the last thread that detects ground point + int last_tid = 32 - __ffs(backup_res); + + // Only the last point in that thread remain + if (wid == last_tid) { + // Keep the last point's stat + minus_t_gnd_height_sum -= last_gnd_point.z; + minus_t_gnd_radius_sum -= last_gnd_point.radius; + --minus_t_gnd_point_num; + classified_points[last_gnd_idx] = last_gnd_point; + } + } + __syncwarp(); + + // Final update + for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) { + minus_t_gnd_height_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_height_sum, offset); + minus_t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_radius_sum, offset); + minus_t_gnd_point_num += __shfl_down_sync(FULL_MASK, minus_t_gnd_point_num, offset); + } + + if (wid == 0) { + t_gnd_height_sum -= minus_t_gnd_height_sum; + t_gnd_radius_sum -= minus_t_gnd_radius_sum; + t_gnd_point_num -= minus_t_gnd_point_num; + } + } + + // Finally, thread 0 update the cell stat + if (wid == 0 && t_gnd_point_num > 0) { + cell.gnd_radius_avg = t_gnd_radius_sum / (float)(t_gnd_point_num); + cell.gnd_height_avg = t_gnd_height_sum / (float)(t_gnd_point_num); + cell.gnd_height_min = t_gnd_height_min; + cell.num_ground_points = t_gnd_point_num; + } + + __syncwarp(); + cell.num_ground_points = __shfl_sync(FULL_MASK, cell.num_ground_points, 0); +} + +__global__ void sectorProcessingKernel( + Cell * __restrict__ cell_list, ClassifiedPointType * __restrict__ classified_points, + int * starting_pid, 
FilterParameters param, int * empty_cell_mark) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + // Each warp handles one sector + int sector_stride = (blockDim.x * gridDim.x) / WARP_SIZE; + int wid = index % WARP_SIZE; // Index of the thread within the warp + // Shared memory to backup the cell data + extern __shared__ Cell shared_buffer[]; + Cell * cell_queue = shared_buffer + (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size; + int * cell_id_queue = + (int *)(shared_buffer + param.gnd_cell_buffer_size * (blockDim.x / WARP_SIZE)) + + (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size; + + // Loop on sectors + for (int sector_id = index / WARP_SIZE; sector_id < param.num_sectors; + sector_id += sector_stride) { + // For storing the previous ground cells + int closest_gnd_cell_id, furthest_gnd_cell_id, num_latest_gnd_cells = 0; + int head = 0, tail = 0; + float sum_x, sum_y, sum_xx, sum_xy, slope; + float prev_gnd_radius_avg, prev_gnd_height_avg; + int empty_cell_num = 0; + + // Initially no ground cell is identified + closest_gnd_cell_id = furthest_gnd_cell_id = -1; + sum_x = sum_y = sum_xx = sum_xy = slope = 0.0; + + // Loop on the cells in a sector + for (int i = 0; i < param.max_num_cells_per_sector; ++i) { + if (num_latest_gnd_cells > 0) { + furthest_gnd_cell_id = cell_id_queue[head]; + empty_cell_num = empty_cell_mark[i * param.num_sectors + sector_id] - + empty_cell_mark[furthest_gnd_cell_id * param.num_sectors + sector_id]; + } + + int gid = i * param.num_sectors + sector_id; + int sid = starting_pid[gid], eid = starting_pid[gid + 1]; + Cell cell; + + // Skip empty cells + if (sid >= eid) { + continue; + } + + auto mode = checkSegmentationMode( + i, closest_gnd_cell_id, furthest_gnd_cell_id, param.gnd_cell_buffer_size, + param.gnd_grid_continual_thresh, empty_cell_num); + + // Classify the points in the cell + segmentCell( + wid, classified_points, param, cell, slope, sid, eid, prev_gnd_radius_avg, + prev_gnd_height_avg, mode); + + // Update the indices of the previous ground cells if the cell contains ground points + if (cell.num_ground_points > 0) { + if (num_latest_gnd_cells >= param.gnd_cell_buffer_size) { + if (wid == 0) { + // If the number of previous ground cell reach maximum, + // remove the cell at the queue head + Cell head_cell = cell_queue[head]; + sum_x -= head_cell.gnd_radius_avg; + sum_y -= head_cell.gnd_height_avg; + sum_xx -= head_cell.gnd_radius_avg * head_cell.gnd_radius_avg; + sum_xy -= head_cell.gnd_radius_avg * head_cell.gnd_height_avg; + } + + // Now remove the entry at the head of the queue + head = (head + 1) % param.gnd_cell_buffer_size; + --num_latest_gnd_cells; + } + + ++num_latest_gnd_cells; + + if (wid == 0) { + // Add the new cell to the queue + cell_queue[tail] = cell; + cell_id_queue[tail] = i; + + // Update the stats and estimate the local slope + sum_x += cell.gnd_radius_avg; + sum_y += cell.gnd_height_avg; + sum_xx += cell.gnd_radius_avg * cell.gnd_radius_avg; + sum_xy += cell.gnd_radius_avg * cell.gnd_height_avg; + + float denom = (num_latest_gnd_cells * sum_xx - sum_x * sum_x); + + if (fabsf(denom) < 1e-6f) { + Cell head_cell = cell_queue[head]; + slope = head_cell.gnd_height_avg / head_cell.gnd_radius_avg; + } else { + slope = (num_latest_gnd_cells * sum_xy - sum_x * sum_y) / denom; + slope = fmax(fminf(slope, param.global_slope_max_ratio), -param.global_slope_max_ratio); + } + + // Write the cell to the global memory + cell_list[gid] = cell; + } + + // Wait for the thread 0 to finish its work + __syncwarp(); + // Now remove 
the cell at the end of the queue + tail = (tail + 1) % param.gnd_cell_buffer_size; + // Distribute the new slope to all threads in the warp + slope = __shfl_sync(FULL_MASK, slope, 0); + prev_gnd_radius_avg = __shfl_sync(FULL_MASK, cell.gnd_radius_avg, 0); + prev_gnd_height_avg = __shfl_sync(FULL_MASK, cell.gnd_height_avg, 0); + closest_gnd_cell_id = i; + } + } + } +} + +__global__ void markEmptyCells(int * starting_pid, int cell_num, int * mark) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < cell_num; i += stride) { + mark[i] = (starting_pid[i] >= starting_pid[i + 1]) ? 1 : 0; + } +} + +__global__ void prefixSumEmptyCells(int * mark, int sector_num, int cell_per_sector_num) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < sector_num; i += stride) { + int sum = 0; + + for (int j = 0, cell_global_id = i; j < cell_per_sector_num; + ++j, cell_global_id += sector_num) { + int val_j = mark[cell_global_id]; + + mark[cell_global_id] = sum; + sum += val_j; + } + } +} + +void CudaScanGroundSegmentationFilter::sort_points( + device_vector & cell_list, device_vector & starting_pid, + device_vector & classified_points) +{ + int point_num = dev_input_points_->height * dev_input_points_->width; + + if (point_num == 0 || filter_parameters_.max_num_cells == 0) { + return; + } + + int cell_num = filter_parameters_.max_num_cells; + + cell_list.resize(cell_num); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + (int)(cell_list.size()), 0, stream_->get(), cellInit, cell_list.data(), + (int)(cell_list.size()))); + + starting_pid.resize(cell_num + 1); + + device_vector cell_id(point_num, stream_, mempool_); + device_vector tmp_classified_points(point_num, stream_, mempool_); + + CHECK_CUDA_ERROR(cuda::fill(starting_pid, 0)); + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), computeCellId, + reinterpret_cast(dev_input_points_->data.get()), + (int)(dev_input_points_->height * dev_input_points_->width), filter_parameters_, + cell_id.data(), starting_pid.data(), tmp_classified_points.data())); + + CHECK_CUDA_ERROR(cuda::ExclusiveScan(starting_pid)); + + device_vector writing_loc(starting_pid, stream_, mempool_); + + classified_points.resize(point_num); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), distributePointsToCell, tmp_classified_points.data(), + classified_points.data(), cell_id.data(), point_num, writing_loc.data())); + + // Compute the number of empty cells between every pair of consecutive non-empty cell + empty_cell_mark_->resize(cell_num); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + cell_num, 0, stream_->get(), markEmptyCells, starting_pid.data(), cell_num, + empty_cell_mark_->data())); + + empty_cell_mark_->resize(cell_num); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + (int)(filter_parameters_.num_sectors), 0, stream_->get(), prefixSumEmptyCells, + empty_cell_mark_->data(), (int)filter_parameters_.num_sectors, + (int)filter_parameters_.max_num_cells_per_sector)); +} + +// ============ Scan per sector to get ground reference and Non-Ground points ============= +void CudaScanGroundSegmentationFilter::scanPerSectorGroundReference( + device_vector & cell_list, device_vector & starting_pid, + device_vector & classified_points) +{ + const uint32_t num_sectors = filter_parameters_.num_sectors; + if (num_sectors == 0) { + return; + } + + CHECK_CUDA_ERROR( + cuda::launchAsync( + (int)(num_sectors * WARP_SIZE), + 
(BLOCK_SIZE_X / WARP_SIZE) * (sizeof(Cell) + sizeof(int)) * + filter_parameters_.gnd_cell_buffer_size, + stream_->get(), sectorProcessingKernel, cell_list.data(), classified_points.data(), + starting_pid.data(), filter_parameters_, empty_cell_mark_->data())); +} + +struct NonGroundChecker +{ + CUDAH bool operator()(const ClassifiedPointType & p) const + { + return (p.type != PointType::GROUND); + } +}; + +struct GroundChecker +{ + CUDAH bool operator()(const ClassifiedPointType & p) const + { + return (p.type == PointType::GROUND); + } +}; + +template +__global__ void markingPoints( + ClassifiedPointType * classified_points, int point_num, int * mark, CheckerType checker) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < point_num; i += stride) { + auto p = classified_points[i]; + + mark[p.origin_index] = (checker(p)) ? 1 : 0; + } +} + +__global__ void extract( + const PointTypeStruct * __restrict__ dev_input_points, int point_num, int * writing_loc, + PointTypeStruct * __restrict__ dev_output_points) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < point_num; i += stride) { + int wloc = writing_loc[i]; + + if (wloc < writing_loc[i + 1]) { + dev_output_points[wloc] = dev_input_points[i]; + } + } +} + +// ============= Extract non-ground points ============= +template +void CudaScanGroundSegmentationFilter::extractPoints( + device_vector & classified_points, + const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output) +{ + int point_num = dev_input_points_->height * dev_input_points_->width; + device_vector point_mark(point_num + 1, stream_, mempool_); + + // Mark non-ground points + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), markingPoints, classified_points.data(), point_num, + point_mark.data(), CheckerType())); + + // Exclusive scan + device_vector writing_loc(stream_, mempool_); + + CHECK_CUDA_ERROR(cuda::ExclusiveScan(point_mark, writing_loc)); + + // Reserve the output + int output_size = writing_loc[point_num]; + + if (output_size <= 0) { + return; + } + + cuda::copyPointCloud2Metadata(output, input); + + output.data = cuda_blackboard::make_unique(output_size * output.point_step); + output.height = 1; + output.width = output_size; + + // Get the points + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), extract, + reinterpret_cast(input.data.get()), + (int)(input.height * input.width), writing_loc.data(), + reinterpret_cast(output.data.get()))); +} + +void CudaScanGroundSegmentationFilter::classifyPointCloud( + const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & ground, + cuda_blackboard::CudaPointCloud2 & non_ground) +{ + device_vector cell_list(stream_, mempool_); + device_vector starting_pid(stream_, mempool_); + device_vector classified_points(stream_, mempool_); + + removeOutliers(input); + sort_points(cell_list, starting_pid, classified_points); + scanPerSectorGroundReference(cell_list, starting_pid, classified_points); + + int point_num = dev_input_points_->height * dev_input_points_->width; + + // Extract non-ground points + extractPoints(classified_points, *dev_input_points_, non_ground); + + // Extract ground points + extractPoints(classified_points, *dev_input_points_, ground); +} + +__global__ void markNonOutliers( + const PointTypeStruct * __restrict__ cloud, int point_num, FilterParameters param, + float max_radius, 
int * __restrict__ mark) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < point_num; i += stride) { + auto p = cloud[i]; + int val = (p.x < param.min_x || p.x > param.max_x || p.y < param.min_y || p.y > param.max_y || + p.z < param.min_z || p.z > param.max_z) + ? 0 + : 1; + mark[i] = val; + } +} + +__global__ void getValidPoints( + const PointTypeStruct * __restrict__ cloud, int point_num, int * __restrict__ writing_loc, + PointTypeStruct * __restrict__ out_cloud) +{ + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < point_num; i += stride) { + int wloc = writing_loc[i]; + + if (wloc < writing_loc[i + 1]) { + out_cloud[wloc] = cloud[i]; + } + } +} + +void CudaScanGroundSegmentationFilter::removeOutliers( + const cuda_blackboard::CudaPointCloud2 & input) +{ + float max_radius = filter_parameters_.max_radius; + int point_num = input.width * input.height; + + if (point_num <= 0) { + return; + } + + device_vector mark(point_num + 1, stream_, mempool_); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), markNonOutliers, + reinterpret_cast(input.data.get()), + (int)(input.height * input.width), filter_parameters_, max_radius, mark.data())); + + CHECK_CUDA_ERROR(cuda::ExclusiveScan(mark)); + + int remain_size = mark[point_num]; + + cuda::copyPointCloud2Metadata(*dev_input_points_, input); + + dev_input_points_->height = 1; + dev_input_points_->width = remain_size; + dev_input_points_->data = + cuda_blackboard::make_unique(remain_size * dev_input_points_->point_step); + + CHECK_CUDA_ERROR( + cuda::launchAsync( + point_num, 0, stream_->get(), getValidPoints, + reinterpret_cast(input.data.get()), + (int)(input.height * input.width), mark.data(), + reinterpret_cast(dev_input_points_->data.get()))); +} + +} // namespace autoware::cuda_ground_segmentation diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp new file mode 100644 index 0000000000000..e1f86ae6665e5 --- /dev/null +++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp @@ -0,0 +1,158 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
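+
+// cuda_scan_ground_segmentation_filter_node.cpp
+// ROS 2 component wrapping CudaScanGroundSegmentationFilter: it subscribes to a
+// cuda_blackboard point cloud, classifies each point as ground or non-ground on
+// the GPU, and publishes the two resulting clouds plus processing-time debug topics.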
+ +#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp" + +#include "autoware/pointcloud_preprocessor/utility/memory.hpp" + +#include + +namespace autoware::cuda_ground_segmentation +{ + +using autoware_utils::deg2rad; +CudaScanGroundSegmentationFilterNode::CudaScanGroundSegmentationFilterNode( + const rclcpp::NodeOptions & options) +: Node("cuda_scan_ground_segmentation_filter_node", options) +{ + // Delare processing time debug + { + using autoware_utils::DebugPublisher; + using autoware_utils::StopWatch; + stop_watch_ptr_ = std::make_unique>(); + debug_publisher_ptr_ = std::make_unique(this, "cuda_scan_ground_filter"); + stop_watch_ptr_->tic("cyclic_time"); + stop_watch_ptr_->tic("processing_time"); + } + // Declare parameters + FilterParameters filter_parameters; + // global parameters + filter_parameters.max_x = static_cast(declare_parameter("max_x")); + filter_parameters.min_x = static_cast(declare_parameter("min_x")); + filter_parameters.max_y = static_cast(declare_parameter("max_y")); + filter_parameters.min_y = static_cast(declare_parameter("min_y")); + filter_parameters.max_z = static_cast(declare_parameter("max_z")); + filter_parameters.min_z = static_cast(declare_parameter("min_z")); + filter_parameters.center_x = + static_cast(declare_parameter("center_pcl_shift")); // default 0.0 + + filter_parameters.max_radius = std::max( + std::max( + std::hypot(filter_parameters.max_x, filter_parameters.max_y), + std::hypot(filter_parameters.min_x, filter_parameters.min_y)), + std::max( + std::hypot(filter_parameters.max_x, filter_parameters.min_y), + std::hypot(filter_parameters.min_x, filter_parameters.max_y))); + + // common parameters + filter_parameters.sector_angle_rad = + static_cast(deg2rad(declare_parameter("sector_angle_deg"))); + filter_parameters.inv_sector_angle_rad = 1.0f / filter_parameters.sector_angle_rad; + filter_parameters.num_sectors = + static_cast(std::ceil(2.0 * M_PI * filter_parameters.inv_sector_angle_rad)); + + // common thresholds + filter_parameters.global_slope_max_angle_rad = + static_cast(deg2rad(declare_parameter("global_slope_max_angle_deg"))); + filter_parameters.local_slope_max_angle_rad = + static_cast(deg2rad(declare_parameter("local_slope_max_angle_deg"))); + filter_parameters.global_slope_max_ratio = std::tan(filter_parameters.global_slope_max_angle_rad); + filter_parameters.local_slope_max_ratio = std::tan(filter_parameters.local_slope_max_angle_rad); + + // cell mode parameters + filter_parameters.use_recheck_ground_cluster = + static_cast(declare_parameter("use_recheck_ground_cluster")); + filter_parameters.recheck_start_distance = + static_cast(declare_parameter("recheck_start_distance")); + filter_parameters.detection_range_z_max = + static_cast(declare_parameter("detection_range_z_max")); + filter_parameters.non_ground_height_threshold = + static_cast(declare_parameter("non_ground_height_threshold")); + + // cell parameters + filter_parameters.cell_divider_size_m = + static_cast(declare_parameter("grid_size_m")); + filter_parameters.max_num_cells_per_sector = + static_cast(filter_parameters.max_radius / filter_parameters.cell_divider_size_m); + filter_parameters.max_num_cells = static_cast( + filter_parameters.max_num_cells_per_sector * filter_parameters.num_sectors); + filter_parameters.gnd_cell_buffer_size = + static_cast(declare_parameter("gnd_cell_buffer_size")); + + int64_t max_mem_pool_size_in_byte = + declare_parameter("max_mem_pool_size_in_byte", 1e9); // 1 GB + + // Initialize CUDA blackboard 
publisher + // Initialize CUDA blackboard subscriber + sub_ = + std::make_shared>( + *this, "~/input/pointcloud", + std::bind( + &CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback, this, + std::placeholders::_1)); + + // Initialize CUDA blackboard publisher + pub_ = + std::make_unique>( + *this, "~/output/pointcloud"); + + pub_gnd_ = + std::make_unique>( + *this, "~/output/ground_pointcloud"); + + cuda_ground_segmentation_filter_ = std::make_unique( + filter_parameters, max_mem_pool_size_in_byte); +} + +void CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback( + const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg) +{ + // start time measurement + if (stop_watch_ptr_) { + stop_watch_ptr_->tic("processing_time"); + } + + auto non_ground_unique = std::make_unique(); + auto ground_unique = std::make_unique(); + + // Create shared_ptr from raw pointers for the function call + auto non_ground_shared = std::shared_ptr( + non_ground_unique.get(), [](auto *) {}); // no-op deleter + auto ground_shared = + std::shared_ptr(ground_unique.get(), [](auto *) {}); + + cuda_ground_segmentation_filter_->classifyPointCloud(*msg, *ground_shared, *non_ground_shared); + + // Publish using the original unique_ptr + pub_->publish(std::move(non_ground_unique)); + pub_gnd_->publish(std::move(ground_unique)); + + // end time measurement + if (debug_publisher_ptr_ && stop_watch_ptr_) { + stop_watch_ptr_->toc("processing_time"); + stop_watch_ptr_->toc("cyclic_time"); + const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true); + const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true); + debug_publisher_ptr_->publish( + "debug/cyclic_time_ms", cyclic_time_ms); + debug_publisher_ptr_->publish( + "debug/processing_time_ms", processing_time_ms); + } +} + +} // namespace autoware::cuda_ground_segmentation + +#include "rclcpp_components/register_node_macro.hpp" +RCLCPP_COMPONENTS_REGISTER_NODE( + autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode)
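
A note on the warp-level pattern used by segmentCell: each lane first accumulates partial statistics (sums, counts, minimum height) over a strided range of the cell's points, the lanes then combine their partials with __shfl_down_sync, and lane 0's totals are finally broadcast back to the whole warp with __shfl_sync. The standalone sketch below isolates that pattern with a plain sum/min reduction. It is illustrative only, assumes a single-warp launch, and none of its names (warpSumMinKernel, the host driver) come from this package.

// warp_reduction_sketch.cu -- illustrative only, not part of the package.
// Mirrors the reduction in segmentCell: per-lane accumulation, a
// __shfl_down_sync tree reduction, then a __shfl_sync broadcast of lane 0's result.
#include <cfloat>
#include <cstdio>
#include <vector>

constexpr unsigned int FULL_MASK = 0xffffffffu;
constexpr int WARP_SIZE = 32;

__global__ void warpSumMinKernel(const float * values, int n, float * out_sum, float * out_min)
{
  const int lane = threadIdx.x % WARP_SIZE;

  // Per-lane partial results over a strided range, like the per-cell point loop.
  float sum = 0.0f;
  float mn = FLT_MAX;
  for (int i = lane; i < n; i += WARP_SIZE) {
    sum += values[i];
    mn = fminf(mn, values[i]);
  }

  // Tree reduction within the warp: after the loop, lane 0 holds the totals.
  for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
    sum += __shfl_down_sync(FULL_MASK, sum, offset);
    const float other_min = __shfl_down_sync(FULL_MASK, mn, offset);
    mn = fminf(mn, other_min);
  }

  // Broadcast lane 0's result so every lane can keep using it, as segmentCell
  // does with the per-cell ground statistics.
  sum = __shfl_sync(FULL_MASK, sum, 0);
  mn = __shfl_sync(FULL_MASK, mn, 0);

  if (lane == 0) {
    *out_sum = sum;
    *out_min = mn;
  }
}

int main()
{
  const int n = 100;
  std::vector<float> host(n);
  for (int i = 0; i < n; ++i) {
    host[i] = static_cast<float>(i);
  }

  float *d_values = nullptr, *d_sum = nullptr, *d_min = nullptr;
  cudaMalloc(&d_values, n * sizeof(float));
  cudaMalloc(&d_sum, sizeof(float));
  cudaMalloc(&d_min, sizeof(float));
  cudaMemcpy(d_values, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  warpSumMinKernel<<<1, WARP_SIZE>>>(d_values, n, d_sum, d_min);

  float sum = 0.0f, mn = 0.0f;
  cudaMemcpy(&sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(&mn, d_min, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum=%.1f min=%.1f\n", sum, mn);  // expected: sum=4950.0 min=0.0

  cudaFree(d_values);
  cudaFree(d_sum);
  cudaFree(d_min);
  return 0;
}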