diff --git a/.cspell.json b/.cspell.json
index 76e570034d1f8..4d3eb28340131 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -4,5 +4,5 @@
"planning/behavior_velocity_planner/autoware_behavior_velocity_intersection_module/scripts/**"
],
"ignoreRegExpList": [],
- "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV"]
+ "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV", "CUDAH"]
}
diff --git a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
index 52a4fdd213ab6..7508214782cc3 100644
--- a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
+++ b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
@@ -48,6 +48,9 @@ def __init__(self, context):
context
)
self.use_time_series_filter = LaunchConfiguration("use_time_series_filter").perform(context)
+ self.use_cuda_ground_segmentation = (
+ LaunchConfiguration("use_cuda_ground_segmentation").perform(context).lower() == "true"
+ )
# check if self.use_single_frame_filter is bool
if isinstance(self.use_single_frame_filter, str):
self.use_single_frame_filter = self.use_single_frame_filter.lower() == "true"
@@ -176,7 +179,7 @@ def create_ransac_pipeline(self):
name="short_height_obstacle_detection_area_filter",
namespace="plane_fitting",
remappings=[
- ("input", "concatenated/pointcloud"),
+ ("input", "concat/pointcloud"),
("output", "detection_area/pointcloud"),
],
parameters=[
@@ -535,6 +538,46 @@ def launch_setup(context, *args, **kwargs):
pipeline = GroundSegmentationPipeline(context)
components = []
+ if pipeline.use_cuda_ground_segmentation:
+ ground_segmentation_node_param = ParameterFile(
+ param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(
+ context
+ ),
+ allow_substs=True,
+ )
+ components.append(
+ ComposableNode(
+ package="autoware_ground_segmentation_cuda",
+ plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode",
+ name="cuda_scan_ground_segmentation_filter",
+ remappings=[
+ ("~/input/pointcloud", "/sensing/lidar/concatenated/pointcloud"),
+ ("~/input/pointcloud/cuda", "/sensing/lidar/concatenated/pointcloud/cuda"),
+ ("~/output/pointcloud", "/perception/obstacle_segmentation/pointcloud"),
+ (
+ "~/output/pointcloud/cuda",
+ "/perception/obstacle_segmentation/pointcloud/cuda",
+ ),
+ (
+ "~/output/ground_pointcloud",
+ "/perception/obstacle_segmentation/ground_pointcloud",
+ ),
+ (
+ "~/output/ground_pointcloud/cuda",
+ "/perception/obstacle_segmentation/ground_pointcloud/cuda",
+ ),
+ ],
+ parameters=[ground_segmentation_node_param],
+ extra_arguments=[],
+ )
+ )
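+        # When CUDA ground segmentation is enabled, load only this node and skip the CPU pipeline below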
+ return [
+ LoadComposableNodes(
+ composable_node_descriptions=components,
+ target_container=LaunchConfiguration("pointcloud_container_name"),
+ )
+ ]
+
components.extend(
pipeline.create_single_frame_obstacle_segmentation_components(
input_topic=LaunchConfiguration("input/pointcloud"),
@@ -594,6 +637,7 @@ def add_launch_arg(name: str, default_value=None):
add_launch_arg("use_intra_process", "True")
add_launch_arg("pointcloud_container_name", "pointcloud_container")
add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud")
+ add_launch_arg("use_cuda_ground_segmentation", "False")
add_launch_arg(
"ogm_outlier_filter_param_path",
[
@@ -601,6 +645,13 @@ def add_launch_arg(name: str, default_value=None):
"/config/perception/obstacle_segmentation/occupancy_grid_based_outlier_filter/occupancy_grid_map_outlier_filter.param.yaml",
],
)
+ add_launch_arg(
+ "cuda_ground_segmentation_node_param_path",
+ [
+ FindPackageShare("autoware_ground_segmentation_cuda"),
+ "/config/cuda_scan_ground_segmentation_filter.param.yaml",
+ ],
+ )
set_container_executable = SetLaunchConfiguration(
"container_executable",
diff --git a/launch/tier4_perception_launch/launch/perception.launch.xml b/launch/tier4_perception_launch/launch/perception.launch.xml
index 52b00fc22bedc..cde5a75d46108 100644
--- a/launch/tier4_perception_launch/launch/perception.launch.xml
+++ b/launch/tier4_perception_launch/launch/perception.launch.xml
@@ -149,6 +149,7 @@
+
@@ -225,6 +226,7 @@
+
diff --git a/perception/autoware_ground_segmentation_cuda/CMakeLists.txt b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..68df65699c4d3
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt
@@ -0,0 +1,131 @@
+cmake_minimum_required(VERSION 3.8)
+project(autoware_ground_segmentation_cuda)
+
+find_package(ament_cmake_auto REQUIRED)
+find_package(CUDA)
+find_package(agnocastlib)
+
+if(NOT CUDA_FOUND)
+  message(WARNING "CUDA was not found, so the autoware_ground_segmentation_cuda package will not be built.")
+ return()
+elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
+ set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -G")
+endif()
+
+if(USE_AGNOCAST AND NOT agnocastlib_FOUND)
+ message(FATAL_ERROR "agnocastlib is required when USE_AGNOCAST is enabled")
+endif()
+
+ament_auto_find_build_dependencies()
+
+# Default to C++17
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function)
+endif()
+
+if(USE_AGNOCAST)
+ add_definitions(-DUSE_AGNOCAST_ENABLED)
+endif()
+
+if(BUILD_TESTING)
+ list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify)
+ find_package(ament_lint_auto REQUIRED)
+ ament_lint_auto_find_test_dependencies()
+endif()
+
+include_directories(
+ include
+ SYSTEM
+ ${CUDA_INCLUDE_DIRS}
+)
+
+# cSpell: ignore expt gencode
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_75,code=sm_75")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_86,code=sm_86")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_87,code=sm_87")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=sm_89")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=compute_89")
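+# The -gencode entries above target Turing (sm_75), Ampere (sm_86), Jetson Orin (sm_87),
+# and Ada Lovelace (sm_89) GPUs.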
+
+#################### cuda_ground_segmentation ##################
+cuda_add_library(cuda_ground_segmentation_lib SHARED
+ src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
+)
+
+target_link_libraries(cuda_ground_segmentation_lib
+ ${autoware_pointcloud_preprocessor_TARGETS}
+ ${autoware_cuda_pointcloud_preprocessor_TARGETS}
+)
+
+target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE
+ ${autoware_pointcloud_preprocessor_INCLUDE_DIRS}
+ ${autoware_cuda_pointcloud_preprocessor_INCLUDE_DIRS}
+ ${autoware_point_types_INCLUDE_DIRS}
+ ${cuda_blackboard_INCLUDE_DIRS}
+ ${diagnostic_msgs_INCLUDE_DIRS}
+ ${geometry_msgs_INCLUDE_DIRS}
+ ${rclcpp_INCLUDE_DIRS}
+ ${rclcpp_components_INCLUDE_DIRS}
+ ${rcl_interfaces_INCLUDE_DIRS}
+ ${sensor_msgs_INCLUDE_DIRS}
+ ${tf2_INCLUDE_DIRS}
+ ${tf2_msgs_INCLUDE_DIRS}
+ ${autoware_cuda_utils_INCLUDE_DIRS}
+)
+
+if(USE_AGNOCAST)
+ target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE
+ ${autoware_agnocast_wrapper_INCLUDE_DIRS}
+ ${agnocastlib_INCLUDE_DIRS}
+ )
+ target_link_libraries(cuda_ground_segmentation_lib
+ ${agnocastlib_LIBRARIES}
+ )
+endif()
+
+
+# Targets
+ament_auto_add_library(cuda_ground_segmentation SHARED
+ src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
+)
+
+target_link_libraries(cuda_ground_segmentation
+ ${CUDA_LIBRARIES}
+ ${diagnostic_msgs_LIBRARIES}
+ cuda_ground_segmentation_lib
+)
+
+#=========== ScanGround Segmentation filter ========
+rclcpp_components_register_node(cuda_ground_segmentation
+ PLUGIN "autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode"
+ EXECUTABLE cuda_scan_ground_segmentation_filter_node)
+
+################################################################################
+# Install
+install(DIRECTORY launch config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(
+ TARGETS cuda_ground_segmentation_lib
+ LIBRARY DESTINATION lib
+)
+
+ament_auto_package()
+
+# Set ROS_DISTRO macros
+set(ROS_DISTRO $ENV{ROS_DISTRO})
+if(${ROS_DISTRO} STREQUAL "rolling")
+ add_compile_definitions(ROS_DISTRO_ROLLING)
+elseif(${ROS_DISTRO} STREQUAL "foxy")
+ add_compile_definitions(ROS_DISTRO_FOXY)
+elseif(${ROS_DISTRO} STREQUAL "galactic")
+ add_compile_definitions(ROS_DISTRO_GALACTIC)
+elseif(${ROS_DISTRO} STREQUAL "humble")
+ add_compile_definitions(ROS_DISTRO_HUMBLE)
+endif()
diff --git a/perception/autoware_ground_segmentation_cuda/README.md b/perception/autoware_ground_segmentation_cuda/README.md
new file mode 100644
index 0000000000000..33218e12fa062
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/README.md
@@ -0,0 +1,19 @@
+# autoware_ground_segmentation_cuda
+
+## Purpose
+
+The `autoware_ground_segmentation` algorithms have been thoroughly tested with Autoware. However, due to the latency and high computational cost of processing large point clouds, the input point cloud range has been limited by the `crop_box_filter` relative to the ego-vehicle's `base_link`. This can cause unwanted object loss, especially in front of a sloped road.
+
+
+
+Recently, GPU and CUDA-supported libraries such as [cuda_blackboard](https://github.com/autowarefoundation/cuda_blackboard/blob/1837689df2891f6223f07c178c21aed252566ede/README.md) and accelerated versions of [`autoware_pointcloud_preprocessor`](../../sensing/autoware_cuda_pointcloud_preprocessor/README.md) have been implemented. These can be leveraged to improve the performance of ground segmentation filter algorithms using CUDA/GPU.
+
+This package reimplements the current `scan_ground_filter` of the `autoware_ground_segmentation` package with CUDA to reduce latency and avoid the bottleneck caused by processing a large number of points.
+
+## Inner-workings / Algorithm
+
+The detailed algorithm is available in [scan-ground-filter.md](../autoware_ground_segmentation/docs/scan-ground-filter.md).
+
+## Parameters
+
+{{ json_to_markdown("perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json") }}
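+
+## Usage
+
+A minimal example of launching the node standalone, assuming the workspace has been built and sourced (the default topic remappings and parameter file come from `launch/cuda_scan_ground_filter.launch.py` in this package). Within the full perception stack, the node can instead be enabled through the `use_cuda_ground_segmentation` launch argument added to `tier4_perception_launch`.
+
+```bash
+ros2 launch autoware_ground_segmentation_cuda cuda_scan_ground_filter.launch.py
+```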
diff --git a/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml
new file mode 100644
index 0000000000000..20c8026798e8d
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml
@@ -0,0 +1,19 @@
+/**:
+ ros__parameters:
+ global_slope_max_angle_deg: 10.0
+ local_slope_max_angle_deg: 25.0
+ non_ground_height_threshold: 0.20
+ grid_size_m: 0.5
+ detection_range_z_max: 3.2
+ use_recheck_ground_cluster: true
+ recheck_start_distance: 20.0
+ use_lowest_point: true
+ center_pcl_shift: 0.0
+ sector_angle_deg: 1.0
+ gnd_cell_buffer_size: 5
+ min_x: -100.0
+ max_x: 150.0
+ min_y: -70.0
+ max_y: 70.0
+ max_z: 2.5
+ min_z: -2.5
diff --git a/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png
new file mode 100644
index 0000000000000..31586172329a7
Binary files /dev/null and b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png differ
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp
new file mode 100644
index 0000000000000..b22ab4cc9aefd
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp
@@ -0,0 +1,38 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+
+#include
+
+#include
+
+#ifndef CUDAH
+#define CUDAH __forceinline__ __host__ __device__
+#endif
+
+#ifndef BLOCK_SIZE_X
+#define BLOCK_SIZE_X (256)
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE (32)
+#endif
+
+#ifndef FULL_MASK
+#define FULL_MASK (0xFFFFFFFF)
+#endif
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp
new file mode 100644
index 0000000000000..d58384f9070eb
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp
@@ -0,0 +1,83 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
+
+#include
+
+#include
+
+namespace autoware
+{
+
+class CudaMempool
+{
+public:
+ inline explicit CudaMempool(int device_id = 0, int64_t pool_release_threshold = 0)
+ {
+ cudaMemPoolProps pool_props = {};
+
+    // Use the currently active device (this overrides the device_id argument)
+ CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+
+ pool_props.allocType = cudaMemAllocationTypePinned;
+ pool_props.location.id = device_id;
+ pool_props.location.type = cudaMemLocationTypeDevice;
+
+ CHECK_CUDA_ERROR(cudaMemPoolCreate(&pool_, &pool_props));
+
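+    // cudaMemPoolAttrReleaseThreshold is the amount of reserved memory (in bytes) the pool
+    // holds on to before trying to release memory back to the OS; the default of 0 releases
+    // unused memory at every stream/event/context synchronization.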
+ if (pool_release_threshold >= 0) {
+ CHECK_CUDA_ERROR(
+ cudaMemPoolSetAttribute(pool_, cudaMemPoolAttrReleaseThreshold, &pool_release_threshold));
+ }
+ }
+
+ CudaMempool(const CudaMempool &) = delete;
+ inline CudaMempool(CudaMempool && other) : pool_(std::move(other.pool_))
+ {
+ other.pool_ = nullptr;
+ }
+
+ CudaMempool & operator=(const CudaMempool &) = delete;
+ inline CudaMempool & operator=(CudaMempool && other)
+ {
+ if (this != &other) {
+ release();
+ pool_ = std::move(other.pool_);
+ other.pool_ = nullptr;
+ }
+
+ return *this;
+ }
+
+ inline cudaMemPool_t get() { return pool_; }
+
+ ~CudaMempool() { release(); }
+
+private:
+ inline void release()
+ {
+ if (pool_) {
+ CHECK_CUDA_ERROR(cudaMemPoolDestroy(pool_));
+ }
+
+ pool_ = nullptr;
+ }
+
+ cudaMemPool_t pool_;
+};
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp
new file mode 100644
index 0000000000000..d7fc8875d61d1
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp
@@ -0,0 +1,179 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
+
+#include "cuda_common.hpp"
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+#include "device_vector.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+
+enum SegmentationMode : uint8_t { UNINITIALIZED = 0, CONTINUOUS, DISCONTINUOUS, BREAK };
+
+struct PointTypeStruct
+{
+ float x;
+ float y;
+ float z;
+ std::uint8_t intensity;
+ std::uint8_t return_type;
+ std::uint16_t channel;
+};
+
+enum class PointType : uint8_t {
+ INIT = 0,
+ GROUND,
+ NON_GROUND,
+ POINT_FOLLOW,
+ UNKNOWN,
+ VIRTUAL_GROUND,
+ OUT_OF_RANGE
+};
+
+struct alignas(16) ClassifiedPointType
+{
+ float z;
+ PointType type;
+ float radius;
+ size_t origin_index; // index in the original point cloud
+
+ CUDAH ClassifiedPointType() : z(0.0), type(PointType::INIT), radius(-1.0), origin_index(0) {}
+};
+
+struct alignas(16) Cell
+{
+ float gnd_radius_avg;
+ float gnd_height_avg;
+ float gnd_height_min;
+ uint32_t num_ground_points;
+
+  // Default constructor: initialize all ground statistics to zero
+ CUDAH Cell()
+ : gnd_radius_avg(0.0f), gnd_height_avg(0.0f), gnd_height_min(0.0f), num_ground_points(0)
+ {
+ }
+};
+
+// structure to hold parameter values
+struct FilterParameters
+{
+ float max_x;
+ float min_x;
+ float max_y;
+ float min_y;
+ float max_z;
+ float min_z;
+ float max_radius;
+ float non_ground_height_threshold;
+ // common thresholds
+ float global_slope_max_angle_rad; // radians
+ float local_slope_max_angle_rad; // radians
+ float global_slope_max_ratio;
+ float local_slope_max_ratio;
+ // common parameters
+ float sector_angle_rad; // radial sector angle in radians
+ float inv_sector_angle_rad;
+
+ // cell mode parameters
+ float recheck_start_distance; // distance to start rechecking ground cluster
+ float detection_range_z_max;
+ // cell parameters
+ float cell_divider_size_m;
+ float center_x{0.0f};
+ float center_y{0.0f};
+ uint32_t max_num_cells;
+ uint32_t max_num_cells_per_sector; // number of cells per sector
+ uint32_t num_sectors; // number of radial sectors
+ uint32_t gnd_grid_continual_thresh{3}; // threshold for continual ground grid
+ uint32_t use_recheck_ground_cluster; // to enable recheck ground cluster
+ uint32_t gnd_cell_buffer_size{5};
+};
+
+class CudaScanGroundSegmentationFilter
+{
+public:
+ explicit CudaScanGroundSegmentationFilter(
+ const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte);
+ ~CudaScanGroundSegmentationFilter() = default;
+
+ // Method to process the point cloud data and filter ground points
+ void classifyPointCloud(
+ const cuda_blackboard::CudaPointCloud2 & input_points,
+ cuda_blackboard::CudaPointCloud2 & ground, cuda_blackboard::CudaPointCloud2 & non_ground);
+
+private:
+ std::unique_ptr dev_input_points_, non_ground_, ground_;
+ std::unique_ptr> empty_cell_mark_;
+
+ uint32_t number_input_points_;
+ uint32_t num_process_points_host_;
+ uint32_t input_pointcloud_step_;
+ uint32_t input_xyzi_offset_[4];
+ float center_x_{0.0f};
+ float center_y_{0.0f};
+ const uint32_t gnd_grid_continual_thresh_{3}; // threshold for continual ground grid
+ const uint32_t continual_gnd_grid_thresh_{5}; // threshold for continual ground grid with recheck
+ // Parameters
+ FilterParameters filter_parameters_;
+
+private:
+ // Remove points too far from the origin
+ void removeOutliers(const cuda_blackboard::CudaPointCloud2 & input);
+
+ /*
+   * This function calculates the cell_id for each point and
+   * assigns the points, with an initialized class, into temporary memory for classification.
+   * The memory size of each cell depends on the predefined number of points per cell.
+   */
+
+ void scanPerSectorGroundReference(
+ device_vector & cell_list, device_vector & starting_pid,
+ device_vector & classified_points);
+
+ /*
+   * Extract points of the requested type from classified_points into the output point cloud
+ */
+ template
+ void extractPoints(
+ device_vector & classified_points,
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output);
+
+ // Distribute points to cells
+ void sort_points(
+ device_vector & cell_list, device_vector & starting_pid,
+ device_vector & classified_points);
+
+ std::shared_ptr stream_;
+ std::shared_ptr mempool_;
+};
+
+} // namespace autoware::cuda_ground_segmentation
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp
new file mode 100644
index 0000000000000..f5e82b0ef7125
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp
@@ -0,0 +1,64 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp"
+
+#include
+#include
+#include
+#include
+#include
+
+// Autoware utils
+#include
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+class CudaScanGroundSegmentationFilterNode : public rclcpp::Node
+{
+public:
+ explicit CudaScanGroundSegmentationFilterNode(const rclcpp::NodeOptions & options);
+
+private:
+ void cudaPointCloudCallback(const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg);
+ // Cuda Ground Segmentation Filter
+ std::unique_ptr cuda_ground_segmentation_filter_{};
+ // debugger
+ std::unique_ptr> stop_watch_ptr_{nullptr};
+ std::unique_ptr debug_publisher_ptr_{nullptr};
+
+ // rclcpp::Subscription::SharedPtr sub_;
+ // rclcpp::Publisher::SharedPtr pub_;
+ // rclcpp::Publisher::SharedPtr pub_gnd_;
+
+ // Cuda Sub
+ std::shared_ptr>
+ sub_{};
+ // Cuda Pub
+ std::unique_ptr>
+ pub_{};
+
+ std::unique_ptr>
+ pub_gnd_{};
+};
+
+} // namespace autoware::cuda_ground_segmentation
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp
new file mode 100644
index 0000000000000..e33dd49f36eb6
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp
@@ -0,0 +1,72 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
+
+#include
+
+#include
+
+#include
+
+namespace autoware
+{
+
+// A wrapper around cudaStream_t, with a destructor that releases the stream automatically
+class CudaStream
+{
+public:
+  inline explicit CudaStream(bool is_null = false)
+ {
+ // If is_null is true, we use the default stream
+ // Otherwise, we create a new stream
+ if (!is_null) {
+ CHECK_CUDA_ERROR(cudaStreamCreate(&stream_));
+ }
+ }
+
+ CudaStream(const CudaStream &) = delete;
+ CudaStream(CudaStream && other) : stream_(std::move(other.stream_)) { other.stream_ = nullptr; }
+
+ CudaStream & operator=(const CudaStream &) = delete;
+ inline CudaStream & operator=(CudaStream && other)
+ {
+ if (this != &other) {
+ release();
+ stream_ = std::move(other.stream_);
+ other.stream_ = nullptr;
+ }
+
+ return *this;
+ }
+
+ inline cudaStream_t & get() { return stream_; }
+
+ ~CudaStream() { release(); }
+
+private:
+ inline void release()
+ {
+    if (stream_) {
+      CHECK_CUDA_ERROR(cudaStreamDestroy(stream_));
+    }
+
+    stream_ = nullptr;
+  }
+
+ cudaStream_t stream_{nullptr};
+};
+
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh
new file mode 100644
index 0000000000000..db560b87b92d9
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh
@@ -0,0 +1,147 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
+
+#include "cuda_common.hpp"
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+#include "device_vector.hpp"
+
+#include
+
+#include
+
+namespace autoware::cuda
+{
+
+template
+inline cudaError_t launchAsync(
+ int thread_num, int shared_size, cudaStream_t & stream, void (*f)(Args...), Args... args)
+{
+ int block_x = (thread_num > block_size) ? block_size : thread_num;
+
+ if (block_x <= 0) {
+ return cudaErrorLaunchFailure;
+ }
+
+ int grid_x = (thread_num + block_x - 1) / block_x;
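+  // e.g. thread_num = 1000 with block_size = 256 gives block_x = 256 and grid_x = 4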
+
+  f<<<grid_x, block_x, shared_size, stream>>>(args...);
+
+ return cudaGetLastError();
+}
+
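+// Exclusive prefix sum over a device array, e.g. [3, 1, 4, 1] -> [0, 3, 4, 8].
+// cub::DeviceScan::ExclusiveSum is called twice below: the first call (with a null
+// temporary-storage pointer) only queries the required temporary-storage size,
+// the second call performs the actual scan.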
+template
+cudaError_t ExclusiveScan(
+ T * input, T * output, int ele_num, std::shared_ptr stream,
+ std::shared_ptr mempool)
+{
+ if (ele_num == 0) {
+ return cudaSuccess;
+ }
+
+ if (ele_num < 0 || !stream) {
+ return cudaErrorInvalidValue;
+ }
+
+ device_vector d_temp_storage(stream, mempool);
+ size_t temp_storage_bytes = 0;
+
+ cub::DeviceScan::ExclusiveSum(
+ (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get());
+
+ int temp_ele_num = (temp_storage_bytes + sizeof(int) - 1) / sizeof(int);
+ d_temp_storage.resize(temp_ele_num);
+
+ cub::DeviceScan::ExclusiveSum(
+ (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get());
+
+ return cudaGetLastError();
+}
+
+template
+cudaError_t ExclusiveScan(device_vector & input, device_vector & output)
+{
+ if (input.empty()) {
+ return cudaSuccess;
+ }
+
+ output.resize(input.size());
+
+ return ExclusiveScan(
+ input.data(), output.data(), (int)(input.size()), input.get_stream(), input.get_mempool());
+}
+
+template
+cudaError_t ExclusiveScan(device_vector & input)
+{
+ return ExclusiveScan(input, input);
+}
+
+template
+__global__ void fillVector(T * vec, int ele_num, T init_val)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < ele_num; i += stride) {
+ vec[i] = init_val;
+ }
+}
+
+template
+cudaError_t fill(T * input, int ele_num, T val, std::shared_ptr stream)
+{
+ if (ele_num == 0) {
+ return cudaSuccess;
+ }
+
+ if (ele_num < 0 || !stream) {
+ return cudaErrorInvalidValue;
+ }
+
+ return launchAsync(ele_num, 0, stream->get(), fillVector, input, ele_num, val);
+}
+
+template
+cudaError_t fill(device_vector & input, T val)
+{
+ return fill(input.data(), (int)(input.size()), val, input.get_stream());
+}
+
+CUDAH void memcpy(uint8_t * dst, const uint8_t * src, int size)
+{
+ for (int i = 0; i < size; ++i) {
+ dst[i] = src[i];
+ }
+}
+
+inline void copyPointCloud2Metadata(
+ cuda_blackboard::CudaPointCloud2 & dst, const cuda_blackboard::CudaPointCloud2 & src)
+{
+ dst.header = src.header;
+ dst.height = src.height;
+ dst.width = src.width;
+ dst.fields = src.fields;
+ dst.is_bigendian = src.is_bigendian;
+ dst.point_step = src.point_step;
+ dst.row_step = src.row_step;
+ dst.is_dense = src.is_dense;
+}
+
+} // namespace autoware::cuda
+
+#endif  // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp
new file mode 100644
index 0000000000000..20928e4c84321
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp
@@ -0,0 +1,332 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
+
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace autoware
+{
+
+template
+class device_vector
+{
+public:
+ device_vector(
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector(
+ const device_vector & other,
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr,
+ bool copy_all = false // If true, use the stream and mem_pool of the other
+ );
+
+ device_vector(device_vector && other);
+
+ device_vector(
+ size_t ele_num, std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector(
+ const std::vector & other,
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector & operator=(const device_vector & other);
+ device_vector & operator=(device_vector && other);
+ device_vector & operator=(const std::vector & other);
+
+ void to_vector(std::vector & output) const
+ {
+ output.resize(ele_num_);
+ copyDtoH(output.data(), data_, ele_num_);
+ }
+
+ // Slow, do not use unless necessary
+ T operator[](int idx) const
+ {
+ if (idx < 0 || idx >= ele_num_) {
+ throw std::invalid_argument("Error: out-of-bound access at index " + std::to_string(idx));
+ }
+ T val;
+
+ copyDtoH(&val, data_ + idx, 1);
+
+ CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_->get()));
+
+ return val;
+ }
+
+ T * data() { return data_; }
+ const T * data() const { return data_; }
+ size_t size() const { return ele_num_; }
+
+ void resize(size_t new_size);
+ void reserve(size_t new_size);
+ bool empty() const { return (ele_num_ == 0); }
+ void clear();
+
+ std::shared_ptr get_stream() { return stream_; }
+ std::shared_ptr get_mempool() { return mempool_; }
+
+ ~device_vector();
+
+  // Allocate device memory for ele_num elements
+ // Return false if allocation fail
+ void allocate(size_t ele_num);
+ void release();
+ void copyDtoD(T * dst, const T * src, size_t ele_num);
+ void copyHtoD(T * dst, const T * src, size_t ele_num);
+ void copyDtoH(T * dst, const T * src, size_t ele_num) const;
+
+ T * data_;
+ size_t ele_num_; // The assumed size
+ size_t actual_ele_num_; // The actual size of the vector
+ std::shared_ptr stream_; // If null, use the default stream
+ std::shared_ptr mempool_; // If null, use the default non-pool allocation
+};
+
+template
+device_vector::device_vector(
+ std::shared_ptr stream, std::shared_ptr mem_pool)
+: stream_(stream), mempool_(mem_pool)
+{
+ data_ = nullptr;
+ ele_num_ = actual_ele_num_ = 0;
+}
+
+template
+device_vector::device_vector(
+ const device_vector & other, std::shared_ptr stream,
+ std::shared_ptr mem_pool, bool copy_all)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (copy_all) {
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+ }
+
+ if (other.empty()) {
+ return;
+ }
+
+ // Allocate memory
+ resize(other.size());
+
+ // Copy
+ copyDtoD(data_, other.data_, other.size());
+}
+
+template
+device_vector::device_vector(device_vector && other)
+{
+ data_ = other.data_;
+ ele_num_ = other.ele_num_;
+ actual_ele_num_ = other.actual_ele_num_;
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+
+ other.data_ = nullptr;
+ other.ele_num_ = other.actual_ele_num_ = 0;
+ other.stream_.reset();
+ other.mempool_.reset();
+}
+
+template
+device_vector::device_vector(
+ size_t ele_num, std::shared_ptr stream, std::shared_ptr mem_pool)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (ele_num > 0) {
+ resize(ele_num);
+ }
+}
+
+template
+device_vector::device_vector(
+ const std::vector & other, std::shared_ptr stream,
+ std::shared_ptr mem_pool)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (other.empty()) {
+ return;
+ }
+
+ allocate(other.size());
+ copyHtoD(data_, other.data(), other.size());
+ ele_num_ = other.size();
+}
+
+template
+device_vector & device_vector::operator=(const device_vector & other)
+{
+ size_t other_size = other.size();
+
+ resize(other_size);
+
+  if (other_size != 0) {
+ copyDtoD(data_, other.data_, other_size);
+ }
+
+ return *this;
+}
+
+template
+device_vector & device_vector::operator=(device_vector && other)
+{
+ release();
+
+ data_ = other.data_;
+ ele_num_ = other.ele_num_;
+ actual_ele_num_ = other.actual_ele_num_;
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+
+ // TODO: is stream_ = other.stream_ necessary?
+ other.data_ = nullptr;
+ other.ele_num_ = other.actual_ele_num_ = 0;
+ other.mempool_.reset();
+ other.stream_.reset();
+
+ return *this;
+}
+
+template
+device_vector & device_vector::operator=(const std::vector & other)
+{
+ release();
+
+ if (other.empty()) {
+ return *this;
+ }
+
+  allocate(other.size());
+  copyHtoD(data_, other.data(), other.size());
+  ele_num_ = other.size();
+
+ return *this;
+}
+
+template
+void device_vector::resize(size_t new_ele_num)
+{
+ // If no significant change, just change the number of elements
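+  // (e.g. with actual_ele_num_ = 100, any new size in [80, 100] reuses the existing buffer)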
+ if (new_ele_num >= actual_ele_num_ * 0.8 && new_ele_num <= actual_ele_num_) {
+ ele_num_ = new_ele_num;
+ return;
+ }
+
+ // Otherwise, reallocate the data
+ release();
+ allocate(new_ele_num);
+ ele_num_ = new_ele_num;
+}
+
+template
+void device_vector::reserve(size_t new_ele_num)
+{
+ if (new_ele_num <= actual_ele_num_ * 0.8) {
+ return;
+ }
+
+  // Reallocation does not preserve existing contents (same behavior as resize())
+  release();
+  allocate(new_ele_num);
+}
+
+template
+void device_vector::clear()
+{
+ // Release the memory only
+ release();
+}
+
+template
+device_vector::~device_vector()
+{
+ // Release the memory
+ release();
+ // Release the stream and memory pool
+ stream_.reset();
+ mempool_.reset();
+}
+
+template
+void device_vector::allocate(size_t ele_num)
+{
+ if (ele_num > 0) {
+ if (mempool_) {
+ CHECK_CUDA_ERROR(
+ cudaMallocFromPoolAsync(&data_, sizeof(T) * ele_num, mempool_->get(), stream_->get()));
+ } else {
+ CHECK_CUDA_ERROR(cudaMallocAsync(&data_, sizeof(T) * ele_num, stream_->get()));
+ }
+ }
+
+ actual_ele_num_ = ele_num;
+ // Currently no element yet
+ ele_num_ = 0;
+}
+
+template
+void device_vector::release()
+{
+ if (data_) {
+ CHECK_CUDA_ERROR(cudaFreeAsync(data_, stream_->get()));
+ }
+
+ data_ = nullptr;
+ ele_num_ = 0;
+ actual_ele_num_ = 0;
+}
+
+template
+void device_vector::copyDtoD(T * dst, const T * src, size_t ele_num)
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToDevice, stream_->get()));
+}
+
+template
+void device_vector::copyHtoD(T * dst, const T * src, size_t ele_num)
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyHostToDevice, stream_->get()));
+}
+
+template
+void device_vector::copyDtoH(T * dst, const T * src, size_t ele_num) const
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToHost, stream_->get()));
+}
+
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py
new file mode 100644
index 0000000000000..688b1743806fc
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py
@@ -0,0 +1,94 @@
+# Copyright 2021 Tier IV, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import launch
+from launch.actions import DeclareLaunchArgument
+from launch.actions import GroupAction
+from launch.actions import OpaqueFunction
+from launch.conditions import LaunchConfigurationEquals
+from launch.conditions import LaunchConfigurationNotEquals
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import ComposableNodeContainer
+from launch_ros.actions import LoadComposableNodes
+from launch_ros.descriptions import ComposableNode
+from launch_ros.parameter_descriptions import ParameterFile
+from launch_ros.substitutions import FindPackageShare
+
+
+def launch_setup(context, *args, **kwargs):
+
+ ground_segmentation_node_param = ParameterFile(
+ param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(context),
+ allow_substs=True,
+ )
+
+ nodes = [
+ ComposableNode(
+ package="autoware_ground_segmentation_cuda",
+ plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode",
+ name="cuda_scan_ground_segmentation_filter",
+ remappings=[
+ ("~/input/pointcloud", LaunchConfiguration("input/pointcloud")),
+ ("~/output/pointcloud", LaunchConfiguration("output/pointcloud")),
+ ],
+ parameters=[ground_segmentation_node_param],
+ extra_arguments=[],
+ ),
+ ]
+
+ loader = LoadComposableNodes(
+ condition=LaunchConfigurationNotEquals("container", ""),
+ composable_node_descriptions=nodes,
+ target_container=LaunchConfiguration("container"),
+ )
+
+ container = ComposableNodeContainer(
+ name="scan_ground_filter_container",
+ namespace="",
+ package="rclcpp_components",
+ executable="component_container",
+ composable_node_descriptions=nodes,
+ output="screen",
+ condition=LaunchConfigurationEquals("container", ""),
+ )
+
+ group = GroupAction(
+ [
+ container,
+ loader,
+ ]
+ )
+
+ return [group]
+
+
+def generate_launch_description():
+ def add_launch_arg(name: str, default_value=None):
+ return DeclareLaunchArgument(name, default_value=default_value)
+
+ return launch.LaunchDescription(
+ [
+ add_launch_arg("container", ""),
+ add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud"),
+ add_launch_arg("output/pointcloud", "/perception/obstacle_segmentation/pointcloud"),
+ add_launch_arg(
+ "cuda_ground_segmentation_node_param_path",
+ [
+ FindPackageShare("autoware_ground_segmentation_cuda"),
+ "/config/cuda_scan_ground_segmentation_filter.param.yaml",
+ ],
+ ),
+ ]
+ + [OpaqueFunction(function=launch_setup)]
+ )
diff --git a/perception/autoware_ground_segmentation_cuda/package.xml b/perception/autoware_ground_segmentation_cuda/package.xml
new file mode 100644
index 0000000000000..38d7e4f6931de
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/package.xml
@@ -0,0 +1,39 @@
+
+
+
+ autoware_ground_segmentation_cuda
+ 0.0.0
+ GPU-Accelerated ground segmentation
+ badai-nguyen
+ Anh Nguyen
+ badai-nguyen
+ Apache License 2.0
+
+ ament_cmake_auto
+ autoware_cmake
+
+ agnocastlib
+ autoware_agnocast_wrapper
+ autoware_cuda_dependency_meta
+ autoware_cuda_pointcloud_preprocessor
+ autoware_cuda_utils
+ autoware_point_types
+ autoware_pointcloud_preprocessor
+ autoware_utils
+ cuda_blackboard
+ diagnostic_msgs
+ rclcpp
+ rclcpp_components
+ sensor_msgs
+ tf2
+ tf2_msgs
+ tier4_debug_msgs
+
+ ament_clang_format
+ ament_lint_auto
+ ament_lint_common
+
+
+ ament_cmake
+
+
diff --git a/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json
new file mode 100644
index 0000000000000..f95891773c172
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json
@@ -0,0 +1,126 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Cuda Scan Ground Segmentation Filter Node Params",
+ "type": "object",
+ "definitions": {
+ "cuda_scan_ground_segmentation_filter": {
+ "type": "object",
+ "properties": {
+ "global_slope_max_angle_deg": {
+ "type": "number",
+ "description": "Maximum global ground slope angle [deg]. Larger values reduce false negatives on steep roads but can increase false positives (classifying objects as ground).",
+ "default": 10.0
+ },
+ "local_slope_max_angle_deg": {
+ "type": "number",
+ "description": "Maximum local slope angle [deg] between adjacent points (used for fine discrimination of small objects or ramps).",
+ "default": 13.0
+ },
+ "non_ground_height_threshold": {
+ "type": "number",
+ "description": "Absolute vertical distance threshold [m] above estimated ground line to classify a point as non-ground.",
+ "default": 0.2
+ },
+ "grid_size_m": {
+ "type": "number",
+ "description": "Radial cell size in polar grid [m]; larger values smooth ground estimation but may miss small objects.",
+ "default": 0.1
+ },
+ "gnd_cell_buffer_size": {
+ "type": "integer",
+ "description": "Number of previous ground cells used to estimate local ground slope.",
+ "default": 4
+ },
+ "detection_range_z_max": {
+ "type": "number",
+ "description": "Maximum Z height (above base_link) considered in classification [m].",
+ "default": 2.5
+ },
+ "center_pcl_shift": {
+ "type": "number",
+ "description": "X-axis offset [m] applied to additional LiDAR point clouds relative to vehicle reference (positive forward).",
+ "default": 0.0
+ },
+ "sector_angle_deg": {
+ "type": "number",
+ "description": "Angular width of each polar sector [deg].",
+ "default": 1.0
+ },
+ "use_recheck_ground_cluster": {
+ "type": "boolean",
+ "description": "Enable secondary pass to re-evaluate ground clusters (may reduce false ground on elevated points).",
+ "default": true
+ },
+ "recheck_start_distance": {
+ "type": "number",
+ "description": "Minimum radial distance [m] at which ground cluster recheck logic begins.",
+ "default": 20.0,
+ "minimum": 0.0
+ },
+ "use_lowest_point": {
+ "type": "boolean",
+ "description": "Use lowest point in cell as reference during ground recheck (otherwise median-like selection).",
+ "default": true
+ },
+ "min_x": {
+ "type": "number",
+ "description": "Minimum X range relative to base_link [m] included in processing.",
+ "default": -100.0
+ },
+ "max_x": {
+ "type": "number",
+ "description": "Maximum X (forward) range relative to base_link [m] included in processing.",
+ "default": 150.0
+ },
+ "min_y": {
+ "type": "number",
+ "description": "Minimum Y range relative to base_link [m] included in processing.",
+ "default": -70.0
+ },
+ "max_y": {
+ "type": "number",
+ "description": "Maximum Y range relative to base_link [m] included in processing.",
+ "default": 70.0
+ },
+ "min_z": {
+ "type": "number",
+ "description": "Minimum Z range relative to base_link [m] included in processing.",
+ "default": -2.5
+ },
+ "max_z": {
+ "type": "number",
+ "description": "Maximum Z range relative to base_link [m] included in processing.",
+ "default": 2.5
+ }
+ },
+ "required": [
+ "global_slope_max_angle_deg",
+ "local_slope_max_angle_deg",
+ "non_ground_height_threshold",
+ "grid_size_m",
+ "detection_range_z_max",
+ "use_recheck_ground_cluster",
+ "recheck_start_distance",
+ "use_lowest_point",
+ "center_pcl_shift",
+ "sector_angle_deg",
+ "gnd_cell_buffer_size"
+ ],
+ "additionalProperties": false
+ }
+ },
+ "properties": {
+ "/**": {
+ "type": "object",
+ "properties": {
+ "ros__parameters": {
+ "$ref": "#/definitions/cuda_scan_ground_segmentation_filter"
+ }
+ },
+ "required": ["ros__parameters"],
+ "additionalProperties": false
+ }
+ },
+ "required": ["/**"],
+ "additionalProperties": false
+}
diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
new file mode 100644
index 0000000000000..4c09415dab4e2
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
@@ -0,0 +1,819 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+
+__device__ __forceinline__ float fastAtan2_0_2Pi(float y, float x)
+{
+ return fmodf(atan2(y, x) + 2.0f * M_PI, 2.0f * M_PI);
+}
+
+// Initialize the cell list
+__global__ void cellInit(Cell * __restrict__ cell_list, int max_num_cells)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+ Cell new_cell;
+
+ for (int i = index; i < max_num_cells; i += stride) {
+ new_cell.gnd_height_min = 1e6f;
+
+ cell_list[i] = new_cell;
+ }
+}
+
+/**
+ * @brief CUDA kernel to count the number of points in each grid cell for ground segmentation.
+ *
+ * This kernel processes each input point, computes its polar coordinates (radius and angle)
+ * relative to a specified center, and determines which cell in a polar grid the point belongs to.
+ * It then atomically increments the point count for the corresponding cell.
+ *
+ * @param input_points Pointer to the array of input points (device memory).
+ * @param point_num Number of input points in the array.
+ * @param param Filter parameters (passed by value), containing grid and segmentation
+ *   configuration such as the center coordinates, sector angle, and cell size.
+ * @param cell_id Output array (device memory) that receives the cell id computed for each point.
+ * @param count Output array (device memory) that accumulates the number of points per cell.
+ * @param classified_points Output array (device memory) of per-point classification records,
+ *   initialized here with each point's height, radius, and original index.
+ *
+ * @note Each thread processes one or more points (grid-stride loop). Atomic operations are used
+ *   to safely increment a cell's point count when multiple threads access it concurrently.
+ */
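+// Example with a 1-degree sector angle (360 sectors) and a 0.5 m cell size:
+// a point at (x, y) = (10, 10) relative to the center has radius ~14.14 m and angle 45 deg,
+// so sector_id = 45, cell_id_in_sector = 28, and point_cell_id = 28 * 360 + 45 = 10125.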
+__global__ void computeCellId(
+ const PointTypeStruct * __restrict__ input_points, int point_num, FilterParameters param,
+ int * __restrict__ cell_id, int * __restrict__ count,
+ ClassifiedPointType * __restrict__ classified_points)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+ ClassifiedPointType cp;
+
+ cp.type = PointType::INIT;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = input_points[i];
+ float dx = p.x - param.center_x;
+ float dy = p.y - param.center_y;
+ float radius = hypotf(dx, dy);
+ float angle = fastAtan2_0_2Pi(dy, dx);
+ int sector_id = static_cast(angle * param.inv_sector_angle_rad);
+ int cell_id_in_sector = static_cast(radius / param.cell_divider_size_m);
+ int point_cell_id = cell_id_in_sector * param.num_sectors + sector_id;
+
+ // Save this so we don't have to recalculate later
+ cell_id[i] = point_cell_id;
+
+    // Also, initialize the classified point record
+ cp.z = p.z;
+ cp.radius = radius;
+ cp.origin_index = i;
+
+ classified_points[i] = cp;
+
+ // Also update the number of points in each cell atomically
+ atomicAdd(count + point_cell_id, 1);
+ }
+}
+
+__global__ void distributePointsToCell(
+ ClassifiedPointType * input, ClassifiedPointType * output, int * cell_id, int point_num,
+ int * writing_loc)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int cid = cell_id[i];
+ int wloc = atomicAdd(writing_loc + cid, 1);
+
+ output[wloc] = input[i];
+ }
+}
+
+CudaScanGroundSegmentationFilter::CudaScanGroundSegmentationFilter(
+ const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte)
+: filter_parameters_(filter_parameters)
+{
+ stream_.reset(new CudaStream());
+
+ int current_device_id = 0;
+
+  CHECK_CUDA_ERROR(cudaGetDevice(&current_device_id));
+ mempool_.reset(new CudaMempool(current_device_id, max_mem_pool_size_in_byte));
+
+ // Pre-allocate the PointCloud2 objects
+ // The CudaPointCloud2 still use sync cudaMalloc to allocate memory
+ // Those calls are costly, so it should be done only once.
+ dev_input_points_.reset(new cuda_blackboard::CudaPointCloud2);
+
+ // Warm-up the memory pool a bit
+ empty_cell_mark_.reset(new device_vector(100000, stream_, mempool_));
+}
+
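+// Example: with gnd_cell_buffer_size = 5, gnd_cell_continual_threshold = 3, no empty cells,
+// closest_gnd_cell_id = 10 and furthest_gnd_cell_id = 6, cell 11 is CONTINUOUS (11 - 6 <= 5),
+// cell 12 is DISCONTINUOUS (12 - 6 > 5 but 12 - 10 < 3), and cell 13 is BREAK (13 - 10 >= 3).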
+__forceinline__ __device__ SegmentationMode checkSegmentationMode(
+ int cell_id, int closest_gnd_cell_id, int furthest_gnd_cell_id, int gnd_cell_buffer_size,
+ int gnd_cell_continual_threshold, int empty_cell_num)
+{
+ if (closest_gnd_cell_id < 0) {
+ return SegmentationMode::UNINITIALIZED;
+ }
+
+ if (cell_id - closest_gnd_cell_id >= gnd_cell_continual_threshold) {
+ return SegmentationMode::BREAK;
+ }
+
+ if (cell_id - furthest_gnd_cell_id - empty_cell_num <= gnd_cell_buffer_size) {
+ return SegmentationMode::CONTINUOUS;
+ }
+
+ return SegmentationMode::DISCONTINUOUS;
+}
+
+__forceinline__ __device__ void segmentUninitializedPoint(
+ ClassifiedPointType & p, const FilterParameters & param)
+{
+ if (p.z > param.detection_range_z_max || p.z < -param.non_ground_height_threshold) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float global_height_th = p.radius * param.global_slope_max_ratio;
+
+ if (p.z > global_height_th && p.z > param.non_ground_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+  float abs_pz = fabsf(p.z);
+
+ if (abs_pz < global_height_th && abs_pz < param.non_ground_height_threshold) {
+ p.type = PointType::GROUND;
+ }
+}
+
+__forceinline__ __device__ void segmentContinuousPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float slope,
+ float prev_cell_gnd_height_avg, float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m;
+ float dz = p.z - prev_cell_gnd_height_avg;
+
+ if (p.z > param.global_slope_max_ratio * p.radius) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ if (dz > param.local_slope_max_ratio * d_radius) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ float estimated_ground_z = prev_cell_gnd_height_avg + slope * param.cell_divider_size_m;
+
+ if (p.z > estimated_ground_z + param.non_ground_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+  if (
+    p.z < estimated_ground_z - param.non_ground_height_threshold ||
+    dz < -param.local_slope_max_ratio * d_radius ||
+    p.z < -param.global_slope_max_ratio * p.radius) {
+    p.type = PointType::OUT_OF_RANGE;
+
+    return;
+  }
+
+ p.type = PointType::GROUND;
+}
+
+__forceinline__ __device__ void segmentDiscontinuousPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg,
+ float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float dz = p.z - prev_cell_gnd_height_avg;
+ // float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m;
+ float d_radius = p.radius - prev_cell_gnd_radius_avg;
+ float global_height_threshold = p.radius * param.global_slope_max_ratio;
+ float local_height_threshold = param.local_slope_max_ratio * d_radius;
+
+ if (p.z > global_height_threshold || dz > local_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ if (dz < -local_height_threshold || p.z < -global_height_threshold) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ p.type = PointType::GROUND;
+}
+
+__forceinline__ __device__ void segmentBreakPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg,
+ float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float dz = p.z - prev_cell_gnd_height_avg;
+ float d_radius = p.radius - prev_cell_gnd_radius_avg;
+ float global_height_threshold = d_radius * param.global_slope_max_ratio;
+
+ p.type = (dz > global_height_threshold)
+ ? PointType::NON_GROUND
+ : ((dz < -global_height_threshold) ? PointType::OUT_OF_RANGE : PointType::GROUND);
+}
+
+__forceinline__ __device__ void segmentCell(
+ const int wid, // Index of the thread in the warp
+ ClassifiedPointType * classified_points, const FilterParameters & param, Cell & cell,
+ float slope, // Slope of the line connect the previous ground cells
+ int start_pid, int end_pid, // Start and end indices of points in the current cell
+ float prev_cell_gnd_radius_avg, float prev_cell_gnd_height_avg, const SegmentationMode & mode)
+{
+ if (start_pid >= end_pid) {
+ return;
+ }
+
+ // For computing the cell statistic
+ float t_gnd_radius_sum = 0;
+ float t_gnd_height_sum = 0;
+ float t_gnd_height_min = FLT_MAX;
+ int t_gnd_point_num = 0;
+ // For recheck
+ float minus_t_gnd_radius_sum = 0;
+ float minus_t_gnd_height_sum = 0;
+  int minus_t_gnd_point_num = 0;
+ ClassifiedPointType last_gnd_point, p;
+  int last_gnd_idx = -1;
+  int local_gnd_point_num = 0;
+
+ // Fit line from the recent ground cells
+ for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) {
+ p = classified_points[j];
+
+ switch (mode) {
+ case (SegmentationMode::UNINITIALIZED): {
+ segmentUninitializedPoint(p, param);
+ break;
+ }
+ case (SegmentationMode::CONTINUOUS): {
+ segmentContinuousPoint(p, param, slope, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ case (SegmentationMode::DISCONTINUOUS): {
+ segmentDiscontinuousPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ case (SegmentationMode::BREAK): {
+ segmentBreakPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ }
+
+ if (p.type == PointType::GROUND) {
+ t_gnd_radius_sum += p.radius;
+ t_gnd_height_sum += p.z;
+ ++t_gnd_point_num;
+ t_gnd_height_min = (t_gnd_height_min > p.z) ? p.z : t_gnd_height_min;
+ }
+
+ classified_points[j] = p;
+ }
+
+ // Wait for all threads in the warp to finish
+ local_gnd_point_num = t_gnd_point_num;
+ __syncwarp();
+
+  // Use a warp shuffle reduction to compute the cell statistics
+  // (ground height sum/min, radius sum, and point count)
+ for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
+ t_gnd_height_sum += __shfl_down_sync(FULL_MASK, t_gnd_height_sum, offset);
+ t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, t_gnd_radius_sum, offset);
+ t_gnd_point_num += __shfl_down_sync(FULL_MASK, t_gnd_point_num, offset);
+
+ float other_height_min = __shfl_down_sync(FULL_MASK, t_gnd_height_min, offset);
+ t_gnd_height_min = min(t_gnd_height_min, other_height_min);
+ }
+
+ // Now broadcast the min height and the number of ground points to all threads
+ float cell_gnd_height_min = __shfl_sync(FULL_MASK, t_gnd_height_min, 0);
+ int cell_gnd_point_num = __shfl_sync(FULL_MASK, t_gnd_point_num, 0);
+ float cell_gnd_radius_sum = __shfl_sync(FULL_MASK, t_gnd_radius_sum, 0);
+
+ if (
+ param.use_recheck_ground_cluster && cell_gnd_point_num > 1 &&
+ cell_gnd_radius_sum / (float)(cell_gnd_point_num) > param.recheck_start_distance) {
+ // Now recheck the points using the height_min
+ for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) {
+ auto p = classified_points[j];
+
+ if (
+ p.type == PointType::GROUND &&
+ p.z > cell_gnd_height_min + param.non_ground_height_threshold && cell_gnd_point_num > 1) {
+ last_gnd_point = p;
+ minus_t_gnd_height_sum += p.z;
+ minus_t_gnd_radius_sum += p.radius;
+ ++minus_t_gnd_point_num;
+ p.type = PointType::NON_GROUND;
+ last_gnd_idx = j;
+ }
+
+ classified_points[j] = p;
+ }
+
+ __syncwarp();
+
+ // Now use ballot sync to see if there are any ground points remaining
+ uint32_t recheck_res = __ballot_sync(FULL_MASK, local_gnd_point_num > minus_t_gnd_point_num);
+ uint32_t backup_res = __ballot_sync(FULL_MASK, local_gnd_point_num > 0);
+
+ // If no ground point remains, we have to keep the last ground point
+ if (recheck_res == 0 && backup_res > 0) {
+      // Get the index of the last (highest) lane that holds a demoted ground point
+      int last_tid = 31 - __clz(backup_res);
+
+      // Only the last ground point held by that thread is kept
+ if (wid == last_tid) {
+ // Keep the last point's stat
+ minus_t_gnd_height_sum -= last_gnd_point.z;
+ minus_t_gnd_radius_sum -= last_gnd_point.radius;
+ --minus_t_gnd_point_num;
+ classified_points[last_gnd_idx] = last_gnd_point;
+ }
+ }
+ __syncwarp();
+
+ // Final update
+ for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
+ minus_t_gnd_height_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_height_sum, offset);
+ minus_t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_radius_sum, offset);
+ minus_t_gnd_point_num += __shfl_down_sync(FULL_MASK, minus_t_gnd_point_num, offset);
+ }
+
+ if (wid == 0) {
+ t_gnd_height_sum -= minus_t_gnd_height_sum;
+ t_gnd_radius_sum -= minus_t_gnd_radius_sum;
+ t_gnd_point_num -= minus_t_gnd_point_num;
+ }
+ }
+
+  // Finally, thread 0 updates the cell statistics
+ if (wid == 0 && t_gnd_point_num > 0) {
+ cell.gnd_radius_avg = t_gnd_radius_sum / (float)(t_gnd_point_num);
+ cell.gnd_height_avg = t_gnd_height_sum / (float)(t_gnd_point_num);
+ cell.gnd_height_min = t_gnd_height_min;
+ cell.num_ground_points = t_gnd_point_num;
+ }
+
+ __syncwarp();
+ cell.num_ground_points = __shfl_sync(FULL_MASK, cell.num_ground_points, 0);
+}
+
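+// One warp scans one sector from the innermost to the outermost cell. A ring buffer in shared
+// memory keeps the most recent ground cells (up to gnd_cell_buffer_size); thread 0 maintains the
+// running sums and estimates the local ground slope with a least-squares fit,
+//   slope = (n * sum(xy) - sum(x) * sum(y)) / (n * sum(xx) - sum(x)^2),
+// falling back to the height/radius ratio of the oldest buffered cell when the denominator is
+// degenerate. The segmentation mode for each cell is chosen from the distance to the previous
+// ground cells and the number of empty cells in between.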
+__global__ void sectorProcessingKernel(
+ Cell * __restrict__ cell_list, ClassifiedPointType * __restrict__ classified_points,
+ int * starting_pid, FilterParameters param, int * empty_cell_mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ // Each warp handles one sector
+ int sector_stride = (blockDim.x * gridDim.x) / WARP_SIZE;
+ int wid = index % WARP_SIZE; // Index of the thread within the warp
+ // Shared memory to backup the cell data
+ extern __shared__ Cell shared_buffer[];
+ Cell * cell_queue = shared_buffer + (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size;
+ int * cell_id_queue =
+ (int *)(shared_buffer + param.gnd_cell_buffer_size * (blockDim.x / WARP_SIZE)) +
+ (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size;
+
+ // Loop on sectors
+ for (int sector_id = index / WARP_SIZE; sector_id < param.num_sectors;
+ sector_id += sector_stride) {
+ // For storing the previous ground cells
+ int closest_gnd_cell_id, furthest_gnd_cell_id, num_latest_gnd_cells = 0;
+ int head = 0, tail = 0;
+ float sum_x, sum_y, sum_xx, sum_xy, slope;
+ float prev_gnd_radius_avg, prev_gnd_height_avg;
+ int empty_cell_num = 0;
+
+ // Initially no ground cell is identified
+ closest_gnd_cell_id = furthest_gnd_cell_id = -1;
+ sum_x = sum_y = sum_xx = sum_xy = slope = 0.0;
+
+ // Loop on the cells in a sector
+ for (int i = 0; i < param.max_num_cells_per_sector; ++i) {
+ if (num_latest_gnd_cells > 0) {
+ furthest_gnd_cell_id = cell_id_queue[head];
+ empty_cell_num = empty_cell_mark[i * param.num_sectors + sector_id] -
+ empty_cell_mark[furthest_gnd_cell_id * param.num_sectors + sector_id];
+ }
+
+ int gid = i * param.num_sectors + sector_id;
+ int sid = starting_pid[gid], eid = starting_pid[gid + 1];
+ Cell cell;
+
+ // Skip empty cells
+ if (sid >= eid) {
+ continue;
+ }
+
+ auto mode = checkSegmentationMode(
+ i, closest_gnd_cell_id, furthest_gnd_cell_id, param.gnd_cell_buffer_size,
+ param.gnd_grid_continual_thresh, empty_cell_num);
+
+ // Classify the points in the cell
+ segmentCell(
+ wid, classified_points, param, cell, slope, sid, eid, prev_gnd_radius_avg,
+ prev_gnd_height_avg, mode);
+
+ // Update the indices of the previous ground cells if the cell contains ground points
+ if (cell.num_ground_points > 0) {
+ if (num_latest_gnd_cells >= param.gnd_cell_buffer_size) {
+ if (wid == 0) {
+            // If the number of previous ground cells reaches the maximum,
+            // remove the cell at the queue head from the running sums
+ Cell head_cell = cell_queue[head];
+ sum_x -= head_cell.gnd_radius_avg;
+ sum_y -= head_cell.gnd_height_avg;
+ sum_xx -= head_cell.gnd_radius_avg * head_cell.gnd_radius_avg;
+ sum_xy -= head_cell.gnd_radius_avg * head_cell.gnd_height_avg;
+ }
+
+ // Now remove the entry at the head of the queue
+ head = (head + 1) % param.gnd_cell_buffer_size;
+ --num_latest_gnd_cells;
+ }
+
+ ++num_latest_gnd_cells;
+
+ if (wid == 0) {
+ // Add the new cell to the queue
+ cell_queue[tail] = cell;
+ cell_id_queue[tail] = i;
+
+ // Update the stats and estimate the local slope
+ sum_x += cell.gnd_radius_avg;
+ sum_y += cell.gnd_height_avg;
+ sum_xx += cell.gnd_radius_avg * cell.gnd_radius_avg;
+ sum_xy += cell.gnd_radius_avg * cell.gnd_height_avg;
+
+ float denom = (num_latest_gnd_cells * sum_xx - sum_x * sum_x);
+
+ if (fabsf(denom) < 1e-6f) {
+ Cell head_cell = cell_queue[head];
+ slope = head_cell.gnd_height_avg / head_cell.gnd_radius_avg;
+ } else {
+ slope = (num_latest_gnd_cells * sum_xy - sum_x * sum_y) / denom;
+          slope = fmaxf(fminf(slope, param.global_slope_max_ratio), -param.global_slope_max_ratio);
+ }
+
+ // Write the cell to the global memory
+ cell_list[gid] = cell;
+ }
+
+ // Wait for the thread 0 to finish its work
+ __syncwarp();
+        // Advance the queue tail past the newly inserted cell
+ tail = (tail + 1) % param.gnd_cell_buffer_size;
+ // Distribute the new slope to all threads in the warp
+ slope = __shfl_sync(FULL_MASK, slope, 0);
+ prev_gnd_radius_avg = __shfl_sync(FULL_MASK, cell.gnd_radius_avg, 0);
+ prev_gnd_height_avg = __shfl_sync(FULL_MASK, cell.gnd_height_avg, 0);
+ closest_gnd_cell_id = i;
+ }
+ }
+ }
+}
+
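+// Mark each cell that received no points (its start index equals the next cell's start index).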
+__global__ void markEmptyCells(int * starting_pid, int cell_num, int * mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < cell_num; i += stride) {
+ mark[i] = (starting_pid[i] >= starting_pid[i + 1]) ? 1 : 0;
+ }
+}
+
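+// For every sector, replace the empty-cell marks with an exclusive prefix sum along the radial
+// direction, so the number of empty cells between two cells can be read as a difference.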
+__global__ void prefixSumEmptyCells(int * mark, int sector_num, int cell_per_sector_num)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < sector_num; i += stride) {
+ int sum = 0;
+
+ for (int j = 0, cell_global_id = i; j < cell_per_sector_num;
+ ++j, cell_global_id += sector_num) {
+ int val_j = mark[cell_global_id];
+
+ mark[cell_global_id] = sum;
+ sum += val_j;
+ }
+ }
+}
+
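+// Bin the input points into (sector, radial cell) bins with a counting-sort style pass:
+// computeCellId assigns each point a cell id and accumulates per-cell counts, an exclusive scan
+// turns the counts into starting indices, and distributePointsToCell scatters the classified
+// points into their cells. Afterwards the empty-cell marks and their per-sector prefix sums are
+// prepared for the sector-scan kernel.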
+void CudaScanGroundSegmentationFilter::sort_points(
+  device_vector<Cell> & cell_list, device_vector<int> & starting_pid,
+  device_vector<ClassifiedPointType> & classified_points)
+{
+ int point_num = dev_input_points_->height * dev_input_points_->width;
+
+ if (point_num == 0 || filter_parameters_.max_num_cells == 0) {
+ return;
+ }
+
+ int cell_num = filter_parameters_.max_num_cells;
+
+ cell_list.resize(cell_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(cell_list.size()), 0, stream_->get(), cellInit, cell_list.data(),
+ (int)(cell_list.size())));
+
+ starting_pid.resize(cell_num + 1);
+
+  device_vector<int> cell_id(point_num, stream_, mempool_);
+  device_vector<ClassifiedPointType> tmp_classified_points(point_num, stream_, mempool_);
+
+ CHECK_CUDA_ERROR(cuda::fill(starting_pid, 0));
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), computeCellId,
+      reinterpret_cast<const PointTypeStruct *>(dev_input_points_->data.get()),
+ (int)(dev_input_points_->height * dev_input_points_->width), filter_parameters_,
+ cell_id.data(), starting_pid.data(), tmp_classified_points.data()));
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(starting_pid));
+
+  device_vector<int> writing_loc(starting_pid, stream_, mempool_);
+
+ classified_points.resize(point_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), distributePointsToCell, tmp_classified_points.data(),
+ classified_points.data(), cell_id.data(), point_num, writing_loc.data()));
+
+  // Compute the number of empty cells between every pair of consecutive non-empty cells
+ empty_cell_mark_->resize(cell_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ cell_num, 0, stream_->get(), markEmptyCells, starting_pid.data(), cell_num,
+ empty_cell_mark_->data()));
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(filter_parameters_.num_sectors), 0, stream_->get(), prefixSumEmptyCells,
+ empty_cell_mark_->data(), (int)filter_parameters_.num_sectors,
+ (int)filter_parameters_.max_num_cells_per_sector));
+}
+
+// ============ Scan per sector to get ground reference and Non-Ground points =============
+void CudaScanGroundSegmentationFilter::scanPerSectorGroundReference(
+  device_vector<Cell> & cell_list, device_vector<int> & starting_pid,
+  device_vector<ClassifiedPointType> & classified_points)
+{
+ const uint32_t num_sectors = filter_parameters_.num_sectors;
+ if (num_sectors == 0) {
+ return;
+ }
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(num_sectors * WARP_SIZE),
+ (BLOCK_SIZE_X / WARP_SIZE) * (sizeof(Cell) + sizeof(int)) *
+ filter_parameters_.gnd_cell_buffer_size,
+ stream_->get(), sectorProcessingKernel, cell_list.data(), classified_points.data(),
+ starting_pid.data(), filter_parameters_, empty_cell_mark_->data()));
+}
+
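+// Predicate functors used to select which classified points are copied to the output clouds.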
+struct NonGroundChecker
+{
+ CUDAH bool operator()(const ClassifiedPointType & p) const
+ {
+ return (p.type != PointType::GROUND);
+ }
+};
+
+struct GroundChecker
+{
+ CUDAH bool operator()(const ClassifiedPointType & p) const
+ {
+ return (p.type == PointType::GROUND);
+ }
+};
+
+template <typename CheckerType>
+__global__ void markingPoints(
+ ClassifiedPointType * classified_points, int point_num, int * mark, CheckerType checker)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = classified_points[i];
+
+ mark[p.origin_index] = (checker(p)) ? 1 : 0;
+ }
+}
+
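+// Stream compaction: a point is copied to the slot given by the exclusive-scan output whenever
+// the scan value increases at its index, i.e. whenever the point was marked for extraction.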
+__global__ void extract(
+ const PointTypeStruct * __restrict__ dev_input_points, int point_num, int * writing_loc,
+ PointTypeStruct * __restrict__ dev_output_points)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int wloc = writing_loc[i];
+
+ if (wloc < writing_loc[i + 1]) {
+ dev_output_points[wloc] = dev_input_points[i];
+ }
+ }
+}
+
+// ============= Extract points by classification =============
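+// Mark the points selected by CheckerType, exclusive-scan the marks to obtain output slots, and
+// compact the matching points of the (outlier-filtered) input cloud into the output cloud.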
+template <typename CheckerType>
+void CudaScanGroundSegmentationFilter::extractPoints(
+  device_vector<ClassifiedPointType> & classified_points,
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output)
+{
+ int point_num = dev_input_points_->height * dev_input_points_->width;
+  device_vector<int> point_mark(point_num + 1, stream_, mempool_);
+
+  // Mark the points selected by the checker
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+      point_num, 0, stream_->get(), markingPoints<CheckerType>, classified_points.data(),
+      point_num, point_mark.data(), CheckerType()));
+
+ // Exclusive scan
+  device_vector<int> writing_loc(stream_, mempool_);
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(point_mark, writing_loc));
+
+ // Reserve the output
+ int output_size = writing_loc[point_num];
+
+ if (output_size <= 0) {
+ return;
+ }
+
+ cuda::copyPointCloud2Metadata(output, input);
+
+  output.data = cuda_blackboard::make_unique<std::uint8_t[]>(output_size * output.point_step);
+ output.height = 1;
+ output.width = output_size;
+
+ // Get the points
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), extract,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+      (int)(input.height * input.width), writing_loc.data(),
+      reinterpret_cast<PointTypeStruct *>(output.data.get())));
+}
+
+void CudaScanGroundSegmentationFilter::classifyPointCloud(
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & ground,
+ cuda_blackboard::CudaPointCloud2 & non_ground)
+{
+  device_vector<Cell> cell_list(stream_, mempool_);
+  device_vector<int> starting_pid(stream_, mempool_);
+  device_vector<ClassifiedPointType> classified_points(stream_, mempool_);
+
+ removeOutliers(input);
+ sort_points(cell_list, starting_pid, classified_points);
+ scanPerSectorGroundReference(cell_list, starting_pid, classified_points);
+
+  // Extract non-ground points
+  extractPoints<NonGroundChecker>(classified_points, *dev_input_points_, non_ground);
+
+  // Extract ground points
+  extractPoints<GroundChecker>(classified_points, *dev_input_points_, ground);
+}
+
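+// Crop-box style outlier marking: a point is kept only if it lies inside the configured
+// [min, max] bounds on x, y, and z. The max_radius argument is currently unused here.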
+__global__ void markNonOutliers(
+ const PointTypeStruct * __restrict__ cloud, int point_num, FilterParameters param,
+ float max_radius, int * __restrict__ mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = cloud[i];
+ int val = (p.x < param.min_x || p.x > param.max_x || p.y < param.min_y || p.y > param.max_y ||
+ p.z < param.min_z || p.z > param.max_z)
+ ? 0
+ : 1;
+ mark[i] = val;
+ }
+}
+
+__global__ void getValidPoints(
+ const PointTypeStruct * __restrict__ cloud, int point_num, int * __restrict__ writing_loc,
+ PointTypeStruct * __restrict__ out_cloud)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int wloc = writing_loc[i];
+
+ if (wloc < writing_loc[i + 1]) {
+ out_cloud[wloc] = cloud[i];
+ }
+ }
+}
+
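+// Remove out-of-bounds points before segmentation: mark the in-bounds points, exclusive-scan the
+// marks, allocate dev_input_points_ for the surviving points, and compact them with
+// getValidPoints. The result becomes the working cloud for the rest of the pipeline.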
+void CudaScanGroundSegmentationFilter::removeOutliers(
+ const cuda_blackboard::CudaPointCloud2 & input)
+{
+ float max_radius = filter_parameters_.max_radius;
+ int point_num = input.width * input.height;
+
+ if (point_num <= 0) {
+ return;
+ }
+
+  device_vector<int> mark(point_num + 1, stream_, mempool_);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), markNonOutliers,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+ (int)(input.height * input.width), filter_parameters_, max_radius, mark.data()));
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(mark));
+
+ int remain_size = mark[point_num];
+
+ cuda::copyPointCloud2Metadata(*dev_input_points_, input);
+
+ dev_input_points_->height = 1;
+ dev_input_points_->width = remain_size;
+  dev_input_points_->data =
+    cuda_blackboard::make_unique<std::uint8_t[]>(remain_size * dev_input_points_->point_step);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), getValidPoints,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+      (int)(input.height * input.width), mark.data(),
+      reinterpret_cast<PointTypeStruct *>(dev_input_points_->data.get())));
+}
+
+} // namespace autoware::cuda_ground_segmentation
diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
new file mode 100644
index 0000000000000..e1f86ae6665e5
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
@@ -0,0 +1,158 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp"
+
+#include "autoware/pointcloud_preprocessor/utility/memory.hpp"
+
+#include <memory>
+
+namespace autoware::cuda_ground_segmentation
+{
+
+using autoware_utils::deg2rad;
+CudaScanGroundSegmentationFilterNode::CudaScanGroundSegmentationFilterNode(
+ const rclcpp::NodeOptions & options)
+: Node("cuda_scan_ground_segmentation_filter_node", options)
+{
+  // Declare processing time debug tools
+ {
+ using autoware_utils::DebugPublisher;
+ using autoware_utils::StopWatch;
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ptr_ = std::make_unique<DebugPublisher>(this, "cuda_scan_ground_filter");
+ stop_watch_ptr_->tic("cyclic_time");
+ stop_watch_ptr_->tic("processing_time");
+ }
+ // Declare parameters
+ FilterParameters filter_parameters;
+ // global parameters
+  filter_parameters.max_x = static_cast<float>(declare_parameter<double>("max_x"));
+  filter_parameters.min_x = static_cast<float>(declare_parameter<double>("min_x"));
+  filter_parameters.max_y = static_cast<float>(declare_parameter<double>("max_y"));
+  filter_parameters.min_y = static_cast<float>(declare_parameter<double>("min_y"));
+  filter_parameters.max_z = static_cast<float>(declare_parameter<double>("max_z"));
+  filter_parameters.min_z = static_cast<float>(declare_parameter<double>("min_z"));
+  filter_parameters.center_x =
+    static_cast<float>(declare_parameter<double>("center_pcl_shift"));  // default 0.0
+
+ filter_parameters.max_radius = std::max(
+ std::max(
+ std::hypot(filter_parameters.max_x, filter_parameters.max_y),
+ std::hypot(filter_parameters.min_x, filter_parameters.min_y)),
+ std::max(
+ std::hypot(filter_parameters.max_x, filter_parameters.min_y),
+ std::hypot(filter_parameters.min_x, filter_parameters.max_y)));
+
+ // common parameters
+  filter_parameters.sector_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("sector_angle_deg")));
+  filter_parameters.inv_sector_angle_rad = 1.0f / filter_parameters.sector_angle_rad;
+  filter_parameters.num_sectors =
+    static_cast<int>(std::ceil(2.0 * M_PI * filter_parameters.inv_sector_angle_rad));
+
+ // common thresholds
+  filter_parameters.global_slope_max_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("global_slope_max_angle_deg")));
+  filter_parameters.local_slope_max_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("local_slope_max_angle_deg")));
+ filter_parameters.global_slope_max_ratio = std::tan(filter_parameters.global_slope_max_angle_rad);
+ filter_parameters.local_slope_max_ratio = std::tan(filter_parameters.local_slope_max_angle_rad);
+
+ // cell mode parameters
+  filter_parameters.use_recheck_ground_cluster =
+    static_cast<bool>(declare_parameter<bool>("use_recheck_ground_cluster"));
+  filter_parameters.recheck_start_distance =
+    static_cast<float>(declare_parameter<double>("recheck_start_distance"));
+  filter_parameters.detection_range_z_max =
+    static_cast<float>(declare_parameter<double>("detection_range_z_max"));
+  filter_parameters.non_ground_height_threshold =
+    static_cast<float>(declare_parameter<double>("non_ground_height_threshold"));
+
+ // cell parameters
+  filter_parameters.cell_divider_size_m =
+    static_cast<float>(declare_parameter<double>("grid_size_m"));
+  filter_parameters.max_num_cells_per_sector =
+    static_cast<int>(filter_parameters.max_radius / filter_parameters.cell_divider_size_m);
+  filter_parameters.max_num_cells = static_cast<int>(
+    filter_parameters.max_num_cells_per_sector * filter_parameters.num_sectors);
+  filter_parameters.gnd_cell_buffer_size =
+    static_cast<int>(declare_parameter<int>("gnd_cell_buffer_size"));
+
+  int64_t max_mem_pool_size_in_byte =
+    declare_parameter("max_mem_pool_size_in_byte", static_cast<int64_t>(1e9));  // 1 GB
+
+  // Initialize CUDA blackboard subscriber
+  sub_ =
+    std::make_shared<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/input/pointcloud",
+      std::bind(
+        &CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback, this,
+        std::placeholders::_1));
+
+  // Initialize CUDA blackboard publishers
+  pub_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/pointcloud");
+
+  pub_gnd_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/ground_pointcloud");
+
+  cuda_ground_segmentation_filter_ = std::make_unique<CudaScanGroundSegmentationFilter>(
+    filter_parameters, max_mem_pool_size_in_byte);
+}
+
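+// Callback for each incoming CUDA point cloud: run the ground segmentation on the GPU and
+// publish the non-ground cloud on ~/output/pointcloud and the ground cloud on
+// ~/output/ground_pointcloud, together with processing-time debug values.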
+void CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback(
+ const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg)
+{
+ // start time measurement
+ if (stop_watch_ptr_) {
+ stop_watch_ptr_->tic("processing_time");
+ }
+
+  auto non_ground_unique = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+  auto ground_unique = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+
+  // Run the GPU segmentation directly into the output clouds
+  cuda_ground_segmentation_filter_->classifyPointCloud(*msg, *ground_unique, *non_ground_unique);
+
+  // Publish the segmented clouds
+ pub_->publish(std::move(non_ground_unique));
+ pub_gnd_->publish(std::move(ground_unique));
+
+ // end time measurement
+ if (debug_publisher_ptr_ && stop_watch_ptr_) {
+ stop_watch_ptr_->toc("processing_time");
+ stop_watch_ptr_->toc("cyclic_time");
+ const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true);
+ const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+ }
+}
+
+} // namespace autoware::cuda_ground_segmentation
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(
+ autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode)