autowarefoundation · anhnv3991 · Aug 24, 2025 · Aug 24, 2025 · Aug 24, 2025 · Aug 25, 2025
diff --git a/.cspell.json b/.cspell.json
@@ -4,5 +4,5 @@
     "planning/behavior_velocity_planner/autoware_behavior_velocity_intersection_module/scripts/**"
   ],
   "ignoreRegExpList": [],
-  "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV"]
+  "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV", "CUDAH"]
 }
@@ -48,6 +48,9 @@
             context
         )
         self.use_time_series_filter = LaunchConfiguration("use_time_series_filter").perform(context)
+        self.use_cuda_ground_segmentation = (
+            LaunchConfiguration("use_cuda_ground_segmentation").perform(context).lower() == "true"
+        )
         # check if self.use_single_frame_filter is bool
         if isinstance(self.use_single_frame_filter, str):
             self.use_single_frame_filter = self.use_single_frame_filter.lower() == "true"
@@ -176,7 +179,7 @@
                 name="short_height_obstacle_detection_area_filter",
                 namespace="plane_fitting",
                 remappings=[
-                    ("input", "concatenated/pointcloud"),
+                    ("input", "concat/pointcloud"),
                     ("output", "detection_area/pointcloud"),
                 ],
                 parameters=[
@@ -535,6 +538,46 @@
     pipeline = GroundSegmentationPipeline(context)
 
     components = []
+    if pipeline.use_cuda_ground_segmentation:
+        ground_segmentation_node_param = ParameterFile(
+            param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(
+                context
+            ),
+            allow_substs=True,
+        )
+        components.append(
+            ComposableNode(
+                package="autoware_ground_segmentation_cuda",
+                plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode",
+                name="cuda_scan_ground_segmentation_filter",
+                remappings=[
+                    ("~/input/pointcloud", "/sensing/lidar/concatenated/pointcloud"),
+                    ("~/input/pointcloud/cuda", "/sensing/lidar/concatenated/pointcloud/cuda"),
+                    ("~/output/pointcloud", "/perception/obstacle_segmentation/pointcloud"),
+                    (
+                        "~/output/pointcloud/cuda",
+                        "/perception/obstacle_segmentation/pointcloud/cuda",
+                    ),
+                    (
+                        "~/output/ground_pointcloud",
+                        "/perception/obstacle_segmentation/ground_pointcloud",
+                    ),
+                    (
+                        "~/output/ground_pointcloud/cuda",
+                        "/perception/obstacle_segmentation/ground_pointcloud/cuda",
+                    ),
+                ],
+                parameters=[ground_segmentation_node_param],
+                extra_arguments=[],
+            )
+        )
+        return [
+            LoadComposableNodes(
+                composable_node_descriptions=components,
+                target_container=LaunchConfiguration("pointcloud_container_name"),
+            )
+        ]
+
     components.extend(
         pipeline.create_single_frame_obstacle_segmentation_components(
             input_topic=LaunchConfiguration("input/pointcloud"),
@@ -594,13 +637,21 @@
     add_launch_arg("use_intra_process", "True")
     add_launch_arg("pointcloud_container_name", "pointcloud_container")
     add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud")
+    add_launch_arg("use_cuda_ground_segmentation", "False")
     add_launch_arg(
         "ogm_outlier_filter_param_path",
         [
             FindPackageShare("autoware_launch"),
             "/config/perception/obstacle_segmentation/occupancy_grid_based_outlier_filter/occupancy_grid_map_outlier_filter.param.yaml",
         ],
     )
+    add_launch_arg(
+        "cuda_ground_segmentation_node_param_path",
+        [
+            FindPackageShare("autoware_ground_segmentation_cuda"),
+            "/config/cuda_scan_ground_segmentation_filter.param.yaml",
+        ],
+    )
 
     set_container_executable = SetLaunchConfiguration(
         "container_executable",

@@ -149,6 +149,7 @@
   <arg name="use_perception_analytics_publisher" default="true" description="use perception analytics publisher"/>
   <arg name="use_obstacle_segmentation_single_frame_filter" description="use single frame filter at the ground segmentation"/>
   <arg name="use_obstacle_segmentation_time_series_filter" description="use time series filter at the ground segmentation"/>
+  <arg name="use_cuda_ground_segmentation" default="false" description="use cuda ground segmentation filter at the ground segmentation"/>
 
   <!-- traffic light recognition options to switch launch function/module -->
   <arg name="use_traffic_light_recognition"/>
@@ -225,6 +226,7 @@
         <arg name="obstacle_segmentation_ground_segmentation_param_path" value="$(var obstacle_segmentation_ground_segmentation_param_path)"/>
         <arg name="use_single_frame_filter" value="$(var use_obstacle_segmentation_single_frame_filter)"/>
         <arg name="use_time_series_filter" value="$(var use_obstacle_segmentation_time_series_filter)"/>
+        <arg name="use_cuda_ground_segmentation" value="$(var use_cuda_ground_segmentation)"/>
       </include>
     </group>
 

diff --git a/perception/autoware_ground_segmentation_cuda/CMakeLists.txt b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt
@@ -0,0 +1,131 @@
+cmake_minimum_required(VERSION 3.8)
+project(autoware_ground_segmentation_cuda)
+
+find_package(ament_cmake_auto REQUIRED)
+# CUDA and agnocastlib are looked up without REQUIRED; their absence is handled below.
+find_package(CUDA)
+find_package(agnocastlib)
+
+# Skip the whole package when CUDA is unavailable.
+# The variable name is passed to if() directly (instead of `${CUDA_FOUND}`) so that an
+# unset or empty CUDA_FOUND is treated as false rather than yielding a malformed condition.
+if(NOT CUDA_FOUND)
+  message(WARNING "cuda was not found, so the autoware_ground_segmentation_cuda package will not be built.")
+  return()
+endif()
+
+# Debug builds: -g requests host debug information, -G device debug information.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  # CMAKE_CUDA_FLAGS is a space-separated string, so it is extended with string(APPEND);
+  # going through set() with an unquoted expansion would insert a ';' between the flags.
+  string(APPEND CMAKE_CUDA_FLAGS " -g -G")
+  # CUDA_NVCC_FLAGS is a list; the flags are appended as one element, as before.
+  list(APPEND CUDA_NVCC_FLAGS "-g -G")
+endif()
+
+# Fail early when agnocast support was requested but the library could not be found.
+if(USE_AGNOCAST AND NOT agnocastlib_FOUND)
+  message(FATAL_ERROR "agnocastlib is required when USE_AGNOCAST is enabled")
+endif()
+
+# ament_cmake_auto helper that locates the package's build dependencies.
+ament_auto_find_build_dependencies()
+
+# Use C++17 unless a standard has already been selected.
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Extra warnings for GCC and Clang builds.
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  add_compile_options(
+    -Wall
+    -Wextra
+    -Wpedantic
+    -Wunused-function
+  )
+endif()
+
+# Expose the agnocast switch to the sources as a preprocessor definition.
+if(USE_AGNOCAST)
+  add_definitions(-DUSE_AGNOCAST_ENABLED)
+endif()
+
+# Linters are set up only for test builds; uncrustify is excluded from the automatic set.
+if(BUILD_TESTING)
+  list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+# Project headers, followed by the CUDA include directories listed after the SYSTEM keyword.
+include_directories(
+  include
+  SYSTEM
+  ${CUDA_INCLUDE_DIRS}
+)
+
+# cSpell: ignore expt gencode
+# Compiler flags for the CUDA sources: relaxed constexpr, suppression of
+# diagnostic 20012, and one -gencode entry per targeted GPU architecture.
+list(APPEND CUDA_NVCC_FLAGS
+  "--expt-relaxed-constexpr -diag-suppress 20012"
+  "-gencode arch=compute_75,code=sm_75"
+  "-gencode arch=compute_86,code=sm_86"
+  "-gencode arch=compute_87,code=sm_87"
+  "-gencode arch=compute_89,code=sm_89"
+  "-gencode arch=compute_89,code=compute_89"
+)
+
+#################### cuda_ground_segmentation ##################
+# Shared library holding the CUDA (.cu) implementation of the filter.
+cuda_add_library(
+  cuda_ground_segmentation_lib SHARED
+  src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
+)
+
+# NOTE(review): the keyword-less signature is kept on purpose. cuda_add_library()
+# presumably links the CUDA runtime with the plain signature as well, and CMake does
+# not allow mixing plain and keyword signatures on one target; confirm before
+# adding PRIVATE/PUBLIC here.
+target_link_libraries(
+  cuda_ground_segmentation_lib
+  ${autoware_pointcloud_preprocessor_TARGETS}
+  ${autoware_cuda_pointcloud_preprocessor_TARGETS}
+)
+
+# Dependency headers are added as SYSTEM PRIVATE include directories of this target.
+target_include_directories(
+  cuda_ground_segmentation_lib SYSTEM PRIVATE
+  ${autoware_pointcloud_preprocessor_INCLUDE_DIRS}
+  ${autoware_cuda_pointcloud_preprocessor_INCLUDE_DIRS}
+  ${autoware_point_types_INCLUDE_DIRS}
+  ${cuda_blackboard_INCLUDE_DIRS}
+  ${diagnostic_msgs_INCLUDE_DIRS}
+  ${geometry_msgs_INCLUDE_DIRS}
+  ${rclcpp_INCLUDE_DIRS}
+  ${rclcpp_components_INCLUDE_DIRS}
+  ${rcl_interfaces_INCLUDE_DIRS}
+  ${sensor_msgs_INCLUDE_DIRS}
+  ${tf2_INCLUDE_DIRS}
+  ${tf2_msgs_INCLUDE_DIRS}
+  ${autoware_cuda_utils_INCLUDE_DIRS}
+)
+
+# Optional agnocast support: extra headers and the agnocast library itself.
+if(USE_AGNOCAST)
+  target_include_directories(
+    cuda_ground_segmentation_lib SYSTEM PRIVATE
+    ${autoware_agnocast_wrapper_INCLUDE_DIRS}
+    ${agnocastlib_INCLUDE_DIRS}
+  )
+  target_link_libraries(
+    cuda_ground_segmentation_lib
+    ${agnocastlib_LIBRARIES}
+  )
+endif()
+
+
+# Targets
+# Shared library built from the filter node source; it links the CUDA library above.
+ament_auto_add_library(
+  cuda_ground_segmentation SHARED
+  src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
+)
+
+target_link_libraries(
+  cuda_ground_segmentation
+  ${CUDA_LIBRARIES}
+  ${diagnostic_msgs_LIBRARIES}
+  cuda_ground_segmentation_lib
+)
+
+#=========== ScanGround Segmentation filter ========
+# Register the node plugin class and name the executable associated with it.
+rclcpp_components_register_node(
+  cuda_ground_segmentation
+  PLUGIN "autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode"
+  EXECUTABLE cuda_scan_ground_segmentation_filter_node
+)
+
+################################################################################
+# Install
+# Launch files and parameter files go to the package share directory.
+install(
+  DIRECTORY launch config
+  DESTINATION share/${PROJECT_NAME}
+)
+
+# The CUDA library is installed explicitly; presumably ament_auto_package() only
+# handles targets created through the ament_auto_* helpers (confirm).
+install(
+  TARGETS cuda_ground_segmentation_lib
+  LIBRARY DESTINATION lib
+)
+
+ament_auto_package()
+
+# Set ROS_DISTRO macros
+# Expose the ROS distribution, read from the environment at configure time, as a
+# ROS_DISTRO_<NAME> compile definition. Distributions not listed below add nothing.
+# Every expansion is quoted so that an unset or empty ROS_DISTRO still yields a
+# well-formed condition and the value is always compared as a literal string.
+# NOTE(review): this runs after the targets and ament_auto_package(); confirm that the
+# definitions reach every source that needs them, including the .cu file built
+# through cuda_add_library().
+set(ROS_DISTRO "$ENV{ROS_DISTRO}")
+if("${ROS_DISTRO}" STREQUAL "rolling")
+  add_compile_definitions(ROS_DISTRO_ROLLING)
+elseif("${ROS_DISTRO}" STREQUAL "foxy")
+  add_compile_definitions(ROS_DISTRO_FOXY)
+elseif("${ROS_DISTRO}" STREQUAL "galactic")
+  add_compile_definitions(ROS_DISTRO_GALACTIC)
+elseif("${ROS_DISTRO}" STREQUAL "humble")
+  add_compile_definitions(ROS_DISTRO_HUMBLE)
+endif()
diff --git a/perception/autoware_ground_segmentation_cuda/README.md b/perception/autoware_ground_segmentation_cuda/README.md
@@ -0,0 +1,19 @@
+# autoware_ground_segmentation_cuda
+
+## Purpose
+
+The `autoware_ground_segmentation` algorithms have been thoroughly tested with Autoware. However, because processing a large point cloud incurs high latency and computational cost, the input point cloud range has been limited by the `crop_box_filter` based on the ego vehicle's `base_link`. This can cause objects to be lost unintentionally, especially just before a sloped road.
+
+![ground_segmentation_pipeline issue](./docs/image/ground_segmentation_issue.png)
+
+Recently, GPU and CUDA-supported libraries such as [cuda_blackboard](https://github.com/autowarefoundation/cuda_blackboard/blob/1837689df2891f6223f07c178c21aed252566ede/README.md) and accelerated versions of [`autoware_pointcloud_preprocessor`](../../sensing/autoware_cuda_pointcloud_preprocessor/README.md) have been implemented. These can be leveraged to improve the performance of ground segmentation filter algorithms using CUDA/GPU.
+
+This package reimplements the current `scan_ground_filter` of the `autoware_ground_segmentation` package to reduce latency and avoid the bottleneck caused by processing large point clouds.
+
+## Inner-workings / Algorithm
+
+The detailed algorithm is available in [scan-ground-filter.md](../autoware_ground_segmentation/docs/scan-ground-filter.md).
+
+## Parameters
+
+{{ json_to_markdown("perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json") }}
diff --git a/.../autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml b/.../autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml
@@ -0,0 +1,19 @@
+/**:
+  ros__parameters:
+    global_slope_max_angle_deg: 10.0
+    local_slope_max_angle_deg: 25.0
+    non_ground_height_threshold: 0.20
+    grid_size_m: 0.5
+    detection_range_z_max: 3.2
+    use_recheck_ground_cluster: true
+    recheck_start_distance: 20.0
+    use_lowest_point: true
+    center_pcl_shift: 0.0
+    sector_angle_deg: 1.0
+    gnd_cell_buffer_size: 5
+    min_x: -100.0
+    max_x: 150.0
+    min_y: -70.0
+    max_y: 70.0
+    max_z: 2.5
+    min_z: -2.5
diff --git a/...tion/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png b/...tion/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png
diff --git a/...e_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp b/...e_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp
@@ -0,0 +1,38 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+
+// Common CUDA macros for the scan ground segmentation code.
+// Each macro is wrapped in #ifndef, so a definition made before this header is
+// included takes precedence over the default given here.
+
+#include <autoware/cuda_utils/cuda_check_error.hpp>
+
+#include <cuda_runtime.h>
+
+// Function qualifier shorthand: force-inlined and compiled for both host and device.
+#ifndef CUDAH
+#define CUDAH __forceinline__ __host__ __device__
+#endif
+
+// Default value 256; presumably the x-dimension (threads) of the CUDA thread blocks
+// launched by this package - TODO confirm against the kernel launches.
+#ifndef BLOCK_SIZE_X
+#define BLOCK_SIZE_X (256)
+#endif
+
+// Default value 32; presumably the number of threads per warp - TODO confirm usage.
+#ifndef WARP_SIZE
+#define WARP_SIZE (32)
+#endif
+
+// All 32 bits set; presumably the participation mask passed to warp-level
+// primitives - TODO confirm usage.
+#ifndef FULL_MASK
+#define FULL_MASK (0xFFFFFFFF)
+#endif
+
+#endif  // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_