diff --git a/.cspell.json b/.cspell.json
index 76e570034d1f8..4d3eb28340131 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -4,5 +4,5 @@
"planning/behavior_velocity_planner/autoware_behavior_velocity_intersection_module/scripts/**"
],
"ignoreRegExpList": [],
- "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV"]
+ "words": ["dltype", "tvmgen", "fromarray", "soblin", "brkay54", "libtensorrt", "TRTBEV", "CUDAH"]
}
diff --git a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
index 52a4fdd213ab6..7508214782cc3 100644
--- a/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
+++ b/launch/tier4_perception_launch/launch/obstacle_segmentation/ground_segmentation/ground_segmentation.launch.py
@@ -48,6 +48,9 @@ def __init__(self, context):
context
)
self.use_time_series_filter = LaunchConfiguration("use_time_series_filter").perform(context)
+ self.use_cuda_ground_segmentation = (
+ LaunchConfiguration("use_cuda_ground_segmentation").perform(context).lower() == "true"
+ )
# check if self.use_single_frame_filter is bool
if isinstance(self.use_single_frame_filter, str):
self.use_single_frame_filter = self.use_single_frame_filter.lower() == "true"
@@ -176,7 +179,7 @@ def create_ransac_pipeline(self):
name="short_height_obstacle_detection_area_filter",
namespace="plane_fitting",
remappings=[
- ("input", "concatenated/pointcloud"),
+ ("input", "concat/pointcloud"),
("output", "detection_area/pointcloud"),
],
parameters=[
@@ -535,6 +538,46 @@ def launch_setup(context, *args, **kwargs):
pipeline = GroundSegmentationPipeline(context)
components = []
+ if pipeline.use_cuda_ground_segmentation:
+ ground_segmentation_node_param = ParameterFile(
+ param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(
+ context
+ ),
+ allow_substs=True,
+ )
+ components.append(
+ ComposableNode(
+ package="autoware_ground_segmentation_cuda",
+ plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode",
+ name="cuda_scan_ground_segmentation_filter",
+ remappings=[
+ ("~/input/pointcloud", "/sensing/lidar/concatenated/pointcloud"),
+ ("~/input/pointcloud/cuda", "/sensing/lidar/concatenated/pointcloud/cuda"),
+ ("~/output/pointcloud", "/perception/obstacle_segmentation/pointcloud"),
+ (
+ "~/output/pointcloud/cuda",
+ "/perception/obstacle_segmentation/pointcloud/cuda",
+ ),
+ (
+ "~/output/ground_pointcloud",
+ "/perception/obstacle_segmentation/ground_pointcloud",
+ ),
+ (
+ "~/output/ground_pointcloud/cuda",
+ "/perception/obstacle_segmentation/ground_pointcloud/cuda",
+ ),
+ ],
+ parameters=[ground_segmentation_node_param],
+ extra_arguments=[],
+ )
+ )
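+        # When CUDA ground segmentation is enabled, load only this node and skip the CPU pipeline below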
+ return [
+ LoadComposableNodes(
+ composable_node_descriptions=components,
+ target_container=LaunchConfiguration("pointcloud_container_name"),
+ )
+ ]
+
components.extend(
pipeline.create_single_frame_obstacle_segmentation_components(
input_topic=LaunchConfiguration("input/pointcloud"),
@@ -594,6 +637,7 @@ def add_launch_arg(name: str, default_value=None):
add_launch_arg("use_intra_process", "True")
add_launch_arg("pointcloud_container_name", "pointcloud_container")
add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud")
+ add_launch_arg("use_cuda_ground_segmentation", "False")
add_launch_arg(
"ogm_outlier_filter_param_path",
[
@@ -601,6 +645,13 @@ def add_launch_arg(name: str, default_value=None):
"/config/perception/obstacle_segmentation/occupancy_grid_based_outlier_filter/occupancy_grid_map_outlier_filter.param.yaml",
],
)
+ add_launch_arg(
+ "cuda_ground_segmentation_node_param_path",
+ [
+ FindPackageShare("autoware_ground_segmentation_cuda"),
+ "/config/cuda_scan_ground_segmentation_filter.param.yaml",
+ ],
+ )
set_container_executable = SetLaunchConfiguration(
"container_executable",
diff --git a/launch/tier4_perception_launch/launch/perception.launch.xml b/launch/tier4_perception_launch/launch/perception.launch.xml
index 52b00fc22bedc..cde5a75d46108 100644
--- a/launch/tier4_perception_launch/launch/perception.launch.xml
+++ b/launch/tier4_perception_launch/launch/perception.launch.xml
@@ -149,6 +149,7 @@
+
@@ -225,6 +226,7 @@
+
diff --git a/perception/autoware_ground_segmentation_cuda/CMakeLists.txt b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..68df65699c4d3
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/CMakeLists.txt
@@ -0,0 +1,131 @@
+cmake_minimum_required(VERSION 3.8)
+project(autoware_ground_segmentation_cuda)
+
+find_package(ament_cmake_auto REQUIRED)
+find_package(CUDA)
+find_package(agnocastlib)
+
+if(NOT CUDA_FOUND)
+  message(WARNING "CUDA was not found, so the autoware_ground_segmentation_cuda package will not be built.")
+ return()
+elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
+ set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -G")
+endif()
+
+if(USE_AGNOCAST AND NOT agnocastlib_FOUND)
+ message(FATAL_ERROR "agnocastlib is required when USE_AGNOCAST is enabled")
+endif()
+
+ament_auto_find_build_dependencies()
+
+# Default to C++17
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ add_compile_options(-Wall -Wextra -Wpedantic -Wunused-function)
+endif()
+
+if(USE_AGNOCAST)
+ add_definitions(-DUSE_AGNOCAST_ENABLED)
+endif()
+
+if(BUILD_TESTING)
+ list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify)
+ find_package(ament_lint_auto REQUIRED)
+ ament_lint_auto_find_test_dependencies()
+endif()
+
+include_directories(
+ include
+ SYSTEM
+ ${CUDA_INCLUDE_DIRS}
+)
+
+# cSpell: ignore expt gencode
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 20012")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_75,code=sm_75")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_86,code=sm_86")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_87,code=sm_87")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=sm_89")
+list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=compute_89")
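+# The -gencode entries above target Turing (sm_75), Ampere (sm_86), Jetson Orin (sm_87),
+# and Ada Lovelace (sm_89) GPUs.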
+
+#################### cuda_ground_segmentation ##################
+cuda_add_library(cuda_ground_segmentation_lib SHARED
+ src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
+)
+
+target_link_libraries(cuda_ground_segmentation_lib
+ ${autoware_pointcloud_preprocessor_TARGETS}
+ ${autoware_cuda_pointcloud_preprocessor_TARGETS}
+)
+
+target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE
+ ${autoware_pointcloud_preprocessor_INCLUDE_DIRS}
+ ${autoware_cuda_pointcloud_preprocessor_INCLUDE_DIRS}
+ ${autoware_point_types_INCLUDE_DIRS}
+ ${cuda_blackboard_INCLUDE_DIRS}
+ ${diagnostic_msgs_INCLUDE_DIRS}
+ ${geometry_msgs_INCLUDE_DIRS}
+ ${rclcpp_INCLUDE_DIRS}
+ ${rclcpp_components_INCLUDE_DIRS}
+ ${rcl_interfaces_INCLUDE_DIRS}
+ ${sensor_msgs_INCLUDE_DIRS}
+ ${tf2_INCLUDE_DIRS}
+ ${tf2_msgs_INCLUDE_DIRS}
+ ${autoware_cuda_utils_INCLUDE_DIRS}
+)
+
+if(USE_AGNOCAST)
+ target_include_directories(cuda_ground_segmentation_lib SYSTEM PRIVATE
+ ${autoware_agnocast_wrapper_INCLUDE_DIRS}
+ ${agnocastlib_INCLUDE_DIRS}
+ )
+ target_link_libraries(cuda_ground_segmentation_lib
+ ${agnocastlib_LIBRARIES}
+ )
+endif()
+
+
+# Targets
+ament_auto_add_library(cuda_ground_segmentation SHARED
+ src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
+)
+
+target_link_libraries(cuda_ground_segmentation
+ ${CUDA_LIBRARIES}
+ ${diagnostic_msgs_LIBRARIES}
+ cuda_ground_segmentation_lib
+)
+
+#=========== ScanGround Segmentation filter ========
+rclcpp_components_register_node(cuda_ground_segmentation
+ PLUGIN "autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode"
+ EXECUTABLE cuda_scan_ground_segmentation_filter_node)
+
+################################################################################
+# Install
+install(DIRECTORY launch config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(
+ TARGETS cuda_ground_segmentation_lib
+ LIBRARY DESTINATION lib
+)
+
+ament_auto_package()
+
+# Set ROS_DISTRO macros
+set(ROS_DISTRO $ENV{ROS_DISTRO})
+if(${ROS_DISTRO} STREQUAL "rolling")
+ add_compile_definitions(ROS_DISTRO_ROLLING)
+elseif(${ROS_DISTRO} STREQUAL "foxy")
+ add_compile_definitions(ROS_DISTRO_FOXY)
+elseif(${ROS_DISTRO} STREQUAL "galactic")
+ add_compile_definitions(ROS_DISTRO_GALACTIC)
+elseif(${ROS_DISTRO} STREQUAL "humble")
+ add_compile_definitions(ROS_DISTRO_HUMBLE)
+endif()
diff --git a/perception/autoware_ground_segmentation_cuda/README.md b/perception/autoware_ground_segmentation_cuda/README.md
new file mode 100644
index 0000000000000..33218e12fa062
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/README.md
@@ -0,0 +1,19 @@
+# autoware_ground_segmentation_cuda
+
+## Purpose
+
+The `autoware_ground_segmentation` algorithms have been thoroughly tested with Autoware. However, due to the latency and high computational cost of processing large point clouds, the input point cloud range has been limited by the `crop_box_filter` relative to the ego-vehicle's `base_link`. This can cause unwanted object loss, especially in front of a sloped road.
+
+
+
+Recently, GPU and CUDA-supported libraries such as [cuda_blackboard](https://github.com/autowarefoundation/cuda_blackboard/blob/1837689df2891f6223f07c178c21aed252566ede/README.md) and accelerated versions of [`autoware_pointcloud_preprocessor`](../../sensing/autoware_cuda_pointcloud_preprocessor/README.md) have been implemented. These can be leveraged to improve the performance of ground segmentation filter algorithms using CUDA/GPU.
+
+This package reimplements the current `scan_ground_filter` of the `autoware_ground_segmentation` package with CUDA to reduce latency and avoid the bottleneck caused by processing a large number of points.
+
+## Inner-workings / Algorithm
+
+The detailed algorithm is available in [scan-ground-filter.md](../autoware_ground_segmentation/docs/scan-ground-filter.md).
+
+## Parameters
+
+{{ json_to_markdown("perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json") }}
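+
+## Usage
+
+A minimal example of launching the node standalone, assuming the workspace has been built and sourced (the default topic remappings and parameter file come from `launch/cuda_scan_ground_filter.launch.py` in this package). Within the full perception stack, the node can instead be enabled through the `use_cuda_ground_segmentation` launch argument added to `tier4_perception_launch`.
+
+```bash
+ros2 launch autoware_ground_segmentation_cuda cuda_scan_ground_filter.launch.py
+```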
diff --git a/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml
new file mode 100644
index 0000000000000..20c8026798e8d
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/config/cuda_scan_ground_segmentation_filter.param.yaml
@@ -0,0 +1,19 @@
+/**:
+ ros__parameters:
+ global_slope_max_angle_deg: 10.0
+ local_slope_max_angle_deg: 25.0
+ non_ground_height_threshold: 0.20
+ grid_size_m: 0.5
+ detection_range_z_max: 3.2
+ use_recheck_ground_cluster: true
+ recheck_start_distance: 20.0
+ use_lowest_point: true
+ center_pcl_shift: 0.0
+ sector_angle_deg: 1.0
+ gnd_cell_buffer_size: 5
+ min_x: -100.0
+ max_x: 150.0
+ min_y: -70.0
+ max_y: 70.0
+ max_z: 2.5
+ min_z: -2.5
diff --git a/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png
new file mode 100644
index 0000000000000..31586172329a7
Binary files /dev/null and b/perception/autoware_ground_segmentation_cuda/docs/image/ground_segmentation_issue.png differ
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp
new file mode 100644
index 0000000000000..b22ab4cc9aefd
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_common.hpp
@@ -0,0 +1,38 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
+
+#include
+
+#include
+
+#ifndef CUDAH
+#define CUDAH __forceinline__ __host__ __device__
+#endif
+
+#ifndef BLOCK_SIZE_X
+#define BLOCK_SIZE_X (256)
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE (32)
+#endif
+
+#ifndef FULL_MASK
+#define FULL_MASK (0xFFFFFFFF)
+#endif
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_COMMON_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp
new file mode 100644
index 0000000000000..d58384f9070eb
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_mempool_wrapper.hpp
@@ -0,0 +1,83 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
+
+#include
+
+#include
+
+namespace autoware
+{
+
+class CudaMempool
+{
+public:
+ inline explicit CudaMempool(int device_id = 0, int64_t pool_release_threshold = 0)
+ {
+ cudaMemPoolProps pool_props = {};
+
+    // Use the currently active device (this overrides the device_id argument)
+ CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+
+ pool_props.allocType = cudaMemAllocationTypePinned;
+ pool_props.location.id = device_id;
+ pool_props.location.type = cudaMemLocationTypeDevice;
+
+ CHECK_CUDA_ERROR(cudaMemPoolCreate(&pool_, &pool_props));
+
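+    // cudaMemPoolAttrReleaseThreshold is the amount of reserved memory (in bytes) the pool
+    // holds on to before trying to release memory back to the OS; the default of 0 releases
+    // unused memory at every stream/event/context synchronization.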
+ if (pool_release_threshold >= 0) {
+ CHECK_CUDA_ERROR(
+ cudaMemPoolSetAttribute(pool_, cudaMemPoolAttrReleaseThreshold, &pool_release_threshold));
+ }
+ }
+
+ CudaMempool(const CudaMempool &) = delete;
+ inline CudaMempool(CudaMempool && other) : pool_(std::move(other.pool_))
+ {
+ other.pool_ = nullptr;
+ }
+
+ CudaMempool & operator=(const CudaMempool &) = delete;
+ inline CudaMempool & operator=(CudaMempool && other)
+ {
+ if (this != &other) {
+ release();
+ pool_ = std::move(other.pool_);
+ other.pool_ = nullptr;
+ }
+
+ return *this;
+ }
+
+ inline cudaMemPool_t get() { return pool_; }
+
+ ~CudaMempool() { release(); }
+
+private:
+ inline void release()
+ {
+ if (pool_) {
+ CHECK_CUDA_ERROR(cudaMemPoolDestroy(pool_));
+ }
+
+ pool_ = nullptr;
+ }
+
+ cudaMemPool_t pool_;
+};
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_MEMPOOL_WRAPPER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp
new file mode 100644
index 0000000000000..d7fc8875d61d1
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp
@@ -0,0 +1,179 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
+
+#include "cuda_common.hpp"
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+#include "device_vector.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+
+enum SegmentationMode : uint8_t { UNINITIALIZED = 0, CONTINUOUS, DISCONTINUOUS, BREAK };
+
+struct PointTypeStruct
+{
+ float x;
+ float y;
+ float z;
+ std::uint8_t intensity;
+ std::uint8_t return_type;
+ std::uint16_t channel;
+};
+
+enum class PointType : uint8_t {
+ INIT = 0,
+ GROUND,
+ NON_GROUND,
+ POINT_FOLLOW,
+ UNKNOWN,
+ VIRTUAL_GROUND,
+ OUT_OF_RANGE
+};
+
+struct alignas(16) ClassifiedPointType
+{
+ float z;
+ PointType type;
+ float radius;
+ size_t origin_index; // index in the original point cloud
+
+ CUDAH ClassifiedPointType() : z(0.0), type(PointType::INIT), radius(-1.0), origin_index(0) {}
+};
+
+struct alignas(16) Cell
+{
+ float gnd_radius_avg;
+ float gnd_height_avg;
+ float gnd_height_min;
+ uint32_t num_ground_points;
+
+  // Default constructor: initialize all ground statistics to zero
+ CUDAH Cell()
+ : gnd_radius_avg(0.0f), gnd_height_avg(0.0f), gnd_height_min(0.0f), num_ground_points(0)
+ {
+ }
+};
+
+// structure to hold parameter values
+struct FilterParameters
+{
+ float max_x;
+ float min_x;
+ float max_y;
+ float min_y;
+ float max_z;
+ float min_z;
+ float max_radius;
+ float non_ground_height_threshold;
+ // common thresholds
+ float global_slope_max_angle_rad; // radians
+ float local_slope_max_angle_rad; // radians
+ float global_slope_max_ratio;
+ float local_slope_max_ratio;
+ // common parameters
+ float sector_angle_rad; // radial sector angle in radians
+ float inv_sector_angle_rad;
+
+ // cell mode parameters
+ float recheck_start_distance; // distance to start rechecking ground cluster
+ float detection_range_z_max;
+ // cell parameters
+ float cell_divider_size_m;
+ float center_x{0.0f};
+ float center_y{0.0f};
+ uint32_t max_num_cells;
+ uint32_t max_num_cells_per_sector; // number of cells per sector
+ uint32_t num_sectors; // number of radial sectors
+ uint32_t gnd_grid_continual_thresh{3}; // threshold for continual ground grid
+ uint32_t use_recheck_ground_cluster; // to enable recheck ground cluster
+ uint32_t gnd_cell_buffer_size{5};
+};
+
+class CudaScanGroundSegmentationFilter
+{
+public:
+ explicit CudaScanGroundSegmentationFilter(
+ const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte);
+ ~CudaScanGroundSegmentationFilter() = default;
+
+ // Method to process the point cloud data and filter ground points
+ void classifyPointCloud(
+ const cuda_blackboard::CudaPointCloud2 & input_points,
+ cuda_blackboard::CudaPointCloud2 & ground, cuda_blackboard::CudaPointCloud2 & non_ground);
+
+private:
+ std::unique_ptr dev_input_points_, non_ground_, ground_;
+ std::unique_ptr> empty_cell_mark_;
+
+ uint32_t number_input_points_;
+ uint32_t num_process_points_host_;
+ uint32_t input_pointcloud_step_;
+ uint32_t input_xyzi_offset_[4];
+ float center_x_{0.0f};
+ float center_y_{0.0f};
+ const uint32_t gnd_grid_continual_thresh_{3}; // threshold for continual ground grid
+ const uint32_t continual_gnd_grid_thresh_{5}; // threshold for continual ground grid with recheck
+ // Parameters
+ FilterParameters filter_parameters_;
+
+private:
+ // Remove points too far from the origin
+ void removeOutliers(const cuda_blackboard::CudaPointCloud2 & input);
+
+ /*
+   * This function calculates the cell_id for each point and
+   * assigns the points, with an initialized class, into temporary memory for classification.
+   * The memory size of each cell depends on the predefined number of points per cell.
+   */
+
+ void scanPerSectorGroundReference(
+ device_vector & cell_list, device_vector & starting_pid,
+ device_vector & classified_points);
+
+ /*
+   * Extract points of the requested type from classified_points into the output point cloud
+ */
+ template
+ void extractPoints(
+ device_vector & classified_points,
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output);
+
+ // Distribute points to cells
+ void sort_points(
+ device_vector & cell_list, device_vector & starting_pid,
+ device_vector & classified_points);
+
+ std::shared_ptr stream_;
+ std::shared_ptr mempool_;
+};
+
+} // namespace autoware::cuda_ground_segmentation
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp
new file mode 100644
index 0000000000000..f5e82b0ef7125
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp
@@ -0,0 +1,64 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp"
+
+#include
+#include
+#include
+#include
+#include
+
+// Autoware utils
+#include
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+class CudaScanGroundSegmentationFilterNode : public rclcpp::Node
+{
+public:
+ explicit CudaScanGroundSegmentationFilterNode(const rclcpp::NodeOptions & options);
+
+private:
+ void cudaPointCloudCallback(const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg);
+ // Cuda Ground Segmentation Filter
+ std::unique_ptr cuda_ground_segmentation_filter_{};
+ // debugger
+ std::unique_ptr> stop_watch_ptr_{nullptr};
+ std::unique_ptr debug_publisher_ptr_{nullptr};
+
+ // rclcpp::Subscription::SharedPtr sub_;
+ // rclcpp::Publisher::SharedPtr pub_;
+ // rclcpp::Publisher::SharedPtr pub_gnd_;
+
+ // Cuda Sub
+ std::shared_ptr>
+ sub_{};
+ // Cuda Pub
+ std::unique_ptr>
+ pub_{};
+
+ std::unique_ptr>
+ pub_gnd_{};
+};
+
+} // namespace autoware::cuda_ground_segmentation
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_SCAN_GROUND_SEGMENTATION_FILTER_NODE_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp
new file mode 100644
index 0000000000000..e33dd49f36eb6
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_stream_wrapper.hpp
@@ -0,0 +1,72 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
+
+#include
+
+#include
+
+#include
+
+namespace autoware
+{
+
+// A wrapper around cudaStream_t, with a destructor that releases the stream automatically
+class CudaStream
+{
+public:
+  inline explicit CudaStream(bool is_null = false)
+ {
+ // If is_null is true, we use the default stream
+ // Otherwise, we create a new stream
+ if (!is_null) {
+ CHECK_CUDA_ERROR(cudaStreamCreate(&stream_));
+ }
+ }
+
+ CudaStream(const CudaStream &) = delete;
+ CudaStream(CudaStream && other) : stream_(std::move(other.stream_)) { other.stream_ = nullptr; }
+
+ CudaStream & operator=(const CudaStream &) = delete;
+ inline CudaStream & operator=(CudaStream && other)
+ {
+ if (this != &other) {
+ release();
+ stream_ = std::move(other.stream_);
+ other.stream_ = nullptr;
+ }
+
+ return *this;
+ }
+
+ inline cudaStream_t & get() { return stream_; }
+
+ ~CudaStream() { release(); }
+
+private:
+ inline void release()
+ {
+    if (stream_) {
+      CHECK_CUDA_ERROR(cudaStreamDestroy(stream_));
+    }
+
+    stream_ = nullptr;
+  }
+
+ cudaStream_t stream_{nullptr};
+};
+
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__CUDA_STREAM_WRAPPER_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh
new file mode 100644
index 0000000000000..db560b87b92d9
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/cuda_utility.cuh
@@ -0,0 +1,147 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
+
+#include "cuda_common.hpp"
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+#include "device_vector.hpp"
+
+#include
+
+#include
+
+namespace autoware::cuda
+{
+
+template
+inline cudaError_t launchAsync(
+ int thread_num, int shared_size, cudaStream_t & stream, void (*f)(Args...), Args... args)
+{
+ int block_x = (thread_num > block_size) ? block_size : thread_num;
+
+ if (block_x <= 0) {
+ return cudaErrorLaunchFailure;
+ }
+
+ int grid_x = (thread_num + block_x - 1) / block_x;
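+  // e.g. thread_num = 1000 with block_size = 256 gives block_x = 256 and grid_x = 4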
+
+  f<<<grid_x, block_x, shared_size, stream>>>(args...);
+
+ return cudaGetLastError();
+}
+
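+// Exclusive prefix sum over a device array, e.g. [3, 1, 4, 1] -> [0, 3, 4, 8].
+// cub::DeviceScan::ExclusiveSum is called twice below: the first call (with a null
+// temporary-storage pointer) only queries the required temporary-storage size,
+// the second call performs the actual scan.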
+template
+cudaError_t ExclusiveScan(
+ T * input, T * output, int ele_num, std::shared_ptr stream,
+ std::shared_ptr mempool)
+{
+ if (ele_num == 0) {
+ return cudaSuccess;
+ }
+
+ if (ele_num < 0 || !stream) {
+ return cudaErrorInvalidValue;
+ }
+
+ device_vector d_temp_storage(stream, mempool);
+ size_t temp_storage_bytes = 0;
+
+ cub::DeviceScan::ExclusiveSum(
+ (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get());
+
+ int temp_ele_num = (temp_storage_bytes + sizeof(int) - 1) / sizeof(int);
+ d_temp_storage.resize(temp_ele_num);
+
+ cub::DeviceScan::ExclusiveSum(
+ (void *)(d_temp_storage.data()), temp_storage_bytes, input, output, ele_num, stream->get());
+
+ return cudaGetLastError();
+}
+
+template
+cudaError_t ExclusiveScan(device_vector & input, device_vector & output)
+{
+ if (input.empty()) {
+ return cudaSuccess;
+ }
+
+ output.resize(input.size());
+
+ return ExclusiveScan(
+ input.data(), output.data(), (int)(input.size()), input.get_stream(), input.get_mempool());
+}
+
+template
+cudaError_t ExclusiveScan(device_vector & input)
+{
+ return ExclusiveScan(input, input);
+}
+
+template
+__global__ void fillVector(T * vec, int ele_num, T init_val)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < ele_num; i += stride) {
+ vec[i] = init_val;
+ }
+}
+
+template
+cudaError_t fill(T * input, int ele_num, T val, std::shared_ptr stream)
+{
+ if (ele_num == 0) {
+ return cudaSuccess;
+ }
+
+ if (ele_num < 0 || !stream) {
+ return cudaErrorInvalidValue;
+ }
+
+ return launchAsync(ele_num, 0, stream->get(), fillVector, input, ele_num, val);
+}
+
+template
+cudaError_t fill(device_vector & input, T val)
+{
+ return fill(input.data(), (int)(input.size()), val, input.get_stream());
+}
+
+CUDAH void memcpy(uint8_t * dst, const uint8_t * src, int size)
+{
+ for (int i = 0; i < size; ++i) {
+ dst[i] = src[i];
+ }
+}
+
+inline void copyPointCloud2Metadata(
+ cuda_blackboard::CudaPointCloud2 & dst, const cuda_blackboard::CudaPointCloud2 & src)
+{
+ dst.header = src.header;
+ dst.height = src.height;
+ dst.width = src.width;
+ dst.fields = src.fields;
+ dst.is_bigendian = src.is_bigendian;
+ dst.point_step = src.point_step;
+ dst.row_step = src.row_step;
+ dst.is_dense = src.is_dense;
+}
+
+} // namespace autoware::cuda
+
+#endif  // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION_CUDA_UTILITIES_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp
new file mode 100644
index 0000000000000..20928e4c84321
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/include/autoware/cuda_scan_ground_segmentation/device_vector.hpp
@@ -0,0 +1,332 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
+#define AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
+
+#include "cuda_mempool_wrapper.hpp"
+#include "cuda_stream_wrapper.hpp"
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace autoware
+{
+
+template
+class device_vector
+{
+public:
+ device_vector(
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector(
+ const device_vector & other,
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr,
+ bool copy_all = false // If true, use the stream and mem_pool of the other
+ );
+
+ device_vector(device_vector && other);
+
+ device_vector(
+ size_t ele_num, std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector(
+ const std::vector & other,
+ std::shared_ptr stream = std::make_shared(true),
+ std::shared_ptr mem_pool = nullptr);
+
+ device_vector & operator=(const device_vector & other);
+ device_vector & operator=(device_vector && other);
+ device_vector & operator=(const std::vector & other);
+
+ void to_vector(std::vector & output) const
+ {
+ output.resize(ele_num_);
+ copyDtoH(output.data(), data_, ele_num_);
+ }
+
+ // Slow, do not use unless necessary
+ T operator[](int idx) const
+ {
+ if (idx < 0 || idx >= ele_num_) {
+ throw std::invalid_argument("Error: out-of-bound access at index " + std::to_string(idx));
+ }
+ T val;
+
+ copyDtoH(&val, data_ + idx, 1);
+
+ CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_->get()));
+
+ return val;
+ }
+
+ T * data() { return data_; }
+ const T * data() const { return data_; }
+ size_t size() const { return ele_num_; }
+
+ void resize(size_t new_size);
+ void reserve(size_t new_size);
+ bool empty() const { return (ele_num_ == 0); }
+ void clear();
+
+ std::shared_ptr get_stream() { return stream_; }
+ std::shared_ptr get_mempool() { return mempool_; }
+
+ ~device_vector();
+
+  // Allocate device memory for ele_num elements
+ // Return false if allocation fail
+ void allocate(size_t ele_num);
+ void release();
+ void copyDtoD(T * dst, const T * src, size_t ele_num);
+ void copyHtoD(T * dst, const T * src, size_t ele_num);
+ void copyDtoH(T * dst, const T * src, size_t ele_num) const;
+
+ T * data_;
+ size_t ele_num_; // The assumed size
+ size_t actual_ele_num_; // The actual size of the vector
+ std::shared_ptr stream_; // If null, use the default stream
+ std::shared_ptr mempool_; // If null, use the default non-pool allocation
+};
+
+template
+device_vector::device_vector(
+ std::shared_ptr stream, std::shared_ptr mem_pool)
+: stream_(stream), mempool_(mem_pool)
+{
+ data_ = nullptr;
+ ele_num_ = actual_ele_num_ = 0;
+}
+
+template
+device_vector::device_vector(
+ const device_vector & other, std::shared_ptr stream,
+ std::shared_ptr mem_pool, bool copy_all)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (copy_all) {
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+ }
+
+ if (other.empty()) {
+ return;
+ }
+
+ // Allocate memory
+ resize(other.size());
+
+ // Copy
+ copyDtoD(data_, other.data_, other.size());
+}
+
+template
+device_vector::device_vector(device_vector && other)
+{
+ data_ = other.data_;
+ ele_num_ = other.ele_num_;
+ actual_ele_num_ = other.actual_ele_num_;
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+
+ other.data_ = nullptr;
+ other.ele_num_ = other.actual_ele_num_ = 0;
+ other.stream_.reset();
+ other.mempool_.reset();
+}
+
+template
+device_vector::device_vector(
+ size_t ele_num, std::shared_ptr stream, std::shared_ptr mem_pool)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (ele_num > 0) {
+ resize(ele_num);
+ }
+}
+
+template
+device_vector::device_vector(
+ const std::vector & other, std::shared_ptr stream,
+ std::shared_ptr mem_pool)
+: data_(nullptr), stream_(stream), mempool_(mem_pool)
+{
+ ele_num_ = actual_ele_num_ = 0;
+
+ if (other.empty()) {
+ return;
+ }
+
+ allocate(other.size());
+ copyHtoD(data_, other.data(), other.size());
+ ele_num_ = other.size();
+}
+
+template
+device_vector & device_vector::operator=(const device_vector & other)
+{
+ size_t other_size = other.size();
+
+ resize(other_size);
+
+  if (other_size != 0) {
+ copyDtoD(data_, other.data_, other_size);
+ }
+
+ return *this;
+}
+
+template
+device_vector & device_vector::operator=(device_vector && other)
+{
+ release();
+
+ data_ = other.data_;
+ ele_num_ = other.ele_num_;
+ actual_ele_num_ = other.actual_ele_num_;
+ stream_ = other.stream_;
+ mempool_ = other.mempool_;
+
+ // TODO: is stream_ = other.stream_ necessary?
+ other.data_ = nullptr;
+ other.ele_num_ = other.actual_ele_num_ = 0;
+ other.mempool_.reset();
+ other.stream_.reset();
+
+ return *this;
+}
+
+template
+device_vector & device_vector::operator=(const std::vector & other)
+{
+ release();
+
+ if (other.empty()) {
+ return *this;
+ }
+
+  allocate(other.size());
+  copyHtoD(data_, other.data(), other.size());
+  ele_num_ = other.size();
+
+ return *this;
+}
+
+template
+void device_vector::resize(size_t new_ele_num)
+{
+ // If no significant change, just change the number of elements
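+  // (e.g. with actual_ele_num_ = 100, any new size in [80, 100] reuses the existing buffer)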
+ if (new_ele_num >= actual_ele_num_ * 0.8 && new_ele_num <= actual_ele_num_) {
+ ele_num_ = new_ele_num;
+ return;
+ }
+
+ // Otherwise, reallocate the data
+ release();
+ allocate(new_ele_num);
+ ele_num_ = new_ele_num;
+}
+
+template
+void device_vector::reserve(size_t new_ele_num)
+{
+ if (new_ele_num <= actual_ele_num_ * 0.8) {
+ return;
+ }
+
+  // Reallocation does not preserve existing contents (same behavior as resize())
+  release();
+  allocate(new_ele_num);
+}
+
+template
+void device_vector::clear()
+{
+ // Release the memory only
+ release();
+}
+
+template
+device_vector::~device_vector()
+{
+ // Release the memory
+ release();
+ // Release the stream and memory pool
+ stream_.reset();
+ mempool_.reset();
+}
+
+template
+void device_vector::allocate(size_t ele_num)
+{
+ if (ele_num > 0) {
+ if (mempool_) {
+ CHECK_CUDA_ERROR(
+ cudaMallocFromPoolAsync(&data_, sizeof(T) * ele_num, mempool_->get(), stream_->get()));
+ } else {
+ CHECK_CUDA_ERROR(cudaMallocAsync(&data_, sizeof(T) * ele_num, stream_->get()));
+ }
+ }
+
+ actual_ele_num_ = ele_num;
+ // Currently no element yet
+ ele_num_ = 0;
+}
+
+template
+void device_vector::release()
+{
+ if (data_) {
+ CHECK_CUDA_ERROR(cudaFreeAsync(data_, stream_->get()));
+ }
+
+ data_ = nullptr;
+ ele_num_ = 0;
+ actual_ele_num_ = 0;
+}
+
+template
+void device_vector::copyDtoD(T * dst, const T * src, size_t ele_num)
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToDevice, stream_->get()));
+}
+
+template
+void device_vector::copyHtoD(T * dst, const T * src, size_t ele_num)
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyHostToDevice, stream_->get()));
+}
+
+template
+void device_vector::copyDtoH(T * dst, const T * src, size_t ele_num) const
+{
+ CHECK_CUDA_ERROR(
+ cudaMemcpyAsync(dst, src, sizeof(T) * ele_num, cudaMemcpyDeviceToHost, stream_->get()));
+}
+
+} // namespace autoware
+
+#endif // AUTOWARE__CUDA_SCAN_GROUND_SEGMENTATION__DEVICE_VECTOR_HPP_
diff --git a/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py
new file mode 100644
index 0000000000000..688b1743806fc
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/launch/cuda_scan_ground_filter.launch.py
@@ -0,0 +1,94 @@
+# Copyright 2021 Tier IV, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import launch
+from launch.actions import DeclareLaunchArgument
+from launch.actions import GroupAction
+from launch.actions import OpaqueFunction
+from launch.conditions import LaunchConfigurationEquals
+from launch.conditions import LaunchConfigurationNotEquals
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import ComposableNodeContainer
+from launch_ros.actions import LoadComposableNodes
+from launch_ros.descriptions import ComposableNode
+from launch_ros.parameter_descriptions import ParameterFile
+from launch_ros.substitutions import FindPackageShare
+
+
+def launch_setup(context, *args, **kwargs):
+
+ ground_segmentation_node_param = ParameterFile(
+ param_file=LaunchConfiguration("cuda_ground_segmentation_node_param_path").perform(context),
+ allow_substs=True,
+ )
+
+ nodes = [
+ ComposableNode(
+ package="autoware_ground_segmentation_cuda",
+ plugin="autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode",
+ name="cuda_scan_ground_segmentation_filter",
+ remappings=[
+ ("~/input/pointcloud", LaunchConfiguration("input/pointcloud")),
+ ("~/output/pointcloud", LaunchConfiguration("output/pointcloud")),
+ ],
+ parameters=[ground_segmentation_node_param],
+ extra_arguments=[],
+ ),
+ ]
+
+ loader = LoadComposableNodes(
+ condition=LaunchConfigurationNotEquals("container", ""),
+ composable_node_descriptions=nodes,
+ target_container=LaunchConfiguration("container"),
+ )
+
+ container = ComposableNodeContainer(
+ name="scan_ground_filter_container",
+ namespace="",
+ package="rclcpp_components",
+ executable="component_container",
+ composable_node_descriptions=nodes,
+ output="screen",
+ condition=LaunchConfigurationEquals("container", ""),
+ )
+
+ group = GroupAction(
+ [
+ container,
+ loader,
+ ]
+ )
+
+ return [group]
+
+
+def generate_launch_description():
+ def add_launch_arg(name: str, default_value=None):
+ return DeclareLaunchArgument(name, default_value=default_value)
+
+ return launch.LaunchDescription(
+ [
+ add_launch_arg("container", ""),
+ add_launch_arg("input/pointcloud", "/sensing/lidar/concatenated/pointcloud"),
+ add_launch_arg("output/pointcloud", "/perception/obstacle_segmentation/pointcloud"),
+ add_launch_arg(
+ "cuda_ground_segmentation_node_param_path",
+ [
+ FindPackageShare("autoware_ground_segmentation_cuda"),
+ "/config/cuda_scan_ground_segmentation_filter.param.yaml",
+ ],
+ ),
+ ]
+ + [OpaqueFunction(function=launch_setup)]
+ )
diff --git a/perception/autoware_ground_segmentation_cuda/package.xml b/perception/autoware_ground_segmentation_cuda/package.xml
new file mode 100644
index 0000000000000..38d7e4f6931de
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/package.xml
@@ -0,0 +1,39 @@
+
+
+
+ autoware_ground_segmentation_cuda
+ 0.0.0
+ GPU-Accelerated ground segmentation
+ badai-nguyen
+ Anh Nguyen
+ badai-nguyen
+ Apache License 2.0
+
+ ament_cmake_auto
+ autoware_cmake
+
+ agnocastlib
+ autoware_agnocast_wrapper
+ autoware_cuda_dependency_meta
+ autoware_cuda_pointcloud_preprocessor
+ autoware_cuda_utils
+ autoware_point_types
+ autoware_pointcloud_preprocessor
+ autoware_utils
+ cuda_blackboard
+ diagnostic_msgs
+ rclcpp
+ rclcpp_components
+ sensor_msgs
+ tf2
+ tf2_msgs
+ tier4_debug_msgs
+
+ ament_clang_format
+ ament_lint_auto
+ ament_lint_common
+
+
+ ament_cmake
+
+
diff --git a/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json
new file mode 100644
index 0000000000000..f95891773c172
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/schema/cuda_scan_ground_segmentation_filter.schema.json
@@ -0,0 +1,126 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Cuda Scan Ground Segmentation Filter Node Params",
+ "type": "object",
+ "definitions": {
+ "cuda_scan_ground_segmentation_filter": {
+ "type": "object",
+ "properties": {
+ "global_slope_max_angle_deg": {
+ "type": "number",
+ "description": "Maximum global ground slope angle [deg]. Larger values reduce false negatives on steep roads but can increase false positives (classifying objects as ground).",
+ "default": 10.0
+ },
+ "local_slope_max_angle_deg": {
+ "type": "number",
+ "description": "Maximum local slope angle [deg] between adjacent points (used for fine discrimination of small objects or ramps).",
+ "default": 13.0
+ },
+ "non_ground_height_threshold": {
+ "type": "number",
+ "description": "Absolute vertical distance threshold [m] above estimated ground line to classify a point as non-ground.",
+ "default": 0.2
+ },
+ "grid_size_m": {
+ "type": "number",
+ "description": "Radial cell size in polar grid [m]; larger values smooth ground estimation but may miss small objects.",
+ "default": 0.1
+ },
+ "gnd_cell_buffer_size": {
+ "type": "integer",
+ "description": "Number of previous ground cells used to estimate local ground slope.",
+ "default": 4
+ },
+ "detection_range_z_max": {
+ "type": "number",
+ "description": "Maximum Z height (above base_link) considered in classification [m].",
+ "default": 2.5
+ },
+ "center_pcl_shift": {
+ "type": "number",
+ "description": "X-axis offset [m] applied to additional LiDAR point clouds relative to vehicle reference (positive forward).",
+ "default": 0.0
+ },
+ "sector_angle_deg": {
+ "type": "number",
+ "description": "Angular width of each polar sector [deg].",
+ "default": 1.0
+ },
+ "use_recheck_ground_cluster": {
+ "type": "boolean",
+ "description": "Enable secondary pass to re-evaluate ground clusters (may reduce false ground on elevated points).",
+ "default": true
+ },
+ "recheck_start_distance": {
+ "type": "number",
+ "description": "Minimum radial distance [m] at which ground cluster recheck logic begins.",
+ "default": 20.0,
+ "minimum": 0.0
+ },
+ "use_lowest_point": {
+ "type": "boolean",
+ "description": "Use lowest point in cell as reference during ground recheck (otherwise median-like selection).",
+ "default": true
+ },
+ "min_x": {
+ "type": "number",
+ "description": "Minimum X range relative to base_link [m] included in processing.",
+ "default": -100.0
+ },
+ "max_x": {
+ "type": "number",
+ "description": "Maximum X (forward) range relative to base_link [m] included in processing.",
+ "default": 150.0
+ },
+ "min_y": {
+ "type": "number",
+ "description": "Minimum Y range relative to base_link [m] included in processing.",
+ "default": -70.0
+ },
+ "max_y": {
+ "type": "number",
+ "description": "Maximum Y range relative to base_link [m] included in processing.",
+ "default": 70.0
+ },
+ "min_z": {
+ "type": "number",
+ "description": "Minimum Z range relative to base_link [m] included in processing.",
+ "default": -2.5
+ },
+ "max_z": {
+ "type": "number",
+ "description": "Maximum Z range relative to base_link [m] included in processing.",
+ "default": 2.5
+ }
+ },
+ "required": [
+ "global_slope_max_angle_deg",
+ "local_slope_max_angle_deg",
+ "non_ground_height_threshold",
+ "grid_size_m",
+ "detection_range_z_max",
+ "use_recheck_ground_cluster",
+ "recheck_start_distance",
+ "use_lowest_point",
+ "center_pcl_shift",
+ "sector_angle_deg",
+ "gnd_cell_buffer_size"
+ ],
+ "additionalProperties": false
+ }
+ },
+ "properties": {
+ "/**": {
+ "type": "object",
+ "properties": {
+ "ros__parameters": {
+ "$ref": "#/definitions/cuda_scan_ground_segmentation_filter"
+ }
+ },
+ "required": ["ros__parameters"],
+ "additionalProperties": false
+ }
+ },
+ "required": ["/**"],
+ "additionalProperties": false
+}
diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
new file mode 100644
index 0000000000000..4c09415dab4e2
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.cu
@@ -0,0 +1,819 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace autoware::cuda_ground_segmentation
+{
+
+__device__ __forceinline__ float fastAtan2_0_2Pi(float y, float x)
+{
+ return fmodf(atan2(y, x) + 2.0f * M_PI, 2.0f * M_PI);
+}
+
+// Initialize the cell list
+__global__ void cellInit(Cell * __restrict__ cell_list, int max_num_cells)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+ Cell new_cell;
+
+ for (int i = index; i < max_num_cells; i += stride) {
+ new_cell.gnd_height_min = 1e6f;
+
+ cell_list[i] = new_cell;
+ }
+}
+
+/**
+ * @brief CUDA kernel to count the number of points in each grid cell for ground segmentation.
+ *
+ * This kernel processes each input point, computes its polar coordinates (radius and angle)
+ * relative to a specified center, and determines which cell in a polar grid the point belongs to.
+ * It then atomically increments the point count for the corresponding cell.
+ *
+ * @param input_points Pointer to the array of input points (device memory).
+ * @param point_num Number of input points in the array.
+ * @param param Filter parameters (passed by value), containing grid and segmentation
+ *   configuration such as the center coordinates, sector angle, and cell size.
+ * @param cell_id Output array (device memory) that receives the cell id computed for each point.
+ * @param count Output array (device memory) that accumulates the number of points per cell.
+ * @param classified_points Output array (device memory) of per-point classification records,
+ *   initialized here with each point's height, radius, and original index.
+ *
+ * @note Each thread processes one or more points (grid-stride loop). Atomic operations are used
+ *   to safely increment a cell's point count when multiple threads access it concurrently.
+ */
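+// Example with a 1-degree sector angle (360 sectors) and a 0.5 m cell size:
+// a point at (x, y) = (10, 10) relative to the center has radius ~14.14 m and angle 45 deg,
+// so sector_id = 45, cell_id_in_sector = 28, and point_cell_id = 28 * 360 + 45 = 10125.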
+__global__ void computeCellId(
+ const PointTypeStruct * __restrict__ input_points, int point_num, FilterParameters param,
+ int * __restrict__ cell_id, int * __restrict__ count,
+ ClassifiedPointType * __restrict__ classified_points)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+ ClassifiedPointType cp;
+
+ cp.type = PointType::INIT;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = input_points[i];
+ float dx = p.x - param.center_x;
+ float dy = p.y - param.center_y;
+ float radius = hypotf(dx, dy);
+ float angle = fastAtan2_0_2Pi(dy, dx);
+ int sector_id = static_cast(angle * param.inv_sector_angle_rad);
+ int cell_id_in_sector = static_cast(radius / param.cell_divider_size_m);
+ int point_cell_id = cell_id_in_sector * param.num_sectors + sector_id;
+
+ // Save this so we don't have to recalculate later
+ cell_id[i] = point_cell_id;
+
+    // Also, initialize the classified point record
+ cp.z = p.z;
+ cp.radius = radius;
+ cp.origin_index = i;
+
+ classified_points[i] = cp;
+
+ // Also update the number of points in each cell atomically
+ atomicAdd(count + point_cell_id, 1);
+ }
+}
+
+__global__ void distributePointsToCell(
+ ClassifiedPointType * input, ClassifiedPointType * output, int * cell_id, int point_num,
+ int * writing_loc)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int cid = cell_id[i];
+ int wloc = atomicAdd(writing_loc + cid, 1);
+
+ output[wloc] = input[i];
+ }
+}
+
+CudaScanGroundSegmentationFilter::CudaScanGroundSegmentationFilter(
+ const FilterParameters & filter_parameters, const int64_t max_mem_pool_size_in_byte)
+: filter_parameters_(filter_parameters)
+{
+ stream_.reset(new CudaStream());
+
+ int current_device_id = 0;
+
+  CHECK_CUDA_ERROR(cudaGetDevice(&current_device_id));
+ mempool_.reset(new CudaMempool(current_device_id, max_mem_pool_size_in_byte));
+
+ // Pre-allocate the PointCloud2 objects
+ // The CudaPointCloud2 still use sync cudaMalloc to allocate memory
+ // Those calls are costly, so it should be done only once.
+ dev_input_points_.reset(new cuda_blackboard::CudaPointCloud2);
+
+ // Warm-up the memory pool a bit
+ empty_cell_mark_.reset(new device_vector(100000, stream_, mempool_));
+}
+
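+// Example: with gnd_cell_buffer_size = 5, gnd_cell_continual_threshold = 3, no empty cells,
+// closest_gnd_cell_id = 10 and furthest_gnd_cell_id = 6, cell 11 is CONTINUOUS (11 - 6 <= 5),
+// cell 12 is DISCONTINUOUS (12 - 6 > 5 but 12 - 10 < 3), and cell 13 is BREAK (13 - 10 >= 3).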
+__forceinline__ __device__ SegmentationMode checkSegmentationMode(
+ int cell_id, int closest_gnd_cell_id, int furthest_gnd_cell_id, int gnd_cell_buffer_size,
+ int gnd_cell_continual_threshold, int empty_cell_num)
+{
+ if (closest_gnd_cell_id < 0) {
+ return SegmentationMode::UNINITIALIZED;
+ }
+
+ if (cell_id - closest_gnd_cell_id >= gnd_cell_continual_threshold) {
+ return SegmentationMode::BREAK;
+ }
+
+ if (cell_id - furthest_gnd_cell_id - empty_cell_num <= gnd_cell_buffer_size) {
+ return SegmentationMode::CONTINUOUS;
+ }
+
+ return SegmentationMode::DISCONTINUOUS;
+}
+
+__forceinline__ __device__ void segmentUninitializedPoint(
+ ClassifiedPointType & p, const FilterParameters & param)
+{
+ if (p.z > param.detection_range_z_max || p.z < -param.non_ground_height_threshold) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float global_height_th = p.radius * param.global_slope_max_ratio;
+
+ if (p.z > global_height_th && p.z > param.non_ground_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+  float abs_pz = fabsf(p.z);
+
+ if (abs_pz < global_height_th && abs_pz < param.non_ground_height_threshold) {
+ p.type = PointType::GROUND;
+ }
+}
+
+__forceinline__ __device__ void segmentContinuousPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float slope,
+ float prev_cell_gnd_height_avg, float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m;
+ float dz = p.z - prev_cell_gnd_height_avg;
+
+ if (p.z > param.global_slope_max_ratio * p.radius) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ if (dz > param.local_slope_max_ratio * d_radius) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ float estimated_ground_z = prev_cell_gnd_height_avg + slope * param.cell_divider_size_m;
+
+ if (p.z > estimated_ground_z + param.non_ground_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+  if (
+    p.z < estimated_ground_z - param.non_ground_height_threshold ||
+    dz < -param.local_slope_max_ratio * d_radius ||
+    p.z < -param.global_slope_max_ratio * p.radius) {
+    p.type = PointType::OUT_OF_RANGE;
+
+    return;
+  }
+
+ p.type = PointType::GROUND;
+}
+
+__forceinline__ __device__ void segmentDiscontinuousPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg,
+ float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float dz = p.z - prev_cell_gnd_height_avg;
+ // float d_radius = p.radius - prev_cell_gnd_radius_avg + param.cell_divider_size_m;
+ float d_radius = p.radius - prev_cell_gnd_radius_avg;
+ float global_height_threshold = p.radius * param.global_slope_max_ratio;
+ float local_height_threshold = param.local_slope_max_ratio * d_radius;
+
+ if (p.z > global_height_threshold || dz > local_height_threshold) {
+ p.type = PointType::NON_GROUND;
+ return;
+ }
+
+ if (dz < -local_height_threshold || p.z < -global_height_threshold) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ p.type = PointType::GROUND;
+}
+
+__forceinline__ __device__ void segmentBreakPoint(
+ ClassifiedPointType & p, const FilterParameters & param, float prev_cell_gnd_height_avg,
+ float prev_cell_gnd_radius_avg)
+{
+ if (p.z - prev_cell_gnd_height_avg > param.detection_range_z_max) {
+ p.type = PointType::OUT_OF_RANGE;
+ return;
+ }
+
+ float dz = p.z - prev_cell_gnd_height_avg;
+ float d_radius = p.radius - prev_cell_gnd_radius_avg;
+ float global_height_threshold = d_radius * param.global_slope_max_ratio;
+
+ p.type = (dz > global_height_threshold)
+ ? PointType::NON_GROUND
+ : ((dz < -global_height_threshold) ? PointType::OUT_OF_RANGE : PointType::GROUND);
+}
+
+__forceinline__ __device__ void segmentCell(
+ const int wid, // Index of the thread in the warp
+ ClassifiedPointType * classified_points, const FilterParameters & param, Cell & cell,
+ float slope, // Slope of the line connect the previous ground cells
+ int start_pid, int end_pid, // Start and end indices of points in the current cell
+ float prev_cell_gnd_radius_avg, float prev_cell_gnd_height_avg, const SegmentationMode & mode)
+{
+ if (start_pid >= end_pid) {
+ return;
+ }
+
+ // For computing the cell statistic
+ float t_gnd_radius_sum = 0;
+ float t_gnd_height_sum = 0;
+ float t_gnd_height_min = FLT_MAX;
+ int t_gnd_point_num = 0;
+ // For recheck
+ float minus_t_gnd_radius_sum = 0;
+ float minus_t_gnd_height_sum = 0;
+  int minus_t_gnd_point_num = 0;
+ ClassifiedPointType last_gnd_point, p;
+  int last_gnd_idx = -1;
+  int local_gnd_point_num = 0;
+
+ // Fit line from the recent ground cells
+ for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) {
+ p = classified_points[j];
+
+ switch (mode) {
+ case (SegmentationMode::UNINITIALIZED): {
+ segmentUninitializedPoint(p, param);
+ break;
+ }
+ case (SegmentationMode::CONTINUOUS): {
+ segmentContinuousPoint(p, param, slope, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ case (SegmentationMode::DISCONTINUOUS): {
+ segmentDiscontinuousPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ case (SegmentationMode::BREAK): {
+ segmentBreakPoint(p, param, prev_cell_gnd_height_avg, prev_cell_gnd_radius_avg);
+ break;
+ }
+ }
+
+ if (p.type == PointType::GROUND) {
+ t_gnd_radius_sum += p.radius;
+ t_gnd_height_sum += p.z;
+ ++t_gnd_point_num;
+ t_gnd_height_min = (t_gnd_height_min > p.z) ? p.z : t_gnd_height_min;
+ }
+
+ classified_points[j] = p;
+ }
+
+ // Wait for all threads in the warp to finish
+ local_gnd_point_num = t_gnd_point_num;
+ __syncwarp();
+
+  // Use a warp shuffle reduction to compute the cell statistics
+  // (ground height sum/min, radius sum, and point count)
+ for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
+ t_gnd_height_sum += __shfl_down_sync(FULL_MASK, t_gnd_height_sum, offset);
+ t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, t_gnd_radius_sum, offset);
+ t_gnd_point_num += __shfl_down_sync(FULL_MASK, t_gnd_point_num, offset);
+
+ float other_height_min = __shfl_down_sync(FULL_MASK, t_gnd_height_min, offset);
+ t_gnd_height_min = min(t_gnd_height_min, other_height_min);
+ }
+
+ // Now broadcast the min height and the number of ground points to all threads
+ float cell_gnd_height_min = __shfl_sync(FULL_MASK, t_gnd_height_min, 0);
+ int cell_gnd_point_num = __shfl_sync(FULL_MASK, t_gnd_point_num, 0);
+ float cell_gnd_radius_sum = __shfl_sync(FULL_MASK, t_gnd_radius_sum, 0);
+
+ if (
+ param.use_recheck_ground_cluster && cell_gnd_point_num > 1 &&
+ cell_gnd_radius_sum / (float)(cell_gnd_point_num) > param.recheck_start_distance) {
+ // Now recheck the points using the height_min
+ for (int j = start_pid + wid; j < end_pid; j += WARP_SIZE) {
+ auto p = classified_points[j];
+
+ if (
+ p.type == PointType::GROUND &&
+ p.z > cell_gnd_height_min + param.non_ground_height_threshold && cell_gnd_point_num > 1) {
+ last_gnd_point = p;
+ minus_t_gnd_height_sum += p.z;
+ minus_t_gnd_radius_sum += p.radius;
+ ++minus_t_gnd_point_num;
+ p.type = PointType::NON_GROUND;
+ last_gnd_idx = j;
+ }
+
+ classified_points[j] = p;
+ }
+
+ __syncwarp();
+
+ // Now use ballot sync to see if there are any ground points remaining
+ uint32_t recheck_res = __ballot_sync(FULL_MASK, local_gnd_point_num > minus_t_gnd_point_num);
+ uint32_t backup_res = __ballot_sync(FULL_MASK, local_gnd_point_num > 0);
+
+ // If no ground point remains, we have to keep the last ground point
+ if (recheck_res == 0 && backup_res > 0) {
+      // Get the index of the last (highest) lane that holds a demoted ground point
+      int last_tid = 31 - __clz(backup_res);
+
+      // Only the last ground point held by that thread is kept
+ if (wid == last_tid) {
+ // Keep the last point's stat
+ minus_t_gnd_height_sum -= last_gnd_point.z;
+ minus_t_gnd_radius_sum -= last_gnd_point.radius;
+ --minus_t_gnd_point_num;
+ classified_points[last_gnd_idx] = last_gnd_point;
+ }
+ }
+ __syncwarp();
+
+ // Final update
+ for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
+ minus_t_gnd_height_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_height_sum, offset);
+ minus_t_gnd_radius_sum += __shfl_down_sync(FULL_MASK, minus_t_gnd_radius_sum, offset);
+ minus_t_gnd_point_num += __shfl_down_sync(FULL_MASK, minus_t_gnd_point_num, offset);
+ }
+
+ if (wid == 0) {
+ t_gnd_height_sum -= minus_t_gnd_height_sum;
+ t_gnd_radius_sum -= minus_t_gnd_radius_sum;
+ t_gnd_point_num -= minus_t_gnd_point_num;
+ }
+ }
+
+  // Finally, thread 0 updates the cell statistics
+ if (wid == 0 && t_gnd_point_num > 0) {
+ cell.gnd_radius_avg = t_gnd_radius_sum / (float)(t_gnd_point_num);
+ cell.gnd_height_avg = t_gnd_height_sum / (float)(t_gnd_point_num);
+ cell.gnd_height_min = t_gnd_height_min;
+ cell.num_ground_points = t_gnd_point_num;
+ }
+
+ __syncwarp();
+ cell.num_ground_points = __shfl_sync(FULL_MASK, cell.num_ground_points, 0);
+}
+
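+// One warp scans one sector from the innermost to the outermost cell. A ring buffer in shared
+// memory keeps the most recent ground cells (up to gnd_cell_buffer_size); thread 0 maintains the
+// running sums and estimates the local ground slope with a least-squares fit,
+//   slope = (n * sum(xy) - sum(x) * sum(y)) / (n * sum(xx) - sum(x)^2),
+// falling back to the height/radius ratio of the oldest buffered cell when the denominator is
+// degenerate. The segmentation mode for each cell is chosen from the distance to the previous
+// ground cells and the number of empty cells in between.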
+__global__ void sectorProcessingKernel(
+ Cell * __restrict__ cell_list, ClassifiedPointType * __restrict__ classified_points,
+ int * starting_pid, FilterParameters param, int * empty_cell_mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ // Each warp handles one sector
+ int sector_stride = (blockDim.x * gridDim.x) / WARP_SIZE;
+ int wid = index % WARP_SIZE; // Index of the thread within the warp
+ // Shared memory to backup the cell data
+ extern __shared__ Cell shared_buffer[];
+ Cell * cell_queue = shared_buffer + (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size;
+ int * cell_id_queue =
+ (int *)(shared_buffer + param.gnd_cell_buffer_size * (blockDim.x / WARP_SIZE)) +
+ (threadIdx.x / WARP_SIZE) * param.gnd_cell_buffer_size;
+
+ // Loop on sectors
+ for (int sector_id = index / WARP_SIZE; sector_id < param.num_sectors;
+ sector_id += sector_stride) {
+ // For storing the previous ground cells
+ int closest_gnd_cell_id, furthest_gnd_cell_id, num_latest_gnd_cells = 0;
+ int head = 0, tail = 0;
+ float sum_x, sum_y, sum_xx, sum_xy, slope;
+ float prev_gnd_radius_avg, prev_gnd_height_avg;
+ int empty_cell_num = 0;
+
+ // Initially no ground cell is identified
+ closest_gnd_cell_id = furthest_gnd_cell_id = -1;
+ sum_x = sum_y = sum_xx = sum_xy = slope = 0.0;
+
+ // Loop on the cells in a sector
+ for (int i = 0; i < param.max_num_cells_per_sector; ++i) {
+ if (num_latest_gnd_cells > 0) {
+ furthest_gnd_cell_id = cell_id_queue[head];
+ empty_cell_num = empty_cell_mark[i * param.num_sectors + sector_id] -
+ empty_cell_mark[furthest_gnd_cell_id * param.num_sectors + sector_id];
+ }
+
+ int gid = i * param.num_sectors + sector_id;
+ int sid = starting_pid[gid], eid = starting_pid[gid + 1];
+ Cell cell;
+
+ // Skip empty cells
+ if (sid >= eid) {
+ continue;
+ }
+
+ auto mode = checkSegmentationMode(
+ i, closest_gnd_cell_id, furthest_gnd_cell_id, param.gnd_cell_buffer_size,
+ param.gnd_grid_continual_thresh, empty_cell_num);
+
+ // Classify the points in the cell
+ segmentCell(
+ wid, classified_points, param, cell, slope, sid, eid, prev_gnd_radius_avg,
+ prev_gnd_height_avg, mode);
+
+ // Update the indices of the previous ground cells if the cell contains ground points
+ if (cell.num_ground_points > 0) {
+ if (num_latest_gnd_cells >= param.gnd_cell_buffer_size) {
+ if (wid == 0) {
+            // If the number of previous ground cells reaches the maximum,
+            // remove the cell at the queue head from the running sums
+ Cell head_cell = cell_queue[head];
+ sum_x -= head_cell.gnd_radius_avg;
+ sum_y -= head_cell.gnd_height_avg;
+ sum_xx -= head_cell.gnd_radius_avg * head_cell.gnd_radius_avg;
+ sum_xy -= head_cell.gnd_radius_avg * head_cell.gnd_height_avg;
+ }
+
+ // Now remove the entry at the head of the queue
+ head = (head + 1) % param.gnd_cell_buffer_size;
+ --num_latest_gnd_cells;
+ }
+
+ ++num_latest_gnd_cells;
+
+ if (wid == 0) {
+ // Add the new cell to the queue
+ cell_queue[tail] = cell;
+ cell_id_queue[tail] = i;
+
+ // Update the stats and estimate the local slope
+ sum_x += cell.gnd_radius_avg;
+ sum_y += cell.gnd_height_avg;
+ sum_xx += cell.gnd_radius_avg * cell.gnd_radius_avg;
+ sum_xy += cell.gnd_radius_avg * cell.gnd_height_avg;
+
+ float denom = (num_latest_gnd_cells * sum_xx - sum_x * sum_x);
+
+ if (fabsf(denom) < 1e-6f) {
+ Cell head_cell = cell_queue[head];
+ slope = head_cell.gnd_height_avg / head_cell.gnd_radius_avg;
+ } else {
+ slope = (num_latest_gnd_cells * sum_xy - sum_x * sum_y) / denom;
+          slope = fmaxf(fminf(slope, param.global_slope_max_ratio), -param.global_slope_max_ratio);
+ }
+
+ // Write the cell to the global memory
+ cell_list[gid] = cell;
+ }
+
+ // Wait for the thread 0 to finish its work
+ __syncwarp();
+        // Advance the queue tail past the newly inserted cell
+ tail = (tail + 1) % param.gnd_cell_buffer_size;
+ // Distribute the new slope to all threads in the warp
+ slope = __shfl_sync(FULL_MASK, slope, 0);
+ prev_gnd_radius_avg = __shfl_sync(FULL_MASK, cell.gnd_radius_avg, 0);
+ prev_gnd_height_avg = __shfl_sync(FULL_MASK, cell.gnd_height_avg, 0);
+ closest_gnd_cell_id = i;
+ }
+ }
+ }
+}
+
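+// Mark each cell that received no points (its start index equals the next cell's start index).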
+__global__ void markEmptyCells(int * starting_pid, int cell_num, int * mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < cell_num; i += stride) {
+ mark[i] = (starting_pid[i] >= starting_pid[i + 1]) ? 1 : 0;
+ }
+}
+
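+// For every sector, replace the empty-cell marks with an exclusive prefix sum along the radial
+// direction, so the number of empty cells between two cells can be read as a difference.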
+__global__ void prefixSumEmptyCells(int * mark, int sector_num, int cell_per_sector_num)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < sector_num; i += stride) {
+ int sum = 0;
+
+ for (int j = 0, cell_global_id = i; j < cell_per_sector_num;
+ ++j, cell_global_id += sector_num) {
+ int val_j = mark[cell_global_id];
+
+ mark[cell_global_id] = sum;
+ sum += val_j;
+ }
+ }
+}
+
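+// Bin the input points into (sector, radial cell) bins with a counting-sort style pass:
+// computeCellId assigns each point a cell id and accumulates per-cell counts, an exclusive scan
+// turns the counts into starting indices, and distributePointsToCell scatters the classified
+// points into their cells. Afterwards the empty-cell marks and their per-sector prefix sums are
+// prepared for the sector-scan kernel.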
+void CudaScanGroundSegmentationFilter::sort_points(
+  device_vector<Cell> & cell_list, device_vector<int> & starting_pid,
+  device_vector<ClassifiedPointType> & classified_points)
+{
+ int point_num = dev_input_points_->height * dev_input_points_->width;
+
+ if (point_num == 0 || filter_parameters_.max_num_cells == 0) {
+ return;
+ }
+
+ int cell_num = filter_parameters_.max_num_cells;
+
+ cell_list.resize(cell_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(cell_list.size()), 0, stream_->get(), cellInit, cell_list.data(),
+ (int)(cell_list.size())));
+
+ starting_pid.resize(cell_num + 1);
+
+  device_vector<int> cell_id(point_num, stream_, mempool_);
+  device_vector<ClassifiedPointType> tmp_classified_points(point_num, stream_, mempool_);
+
+ CHECK_CUDA_ERROR(cuda::fill(starting_pid, 0));
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), computeCellId,
+      reinterpret_cast<const PointTypeStruct *>(dev_input_points_->data.get()),
+ (int)(dev_input_points_->height * dev_input_points_->width), filter_parameters_,
+ cell_id.data(), starting_pid.data(), tmp_classified_points.data()));
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(starting_pid));
+
+  device_vector<int> writing_loc(starting_pid, stream_, mempool_);
+
+ classified_points.resize(point_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), distributePointsToCell, tmp_classified_points.data(),
+ classified_points.data(), cell_id.data(), point_num, writing_loc.data()));
+
+  // Compute the number of empty cells between every pair of consecutive non-empty cells
+ empty_cell_mark_->resize(cell_num);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ cell_num, 0, stream_->get(), markEmptyCells, starting_pid.data(), cell_num,
+ empty_cell_mark_->data()));
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(filter_parameters_.num_sectors), 0, stream_->get(), prefixSumEmptyCells,
+ empty_cell_mark_->data(), (int)filter_parameters_.num_sectors,
+ (int)filter_parameters_.max_num_cells_per_sector));
+}
+
+// ============ Scan per sector to get ground reference and Non-Ground points =============
+void CudaScanGroundSegmentationFilter::scanPerSectorGroundReference(
+  device_vector<Cell> & cell_list, device_vector<int> & starting_pid,
+  device_vector<ClassifiedPointType> & classified_points)
+{
+ const uint32_t num_sectors = filter_parameters_.num_sectors;
+ if (num_sectors == 0) {
+ return;
+ }
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ (int)(num_sectors * WARP_SIZE),
+ (BLOCK_SIZE_X / WARP_SIZE) * (sizeof(Cell) + sizeof(int)) *
+ filter_parameters_.gnd_cell_buffer_size,
+ stream_->get(), sectorProcessingKernel, cell_list.data(), classified_points.data(),
+ starting_pid.data(), filter_parameters_, empty_cell_mark_->data()));
+}
+
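+// Predicate functors used to select which classified points are copied to the output clouds.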
+struct NonGroundChecker
+{
+ CUDAH bool operator()(const ClassifiedPointType & p) const
+ {
+ return (p.type != PointType::GROUND);
+ }
+};
+
+struct GroundChecker
+{
+ CUDAH bool operator()(const ClassifiedPointType & p) const
+ {
+ return (p.type == PointType::GROUND);
+ }
+};
+
+template <typename CheckerType>
+__global__ void markingPoints(
+ ClassifiedPointType * classified_points, int point_num, int * mark, CheckerType checker)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = classified_points[i];
+
+ mark[p.origin_index] = (checker(p)) ? 1 : 0;
+ }
+}
+
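+// Stream compaction: a point is copied to the slot given by the exclusive-scan output whenever
+// the scan value increases at its index, i.e. whenever the point was marked for extraction.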
+__global__ void extract(
+ const PointTypeStruct * __restrict__ dev_input_points, int point_num, int * writing_loc,
+ PointTypeStruct * __restrict__ dev_output_points)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int wloc = writing_loc[i];
+
+ if (wloc < writing_loc[i + 1]) {
+ dev_output_points[wloc] = dev_input_points[i];
+ }
+ }
+}
+
+// ============= Extract points by classification =============
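+// Mark the points selected by CheckerType, exclusive-scan the marks to obtain output slots, and
+// compact the matching points of the (outlier-filtered) input cloud into the output cloud.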
+template <typename CheckerType>
+void CudaScanGroundSegmentationFilter::extractPoints(
+  device_vector<ClassifiedPointType> & classified_points,
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & output)
+{
+ int point_num = dev_input_points_->height * dev_input_points_->width;
+  device_vector<int> point_mark(point_num + 1, stream_, mempool_);
+
+  // Mark the points selected by the checker
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+      point_num, 0, stream_->get(), markingPoints<CheckerType>, classified_points.data(),
+      point_num, point_mark.data(), CheckerType()));
+
+ // Exclusive scan
+  device_vector<int> writing_loc(stream_, mempool_);
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(point_mark, writing_loc));
+
+ // Reserve the output
+ int output_size = writing_loc[point_num];
+
+ if (output_size <= 0) {
+ return;
+ }
+
+ cuda::copyPointCloud2Metadata(output, input);
+
+  output.data = cuda_blackboard::make_unique<std::uint8_t[]>(output_size * output.point_step);
+ output.height = 1;
+ output.width = output_size;
+
+ // Get the points
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), extract,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+      (int)(input.height * input.width), writing_loc.data(),
+      reinterpret_cast<PointTypeStruct *>(output.data.get())));
+}
+
+void CudaScanGroundSegmentationFilter::classifyPointCloud(
+ const cuda_blackboard::CudaPointCloud2 & input, cuda_blackboard::CudaPointCloud2 & ground,
+ cuda_blackboard::CudaPointCloud2 & non_ground)
+{
+  device_vector<Cell> cell_list(stream_, mempool_);
+  device_vector<int> starting_pid(stream_, mempool_);
+  device_vector<ClassifiedPointType> classified_points(stream_, mempool_);
+
+ removeOutliers(input);
+ sort_points(cell_list, starting_pid, classified_points);
+ scanPerSectorGroundReference(cell_list, starting_pid, classified_points);
+
+  // Extract non-ground points
+  extractPoints<NonGroundChecker>(classified_points, *dev_input_points_, non_ground);
+
+  // Extract ground points
+  extractPoints<GroundChecker>(classified_points, *dev_input_points_, ground);
+}
+
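+// Crop-box style outlier marking: a point is kept only if it lies inside the configured
+// [min, max] bounds on x, y, and z. The max_radius argument is currently unused here.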
+__global__ void markNonOutliers(
+ const PointTypeStruct * __restrict__ cloud, int point_num, FilterParameters param,
+ float max_radius, int * __restrict__ mark)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ auto p = cloud[i];
+ int val = (p.x < param.min_x || p.x > param.max_x || p.y < param.min_y || p.y > param.max_y ||
+ p.z < param.min_z || p.z > param.max_z)
+ ? 0
+ : 1;
+ mark[i] = val;
+ }
+}
+
+__global__ void getValidPoints(
+ const PointTypeStruct * __restrict__ cloud, int point_num, int * __restrict__ writing_loc,
+ PointTypeStruct * __restrict__ out_cloud)
+{
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int stride = blockDim.x * gridDim.x;
+
+ for (int i = index; i < point_num; i += stride) {
+ int wloc = writing_loc[i];
+
+ if (wloc < writing_loc[i + 1]) {
+ out_cloud[wloc] = cloud[i];
+ }
+ }
+}
+
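+// Remove out-of-bounds points before segmentation: mark the in-bounds points, exclusive-scan the
+// marks, allocate dev_input_points_ for the surviving points, and compact them with
+// getValidPoints. The result becomes the working cloud for the rest of the pipeline.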
+void CudaScanGroundSegmentationFilter::removeOutliers(
+ const cuda_blackboard::CudaPointCloud2 & input)
+{
+ float max_radius = filter_parameters_.max_radius;
+ int point_num = input.width * input.height;
+
+ if (point_num <= 0) {
+ return;
+ }
+
+  device_vector<int> mark(point_num + 1, stream_, mempool_);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), markNonOutliers,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+ (int)(input.height * input.width), filter_parameters_, max_radius, mark.data()));
+
+ CHECK_CUDA_ERROR(cuda::ExclusiveScan(mark));
+
+ int remain_size = mark[point_num];
+
+ cuda::copyPointCloud2Metadata(*dev_input_points_, input);
+
+ dev_input_points_->height = 1;
+ dev_input_points_->width = remain_size;
+  dev_input_points_->data =
+    cuda_blackboard::make_unique<std::uint8_t[]>(remain_size * dev_input_points_->point_step);
+
+ CHECK_CUDA_ERROR(
+ cuda::launchAsync(
+ point_num, 0, stream_->get(), getValidPoints,
+      reinterpret_cast<const PointTypeStruct *>(input.data.get()),
+      (int)(input.height * input.width), mark.data(),
+      reinterpret_cast<PointTypeStruct *>(dev_input_points_->data.get())));
+}
+
+} // namespace autoware::cuda_ground_segmentation
diff --git a/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
new file mode 100644
index 0000000000000..e1f86ae6665e5
--- /dev/null
+++ b/perception/autoware_ground_segmentation_cuda/src/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.cpp
@@ -0,0 +1,158 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/cuda_scan_ground_segmentation/cuda_scan_ground_segmentation_filter_node.hpp"
+
+#include "autoware/pointcloud_preprocessor/utility/memory.hpp"
+
+#include <memory>
+
+namespace autoware::cuda_ground_segmentation
+{
+
+using autoware_utils::deg2rad;
+CudaScanGroundSegmentationFilterNode::CudaScanGroundSegmentationFilterNode(
+ const rclcpp::NodeOptions & options)
+: Node("cuda_scan_ground_segmentation_filter_node", options)
+{
+  // Declare processing time debug tools
+ {
+ using autoware_utils::DebugPublisher;
+ using autoware_utils::StopWatch;
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ptr_ = std::make_unique<DebugPublisher>(this, "cuda_scan_ground_filter");
+ stop_watch_ptr_->tic("cyclic_time");
+ stop_watch_ptr_->tic("processing_time");
+ }
+ // Declare parameters
+ FilterParameters filter_parameters;
+ // global parameters
+  filter_parameters.max_x = static_cast<float>(declare_parameter<double>("max_x"));
+  filter_parameters.min_x = static_cast<float>(declare_parameter<double>("min_x"));
+  filter_parameters.max_y = static_cast<float>(declare_parameter<double>("max_y"));
+  filter_parameters.min_y = static_cast<float>(declare_parameter<double>("min_y"));
+  filter_parameters.max_z = static_cast<float>(declare_parameter<double>("max_z"));
+  filter_parameters.min_z = static_cast<float>(declare_parameter<double>("min_z"));
+  filter_parameters.center_x =
+    static_cast<float>(declare_parameter<double>("center_pcl_shift"));  // default 0.0
+
+ filter_parameters.max_radius = std::max(
+ std::max(
+ std::hypot(filter_parameters.max_x, filter_parameters.max_y),
+ std::hypot(filter_parameters.min_x, filter_parameters.min_y)),
+ std::max(
+ std::hypot(filter_parameters.max_x, filter_parameters.min_y),
+ std::hypot(filter_parameters.min_x, filter_parameters.max_y)));
+
+ // common parameters
+  filter_parameters.sector_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("sector_angle_deg")));
+  filter_parameters.inv_sector_angle_rad = 1.0f / filter_parameters.sector_angle_rad;
+  filter_parameters.num_sectors =
+    static_cast<int>(std::ceil(2.0 * M_PI * filter_parameters.inv_sector_angle_rad));
+
+ // common thresholds
+  filter_parameters.global_slope_max_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("global_slope_max_angle_deg")));
+  filter_parameters.local_slope_max_angle_rad =
+    static_cast<float>(deg2rad(declare_parameter<double>("local_slope_max_angle_deg")));
+ filter_parameters.global_slope_max_ratio = std::tan(filter_parameters.global_slope_max_angle_rad);
+ filter_parameters.local_slope_max_ratio = std::tan(filter_parameters.local_slope_max_angle_rad);
+
+ // cell mode parameters
+  filter_parameters.use_recheck_ground_cluster =
+    static_cast<bool>(declare_parameter<bool>("use_recheck_ground_cluster"));
+  filter_parameters.recheck_start_distance =
+    static_cast<float>(declare_parameter<double>("recheck_start_distance"));
+  filter_parameters.detection_range_z_max =
+    static_cast<float>(declare_parameter<double>("detection_range_z_max"));
+  filter_parameters.non_ground_height_threshold =
+    static_cast<float>(declare_parameter<double>("non_ground_height_threshold"));
+
+ // cell parameters
+  filter_parameters.cell_divider_size_m =
+    static_cast<float>(declare_parameter<double>("grid_size_m"));
+  filter_parameters.max_num_cells_per_sector =
+    static_cast<int>(filter_parameters.max_radius / filter_parameters.cell_divider_size_m);
+  filter_parameters.max_num_cells = static_cast<int>(
+    filter_parameters.max_num_cells_per_sector * filter_parameters.num_sectors);
+  filter_parameters.gnd_cell_buffer_size =
+    static_cast<int>(declare_parameter<int>("gnd_cell_buffer_size"));
+
+  int64_t max_mem_pool_size_in_byte =
+    declare_parameter("max_mem_pool_size_in_byte", static_cast<int64_t>(1e9));  // 1 GB
+
+  // Initialize CUDA blackboard subscriber
+  sub_ =
+    std::make_shared<cuda_blackboard::CudaBlackboardSubscriber<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/input/pointcloud",
+      std::bind(
+        &CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback, this,
+        std::placeholders::_1));
+
+  // Initialize CUDA blackboard publishers
+  pub_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/pointcloud");
+
+  pub_gnd_ =
+    std::make_unique<cuda_blackboard::CudaBlackboardPublisher<cuda_blackboard::CudaPointCloud2>>(
+      *this, "~/output/ground_pointcloud");
+
+  cuda_ground_segmentation_filter_ = std::make_unique<CudaScanGroundSegmentationFilter>(
+    filter_parameters, max_mem_pool_size_in_byte);
+}
+
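+// Callback for each incoming CUDA point cloud: run the ground segmentation on the GPU and
+// publish the non-ground cloud on ~/output/pointcloud and the ground cloud on
+// ~/output/ground_pointcloud, together with processing-time debug values.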
+void CudaScanGroundSegmentationFilterNode::cudaPointCloudCallback(
+ const cuda_blackboard::CudaPointCloud2::ConstSharedPtr & msg)
+{
+ // start time measurement
+ if (stop_watch_ptr_) {
+ stop_watch_ptr_->tic("processing_time");
+ }
+
+  auto non_ground_unique = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+  auto ground_unique = std::make_unique<cuda_blackboard::CudaPointCloud2>();
+
+  // Run the GPU segmentation directly into the output clouds
+  cuda_ground_segmentation_filter_->classifyPointCloud(*msg, *ground_unique, *non_ground_unique);
+
+  // Publish the segmented clouds
+ pub_->publish(std::move(non_ground_unique));
+ pub_gnd_->publish(std::move(ground_unique));
+
+ // end time measurement
+ if (debug_publisher_ptr_ && stop_watch_ptr_) {
+ stop_watch_ptr_->toc("processing_time");
+ stop_watch_ptr_->toc("cyclic_time");
+ const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true);
+ const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+ }
+}
+
+} // namespace autoware::cuda_ground_segmentation
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(
+ autoware::cuda_ground_segmentation::CudaScanGroundSegmentationFilterNode)