Clean up ncu_trace and ncu metrics

xuzhao9 · facebook-github-bot · commit 6aa44c2eafa8 · 2025-07-11T12:18:05.000-07:00
Summary: Remove `ncu_trace` metric as we have decided to integrate NCU metric analysis within the framework. Now we always save the replay file, analyze the replay file, and aggregate the metrics.

Reviewed By: FindHao

Differential Revision: D78164136

fbshipit-source-id: 399106cb92fabd2e708e94168e1fde85ef17c627
diff --git a/tritonbench/components/ncu/ncu_analyzer.py b/tritonbench/components/ncu/ncu_analyzer.py
@@ -59,7 +59,29 @@
 }
 
 
-def import_ncu_python_path():
+def get_ncu_metrics(metrics: List[str]) -> List[str]:
+    """
+    This function returns a list of all the NCU metrics used in the benchmark.
+
+    Returns:
+        list: A list of all the NCU metrics used in the benchmark.
+    """
+    ncu_metrics = []
+    for (
+        bench_metric,
+        short_ncu_metrics,
+    ) in bench_metric_to_short_ncu_metric.items():
+        # Only process metrics that are required
+        if bench_metric in metrics:
+            # For each short metric name in the list of metrics for this benchmark metric
+            for short_ncu_metric in short_ncu_metrics:
+                # Get the full NCU metric name and add it to our list
+                full_metric_name = short_ncu_metric_name[short_ncu_metric]
+                ncu_metrics.append(full_metric_name)
+    return ncu_metrics
+
+
+def _import_ncu_python_path():
     """
     This function modifies the Python path to include the NVIDIA Nsight Compute (NCU) Python modules.
     It searches for the 'ncu' command in the system PATH, determines its location, and appends the
@@ -153,7 +175,7 @@ def read_ncu_report(report_path: str, required_metrics: List[str]):
     assert os.path.exists(
         report_path
     ), f"The NCU report at {report_path} does not exist."
-    import_ncu_python_path()
+    _import_ncu_python_path()
     import ncu_report
 
     # save all kernels' metrics. {metric_name: [kernel1_metric_value, kernel2_metric_value, ...]}
diff --git a/tritonbench/components/ncu/nsys_analyzer.py b/tritonbench/components/ncu/nsys_analyzer.py
@@ -20,6 +20,14 @@
 }
 
 
+def get_nsys_metrics(metrics: List[str]) -> List[str]:
+    nsys_metrics = []
+    for metric_name in nsys_metrics_to_reports.keys():
+        if metric_name in metrics:
+            nsys_metrics.append(metric_name)
+    return nsys_metrics
+
+
 def read_nsys_report(
     report_path: str, required_metrics: List[str]
 ) -> Dict[str, List[float]]:
diff --git a/tritonbench/utils/triton_op.py b/tritonbench/utils/triton_op.py
@@ -228,8 +228,6 @@ class BenchmarkOperatorMetrics:
     compile_trace: Optional[str] = None
     # att trace directory
     att_trace: Optional[str] = None
-    # ncu trace file
-    ncu_trace: Optional[str] = None
     # ncu replay file
     ncu_rep: Optional[str] = None
     # ncu replay file with TTGIR line numbers
@@ -1227,53 +1225,41 @@ def _init_extra_metrics() -> Dict[str, Any]:
                 metrics.compile_trace = self.compile_time(
                     input_id, fn_name, metrics, kineto_trace=True
                 )
-            # Collect NCU metrics if any required metrics match the ncu analyzer
-            # metrics. Only profile with the necessary metrics to avoid excessive
-            # overhead.
             if not is_hip():
-                if "ncu_trace" in self.required_metrics:
-                    metrics.ncu_trace = self.ncu_trace(input_id, fn_name)
-                ncu_metrics = []
-                for (
-                    bench_metric,
-                    short_ncu_metrics,
-                ) in ncu_analyzer.bench_metric_to_short_ncu_metric.items():
-                    # Only process metrics that are required
-                    if bench_metric in self.required_metrics:
-                        # For each short metric name in the list of metrics for this benchmark metric
-                        for short_ncu_metric in short_ncu_metrics:
-                            # Get the full NCU metric name and add it to our list
-                            full_metric_name = ncu_analyzer.short_ncu_metric_name[
-                                short_ncu_metric
-                            ]
-                            ncu_metrics.append(full_metric_name)
-                extend_ncu_args = (
-                    ["--metrics", ",".join(ncu_metrics)] if ncu_metrics else None
+                # ncu metrics (ncu_rep, ncu_rep_ir, or ncu_analyzer metrics)
+                ncu_metrics: List[str] = ncu_analyzer.get_ncu_metrics(
+                    self.required_metrics
                 )
-                if ncu_metrics or "ncu_rep" in self.required_metrics:
-                    metrics.ncu_rep = self.ncu_trace(
-                        input_id, fn_name, replay=True, extend_ncu_args=extend_ncu_args
+                if (
+                    ncu_metrics
+                    or "ncu_rep" in self.required_metrics
+                    or "ncu_rep_ir" in self.required_metrics
+                ):
+                    profile_ir = "ncu_rep_ir" in self.required_metrics
+                    out = self.ncu_trace(
+                        input_id,
+                        fn_name,
+                        replay=True,
+                        extend_ncu_args=ncu_metrics,
+                        profile_ir=profile_ir,
                     )
                 # Read and update NCU metrics if any required metrics match the NCU metrics
                 if ncu_metrics:
                     ncu_analyzer_results = ncu_analyzer.read_ncu_report(
-                        metrics.ncu_rep, self.required_metrics
+                        out, self.required_metrics
                     )
                     for metric_name, metric_value in ncu_analyzer_results.items():
                         metrics.extra_metrics[metric_name] = metric_value
                     if "arithmetic_intensity" in self.required_metrics:
                         logger.warning(
                             "Arithmetic intensity only supports FP32 and FP64 for now."
                         )
+                if "ncu_rep" in self.required_metrics:
+                    metrics.ncu_rep = out
                 if "ncu_rep_ir" in self.required_metrics:
-                    metrics.ncu_rep_ir = self.ncu_trace(
-                        input_id, fn_name, replay=True, profile_ir=True
-                    )
-                nsys_metrics = []
-                for metric_name in nsys_analyzer.nsys_metrics_to_reports.keys():
-                    if metric_name in self.required_metrics:
-                        nsys_metrics.append(metric_name)
-
+                    metrics.ncu_rep_ir = out
+                # nsys metrics
+                nsys_metrics = nsys_analyzer.get_nsys_metrics(self.required_metrics)
                 if "nsys_rep" in self.required_metrics or nsys_metrics:
                     nsys_rep_path = self.nsys_rep(input_id, fn_name)
                     metrics.nsys_rep = nsys_rep_path
@@ -1602,10 +1588,14 @@ def ncu_trace(
         profile_ir=False,
         extend_ncu_args: List[str] = None,
     ) -> str:
-        extend_ncu_args = extend_ncu_args or [
-            "--set",
-            "full",
-        ]
+        extend_ncu_args = (
+            ["--metrics", ",".join(extend_ncu_args)]
+            if extend_ncu_args
+            else [
+                "--set",
+                "full",
+            ]
+        )
         op_task_args = self._get_op_task_args(input_id, fn_name, "_ncu_trace_in_task")
         # Disable DCGM
         disable_dyno_dcgm = [