Commit 3ca7041

Fix remaining flake8 and lint in titan repo (#1382)
`pre-commit run --all-files` is failing due to a few files. This cleans them up so the entire repo passes lint checks.
1 parent 5375abb commit 3ca7041

6 files changed: +24 -42 lines

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ We actively welcome your pull requests.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
-5. Make sure your code lints (`pre-commit run --files $(git diff --name-only HEAD~1)`).
+5. Make sure your code lints (`pre-commit run --all-files`).
6. If you haven't already, complete the Contributor License Agreement ("CLA").

### Contributor License Agreement ("CLA")

torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/fast_debug_ao.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
# pyre-unsafe
import logging

-import numpy as np
import torch

from reference_utils import (

torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/mg_grouped_gemm.py

Lines changed: 7 additions & 11 deletions
@@ -8,26 +8,21 @@
# https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm

# pyre-unsafe
-import functools
import logging

import os
import sys
-from typing import Any, Dict, Optional, Tuple
+from typing import Tuple

import torch

import triton
import triton.language as tl
-from triton import Config as TConfig
-
-from triton.runtime import driver  # @manual

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from tma_autotuning import (
    _NV_CONFIGS,
-    ALIGN_SIZE_M,
    CudaUtils,
    early_config_prune,
    TmaDescriptorHelper,
@@ -727,6 +722,7 @@ def grouped_gemm_forward(
    w: torch.Tensor,
    m_sizes: torch.Tensor,
    tma_size: int = 128,
+    using_fp8: bool = False,
) -> torch.Tensor:
    """
    M*G style grouped GEMM with TMA and Float8 support.
@@ -892,7 +888,7 @@ def grouped_gemm_backward(

    # Compute grad_x using flat linear implementation
    try:
-        logging.info(f"Computing grad_x with flat linear kernel")
+        logging.info("Computing grad_x with flat linear kernel")

        # Use TMA-optimized implementation
        grad_x = grouped_gemm_dx_tma(
@@ -909,7 +905,7 @@ def grouped_gemm_backward(

    # Compute grad_w using flat linear style implementation
    try:
-        logging.info(f"Computing grad_w with flat linear kernel")
+        logging.info("Computing grad_w with flat linear kernel")

        grad_w = grouped_gemm_dw_tma(
            x, grad_output, m_sizes, num_sms=NUM_SMS, tma_size=tma_size
@@ -1203,14 +1199,14 @@ def grid(META):
# ======== PyTorch wrapper functions ========


-class GroupedGEMM_mg(torch.autograd.Function):
+class GroupedGemmMg(torch.autograd.Function):
    """
    Autograd function for GroupedGEMM with M*G grouping.
    Supports both standard and FP8 quantized operations.
    """

    @staticmethod
-    def forward(ctx, x, w, m_sizes, use_tma=True, tma_size=128):
+    def forward(ctx, x, w, m_sizes, use_tma=True, tma_size=128, using_fp8=False):
        """
        Forward pass of GroupedGEMM.

@@ -1301,4 +1297,4 @@ def mg_grouped_gemm(
    Returns:
        Output tensor, shape [M_total, N]
    """
-    return GroupedGEMM_mg.apply(x, w, m_sizes, use_tma, tma_size, using_fp8)
+    return GroupedGemmMg.apply(x, w, m_sizes, use_tma, tma_size, using_fp8)
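The only functional change above is the new `using_fp8` flag threaded through `grouped_gemm_forward`, `GroupedGemmMg.forward`, and the `mg_grouped_gemm` wrapper; the rest is import pruning, f-string fixes, and the CamelCase rename. As a rough sketch of how the public entry point might be called after this commit (the `[N * G, K]` weight layout, the group-major row ordering of `x`, and the keyword defaults are assumptions, not taken from this diff):

```python
# Hedged sketch, not taken from the repo: shapes and the [N * G, K] weight
# layout are assumptions; only the signature and the GroupedGemmMg.apply(...)
# routing are visible in the diff above.
import torch

from mg_grouped_gemm import mg_grouped_gemm

G, N, K = 4, 1024, 2048
# One row count per group; rows of x are assumed to be laid out group-by-group.
m_sizes = torch.tensor([512, 512, 256, 256], dtype=torch.int32, device="cuda")
M_total = int(m_sizes.sum())

x = torch.randn(M_total, K, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(N * G, K, device="cuda", dtype=torch.bfloat16, requires_grad=True)

# Internally this routes through GroupedGemmMg.apply(x, w, m_sizes, use_tma,
# tma_size, using_fp8), so leaving using_fp8=False keeps the BF16 path.
out = mg_grouped_gemm(x, w, m_sizes, use_tma=True, tma_size=128, using_fp8=False)
assert out.shape == (M_total, N)  # docstring: output is [M_total, N]
out.sum().backward()
```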

torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py

Lines changed: 6 additions & 9 deletions
@@ -8,17 +8,15 @@
# https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm

# pyre-unsafe
-import functools

import os
import sys
-from typing import Any, Dict, Optional, Tuple
+from typing import Dict

import torch

import triton
import triton.language as tl
-from triton import Config as TConfig

from triton.runtime import driver  # @manual

@@ -54,7 +52,11 @@ def get_num_sms() -> int:


class TmaDescriptorHelper:
-    """Helper class for managing TMA descriptors in Triton kernels."""
+    """Helper class for managing TMA descriptors in Triton kernels.
+
+    Args:
+        tma_size: Size of the TMA descriptor in bytes
+    """

    class KernelParamWrapper:
        """Wrapper to implement the TmaDescKernelParam interface."""
@@ -67,11 +69,6 @@ def tma_desc_cpu_ptr(self) -> int:
            return self.desc.data_ptr()

    def __init__(self, tma_size: int = 128):
-        """Initialize the TMA descriptor helper.
-
-        Args:
-            tma_size: Size of the TMA descriptor in bytes
-        """
        if not CudaUtils.verify_tma():
            raise RuntimeError(
                "TMA not supported on this device (requires Hopper or newer)"

torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_backwards.py

Lines changed: 7 additions & 15 deletions
@@ -5,20 +5,12 @@
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
-import logging
import unittest
from typing import Tuple

import torch
-import torch.nn as nn
-
-from mg_grouped_gemm import (
-    grouped_gemm_backward,
-    grouped_gemm_dw_tma,
-    grouped_gemm_dx_tma,
-    grouped_gemm_forward,
-    mg_grouped_gemm,
-)
+
+from mg_grouped_gemm import grouped_gemm_backward, grouped_gemm_forward

from reference_utils import (
    analyze_tensor_differences,
@@ -27,7 +19,7 @@
)


-class TestMG_GroupedGEMM_Backward(unittest.TestCase):
+class TestMgGroupedGemmBackward(unittest.TestCase):
    def setUp(self) -> None:
        torch.manual_seed(2020)  # Set seed for reproducibility

@@ -81,7 +73,7 @@ def _run_grouped_gemm_backward_test(
        self.assertTrue(grad_a_close)
        self.assertTrue(grad_b_close)

-    def test_MG_grouped_gemm_backward_bf16(self) -> None:
+    def test_mg_grouped_gemm_backward_bf16(self) -> None:
        for G in (1, 8, 16):
            for M in (512, 1024):
                print(f"Testing BF16 M*G GroupGeMM Backward with G={G}, M={M}")
@@ -93,7 +85,7 @@ def test_MG_grouped_gemm_backward_bf16(self) -> None:
                rtol=1e-2,
            )

-    def test_MG_grouped_gemm_backward_deepseek_shapes(self) -> None:
+    def test_mg_grouped_gemm_backward_deepseek_shapes(self) -> None:
        """Test backward pass with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
@@ -113,7 +105,7 @@ def test_MG_grouped_gemm_backward_deepseek_shapes(self) -> None:
                shape, device, dtype=torch.float16, atol=1e-2, rtol=1e-2
            )

-    def test_MG_dx(self) -> None:
+    def test_mg_dx(self) -> None:
        """Test specifically the dx (gradient w.r.t. input) computation."""
        G, M, N, K = 4, 512, 1024, 2048
        device = torch.device("cuda")
@@ -143,7 +135,7 @@ def test_MG_dx(self) -> None:
        dx_close = analyze_tensor_differences(grad_a, expected_grad_a, "grad_a (dx)")
        self.assertTrue(dx_close)

-    def test_MG_dw(self) -> None:
+    def test_mg_dw(self) -> None:
        """Test specifically the dw (gradient w.r.t. weights) computation."""
        G, M, N, K = 4, 512, 1024, 2048
        device = torch.device("cuda")
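The renames above are purely stylistic (CamelCase class, snake_case methods); unittest discovery is unaffected because `TestLoader` matches methods on the lowercase `test` prefix. A placeholder sketch of the pattern, with a trivial body standing in for the real `grouped_gemm_forward`/`grouped_gemm_backward` checks:

```python
# Placeholder sketch: snake_case test_* methods are still collected by
# unittest discovery; the real tests exercise the grouped GEMM kernels here.
import unittest


class TestNamingConvention(unittest.TestCase):
    def test_mg_style_snake_case_method(self) -> None:
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```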

torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py

Lines changed: 3 additions & 5 deletions
@@ -5,17 +5,15 @@
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
-import logging
import unittest
from typing import Tuple

import torch
-import torch.nn as nn

from mg_grouped_gemm import grouped_gemm_forward


-class TestMG_GroupedGEMM(unittest.TestCase):
+class TestMgGroupedGemm(unittest.TestCase):
    def setUp(self) -> None:
        torch.manual_seed(2020)

@@ -51,7 +49,7 @@ def _run_grouped_gemm_test(
        result = result.to(dtype)
        torch.testing.assert_close(result, expected_result, atol=atol, rtol=rtol)

-    def test_MG_grouped_gemm_bf16(self) -> None:
+    def test_mg_grouped_gemm_bf16(self) -> None:
        for G in (1, 4, 16):
            for M in (128, 512, 1024):
                print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
@@ -63,7 +61,7 @@ def test_MG_grouped_gemm_bf16(self) -> None:
                rtol=1.6e-2,
            )

-    def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
+    def test_mg_grouped_gemm_deepseek_shapes(self) -> None:
        """Test with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
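For orientation, `_run_grouped_gemm_test` compares the kernel's `result` against an `expected_result` via `torch.testing.assert_close`. A rough sketch of a loop-based reference that such an expected result could come from is below; the `[N * G, K]` weight layout and the group-major row ordering of `x` are assumptions, not details taken from this diff:

```python
# Hedged sketch of a per-group reference GEMM; layouts are assumptions.
import torch


def reference_grouped_gemm(x: torch.Tensor, w: torch.Tensor, m_sizes: torch.Tensor) -> torch.Tensor:
    """Loop-based reference: one plain matmul per group, concatenated over rows."""
    # Assumed layouts: x is [M_total, K] with rows grouped contiguously,
    # w is [N * G, K], m_sizes holds the per-group row counts.
    G = m_sizes.numel()
    N = w.shape[0] // G
    outputs, start = [], 0
    for g in range(G):
        rows = int(m_sizes[g])
        x_g = x[start : start + rows]   # [rows, K]
        w_g = w[g * N : (g + 1) * N]    # [N, K]
        outputs.append(x_g @ w_g.t())   # [rows, N]
        start += rows
    return torch.cat(outputs, dim=0)    # [M_total, N]
```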
