pytorch
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
Lines changed: 2 additions & 0 deletions
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
Lines changed: 298 additions & 0 deletions
diff --git a/torchao/prototype/mx_formats/mx_subclass.py b/torchao/prototype/mx_formats/mx_subclass.py
Lines changed: 6 additions & 1 deletion
@@ -558,11 +558,13 @@ def test_nvfp4_matmul_with_amax(
         A,
         per_tensor_scale=a_scale,
         mm_config=mm_config,
+        is_swizzled_scales=True,
     )
     B_nvfp4 = NVFP4Tensor.to_nvfp4(
         B,
         per_tensor_scale=b_scale,
         mm_config=mm_config,
+        is_swizzled_scales=True,
     )
 
     func = torch.compile(F.linear, fullgraph=True) if compile else F.linear
 
@@ -657,3 +657,301 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
     assert x.t().dtype == x_reconstructed_t.dtype, (
         f"Transpose dtype mismatch: {x.t().dtype} vs {x_reconstructed_t.dtype}"
     )
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (128, 4),
+        (256, 8),
+        (100, 3),
+        (4, 4),
+        (50, 10),
+        (384, 12),
+    ],
+)
+@pytest.mark.parametrize(
+    "use_triton_kernel", [False, True] if torch.cuda.is_available() else [False]
+)
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
+    from torchao.prototype.mx_formats.utils import from_blocked, to_blocked
+
+    rows, cols = shape
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    original = torch.randint(0, 255, (rows, cols), device=device, dtype=torch.uint8)
+
+    blocked = to_blocked(original, use_triton_kernel=use_triton_kernel)
+    reconstructed = from_blocked(blocked, rows, cols)
+
+    torch.testing.assert_close(
+        original,
+        reconstructed,
+        atol=0.0,
+        rtol=0.0,
+        msg=f"Roundtrip failed for shape {shape} with use_triton_kernel={use_triton_kernel}",
+    )
+
+
+@pytest.mark.parametrize("is_swizzled_scales", [False, True])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (32, 64),
+        (16, 32),
+        (64, 128),
+        (384, 128),
+    ],
+)
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
+    """
+    Test that NVFP4Tensor can be constructed with swizzled scales and
+    that the _is_swizzled_scales flag is set correctly.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = shape
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=is_swizzled_scales)
+    assert tensor._is_swizzled_scales == is_swizzled_scales
+    reconstructed = tensor.to_dtype(torch.bfloat16)
+    assert reconstructed.shape == data.shape
+
+
+@pytest.mark.parametrize(
+    "slice_dim,slice_spec",
+    [
+        # Row slicing - must align with 128-row boundaries
+        pytest.param(0, slice(0, 128), id="slice_rows[0:128]"),
+        pytest.param(0, slice(128, 256), id="slice_rows[128:256]"),
+        # Column slicing - must align with 64-column boundaries (4 scale columns * 16 block_size)
+        pytest.param(1, slice(0, 64), id="slice_cols[0:64]"),
+        pytest.param(1, slice(64, 128), id="slice_cols[64:128]"),
+        pytest.param(1, slice(0, 128), id="slice_cols[0:128]_full_width"),
+        # Test tensor parallelism patterns (half splits)
+        pytest.param(1, slice(0, 2048), id="slice_cols[0:2048]_tp_first_half"),
+        pytest.param(1, slice(2048, 4096), id="slice_cols[2048:4096]_tp_second_half"),
+        # Test quarter splits
+        pytest.param(1, slice(0, 1024), id="slice_cols[0:1024]_quarter"),
+        pytest.param(1, slice(1024, 2048), id="slice_cols[1024:2048]_quarter"),
+    ],
+)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="NVFP4 requires PyTorch 2.8+"
+)
+def test_nvfp4_swizzled_scales_slicing(slice_dim, slice_spec):
+    """
+    Test that slicing works correctly with swizzled scales and maintains
+    the swizzled state in the output tensor.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    # Use larger tensor sizes that align with swizzled requirements
+    if slice_dim == 0:
+        # For row slicing, need at least 256 rows to test 128-row boundaries
+        M, K = 256, 4096
+    else:
+        # For column slicing, need multiples of 64 columns for alignment
+        M, K = 128, 4096
+
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
+    assert tensor._is_swizzled_scales == True
+
+    if slice_dim == 0:
+        sliced_tensor = tensor[slice_spec, :]
+    else:
+        sliced_tensor = tensor[:, slice_spec]
+
+    # Verify sliced tensor maintains swizzled state
+    assert sliced_tensor._is_swizzled_scales == True
+
+    # Verify sliced tensor can be dequantized
+    sliced_reconstructed = sliced_tensor.to_dtype(torch.bfloat16)
+
+    # Compare with direct slicing of original data
+    original_reconstructed = tensor.to_dtype(torch.bfloat16)
+    if slice_dim == 0:
+        expected = original_reconstructed[slice_spec, :]
+    else:
+        expected = original_reconstructed[:, slice_spec]
+
+    torch.testing.assert_close(sliced_reconstructed, expected, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.parametrize(
+    "slice_dim,slice_spec,expected_error",
+    [
+        # Row slicing with misaligned boundaries
+        pytest.param(
+            0,
+            slice(0, 100),
+            "Row slicing of NVFP4Tensor with swizzled scales requires",
+            id="misaligned_row_end",
+        ),
+        pytest.param(
+            0,
+            slice(50, 150),
+            "Row slicing of NVFP4Tensor with swizzled scales requires",
+            id="misaligned_row_start",
+        ),
+        # Column slicing with misaligned boundaries
+        pytest.param(
+            1,
+            slice(0, 32),
+            "Column slicing of NVFP4Tensor with swizzled scales requires",
+            id="misaligned_col_32",
+        ),
+        pytest.param(
+            1,
+            slice(16, 80),
+            "Column slicing of NVFP4Tensor with swizzled scales requires",
+            id="misaligned_col_start",
+        ),
+        pytest.param(
+            1,
+            slice(0, 100),
+            "Column slicing of NVFP4Tensor with swizzled scales requires",
+            id="misaligned_col_end",
+        ),
+        # Odd column boundaries (FP4 packing requirement)
+        pytest.param(
+            1,
+            slice(1, 65),
+            "start index to be a multiple of 64, got 1",
+            id="odd_start",
+        ),
+        pytest.param(
+            1,
+            slice(0, 65),
+            " multiple of 64 or equal to tensor size 4096, got 65",
+            id="odd_end",
+        ),
+    ],
+)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="NVFP4 requires PyTorch 2.8+"
+)
+def test_nvfp4_swizzled_scales_slicing_errors(slice_dim, slice_spec, expected_error):
+    """
+    Test that slicing raises appropriate errors for misaligned boundaries.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 256, 4096
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
+
+    with pytest.raises(RuntimeError, match=expected_error):
+        if slice_dim == 0:
+            _ = tensor[slice_spec, :]
+        else:
+            _ = tensor[:, slice_spec]
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="NVFP4 requires PyTorch 2.8+"
+)
+def test_nvfp4_swizzled_scales_view_semantics():
+    """
+    Test that slicing maintains proper view semantics where possible.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 256, 4096
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
+
+    # Test row slicing (should maintain views)
+    sliced_tensor = tensor[0:128, :]
+
+    # Test that the sliced tensor shares storage with original for data
+    # (Note: scales might not share storage due to swizzled layout complexity)
+    assert sliced_tensor._data.data_ptr() == tensor._data.data_ptr()
+
+    # Test full-width column slicing (should maintain views)
+    full_width_slice = tensor[:, 0:K]
+    assert full_width_slice._scale_e4m3.data_ptr() == tensor._scale_e4m3.data_ptr()
+    assert full_width_slice._data.data_ptr() == tensor._data.data_ptr()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="NVFP4 requires PyTorch 2.8+"
+)
+def test_nvfp4_swizzled_scales_serialization():
+    """
+    Test that tensor flatten/unflatten preserves the swizzled scales state.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 32, 64
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensor with swizzled scales
+    original_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
+
+    # Test serialization
+    tensor_list, ctx = original_tensor.__tensor_flatten__()
+
+    # Verify swizzled flag is preserved in context
+    assert "_is_swizzled_scales" in ctx
+    assert ctx["_is_swizzled_scales"] == True
+
+    # Test deserialization
+    inner_tensors = {}
+    for name in tensor_list:
+        inner_tensors[name] = getattr(original_tensor, name)
+
+    reconstructed_tensor = NVFP4Tensor.__tensor_unflatten__(
+        inner_tensors, ctx, None, None
+    )
+
+    # Verify the swizzled state is preserved
+    assert reconstructed_tensor._is_swizzled_scales == True
+
+    # Verify functionality is preserved
+    original_dq = original_tensor.to_dtype(torch.bfloat16)
+    reconstructed_dq = reconstructed_tensor.to_dtype(torch.bfloat16)
+
+    torch.testing.assert_close(original_dq, reconstructed_dq, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="NVFP4 requires PyTorch 2.8+"
+)
+def test_nvfp4_swizzled_scales_get_scales_method():
+    """
+    Test that the get_hp_scales() method correctly unswizzles scales when needed.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 32, 64
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensors with both storage methods
+    regular_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=False)
+    swizzled_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
+
+    # Get scales from both tensors and verify they are equal
+    regular_scales = regular_tensor.get_hp_scales()
+    swizzled_scales = swizzled_tensor.get_hp_scales()
+    torch.testing.assert_close(regular_scales, swizzled_scales, atol=0.0, rtol=0.0)
+
+    # Verify scales have the expected shape
+    expected_shape = (M, K // 16)
+    assert regular_scales.shape == expected_shape
+    assert swizzled_scales.shape == expected_shape
@@ -184,6 +184,11 @@ def _nvfp4_inference_linear_transform(
 
     weight = module.weight
 
+    if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0:
+        raise RuntimeError(
+            f"NVFP4 only supports weight shape divisible by 16, got {weight.shape}"
+        )
+
     if module.bias is not None and weight.dtype == torch.float32:
         raise RuntimeError(
             "Bias is not supported when module weight is in fp32 (out_dtype=Float32). "
@@ -193,8 +198,8 @@ def _nvfp4_inference_linear_transform(
     quantized_weight = NVFP4Tensor.to_nvfp4(
         weight,
         mm_config=config.mm_config,
+        is_swizzled_scales=True,
     )
-
     module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
     module.extra_repr = types.MethodType(_linear_extra_repr, module)
     return module
Original file line number	Diff line number	Diff line change
`@@ -558,11 +558,13 @@ def test_nvfp4_matmul_with_amax(`
`558`	`558`	`A,`
`559`	`559`	`per_tensor_scale=a_scale,`
`560`	`560`	`mm_config=mm_config,`
	`561`	`+ is_swizzled_scales=True,`
`561`	`562`	`)`
`562`	`563`	`B_nvfp4 = NVFP4Tensor.to_nvfp4(`
`563`	`564`	`B,`
`564`	`565`	`per_tensor_scale=b_scale,`
`565`	`566`	`mm_config=mm_config,`
	`567`	`+ is_swizzled_scales=True,`
`566`	`568`	`)`
`567`	`569`
`568`	`570`	`func = torch.compile(F.linear, fullgraph=True) if compile else F.linear`