Commit 35d98d5

per tensor input scales are never good???
1 parent 682c110 commit 35d98d5


3 files changed: +26 −15 lines

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 16 additions & 13 deletions
@@ -175,23 +175,26 @@ def _initialize_scale_zero_point(
 
     # NVFP4 support; use FP8 scales
     # For weight quant, attach global scales for NVFP4
-    # TODO: NVFP4 Scheme
     if (
         quantization_args.num_bits == 4
         and quantization_args.type == QuantizationType.FLOAT
     ):
-        scale_dtype = FP8_E4M3_DATA.dtype
-        # create and attach nvfp4 data
-        tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
-        # Setting data for now - could possibly be handled later in the pipeline
-        value = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / tensor_amax
-        # TODO: use model.weight.dtype after checking
-        value = value.to(torch.float32).to(device)
-        # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
-        init_global_scale = Parameter(value, requires_grad=False)
-        register_offload_parameter(
-            module, f"{base_name}_global_scale", init_global_scale
-        )
+        if base_name == "weight":
+            scale_dtype = FP8_E4M3_DATA.dtype
+            # create and attach nvfp4 data
+            tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
+            # Setting data for now - could possibly be handled later in the pipeline
+            value = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / tensor_amax
+            # TODO: use model.weight.dtype after checking
+            value = value.to(torch.float32).to(device)
+            # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
+            init_global_scale = Parameter(value, requires_grad=False)
+            register_offload_parameter(
+                module, f"{base_name}_global_scale", init_global_scale
+            )
+        else:
+            # input scales should be float32
+            scale_dtype = torch.float32
 
     # TODO: consider erroring out in the future as if the dtype if not one fo these,
     # there is likely bug
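
The weight branch keeps NVFP4's two-level scaling: a float32 global scale sized from the tensor amax, with per-group scales later stored in FP8 relative to it. Below is a minimal numeric sketch of that arithmetic, not the library code, assuming FP8_E4M3_DATA.max is 448.0 and FP4_E2M1_DATA.max is 6.0, with a made-up 4096x4096 weight and group_size=16.

import torch

FP8_E4M3_MAX = 448.0   # largest finite float8 e4m3 value (assumed)
FP4_E2M1_MAX = 6.0     # largest 4-bit e2m1 value (assumed)

weight = torch.randn(4096, 4096, dtype=torch.bfloat16)

# Per-tensor absolute maximum of the weight, computed in float32 for stability
tensor_amax = weight.abs().max().to(torch.float32)

# Global scale chosen so that (FP8 group scale) * (FP4 value) can span the
# weight's dynamic range; it is kept as a float32 parameter in the commit above
global_scale = FP8_E4M3_MAX * FP4_E2M1_MAX / tensor_amax

# Per-group (group_size=16) scales, normalized by the global scale and stored in FP8
group_amax = weight.abs().reshape(-1, 16).amax(dim=1).to(torch.float32)
group_scales = (global_scale * (group_amax / FP4_E2M1_MAX)).to(torch.float8_e4m3fn)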

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 9 additions & 1 deletion
@@ -108,7 +108,15 @@ def is_preset_scheme(name: str) -> bool:
         symmetric=True,
         dynamic=False,
         group_size=16,
-    )
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.TENSOR,
+        symmetric=True,
+        dynamic=False,
+        observer=None,
+    ),
 )
 
 # 8 bit integer weights and 8 bit activations quantization
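
This preset change adds static (dynamic=False), symmetric, per-tensor FP4 input activation quantization alongside the group-16 FP4 weights. As a toy illustration of the concern in the commit title, not library code: a single static per-tensor scale is set by the largest activation, so one outlier makes the FP4 step coarse for everything else. The tensor shape and values below are invented, and the E2M1 magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6} with max 6.0 are assumed.

import torch

FP4_E2M1_MAX = 6.0
# Nonnegative values representable by FP4 e2m1 (sign handled separately)
fp4_grid = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

acts = torch.full((1, 4096), 0.05)
acts[0, 0] = 20.0                                   # single outlier activation

per_tensor_scale = acts.abs().max() / FP4_E2M1_MAX  # ~3.33, set by the outlier

# Round each scaled magnitude to the nearest representable FP4 value
scaled = (acts / per_tensor_scale).abs()
nearest = fp4_grid[(scaled.unsqueeze(-1) - fp4_grid).abs().argmin(dim=-1)]

print((nearest == 0).float().mean())  # ~0.9998: nearly all small activations flush to zero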

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -87,8 +87,8 @@ def calculate_qparams(
     if (
         quantization_args.num_bits == 4
         and quantization_args.type == QuantizationType.FLOAT
+        and global_scale is not None
     ):
-        assert global_scale is not None
         scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max) # Not needed
         scales = scales.to(FP8_E4M3_DATA.dtype)
     else:
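
With the assert replaced by a condition, calculate_qparams only takes the FP8-scale path when a global scale is actually attached; FP4 tensors without one (such as the new per-tensor input activations) fall through to the regular scale computation instead of raising. A minimal sketch of that branch follows, under the same assumed constants; sketch_fp4_scales is a hypothetical stand-in, not the library function.

import torch
from typing import Optional

FP4_E2M1_MAX = 6.0  # assumed FP4_E2M1_DATA.max

def sketch_fp4_scales(max_val_pos: torch.Tensor,
                      global_scale: Optional[torch.Tensor]) -> torch.Tensor:
    if global_scale is not None:
        # NVFP4 weight path: per-group scales normalized by the float32
        # global scale, then stored in FP8 e4m3
        scales = global_scale * (max_val_pos / FP4_E2M1_MAX)
        return scales.to(torch.float8_e4m3fn)
    # No global scale (e.g. per-tensor input activations): plain float32 scale
    return (max_val_pos / FP4_E2M1_MAX).to(torch.float32)

# Example: per-group weight amaxes with a global scale vs. a per-tensor input amax
group_amax = torch.tensor([0.8, 1.2, 0.3])
weight_scales = sketch_fp4_scales(group_amax, global_scale=torch.tensor(2240.0))
input_scale = sketch_fp4_scales(torch.tensor(4.5), global_scale=None)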
