
Commit 878ec7a

Add linear bias support for QAT (#1755)
**Summary:** Add linear bias support for QAT. Previously, applying QAT to a linear module with bias failed with the following unintuitive error message:

```
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
```

Note that we still do not fake quantize the bias; we only support applying QAT to linear modules that have a bias.

**Test Plan:**
python test/quantization/test_qat.py -k test_qat_linear_bias
1 parent e0f7148 commit 878ec7a
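The root cause of the old error is worth spelling out: both `nn.Linear.__init__` and the old `FakeQuantizedLinear.__init__` evaluate the `bias` argument as a boolean, so passing the bias *tensor* (as `from_linear` did via `mod.bias`) trips PyTorch's ambiguous-truth-value check. Below is a minimal standalone sketch of the failure and of the `is not None` pattern used in this commit; it is an illustration only, not code from the diff.

```python
import torch

src = torch.nn.Linear(512, 256, bias=True)

# Passing the bias tensor where a bool flag is expected hits `if bias:` inside
# nn.Linear.__init__ and raises:
# "RuntimeError: Boolean value of Tensor with more than one value is ambiguous"
try:
    torch.nn.Linear(src.in_features, src.out_features, src.bias)
except RuntimeError as e:
    print(e)

# The fix: pass an explicit bool, then copy the bias parameter over separately.
dst = torch.nn.Linear(src.in_features, src.out_features, src.bias is not None)
dst.bias = src.bias
```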

2 files changed (+42, −6 lines)


test/quantization/test_qat.py

Lines changed: 34 additions & 0 deletions
@@ -133,6 +133,21 @@ def forward(self, x):
         return x
 
 
+class ModelWithLinearBias(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear1 = torch.nn.Linear(512, 256, bias=True)
+        self.linear2 = torch.nn.Linear(256, 512, bias=True)
+
+    def example_inputs(self):
+        return (torch.randn(1, 512),)
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.linear2(x)
+        return x
+
+
 class TestQAT(unittest.TestCase):
     SEED = 123
 
@@ -1366,6 +1381,25 @@ def test_fake_quantizer_repr(self):
         self.assertTrue("PerGroup" in fake_quantizer_repr)
         self.assertTrue("MappingType.SYMMETRIC" in fake_quantizer_repr)
 
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_qat_linear_bias(self):
+        """
+        Test that QAT supports linear bias.
+        """
+        m = ModelWithLinearBias()
+        activation_config = FakeQuantizeConfig(
+            torch.int8, "per_token", is_symmetric=False
+        )
+        weight_config = FakeQuantizeConfig(TorchAODType.INT4, group_size=32)
+        quantize_(
+            m,
+            intx_quantization_aware_training(activation_config, weight_config),
+        )
+        example_inputs = m.example_inputs()
+        m(*example_inputs)
+
 
 if __name__ == "__main__":
     unittest.main()
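Beyond the regression test above, the typical end-to-end QAT flow that this change unblocks for biased linears looks roughly like the following. This is a sketch assuming the import paths and the `from_intx_quantization_aware_training` convert step documented in the torchao QAT README; neither appears in this diff.

```python
import torch
from torchao.quantization import quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    from_intx_quantization_aware_training,
    intx_quantization_aware_training,
)

# A toy model whose linears have bias=True, which previously errored out.
model = torch.nn.Sequential(torch.nn.Linear(512, 256, bias=True))

# Prepare: swap nn.Linear for FakeQuantizedLinear (the bias is carried through,
# it is not fake quantized).
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int8, group_size=32)  # int8 per-group weights, for simplicity
quantize_(model, intx_quantization_aware_training(activation_config, weight_config))

# ... fine-tune `model` as usual ...

# Convert: swap FakeQuantizedLinear back to plain nn.Linear, bias included.
quantize_(model, from_intx_quantization_aware_training())
```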

torchao/quantization/qat/linear.py

Lines changed: 8 additions & 6 deletions
@@ -75,9 +75,6 @@ def __init__(
             *args,
             **kwargs,
         )
-        if bias:
-            raise NotImplementedError("bias not supported yet")
-
         # initialize activation fake quantizer
         if activation_config is not None:
             self.activation_fake_quantizer = FakeQuantizer(activation_config)
@@ -103,17 +100,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             w = self.weight_fake_quantizer(self.weight)
         else:
             w = self.weight
-        return F.linear(x, w)
+        return F.linear(x, w, self.bias)
 
     def to_linear(self) -> torch.nn.Linear:
         new_linear = torch.nn.Linear(
-            self.in_features, self.out_features, self.bias, device=self.weight.device
+            self.in_features,
+            self.out_features,
+            self.bias is not None,
+            device=self.weight.device,
         )
         # In distributed training, the model may be instantiated
         # on the meta device, in which case there is no need to
         # copy the weights, and doing so will result in an error
         if self.weight.device != torch.device("meta"):
             new_linear.weight = self.weight
+            new_linear.bias = self.bias
         return new_linear
 
     @classmethod
@@ -126,7 +127,7 @@ def from_linear(
         new_linear = FakeQuantizedLinear(
             mod.in_features,
             mod.out_features,
-            mod.bias,
+            mod.bias is not None,
             activation_config=activation_config,
             weight_config=weight_config,
             device=mod.weight.device,
@@ -136,6 +137,7 @@ def from_linear(
         # copy the weights, and doing so will result in an error
         if mod.weight.device != torch.device("meta"):
             new_linear.weight = mod.weight
+            new_linear.bias = mod.bias
         return new_linear
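One detail behind the `forward` change: `F.linear` already accepts `None` for its bias argument, so passing `self.bias` directly covers both the biased and bias-free cases without a branch. A quick plain-PyTorch illustration, not part of the diff:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 512)
w = torch.randn(256, 512)
b = torch.randn(256)

y_biased = F.linear(x, w, b)       # shape (2, 256), bias added
y_unbiased = F.linear(x, w, None)  # shape (2, 256), same as F.linear(x, w)
```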
