Commit a83a999

Reduce peak memory used for unit tests.
1 parent f8a6acc commit a83a999
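
The change is mechanical: each test input shrinks from torch.randn(10, 32) to torch.randn(1, 32), so the input and activation buffers are a tenth the size while the tests exercise the same code paths. A minimal sketch of how the effect could be confirmed, using a plain nn.Linear stand-in (the actual tests use quantized InvokeLinear8bitLt / InvokeLinearNF4 layers):

# Hypothetical check, not part of this commit: compare peak CUDA memory
# for a batch of 10 rows vs. 1 row through a small linear layer.
import torch

def peak_bytes(batch_size: int) -> int:
    layer = torch.nn.Linear(32, 64).to("cuda")
    torch.cuda.reset_peak_memory_stats()
    x = torch.randn(batch_size, 32, device="cuda")
    _ = layer(x)
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated()

if torch.cuda.is_available():
    print(peak_bytes(10), peak_bytes(1))  # the batch-1 run allocates less at peak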

File tree: 3 files changed (+8, -8 lines)


tests/backend/model_manager/load/model_cache/torch_module_autocast/test_autocast_modules.py

Lines changed: 4 additions & 4 deletions

@@ -40,7 +40,7 @@ def linear_8bit_lt_layer():
 def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer: InvokeLinear8bitLt):
     """Test CustomInvokeLinear8bitLt inference with all weights on the GPU."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_8bit_lt_layer(x)
 
     # Wrap the InvokeLinear8bitLt layer in a CustomInvokeLinear8bitLt layer, and run inference on it.
@@ -54,7 +54,7 @@ def test_custom_invoke_linear_8bit_lt_all_weights_on_cuda(linear_8bit_lt_layer:
 def test_custom_invoke_linear_8bit_lt_all_weights_on_cpu(linear_8bit_lt_layer: InvokeLinear8bitLt):
     """Test CustomInvokeLinear8bitLt inference with all weights on the CPU (streaming to the GPU)."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_8bit_lt_layer(x)
 
     # Copy the state dict to the CPU and reload it.
@@ -98,7 +98,7 @@ def linear_nf4_layer():
 def test_custom_invoke_linear_nf4_all_weights_on_cuda(linear_nf4_layer: InvokeLinearNF4):
     """Test CustomInvokeLinearNF4 inference with all weights on the GPU."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to("cuda")
+    x = torch.randn(1, 32).to("cuda")
     y_quantized = linear_nf4_layer(x)
 
     # Wrap the InvokeLinearNF4 layer in a CustomInvokeLinearNF4 layer, and run inference on it.
@@ -112,7 +112,7 @@ def test_custom_invoke_linear_nf4_all_weights_on_cuda(linear_nf4_layer: InvokeLi
 def test_custom_invoke_linear_nf4_all_weights_on_cpu(linear_nf4_layer: InvokeLinearNF4):
     """Test CustomInvokeLinearNF4 inference with all weights on the CPU (streaming to the GPU)."""
     # Run inference on the original layer.
-    x = torch.randn(10, 32).to(device="cuda")
+    x = torch.randn(1, 32).to(device="cuda")
     y_quantized = linear_nf4_layer(x)
 
     # Copy the state dict to the CPU and reload it.

tests/backend/model_manager/load/model_cache/torch_module_autocast/test_torch_module_autocast.py

Lines changed: 2 additions & 2 deletions

@@ -57,7 +57,7 @@ def test_torch_module_autocast_linear_layer(device: torch.device, model: torch.n
     assert all(p.device.type == "cpu" for p in model.parameters())
 
     # Run inference on the CPU.
-    x = torch.randn(10, 32, device="cpu")
+    x = torch.randn(1, 32, device="cpu")
     expected = model(x)
     assert expected.device.type == "cpu"
 
@@ -103,7 +103,7 @@ def test_torch_module_autocast_bnb_llm_int8_linear_layer():
     assert model.linear.weight.SCB is not None
 
     # Run inference on the GPU.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     expected = model(x.to("cuda"))
     assert expected.device.type == "cuda"

tests/backend/quantization/test_bnb_llm_int8.py

Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ def test_invoke_linear_8bit_lt_quantization():
     assert quantized_layer.weight.CB.dtype == torch.int8
 
     # Run inference on both the original and quantized layers.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     y = orig_layer(x)
     y_quantized = quantized_layer(x.to("cuda"))
     assert y.shape == y_quantized.shape
@@ -53,7 +53,7 @@ def test_invoke_linear_8bit_lt_state_dict_roundtrip():
     orig_layer_state_dict = orig_layer.state_dict()
 
     # Run inference on the original layer.
-    x = torch.randn(10, 32)
+    x = torch.randn(1, 32)
     y = orig_layer(x)
 
     # Prepare a quantized InvokeLinear8bitLt layer.
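
Why batch size 1 is still a valid test input: torch.nn.Linear maps (N, in_features) to (N, out_features) for any leading dimension N, so the shape and device assertions in these tests are unaffected by shrinking N from 10 to 1. A quick illustration (plain nn.Linear, not the quantized layers from the tests):

# Sketch: the batch dimension does not change what these tests assert.
import torch

layer = torch.nn.Linear(32, 64)
for n in (1, 10):
    y = layer(torch.randn(n, 32))
    assert y.shape == (n, 64)  # same check passes for either batch size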
