@@ -13,8 +13,8 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
-from ..models.utils import check_embeddings_close
-from ..utils import compare_two_settings, create_new_process_for_each_test
+from ...utils import compare_two_settings, multi_gpu_test
+from ..utils import check_embeddings_close
 
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),
@@ -42,7 +42,6 @@
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -56,7 +55,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_qaunt_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
 
@@ -68,20 +66,17 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_quant_8bit_to_test)
-@create_new_process_for_each_test()
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
     validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                              model_name, True)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                 model_name, description) -> None:
 
@@ -96,12 +91,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              vllm_tp_size=2)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
     common_args = [
         "--disable-log-stats",
@@ -127,7 +120,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
 @pytest.mark.parametrize("model_name, description",
                          models_4bit_to_embedding_test)
 @pytest.mark.parametrize("dtype", ["half"])
-@create_new_process_for_each_test()
 def test_4bit_bnb_embedding_model(
     model_name,
     description,
@@ -146,6 +138,13 @@ def test_4bit_bnb_embedding_model(
     example_prompts = [str(s).strip() for s in example_prompts]
 
     # Inflight 4bit quantization
+    with vllm_runner(model_name,
+                     task="embed",
+                     dtype=dtype,
+                     gpu_memory_utilization=0.5,
+                     quantization="bitsandbytes") as vllm_model:
+        vllm_outputs = vllm_model.embed(example_prompts)
+
     hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
         load_in_4bit=True))
     with hf_runner(
@@ -156,12 +155,6 @@
     ) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
-    with vllm_runner(model_name,
-                     task="embed",
-                     dtype=dtype,
-                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes") as vllm_model:
-        vllm_outputs = vllm_model.embed(example_prompts)
     check_embeddings_close(
         embeddings_0_lst=hf_outputs,
         embeddings_1_lst=vllm_outputs,
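
Note on the multi-GPU hunks: the deleted `@pytest.mark.skipif(torch.cuda.device_count() < 2, ...)` / `@create_new_process_for_each_test()` stacks are replaced by a single `@multi_gpu_test(num_gpus=2)` decorator imported from the relocated `...utils` module. As a minimal sketch of what such a decorator could look like, assuming it only bundles the GPU-count skip shown in the deleted lines (the actual helper in vLLM's test utils may do more, e.g. also isolate the test in a fresh process):

```python
# Hypothetical sketch of a multi_gpu_test decorator; the real helper lives in
# vLLM's tests/utils.py and its implementation may differ.
import pytest
import torch


def multi_gpu_test(*, num_gpus: int = 2):
    """Skip the decorated test unless at least `num_gpus` CUDA devices exist."""

    def wrapper(fn):
        # pytest.mark.skipif(...) returns a MarkDecorator; applying it to the
        # test function attaches the skip condition without wrapping its body.
        return pytest.mark.skipif(
            torch.cuda.device_count() < num_gpus,
            reason=f'Test requires at least {num_gpus} GPUs.',
        )(fn)

    return wrapper
```

Centralizing the check this way keeps the GPU requirement in one place next to the parallelism settings (`vllm_tp_size=2`, the pipeline-parallel args) instead of repeating the two-line `skipif` boilerplate on every multi-GPU test.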