@@ -92,15 +92,18 @@ def _test_impl_moe_quant(
         self.assertGreaterEqual(compute_error(out_q, out), 10)
         self.assertGreaterEqual(compute_error(out_qc, out), 10)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("single_token", 1, False),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = MoEQuantConfig(Int4WeightOnlyConfig())
         tensor_impl_class = TensorCoreTiledAQTTensorImpl
 
@@ -111,16 +114,20 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not is_sm_at_least_90(), "Requires CUDA capability >= 9.0")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("single_token", 1, True),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_int4wo_base(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = Int4WeightOnlyConfig()
         tensor_impl_class = TensorCoreTiledAQTTensorImpl
 
@@ -131,15 +138,18 @@ def test_int4wo_base(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("single_token", 1, False),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_int8wo_fake_dim(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = MoEQuantConfig(Int8WeightOnlyConfig())
         tensor_impl_class = PlainAQTTensorImpl
 
@@ -150,15 +160,18 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph):
            fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("single_token", 1, True),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_int8wo_base(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = Int8WeightOnlyConfig()
         tensor_impl_class = PlainAQTTensorImpl
 
@@ -169,14 +182,16 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("single_token", 1, True),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = Int8WeightOnlyConfig()
         tensor_impl_class = PlainAQTTensorImpl
 
@@ -188,14 +203,17 @@ def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
             device="cpu",
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("multiple_tokens", 32, False),
         ]
     )
     def test_int8dq_fake_dim(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
         base_class = LinearActivationQuantizedTensor
 
@@ -207,14 +225,17 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
     @parameterized.expand(
         [
             ("multiple_tokens", 32, False),
         ]
     )
     def test_int8dq_base(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not TORCH_VERSION_AT_LEAST_2_5:
+            self.skipTest("Test only enabled for 2.5+")
+
         config = Int8DynamicActivationInt8WeightConfig()
         base_class = LinearActivationQuantizedTensor
 
@@ -226,15 +247,18 @@ def test_int8dq_base(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not is_sm_at_least_90(), "Requires CUDA capability >= 9.0")
     @parameterized.expand(
         [
             ("single_token", 1, False),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_fp8wo_fake_dim(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
         config = MoEQuantConfig(Float8WeightOnlyConfig())
         tensor_impl_class = Float8AQTTensorImpl
 
@@ -245,15 +269,18 @@ def test_fp8wo_fake_dim(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not is_sm_at_least_90(), "Requires CUDA capability >= 9.0")
     @parameterized.expand(
         [
             ("single_token", 1, True),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_fp8wo_base(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
         config = Float8WeightOnlyConfig()
         tensor_impl_class = Float8AQTTensorImpl
 
@@ -264,15 +291,18 @@ def test_fp8wo_base(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not is_sm_at_least_90(), "Requires CUDA capability >= 9.0")
     @parameterized.expand(
         [
             ("single_token", 1, False),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_fp8dq_fake_dim(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
         config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig())
         base_class = LinearActivationQuantizedTensor
 
@@ -283,15 +313,18 @@ def test_fp8dq_fake_dim(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skipIf(not is_sm_at_least_90(), "Requires CUDA capability >= 9.0")
     @parameterized.expand(
         [
             ("single_token", 1, True),
             ("multiple_tokens", 8, False),
         ]
     )
     def test_fp8dq_base(self, name, num_tokens, fullgraph):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
         config = Float8DynamicActivationFloat8WeightConfig()
         base_class = LinearActivationQuantizedTensor
 
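Every hunk above applies the same mechanical change: hardware and version guards move from @unittest.skipIf decorators, whose conditions are evaluated once when the test class is defined (i.e. at module import/collection time), into self.skipTest(...) calls that are evaluated only when the test body actually runs. A minimal sketch of the two styles follows, assuming only unittest and torch; the ExampleTest class and its test names are illustrative and not part of this PR.

import unittest

import torch


class ExampleTest(unittest.TestCase):
    # Decorator style: the skip condition is evaluated at import time,
    # so device queries run even if the test is never selected.
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_decorator_skip(self):
        self.assertTrue(torch.ones(1, device="cuda").is_cuda)

    # Runtime style (the pattern adopted in this diff): the condition is
    # checked inside the test body, when the test actually executes.
    def test_runtime_skip(self):
        if not torch.cuda.is_available():
            self.skipTest("Need CUDA available")
        self.assertTrue(torch.ones(1, device="cuda").is_cuda)


if __name__ == "__main__":
    unittest.main()

Both styles report the test as skipped rather than failed; the runtime form simply defers the check, which matters when a guard such as is_sm_at_least_90() touches the CUDA runtime and should not run on machines where the test will never execute.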