|
2 | 2 | benchmark_mode: "inference" |
3 | 3 | quantization_config_recipe_names: |
4 | 4 | # A baseline inference run for the model (without quantization) is always performed by default, for comparison |
5 | | - # - "int4wo-32" |
| 5 | + - "int4wo-32" |
6 | 6 | # - "marlin" |
7 | 7 | - "int8wo" |
| 8 | + - "int8dq" |
| 9 | + - "float8dq" |
8 | 10 | # sparsity_config_recipe_names: |
9 | 11 | # A baseline inference run for the model (without sparsity) is always performed by default, for comparison |
10 | 12 | # - "semi-sparse" |
11 | 13 | # - "block" |
12 | 14 | output_dir: "benchmarks/microbenchmarks/results" |
13 | 15 | model_params: |
14 | | - # - name: "small_bf16_linear" |
15 | | - # matrix_shapes: |
16 | | - # - name: "custom" |
17 | | - # shapes: [ |
18 | | - # [1024, 1024, 1024], # [m, k, n] |
19 | | - # ] |
20 | | - # high_precision_dtype: "torch.bfloat16" |
21 | | - # use_torch_compile: true |
22 | | - # torch_compile_mode: "max-autotune" |
23 | | - # device: "cuda" |
24 | | - # model_type: "linear" |
25 | | - # enable_profiler: true # Enable profiling for this model |
| 16 | + - name: "small_bf16_linear" |
| 17 | + matrix_shapes: |
| 18 | + - name: "custom" |
| 19 | + shapes: [ |
| 20 | + [1024, 1024, 1024], # [m, k, n] |
| 21 | + ] |
| 22 | + high_precision_dtype: "torch.bfloat16" |
| 23 | + use_torch_compile: true |
| 24 | + torch_compile_mode: "max-autotune" |
| 25 | + device: "cuda" |
| 26 | + model_type: "linear" |
| 27 | + enable_profiler: true # Enable profiling for this model |
26 | 28 |
|
27 | 29 | - name: "large_bf16_ln_linear" |
28 | 30 | matrix_shapes: |
@@ -65,30 +67,30 @@ model_params: |
65 | 67 | # model_type: "linear" |
66 | 68 | # enable_profiler: true # Enable profiling for this model |
67 | 69 |
|
68 | | - - name: "bf16_rms_norm_linear_activation" |
69 | | - matrix_shapes: |
70 | | - - name: "custom" |
71 | | - shapes: [ |
72 | | - [2048, 4096, 1024], |
73 | | - ] |
74 | | - high_precision_dtype: "torch.bfloat16" |
75 | | - use_torch_compile: true |
76 | | - torch_compile_mode: "max-autotune" |
77 | | - device: "cuda" |
78 | | - model_type: "rms_norm_linear_activation" |
79 | | - enable_profiler: true |
80 | | - enable_memory_profile: true |
| 70 | + # - name: "bf16_rms_norm_linear_activation" |
| 71 | + # matrix_shapes: |
| 72 | + # - name: "custom" |
| 73 | + # shapes: [ |
| 74 | + # [2048, 4096, 1024], |
| 75 | + # ] |
| 76 | + # high_precision_dtype: "torch.bfloat16" |
| 77 | + # use_torch_compile: true |
| 78 | + # torch_compile_mode: "max-autotune" |
| 79 | + # device: "cuda" |
| 80 | + # model_type: "rms_norm_linear_activation" |
| 81 | + # enable_profiler: true |
| 82 | + # enable_memory_profile: true |
81 | 83 |
|
82 | | - - name: "bf16_transformer_block" |
83 | | - matrix_shapes: |
84 | | - - name: "custom" |
85 | | - shapes: [ |
86 | | - [2048, 4096, 1024], # For transformer_block, k is the hidden dimension |
87 | | - ] |
88 | | - high_precision_dtype: "torch.bfloat16" |
89 | | - use_torch_compile: true |
90 | | - torch_compile_mode: "max-autotune" |
91 | | - device: "cuda" |
92 | | - model_type: "transformer_block" |
93 | | - enable_profiler: true |
94 | | - enable_memory_profile: true |
| 84 | + # - name: "bf16_transformer_block" |
| 85 | + # matrix_shapes: |
| 86 | + # - name: "custom" |
| 87 | + # shapes: [ |
| 88 | + # [2048, 4096, 1024], # For transformer_block, k is the hidden dimension |
| 89 | + # ] |
| 90 | + # high_precision_dtype: "torch.bfloat16" |
| 91 | + # use_torch_compile: true |
| 92 | + # torch_compile_mode: "max-autotune" |
| 93 | + # device: "cuda" |
| 94 | + # model_type: "transformer_block" # TODO: Add support for custom models (approach to be decided; possibly by passing a .py file containing the model definition) |
| 95 | + # enable_profiler: true |
| 96 | + # enable_memory_profile: true |
0 commit comments