# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
import logging
-from typing import Callable, Optional
+from functools import partial
+from typing import Callable, List, Optional, Union

import torch.nn as nn

-from torchao.float8.config import Float8LinearConfig
+from torchao.float8.config import Float8LinearConfig, Float8LinearRecipeName
from torchao.float8.float8_linear import Float8Linear

log = logging.getLogger(__name__)

@@ -113,3 +114,85 @@ def convert_to_float8_training(
        from_float,
        module_filter_fn=module_filter_fn,
    )
+
+
+def _auto_filter_for_recipe(
+    recipe: Union[str, Float8LinearRecipeName], filter_fqns: List[str]
+) -> Callable[[nn.Module, str], bool]:
+    """Returns a function that automatically filters out nn.Linear modules meeting at least one of the following criteria:
+
+    1. Dims not divisible by 16 (hardware requirement for float8).
+    2. Dim sizes below certain thresholds, which may result in worse performance.
+
+    NOTE: the thresholds are simple heuristics based on performance testing, and may not be optimal
+    for your model. For the best performance, we recommend defining your own module_filter_fn
+    customized for your module, using the performance tables for the given float8 recipe here:
+    https://github.com/pytorch/ao/tree/main/torchao/float8#performance. The benchmarks referenced
+    for auto-filtering layers were run on H100 GPUs, and may not be representative of other hardware.
+
+    This is an experimental API; the design may change in the future.
+    """
+    if isinstance(recipe, str):
+        recipe = Float8LinearRecipeName(recipe)
+    if recipe == Float8LinearRecipeName.TENSORWISE:
+        return partial(_auto_filter_for_tensorwise, filter_fqns=filter_fqns)
+    elif recipe == Float8LinearRecipeName.ROWWISE:
+        return partial(_auto_filter_for_rowwise, filter_fqns=filter_fqns)
+    elif recipe == Float8LinearRecipeName.ROWWISE_WITH_GW_HP:
+        raise NotImplementedError(f"Unsupported recipe: {recipe}")
+    else:
+        raise ValueError(f"Invalid recipe: {recipe}")
+
+
+def _auto_filter_for_rowwise(mod: nn.Module, fqn: str, filter_fqns: List[str]) -> bool:
+    if not isinstance(mod, nn.Linear):
+        return False
+
+    # If the fqn matches any filtered fqn, then we should not convert this module.
+    is_filtered_fqn = any(filter_fqn in fqn for filter_fqn in filter_fqns)
+    if is_filtered_fqn:
+        return False
+
+    # All dims must be divisible by 16 due to float8 hardware requirements.
+    N, K = mod.weight.shape
+    dims_multiples_of_16 = K % 16 == 0 and N % 16 == 0
+    if not dims_multiples_of_16:
+        return False
+
+    # Dims below these thresholds may result in worse performance
+    # (see https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling).
+    # Note that the benchmarks referenced for auto-filtering were run on
+    # H100 GPUs, and may not be representative of other hardware.
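+    # For example, under these thresholds a weight of shape (N=4096, K=4096) is
+    # converted, while (N=2048, K=8192) and (N=4096, K=2048) are skipped.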
+    if N <= 2048:
+        return False
+    elif K <= 1024:
+        return False
+    elif N <= 4096 and K <= 2048:
+        return False
+    return True
+
+
+def _auto_filter_for_tensorwise(
+    mod: nn.Module, fqn: str, filter_fqns: List[str]
+) -> bool:
+    if not isinstance(mod, nn.Linear):
+        return False
+
+    # If the fqn matches any filtered fqn, then we should not convert this module.
+    is_filtered_fqn = any(filter_fqn in fqn for filter_fqn in filter_fqns)
+    if is_filtered_fqn:
+        return False
+
+    # All dims must be divisible by 16 due to float8 hardware requirements.
+    N, K = mod.weight.shape
+    dims_multiples_of_16 = K % 16 == 0 and N % 16 == 0
+    if not dims_multiples_of_16:
+        return False
+
+    # Dims below these thresholds may result in worse performance
+    # (see https://github.com/pytorch/ao/tree/main/torchao/float8#tensorwise-scaling).
+    # Note that the benchmarks referenced for auto-filtering were run on
+    # H100 GPUs, and may not be representative of other hardware.
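+    # For example, a weight of shape (N=1024, K=4096) is skipped, while
+    # (N=2048, K=4096) is converted.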
+    if K <= 4096 and N <= 1024:
+        return False
+    return True
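
For reference, a minimal usage sketch of the new helper. It assumes convert_to_float8_training is importable from torchao.float8, that "rowwise" is the string value of Float8LinearRecipeName.ROWWISE, and uses a hypothetical "lm_head" fqn substring; the model is a placeholder.

import torch.nn as nn

from torchao.float8 import convert_to_float8_training

model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 8192))

# Build a filter for the rowwise recipe; any module whose fqn contains
# "lm_head" (hypothetical) is left in high precision.
module_filter_fn = _auto_filter_for_recipe("rowwise", filter_fqns=["lm_head"])

# Swap eligible nn.Linear layers for Float8Linear.
model = convert_to_float8_training(model, module_filter_fn=module_filter_fn)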