|
3 | 3 | import torch
|
4 | 4 | from optimum.quanto.quantize import _quantize_submodule
|
5 | 5 |
|
6 |
| -# def custom_freeze(model: torch.nn.Module): |
7 |
| -# for name, m in model.named_modules(): |
8 |
| -# if isinstance(m, QModuleMixin): |
9 |
| -# m.weight = |
10 |
| -# m.freeze() |
11 |
| - |
12 | 6 |
|
13 | 7 | def requantize(
|
14 | 8 | model: torch.nn.Module,
|
15 | 9 | state_dict: Dict[str, Any],
|
16 | 10 | quantization_map: Dict[str, Dict[str, str]],
|
17 |
| - device: torch.device = None, |
| 11 | + device: torch.device | None = None, |
18 | 12 | ):
|
| 13 | + """This function was initially copied from: |
| 14 | + https://github.com/huggingface/optimum-quanto/blob/832f7f5c3926c91fe4f923aaaf037a780ac3e6c3/optimum/quanto/quantize.py#L101 |
| 15 | +
|
| 16 | + The function was modified to remove the `freeze()` call. The `freeze()` call is very slow and unnecessary when the |
| 17 | + weights are about to be loaded from a state_dict. |
| 18 | +
|
| 19 | + TODO(ryand): Unless I'm overlooking something, this should be contributed upstream to the `optimum-quanto` library. |
| 20 | + """ |
19 | 21 | if device is None:
|
20 | 22 | device = next(model.parameters()).device
|
21 | 23 | if device.type == "meta":
|
@@ -45,6 +47,7 @@ def move_tensor(t, device):
|
45 | 47 | setattr(m, name, torch.nn.Parameter(move_tensor(param, "cpu")))
|
46 | 48 | for name, param in m.named_buffers(recurse=False):
|
47 | 49 | setattr(m, name, move_tensor(param, "cpu"))
|
| 50 | + |
48 | 51 | # Freeze model and move to target device
|
49 | 52 | # freeze(model)
|
50 | 53 | # model.to(device)
|
|
0 commit comments