
Commit 3be5153

awni and angeloskath authored

Dynamic quants (#202)

* dynamic quants + reorg
* readme
* angelos fix
* Change sensitivity metric
* update version
* fix rebase

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>

1 parent 19153e1 commit 3be5153

File tree

8 files changed: +318, -33 lines changed


mlx_lm/LEARNED_QUANTS.md

Lines changed: 38 additions & 9 deletions

@@ -1,21 +1,26 @@
 # Learned Quantization
 
-To reduce the quality loss from quantization MLX LM has two options:
+To reduce the quality loss from quantization MLX LM has several options:
 
 - Distilled Weight Quantization (DWQ)
-- Activation-aware Weight Quantization (AWQ)[^1].
+- Activation-aware Weight Quantization (AWQ)[^1]
+- Dynamic quantization
 
-Both DWQ and AWQ use an example dataset to tune parameters of the model. DWQ
-fine-tunes non-quantized parameters (including quantization scales and biases)
-using the non-quantized model as a teacher. AWQ scales and clips the weights
-prior to quantization. The scaling and clipping values are found with a grid
-search minimizing the distance from the quantized hidden activations to the
-non-quantized hidden activations
+All methods use calibration data to tune parameters or hyper-parameters of the
+model. DWQ fine-tunes non-quantized parameters (including quantization scales
+and biases) using the non-quantized model as a teacher. AWQ scales and clips
+the weights prior to quantization. Dynamic quantization estimates the
+sensitivity of a model's outputs to each layer and uses a higher precision for
+layers which have higher sensitivity.
+
+Dynamic quantization is the fastest to run. DWQ takes longer but typically
+yields better results. You can also cascade methods. For example, a dynamically
+quantized model can be further refined with DWQ.
 
 To get started, first install the requirements:
 
 ```
-pip install mlx-lm[lwq]
+pip install mlx-lm[quant]
 ```
 
 ### DWQ
@@ -66,6 +71,30 @@ A few options to reduce memory use for DWQ:
   `--max-seq-length 512` reduces the memory and still gets good results.
 - Use a smaller batch size, e.g. `--batch-size 1`
 
+### Dynamic Quantization
+
+Use `mlx_lm.dynamic_quant` to generate a dynamic quantization of a given model.
+For example:
+
+```bash
+mlx_lm.dynamic_quant --model mistralai/Mistral-7B-Instruct-v0.3
+```
+
+The script will estimate the sensitivity for each quantizable layer in the
+model. It will then quantize the model using higher precision (default 5 bits)
+for the more sensitive layers and lower precision (default 4 bits) for the
+rest. The script also saves a JSON file with each layer's sensitivities, which
+saves needing to compute them multiple times to make different precision quants
+of the same model.
+
+Some important options are:
+
+- `--target-bpw`: The target bits-per-weight. For a given set of quantization
+  parameters only certain ranges are possible. For example, with the default
+  parameters a BPW in the range `[4.5, 5.5]` is achievable.
+- `--sensitivities`: A path to a precomputed sensitivities file.
+- `--low-bits`: The number of bits to use for the less sensitive layers.
+- `--high-bits`: The number of bits to use for the more sensitive layers.
 
 ### AWQ

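For a concrete workflow built from the options above (a sketch, not from the commit: output paths and target BPWs are illustrative, and the sensitivities filename follows the `{model}_sensitivities.json` pattern that `mlx_lm.quant.dynamic_quant` writes to the working directory), one sensitivity pass can be reused for quants at different bit widths:

```bash
# First run: estimate sensitivities, write
# mistralai_Mistral-7B-Instruct-v0.3_sensitivities.json, and save a ~5.0 BPW quant
mlx_lm.dynamic_quant --model mistralai/Mistral-7B-Instruct-v0.3 \
    --target-bpw 5.0 --mlx-path mlx_model_5.0bpw --report-ppl

# Second run: reuse the saved sensitivities to hit a different target BPW
mlx_lm.dynamic_quant --model mistralai/Mistral-7B-Instruct-v0.3 \
    --sensitivities mistralai_Mistral-7B-Instruct-v0.3_sensitivities.json \
    --target-bpw 4.6 --mlx-path mlx_model_4.6bpw
```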
mlx_lm/__main__.py

Lines changed: 3 additions & 2 deletions

@@ -5,8 +5,9 @@
 
 if __name__ == "__main__":
     subcommands = {
-        "awq",
-        "dwq",
+        "quant.awq",
+        "quant.dwq",
+        "quant.dynamic_quant",
         "cache_prompt",
         "chat",
         "convert",

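The dotted entries above presumably name modules under the `mlx_lm` package, which is why the quantization scripts move to `quant.awq`, `quant.dwq`, and the new `quant.dynamic_quant` after the reorg. As a minimal sketch (an illustration only; the dispatch code itself is not shown in this diff), such a table could be resolved to each module's `main()` like this:

```python
import importlib
import sys


def run_subcommand(name: str, subcommands: set) -> None:
    # e.g. "quant.dynamic_quant" resolves to mlx_lm.quant.dynamic_quant.main()
    if name not in subcommands:
        sys.exit(f"Unknown subcommand: {name}")
    importlib.import_module(f"mlx_lm.{name}").main()
```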
mlx_lm/_version.py

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.24.1"
+__version__ = "0.25.0"

mlx_lm/awq.py renamed to mlx_lm/quant/awq.py

Lines changed: 2 additions & 18 deletions

@@ -14,6 +14,7 @@
 
 from mlx_lm.models.base import create_attention_mask
 from mlx_lm.models.switch_layers import SwitchLinear
+from mlx_lm.quant.utils import load_data
 from mlx_lm.utils import (
     fetch_from_hub,
     get_model_path,
@@ -510,23 +511,6 @@ def __call__(self, x: mx.array, *args, **kwargs):
         )
 
 
-def load_dataset(tokenizer, num_samples: int, sequence_length: int) -> mx.array:
-    save_dir = Path.home() / ".cache/mlx-lm/calibration_v5.txt"
-    if not save_dir.exists():
-        save_dir.parent.mkdir(parents=True, exist_ok=True)
-        url = "https://gist.githubusercontent.com/tristandruyen/9e207a95c7d75ddf37525d353e00659c/raw/571fda718462de863e5a0171078c175420c7649a/calibration_data_v5_rc.txt"
-        request.urlretrieve(url, save_dir)
-    with open(save_dir) as fid:
-        texts = fid.read()
-    tokens = tokenizer.encode(texts, return_tensors="mlx")[0]
-
-    # select random non-overlapping chunks
-    tokens = tokens[: (tokens.size // sequence_length) * sequence_length]
-    tokens = tokens.reshape(-1, sequence_length)
-    segments = mx.random.permutation(tokens.shape[0])[:num_samples]
-    return tokens[segments]
-
-
 def update_config(
     model: nn.Module,
     config: Dict[str, Any],
@@ -578,7 +562,7 @@ def main():
     if (awq_config := AWQ_MODEL_CONFIGS.get(model_type, None)) is None:
         raise NotImplementedError(f"AWQ support for {model_type} models NYI.")
 
-    calibration_data = load_dataset(tokenizer, args.num_samples, args.sequence_length)
+    calibration_data = load_data(tokenizer, args.num_samples, args.sequence_length)
 
     calibration_data = dist_split(calibration_data, group)
File renamed without changes.

mlx_lm/quant/dynamic_quant.py

Lines changed: 244 additions & 0 deletions

@@ -0,0 +1,244 @@
# Copyright © 2025 Apple Inc.

import argparse
import copy
import json
import math

import mlx.core as mx
import mlx.nn as nn
import numpy as np
from mlx.utils import tree_flatten, tree_map, tree_unflatten
from tqdm import tqdm

from mlx_lm.quant.utils import load_data
from mlx_lm.utils import (
    compute_bits_per_weight,
    fetch_from_hub,
    get_model_path,
    quantize_model,
    save,
)


def eval_ppl(model, data, batch_size=8):
    all_loss = 0.0
    ntoks = 0
    for s in range(0, len(data), batch_size):
        batch = data[s : s + batch_size]
        logits = model(batch[:, :-1]).astype(mx.float32)
        losses = nn.losses.cross_entropy(logits, batch[:, 1:])
        all_loss += losses.sum().item()
        ntoks += losses.size
    ppl = math.exp(all_loss / ntoks)
    return ppl

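For reference, `eval_ppl` above computes standard token-level perplexity: writing $x_t$ for the $t$-th token and $N$ for the number of scored tokens (notation added here, not in the source),

$$\mathrm{PPL} = \exp\Big(\frac{1}{N}\sum_{t=1}^{N} -\log p_\theta(x_t \mid x_{<t})\Big),$$

which is exactly the running `all_loss / ntoks` passed through `math.exp`.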
def estimate_sensitivities(
    model,
    data,
    low_bits,
    low_group_size,
    high_bits,
    high_group_size,
):
    batch_size = 4
    layers = tree_flatten(model.leaf_modules(), is_leaf=nn.Module.is_module)
    layers = {k: l for k, l in layers if hasattr(l, "to_quantized")}

    q_model = copy.deepcopy(model)

    def qdq(w, bits, group_size):
        w, s, b = mx.quantize(w, bits=bits, group_size=group_size)
        return mx.dequantize(w, scales=s, biases=b, bits=bits, group_size=group_size)

    q_layers = copy.deepcopy(layers)
    for l in q_layers.values():
        l.weight = qdq(l.weight, low_bits, low_group_size)
    q_model.freeze()
    q_model.update_modules(tree_unflatten(list(q_layers.items())))

    def log_norm(x):
        x = x.astype(mx.float32)
        return x - mx.logsumexp(x, axis=-1, keepdims=True)

    def loss_fn(batch, targets):
        logprobs = log_norm(q_model(batch))
        return nn.losses.kl_div_loss(logprobs, targets, reduction="mean")

    grad_accum = tree_map(lambda x: mx.zeros(x.shape), q_model.trainable_parameters())
    for e, s in tqdm(
        enumerate(range(0, len(data), batch_size)),
        total=len(data) // batch_size,
        desc="Estimating sensitivities",
    ):
        batch = data[s : s + batch_size]
        targets = log_norm(model(batch))
        mx.eval(targets)
        _, grads = nn.value_and_grad(q_model, loss_fn)(batch, targets)
        grad_accum = tree_map(lambda x, y: x + y, grad_accum, grads)
        mx.eval(grad_accum)

    def compute_sensitivity(gradient, low_q_weight, original_weight):
        n_batches = (len(data) + batch_size - 1) // batch_size
        gradient = gradient / n_batches
        high_q_weight = qdq(original_weight, high_bits, high_group_size)
        param_size = original_weight.size / 1e6
        alignment = (gradient * (low_q_weight - high_q_weight)).sum()
        return alignment / param_size

    sensitivities = tree_map(
        compute_sensitivity,
        grad_accum,
        q_model.parameters(),
        model.parameters(),
    )
    mx.eval(sensitivities)

    sensitivities = [(k[:-7], s.item()) for k, s in tree_flatten(sensitivities)]

    return sensitivities

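One way to read `compute_sensitivity` above (an editorial gloss with notation not in the source): let $\mathcal{L}$ be the KL divergence from the full-precision teacher averaged over the calibration batches, and let $\widetilde{W}_\ell^{\mathrm{low}}$ and $\widetilde{W}_\ell^{\mathrm{high}}$ be the low- and high-bit quantize-dequantize round trips of layer $\ell$'s weight, which has $N_\ell$ parameters. The score assigned to each layer is

$$s_\ell = \frac{\nabla_{W_\ell}\mathcal{L}\big(\widetilde{W}^{\mathrm{low}}\big)\cdot\big(\widetilde{W}_\ell^{\mathrm{low}} - \widetilde{W}_\ell^{\mathrm{high}}\big)}{N_\ell / 10^6},$$

a first-order Taylor estimate of how much the loss would drop, per million parameters, if that layer alone were promoted from the low-bit to the high-bit setting. Layers with large $s_\ell$ are the ones `estimate_threshold` below sends to the higher precision.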
def estimate_threshold(
    model,
    sensitivities,
    target_bpw,
    low_bits,
    low_group_size,
    high_bits,
    high_group_size,
):
    def predicate(p, m, high_threshold):
        if not hasattr(m, "to_quantized"):
            return False
        if sensitivities[p] > high_threshold:
            return {"bits": high_bits, "group_size": high_group_size}
        return True

    # Binary search for the threshold
    sens_vals = list(sensitivities.values())
    min_threshold = min(sens_vals)
    max_threshold = max(sens_vals)
    tolerance = 1e-3 * (max_threshold - min_threshold)
    while (max_threshold - min_threshold) > tolerance:
        mid = (max_threshold + min_threshold) / 2
        class_predicate = lambda p, m: predicate(p, m, mid)
        q_model = copy.deepcopy(model)
        nn.quantize(
            q_model,
            group_size=low_group_size,
            bits=low_bits,
            class_predicate=class_predicate,
        )
        bpw = compute_bits_per_weight(q_model)
        if bpw > target_bpw:
            min_threshold = mid
        else:
            max_threshold = mid

    return (max_threshold + min_threshold) / 2

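The binary search above relies on the BPW moving monotonically with the threshold: lowering the threshold promotes more layers to the high-bit setting. As a rough sketch of why the default parameters give the `[4.5, 5.5]` BPW range quoted in LEARNED_QUANTS.md, assume each group of `group_size` weights also stores an fp16 scale and an fp16 bias (an assumption about the affine quantization layout, not stated in this diff):

```python
def approx_bpw(bits: int, group_size: int) -> float:
    # quantized weights plus one fp16 scale and one fp16 bias per group
    return bits + 2 * 16 / group_size


print(approx_bpw(4, 64))  # 4.5: every layer at --low-bits 4
print(approx_bpw(5, 64))  # 5.5: every layer at --high-bits 5
# Any --target-bpw between the two extremes is reachable by sliding the
# sensitivity threshold, which is what the binary search does.
```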
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", "-m", default="Qwen/Qwen3-0.6B-base")
    parser.add_argument(
        "--mlx-path", default="mlx_model", help="Path to save the model"
    )
    parser.add_argument("--seed", type=int, default=123)
    parser.add_argument(
        "--sensitivities",
        type=str,
        default=None,
        help="Path to a pre-computed sensitivity JSON file.",
    )
    parser.add_argument(
        "--target-bpw", type=float, default=5.0, help="Target bits per weight."
    )
    parser.add_argument("--low-bits", type=int, default=4)
    parser.add_argument("--low-group-size", type=int, default=64)
    parser.add_argument("--high-bits", type=int, default=5)
    parser.add_argument("--high-group-size", type=int, default=64)
    parser.add_argument(
        "--report-ppl",
        action="store_true",
        help="Compute the perplexity of the base and quantized models.",
    )

    args = parser.parse_args()

    group = mx.distributed.init()

    if args.sensitivities is None:
        model_path = get_model_path(args.model, revision=None)
        model, config, tokenizer = fetch_from_hub(model_path, lazy=True)
        mx.random.seed(args.seed)
        data = load_data(tokenizer, num_samples=-1, sequence_length=512)

        sensitivities = estimate_sensitivities(
            model,
            data,
            args.low_bits,
            args.low_group_size,
            args.high_bits,
            args.high_group_size,
        )
        model_name = args.model.replace("/", "_")
        with open(f"{model_name}_sensitivities.json", "w") as fid:
            json.dump(sensitivities, fid)
    else:
        with open(args.sensitivities, "r") as fid:
            sensitivities = json.load(fid)

    sensitivities = dict(sensitivities)
    model_path = get_model_path(args.model, revision=None)
    model, config, tokenizer = fetch_from_hub(model_path, lazy=True)
    mx.random.seed(args.seed)
    data = load_data(tokenizer, num_samples=-1, sequence_length=512)

    if args.report_ppl:
        ppl = eval_ppl(model, data)
        print(f"Original PPL: {ppl:.3f}")

    threshold = estimate_threshold(
        model,
        sensitivities,
        target_bpw=args.target_bpw,
        low_bits=args.low_bits,
        low_group_size=args.low_group_size,
        high_bits=args.high_bits,
        high_group_size=args.high_group_size,
    )

    def quant_predicate(p, m, _):
        if not hasattr(m, "to_quantized"):
            return False
        if sensitivities[p] > threshold:
            return {"bits": args.high_bits, "group_size": args.high_group_size}
        return True

    model, config = quantize_model(
        model,
        config,
        q_group_size=args.low_group_size,
        q_bits=args.low_bits,
        quant_predicate=quant_predicate,
    )

    if args.report_ppl:
        ppl = eval_ppl(model, data)
        print(f"Quantized PPL: {ppl:.3f}")

    save(
        args.mlx_path,
        model_path,
        model,
        tokenizer,
        config,
        hf_repo=args.model,
    )


if __name__ == "__main__":
    main()

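Once saved, the mixed-precision model loads like any other mlx-lm model. A usage sketch (prompt and token count are illustrative):

```python
from mlx_lm import load, generate

model, tokenizer = load("mlx_model")  # the directory passed as --mlx-path
print(generate(model, tokenizer, prompt="Hello", max_tokens=32))
```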
mlx_lm/quant/utils.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
# Copyright © 2025 Apple Inc.

from pathlib import Path

import mlx.core as mx


def load_data(tokenizer, num_samples: int, sequence_length: int) -> mx.array:
    save_dir = Path.home() / ".cache/mlx-lm/calibration_v5.txt"
    if not save_dir.exists():
        from urllib import request

        save_dir.parent.mkdir(parents=True, exist_ok=True)
        url = "https://gist.githubusercontent.com/tristandruyen/9e207a95c7d75ddf37525d353e00659c/raw/571fda718462de863e5a0171078c175420c7649a/calibration_data_v5_rc.txt"
        request.urlretrieve(url, save_dir)
    with open(save_dir) as fid:
        texts = fid.read()
    tokens = tokenizer.encode(texts, return_tensors="mlx")[0]

    # select random non-overlapping chunks
    tokens = tokens[: (tokens.size // sequence_length) * sequence_length]
    tokens = tokens.reshape(-1, sequence_length)
    segments = mx.random.permutation(tokens.shape[0])
    if num_samples > 0:
        segments = segments[:num_samples]
    return tokens[segments]

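A hypothetical usage sketch of `load_data` (the tokenizer is assumed to be the one returned by `fetch_from_hub`, as in the scripts above):

```python
# 32 random, non-overlapping 512-token calibration sequences; passing
# num_samples=-1 keeps every available chunk, as dynamic_quant.py does.
data = load_data(tokenizer, num_samples=32, sequence_length=512)
print(data.shape)  # (32, 512)
```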