From 718afcb8567ae31f11fce8da50d775d4cc248617 Mon Sep 17 00:00:00 2001
From: Jonathan
Date: Fri, 18 Jul 2025 19:51:37 +0300
Subject: [PATCH 1/3] Update README for quantization example #1659

Signed-off-by: Jonathan
---
 .../quantization_2of4_sparse_w4a16/README.md | 111 ++++++++++++++++--
 1 file changed, 98 insertions(+), 13 deletions(-)

diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
index 51e04dd98..522c0e5c3 100644
--- a/examples/quantization_2of4_sparse_w4a16/README.md
+++ b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -45,37 +45,99 @@ It contains instructions to prune the model to 2:4 sparsity, run one epoch of re
 and quantize to 4 bits in one show using GPTQ.
 
 ```python
+from pathlib import Path
+
 import torch
-from transformers import AutoModelForCausalLM
+from loguru import logger
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot, train
 
+# load the model in as bfloat16 to save on memory and compute
 model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
 model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained(model_stub)
 
+# uses LLM Compressor's built-in preprocessing for ultra chat
 dataset = "ultrachat-200k"
-splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
 
+# Select the recipe for 2 of 4 sparsity and 4-bit weight quantization
 recipe = "2of4_w4a16_recipe.yaml"
+
+# save location of quantized model
+output_dir = "output_llama7b_2of4_w4a16_channel"
+output_path = Path(output_dir)
+
+# set dataset config parameters
+splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
+max_seq_length = 512
+num_calibration_samples = 512
+
+# set training parameters for finetuning
+num_train_epochs = 0.01
+logging_steps = 500
+save_steps = 5000
+gradient_checkpointing = True  # saves memory during training
+learning_rate = 0.0001
+bf16 = False  # using full precision for training
+lr_scheduler_type = "cosine"
+warmup_ratio = 0.1
+preprocessing_num_workers = 64
+
+oneshot_kwargs = dict(
+    dataset=dataset,
+    recipe=recipe,
+    num_calibration_samples=num_calibration_samples,
+    preprocessing_num_workers=preprocessing_num_workers,
+    splits=splits,
+)
+
+training_kwargs = dict(
+    bf16=bf16,
+    max_seq_length=max_seq_length,
+    num_train_epochs=num_train_epochs,
+    logging_steps=logging_steps,
+    save_steps=save_steps,
+    gradient_checkpointing=gradient_checkpointing,
+    learning_rate=learning_rate,
+    lr_scheduler_type=lr_scheduler_type,
+    warmup_ratio=warmup_ratio,
+)
 ```
 
-## Step 2: Run sparsification using `apply`
-The `apply` function applies the given recipe to our model and dataset.
-The hardcoded kwargs may be altered based on each model's needs.
-After running, the sparsified model will be saved to `output_llama7b_2of4_w4a16_channel`.
+## Step 2: Run `sparsification`, `fine-tuning`, and `quantization`
+The compression process now runs in three stages: sparsification, fine-tuning, and quantization.
+Each stage saves the intermediate model outputs to the `output_llama7b_2of4_w4a16_channel` directory.
 
 ```python
-from llmcompressor.transformers import apply
+from llmcompressor import oneshot, train
+from pathlib import Path
 
 output_dir = "output_llama7b_2of4_w4a16_channel"
+output_path = Path(output_dir)
 
-apply(
+# 1. Oneshot sparsification: apply pruning
+oneshot(
     model=model,
     dataset=dataset,
     recipe=recipe,
-    bf16=False,  # use full precision for training
+    splits=splits,
+    num_calibration_samples=512,
+    preprocessing_num_workers=8,
     output_dir=output_dir,
+    stage="sparsity_stage",
+)
+
+# 2. Sparse fine-tuning: improve accuracy on pruned model
+train(
+    model=output_path / "sparsity_stage",
+    dataset=dataset,
+    recipe=recipe,
     splits=splits,
-    max_seq_length=512,
     num_calibration_samples=512,
+    preprocessing_num_workers=8,
+    bf16=False,
+    max_seq_length=512,
     num_train_epochs=0.5,
     logging_steps=500,
     save_steps=5000,
@@ -83,11 +145,34 @@ apply(
     learning_rate=0.0001,
     lr_scheduler_type="cosine",
     warmup_ratio=0.1,
+    output_dir=output_dir,
+    stage="finetuning_stage",
+)
+
+# 3. Oneshot quantization: compress model weights to lower precision
+quantized_model = oneshot(
+    model=output_path / "finetuning_stage",
+    dataset=dataset,
+    recipe=recipe,
+    splits=splits,
+    num_calibration_samples=512,
+    preprocessing_num_workers=8,
+    output_dir=output_dir,
+    stage="quantization_stage",
+)
+quantized_model.save_pretrained(
+    f"{output_dir}/quantization_stage", skip_sparsity_compression_stats=False
 )
+tokenizer.save_pretrained(f"{output_dir}/quantization_stage")
 ```
 
 ### Custom Quantization
-The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`.
-The above recipe (`2of4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group.
-To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2of4_w4a16_group-128_recipe.yaml`.
+The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are tensor, group, and channel.
+
+The recipe `(2of4_w4a16_recipe.yaml)` uses channel-wise quantization `(strategy: "channel")`.
+To change the quantization strategy, edit the recipe file accordingly:
+
+Use `tensor` for per-tensor quantization
+Use `group` for group-wise quantization and specify the group_size parameter (e.g., 128)
+See `2of4_w4a16_group-128_recipe.yaml` for a group-size example
 

From 1ee8f2e55d52c126f133db8f0bbe31684c59bde7 Mon Sep 17 00:00:00 2001
From: jonnyelisha
Date: Mon, 21 Jul 2025 19:59:47 +0300
Subject: [PATCH 2/3] Update README.md

Co-authored-by: Brian Dellabetta
---
 examples/quantization_2of4_sparse_w4a16/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
index 522c0e5c3..8623e1a67 100644
--- a/examples/quantization_2of4_sparse_w4a16/README.md
+++ b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -170,7 +170,7 @@ tokenizer.save_pretrained(f"{output_dir}/quantization_stage")
 ### Custom Quantization
 The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are tensor, group, and channel.
 
-The recipe `(2of4_w4a16_recipe.yaml)` uses channel-wise quantization `(strategy: "channel")`.
+The recipe (`2of4_w4a16_recipe.yaml`) uses channel-wise quantization (`strategy: "channel"`).
 To change the quantization strategy, edit the recipe file accordingly:
 
 Use `tensor` for per-tensor quantization

From 35f0da6f06cf1238d20d9589bdf15cf6aff73310 Mon Sep 17 00:00:00 2001
From: jonnyelisha
Date: Mon, 21 Jul 2025 20:13:51 +0300
Subject: [PATCH 3/3] Update examples/quantization_2of4_sparse_w4a16/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../quantization_2of4_sparse_w4a16/README.md | 24 ++-----------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
index 8623e1a67..c1e34c280 100644
--- a/examples/quantization_2of4_sparse_w4a16/README.md
+++ b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -74,7 +74,7 @@ max_seq_length = 512
 num_calibration_samples = 512
 
 # set training parameters for finetuning
-num_train_epochs = 0.01
+num_train_epochs = 0.5
 logging_steps = 500
 save_steps = 5000
 gradient_checkpointing = True  # saves memory during training
@@ -82,27 +82,7 @@ learning_rate = 0.0001
 bf16 = False  # using full precision for training
 lr_scheduler_type = "cosine"
 warmup_ratio = 0.1
-preprocessing_num_workers = 64
-
-oneshot_kwargs = dict(
-    dataset=dataset,
-    recipe=recipe,
-    num_calibration_samples=num_calibration_samples,
-    preprocessing_num_workers=preprocessing_num_workers,
-    splits=splits,
-)
-
-training_kwargs = dict(
-    bf16=bf16,
-    max_seq_length=max_seq_length,
-    num_train_epochs=num_train_epochs,
-    logging_steps=logging_steps,
-    save_steps=save_steps,
-    gradient_checkpointing=gradient_checkpointing,
-    learning_rate=learning_rate,
-    lr_scheduler_type=lr_scheduler_type,
-    warmup_ratio=warmup_ratio,
-)
+preprocessing_num_workers = 8
 ```
 
 ## Step 2: Run `sparsification`, `fine-tuning`, and `quantization`
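
The Custom Quantization note in the README above tells the reader to switch the recipe's strategy from `channel` to `group` but does not show the edited block. The fragment below is a minimal sketch of what that change might look like, assuming the group-size recipe keeps the same `config_groups` layout as the channel-wise `2of4_w4a16_recipe.yaml`; the `2of4_w4a16_group-128_recipe.yaml` file shipped with the example is the authoritative version.

```yaml
# Illustrative fragment only, not a drop-in recipe file. The surrounding
# stage/modifier structure is assumed to match 2of4_w4a16_recipe.yaml;
# only the weight-quantization strategy and group size change.
config_groups:
  group_0:
    targets: ["Linear"]
    weights:
      num_bits: 4
      type: "int"
      symmetric: true
      strategy: "group"   # "channel" in the channel-wise recipe
      group_size: 128     # required when strategy is "group"
```

Smaller group sizes track the weight distribution more closely at the cost of storing more scales and zero points; 128 is the commonly used default.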