@@ -1,9 +1,10 @@
+from pathlib import Path
+
 import torch
 from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot, train
-from llmcompressor.utils import dispatch_for_generation
 
 # load the model in as bfloat16 to save on memory and compute
 model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
@@ -18,6 +19,7 @@
 
 # save location of quantized model
 output_dir = "output_llama7b_2of4_w4a16_channel"
+output_path = Path(output_dir)
 
 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
@@ -63,25 +65,26 @@
 # ./output_llama7b_2of4_w4a16_channel/ + (finetuning/sparsity/quantization)_stage
 
 # Oneshot sparsification
-oneshot_applied_model = oneshot(
+
+oneshot(
     model=model,
     **oneshot_kwargs,
+    output_dir=output_dir,
     stage="sparsity_stage",
 )
 
 # Sparse finetune
-dispatch_for_generation(model)
-finetune_applied_model = train(
-    model=oneshot_applied_model,
+train(
+    model=(output_path / "sparsity_stage"),
     **oneshot_kwargs,
     **training_kwargs,
+    output_dir=output_dir,
     stage="finetuning_stage",
 )
 
 # Oneshot quantization
-model.to("cpu")
 quantized_model = oneshot(
-    model=finetune_applied_model,
+    model=(output_path / "finetuning_stage"),
     **oneshot_kwargs,
     stage="quantization_stage",
 )
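
With this change, each stage writes its checkpoint into a stage-suffixed subdirectory of output_dir and the next stage loads that checkpoint from disk instead of reusing the in-memory model object. A minimal sketch of the resulting layout, assuming the <output_dir>/<stage_name> convention from the comment above (the variable names below are illustrative, not part of the example):

from pathlib import Path

output_dir = "output_llama7b_2of4_w4a16_channel"
output_path = Path(output_dir)

# One checkpoint directory per pipeline stage:
sparsity_ckpt = output_path / "sparsity_stage"          # written by the first oneshot() call
finetuning_ckpt = output_path / "finetuning_stage"      # written by train(), which loads sparsity_ckpt
quantization_ckpt = output_path / "quantization_stage"  # final 2:4-sparse, W4A16 checkpoint

# Later stages take the previous stage's path as their model argument,
# e.g. train(model=sparsity_ckpt, ...) as in the diff above.
for ckpt in (sparsity_ckpt, finetuning_ckpt, quantization_ckpt):
    print(ckpt)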