diff --git a/_static/img/bert.png b/_static/img/bert.png deleted file mode 100644 index 6e23a8acfd3..00000000000 Binary files a/_static/img/bert.png and /dev/null differ diff --git a/_static/img/compare_output.png b/_static/img/compare_output.png deleted file mode 100644 index 4ece4d11483..00000000000 Binary files a/_static/img/compare_output.png and /dev/null differ diff --git a/_static/img/compare_stub.png b/_static/img/compare_stub.png deleted file mode 100644 index 8140a99b182..00000000000 Binary files a/_static/img/compare_stub.png and /dev/null differ diff --git a/_static/img/pt2e_quant_xpu_inductor.png b/_static/img/pt2e_quant_xpu_inductor.png deleted file mode 100644 index 2fc7e4ae7bf..00000000000 Binary files a/_static/img/pt2e_quant_xpu_inductor.png and /dev/null differ diff --git a/_static/img/quant_asym.png b/_static/img/quant_asym.png deleted file mode 100644 index 9dc43817a59..00000000000 Binary files a/_static/img/quant_asym.png and /dev/null differ diff --git a/_static/img/quantized_transfer_learning.png b/_static/img/quantized_transfer_learning.png deleted file mode 100644 index c138cbdb0c1..00000000000 Binary files a/_static/img/quantized_transfer_learning.png and /dev/null differ diff --git a/_static/img/shadow.png b/_static/img/shadow.png deleted file mode 100644 index e09d0b87f01..00000000000 Binary files a/_static/img/shadow.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png b/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png deleted file mode 100644 index 34bbf8c7bdf..00000000000 Binary files a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png b/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png deleted file mode 100644 index 986efaa3f88..00000000000 Binary files a/_static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/mobile.png b/_static/img/thumbnails/cropped/mobile.png deleted file mode 100644 index 12dc917519c..00000000000 Binary files a/_static/img/thumbnails/cropped/mobile.png and /dev/null differ diff --git a/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png b/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png deleted file mode 100644 index 6ce22e4862a..00000000000 Binary files a/_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png and /dev/null differ diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py deleted file mode 100644 index c5b7d70a046..00000000000 --- a/advanced_source/dynamic_quantization_tutorial.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -(beta) Dynamic Quantization on an LSTM Word Language Model -================================================================== - -**Author**: `James Reed `_ - -**Edited by**: `Seth Weidman `_ - -Introduction ------------- - -Quantization involves converting the weights and activations of your model from float -to int, which can result in smaller model size and faster inference with only a small -hit to accuracy. 
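To make the float-to-int mapping concrete, the short sketch below (illustrative only; the scale and zero-point are arbitrary values, not ones produced by this tutorial) shows PyTorch's per-tensor affine quantization of a tensor to ``int8``:

.. code-block:: python

   import torch

   x = torch.randn(4, 4)                  # float32 values
   scale, zero_point = 0.1, 0             # arbitrary parameters, for illustration
   xq = torch.quantize_per_tensor(x, scale, zero_point, dtype=torch.qint8)
   print(xq.int_repr())                   # the underlying int8 storage
   print(xq.dequantize())                 # approximate float reconstruction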
- -In this tutorial, we will apply the easiest form of quantization - -`dynamic quantization `_ - -to an LSTM-based next word-prediction model, closely following the -`word language model `_ -from the PyTorch examples. -""" - -# imports -import os -from io import open -import time - -import torch -import torch.nn as nn -import torch.nn.functional as F - -###################################################################### -# 1. Define the model -# ------------------- -# -# Here we define the LSTM model architecture, following the -# `model `_ -# from the word language model example. - -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - -###################################################################### -# 2. Load in the text data -# ------------------------ -# -# Next, we load the -# `Wikitext-2 dataset `_ into a `Corpus`, -# again following the -# `preprocessing `_ -# from the word language model example. - -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'train.txt')) - self.valid = self.tokenize(os.path.join(path, 'valid.txt')) - self.test = self.tokenize(os.path.join(path, 'test.txt')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - ids = torch.cat(idss) - - return ids - -model_data_filepath = 'data/' - -corpus = Corpus(model_data_filepath + 'wikitext-2') - -###################################################################### -# 3. Load the pretrained model -# ----------------------------- -# -# This is a tutorial on dynamic quantization, a quantization technique -# that is applied after a model has been trained. 
Therefore, we'll simply -# load some pretrained weights into this model architecture; these -# weights were obtained by training for five epochs using the default -# settings in the word language model example. -# -# Before running this tutorial, download the required pre-trained model: -# -# .. code-block:: bash -# -# wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -# -# Place the downloaded file in the data directory or update the model_data_filepath accordingly. - -ntokens = len(corpus.dictionary) - -model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -model.load_state_dict( - torch.load( - model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu'), - weights_only=True - ) - ) - -model.eval() -print(model) - -###################################################################### -# Now let's generate some text to ensure that the pretrained model is working -# properly - similarly to before, we follow -# `here `_ - -input_ = torch.randint(ntokens, (1, 1), dtype=torch.long) -hidden = model.init_hidden(1) -temperature = 1.0 -num_words = 1000 - -with open(model_data_filepath + 'out.txt', 'w') as outf: - with torch.no_grad(): # no tracking history - for i in range(num_words): - output, hidden = model(input_, hidden) - word_weights = output.squeeze().div(temperature).exp().cpu() - word_idx = torch.multinomial(word_weights, 1)[0] - input_.fill_(word_idx) - - word = corpus.dictionary.idx2word[word_idx] - - outf.write(str(word.encode('utf-8')) + ('\n' if i % 20 == 19 else ' ')) - - if i % 100 == 0: - print('| Generated {}/{} words'.format(i, 1000)) - -with open(model_data_filepath + 'out.txt', 'r') as outf: - all_output = outf.read() - print(all_output) - -###################################################################### -# It's no GPT-2, but it looks like the model has started to learn the structure of -# language! -# -# We're almost ready to demonstrate dynamic quantization. We just need to define a few more -# helper functions: - -bptt = 25 -criterion = nn.CrossEntropyLoss() -eval_batch_size = 1 - -# create test data set -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the ``bsz`` batches. - return data.view(bsz, -1).t().contiguous() - -test_data = batchify(corpus.test, eval_batch_size) - -# Evaluation functions -def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].reshape(-1) - return data, target - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - -def evaluate(model_, data_source): - # Turn on evaluation mode which disables dropout. - model_.eval() - total_loss = 0. 
- hidden = model_.init_hidden(eval_batch_size) - with torch.no_grad(): - for i in range(0, data_source.size(0) - 1, bptt): - data, targets = get_batch(data_source, i) - output, hidden = model_(data, hidden) - hidden = repackage_hidden(hidden) - output_flat = output.view(-1, ntokens) - total_loss += len(data) * criterion(output_flat, targets).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# 4. Test dynamic quantization -# ---------------------------- -# -# Finally, we can call ``torch.quantization.quantize_dynamic`` on the model! -# Specifically, -# -# - We specify that we want the ``nn.LSTM`` and ``nn.Linear`` modules in our -# model to be quantized -# - We specify that we want weights to be converted to ``int8`` values - -import torch.quantization - -quantized_model = torch.quantization.quantize_dynamic( - model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) -print(quantized_model) - -###################################################################### -# The model looks the same; how has this benefited us? First, we see a -# significant reduction in model size: - -def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -print_size_of_model(model) -print_size_of_model(quantized_model) - -###################################################################### -# Second, we see faster inference time, with no difference in evaluation loss: -# -# Note: we set the number of threads to one for single threaded comparison, since quantized -# models run single threaded. - -torch.set_num_threads(1) - -def time_model_evaluation(model, test_data): - s = time.time() - loss = evaluate(model, test_data) - elapsed = time.time() - s - print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed)) - -time_model_evaluation(model, test_data) -time_model_evaluation(quantized_model, test_data) - -###################################################################### -# Running this locally on a MacBook Pro, without quantization, inference takes about 200 seconds, -# and with quantization it takes just about 100 seconds. -# -# Conclusion -# ---------- -# -# Dynamic quantization can be an easy way to reduce model size while only -# having a limited effect on accuracy. -# -# Thanks for reading! As always, we welcome any feedback, so please create an issue -# `here `_ if you have any. diff --git a/index.rst b/index.rst index 8ce8b249f84..18180ea824a 100644 --- a/index.rst +++ b/index.rst @@ -5,7 +5,6 @@ Welcome to PyTorch Tutorials * `Utilizing Torch Function modes with torch.compile `__ * `Context Parallel Tutorial `__ -* `PyTorch 2 Export Quantization with Intel GPU Backend through Inductor `__ * `(beta) Explicit horizontal fusion with foreach_map and torch.compile `__ * Updated `Inductor Windows CPU Tutorial `__ @@ -577,34 +576,6 @@ Welcome to PyTorch Tutorials :link: advanced/semi_structured_sparse.html :tags: Text,Model-Optimization -.. customcarditem:: - :header: (beta) Dynamic Quantization on an LSTM Word Language Model - :card_description: Apply dynamic quantization, the easiest form of quantization, to a LSTM-based next word prediction model. - :image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png - :link: advanced/dynamic_quantization_tutorial.html - :tags: Text,Quantization,Model-Optimization - -.. 
customcarditem:: - :header: (beta) Dynamic Quantization on BERT - :card_description: Apply the dynamic quantization on a BERT (Bidirectional Embedding Representations from Transformers) model. - :image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png - :link: intermediate/dynamic_quantization_bert_tutorial.html - :tags: Text,Quantization,Model-Optimization - -.. customcarditem:: - :header: (beta) Quantized Transfer Learning for Computer Vision Tutorial - :card_description: Extends the Transfer Learning for Computer Vision Tutorial using a quantized model. - :image: _static/img/thumbnails/cropped/60-min-blitz.png - :link: intermediate/quantized_transfer_learning_tutorial.html - :tags: Image/Video,Quantization,Model-Optimization - -.. customcarditem:: - :header: (beta) Static Quantization with Eager Mode in PyTorch - :card_description: This tutorial shows how to do post-training static quantization. - :image: _static/img/thumbnails/cropped/60-min-blitz.png - :link: advanced/static_quantization_tutorial.html - :tags: Quantization - .. customcarditem:: :header: Multi-Objective Neural Architecture Search with Ax :card_description: Learn how to use Ax to search over architectures find optimal tradeoffs between accuracy and latency. @@ -1035,10 +1006,6 @@ Additional Resources beginner/hyperparameter_tuning_tutorial intermediate/parametrizations intermediate/pruning_tutorial - advanced/dynamic_quantization_tutorial - intermediate/dynamic_quantization_bert_tutorial - intermediate/quantized_transfer_learning_tutorial - advanced/static_quantization_tutorial intermediate/nvfuser_intro_tutorial intermediate/ax_multiobjective_nas_tutorial intermediate/torch_compile_tutorial diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst deleted file mode 100644 index 786ef11f3b2..00000000000 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ /dev/null @@ -1,568 +0,0 @@ -(beta) Dynamic Quantization on BERT -=========================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. This will allow you to experiment with the information presented below. - -**Author**: `Jianyu Huang `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - - -Introduction ------------- - - -In this tutorial, we will apply the dynamic quantization on a BERT -model, closely following the BERT model from `the HuggingFace -Transformers examples `_. -With this step-by-step journey, we would like to demonstrate how to -convert a well-known state-of-the-art model like BERT into dynamic -quantized model. - -- BERT, or Bidirectional Embedding Representations from Transformers, - is a new method of pre-training language representations which - achieves the state-of-the-art accuracy results on many popular - Natural Language Processing (NLP) tasks, such as question answering, - text classification, and others. The original paper can be found - `here `_. - -- Dynamic quantization support in PyTorch converts a float model to a - quantized model with static int8 or float16 data types for the - weights and dynamic quantization for the activations. The activations - are quantized dynamically (per batch) to int8 when the weights are - quantized to int8. In PyTorch, we have `torch.quantization.quantize_dynamic API - `_, - which replaces specified modules with dynamic weight-only quantized - versions and output the quantized model. 
- -- We demonstrate the accuracy and inference performance results on the - `Microsoft Research Paraphrase Corpus (MRPC) task `_ - in the General Language Understanding Evaluation benchmark `(GLUE) - `_. The MRPC (Dolan and Brockett, 2005) is - a corpus of sentence pairs automatically extracted from online news - sources, with human annotations of whether the sentences in the pair - are semantically equivalent. As the classes are imbalanced (68% - positive, 32% negative), we follow the common practice and report - `F1 score `_. - MRPC is a common NLP task for language pair classification, as shown - below. - -.. image:: /_static/img/bert.png - - -1. Setup --------- - -1.1 Install PyTorch and HuggingFace Transformers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To start this tutorial, let’s first follow the installation instructions -in PyTorch `here `_ and HuggingFace Github Repo `here `_. -In addition, we also install `scikit-learn `_ package, as we will reuse its -built-in F1 score calculation helper function. - -.. code:: shell - - pip install sklearn - pip install transformers==4.29.2 - - -Because we will be using the beta parts of the PyTorch, it is -recommended to install the latest version of torch and torchvision. You -can find the most recent instructions on local installation `here -`_. For example, to install on -Mac: - -.. code:: shell - - yes y | pip uninstall torch torchvision - yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - - - -1.2 Import the necessary modules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In this step we import the necessary Python modules for the tutorial. - -.. code:: python - - import logging - import numpy as np - import os - import random - import sys - import time - import torch - - from argparse import Namespace - from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - from tqdm import tqdm - from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) - from transformers import glue_compute_metrics as compute_metrics - from transformers import glue_output_modes as output_modes - from transformers import glue_processors as processors - from transformers import glue_convert_examples_to_features as convert_examples_to_features - - # Setup logging - logger = logging.getLogger(__name__) - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) - - logging.getLogger("transformers.modeling_utils").setLevel( - logging.WARN) # Reduce logging - - print(torch.__version__) - -We set the number of threads to compare the single thread performance between FP32 and INT8 performance. -In the end of the tutorial, the user can set other number of threads by building PyTorch with right parallel backend. - -.. code:: python - - torch.set_num_threads(1) - print(torch.__config__.parallel_info()) - - -1.3 Learn about helper functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The helper functions are built-in in transformers library. We mainly use -the following helper functions: one for converting the text examples -into the feature vectors; The other one for measuring the F1 score of -the predicted result. 
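(As a point of reference, the F1 metric reported by ``glue_compute_metrics`` can also be computed directly with scikit-learn; the labels below are made up for illustration and are not part of the tutorial's evaluation pipeline.)

.. code:: python

    from sklearn.metrics import f1_score

    # Hypothetical gold labels and predictions, for illustration only
    y_true = [1, 0, 1, 1, 0, 1]
    y_pred = [1, 0, 0, 1, 0, 1]
    print(f1_score(y_true, y_pred))  # 2 * (precision * recall) / (precision + recall)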
- -The `glue_convert_examples_to_features `_ function converts the texts into input features: - -- Tokenize the input sequences; -- Insert [CLS] in the beginning; -- Insert [SEP] between the first sentence and the second sentence, and - in the end; -- Generate token type ids to indicate whether a token belongs to the - first sequence or the second sequence. - -The `glue_compute_metrics `_ function has the compute metrics with -the `F1 score `_, which -can be interpreted as a weighted average of the precision and recall, -where an F1 score reaches its best value at 1 and worst score at 0. The -relative contribution of precision and recall to the F1 score are equal. - -- The equation for the F1 score is: -.. math:: F1 = 2 * (\text{precision} * \text{recall}) / (\text{precision} + \text{recall}) - - -1.4 Download the dataset -^^^^^^^^^^^^^^^^^^^^^^^^ - -Before running MRPC tasks we download the `GLUE data -`_ by running `this script -`_ -and unpack it to a directory ``glue_data``. - - -.. code:: shell - - python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' - - -2. Fine-tune the BERT model ---------------------------- - -The spirit of BERT is to pre-train the language representations and then -to fine-tune the deep bi-directional representations on a wide range of -tasks with minimal task-dependent parameters, and achieves -state-of-the-art results. In this tutorial, we will focus on fine-tuning -with the pre-trained BERT model to classify semantically equivalent -sentence pairs on MRPC task. - -To fine-tune the pre-trained BERT model (``bert-base-uncased`` model in -HuggingFace transformers) for the MRPC task, you can follow the command -in `examples `_: - -.. code:: python - - export GLUE_DIR=./glue_data - export TASK_NAME=MRPC - export OUT_DIR=./$TASK_NAME/ - python ./run_glue.py \ - --model_type bert \ - --model_name_or_path bert-base-uncased \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --max_seq_length 128 \ - --per_gpu_eval_batch_size=8 \ - --per_gpu_train_batch_size=8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --save_steps 100000 \ - --output_dir $OUT_DIR - -We provide the fine-tuned BERT model for MRPC task `here `_. -To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``. - -2.1 Set global configurations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Here we set the global configurations for evaluating the fine-tuned BERT -model before and after the dynamic quantization. - -.. code:: python - - configs = Namespace() - - # The output directory for the fine-tuned model, $OUT_DIR. - configs.output_dir = "./MRPC/" - - # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. - configs.data_dir = "./glue_data/MRPC" - - # The model name or path for the pre-trained model. - configs.model_name_or_path = "bert-base-uncased" - # The maximum length of an input sequence - configs.max_seq_length = 128 - - # Prepare GLUE task. - configs.task_name = "MRPC".lower() - configs.processor = processors[configs.task_name]() - configs.output_mode = output_modes[configs.task_name] - configs.label_list = configs.processor.get_labels() - configs.model_type = "bert".lower() - configs.do_lower_case = True - - # Set the device, batch size, topology, and caching flags. - configs.device = "cpu" - configs.per_gpu_eval_batch_size = 8 - configs.n_gpu = 0 - configs.local_rank = -1 - configs.overwrite_cache = False - - - # Set random seed for reproducibility. 
- def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - set_seed(42) - - - -2.2 Load the fine-tuned BERT model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We load the tokenizer and fine-tuned BERT sequence classifier model -(FP32) from the ``configs.output_dir``. - -.. code:: python - - tokenizer = BertTokenizer.from_pretrained( - configs.output_dir, do_lower_case=configs.do_lower_case) - - model = BertForSequenceClassification.from_pretrained(configs.output_dir) - model.to(configs.device) - - -2.3 Define the tokenize and evaluation function -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We reuse the tokenize and evaluation function from `HuggingFace `_. - -.. code:: python - - # coding=utf-8 - # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Eval! 
- logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - - def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only 
the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - - -3. Apply the dynamic quantization ---------------------------------- - -We call ``torch.quantization.quantize_dynamic`` on the model to apply -the dynamic quantization on the HuggingFace BERT model. Specifically, - -- We specify that we want the torch.nn.Linear modules in our model to - be quantized; -- We specify that we want weights to be converted to quantized int8 - values. - -.. code:: python - - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - - -3.1 Check the model size -^^^^^^^^^^^^^^^^^^^^^^^^ - -Let’s first check the model size. We can observe a significant reduction -in model size (FP32 total size: 438 MB; INT8 total size: 181 MB): - -.. code:: python - - def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - - print_size_of_model(model) - print_size_of_model(quantized_model) - - -The BERT model used in this tutorial (``bert-base-uncased``) has a -vocabulary size V of 30522. With the embedding size of 768, the total -size of the word embedding table is ~ 4 (Bytes/FP32) \* 30522 \* 768 = -90 MB. So with the help of quantization, the model size of the -non-embedding table part is reduced from 350 MB (FP32 model) to 90 MB -(INT8 model). - - -3.2 Evaluate the inference accuracy and time -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Next, let’s compare the inference time as well as the evaluation -accuracy between the original FP32 model and the INT8 model after the -dynamic quantization. - -.. code:: python - - def time_model_evaluation(model, configs, tokenizer): - eval_start_time = time.time() - result = evaluate(configs, model, tokenizer, prefix="") - eval_end_time = time.time() - eval_duration_time = eval_end_time - eval_start_time - print(result) - print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time)) - - # Evaluate the original FP32 BERT model - time_model_evaluation(model, configs, tokenizer) - - # Evaluate the INT8 BERT model after the dynamic quantization - time_model_evaluation(quantized_model, configs, tokenizer) - - -Running this locally on a MacBook Pro, without quantization, inference -(for all 408 examples in MRPC dataset) takes about 160 seconds, and with -quantization it takes just about 90 seconds. We summarize the results -for running the quantized BERT model inference on a Macbook Pro as the -follows: - -.. code:: - - | Prec | F1 score | Model Size | 1 thread | 4 threads | - | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec | - | INT8 | 0.902 | 181 MB | 90 sec | 46 sec | - -We have 0.6% lower F1 score accuracy after applying the post-training dynamic -quantization on the fine-tuned BERT model on the MRPC task. 
As a -comparison, in a `recent paper `_ (Table 1), -it achieved 0.8788 by -applying the post-training dynamic quantization and 0.8956 by applying -the quantization-aware training. The main difference is that we support the -asymmetric quantization in PyTorch while that paper supports the -symmetric quantization only. - -Note that we set the number of threads to 1 for the single-thread -comparison in this tutorial. We also support the intra-op -parallelization for these quantized INT8 operators. The users can now -set multi-thread by ``torch.set_num_threads(N)`` (``N`` is the number of -intra-op parallelization threads). One preliminary requirement to enable -the intra-op parallelization support is to build PyTorch with the right -`backend `_ -such as OpenMP, Native or TBB. -You can use ``torch.__config__.parallel_info()`` to check the -parallelization settings. On the same MacBook Pro using PyTorch with -Native backend for parallelization, we can get about 46 seconds for -processing the evaluation of MRPC dataset. - - -3.3 Serialize the quantized model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We can serialize and save the quantized model for the future use using -`torch.jit.save` after tracing the model. - -.. code:: python - - def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') - - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - dummy_input = (input_ids, attention_mask, token_type_ids) - traced_model = torch.jit.trace(quantized_model, dummy_input) - torch.jit.save(traced_model, "bert_traced_eager_quant.pt") - -To load the quantized model, we can use `torch.jit.load` - -.. code:: python - - loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt") - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a -well-known state-of-the-art NLP model like BERT into dynamic quantized -model. Dynamic quantization can reduce the size of the model while only -having a limited implication on accuracy. - -Thanks for reading! As always, we welcome any feedback, so please create -an issue `here `_ if you have -any. - - - -References ------------ - -[1] J.Devlin, M. Chang, K. Lee and K. Toutanova, `BERT: Pre-training of -Deep Bidirectional Transformers for Language Understanding (2018) -`_. - -[2] `HuggingFace Transformers `_. - -[3] O. Zafrir, G. Boudoukh, P. Izsak, and M. Wasserblat (2019). `Q8BERT: -Quantized 8bit BERT `_. diff --git a/intermediate_source/quantized_transfer_learning_tutorial.rst b/intermediate_source/quantized_transfer_learning_tutorial.rst deleted file mode 100644 index 9ba5e92d197..00000000000 --- a/intermediate_source/quantized_transfer_learning_tutorial.rst +++ /dev/null @@ -1,516 +0,0 @@ -(beta) Quantized Transfer Learning for Computer Vision Tutorial -======================================================================== - -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. - This will allow you to experiment with the information presented below. - -**Author**: `Zafar Takhirov `_ - -**Reviewed by**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Jessica Lin `_ - -This tutorial builds on the original `PyTorch Transfer Learning `_ -tutorial, written by `Sasank Chilamkurthy `_. - -Transfer learning refers to techniques that make use of a pretrained model for -application on a different data-set. 
-There are two main ways the transfer learning is used: - -1. **ConvNet as a fixed feature extractor**: Here, you `“freeze” `_ - the weights of all the parameters in the network except that of the final - several layers (aka “the head”, usually fully connected layers). - These last layers are replaced with new ones initialized with random - weights and only these layers are trained. -2. **Finetuning the ConvNet**: Instead of random initializaion, the model is - initialized using a pretrained network, after which the training proceeds as - usual but with a different dataset. - Usually the head (or part of it) is also replaced in the network in - case there is a different number of outputs. - It is common in this method to set the learning rate to a smaller number. - This is done because the network is already trained, and only minor changes - are required to "finetune" it to a new dataset. - -You can also combine the above two methods: -First you can freeze the feature extractor, and train the head. After -that, you can unfreeze the feature extractor (or part of it), set the -learning rate to something smaller, and continue training. - -In this part you will use the first method – extracting the features -using a quantized model. - - -Part 0. Prerequisites ---------------------- - -Before diving into the transfer learning, let us review the "prerequisites", -such as installations and data loading/visualizations. - -.. code:: python - - # Imports - import copy - import matplotlib.pyplot as plt - import numpy as np - import os - import time - - plt.ion() - -Installing the Nightly Build -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Because you will be using the beta parts of the PyTorch, it is -recommended to install the latest version of ``torch`` and -``torchvision``. You can find the most recent instructions on local -installation `here `_. -For example, to install without GPU support: - -.. code:: shell - - pip install numpy - pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # For CUDA support use https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html - - -Load Data -~~~~~~~~~ - -.. note :: This section is identical to the original transfer learning tutorial. -We will use ``torchvision`` and ``torch.utils.data`` packages to load -the data. - -The problem you are going to solve today is classifying **ants** and -**bees** from images. The dataset contains about 120 training images -each for ants and bees. There are 75 validation images for each class. -This is considered a very small dataset to generalize on. However, since -we are using transfer learning, we should be able to generalize -reasonably well. - -*This dataset is a very small subset of imagenet.* - -.. note :: Download the data from `here `_ - and extract it to the ``data`` directory. - -.. 
code:: python - - import torch - from torchvision import transforms, datasets - - # Data augmentation and normalization for training - # Just normalization for validation - data_transforms = { - 'train': transforms.Compose([ - transforms.Resize(224), - transforms.RandomCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - 'val': transforms.Compose([ - transforms.Resize(224), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]), - } - - data_dir = 'data/hymenoptera_data' - image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), - data_transforms[x]) - for x in ['train', 'val']} - dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16, - shuffle=True, num_workers=8) - for x in ['train', 'val']} - dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} - class_names = image_datasets['train'].classes - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -Visualize a few images -~~~~~~~~~~~~~~~~~~~~~~ - -Let’s visualize a few training images so as to understand the data -augmentations. - -.. code:: python - - import torchvision - - def imshow(inp, title=None, ax=None, figsize=(5, 5)): - """Imshow for Tensor.""" - inp = inp.numpy().transpose((1, 2, 0)) - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - inp = std * inp + mean - inp = np.clip(inp, 0, 1) - if ax is None: - fig, ax = plt.subplots(1, figsize=figsize) - ax.imshow(inp) - ax.set_xticks([]) - ax.set_yticks([]) - if title is not None: - ax.set_title(title) - - # Get a batch of training data - inputs, classes = next(iter(dataloaders['train'])) - - # Make a grid from batch - out = torchvision.utils.make_grid(inputs, nrow=4) - - fig, ax = plt.subplots(1, figsize=(10, 10)) - imshow(out, title=[class_names[x] for x in classes], ax=ax) - - -Support Function for Model Training -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Below is a generic function for model training. -This function also - -- Schedules the learning rate -- Saves the best model - -.. code:: python - - def train_model(model, criterion, optimizer, scheduler, num_epochs=25, device='cpu'): - """ - Support function for model training. - - Args: - model: Model to be trained - criterion: Optimization criterion (loss) - optimizer: Optimizer to use for training - scheduler: Instance of ``torch.optim.lr_scheduler`` - num_epochs: Number of epochs - device: Device to run the training on. Must be 'cpu' or 'cuda' - """ - since = time.time() - - best_model_wts = copy.deepcopy(model.state_dict()) - best_acc = 0.0 - - for epoch in range(num_epochs): - print('Epoch {}/{}'.format(epoch, num_epochs - 1)) - print('-' * 10) - - # Each epoch has a training and validation phase - for phase in ['train', 'val']: - if phase == 'train': - model.train() # Set model to training mode - else: - model.eval() # Set model to evaluate mode - - running_loss = 0.0 - running_corrects = 0 - - # Iterate over data. 
- for inputs, labels in dataloaders[phase]: - inputs = inputs.to(device) - labels = labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward - # track history if only in train - with torch.set_grad_enabled(phase == 'train'): - outputs = model(inputs) - _, preds = torch.max(outputs, 1) - loss = criterion(outputs, labels) - - # backward + optimize only if in training phase - if phase == 'train': - loss.backward() - optimizer.step() - - # statistics - running_loss += loss.item() * inputs.size(0) - running_corrects += torch.sum(preds == labels.data) - if phase == 'train': - scheduler.step() - - epoch_loss = running_loss / dataset_sizes[phase] - epoch_acc = running_corrects.double() / dataset_sizes[phase] - - print('{} Loss: {:.4f} Acc: {:.4f}'.format( - phase, epoch_loss, epoch_acc)) - - # deep copy the model - if phase == 'val' and epoch_acc > best_acc: - best_acc = epoch_acc - best_model_wts = copy.deepcopy(model.state_dict()) - - print() - - time_elapsed = time.time() - since - print('Training complete in {:.0f}m {:.0f}s'.format( - time_elapsed // 60, time_elapsed % 60)) - print('Best val Acc: {:4f}'.format(best_acc)) - - # load best model weights - model.load_state_dict(best_model_wts) - return model - - -Support Function for Visualizing the Model Predictions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Generic function to display predictions for a few images - -.. code:: python - - def visualize_model(model, rows=3, cols=3): - was_training = model.training - model.eval() - current_row = current_col = 0 - fig, ax = plt.subplots(rows, cols, figsize=(cols*2, rows*2)) - - with torch.no_grad(): - for idx, (imgs, lbls) in enumerate(dataloaders['val']): - imgs = imgs.cpu() - lbls = lbls.cpu() - - outputs = model(imgs) - _, preds = torch.max(outputs, 1) - - for jdx in range(imgs.size()[0]): - imshow(imgs.data[jdx], ax=ax[current_row, current_col]) - ax[current_row, current_col].axis('off') - ax[current_row, current_col].set_title('predicted: {}'.format(class_names[preds[jdx]])) - - current_col += 1 - if current_col >= cols: - current_row += 1 - current_col = 0 - if current_row >= rows: - model.train(mode=was_training) - return - model.train(mode=was_training) - - -Part 1. Training a Custom Classifier based on a Quantized Feature Extractor ---------------------------------------------------------------------------- - -In this section you will use a “frozen” quantized feature extractor, and -train a custom classifier head on top of it. Unlike floating point -models, you don’t need to set requires_grad=False for the quantized -model, as it has no trainable parameters. Please, refer to the -`documentation `_ for -more details. - -Load a pretrained model: for this exercise you will be using -`ResNet-18 `_. - -.. code:: python - - import torchvision.models.quantization as models - - # You will need the number of filters in the `fc` for future use. - # Here the size of each output sample is set to 2. - # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)). - model_fe = models.resnet18(pretrained=True, progress=True, quantize=True) - num_ftrs = model_fe.fc.in_features - - -At this point you need to modify the pretrained model. The model -has the quantize/dequantize blocks in the beginning and the end. However, -because you will only use the feature extractor, the dequantization layer has -to move right before the linear layer (the head). The easiest way to do that -is to wrap the model in the ``nn.Sequential`` module. 
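As a generic illustration of this layout (a hypothetical toy module, not the tutorial's ResNet), the dequantization stub sits between the quantized feature extractor and the floating-point head:

.. code:: python

    import torch
    from torch import nn

    class ToyBackboneWithFloatHead(nn.Module):
        # Hypothetical module, shown only to illustrate quant/dequant placement
        def __init__(self):
            super().__init__()
            self.quant = torch.quantization.QuantStub()      # float -> quantized
            self.features = nn.Sequential(
                nn.Conv2d(3, 8, 3), nn.ReLU(), nn.AdaptiveAvgPool2d(1))
            self.dequant = torch.quantization.DeQuantStub()  # quantized -> float
            self.head = nn.Linear(8, 2)                      # head stays in FP32

        def forward(self, x):
            x = self.quant(x)
            x = self.features(x)
            x = self.dequant(x)  # dequantize right before the float head
            return self.head(torch.flatten(x, 1))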
- -The first step is to isolate the feature extractor in the ResNet -model. Although in this example you are tasked to use all layers except -``fc`` as the feature extractor, in reality, you can take as many parts -as you need. This would be useful in case you would like to replace some -of the convolutional layers as well. - - -.. note:: When separating the feature extractor from the rest of a quantized - model, you have to manually place the quantizer/dequantized in the - beginning and the end of the parts you want to keep quantized. - -The function below creates a model with a custom head. - -.. code:: python - - from torch import nn - - def create_combined_model(model_fe): - # Step 1. Isolate the feature extractor. - model_fe_features = nn.Sequential( - model_fe.quant, # Quantize the input - model_fe.conv1, - model_fe.bn1, - model_fe.relu, - model_fe.maxpool, - model_fe.layer1, - model_fe.layer2, - model_fe.layer3, - model_fe.layer4, - model_fe.avgpool, - model_fe.dequant, # Dequantize the output - ) - - # Step 2. Create a new "head" - new_head = nn.Sequential( - nn.Dropout(p=0.5), - nn.Linear(num_ftrs, 2), - ) - - # Step 3. Combine, and don't forget the quant stubs. - new_model = nn.Sequential( - model_fe_features, - nn.Flatten(1), - new_head, - ) - return new_model - -.. warning:: Currently the quantized models can only be run on CPU. - However, it is possible to send the non-quantized parts of the model to a GPU. - -.. code:: python - - import torch.optim as optim - new_model = create_combined_model(model_fe) - new_model = new_model.to('cpu') - - criterion = nn.CrossEntropyLoss() - - # Note that we are only training the head. - optimizer_ft = optim.SGD(new_model.parameters(), lr=0.01, momentum=0.9) - - # Decay LR by a factor of 0.1 every 7 epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1) - - -Train and evaluate -~~~~~~~~~~~~~~~~~~ - -This step takes around 15-25 min on CPU. Because the quantized model can -only run on the CPU, you cannot run the training on GPU. - -.. code:: python - - new_model = train_model(new_model, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device='cpu') - - visualize_model(new_model) - plt.tight_layout() - - -Part 2. Finetuning the Quantizable Model ----------------------------------------- - -In this part, we fine tune the feature extractor used for transfer -learning, and quantize the feature extractor. Note that in both part 1 -and 2, the feature extractor is quantized. The difference is that in -part 1, we use a pretrained quantized model. In this part, we create a -quantized feature extractor after fine tuning on the data-set of -interest, so this is a way to get better accuracy with transfer learning -while having the benefits of quantization. Note that in our specific -example, the training set is really small (120 images) so the benefits -of fine tuning the entire model is not apparent. However, the procedure -shown here will improve accuracy for transfer learning with larger -datasets. - -The pretrained feature extractor must be quantizable. -To make sure it is quantizable, perform the following steps: - - 1. Fuse ``(Conv, BN, ReLU)``, ``(Conv, BN)``, and ``(Conv, ReLU)`` using - ``torch.quantization.fuse_modules``. - 2. Connect the feature extractor with a custom head. - This requires dequantizing the output of the feature extractor. - 3. Insert fake-quantization modules at appropriate locations - in the feature extractor to mimic quantization during training. 
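A rough sketch of steps (1)–(3) on a hypothetical custom module is shown below; the tutorial itself uses the torchvision quantizable ResNet and its ``create_combined_model`` helper instead:

.. code:: python

    import torch
    from torch import nn

    class TinyConvNet(nn.Module):
        # Hypothetical module used only to illustrate the steps above
        def __init__(self):
            super().__init__()
            self.quant = torch.quantization.QuantStub()
            self.conv = nn.Conv2d(3, 8, 3)
            self.bn = nn.BatchNorm2d(8)
            self.relu = nn.ReLU()
            self.dequant = torch.quantization.DeQuantStub()

        def forward(self, x):
            return self.dequant(self.relu(self.bn(self.conv(self.quant(x)))))

    m = TinyConvNet()
    m.train()
    # Step 1: fuse (Conv, BN, ReLU). Newer PyTorch releases may require
    # torch.ao.quantization.fuse_modules_qat for models in training mode.
    m = torch.quantization.fuse_modules(m, [['conv', 'bn', 'relu']])
    # Step 2 (attaching a custom float head) is what ``create_combined_model`` handles.
    # Step 3: insert fake-quantization modules for quantization-aware training.
    m.qconfig = torch.quantization.default_qat_qconfig
    m = torch.quantization.prepare_qat(m)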
- -For step (1), we use models from ``torchvision/models/quantization``, which -have a member method ``fuse_model``. This function fuses all the ``conv``, -``bn``, and ``relu`` modules. For custom models, this would require calling -the ``torch.quantization.fuse_modules`` API with the list of modules to fuse -manually. - -Step (2) is performed by the ``create_combined_model`` function -used in the previous section. - -Step (3) is achieved by using ``torch.quantization.prepare_qat``, which -inserts fake-quantization modules. - - -As step (4), you can start "finetuning" the model, and after that convert -it to a fully quantized version (Step 5). - -To convert the fine tuned model into a quantized model you can call the -``torch.quantization.convert`` function (in our case only -the feature extractor is quantized). - -.. note:: Because of the random initialization your results might differ from - the results shown in this tutorial. - -.. code:: python - - # notice `quantize=False` - model = models.resnet18(pretrained=True, progress=True, quantize=False) - num_ftrs = model.fc.in_features - - # Step 1 - model.train() - model.fuse_model() - # Step 2 - model_ft = create_combined_model(model) - model_ft[0].qconfig = torch.quantization.default_qat_qconfig # Use default QAT configuration - # Step 3 - model_ft = torch.quantization.prepare_qat(model_ft, inplace=True) - - -Finetuning the model -~~~~~~~~~~~~~~~~~~~~ - -In the current tutorial the whole model is fine tuned. In -general, this will lead to higher accuracy. However, due to the small -training set used here, we end up overfitting to the training set. - - -Step 4. Fine tune the model - -.. code:: python - - for param in model_ft.parameters(): - param.requires_grad = True - - model_ft.to(device) # We can fine-tune on GPU if available - - criterion = nn.CrossEntropyLoss() - - # Note that we are training everything, so the learning rate is lower - # Notice the smaller learning rate - optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.1) - - # Decay LR by a factor of 0.3 every several epochs - exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.3) - - model_ft_tuned = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, - num_epochs=25, device=device) - -Step 5. Convert to quantized model - -.. code:: python - - from torch.quantization import convert - model_ft_tuned.cpu() - - model_quantized_and_trained = convert(model_ft_tuned, inplace=False) - - -Lets see how the quantized model performs on a few images - -.. code:: python - - visualize_model(model_quantized_and_trained) - - plt.ioff() - plt.tight_layout() - plt.show() diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py deleted file mode 100644 index fc29e5fa97b..00000000000 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ /dev/null @@ -1,311 +0,0 @@ -""" -(prototype) FX Graph Mode Post Training Dynamic Quantization -============================================================ - -**Author**: `Jerry Zhang `_ - -This tutorial introduces the steps to do post training dynamic quantization in graph mode based on ``torch.fx``. -We have a separate tutorial for `FX Graph Mode Post Training Static Quantization `_, -comparison between FX Graph Mode Quantization and Eager Mode Quantization can be found in the `quantization docs `_ - -tldr; The FX Graph Mode API for dynamic quantization looks like the following: - -.. 
code:: python - - import torch - from torch.ao.quantization import default_dynamic_qconfig, QConfigMapping - # Note that this is temporary, we'll expose these functions to torch.ao.quantization after official releasee - from torch.quantization.quantize_fx import prepare_fx, convert_fx - - float_model.eval() - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers - # no calibration is required for dynamic quantization - quantized_model = convert_fx(prepared_model) # convert the model to a dynamically quantized model - -In this tutorial, we’ll apply dynamic quantization to an LSTM-based next word-prediction model, -closely following the word language model from the PyTorch examples. -We will copy the code from `Dynamic Quantization on an LSTM Word Language Model `_ -and omit the descriptions. - -""" - - -################################################### -# 1. Define the Model, Download Data and Model -# -------------------------------------------- -# -# Download the `data `_ -# and unzip to data folder -# -# .. code:: -# -# mkdir data -# cd data -# wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -# unzip wikitext-2-v1.zip -# -# Download model to the data folder: -# -# .. code:: -# -# wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -# -# Define the model: - -# imports -import os -from io import open -import time -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -# Model Definition -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - return decoded, hidden - - -def init_hidden(lstm_model, bsz): - # get the weight tensor and create hidden layer in the same device - weight = lstm_model.encoder.weight - # get weight from quantized model - if not isinstance(weight, torch.Tensor): - weight = weight() - device = weight.device - nlayers = lstm_model.rnn.num_layers - nhid = lstm_model.rnn.hidden_size - return (torch.zeros(nlayers, bsz, nhid, device=device), - torch.zeros(nlayers, bsz, nhid, device=device)) - - -# Load Text Data -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens')) - self.valid = 
self.tokenize(os.path.join(path, 'wiki.valid.tokens')) - self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - ids = torch.cat(idss) - - return ids - -model_data_filepath = 'data/' - -corpus = Corpus(model_data_filepath + 'wikitext-2') - -ntokens = len(corpus.dictionary) - -# Load Pretrained Model -model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -model.load_state_dict( - torch.load( - model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu'), - weights_only=True - ) - ) - -model.eval() -print(model) - -bptt = 25 -criterion = nn.CrossEntropyLoss() -eval_batch_size = 1 - -# create test data set -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into bsz parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the bsz batches. - return data.view(bsz, -1).t().contiguous() - -test_data = batchify(corpus.test, eval_batch_size) -example_inputs = (next(iter(test_data))[0]) - -# Evaluation functions -def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].reshape(-1) - return data, target - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - -def evaluate(model_, data_source): - # Turn on evaluation mode which disables dropout. - model_.eval() - total_loss = 0. - hidden = init_hidden(model_, eval_batch_size) - with torch.no_grad(): - for i in range(0, data_source.size(0) - 1, bptt): - data, targets = get_batch(data_source, i) - output, hidden = model_(data, hidden) - hidden = repackage_hidden(hidden) - output_flat = output.view(-1, ntokens) - total_loss += len(data) * criterion(output_flat, targets).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# 2. Post Training Dynamic Quantization -# ------------------------------------- -# Now we can dynamically quantize the model. -# We can use the same function as post training static quantization but with a dynamic qconfig. 
-
-from torch.quantization.quantize_fx import prepare_fx, convert_fx
-from torch.ao.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig, QConfigMapping
-
-# Full docs for the supported qconfigs for floating point modules/ops can be found in the `quantization docs `_
-# Full docs for `QConfigMapping `_
-qconfig_mapping = (QConfigMapping()
-    .set_object_type(nn.Embedding, float_qparams_weight_only_qconfig)
-    .set_object_type(nn.LSTM, default_dynamic_qconfig)
-    .set_object_type(nn.Linear, default_dynamic_qconfig)
-)
-# Load the model again to create a fresh copy, because the quantization API changes the model
-# in place and we want to keep the original model around for later comparison.
-
-
-model_to_quantize = LSTMModel(
-    ntoken = ntokens,
-    ninp = 512,
-    nhid = 256,
-    nlayers = 5,
-)
-
-model_to_quantize.load_state_dict(
-    torch.load(
-        model_data_filepath + 'word_language_model_quantize.pth',
-        map_location=torch.device('cpu'),
-        weights_only=True
-    )
-)
-
-model_to_quantize.eval()
-
-
-prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
-print("prepared model:", prepared_model)
-quantized_model = convert_fx(prepared_model)
-print("quantized model", quantized_model)
-
-
-######################################################################
-# For dynamically quantized objects, ``prepare_fx`` does not change the modules;
-# it only inserts observers for the weights of dynamically quantizable functionals and torch ops.
-# We also fuse modules like Conv + Bn and Linear + ReLU.
-#
-# In ``convert_fx`` we convert the float modules to dynamically quantized modules and
-# convert float ops to dynamically quantized ops. We can see that in the example model
-# ``nn.Embedding``, ``nn.Linear`` and ``nn.LSTM`` are dynamically quantized.
-#
-# Now we can compare the size and runtime of the quantized model.
-
-def print_size_of_model(model):
-    torch.save(model.state_dict(), "temp.p")
-    print('Size (MB):', os.path.getsize("temp.p")/1e6)
-    os.remove('temp.p')
-
-print_size_of_model(model)
-print_size_of_model(quantized_model)
-
-######################################################################
-# There is a 4x size reduction because we quantized all the weights
-# in the model (nn.Embedding, nn.Linear and nn.LSTM) from float (4 bytes) to quantized int (1 byte).
-
-torch.set_num_threads(1)
-
-def time_model_evaluation(model, test_data):
-    s = time.time()
-    loss = evaluate(model, test_data)
-    elapsed = time.time() - s
-    print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed))
-
-time_model_evaluation(model, test_data)
-time_model_evaluation(quantized_model, test_data)
-
-#####################################################################
-# There is a roughly 2x speedup for this model. Also note that the speedup
-# may vary depending on model, device, build, input batch sizes, threading and so on.
-#
-# 3. Conclusion
-# -------------
-# This tutorial introduces the API for post training dynamic quantization in FX Graph Mode,
-# which dynamically quantizes the same modules as Eager Mode Quantization. For reference,
-# a short eager mode snippet is included below. 
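-######################################################################
-# For reference, here is a minimal sketch of the corresponding eager mode call, applied to
-# the original float ``model`` that we kept for comparison. It dynamically quantizes the same
-# ``nn.LSTM`` and ``nn.Linear`` modules (``nn.Embedding`` would additionally need a weight-only
-# qconfig in eager mode, so it is left out here). The variable name ``eager_quantized_model``
-# is only illustrative and not part of the original tutorial.
-
-from torch.quantization import quantize_dynamic
-
-# dynamically quantize the LSTM and Linear modules of the float model
-eager_quantized_model = quantize_dynamic(
-    model, {nn.LSTM, nn.Linear}, dtype=torch.qint8
-)
-# the on-disk size should be comparable to the FX graph mode quantized model above
-print_size_of_model(eager_quantized_model)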
diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst deleted file mode 100644 index da16d04dbce..00000000000 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ /dev/null @@ -1,411 +0,0 @@ -(prototype) FX Graph Mode Post Training Static Quantization -=========================================================== -**Author**: `Jerry Zhang `_ **Edited by**: `Charles Hernandez `_ - -This tutorial introduces the steps to do post training static quantization in graph mode based on -`torch.fx `_. -The advantage of FX graph mode quantization is that we can perform quantization fully automatically on the model. -Although there might be some effort required to make the model compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``), -we'll have a separate tutorial to show how to make the part of the model we want to quantize compatible with FX Graph Mode Quantization. -We also have a tutorial for `FX Graph Mode Post Training Dynamic Quantization `_. -tldr; The FX Graph Mode API looks like the following: - -.. code:: python - - import torch - from torch.ao.quantization import get_default_qconfig - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx - from torch.ao.quantization import QConfigMapping - float_model.eval() - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - example_inputs = (next(iter(data_loader))[0]) # get an example input - prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers - calibrate(prepared_model, data_loader_test) # run calibration on sample data - quantized_model = convert_fx(prepared_model) # convert the calibrated model to a quantized model - - - -1. Motivation of FX Graph Mode Quantization -------------------------------------------- - -Currently, PyTorch only has eager mode quantization as an alternative: `Static Quantization with Eager Mode in PyTorch `_. - -We can see there are multiple manual steps involved in the eager mode quantization process, including: - -- Explicitly quantize and dequantize activations-this is time consuming when floating point and quantized operations are mixed in a model. -- Explicitly fuse modules-this requires manually identifying the sequence of convolutions, batch norms and relus and other fusion patterns. -- Special handling is needed for pytorch tensor operations (like add, concat etc.) -- Functionals did not have first class support (functional.conv2d and functional.linear would not get quantized) - -Most of these required modifications comes from the underlying limitations of eager mode quantization. Eager mode works in module level since it can not inspect the code that is actually run (in the forward function), quantization is achieved by module swapping, and we don’t know how the modules are used in forward function in eager mode, so it requires users to insert QuantStub and DeQuantStub manually to mark the points they want to quantize or dequantize. -In graph mode, we can inspect the actual code that’s been executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. 
Since graph mode has full visibility of the code that is run, our tool is able to automatically figure out things like which modules to fuse and where to insert observer calls, quantize/dequantize functions etc., we are able to automate the whole quantization process. - -Advantages of FX Graph Mode Quantization are: - -- Simple quantization flow, minimal manual steps -- Unlocks the possibility of doing higher level optimizations like automatic precision selection - -2. Define Helper Functions and Prepare Dataset ----------------------------------------------- - -We’ll start by doing the necessary imports, defining some helper functions and prepare the data. -These steps are identitcal to `Static Quantization with Eager Mode in PyTorch `_. - -To run the code in this tutorial using the entire ImageNet dataset, first download imagenet by following the instructions at here `ImageNet Data `_. Unzip the downloaded file into the 'data_path' folder. - -Download the `torchvision resnet18 model `_ and rename it to -``data/resnet18_pretrained_float.pth``. - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - from torch.ao.quantization import get_default_qconfig, QConfigMapping - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - - def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - - def evaluate(model, criterion, data_loader): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file, weights_only=True) - model.load_state_dict(state_dict) - model.to("cpu") - return model - - def print_size_of_model(model): - if isinstance(model, 
torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 30 - eval_batch_size = 50 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cpu") - float_model.eval() - - # create another instance of the model since - # we need to keep the original model around - model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu") - -3. Set model to eval mode -------------------------- -For post training quantization, we'll need to set model to eval mode. - -.. code:: python - - model_to_quantize.eval() - - -4. Specify how to quantize the model with ``QConfigMapping`` ------------------------------------------------------------- - -.. code:: python - - qconfig_mapping = QConfigMapping.set_global(default_qconfig) - -We use the same qconfig used in eager mode quantization, ``qconfig`` is just a named tuple -of the observers for activation and weight. ``QConfigMapping`` contains mapping information from ops to qconfigs: - -.. code:: python - - qconfig_mapping = (QConfigMapping() - .set_global(qconfig_opt) # qconfig_opt is an optional qconfig, either a valid qconfig or None - .set_object_type(torch.nn.Conv2d, qconfig_opt) # can be a callable... - .set_object_type("reshape", qconfig_opt) # ...or a string of the method - .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig_opt) # matched in order, first match takes precedence - .set_module_name("foo.bar", qconfig_opt) - .set_module_name_object_type_order() - ) - # priority (in increasing order): global, object_type, module_name_regex, module_name - # qconfig == None means fusion and quantization should be skipped for anything - # matching the rule (unless a higher priority match is found) - - -Utility functions related to ``qconfig`` can be found in the `qconfig `_ file -while those for ``QConfigMapping`` can be found in the `qconfig_mapping ` - -.. code:: python - - # The old 'fbgemm' is still available but 'x86' is the recommended default. - qconfig = get_default_qconfig("x86") - qconfig_mapping = QConfigMapping().set_global(qconfig) - -5. 
Prepare the Model for Post Training Static Quantization ----------------------------------------------------------- - -.. code:: python - - prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs) - -prepare_fx folds BatchNorm modules into previous Conv2d modules, and insert observers -in appropriate places in the model. - -.. code:: python - - prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs) - print(prepared_model.graph) - -6. Calibration --------------- -Calibration function is run after the observers are inserted in the model. -The purpose for calibration is to run through some sample examples that is representative of the workload -(for example a sample of the training data set) so that the observers in the model are able to observe -the statistics of the Tensors and we can later use this information to calculate quantization parameters. - -.. code:: python - - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - calibrate(prepared_model, data_loader_test) # run calibration on sample data - -7. Convert the Model to a Quantized Model ------------------------------------------ -``convert_fx`` takes a calibrated model and produces a quantized model. - -.. code:: python - - quantized_model = convert_fx(prepared_model) - print(quantized_model) - -8. Evaluation -------------- -We can now print the size and accuracy of the quantized model. - -.. code:: python - - print("Size of model before quantization") - print_size_of_model(float_model) - print("Size of model after quantization") - print_size_of_model(quantized_model) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test) - print("[before serilaization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - fx_graph_mode_model_file_path = saved_model_dir + "resnet18_fx_graph_mode_quantized.pth" - - # this does not run due to some erros loading convrelu module: - # ModuleAttributeError: 'ConvReLU2d' object has no attribute '_modules' - # save the whole model directly - # torch.save(quantized_model, fx_graph_mode_model_file_path) - # loaded_quantized_model = torch.load(fx_graph_mode_model_file_path, weights_only=False) - - # save with state_dict - # torch.save(quantized_model.state_dict(), fx_graph_mode_model_file_path) - # import copy - # model_to_quantize = copy.deepcopy(float_model) - # prepared_model = prepare_fx(model_to_quantize, {"": qconfig}) - # loaded_quantized_model = convert_fx(prepared_model) - # loaded_quantized_model.load_state_dict(torch.load(fx_graph_mode_model_file_path), weights_only=True) - - # save with script - torch.jit.save(torch.jit.script(quantized_model), fx_graph_mode_model_file_path) - loaded_quantized_model = torch.jit.load(fx_graph_mode_model_file_path) - - top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test) - print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - -If you want to get better accuracy or performance, try changing the `qconfig_mapping`. -We plan to add support for graph mode in the Numerical Suite so that you can -easily determine the sensitivity towards quantization of different modules in a model. For more information, see `PyTorch Numeric Suite Tutorial `_ - -9. 
Debugging Quantized Model
----------------------------
-We can also print the weights of the quantized and non-quantized convolution ops to see the difference.
-We first call ``fuse_fx`` explicitly to fuse the convolution and batch norm in the float model,
-so that the comparison is like-for-like.
-Note that ``fuse_fx`` only works in eval mode.
-
-.. code:: python
-
-  fused = fuse_fx(float_model)
-
-  conv1_weight_after_fuse = fused.conv1[0].weight[0]
-  conv1_weight_after_quant = quantized_model.conv1.weight().dequantize()[0]
-
-  print(torch.max(abs(conv1_weight_after_fuse - conv1_weight_after_quant)))
-
-10. Comparison with Baseline Float Model and Eager Mode Quantization
---------------------------------------------------------------------
-
-.. code:: python
-
-  scripted_float_model_file = "resnet18_scripted.pth"
-
-  print("Size of baseline model")
-  print_size_of_model(float_model)
-
-  top1, top5 = evaluate(float_model, criterion, data_loader_test)
-  print("Baseline Float Model Evaluation accuracy: %2.2f, %2.2f"%(top1.avg, top5.avg))
-  torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file)
-
-In this section, we compare the model quantized with FX graph mode quantization with the model
-quantized in eager mode. FX graph mode and eager mode produce very similar quantized models,
-so the expectation is that the accuracy and speedup are similar as well.
-
-.. code:: python
-
-  print("Size of Fx graph mode quantized model")
-  print_size_of_model(quantized_model)
-  top1, top5 = evaluate(quantized_model, criterion, data_loader_test)
-  print("FX graph mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg))
-
-  from torchvision.models.quantization.resnet import resnet18
-  eager_quantized_model = resnet18(pretrained=True, quantize=True).eval()
-  print("Size of eager mode quantized model")
-  eager_quantized_model = torch.jit.script(eager_quantized_model)
-  print_size_of_model(eager_quantized_model)
-  top1, top5 = evaluate(eager_quantized_model, criterion, data_loader_test)
-  print("eager mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg))
-  eager_mode_model_file = "resnet18_eager_mode_quantized.pth"
-  torch.jit.save(eager_quantized_model, saved_model_dir + eager_mode_model_file)
-
-We can see that the model size and accuracy of the FX graph mode and eager mode quantized models are very similar.
-
-Running the model in AIBench (with single threading) gives the following result:
-
-.. code::
-
-  Scripted Float Model:
-  Self CPU time total: 192.48ms
-
-  Scripted Eager Mode Quantized Model:
-  Self CPU time total: 50.76ms
-
-  Scripted FX Graph Mode Quantized Model:
-  Self CPU time total: 50.63ms
-
-As we can see, for resnet18 both the FX graph mode and eager mode quantized models get a similar speedup
-over the floating point model, around 2-4x. The actual speedup may vary
-depending on model, device, build, input batch sizes, threading and so on. diff --git a/prototype_source/fx_graph_mode_quant_guide.rst b/prototype_source/fx_graph_mode_quant_guide.rst deleted file mode 100644 index 4ae8496ed52..00000000000 --- a/prototype_source/fx_graph_mode_quant_guide.rst +++ /dev/null @@ -1,324 +0,0 @@ -(prototype) FX Graph Mode Quantization User Guide
-===========================================================
-
-**Author**: `Jerry Zhang `_
-
-FX Graph Mode Quantization requires a symbolically traceable model. 
-We use the FX framework to convert a symbolically traceable nn.Module instance to IR, -and we operate on the IR to execute the quantization passes. -Please post your question about symbolically tracing your model in `PyTorch Discussion Forum `_ - -Quantization will only work on the symbolically traceable parts of your model. -The data dependent control flow-if statements / for loops, and so on using symbolically traced values-are one common pattern which is not supported. -If your model is not symbolically traceable end to end, you have a couple of options to enable FX Graph Mode Quantization only on a part of the model. -You can use any combination of these options: - -1. Non traceable code doesn’t need to be quantized - a. Symbolically trace only the code that needs to be quantized - b. Skip symbolic tracing the non-traceable code - -2. Non traceable code needs to be quantized - a. Refactor your code to make it symbolically traceable - b. Write your own observed and quantized submodule - - -If the code that is not symbolically traceable does not need to be quantized, we have the following two options -to run FX Graph Mode Quantization: - - -Symbolically trace only the code that needs to be quantized ------------------------------------------------------------------ -When the whole model is not symbolically traceable but the submodule we want to quantize is -symbolically traceable, we can run quantization only on that submodule. - -before: - -.. code:: python - - class M(nn.Module): - def forward(self, x): - x = non_traceable_code_1(x) - x = traceable_code(x) - x = non_traceable_code_2(x) - return x - -after: - -.. code:: python - - class FP32Traceable(nn.Module): - def forward(self, x): - x = traceable_code(x) - return x - - class M(nn.Module): - def __init__(self): - self.traceable_submodule = FP32Traceable(...) - def forward(self, x): - x = self.traceable_code_1(x) - # We'll only symbolic trace/quantize this submodule - x = self.traceable_submodule(x) - x = self.traceable_code_2(x) - return x - -quantization code: - -.. code:: python - - qconfig_mapping = QConfigMapping().set_global(qconfig) - model_fp32.traceable_submodule = \ - prepare_fx(model_fp32.traceable_submodule, qconfig_mapping, example_inputs) - -Note if original model needs to be preserved, you will have to -copy it yourself before calling the quantization APIs. - - -Skip symbolically trace the non-traceable code ---------------------------------------------------- -When we have some non-traceable code in the module, and this part of code doesn’t need to be quantized, -we can factor out this part of the code into a submodule and skip symbolically trace that submodule. - - -before - -.. code:: python - - class M(nn.Module): - - def forward(self, x): - x = self.traceable_code_1(x) - x = non_traceable_code(x) - x = self.traceable_code_2(x) - return x - - -after, non-traceable parts moved to a module and marked as a leaf - -.. code:: python - - class FP32NonTraceable(nn.Module): - - def forward(self, x): - x = non_traceable_code(x) - return x - - class M(nn.Module): - - def __init__(self): - ... - self.non_traceable_submodule = FP32NonTraceable(...) - - def forward(self, x): - x = self.traceable_code_1(x) - # we will configure the quantization call to not trace through - # this submodule - x = self.non_traceable_submodule(x) - x = self.traceable_code_2(x) - return x - -quantization code: - -.. 
code:: python - - qconfig_mapping = QConfigMapping.set_global(qconfig) - - prepare_custom_config_dict = { - # option 1 - "non_traceable_module_name": "non_traceable_submodule", - # option 2 - "non_traceable_module_class": [MNonTraceable], - } - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict, - ) - -If the code that is not symbolically traceable needs to be quantized, we have the following two options: - -Refactor your code to make it symbolically traceable --------------------------------------------------------- -If it is easy to refactor the code and make the code symbolically traceable, -we can refactor the code and remove the use of non-traceable constructs in python. - -More information about symbolic tracing support can be found `here `_. - -before: - -.. code:: python - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - -This is not symbolically traceable because in x.view(*new_x_shape) -unpacking is not supported, however, it is easy to remove the unpacking -since x.view also supports list input. - - -after: - -.. code:: python - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - -This can be combined with other approaches and the quantization code -depends on the model. - -Write your own observed and quantized submodule ------------------------------------------------------ - -If the non-traceable code can’t be refactored to be symbolically traceable, -for example it has some loops that can’t be eliminated, like nn.LSTM, -we’ll need to factor out the non-traceable code to a submodule (we call it CustomModule in fx graph mode quantization) and -define the observed and quantized version of the submodule (in post training static quantization or quantization aware training for static quantization) -or define the quantized version (in post training dynamic and weight only quantization) - - -before: - -.. code:: python - - class M(nn.Module): - - def forward(self, x): - x = traceable_code_1(x) - x = non_traceable_code(x) - x = traceable_code_1(x) - return x - -after: - -1. Factor out non_traceable_code to FP32NonTraceable -non-traceable logic, wrapped in a module - -.. code:: python - - class FP32NonTraceable: - ... - -2. Define observed version of -FP32NonTraceable - -.. code:: python - - class ObservedNonTraceable: - - @classmethod - def from_float(cls, ...): - ... - -3. Define statically quantized version of FP32NonTraceable -and a class method "from_observed" to convert from ObservedNonTraceable -to StaticQuantNonTraceable - -.. code:: python - - class StaticQuantNonTraceable: - - @classmethod - def from_observed(cls, ...): - ... - - -.. code:: python - - # refactor parent class to call FP32NonTraceable - class M(nn.Module): - - def __init__(self): - ... - self.non_traceable_submodule = FP32NonTraceable(...) - - def forward(self, x): - x = self.traceable_code_1(x) - # this part will be quantized manually - x = self.non_traceable_submodule(x) - x = self.traceable_code_1(x) - return x - - -quantization code: - - -.. 
code:: python - - # post training static quantization or - # quantization aware training (that produces a statically quantized module)v - prepare_custom_config_dict = { - "float_to_observed_custom_module_class": { - "static": { - FP32NonTraceable: ObservedNonTraceable, - } - }, - } - - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict) - -calibrate / train (not shown) - -.. code:: python - - convert_custom_config_dict = { - "observed_to_quantized_custom_module_class": { - "static": { - ObservedNonTraceable: StaticQuantNonTraceable, - } - }, - } - model_quantized = convert_fx( - model_prepared, - convert_custom_config_dict) - -post training dynamic/weight only quantization -in these two modes we don't need to observe the original model, so we -only need to define thee quantized model - -.. code:: python - - class DynamicQuantNonTraceable: # or WeightOnlyQuantMNonTraceable - ... - @classmethod - def from_observed(cls, ...): - ... - - prepare_custom_config_dict = { - "non_traceable_module_class": [ - FP32NonTraceable - ] - } - - -.. code:: python - - # The example is for post training quantization - model_fp32.eval() - model_prepared = prepare_fx( - model_fp32, - qconfig_mapping, - example_inputs, - prepare_custom_config_dict=prepare_custom_config_dict) - - convert_custom_config_dict = { - "observed_to_quantized_custom_module_class": { - "dynamic": { - FP32NonTraceable: DynamicQuantNonTraceable, - } - }, - } - model_quantized = convert_fx( - model_prepared, - convert_custom_config_dict) - -You can also find examples for custom modules in test ``test_custom_module_class`` in ``torch/test/quantization/test_quantize_fx.py``. diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py deleted file mode 100644 index a630d27e6a6..00000000000 --- a/prototype_source/numeric_suite_tutorial.py +++ /dev/null @@ -1,420 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch Numeric Suite Tutorial -============================== - -Introduction ------------- - -Quantization is good when it works, but it’s difficult to know what's wrong when it doesn't satisfy the accuracy we expect. Debugging the accuracy issue of quantization is not easy and time consuming. - -One important step of debugging is to measure the statistics of the float model and its corresponding quantized model to know where are they differ most. We built a suite of numeric tools called PyTorch Numeric Suite in PyTorch quantization to enable the measurement of the statistics between quantized module and float module to support quantization debugging efforts. Even for the quantized model with good accuracy, PyTorch Numeric Suite can still be used as the profiling tool to better understand the quantization error within the model and provide the guidance for further optimization. - -PyTorch Numeric Suite currently supports models quantized through both static quantization and dynamic quantization with unified APIs. - -In this tutorial we will first use ResNet18 as an example to show how to use PyTorch Numeric Suite to measure the statistics between static quantized model and float model in eager mode. Then we will use LSTM based sequence model as an example to show the usage of PyTorch Numeric Suite for dynamic quantized model. 
- -Numeric Suite for Static Quantization -------------------------------------- - -Setup -^^^^^^ -We’ll start by doing the necessary imports: -""" - -############################################################################## - -import numpy as np -import torch -import torch.nn as nn -import torchvision -from torchvision import models, datasets -import torchvision.transforms as transforms -import os -import torch.quantization -import torch.quantization._numeric_suite as ns -from torch.quantization import ( - default_eval_fn, - default_qconfig, - quantize, -) - -############################################################################## -# Then we load the pretrained float ResNet18 model, and quantize it into qmodel. We cannot compare two arbitrary models, only a float model and the quantized model derived from it can be compared. - - -float_model = torchvision.models.quantization.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1, quantize=False) -float_model.to('cpu') -float_model.eval() -float_model.fuse_model() -float_model.qconfig = torch.quantization.default_qconfig -img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) - -############################################################################## -# 1. Compare the weights of float and quantized models -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# The first thing we usually want to compare are the weights of quantized model and float model. -# We can call ``compare_weights()`` from PyTorch Numeric Suite to get a dictionary ``wt_compare_dict`` with key corresponding to module names and each entry is a dictionary with two keys 'float' and 'quantized', containing the float and quantized weights. -# ``compare_weights()`` takes in floating point and quantized state dict and returns a dict, with keys corresponding to the -# floating point weights and values being a dictionary of floating point and quantized weights - -wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict()) - -print('keys of wt_compare_dict:') -print(wt_compare_dict.keys()) - -print("\nkeys of wt_compare_dict entry for conv1's weight:") -print(wt_compare_dict['conv1.weight'].keys()) -print(wt_compare_dict['conv1.weight']['float'].shape) -print(wt_compare_dict['conv1.weight']['quantized'].shape) - - -############################################################################## -# Once get ``wt_compare_dict``, users can process this dictionary in whatever way they want. Here as an example we compute the quantization error of the weights of float and quantized models as following. -# Compute the Signal-to-Quantization-Noise Ratio (SQNR) of the quantized tensor ``y``. The SQNR reflects the -# relationship between the maximum nominal signal strength and the quantization error introduced in the -# quantization. Higher SQNR corresponds to lower quantization error. - -def compute_error(x, y): - Ps = torch.norm(x) - Pn = torch.norm(x-y) - return 20*torch.log10(Ps/Pn) - -for key in wt_compare_dict: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize())) - -############################################################################## -# As another example ``wt_compare_dict`` can also be used to plot the histogram of the weights of floating point and quantized models. 
- -import matplotlib.pyplot as plt - -f = wt_compare_dict['conv1.weight']['float'].flatten() -plt.hist(f, bins = 100) -plt.title("Floating point model weights of conv1") -plt.show() - -q = wt_compare_dict['conv1.weight']['quantized'].flatten().dequantize() -plt.hist(q, bins = 100) -plt.title("Quantized model weights of conv1") -plt.show() - - - -############################################################################## -# -# 2. Compare float point and quantized models at corresponding locations -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# The second tool allows for comparison of weights and activations between float and quantized models at corresponding locations for the same input as shown in the figure below. Red arrows indicate the locations of the comparison. -# -# .. figure:: /_static/img/compare_output.png -# -# We call ``compare_model_outputs()`` from PyTorch Numeric Suite to get the activations in float model and quantized model at corresponding locations for the given input data. This API returns a dict with module names being keys. Each entry is itself a dict with two keys 'float' and 'quantized' containing the activations. -data = img_data[0][0] - -# Take in floating point and quantized model as well as input data, and returns a dict, with keys -# corresponding to the quantized module names and each entry being a dictionary with two keys 'float' and -# 'quantized', containing the activations of floating point and quantized model at matching locations. -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data) - -print('keys of act_compare_dict:') -print(act_compare_dict.keys()) - -print("\nkeys of act_compare_dict entry for conv1's output:") -print(act_compare_dict['conv1.stats'].keys()) -print(act_compare_dict['conv1.stats']['float'][0].shape) -print(act_compare_dict['conv1.stats']['quantized'][0].shape) - -############################################################################## -# This dict can be used to compare and compute the quantization error of the activations of float and quantized models as following. -for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize())) - -############################################################################## -# If we want to do the comparison for more than one input data, we can do the following. -# Prepare the model by attaching the logger to both floating point module and quantized -# module if they are in the ``white_list``. Default logger is ``OutputLogger``, and default white_list -# is ``DEFAULT_NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_WHITE_LIST`` -ns.prepare_model_outputs(float_model, qmodel) - -for data in img_data: - float_model(data[0]) - qmodel(data[0]) - -# Find the matching activation between floating point and quantized modules, and return a dict with key -# corresponding to quantized module names and each entry being a dictionary with two keys 'float' -# and 'quantized', containing the matching floating point and quantized activations logged by the logger -act_compare_dict = ns.get_matching_activations(float_model, qmodel) - - -############################################################################## -# The default logger used in above APIs is ``OutputLogger``, which is used to log the outputs of the modules. We can inherit from base ``Logger`` class and create our own logger to perform different functionalities. For example we can make a new ``MyOutputLogger`` class as below. 
- -class MyOutputLogger(ns.Logger): - r"""Customized logger class - """ - - def __init__(self): - super(MyOutputLogger, self).__init__() - - def forward(self, x): - # Custom functionalities - # ... - return x - -############################################################################## -# And then we can pass this logger into above APIs such as: - -data = img_data[0][0] -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data, logger_cls=MyOutputLogger) - -############################################################################## -# or: - -ns.prepare_model_outputs(float_model, qmodel, MyOutputLogger) -for data in img_data: - float_model(data[0]) - qmodel(data[0]) -act_compare_dict = ns.get_matching_activations(float_model, qmodel) - - - -############################################################################## -# -# 3. Compare a module in a quantized model with its float point equivalent, with the same input data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# The third tool allows for comparing a quantized module in a model with its float point counterpart, feeding both of them the same input and comparing their outputs as shown below. -# -# .. figure:: /_static/img/compare_stub.png -# -# In practice we call prepare_model_with_stubs() to swap the quantized module that we want to compare with the Shadow module, which is illustrated as below: -# -# .. figure:: /_static/img/shadow.png -# -# The Shadow module takes quantized module, float module and logger as input, and creates a forward path inside to make the float module to shadow quantized module sharing the same input tensor. -# -# The logger can be customizable, default logger is ``ShadowLogger`` and it will save the outputs of the quantized module and float module that can be used to compute the module level quantization error. -# -# Notice before each call of ``compare_model_outputs()`` and ``compare_model_stub()`` we need to have clean float and quantized model. This is because ``compare_model_outputs()`` and ``compare_model_stub()`` modify float and quantized model inplace, and it will cause unexpected results if call one right after another. - -float_model = torchvision.models.quantization.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1, quantize=False) -float_model.to('cpu') -float_model.eval() -float_model.fuse_model() -float_model.qconfig = torch.quantization.default_qconfig -img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) - -############################################################################## -# In the following example we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare ``QuantizableBasicBlock`` module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. - -data = img_data[0][0] -module_swap_list = [torchvision.models.quantization.resnet.QuantizableBasicBlock] - -# Takes in floating point and quantized model as well as input data, and returns a dict with key -# corresponding to module names and each entry being a dictionary with two keys 'float' and -# 'quantized', containing the output tensors of quantized module and its matching floating point shadow module. 
-ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data) - -print('keys of ob_dict:') -print(ob_dict.keys()) - -print("\nkeys of ob_dict entry for layer1.0's output:") -print(ob_dict['layer1.0.stats'].keys()) -print(ob_dict['layer1.0.stats']['float'][0].shape) -print(ob_dict['layer1.0.stats']['quantized'][0].shape) - -############################################################################## -# This dict can be then used to compare and compute the module level quantization error. - -for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0].dequantize())) - -############################################################################## -# If we want to do the comparison for more than one input data, we can do the following. - -ns.prepare_model_with_stubs(float_model, qmodel, module_swap_list, ns.ShadowLogger) -for data in img_data: - qmodel(data[0]) -ob_dict = ns.get_logger_dict(qmodel) - -############################################################################## -# The default logger used in above APIs is ``ShadowLogger``, which is used to log the outputs of the quantized module and its matching float shadow module. We can inherit from base ``Logger`` class and create our own logger to perform different functionalities. For example we can make a new ``MyShadowLogger`` class as below. - -class MyShadowLogger(ns.Logger): - r"""Customized logger class - """ - - def __init__(self): - super(MyShadowLogger, self).__init__() - - def forward(self, x, y): - # Custom functionalities - # ... - return x - -############################################################################## -# And then we can pass this logger into above APIs such as: - -data = img_data[0][0] -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data, logger_cls=MyShadowLogger) - -############################################################################## -# or: - -ns.prepare_model_with_stubs(float_model, qmodel, module_swap_list, MyShadowLogger) -for data in img_data: - qmodel(data[0]) -ob_dict = ns.get_logger_dict(qmodel) - -############################################################################### -# Numeric Suite for Dynamic Quantization -# -------------------------------------- -# -# Numeric Suite APIs are designed in such as way that they work for both dynamic quantized model and static quantized model. We will use a model with both LSTM and Linear modules to demonstrate the usage of Numeric Suite on dynamic quantized model. This model is the same one used in the tutorial of dynamic quantization on LSTM word language model [1]. -# - -################################# -# Setup -# ^^^^^^ -# First we define the model as below. Notice that within this model only ``nn.LSTM`` and ``nn.Linear`` modules will be quantized dynamically and ``nn.Embedding`` will remain as floating point module after quantization. 
- -class LSTMModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): - super(LSTMModel, self).__init__() - self.encoder = nn.Embedding(ntoken, ninp) - self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - self.init_weights() - - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, input, hidden): - emb = self.encoder(input) - output, hidden = self.rnn(emb, hidden) - decoded = self.decoder(output) - return decoded, hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - -############################################################################## -# Then we create the ``float_model`` and quantize it into qmodel. - -ntokens = 10 - -float_model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) - -float_model.eval() - -qmodel = torch.quantization.quantize_dynamic( - float_model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -############################################################################## -# -# 1. Compare the weights of float and quantized models -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We first call ``compare_weights()`` from PyTorch Numeric Suite to get a dictionary ``wt_compare_dict`` with key corresponding to module names and each entry is a dictionary with two keys 'float' and 'quantized', containing the float and quantized weights. - -wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict()) - -############################################################################## -# Once we get ``wt_compare_dict``, it can be used to compare and compute the quantization error of the weights of float and quantized models as following. - -for key in wt_compare_dict: - if wt_compare_dict[key]['quantized'].is_quantized: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize())) - else: - print(key, compute_error(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'])) - -############################################################################## -# -# The Inf value in ``encoder.weight`` entry above is because encoder module is not quantized and the weights are the same in both floating point and quantized models. -# -# 2. Compare float point and quantized models at corresponding locations -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Then we call ``compare_model_outputs()`` from PyTorch Numeric Suite to get the activations in float model and quantized model at corresponding locations for the given input data. This API returns a dict with module names being keys. Each entry is itself a dict with two keys 'float' and 'quantized' containing the activations. Notice that this sequence model has two inputs, and we can pass both inputs into ``compare_model_outputs()`` and ``compare_model_stub()``. 
- - -input_ = torch.randint(ntokens, (1, 1), dtype=torch.long) -hidden = float_model.init_hidden(1) - -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, input_, hidden) -print(act_compare_dict.keys()) - -############################################################################## -# This dict can be used to compare and compute the quantization error of the activations of float and quantized models as following. The LSTM module in this model has two outputs, in this example we compute the error of the first output. - - -for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'][0][0], act_compare_dict[key]['quantized'][0][0])) - -############################################################################## -# -# 3. Compare a module in a quantized model with its float point equivalent, with the same input data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Next we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare LSTM and Linear module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. -# -# We reset the model first. - - -float_model = LSTMModel( - ntoken = ntokens, - ninp = 512, - nhid = 256, - nlayers = 5, -) -float_model.eval() - -qmodel = torch.quantization.quantize_dynamic( - float_model, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -############################################################################## -# Next we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare LSTM and Linear module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. - -module_swap_list = [nn.Linear, nn.LSTM] -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, input_, hidden) -print(ob_dict.keys()) - -############################################################################## -# This dict can be then used to compare and compute the module level quantization error. - -for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0])) - -############################################################################## -# SQNR of 40 dB is high and this is a situation where we have very good numerical alignment between the floating point and quantized model. -# -# Conclusion -# ---------- -# In this tutorial, we demonstrated how to use PyTorch Numeric Suite to measure and compare the statistics between quantized model and float model in eager mode with unified APIs for both static quantization and dynamic quantization. -# -# Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. -# -# References -# ---------- -# [1] `DYNAMIC QUANTIZATION ON AN LSTM WORD LANGUAGE MODEL `_. 
diff --git a/prototype_source/pt2e_quant_ptq.rst b/prototype_source/pt2e_quant_ptq.rst deleted file mode 100644 index 4873bce7d55..00000000000 --- a/prototype_source/pt2e_quant_ptq.rst +++ /dev/null @@ -1,602 +0,0 @@ -(prototype) PyTorch 2 Export Post Training Quantization -================================================================ -**Author**: `Jerry Zhang `_ - -This tutorial introduces the steps to do post training static quantization in -graph mode based on -`torch._export.export `_. Compared -to `FX Graph Mode Quantization `_, -this flow is expected to have significantly higher model coverage -(`88% on 14K models `_), -better programmability, and a simplified UX. - -Exportable by `torch.export.export` is a prerequisite to use the flow, you can -find what are the constructs that's supported in `Export DB `_. - -The high level architecture of quantization 2 with quantizer could look like -this: - -:: - - float_model(Python) Example Input - \ / - \ / - —------------------------------------------------------- - | export | - —------------------------------------------------------- - | - FX Graph in ATen Backend Specific Quantizer - | / - —-------------------------------------------------------- - | prepare_pt2e | - —-------------------------------------------------------- - | - Calibrate/Train - | - —-------------------------------------------------------- - | convert_pt2e | - —-------------------------------------------------------- - | - Quantized Model - | - —-------------------------------------------------------- - | Lowering | - —-------------------------------------------------------- - | - Executorch, Inductor or - - -The PyTorch 2 export quantization API looks like this: - -.. code:: python - - import torch - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(5, 10) - - def forward(self, x): - return self.linear(x) - - - example_inputs = (torch.randn(1, 5),) - m = M().eval() - - # Step 1. program capture - # This is available for pytorch 2.5+, for more details on lower pytorch versions - # please check `Export the model with torch.export` section - m = torch.export.export_for_training(m, example_inputs).module() - # we get a model with aten ops - - - # Step 2. quantization - from torch.ao.quantization.quantize_pt2e import ( - prepare_pt2e, - convert_pt2e, - ) - - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - # backend developer will write their own Quantizer and expose methods to allow - # users to express how they - # want the model to be quantized - quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) - m = prepare_pt2e(m, quantizer) - - # calibration omitted - - m = convert_pt2e(m) - # we have a model with aten ops doing integer computations when possible - - -Motivation of PyTorch 2 Export Quantization ---------------------------------------------- - -In PyTorch versions prior to 2, we have FX Graph Mode Quantization that uses -`QConfigMapping `_ -and `BackendConfig `_ -for customizations. ``QConfigMapping`` allows modeling users to specify how -they want their model to be quantized, ``BackendConfig`` allows backend -developers to specify the supported ways of quantization in their backend. While -that API covers most use cases relatively well, it is not fully extensible. 
-There are two main limitations for the current API: - -* Limitation around expressing quantization intentions for complicated operator - patterns (how an operator pattern should be observed/quantized) using existing - objects: ``QConfig`` and ``QConfigMapping``. - -* Limited support on how user can express their intention of how they want - their model to be quantized. For example, if users want to quantize the every - other linear in the model, or the quantization behavior has some dependency on - the actual shape of the Tensor (for example, only observe/quantize inputs - and outputs when the linear has a 3D input), backend developer or modeling - users need to change the core quantization API/flow. - -A few improvements could make the existing flow better: - -* We use ``QConfigMapping`` and ``BackendConfig`` as separate objects, - ``QConfigMapping`` describes user’s intention of how they want their model to - be quantized, ``BackendConfig`` describes what kind of quantization a backend - supports. ``BackendConfig`` is backend-specific, but ``QConfigMapping`` is not, - and the user can provide a ``QConfigMapping`` that is incompatible with a specific - ``BackendConfig``, this is not a great UX. Ideally, we can structure this better - by making both configuration (``QConfigMapping``) and quantization capability - (``BackendConfig``) backend-specific, so there will be less confusion about - incompatibilities. -* In ``QConfig`` we are exposing observer/ ``fake_quant`` observer classes as an - object for the user to configure quantization, this increases the things that - the user may need to care about. For example, not only the ``dtype`` but also - how the observation should happen, these could potentially be hidden from the - user so that the user flow is simpler. - -Here is a summary of the benefits of the new API: - -- **Programmability** (addressing 1. and 2.): When a user’s quantization needs - are not covered by available quantizers, users can build their own quantizer and - compose it with other quantizers as mentioned above. -- **Simplified UX** (addressing 3.): Provides a single instance with which both - backend and users interact. Thus you no longer have the user facing quantization - config mapping to map users intent and a separate quantization config that - backends interact with to configure what backend support. We will still have a - method for users to query what is supported in a quantizer. With a single - instance, composing different quantization capabilities also becomes more - natural than previously. - - For example XNNPACK does not support ``embedding_byte`` - and we have natively support for this in ExecuTorch. Thus, if we had - ``ExecuTorchQuantizer`` that only quantized ``embedding_byte``, then it can be - composed with ``XNNPACKQuantizer``. (Previously, this used to be concatenating the - two ``BackendConfig`` together and since options in ``QConfigMapping`` are not - backend specific, user also need to figure out how to specify the configurations - by themselves that matches the quantization capabilities of the combined - backend. With a single quantizer instance, we can compose two quantizers and - query the composed quantizer for capabilities, which makes it less error prone - and cleaner, for example, ``composed_quantizer.quantization_capabilities())``. 
- -- **Separation of concerns** (addressing 4.): As we design the quantizer API, we - also decouple specification of quantization, as expressed in terms of ``dtype``, - min/max (# of bits), symmetric, and so on, from the observer concept. - Currently, the observer captures both quantization specification and how to - observe (Histogram vs MinMax observer). Modeling users are freed from - interacting with observer and fake quant objects with this change. - -Define Helper Functions and Prepare Dataset -------------------------------------------- - -We’ll start by doing the necessary imports, defining some helper functions and -prepare the data. These steps are identitcal to -`Static Quantization with Eager Mode in PyTorch `_. - -To run the code in this tutorial using the entire ImageNet dataset, first -download Imagenet by following the instructions at here -`ImageNet Data `_. Unzip the downloaded file -into the ``data_path`` folder. - -Download the `torchvision resnet18 model `_ -and rename it to ``data/resnet18_pretrained_float.pth``. - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - - def accuracy(output, target, topk=(1,)): - """ - Computes the accuracy over the k top predictions for the specified - values of k. 
- """ - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - - def evaluate(model, criterion, data_loader): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file, weights_only=True) - model.load_state_dict(state_dict) - model.to("cpu") - return model - - def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 30 - eval_batch_size = 50 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cpu") - float_model.eval() - - # create another instance of the model since - # we need to keep the original model around - model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu") - -Set the model to eval mode --------------------------- - -For post training quantization, we'll need to set the model to the eval mode. - -.. code:: python - - model_to_quantize.eval() - -Export the model with torch.export ----------------------------------- - -Here is how you can use ``torch.export`` to export the model: - -.. 
code-block:: python - - example_inputs = (torch.rand(2, 3, 224, 224),) - # for pytorch 2.5+ - exported_model = torch.export.export_for_training(model_to_quantize, example_inputs).module() - - # for pytorch 2.4 and before - # from torch._export import capture_pre_autograd_graph - # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) - - # or capture with dynamic dimensions - # for pytorch 2.5+ - dynamic_shapes = tuple( - {0: torch.export.Dim("dim")} if i == 0 else None - for i in range(len(example_inputs)) - ) - exported_model = torch.export.export_for_training(model_to_quantize, example_inputs, dynamic_shapes=dynamic_shapes).module() - - # for pytorch 2.4 and before - # dynamic_shape API may vary as well - # from torch._export import dynamic_dim - # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]) - - -Import the Backend Specific Quantizer and Configure how to Quantize the Model ------------------------------------------------------------------------------ - -The following code snippets describes how to quantize the model: - -.. code-block:: python - - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config()) - -``Quantizer`` is backend specific, and each ``Quantizer`` will provide their -own way to allow users to configure their model. Just as an example, here is -the different configuration APIs supported by ``XNNPackQuantizer``: - -.. code-block:: python - - quantizer.set_global(qconfig_opt) # qconfig_opt is an optional quantization config - .set_object_type(torch.nn.Conv2d, qconfig_opt) # can be a module type - .set_object_type(torch.nn.functional.linear, qconfig_opt) # or torch functional op - .set_module_name("foo.bar", qconfig_opt) - -.. note:: - - Check out our - `tutorial `_ - that describes how to write a new ``Quantizer``. - -Prepare the Model for Post Training Quantization ----------------------------------------------------------- - -``prepare_pt2e`` folds ``BatchNorm`` operators into preceding ``Conv2d`` -operators, and inserts observers in appropriate places in the model. - -.. code-block:: python - - prepared_model = prepare_pt2e(exported_model, quantizer) - print(prepared_model.graph) - -Calibration --------------- - -The calibration function is run after the observers are inserted in the model. -The purpose for calibration is to run through some sample examples that is -representative of the workload (for example a sample of the training data set) -so that the observers in themodel are able to observe the statistics of the -Tensors and we can later use this information to calculate quantization -parameters. - -.. code-block:: python - - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Convert the Calibrated Model to a Quantized Model -------------------------------------------------- - -``convert_pt2e`` takes a calibrated model and produces a quantized model. - -.. code-block:: python - - quantized_model = convert_pt2e(prepared_model) - print(quantized_model) - -At this step, we currently have two representations that you can choose from, but exact representation -we offer in the long term might change based on feedback from PyTorch users. 
-
-* Q/DQ Representation (default)
-
-  As described in the previous documentation for `representations `_, all quantized operators are represented as ``dequantize -> fp32_op -> quantize``.
-
-.. code-block:: python
-
-  def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point):
-      # x_quant_min/x_quant_max (and the weight/output equivalents) are the quantization range bounds, e.g. -128/127 for int8
-      x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
-          x_int8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
-      weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
-          weight_int8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8)
-      weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0])
-      out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted)
-      out_int8 = torch.ops.quantized_decomposed.quantize_per_tensor(
-          out_fp32, output_scale, output_zero_point, out_quant_min, out_quant_max, torch.int8)
-      return out_int8
-
-* Reference Quantized Model Representation
-
-  We will have a special representation for selected ops, for example, quantized linear. Other ops are represented as ``dq -> float32_op -> q``, and ``q/dq`` are decomposed into more primitive operators.
-  You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``.
-
-.. code-block:: python
-
-  # Reference Quantized Pattern for quantized linear
-  def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point):
-      x_int16 = x_int8.to(torch.int16)
-      weight_int16 = weight_int8.to(torch.int16)
-      acc_int32 = out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point))
-      bias_scale = x_scale * weight_scale
-      bias_int32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
-      acc_int32 = acc_int32 + bias_int32
-      acc_int32 = out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + output_zero_point
-      # qmin/qmax are the int8 range bounds, i.e. -128 and 127
-      out_int8 = torch.ops.aten.clamp(acc_int32, qmin, qmax).to(torch.int8)
-      return out_int8
-
-
-See `here `_ for the most up-to-date reference representations.
-
-
-Checking Model Size and Accuracy Evaluation
-----------------------------------------------
-
-Now we can compare the size and accuracy of the quantized model against the baseline model.
-
-.. code-block:: python
-
-    # Baseline model size and accuracy
-    print("Size of baseline model")
-    print_size_of_model(float_model)
-
-    top1, top5 = evaluate(float_model, criterion, data_loader_test)
-    print("Baseline Float Model Evaluation accuracy: %2.2f, %2.2f"%(top1.avg, top5.avg))
-
-    # Quantized model size and accuracy
-    print("Size of model after quantization")
-    # export again to remove unused weights
-    quantized_model = torch.export.export_for_training(quantized_model, example_inputs).module()
-    print_size_of_model(quantized_model)
-
-    top1, top5 = evaluate(quantized_model, criterion, data_loader_test)
-    print("[before serialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg))
-
-
-.. note::
-   We can't do a performance evaluation yet, since the model is not lowered to the
-   target device; it is just a representation of the quantized computation in ATen
-   operators.
-
-.. note::
-   The weights are still in fp32 right now; we may do constant propagation for the quantize op to
-   get integer weights in the future.
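As a quick sanity check, you can count the quantize/dequantize nodes in the converted graph to confirm which operators were actually converted. This is a minimal sketch that is not part of the original flow; it only assumes that ``quantized_model`` is the FX ``GraphModule`` returned by ``convert_pt2e`` above.

.. code-block:: python

    # Minimal sketch: count quantize/dequantize nodes in the converted graph.
    # Assumes `quantized_model` is the GraphModule produced by `convert_pt2e`.
    def count_qdq_nodes(gm):
        num_q, num_dq = 0, 0
        for node in gm.graph.nodes:
            if node.op != "call_function":
                continue
            target_name = str(node.target)
            if "dequantize_per" in target_name:
                num_dq += 1
            elif "quantize_per" in target_name:
                num_q += 1
        return num_q, num_dq

    num_q, num_dq = count_qdq_nodes(quantized_model)
    print(f"quantize nodes: {num_q}, dequantize nodes: {num_dq}")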
- -If you want to get better accuracy or performance, try configuring -``quantizer`` in different ways, and each ``quantizer`` will have its own way -of configuration, so please consult the documentation for the -quantizer you are using to learn more about how you can have more control -over how to quantize a model. - -Save and Load Quantized Model ---------------------------------- - -We'll show how to save and load the quantized model. - - -.. code-block:: python - - # 0. Store reference output, for example, inputs, and check evaluation accuracy: - example_inputs = (next(iter(data_loader))[0],) - ref = quantized_model(*example_inputs) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test) - print("[before serialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - # 1. Export the model and Save ExportedProgram - pt2e_quantized_model_file_path = saved_model_dir + "resnet18_pt2e_quantized.pth" - # capture the model to get an ExportedProgram - quantized_ep = torch.export.export(quantized_model, example_inputs) - # use torch.export.save to save an ExportedProgram - torch.export.save(quantized_ep, pt2e_quantized_model_file_path) - - - # 2. Load the saved ExportedProgram - loaded_quantized_ep = torch.export.load(pt2e_quantized_model_file_path) - loaded_quantized_model = loaded_quantized_ep.module() - - # 3. Check results for example inputs and check evaluation accuracy again: - res = loaded_quantized_model(*example_inputs) - print("diff:", ref - res) - - top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test) - print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) - - -Output: - - -.. code-block:: python - - [before serialization] Evaluation accuracy on test dataset: 79.82, 94.55 - diff: tensor([[0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - ..., - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.], - [0., 0., 0., ..., 0., 0., 0.]]) - - [after serialization/deserialization] Evaluation accuracy on test dataset: 79.82, 94.55 - - -Debugging the Quantized Model ------------------------------- - -You can use `Numeric Suite `_ -that can help with debugging in eager mode and FX graph mode. The new version of -Numeric Suite working with PyTorch 2 Export models is still in development. - -Lowering and Performance Evaluation ------------------------------------- - -The model produced at this point is not the final model that runs on the device, -it is a reference quantized model that captures the intended quantized computation -from the user, expressed as ATen operators and some additional quantize/dequantize operators, -to get a model that runs on real devices, we'll need to lower the model. -For example, for the models that run on edge devices, we can lower with delegation and ExecuTorch runtime -operators. - -Conclusion --------------- - -In this tutorial, we went through the overall quantization flow in PyTorch 2 -Export Quantization using ``XNNPACKQuantizer`` and got a quantized model that -could be further lowered to a backend that supports inference with XNNPACK -backend. To use this for your own backend, please first follow the -`tutorial `__ and -implement a ``Quantizer`` for your backend, and then quantize the model with -that ``Quantizer``. 
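As a follow-up to the Lowering and Performance Evaluation section above, the following is a minimal sketch of one possible lowering path. It assumes a server-class CPU target and uses TorchInductor through ``torch.compile`` rather than the XNNPACK/ExecuTorch delegation flow, so it is illustrative rather than the only way to lower the model.

.. code-block:: python

    # Minimal sketch (assumption: lowering through TorchInductor on CPU; the
    # XNNPACK/ExecuTorch delegation flow mentioned above is a separate path).
    import torch

    with torch.no_grad():
        # torch.compile lowers the reference quantized model into optimized kernels
        lowered_model = torch.compile(quantized_model)
        # the first call triggers compilation; later calls reuse the compiled kernels
        lowered_model(*example_inputs)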
diff --git a/prototype_source/pt2e_quant_ptq_x86_inductor.rst b/prototype_source/pt2e_quant_ptq_x86_inductor.rst deleted file mode 100644 index 39214a51749..00000000000 --- a/prototype_source/pt2e_quant_ptq_x86_inductor.rst +++ /dev/null @@ -1,10 +0,0 @@ -Quantization in PyTorch 2.0 Export Tutorial -=========================================== - -This tutorial has been moved. - -Redirecting in 3 seconds... - -.. raw:: html - - diff --git a/prototype_source/pt2e_quant_qat.rst b/prototype_source/pt2e_quant_qat.rst deleted file mode 100644 index 8f11b0730c5..00000000000 --- a/prototype_source/pt2e_quant_qat.rst +++ /dev/null @@ -1,487 +0,0 @@ -(prototype) PyTorch 2 Export Quantization-Aware Training (QAT) -================================================================ -**Author**: `Andrew Or `_ - -This tutorial shows how to perform quantization-aware training (QAT) in -graph mode based on `torch.export.export `_. -For more details about PyTorch 2 Export Quantization in general, refer -to the `post training quantization tutorial `_. - -The PyTorch 2 Export QAT flow looks like the following—it is similar -to the post training quantization (PTQ) flow for the most part: - -.. code:: python - - import torch - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantize_pt2e import ( - prepare_qat_pt2e, - convert_pt2e, - ) - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(5, 10) - - def forward(self, x): - return self.linear(x) - - - example_inputs = (torch.randn(1, 5),) - m = M() - - # Step 1. program capture - # This is available for pytorch 2.5+, for more details on lower pytorch versions - # please check `Export the model with torch.export` section - m = torch.export.export_for_training(m, example_inputs).module() - # we get a model with aten ops - - # Step 2. quantization-aware training - # backend developer will write their own Quantizer and expose methods to allow - # users to express how they want the model to be quantized - quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config()) - m = prepare_qat_pt2e(m, quantizer) - - # train omitted - - m = convert_pt2e(m) - # we have a model with aten ops doing integer computations when possible - - # move the quantized model to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(m) - -Note that calling ``model.eval()`` or ``model.train()`` after program capture is -not allowed, because these methods no longer correctly change the behavior of -certain ops like dropout and batch normalization. Instead, please use -``torch.ao.quantization.move_exported_model_to_eval()`` and -``torch.ao.quantization.move_exported_model_to_train()`` (coming soon) -respectively. - - -Define Helper Functions and Prepare the Dataset ------------------------------------------------ - -To run the code in this tutorial using the entire ImageNet dataset, first -download ImageNet by following the instructions in -`ImageNet Data `_. Unzip the downloaded file -into the ``data_path`` folder. - -Next, download the `torchvision resnet18 model `_ -and rename it to ``data/resnet18_pretrained_float.pth``. - -We’ll start by doing the necessary imports, defining some helper functions and -prepare the data. 
These steps are very similar to the ones defined in the -`static eager mode post training quantization tutorial `_: - -.. code:: python - - import os - import sys - import time - import numpy as np - - import torch - import torch.nn as nn - from torch.utils.data import DataLoader - - import torchvision - from torchvision import datasets - from torchvision.models.resnet import resnet18 - import torchvision.transforms as transforms - - # Set up warnings - import warnings - warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' - ) - warnings.filterwarnings( - action='default', - module=r'torch.ao.quantization' - ) - - # Specify random seed for repeatable results - _ = torch.manual_seed(191009) - - class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def accuracy(output, target, topk=(1,)): - """ - Computes the accuracy over the k top predictions for the specified - values of k. - """ - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - def evaluate(model, criterion, data_loader, device): - torch.ao.quantization.move_exported_model_to_eval(model) - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - image = image.to(device) - target = target.to(device) - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - - def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file, weights_only=True) - model.load_state_dict(state_dict) - return model - - def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print("Size (MB):", os.path.getsize("temp.p")/1e6) - os.remove("temp.p") - - def prepare_data_loaders(data_path): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - dataset = torchvision.datasets.ImageNet( - data_path, split="train", transform=transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - dataset_test = torchvision.datasets.ImageNet( - data_path, split="val", transform=transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - 
data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - - def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches): - # Note: do not call model.train() here, since this doesn't work on an exported model. - # Instead, call `torch.ao.quantization.move_exported_model_to_train(model)`, which will - # be added in the near future - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - avgloss = AverageMeter('Loss', '1.5f') - - cnt = 0 - for image, target in data_loader: - start_time = time.time() - print('.', end = '') - cnt += 1 - image, target = image.to(device), target.to(device) - output = model(image) - loss = criterion(output, target) - optimizer.zero_grad() - loss.backward() - optimizer.step() - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - avgloss.update(loss, image.size(0)) - if cnt >= ntrain_batches: - print('Loss', avgloss.avg) - - print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - return - - print('Full imagenet train set: * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}' - .format(top1=top1, top5=top5)) - return - - data_path = '~/.data/imagenet' - saved_model_dir = 'data/' - float_model_file = 'resnet18_pretrained_float.pth' - - train_batch_size = 32 - eval_batch_size = 32 - - data_loader, data_loader_test = prepare_data_loaders(data_path) - example_inputs = (next(iter(data_loader))[0]) - criterion = nn.CrossEntropyLoss() - float_model = load_model(saved_model_dir + float_model_file).to("cuda") - - -Export the model with torch.export ----------------------------------- - -Here is how you can use ``torch.export`` to export the model: - -.. code:: python - - from torch._export import capture_pre_autograd_graph - - example_inputs = (torch.rand(2, 3, 224, 224),) - # for pytorch 2.5+ - exported_model = torch.export.export_for_training(float_model, example_inputs).module() - # for pytorch 2.4 and before - # from torch._export import capture_pre_autograd_graph - # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) - - -.. code:: python - - # or, to capture with dynamic dimensions: - - # for pytorch 2.5+ - dynamic_shapes = tuple( - {0: torch.export.Dim("dim")} if i == 0 else None - for i in range(len(example_inputs)) - ) - exported_model = torch.export.export_for_training(float_model, example_inputs, dynamic_shapes=dynamic_shapes).module() - - # for pytorch 2.4 and before - # dynamic_shape API may vary as well - # from torch._export import dynamic_dim - - # example_inputs = (torch.rand(2, 3, 224, 224),) - # exported_model = capture_pre_autograd_graph( - # float_model, - # example_inputs, - # constraints=[dynamic_dim(example_inputs[0], 0)], - # ) - - -Import the Backend Specific Quantizer and Configure how to Quantize the Model ------------------------------------------------------------------------------ - -The following code snippets describe how to quantize the model: - -.. code-block:: python - - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config(is_qat=True)) - -``Quantizer`` is backend specific, and each ``Quantizer`` will provide their -own way to allow users to configure their model. - -.. 
note:: - - Check out our - `tutorial `_ - that describes how to write a new ``Quantizer``. - - -Prepare the Model for Quantization-Aware Training ----------------------------------------------------------- - -``prepare_qat_pt2e`` inserts fake quantizes in appropriate places in the model -and performs the appropriate QAT "fusions", such as ``Conv2d`` + ``BatchNorm2d``, -for better training accuracies. The fused operations are represented as a subgraph -of ATen ops in the prepared graph. - -.. code-block:: python - - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - print(prepared_model) - -.. note:: - - If your model contains batch normalization, the actual ATen ops you get - in the graph depend on the model's device when you export the model. - If the model is on CPU, then you'll get ``torch.ops.aten._native_batch_norm_legit``. - If the model is on CUDA, then you'll get ``torch.ops.aten.cudnn_batch_norm``. - However, this is not fundamental and may be subject to change in the future. - - Between these two ops, it has been shown that ``torch.ops.aten.cudnn_batch_norm`` - provides better numerics on models like MobileNetV2. To get this op, either - call ``model.cuda()`` before export, or run the following after prepare to manually - swap the ops: - - .. code:: python - - for n in prepared_model.graph.nodes: - if n.target == torch.ops.aten._native_batch_norm_legit.default: - n.target = torch.ops.aten.cudnn_batch_norm.default - prepared_model.recompile() - - In the future, we plan to consolidate the batch normalization ops such that - the above will no longer be necessary. - -Training Loop ------------------------------------------------------------------------------ - -The training loop is similar to the ones in previous versions of QAT. To achieve -better accuracies, you may optionally disable observers and updating batch -normalization statistics after a certain number of epochs, or evaluate the QAT -or the quantized model trained so far every ``N`` epochs. - -.. code:: python - - num_epochs = 10 - num_train_batches = 20 - num_eval_batches = 20 - num_observer_update_epochs = 4 - num_batch_norm_update_epochs = 3 - num_epochs_between_evals = 2 - - # QAT takes time and one needs to train over a few epochs. - # Train and check accuracy after each epoch - for nepoch in range(num_epochs): - train_one_epoch(prepared_model, criterion, optimizer, data_loader, "cuda", num_train_batches) - - # Optionally disable observer/batchnorm stats after certain number of epochs - if epoch >= num_observer_update_epochs: - print("Disabling observer for subseq epochs, epoch = ", epoch) - prepared_model.apply(torch.ao.quantization.disable_observer) - if epoch >= num_batch_norm_update_epochs: - print("Freezing BN for subseq epochs, epoch = ", epoch) - for n in prepared_model.graph.nodes: - # Args: input, weight, bias, running_mean, running_var, training, momentum, eps - # We set the `training` flag to False here to freeze BN stats - if n.target in [ - torch.ops.aten._native_batch_norm_legit.default, - torch.ops.aten.cudnn_batch_norm.default, - ]: - new_args = list(n.args) - new_args[5] = False - n.args = new_args - prepared_model.recompile() - - # Check the quantized accuracy every N epochs - # Note: If you wish to just evaluate the QAT model (not the quantized model), - # then you can just call `torch.ao.quantization.move_exported_model_to_eval/train`. - # However, the latter API is not ready yet and will be available in the near future. 
- if (nepoch + 1) % num_epochs_between_evals == 0: - prepared_model_copy = copy.deepcopy(prepared_model) - quantized_model = convert_pt2e(prepared_model_copy) - top1, top5 = evaluate(quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Epoch %d: Evaluation accuracy on %d images, %2.2f' % (nepoch, num_eval_batches * eval_batch_size, top1.avg)) - - -Saving and Loading Model Checkpoints ----------------------------------------------------------- - -Model checkpoints for the PyTorch 2 Export QAT flow are -the same as in any other training flow. They are useful for -pausing training and resuming it later, recovering from -failed training runs, and performing inference on different -machines at a later time. You can save model checkpoints -during or after training as follows: - -.. code:: python - - checkpoint_path = "/path/to/my/checkpoint_%s.pth" % nepoch - torch.save(prepared_model.state_dict(), "checkpoint_path") - -To load the checkpoints, you must export and prepare the -model the exact same way it was initially exported and -prepared. For example: - -.. code:: python - - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantizer.xnnpack_quantizer import ( - XNNPACKQuantizer, - get_symmetric_quantization_config, - ) - from torchvision.models.resnet import resnet18 - - example_inputs = (torch.rand(2, 3, 224, 224),) - float_model = resnet18(pretrained=False) - exported_model = capture_pre_autograd_graph(float_model, example_inputs) - quantizer = XNNPACKQuantizer() - quantizer.set_global(get_symmetric_quantization_config(is_qat=True)) - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - prepared_model.load_state_dict(torch.load(checkpoint_path)) - - # resume training or perform inference - - -Convert the Trained Model to a Quantized Model ----------------------------------------------------------- - -``convert_pt2e`` takes a calibrated model and produces a quantized model. -Note that, before inference, you must first call -``torch.ao.quantization.move_exported_model_to_eval()`` to ensure certain ops -like dropout behave correctly in the eval graph. Otherwise, we would continue -to incorrectly apply dropout in the forward pass during inference, for example. - -.. code-block:: python - - quantized_model = convert_pt2e(prepared_model) - - # move certain ops like dropout to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(m) - - print(quantized_model) - - top1, top5 = evaluate(quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) - print('Final evaluation accuracy on %d images, %2.2f' % (num_eval_batches * eval_batch_size, top1.avg)) - -.. TODO: add results here - - -Conclusion --------------- - -In this tutorial, we demonstrated how to run Quantization-Aware Training (QAT) -flow in PyTorch 2 Export Quantization. After convert, the rest of the flow -is the same as Post-Training Quantization (PTQ); the user can -serialize/deserialize the model and further lower it to a backend that supports -inference with XNNPACK backend. For more detail, follow the -`PTQ tutorial `_. 
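Since the rest of the flow after ``convert_pt2e`` is the same as PTQ, the converted QAT model can be serialized and reloaded just like in the PTQ tutorial above. The following is a minimal sketch mirroring those save/load steps; the file name is illustrative.

.. code-block:: python

    # Minimal sketch mirroring the PTQ save/load steps; the file name is illustrative.
    example_inputs = (next(iter(data_loader))[0],)  # a proper tuple of example inputs
    pt2e_qat_model_file_path = saved_model_dir + "resnet18_qat_pt2e_quantized.pth"

    # capture the quantized model into an ExportedProgram and save it
    quantized_ep = torch.export.export(quantized_model, example_inputs)
    torch.export.save(quantized_ep, pt2e_qat_model_file_path)

    # load it back and recover a callable module
    loaded_quantized_ep = torch.export.load(pt2e_qat_model_file_path)
    loaded_quantized_model = loaded_quantized_ep.module()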
diff --git a/prototype_source/pt2e_quant_x86_inductor.rst b/prototype_source/pt2e_quant_x86_inductor.rst deleted file mode 100644 index f9836d6e371..00000000000 --- a/prototype_source/pt2e_quant_x86_inductor.rst +++ /dev/null @@ -1,313 +0,0 @@ -PyTorch 2 Export Quantization with X86 Backend through Inductor -================================================================== - -**Author**: `Leslie Fang `_, `Weiwen Xia `_, `Jiong Gong `_, `Jerry Zhang `_ - -Prerequisites ---------------- - -- `PyTorch 2 Export Post Training Quantization `_ -- `PyTorch 2 Export Quantization-Aware Training `_ -- `TorchInductor and torch.compile concepts in PyTorch `_ -- `Inductor C++ Wrapper concepts `_ - -Introduction --------------- - -This tutorial introduces the steps for utilizing the PyTorch 2 Export Quantization flow to generate a quantized model customized -for the x86 inductor backend and explains how to lower the quantized model into the inductor. - -The pytorch 2 export quantization flow uses the torch.export to capture the model into a graph and perform quantization transformations on top of the ATen graph. -This approach is expected to have significantly higher model coverage, better programmability, and a simplified UX. -TorchInductor is the new compiler backend that compiles the FX Graphs generated by TorchDynamo into optimized C++/Triton kernels. - -This flow of quantization 2 with Inductor supports both static and dynamic quantization. Static quantization works best for CNN models, like ResNet-50. And dynamic quantization is more suitable for NLP models, like RNN and BERT. -For the difference between the two quantization types, please refer to the `following page `__. - -The quantization flow mainly includes three steps: - -- Step 1: Capture the FX Graph from the eager Model based on the `torch export mechanism `_. -- Step 2: Apply the Quantization flow based on the captured FX Graph, including defining the backend-specific quantizer, generating the prepared model with observers, - performing the prepared model's calibration or quantization-aware training, and converting the prepared model into the quantized model. -- Step 3: Lower the quantized model into inductor with the API ``torch.compile``. - -The high-level architecture of this flow could look like this: - -:: - - float_model(Python) Example Input - \ / - \ / - —-------------------------------------------------------- - | export | - —-------------------------------------------------------- - | - FX Graph in ATen - | X86InductorQuantizer - | / - —-------------------------------------------------------- - | prepare_pt2e | - | | | - | Calibrate/Train | - | | | - | convert_pt2e | - —-------------------------------------------------------- - | - Quantized Model - | - —-------------------------------------------------------- - | Lower into Inductor | - —-------------------------------------------------------- - | - Inductor - -Combining Quantization in PyTorch 2 Export and TorchInductor, we have flexibility and productivity with the new Quantization frontend -and outstanding out-of-box performance with the compiler backend. Especially on Intel fourth generation (SPR) Xeon processors which can -further boost the models' performance by leveraging the -`advanced-matrix-extensions `_ feature. - -Post Training Quantization ----------------------------- - -Now, we will walk you through a step-by-step tutorial for how to use it with `torchvision resnet18 model `_ -for post training quantization. - -1. 
Capture FX Graph -^^^^^^^^^^^^^^^^^^^^^ - -We will start by performing the necessary imports, capturing the FX Graph from the eager module. - -:: - - import torch - import torchvision.models as models - import copy - from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e - import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer - from torch._export import capture_pre_autograd_graph - - # Create the Eager Model - model_name = "resnet18" - model = models.__dict__[model_name](pretrained=True) - - # Set the model to eval mode - model = model.eval() - - # Create the data, using the dummy data here as an example - traced_bs = 50 - x = torch.randn(traced_bs, 3, 224, 224).contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Capture the FX Graph to be quantized - with torch.no_grad(): - # if you are using the PyTorch nightlies or building from source with the pytorch master, - # use the API of `capture_pre_autograd_graph` - # Note 1: `capture_pre_autograd_graph` is also a short-term API, it will be updated to use the official `torch.export` API when that is ready. - exported_model = capture_pre_autograd_graph( - model, - example_inputs - ) - # Note 2: if you are using the PyTorch 2.1 release binary or building from source with the PyTorch 2.1 release branch, - # please use the API of `torch._dynamo.export` to capture the FX Graph. - # exported_model, guards = torch._dynamo.export( - # model, - # *copy.deepcopy(example_inputs), - # aten_graph=True, - # ) - - -Next, we will have the FX Module to be quantized. - -2. Apply Quantization -^^^^^^^^^^^^^^^^^^^^^^^ - -After we capture the FX Module to be quantized, we will import the Backend Quantizer for X86 CPU and configure how to -quantize the model. - -:: - - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config()) - -.. note:: - - The default quantization configuration in ``X86InductorQuantizer`` uses 8-bits for both activations and weights. - When Vector Neural Network Instruction is not available, the oneDNN backend silently chooses kernels that assume - `multiplications are 7-bit x 8-bit `_. In other words, potential - numeric saturation and accuracy issue may happen when running on CPU without Vector Neural Network Instruction. - -The quantization config is for static quantization by default. To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. - -.. code-block:: python - - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_dynamic=True)) - - -After we import the backend-specific Quantizer, we will prepare the model for post-training quantization. -``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators, and inserts observers in appropriate places in the model. - -:: - - prepared_model = prepare_pt2e(exported_model, quantizer) - -Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model. This step is needed for static quantization only. 
- -:: - - # We use the dummy data as an example here - prepared_model(*example_inputs) - - # Alternatively: user can define the dataset to calibrate - # def calibrate(model, data_loader): - # model.eval() - # with torch.no_grad(): - # for image, target in data_loader: - # model(image) - # calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Finally, we will convert the calibrated Model to a quantized Model. ``convert_pt2e`` takes a calibrated model and produces a quantized model. - -:: - - converted_model = convert_pt2e(prepared_model) - -After these steps, we finished running the quantization flow and we will get the quantized model. - - -3. Lower into Inductor -^^^^^^^^^^^^^^^^^^^^^^^^ - -After we get the quantized model, we will further lower it to the inductor backend. The default Inductor wrapper -generates Python code to invoke both generated kernels and external kernels. Additionally, Inductor supports -C++ wrapper that generates pure C++ code. This allows seamless integration of the generated and external kernels, -effectively reducing Python overhead. In the future, leveraging the C++ wrapper, we can extend the capability -to achieve pure C++ deployment. For more comprehensive details about C++ Wrapper in general, please refer to the -dedicated tutorial on `Inductor C++ Wrapper Tutorial `_. - -:: - - # Optional: using the C++ wrapper instead of default Python wrapper - import torch._inductor.config as config - config.cpp_wrapper = True - -:: - - with torch.no_grad(): - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - -In a more advanced scenario, int8-mixed-bf16 quantization comes into play. In this instance, -a Convolution or GEMM operator produces BFloat16 output data type instead of Float32 in the absence -of a subsequent quantization node. Subsequently, the BFloat16 tensor seamlessly propagates through -subsequent pointwise operators, effectively minimizing memory usage and potentially enhancing performance. -The utilization of this feature mirrors that of regular BFloat16 Autocast, as simple as wrapping the -script within the BFloat16 Autocast context. - -:: - - with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True), torch.no_grad(): - # Turn on Autocast to use int8-mixed-bf16 quantization. After lowering into Inductor CPP Backend, - # For operators such as QConvolution and QLinear: - # * The input data type is consistently defined as int8, attributable to the presence of a pair - of quantization and dequantization nodes inserted at the input. - # * The computation precision remains at int8. - # * The output data type may vary, being either int8 or BFloat16, contingent on the presence - # of a pair of quantization and dequantization nodes at the output. - # For non-quantizable pointwise operators, the data type will be inherited from the previous node, - # potentially resulting in a data type of BFloat16 in this scenario. - # For quantizable pointwise operators such as QMaxpool2D, it continues to operate with the int8 - # data type for both input and output. - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - -Put all these codes together, we will have the toy example code. -Please note that since the Inductor ``freeze`` feature does not turn on by default yet, run your example code with ``TORCHINDUCTOR_FREEZING=1``. 
- -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_pytorch_2_1.py - -With PyTorch 2.1 release, all CNN models from TorchBench test suite have been measured and proven effective comparing with Inductor FP32 inference path. Please refer -to `this document `_ -for detail benchmark number. - -Quantization Aware Training ------------------------------ - -The PyTorch 2 Export Quantization-Aware Training (QAT) is now supported on X86 CPU using X86InductorQuantizer, -followed by the subsequent lowering of the quantized model into Inductor. -For a more in-depth understanding of PT2 Export Quantization-Aware Training, -we recommend referring to the dedicated `PyTorch 2 Export Quantization-Aware Training `_. - -The PyTorch 2 Export QAT flow is largely similar to the PTQ flow: - -.. code:: python - - import torch - from torch._export import capture_pre_autograd_graph - from torch.ao.quantization.quantize_pt2e import ( - prepare_qat_pt2e, - convert_pt2e, - ) - import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(1024, 1000) - - def forward(self, x): - return self.linear(x) - - example_inputs = (torch.randn(1, 1024),) - m = M() - - # Step 1. program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - exported_model = capture_pre_autograd_graph(m, example_inputs) - # we get a model with aten ops - - # Step 2. quantization-aware training - # Use Backend Quantizer for X86 CPU - # To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_qat=True)) - prepared_model = prepare_qat_pt2e(exported_model, quantizer) - - # train omitted - - converted_model = convert_pt2e(prepared_model) - # we have a model with aten ops doing integer computations when possible - - # move the quantized model to eval mode, equivalent to `m.eval()` - torch.ao.quantization.move_exported_model_to_eval(converted_model) - - # Lower the model into Inductor - with torch.no_grad(): - optimized_model = torch.compile(converted_model) - _ = optimized_model(*example_inputs) - -Please note that the Inductor ``freeze`` feature is not enabled by default. -To use this feature, you need to run example code with ``TORCHINDUCTOR_FREEZING=1``. - -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_qat.py - -Conclusion ------------- - -With this tutorial, we introduce how to use Inductor with X86 CPU in PyTorch 2 Quantization. Users can learn about -how to use ``X86InductorQuantizer`` to quantize a model and lower it into the inductor with X86 CPU devices. 
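To get a rough sense of the speedup from lowering into Inductor, a simple wall-clock comparison can be used. The following is a minimal sketch that assumes the ``model``, ``optimized_model``, and ``example_inputs`` from the post training quantization walkthrough above; it is an illustration rather than a rigorous benchmark, and should also be run with ``TORCHINDUCTOR_FREEZING=1``.

.. code-block:: python

    # Minimal timing sketch; assumes `model`, `optimized_model`, and `example_inputs`
    # from the walkthrough above. Not a rigorous benchmark.
    import time
    import torch

    def measure_latency(fn, inputs, warmup=5, iters=20):
        with torch.no_grad():
            for _ in range(warmup):
                fn(*inputs)  # warm-up (includes compilation for compiled models)
            start = time.time()
            for _ in range(iters):
                fn(*inputs)
        return (time.time() - start) / iters

    fp32_latency = measure_latency(model, example_inputs)            # eager FP32 baseline
    int8_latency = measure_latency(optimized_model, example_inputs)  # quantized + Inductor
    print(f"FP32 eager: {fp32_latency * 1e3:.1f} ms/iter, INT8 Inductor: {int8_latency * 1e3:.1f} ms/iter")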
diff --git a/prototype_source/pt2e_quant_xpu_inductor.rst b/prototype_source/pt2e_quant_xpu_inductor.rst deleted file mode 100644 index 38be4cc73cc..00000000000 --- a/prototype_source/pt2e_quant_xpu_inductor.rst +++ /dev/null @@ -1,239 +0,0 @@ -PyTorch 2 Export Quantization with Intel GPU Backend through Inductor -================================================================== - -**Author**: `Yan Zhiwei `_, `Wang Eikan `_, `Zhang Liangang `_, `Liu River `_, `Cui Yifeng `_ - -Prerequisites ---------------- - -- `PyTorch 2 Export Post Training Quantization `_ -- `TorchInductor and torch.compile concepts in PyTorch `_ -- PyTorch 2.7 or later - -Introduction --------------- - -This tutorial introduces ``XPUInductorQuantizer``, which aims to serve quantized models for inference on Intel GPUs. -``XPUInductorQuantizer`` uses the PyTorch Export Quantization flow and lowers the quantized model into the inductor. - -The Pytorch 2 Export Quantization flow uses `torch.export` to capture the model into a graph and perform quantization transformations on top of the ATen graph. -This approach is expected to have significantly higher model coverage with better programmability and a simplified user experience. -TorchInductor is a compiler backend that transforms FX Graphs generated by ``TorchDynamo`` into optimized C++/Triton kernels. - -The quantization flow has three steps: - -- Step 1: Capture the FX Graph from the eager model based on the `torch export mechanism `_. -- Step 2: Apply the quantization flow based on the captured FX Graph, including defining the backend-specific quantizer, generating the prepared model with observers, - performing the prepared model's calibration, and converting the prepared model into the quantized model. -- Step 3: Lower the quantized model into inductor with the API ``torch.compile``, which would call Triton kernels or oneDNN GEMM/Convolution kernels. - - -The high-level architecture of this flow could look like this: - -.. image:: ../_static/img/pt2e_quant_xpu_inductor.png - :align: center - -Post Training Quantization ----------------------------- - -Static quantization is the only method we currently support. - -The following dependencies are recommended to be installed through the Intel GPU channel: - -:: - - pip3 install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu - - -Please note that since the inductor ``freeze`` feature does not turn on by default yet, you must run your example code with ``TORCHINDUCTOR_FREEZING=1``. - -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python xpu_inductor_quantizer_example.py - - -1. Capture FX Graph -^^^^^^^^^^^^^^^^^^^^^ - -We will start by performing the necessary imports, capturing the FX Graph from the eager module. 
- -:: - - import torch - import torchvision.models as models - from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e - import torch.ao.quantization.quantizer.xpu_inductor_quantizer as xpuiq - from torch.ao.quantization.quantizer.xpu_inductor_quantizer import XPUInductorQuantizer - from torch.export import export_for_training - - # Create the Eager Model - model_name = "resnet18" - model = models.__dict__[model_name](weights=models.ResNet18_Weights.DEFAULT) - - # Set the model to eval mode - model = model.eval().to("xpu") - - # Create the data, using the dummy data here as an example - traced_bs = 50 - x = torch.randn(traced_bs, 3, 224, 224, device="xpu").contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Capture the FX Graph to be quantized - with torch.no_grad(): - exported_model = export_for_training( - model, - example_inputs, - strict=True - ).module() - - -Next, we will quantize the FX Module. - -2. Apply Quantization -^^^^^^^^^^^^^^^^^^^^^^^ - -After we capture the FX Module, we will import the Backend Quantizer for Intel GPU and configure it to -quantize the model. - -:: - - quantizer = XPUInductorQuantizer() - quantizer.set_global(xpuiq.get_default_xpu_inductor_quantization_config()) - -The default quantization configuration in ``XPUInductorQuantizer`` uses signed 8-bits for both activations and weights. The tensors are per-tensor quantized, whereas the weights are signed 8-bit per-channel quantized. - -Optionally, in addition to the default quantization configuration using asymmetric quantized activation, signed 8-bits symmetric quantized activation is also supported, which has the potential to provide better performance. - -:: - - from torch.ao.quantization.observer import HistogramObserver, PerChannelMinMaxObserver - from torch.ao.quantization.quantizer.quantizer import QuantizationSpec - from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig - from typing import Any, Optional, TYPE_CHECKING - if TYPE_CHECKING: - from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor - def get_xpu_inductor_symm_quantization_config(): - extra_args: dict[str, Any] = {"eps": 2**-12} - act_observer_or_fake_quant_ctr = HistogramObserver - act_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=-128, - quant_max=127, - qscheme=torch.per_tensor_symmetric, # Change the activation quant config to symmetric - is_dynamic=False, - observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args( - **extra_args - ), - ) - - weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( - PerChannelMinMaxObserver - ) - - weight_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=-128, - quant_max=127, - qscheme=torch.per_channel_symmetric, # Same as the default config, the only supported option for weight - ch_axis=0, # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv - is_dynamic=False, - observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( - **extra_args - ), - ) - - bias_quantization_spec = None # will use placeholder observer by default - quantization_config = QuantizationConfig( - act_quantization_spec, - act_quantization_spec, - weight_quantization_spec, - bias_quantization_spec, - False, - ) - return quantization_config - - # Then, set the quantization configuration to the quantizer. 
- quantizer = XPUInductorQuantizer() - quantizer.set_global(get_xpu_inductor_symm_quantization_config()) - -After the backend-specific quantizer is imported, prepare the model for post-training quantization. -``prepare_pt2e`` folds ``BatchNorm`` operators into preceding Conv2d operators, and inserts observers into appropriate places in the model. - -:: - - prepared_model = prepare_pt2e(exported_model, quantizer) - -**(For static quantization only)** Calibrate the ``prepared_model`` after the observers are inserted into the model. - -:: - - # We use the dummy data as an example here - prepared_model(*example_inputs) - - # Alternatively: user can define the dataset to calibrate - # def calibrate(model, data_loader): - # model.eval() - # with torch.no_grad(): - # for image, target in data_loader: - # model(image) - # calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Finally, convert the calibrated model to a quantized model. ``convert_pt2e`` takes a calibrated model and produces a quantized model. - -:: - - converted_model = convert_pt2e(prepared_model) - -After these steps, the quantization flow has been completed and the quantized model is available. - - -3. Lower into Inductor -^^^^^^^^^^^^^^^^^^^^^^^^ - -The quantized model will then be lowered into the inductor backend. - -:: - - with torch.no_grad(): - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - -In a more advanced scenario, int8-mixed-bf16 quantization comes into play. In this instance, -a convolution or GEMM operator produces the output in BFloat16 instead of Float32 in the absence -of a subsequent quantization node. Subsequently, the BFloat16 tensor seamlessly propagates through -subsequent pointwise operators, effectively minimizing memory usage and potentially enhancing performance. -The utilization of this feature mirrors that of regular BFloat16 Autocast, as simple as wrapping the -script within the BFloat16 Autocast context. - -:: - - with torch.amp.autocast(device_type="xpu", dtype=torch.bfloat16), torch.no_grad(): - # Turn on Autocast to use int8-mixed-bf16 quantization. After lowering into indcutor backend, - # For operators such as QConvolution and QLinear: - # * The input data type is consistently defined as int8, attributable to the presence of a pair - # of quantization and dequantization nodes inserted at the input. - # * The computation precision remains at int8. - # * The output data type may vary, being either int8 or BFloat16, contingent on the presence - # of a pair of quantization and dequantization nodes at the output. - # For non-quantizable pointwise operators, the data type will be inherited from the previous node, - # potentially resulting in a data type of BFloat16 in this scenario. - # For quantizable pointwise operators such as QMaxpool2D, it continues to operate with the int8 - # data type for both input and output. - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - - -Conclusion ------------- - -In this tutorial, we have learned how to utilize the ``XPUInductorQuantizer`` to perform post-training quantization on models for inference -on Intel GPUs, leveraging PyTorch 2's Export Quantization flow. We covered the step-by-step process of capturing an FX Graph, -applying quantization, and lowering the quantized model into the inductor backend using ``torch.compile``. 
Additionally, we explored -the benefits of using int8-mixed-bf16 quantization for improved memory efficiency and potential performance gains, -especially when using ``BFloat16`` autocast. diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst deleted file mode 100644 index be6d6949edd..00000000000 --- a/prototype_source/pt2e_quantizer.rst +++ /dev/null @@ -1,381 +0,0 @@ -How to Write a ``Quantizer`` for PyTorch 2 Export Quantization -================================================================ - -**Author**: `Leslie Fang `_, `Weiwen Xia `__, `Jiong Gong `__, `Kimish Patel `__, `Jerry Zhang `__ - -Prerequisites: -^^^^^^^^^^^^^^^^ - -Required: - -- `Torchdynamo concepts in PyTorch `__ - -- `Quantization concepts in PyTorch `__ - -- `(prototype) PyTorch 2 Export Post Training Quantization `__ - -Optional: - -- `FX Graph Mode post training static quantization `__ - -- `BackendConfig in PyTorch Quantization FX Graph Mode `__ - -- `QConfig and QConfigMapping in PyTorch Quantization FX Graph Mode `__ - -Introduction -^^^^^^^^^^^^^ - -`(prototype) PyTorch 2 Export Post Training Quantization `__ introduced the overall API for pytorch 2 export quantization, main difference from fx graph mode quantization in terms of API is that we made it explicit that quantiation is targeting a specific backend. So to use the new flow, backend need to implement a ``Quantizer`` class that encodes: -(1). What is supported quantized operator or patterns in the backend -(2). How can users express the way they want their floating point model to be quantized, for example, quantized the whole model to be int8 symmetric quantization, or quantize only linear layers etc. - -Please see `here `__ For motivations for the new API and ``Quantizer``. - -An existing quantizer object defined for ``XNNPACK`` is in -`QNNPackQuantizer `__ - -Annotation API -^^^^^^^^^^^^^^^^^^^ - -``Quantizer`` uses annotation API to convey quantization intent for different operators/patterns. -Annotation API mainly consists of -`QuantizationSpec `__ -and -`QuantizationAnnotation `__. - -``QuantizationSpec`` is used to convey intent of how a tensor will be quantized, -e.g. dtype, bitwidth, min, max values, symmetric vs. asymmetric etc. -Furthermore, ``QuantizationSpec`` also allows quantizer to specify how a -tensor value should be observed, e.g. ``MinMaxObserver``, or ``HistogramObserver`` -, or some customized observer. - -``QuantizationAnnotation`` composed of ``QuantizationSpec`` objects is used to annotate input tensors -and output tensor of a pattern. Annotating input tensors is equivalent of annotating input edges, -while annotating output tensor is equivalent of annotating node. ``QuantizationAnnotation`` is a ``dataclass`` -with several fields: - -- ``input_qspec_map`` field is of class ``Dict`` to map each input tensor (as input edge) to a ``QuantizationSpec``. -- ``output_qspec`` field expresses the ``QuantizationSpec`` used to annotate the output tensor; -- ``_annotated`` field indicates if this node has already been annotated by quantizer. - -To conclude, annotation API requires quantizer to annotate edges (input tensors) or -nodes (output tensor) of the graph. Now, we will have a step-by-step tutorial for -how to use the annotation API with different types of ``QuantizationSpec``. - -1. Annotate Common Operator Patterns --------------------------------------------------------- - -In order to use the quantized pattern/operators, e.g. 
``quantized add``, -backend developers will have intent to quantize (as expressed by ``QuantizationSpec``) -inputs, output of the pattern. Following is an example flow (take ``add`` operator as example) -of how this intent is conveyed in the quantization workflow with annotation API. - -- Step 1: Identify the original floating point pattern in the FX graph. There are - several ways to identify this pattern: Quantizer may use a pattern matcher - to match the operator pattern; Quantizer may go through the nodes from start to the end and compare - the node's target type to match the operator pattern. In this example, we can use the - `get_source_partitions `__ - to match this pattern. The original floating point ``add`` pattern only contain a single ``add`` node. - -:: - - add_partitions = get_source_partitions(gm.graph, [operator.add, torch.add]) - add_partitions = list(itertools.chain(*add_partitions.values())) - for add_partition in add_partitions: - add_node = add_partition.output_nodes[0] - -- Step 2: Define the ``QuantizationSpec`` for inputs and output of the pattern. ``QuantizationSpec`` - defines the ``data type``, ``qscheme``, and other quantization parameters about users' intent of - how to observe or fake quantize a tensor. - -:: - - act_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=-128, - quant_max=127, - qscheme=torch.per_tensor_affine, - is_dynamic=False, - observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), - ) - - input_act_qspec = act_quantization_spec - output_act_qspec = act_quantization_spec - -- Step 3: Annotate the inputs and output of the pattern with ``QuantizationAnnotation``. - In this example, we will create the ``QuantizationAnnotation`` object with the ``QuantizationSpec`` - created in above step 2 for two inputs and one output of the ``add`` node. - -:: - - input_qspec_map = {} - input_act0 = add_node.args[0] - input_qspec_map[input_act0] = input_act_qspec - - input_act1 = add_node.args[1] - input_qspec_map[input_act1] = input_act_qspec - - add_node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=output_act_qspec, - _annotated=True, - ) - -After we annotate the ``add`` node like this, in the following up quantization flow, ``HistogramObserver`` will -be inserted at its two input nodes and one output node in prepare phase. And ``HistogramObserver`` will be substituted with -``quantize`` node and ``dequantize`` node in the convert phase. - -2. Annotate Operators that Shares Quantization Params --------------------------------------------------------- - -It is natural that users want to annotate a quantized model where quantization -parameters can be shared among some tensors explicitly. Two typical use cases are: - -- Example 1: One example is for ``add`` where having both inputs sharing quantization - parameters makes operator implementation much easier. Without using of - `SharedQuantizationSpec `__, - we must annotate ``add`` as example in above section 1, in which two inputs of ``add`` - has different quantization parameters. -- Example 2: Another example is that of sharing quantization parameters between inputs and output. - This typically results from operators such as ``maxpool``, ``average_pool``, ``concat`` etc. - -``SharedQuantizationSpec`` is designed for this use case to annotate tensors whose quantization -parameters are shared with other tensors. 
Input of ``SharedQuantizationSpec`` is an ``EdgeOrNode`` object which -can be an input edge or an output value. - -.. note:: - - * Sharing is transitive - - Some tensors might be effectively using shared quantization spec due to: - - * Two nodes/edges are configured to use ``SharedQuantizationSpec``. - * There is existing sharing of some nodes. - - For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` - node: ``cat([conv1_out, conv2_out], ...)``. Let's say the output of ``conv1``, ``conv2``, and the first input of ``cat`` are configured - with the same configurations of ``QuantizationSpec``. The second input of ``cat`` is configured to use ``SharedQuantizationSpec`` - with the first input. - - .. code-block:: - - conv1_out: qspec1(dtype=torch.int8, ...) - conv2_out: qspec1(dtype=torch.int8, ...) - cat_input0: qspec1(dtype=torch.int8, ...) - cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - - First of all, the output of ``conv1`` is implicitly sharing quantization parameters (and observer object) - with the first input of ``cat``, and the same is true for the output of ``conv2`` and the second input of ``cat``. - Therefore, since the user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, - ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you - will see the following: - - .. code-block:: - - conv1 -> obs -> cat - conv2 -> obs / - - and both ``obs`` will be the same observer instance. - - -- Input edge is the connection between input node and the node consuming the input, - so it's a ``Tuple[Node, Node]``. -- Output value is an FX ``Node``. - -Now, if we want to rewrite ``add`` annotation example with ``SharedQuantizationSpec`` to indicate -two input tensors as sharing quantization parameters. We can define its ``QuantizationAnnotation`` -as this: - -- Step 1: Identify the original floating point pattern in the FX graph. We can use the same - methods introduced in ``QuantizationSpec`` example to identify the ``add`` pattern. -- Step 2: Annotate input_act0 of ``add`` with ``QuantizationSpec``. -- Step 3: Create a ``SharedQuantizationSpec`` object with input edge defined as ``(input_act0, add_node)`` which means to - share the observer used for this edge. Then, user can annotate input_act1 with this ``SharedQuantizationSpec`` - object. - -:: - - input_qspec_map = {} - share_qparams_with_input_act0_qspec = SharedQuantizationSpec((input_act0, add_node)) - input_qspec_map = {input_act0: act_quantization_spec, input_act1: share_qparams_with_input_act0_qspec} - - add_node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=act_quantization_spec, - _annotated=True, - ) - -3. Annotate Operators with Fixed Quantization Parameters ---------------------------------------------------------- - -Another typical use case to annotate a quantized model is for tensors whose -quantization parameters are known beforehand. For example, operator like ``sigmoid``, which has -predefined and fixed scale/zero_point at input and output tensors. -`FixedQParamsQuantizationSpec `__ -is designed for this use case. To use ``FixedQParamsQuantizationSpec``, users need to pass in parameters -of ``scale`` and ``zero_point`` explicitly. - -- Step 1: Identify the original floating point pattern in the FX graph. 
We can use the same - methods introduced in ``QuantizationSpec`` example to identify the ``sigmoid`` pattern. -- Step 2: Create ``FixedQParamsQuantizationSpec`` object with inputs of fixed ``scale``, ``zero_point`` value. - These values will be used to create the ``quantize`` node and ``dequantize`` node in the convert phase. -- Step 3: Annotate inputs and output to use this ``FixedQParamsQuantizationSpec`` object. - -:: - - act_qspec = FixedQParamsQuantizationSpec( - dtype=torch.uint8, - quant_min=0, - quant_max=255, - qscheme=torch.per_tensor_affine, - scale=1.0 / 256.0, - zero_point=0, - ) - sigmoid_node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map={input_act: act_qspec}, - output_qspec=act_qspec, - _annotated=True, - ) - -4. Annotate Tensors with Derived Quantization Parameters ---------------------------------------------------------------- - -Another use case is to define the constraint for tensors whose quantization parameters are derived from other tensors. -For example, if we want to annotate a convolution node, and define the ``scale`` of its bias input tensor -as product of the activation tensor's ``scale`` and weight tensor's ``scale``. We can use -`DerivedQuantizationSpec `__ -to annotate this conv node. - -- Step 1: Identify the original floating point pattern in the FX graph. We can use the same - methods introduced in ``QuantizationSpec`` example to identify the ``convolution`` pattern. -- Step 2: Define ``derive_qparams_fn`` function, it accepts list of ``ObserverOrFakeQuantize`` ( - `ObserverBase `__ - or `FakeQuantizeBase `__) - as input. From each ``ObserverOrFakeQuantize`` object, user can get the ``scale``, ``zero point`` value. - User can define its heuristic about how to derive new ``scale``, ``zero point`` value based on the - quantization parameters calculated from the observer or fake quant instances. -- Step 3: Define ``DerivedQuantizationSpec`` obejct, it accepts inputs of: list of ``EdgeOrNode`` objects. - The observer corresponding to each ``EdgeOrNode`` object will be passed into the ``derive_qparams_fn`` function; - ``derive_qparams_fn`` function; several other quantization parameters such as ``dtype``, ``qscheme``. -- Step 4: Annotate the inputs and output of this conv node with ``QuantizationAnnotation``. - -:: - - def derive_qparams_fn(obs_or_fqs: List[ObserverOrFakeQuantize]) -> Tuple[Tensor, Tensor]: - assert len(obs_or_fqs) == 2, \ - "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(len(obs_or_fq)) - act_obs_or_fq = obs_or_fqs[0] - weight_obs_or_fq = obs_or_fqs[1] - act_scale, act_zp = act_obs_or_fq.calculate_qparams() - weight_scale, weight_zp = weight_obs_or_fq.calculate_qparams() - return torch.tensor([act_scale * weight_scale]).to(torch.float32), torch.tensor([0]).to(torch.int32) - - bias_qspec = DerivedQuantizationSpec( - derived_from=[(input_act, node), (weight, node)], - derive_qparams_fn=derive_qparams_fn, - dtype=torch.int32, - quant_min=-2**31, - quant_max=2**31 - 1, - qscheme=torch.per_tensor_symmetric, - ) - input_qspec_map = {input_act: act_quantization_spec, weight: weight_quantization_spec, bias: bias_qspec} - node.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=act_quantization_spec, - _annotated=True, - ) - -5. 
A Toy Example with Resnet18 --------------------------------------------------------- - -After above annotation methods defined with ``QuantizationAnnotation API``, we can now put them together to construct a ``BackendQuantizer`` -and run a `toy example `__ -with ``Torchvision Resnet18``. To better understand the final example, here are the classes and utility -functions that are used in the example: - -- `QuantizationConfig `__ - consists of ``QuantizationSpec`` for activation, weight, and bias separately. -- When annotating the model, - `get_input_act_qspec `__, - `get_output_act_qspec `__, - `get_weight_qspec `__, and - `get_bias_qspec `__ - can be used to get the ``QuantizationSpec`` from ``QuantizationConfig`` for a specific pattern. - -A Note on IR for PT2E Quantization Flow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -IR means the intermediate representation of the model, for example, ``torch`` IR (``torch.nn`` modules, ``torch.nn.functional`` ops) or ``aten`` IR (``torch.ops.aten.linear``, ...). PT2E Quantization Flow is using pre autograd aten IR (the output of `torch.export` API) so that we support training. As is shown before, we need to match the operator or operator patterns before we can attach annotations on them, So the question is how do we match the pattern? - -Motivation: Problem of Matching ``aten`` IR directly --------------------------------------------------------- - -The most straightforward way might be matching ``aten`` IR directly. - -Example:: - - for n in gm.graph.nodes: - if n.op != "call_function" or n.target not in [ - torch.ops.aten.relu.default, - torch.ops.aten.relu_.default, - ]: - continue - relu_node = n - maybe_conv_node = n.args[0] - if ( - not isinstance(maybe_conv_node, Node) - or maybe_conv_node.op != "call_function" - or maybe_conv_node.target - not in [ - torch.ops.aten.conv1d.default, - torch.ops.aten.conv2d.default, - ] - ): - continue - - # annotate conv and relu nodes - ... - -However one problem for using this IR is that the representation might change if the PyTorch implementation for modules or functional ops changed. But this could be unexpected since modeling users typically assume that when the eager mode model code doesn't change, they should get the same model representation after program capture as well. One concrete effect for this problem is that if a ``Quantizer`` do annotations based on recognizing ``aten`` IR patterns, then it may fail to recognzing the pattern after PyTorch version update, and the same eager mode floating point may be left unquantized. - -Recommendation: Use ``SubgraphMatcherWithNameNodeMap`` for pattern matching ------------------------------------------------------------------------------ -Because of this, we recommend people to recognize the pattern through ``SubgraphMatcherWithNameNodeMap`` (an improved version of ``SubgraphMatcher`` that makes it easier to query the nodes that people want to annotate), through capturing a ``torch`` IR pattern (with the same program capture used for capturing the floating point model), instead of using the ``aten`` IR pattern directly. 
- -Example:: - - def conv_relu_pattern(input, weight, bias): - conv = torch.nn.functional.conv2d(input, weight, bias) - output = torch.nn.functional.relu(conv) - # returns an additional dict that includes a map from name to node that we want to annotate - return relu, {"input": input, "weight": weight, "bias": bias, "output": output} - - matcher = SubgraphMatcherWithNameNodeMap(conv_relu_pattern) - matches = matcher.match(model) - for match in matches: - # find input and output of the pattern - # annotate the nodes - name_node_map = match.name_node_map - input_node = name_node_map["input"] - weight_node = name_node_map["weight"] - bias_node = name_node_map["bias"] - output_node = name_node_map["relu"] - input_node.users[0].meta["quantization_annotation"] = ... - weight_node.users[0].meta["quantization_annotation"] = ... - bias_node.users[0].meta["quantization_annotation"] = ... - output_node.meta["quantization_annotation"] = ... - -With this, the ``Quantizer`` will still be valid even when the implementation for nn modules and functionals changes, the ``aten`` IR for floating point model will change, but since we capture the pattern again instead of hardcoding the ``aten`` IR for the pattern, we'll get the updated ``aten`` IR as well and will still be able to match the pattern. - -One caveat is that if inputs of the pattern has multiple users, we don't have a good way to identify which user node we want to annotate except for checking the aten op target. - -Another caveat is that we need to make sure we have an exhaustive list of examples (e.g. 2D, 3D, 4D inputs, real v.s. symbolic inputs, training=True v.s. training=False etc.) for the pattern to make sure cover different possible ``aten`` IR outcomes captured from the ``torch`` IR pattern. - -Note: We may provide some (pattern, list of example_inputs) or some pre-generated matcher object so people can just use them directly in the future. - -Conclusion -^^^^^^^^^^^^^^^^^^^ - -With this tutorial, we introduce the new quantization path in PyTorch 2. Users can learn about -how to define a ``BackendQuantizer`` with the ``QuantizationAnnotation API`` and integrate it into the PyTorch 2 Export Quantization flow. -Examples of ``QuantizationSpec``, ``SharedQuantizationSpec``, ``FixedQParamsQuantizationSpec``, and ``DerivedQuantizationSpec`` -are given for specific annotation use case. You can use `XNNPACKQuantizer `_ as an example to start implementing your own ``Quantizer``. After that please follow `this tutorial `_ to actually quantize your model. diff --git a/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst deleted file mode 100644 index 43fd190e995..00000000000 --- a/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst +++ /dev/null @@ -1,10 +0,0 @@ -Quantization in PyTorch 2.0 Export Tutorial -=========================================== - -This tutorial has been moved. - -Redirecting in 3 seconds... - -.. raw:: html - - diff --git a/recipes_source/fuse.rst b/recipes_source/fuse.rst deleted file mode 100644 index c6c69762962..00000000000 --- a/recipes_source/fuse.rst +++ /dev/null @@ -1,157 +0,0 @@ -Fuse Modules Recipe -===================================== - -This recipe demonstrates how to fuse a list of PyTorch modules into a single module and how to do the performance test to compare the fused model with its non-fused version. 
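At its core, fusion is a single call to ``torch.quantization.fuse_modules``. As a minimal sketch (assuming a hypothetical ``model`` whose submodules are named ``conv``, ``bn``, and ``relu``; the sections below walk through a complete, runnable example and benchmark it on Android):

::

    import torch

    model.eval()  # fuse for inference
    # Replace the conv -> bn -> relu sequence with a single fused module.
    # The first name in each list is replaced by the fused module and the
    # remaining names become Identity placeholders.
    fused_model = torch.quantization.fuse_modules(
        model, [['conv', 'bn', 'relu']], inplace=False)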
- -Introduction ------------- - -Before quantization is applied to a model to reduce its size and memory footprint (see `Quantization Recipe `_ for details on quantization), the list of modules in the model may be fused first into a single module. Fusion is optional, but it may save on memory access, make the model run faster, and improve its accuracy. - - -Pre-requisites --------------- - -PyTorch 1.6.0 or 1.7.0 - -Steps --------------- - -Follow the steps below to fuse an example model, quantize it, script it, optimize it for mobile, save it and test it with the Android benchmark tool. - -1. Define the Example Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use the same example model defined in the `PyTorch Mobile Performance Recipes `_: - -:: - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - - class AnnotatedConvBnReLUModel(torch.nn.Module): - def __init__(self): - super(AnnotatedConvBnReLUModel, self).__init__() - self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float) - self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float) - self.relu = torch.nn.ReLU(inplace=True) - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - x = self.quant(x) - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - x = self.dequant(x) - return x - - -2. Generate Two Models with and without `fuse_modules` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Add the following code below the model definition above and run the script: - -:: - - model = AnnotatedConvBnReLUModel() - print(model) - - def prepare_save(model, fused): - model.qconfig = torch.quantization.get_default_qconfig('qnnpack') - torch.quantization.prepare(model, inplace=True) - torch.quantization.convert(model, inplace=True) - torchscript_model = torch.jit.script(model) - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "model.pt" if not fused else "model_fused.pt") - - prepare_save(model, False) - - model = AnnotatedConvBnReLUModel() - model_fused = torch.quantization.fuse_modules(model, [['bn', 'relu']], inplace=False) - print(model_fused) - - prepare_save(model_fused, True) - - -The graphs of the original model and its fused version will be printed as follows: - -:: - - AnnotatedConvBnReLUModel( - (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) - (bn): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (quant): QuantStub() - (dequant): DeQuantStub() - ) - - AnnotatedConvBnReLUModel( - (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) - (bn): BNReLU2d( - (0): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (1): ReLU(inplace=True) - ) - (relu): Identity() - (quant): QuantStub() - (dequant): DeQuantStub() - ) - -In the second fused model output, the first item `bn` in the list is replaced with the fused module, and the rest of the modules (`relu` in this example) is replaced with identity. In addition, the non-fused and fused versions of the model `model.pt` and `model_fused.pt` are generated. - -3. 
Build the Android benchmark Tool -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Get the PyTorch source and build the Android benchmark tool as follows: - -:: - - git clone --recursive https://github.com/pytorch/pytorch - cd pytorch - git submodule update --init --recursive - BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DBUILD_BINARY=ON - - -This will generate the Android benchmark binary `speed_benchmark_torch` in the `build_android/bin` folder. - -4. Test Compare the Fused and Non-Fused Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Connect your Android device, then copy `speed_benchmark_torch` and the model files and run the benchmark tool on them: - -:: - - adb push build_android/bin/speed_benchmark_torch /data/local/tmp - adb push model.pt /data/local/tmp - adb push model_fused.pt /data/local/tmp - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model_fused.pt" --input_dims="1,3,224,224" --input_type="float" - - -The results from the last two commands should be like: - -:: - - Main run finished. Microseconds per iter: 6189.07. Iters per second: 161.575 - -and - -:: - - Main run finished. Microseconds per iter: 6216.65. Iters per second: 160.858 - -For this example model, there is no much performance difference between the fused and non-fused models. But the similar steps can be used to fuse and prepare a real deep model and test to see the performance improvement. Keep in mind that currently `torch.quantization.fuse_modules` only fuses the following sequence of modules: - -* conv, bn -* conv, bn, relu -* conv, relu -* linear, relu -* bn, relu - -If any other sequence list is provided to the `fuse_modules` call, it will simply be ignored. - -Learn More ---------------- - -See `here `_ for the official documentation of `torch.quantization.fuse_modules`. diff --git a/recipes_source/quantization.rst b/recipes_source/quantization.rst deleted file mode 100644 index ac9cd48fe8c..00000000000 --- a/recipes_source/quantization.rst +++ /dev/null @@ -1,135 +0,0 @@ -Quantization Recipe -===================================== - -This recipe demonstrates how to quantize a PyTorch model so it can run with reduced size and faster inference speed with about the same accuracy as the original model. Quantization can be applied to both server and mobile model deployment, but it can be especially important or even critical on mobile, because a non-quantized model's size may exceed the limit that an iOS or Android app allows for, cause the deployment or OTA update to take too much time, and make the inference too slow for a good user experience. - -Introduction ------------- - -Quantization is a technique that converts 32-bit floating numbers in the model parameters to 8-bit integers. With quantization, the model size and memory footprint can be reduced to 1/4 of its original size, and the inference can be made about 2-4 times faster, while the accuracy stays about the same. - -There are overall three approaches or workflows to quantize a model: post training dynamic quantization, post training static quantization, and quantization aware training. But if the model you want to use already has a quantized version, you can use it directly without going through any of the three workflows above. 
For example, the `torchvision` library already includes quantized versions for models MobileNet v2, ResNet 18, ResNet 50, Inception v3, GoogleNet, among others. So we will make the last approach another workflow, albeit a simple one. - -.. note:: - The quantization support is available for a limited set of operators. See `this `_ for more information. - -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Workflows ------------- - -Use one of the four workflows below to quantize a model. - -1. Use Pretrained Quantized MobileNet v2 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - - -To compare the size difference of a non-quantized MobileNet v2 model with its quantized version: - -:: - - model = torchvision.models.mobilenet_v2(pretrained=True) - - import os - import torch - - def print_model_size(mdl): - torch.save(mdl.state_dict(), "tmp.pt") - print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6)) - os.remove('tmp.pt') - - print_model_size(model) - print_model_size(model_quantized) - - -The outputs will be: - -:: - - 14.27 MB - 3.63 MB - -2. Post Training Dynamic Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To apply Dynamic Quantization, which converts all the weights in a model from 32-bit floating numbers to 8-bit integers but doesn't convert the activations to int8 till just before performing the computation on the activations, simply call `torch.quantization.quantize_dynamic`: - -:: - - model_dynamic_quantized = torch.quantization.quantize_dynamic( - model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8 - ) - -where `qconfig_spec` specifies the list of submodule names in `model` to apply quantization to. - -.. warning:: An important limitation of Dynamic Quantization, while it is the easiest workflow if you do not have a pre-trained quantized model ready for use, is that it currently only supports `nn.Linear` and `nn.LSTM` in `qconfig_spec`, meaning that you will have to use Static Quantization or Quantization Aware Training, to be discussed later, to quantize other modules such as `nn.Conv2d`. - -The full documentation of the `quantize_dynamic` API call is `here `_. Three other examples of using the post training dynamic quantization are `the Bert example `_, `an LSTM model example `_, and another `demo LSTM example `_. - -3. Post Training Static Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This method converts both the weights and the activations to 8-bit integers beforehand so there won’t be on-the-fly conversion on the activations during the inference, as the dynamic quantization does. While post-training static quantization can significantly enhance inference speed and reduce model size, this method may degrade the original model's accuracy more compared to post training dynamic quantization. - -To apply static quantization on a model, run the following code: - -:: - - backend = "qnnpack" - model.qconfig = torch.quantization.get_default_qconfig(backend) - torch.backends.quantized.engine = backend - model_static_quantized = torch.quantization.prepare(model, inplace=False) - model_static_quantized = torch.quantization.convert(model_static_quantized, inplace=False) - -After this, running `print_model_size(model_static_quantized)` shows the static quantized model is `3.98MB`. 
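Note that the snippet above omits calibration for brevity: in eager-mode post training static quantization, the observers inserted by `prepare` should see a few batches of representative data before `convert` is called; otherwise the computed scales and zero points can be poor and accuracy usually suffers. A minimal sketch of the full flow, assuming a hypothetical `calibration_data_loader` that yields batches shaped like the model's real inputs:

::

    model.eval()
    # model.qconfig is assumed to be set as shown above
    model_static_quantized = torch.quantization.prepare(model, inplace=False)
    # calibration: run representative data through the observed model so the
    # observers can record activation ranges
    with torch.no_grad():
        for inputs in calibration_data_loader:
            model_static_quantized(inputs)
    model_static_quantized = torch.quantization.convert(model_static_quantized, inplace=False)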
- -A complete model definition and static quantization example is `here `_. A dedicated static quantization tutorial is `here `_. - -.. note:: - To make the model run on mobile devices which normally have arm architecture, you need to use `qnnpack` for `backend`; to run the model on computer with x86 architecture, use `x86`` (the old `fbgemm` is still available but 'x86' is the recommended default). - -4. Quantization Aware Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Quantization aware training inserts fake quantization to all the weights and activations during the model training process and results in higher inference accuracy than the post-training quantization methods. It is typically used in CNN models. - -To enable a model for quantization aware traing, define in the `__init__` method of the model definition a `QuantStub` and a `DeQuantStub` to convert tensors from floating point to quantized type and vice versa: - -:: - - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - -Then in the beginning and the end of the `forward` method of the model definition, call `x = self.quant(x)` and `x = self.dequant(x)`. - -To do a quantization aware training, use the following code snippet: - -:: - - model.qconfig = torch.quantization.get_default_qat_qconfig(backend) - model_qat = torch.quantization.prepare_qat(model, inplace=False) - # quantization aware training goes here - model_qat = torch.quantization.convert(model_qat.eval(), inplace=False) - -For more detailed examples of the quantization aware training, see `here `_ and `here `_. - -A pre-trained quantized model can also be used for quantized aware transfer learning, using the same `quant` and `dequant` calls shown above. See `here `_ for a complete example. - -After a quantized model is generated using one of the steps above, before the model can be used to run on mobile devices, it needs to be further converted to the `TorchScript` format and then optimized for mobile apps. See the `Script and Optimize for Mobile recipe `_ for details. - -Learn More ------------------ - -For more info on the different workflows of quantization, see `here `_ and `here `_. diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py deleted file mode 100644 index e69d7bfd02e..00000000000 --- a/recipes_source/recipes/dynamic_quantization.py +++ /dev/null @@ -1,294 +0,0 @@ -""" -Dynamic Quantization -==================== - -In this recipe you will see how to take advantage of Dynamic -Quantization to accelerate inference on an LSTM-style recurrent neural -network. This reduces the size of the model weights and speeds up model -execution. - -Introduction -------------- - -There are a number of trade-offs that can be made when designing neural -networks. During model development and training you can alter the -number of layers and number of parameters in a recurrent neural network -and trade-off accuracy against model size and/or model latency or -throughput. Such changes can take lot of time and compute resources -because you are iterating over the model training. Quantization gives -you a way to make a similar trade off between performance and model -accuracy with a known model after training is completed. - -You can give it a try in a single session and you will certainly reduce -your model size significantly and may get a significant latency -reduction without losing a lot of accuracy. - -What is dynamic quantization? 
------------------------------ - -Quantizing a network means converting it to use a reduced precision -integer representation for the weights and/or activations. This saves on -model size and allows the use of higher throughput math operations on -your CPU or GPU. - -When converting from floating point to integer values you are -essentially multiplying the floating point value by some scale factor -and rounding the result to a whole number. The various quantization -approaches differ in the way they approach determining that scale -factor. - -The key idea with dynamic quantization as described here is that we are -going to determine the scale factor for activations dynamically based on -the data range observed at runtime. This ensures that the scale factor -is "tuned" so that as much signal as possible about each observed -dataset is preserved. - -The model parameters on the other hand are known during model conversion -and they are converted ahead of time and stored in INT8 form. - -Arithmetic in the quantized model is done using vectorized INT8 -instructions. Accumulation is typically done with INT16 or INT32 to -avoid overflow. This higher precision value is scaled back to INT8 if -the next layer is quantized or converted to FP32 for output. - -Dynamic quantization is relatively free of tuning parameters which makes -it well suited to be added into production pipelines as a standard part -of converting LSTM models to deployment. - - - -.. note:: - Limitations on the approach taken here - - - This recipe provides a quick introduction to the dynamic quantization - features in PyTorch and the workflow for using it. Our focus is on - explaining the specific functions used to convert the model. We will - make a number of significant simplifications in the interest of brevity - and clarity - - -1. You will start with a minimal LSTM network -2. You are simply going to initialize the network with a random hidden - state -3. You are going to test the network with random inputs -4. You are not going to train the network in this tutorial -5. You will see that the quantized form of this network is smaller and - runs faster than the floating point network we started with -6. You will see that the output values are generally in the same - ballpark as the output of the FP32 network, but we are not - demonstrating here the expected accuracy loss on a real trained - network - -You will see how dynamic quantization is done and be able to see -suggestive reductions in memory use and latency times. Providing a -demonstration that the technique can preserve high levels of model -accuracy on a trained LSTM is left to a more advanced tutorial. If you -want to move right away to that more rigorous treatment please proceed -to the `advanced dynamic quantization -tutorial `__. - -Steps -------------- - -This recipe has 5 steps. - -1. Set Up - Here you define a very simple LSTM, import modules, and establish - some random input tensors. - -2. Do the Quantization - Here you instantiate a floating point model and then create quantized - version of it. - -3. Look at Model Size - Here you show that the model size gets smaller. - -4. Look at Latency - Here you run the two models and compare model runtime (latency). - -5. Look at Accuracy - Here you run the two models and compare outputs. - - -1: Set Up -~~~~~~~~~~~~~~~ -This is a straightforward bit of code to set up for the rest of the -recipe. 
- -The unique module we are importing here is torch.quantization which -includes PyTorch's quantized operators and conversion functions. We also -define a very simple LSTM model and set up some inputs. - -""" - -# import the modules used here in this recipe -import torch -import torch.quantization -import torch.nn as nn -import copy -import os -import time - -# define a very, very simple LSTM for demonstration purposes -# in this case, we are wrapping ``nn.LSTM``, one layer, no preprocessing or postprocessing -# inspired by -# `Sequence Models and Long Short-Term Memory Networks tutorial `__. -class lstm_for_demonstration(nn.Module): - """Elementary Long Short Term Memory style model which simply wraps ``nn.LSTM`` - Not to be used for anything other than demonstration. - """ - def __init__(self,in_dim,out_dim,depth): - super(lstm_for_demonstration,self).__init__() - self.lstm = nn.LSTM(in_dim,out_dim,depth) - - def forward(self,inputs,hidden): - out,hidden = self.lstm(inputs,hidden) - return out, hidden - - -torch.manual_seed(29592) # set the seed for reproducibility - -#shape parameters -model_dimension=8 -sequence_length=20 -batch_size=1 -lstm_depth=1 - -# random data for input -inputs = torch.randn(sequence_length,batch_size,model_dimension) -# hidden is actually is a tuple of the initial hidden state and the initial cell state -hidden = (torch.randn(lstm_depth,batch_size,model_dimension), torch.randn(lstm_depth,batch_size,model_dimension)) - - -###################################################################### -# 2: Do the Quantization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now we get to the fun part. First we create an instance of the model -# called ``float_lstm`` then we are going to quantize it. We're going to use -# the `torch.quantization.quantize_dynamic `__ function, which takes the model, then a list of the submodules -# which we want to -# have quantized if they appear, then the datatype we are targeting. This -# function returns a quantized version of the original model as a new -# module. -# -# That's all it takes. -# - - # here is our floating point instance -float_lstm = lstm_for_demonstration(model_dimension, model_dimension,lstm_depth) - -# this is the call that does the work -quantized_lstm = torch.quantization.quantize_dynamic( - float_lstm, {nn.LSTM, nn.Linear}, dtype=torch.qint8 -) - -# show the changes that were made -print('Here is the floating point version of this module:') -print(float_lstm) -print('') -print('and now the quantized version:') -print(quantized_lstm) - - -###################################################################### -# 3. Look at Model Size -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We've quantized the model. What does that get us? Well the first -# benefit is that we've replaced the FP32 model parameters with INT8 -# values (and some recorded scale factors). This means about 75% less data -# to store and move around. With the default values the reduction shown -# below will be less than 75% but if you increase the model size above -# (for example you can set model dimension to something like 80) this will -# converge towards 4x smaller as the stored model size dominated more and -# more by the parameter values. 
-# - -def print_size_of_model(model, label=""): - torch.save(model.state_dict(), "temp.p") - size=os.path.getsize("temp.p") - print("model: ",label,' \t','Size (KB):', size/1e3) - os.remove('temp.p') - return size - -# compare the sizes -f=print_size_of_model(float_lstm,"fp32") -q=print_size_of_model(quantized_lstm,"int8") -print("{0:.2f} times smaller".format(f/q)) - - -###################################################################### -# 4. Look at Latency -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# The second benefit is that the quantized model will typically run -# faster. This is due to a combinations of effects including at least: -# -# 1. Less time spent moving parameter data in -# 2. Faster INT8 operations -# -# As you will see the quantized version of this super-simple network runs -# faster. This will generally be true of more complex networks but as they -# say "your mileage may vary" depending on a number of factors including -# the structure of the model and the hardware you are running on. -# - -# compare the performance -print("Floating point FP32") - -##################################################################### -# .. code-block:: python -# -# %timeit float_lstm.forward(inputs, hidden) - -print("Quantized INT8") - -###################################################################### -# .. code-block:: python -# -# %timeit quantized_lstm.forward(inputs,hidden) - - -###################################################################### -# 5: Look at Accuracy -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We are not going to do a careful look at accuracy here because we are -# working with a randomly initialized network rather than a properly -# trained one. However, I think it is worth quickly showing that the -# quantized network does produce output tensors that are "in the same -# ballpark" as the original one. -# -# For a more detailed analysis please see the more advanced tutorials -# referenced at the end of this recipe. -# - -# run the float model -out1, hidden1 = float_lstm(inputs, hidden) -mag1 = torch.mean(abs(out1)).item() -print('mean absolute value of output tensor values in the FP32 model is {0:.5f} '.format(mag1)) - -# run the quantized model -out2, hidden2 = quantized_lstm(inputs, hidden) -mag2 = torch.mean(abs(out2)).item() -print('mean absolute value of output tensor values in the INT8 model is {0:.5f}'.format(mag2)) - -# compare them -mag3 = torch.mean(abs(out1-out2)).item() -print('mean absolute value of the difference between the output tensors is {0:.5f} or {1:.2f} percent'.format(mag3,mag3/mag1*100)) - - -###################################################################### -# Learn More -# ------------ -# We've explained what dynamic quantization is, what benefits it brings, -# and you have used the ``torch.quantization.quantize_dynamic()`` function -# to quickly quantize a simple LSTM model. -# -# This was a fast and high level treatment of this material; for more -# detail please continue learning with `(beta) Dynamic Quantization on an LSTM Word Language Model Tutorial `_. 
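######################################################################
# A note on timing outside of a notebook: the ``%timeit`` magic used above is
# only available in IPython/Jupyter. As a rough sketch for a plain Python
# script (reusing the ``float_lstm``/``quantized_lstm``, ``inputs``, and
# ``hidden`` objects created earlier; absolute numbers depend on your
# hardware), the standard ``timeit`` module gives a comparable comparison:

import timeit

# time 100 forward passes of each model and report the totals
fp32_seconds = timeit.timeit(lambda: float_lstm(inputs, hidden), number=100)
int8_seconds = timeit.timeit(lambda: quantized_lstm(inputs, hidden), number=100)
print('FP32 total for 100 iterations: {0:.4f} s'.format(fp32_seconds))
print('INT8 total for 100 iterations: {0:.4f} s'.format(int8_seconds))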
-# -# -# Additional Resources -# -------------------- -# -# * `Quantization API Documentaion `_ -# * `(beta) Dynamic Quantization on BERT `_ -# * `(beta) Dynamic Quantization on an LSTM Word Language Model `_ -# * `Introduction to Quantization on PyTorch `_ -# diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index da1f571982b..a64d2070aed 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -187,16 +187,6 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/tensorboard_with_pytorch.html :tags: Visualization,TensorBoard -.. Quantization - -.. customcarditem:: - :header: Dynamic Quantization - :card_description: Apply dynamic quantization to a simple LSTM model. - :image: ../_static/img/thumbnails/cropped/using-dynamic-post-training-quantization.png - :link: ../recipes/recipes/dynamic_quantization.html - :tags: Quantization,Text,Model-Optimization - - .. Production Development .. customcarditem:: @@ -220,20 +210,6 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile -.. customcarditem:: - :header: Fuse Modules recipe - :card_description: Learn how to fuse a list of PyTorch modules into a single module to reduce the model size before quantization. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/fuse.html - :tags: Mobile - -.. customcarditem:: - :header: Quantization for Mobile Recipe - :card_description: Learn how to reduce the model size and make it run faster without losing much on accuracy. - :image: ../_static/img/thumbnails/cropped/mobile.png - :link: ../recipes/quantization.html - :tags: Mobile,Quantization - .. customcarditem:: :header: Script and Optimize for Mobile :card_description: Learn how to convert the model to TorchScipt and (optional) optimize it for mobile apps. 
diff --git a/redirects.py b/redirects.py
index 4e5dfceebfd..b0566c63f56 100644
--- a/redirects.py
+++ b/redirects.py
@@ -7,4 +7,23 @@
     "intermediate/torchserve_with_ipex_2.html": "../index.html",
     "recipes/torchserve_vertexai_tutorial.html": "../index.html",
     "beginner/flava_finetuning_tutorial.html": "../index.html",
+    "advanced/static_quantization_tutorial.html": "../index.html",
+    "advanced/dynamic_quantization_tutorial.html": "../index.html",
+    "intermediate/dynamic_quantization_bert_tutorial.html": "../index.html",
+    "intermediate/quantized_transfer_learning_tutorial.html": "../index.html",
+    "prototype/fx_graph_mode_ptq_dynamic.html": "../index.html",
+    "prototype/fx_graph_mode_ptq_static.html": "../index.html",
+    "prototype/fx_graph_mode_quant_guide.html": "../index.html",
+    "prototype/numeric_suite_tutorial.html": "../index.html",
+    "prototype/quantization_in_pytorch_2_0_export_tutorial.html": "../index.html",
+    "prototype/pt2e_quant_ptq.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_ptq.html",
+    "prototype/pt2e_quant_qat.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_qat.html",
+    "prototype/pt2e_quantizer.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quantizer.html",
+    "prototype/pt2e_quant_x86_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_x86_inductor.html",
+    "prototype/pt2e_quant_ptq_x86_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_x86_inductor.html",
+    "prototype/pt2e_quant_xpu_inductor.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_xpu_inductor.html",
+    "prototype/openvino_quantizer.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_openvino_inductor.html",
+    "recipes/fuse.html": "../index.html",
+    "recipes/quantization.html": "../index.html",
+    "recipes/recipes/dynamic_quantization.html": "../index.html",
 }