
Commit 4514402: Support diagnosis for ONNX NLP models (#1012)

Signed-off-by: Mengni Wang <mengni.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>

1 parent d64d0b7 · commit 4514402

7 files changed: +66 −44 lines


docs/source/releases_info.md (3 additions, 1 deletion)

```diff
@@ -17,6 +17,8 @@ Contact [inc.maintainers@intel.com](mailto:inc.maintainers@intel.com) if you nee
 
 The MSE tuning strategy does not work with the PyTorch adaptor layer. This strategy requires a comparison between the FP32 and INT8 tensors to decide which op impacts the final quantization accuracy. The PyTorch adaptor layer does not implement this inspect tensor interface. Therefore, do not choose the MSE tuning strategy for PyTorch models.
 
+The diagnosis function does not work with ONNX Runtime 1.13.1 for QDQ-format quantization of ONNX models: it cannot dump the output values of QDQ pairs due to a framework limitation.
+
 ## Incompatible Changes
 
 [Neural Compressor v1.2](https://github.com/intel/neural-compressor/tree/v1.2) introduces incompatible changes in user facing APIs. Please refer to [incompatible changes](incompatible_changes.md) to know which incompatible changes are made in v1.2.
@@ -25,4 +27,4 @@ The MSE tuning strategy does not work with the PyTorch adaptor layer. This strat
 
 [Neural Compressor v1.7](https://github.com/intel/neural-compressor/tree/v1.7) renames the pip/conda package name from lpot to neural_compressor. To run old examples on latest software, please replace package name for compatibility with `sed -i "s|lpot|neural_compressor|g" your_script.py` .
 
-[Neural Compressor v2.0](https://github.com/intel/neural-compressor/tree/v2.0) renames the `DATASETS` class as `Datasets`, please notice use cases like `from neural_compressor.data import Datasets`. Details please check the [PR](https://github.com/intel/neural-compressor/pull/244/files).
+[Neural Compressor v2.0](https://github.com/intel/neural-compressor/tree/v2.0) renames the `DATASETS` class as `Datasets`, please notice use cases like `from neural_compressor.data import Datasets`. Details please check the [PR](https://github.com/intel/neural-compressor/pull/244/files).
```
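As context for the known-issue note added above, a caller can guard diagnosis on the installed ONNX Runtime version. This is a minimal, hypothetical sketch (not part of this commit); it mirrors the `Version` comparison used in the updated test further below and assumes only `onnxruntime` and `packaging` are installed:

```python
# Hedged sketch: skip tensor diagnosis on ONNX Runtime 1.13.1, where QDQ
# pair outputs cannot be dumped (see the known-issue note above).
import onnxruntime as ort
from packaging.version import Version

def diagnosis_supported(quant_format: str) -> bool:
    """Return False for the known-bad combination; True otherwise."""
    if quant_format == 'qdq' and Version(ort.__version__) == Version("1.13.1"):
        return False
    return True

if diagnosis_supported('qdq'):
    print("safe to inspect QDQ tensors on ORT", ort.__version__)
else:
    print("diagnosis disabled: ORT 1.13.1 cannot dump QDQ pair outputs")
```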

neural_compressor/adaptor/onnxrt.py (2 additions, 1 deletion)

```diff
@@ -493,7 +493,8 @@ def inspect_tensor(self, model, dataloader, op_list=[],
                                  white_nodes=op_list,
                                  backend=self.backend)
         tensors = augment.dump_tensor(activation=(inspect_type!='weight'),
-                                      weight=(inspect_type!='activation'),)
+                                      weight=(inspect_type!='activation'),
+                                      format=self.format)
         if save_to_disk:
             if not save_path:
                 save_path = self.work_space
```
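A hypothetical call site for the extended signature, assuming an `ONNXRTAugment` instance named `augment` as in the hunk above; the `format` value comes from the adaptor's configured quantization format:

```python
# Sketch only: forward the quantization format so dump_tensor can map
# QDQ tensor names ('*_dequantized') as well as QOperator names ('*_quantized').
tensors = augment.dump_tensor(activation=True,  # dump activation tensors
                              weight=False,     # skip weight tensors
                              format='qdq')     # keyword added by this commit
```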

neural_compressor/adaptor/ox_utils/calibration.py (26 additions, 19 deletions)

```diff
@@ -53,7 +53,7 @@ def __init__(self, model_wrapper,
                  black_nodes=[],
                  white_nodes=[],
                  iterations=[],
-                 backend=['CPUExecutionProvider'],
+                 backend='CPUExecutionProvider',
                  reduce_range=False):
         """Initialization.
@@ -149,7 +149,7 @@ def augment_graph(self, activation_only=False, weight_only=False):
             elif not self.already_quantized and input in initializers:
                 tensors_to_dump.add(input)
             elif activation_only:
-                tensors_to_dump.update(node.output)
+                tensors_to_dump.update([node.input[0]])
 
         model_inputs = [i.name for i in model.graph.input]
         for tensor in tensors_to_dump:
@@ -160,9 +160,7 @@ def augment_graph(self, activation_only=False, weight_only=False):
             for augment_node_type in self.augment_nodes:
                 if augment_node_type in ['DequantizeLinear']:
                     # insert DequantizeLinear node as output
-                    if tensor.endswith('_scale') or tensor.endswith('_zero_point') or \
-                        tensor.endswith('_QuantizeLinear') or \
-                        tensor.endswith('_QuantizeInput_quantized'):
+                    if tensor.endswith('_scale') or tensor.endswith('_zero_point'):
                         continue
 
                     if not self.dynamically_quantized:
@@ -483,14 +481,16 @@ def calculate_quantization_params(self, q_config, quantization_thresholds):
 
         return quantization_params
 
-    def dump_tensor(self, activation=True, weight=False):
+    def dump_tensor(self, activation=True, weight=False, format=None):
         """Dump activation or weight or both from the model."""
+        is_qdq = False
         if "QuantizeLinear" in [node.op_type for node in self.model.graph.node] or \
             "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node]:
             self.augment_nodes = ["DequantizeLinear"]
             self.already_quantized = True
             self.dynamically_quantized = \
                 "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node]
+            is_qdq = format == 'qdq'
         self.augment_graph(activation_only=not weight, weight_only=not activation)
         _, output_dicts = self.get_intermediate_outputs()
         iters = len(list(output_dicts.values())[-1])
@@ -507,30 +507,37 @@ def dump_tensor(self, activation=True, weight=False):
             if tensor_name.replace('_dequantized', '_quantized') in model_initializer_names:
                 nodes = [node for node in map_input[tensor_name] \
                     if node.name.replace('_quant', '') in self.white_nodes]
-            elif tensor_name.replace('_quantized', '') in model_input_names:
-                continue
-            else:
+            elif tensor_name in model_output_names:
                 nodes = [map_output[tensor_name]]
+            else:
+                nodes = map_input[tensor_name]
             for node in nodes:
                 node_name = node.name.replace('_quant', '')
                 if tensor_name in model_output_names and node_name not in self.white_nodes:
                     continue
-                while node_name not in self.white_nodes and self.already_quantized:
-                    node = augmengted_wrapper.get_parents(node, output_name_to_node=map_output)[0]
-                    node_name = node.name.replace('_quant', '')
                 if node_name not in self.white_nodes:
                     continue
                 if node_name not in map_node_weight:
                     map_node_weight[node_name] = {}
-                if tensor_name not in model_initializer_names:
+                if ((is_qdq and tensor_name.replace('_dequantized', '_quantized') not in model_initializer_names) or \
+                    (not is_qdq and tensor_name not in model_initializer_names)) and \
+                    tensor_name in node.input[:2]:
                     for i in range(iters):
-                        map_node_activation[i][node_name] = \
-                            {tensor_name.replace('_quantized', ''): tensors[i]}
-                elif not (node.op_type in ['Conv', 'Gemm', 'FusedConv'] and tensor_name not in node.input[:2]) and \
+                        if node.op_type in ['Attention', 'QAttention'] and tensor_name not in node.input[:2]:
+                            continue
+                        if is_qdq:
+                            map_node_activation[i][node_name] = \
+                                {tensor_name.replace('_dequantized', '').replace('_' + node_name, ''): tensors[i]}
+                        else:
+                            map_node_activation[i][node_name] = \
+                                {tensor_name.replace('_quantized', ''): tensors[i]}
+                elif not (node.op_type in ['QGemm'] and tensor_name not in node.input[:6]) and \
                     not (node.op_type in ['QLinearConv'] and tensor_name not in node.input[:8]) and \
-                    not (node.op_type in ['QGemm'] and tensor_name not in node.input[:6]):
-                    map_node_weight[node_name].update({tensor_name.replace('_quantized', ''): \
-                                                       tensors[0]})
+                    not (node.op_type in ['Conv', 'Gemm', 'FusedConv'] and tensor_name not in node.input[:2]):
+                    if is_qdq:
+                        map_node_weight[node_name].update({tensor_name.replace('_dequantized', ''): tensors[0]})
+                    else:
+                        map_node_weight[node_name].update({tensor_name.replace('_quantized', ''): tensors[0]})
         dumped_tensors_map = {}
         if weight:
             dumped_tensors_map.update({"weight": map_node_weight})
```
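The `is_qdq` branches above differ only in how dumped tensor names are mapped back to their FP32 names. A standalone sketch of that normalization (hypothetical helper, not the library's API):

```python
# Hypothetical helper reproducing the two name mappings in dump_tensor above:
# QDQ graphs dump '<name>_dequantized' (sometimes suffixed with the node name),
# while QOperator graphs dump '<name>_quantized'.
def normalize_activation_name(tensor_name: str, node_name: str, is_qdq: bool) -> str:
    if is_qdq:
        return tensor_name.replace('_dequantized', '').replace('_' + node_name, '')
    return tensor_name.replace('_quantized', '')

assert normalize_activation_name('input_dequantized_matmul', 'matmul', True) == 'input'
assert normalize_activation_name('input_quantized', 'matmul', False) == 'input'
```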

neural_compressor/adaptor/ox_utils/operators/maxpool.py (2 additions, 2 deletions)

```diff
@@ -66,7 +66,7 @@ def convert(self, convert_format):
             all([i.op_type != 'QuantizeLinear' for i in children]): # pragma: no cover
             return
         node.input[0] = parent.input[0]
-        node.output[0] = node.output[0] + '_quantized'
+        node.output[0] = node.output[0].replace('_QuantizeInput', '_quantized')
         for child in children:
             if child.op_type == 'QuantizeLinear':
                 self.quantizer.remove_nodes.append(child)
@@ -82,4 +82,4 @@ class QMaxPoolOperator(QOperator):
 
     def __init__(self, onnx_node, children, initializers):
         """Initialization."""
-        super().__init__(onnx_node, children, initializers)
+        super().__init__(onnx_node, children, initializers)
```
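The rename matters when the MaxPool output already carries a `_QuantizeInput` suffix: blindly appending `_quantized` produced names that downstream lookups could not resolve. A quick illustration with a made-up tensor name:

```python
# Illustration with a hypothetical tensor name:
old_output = 'maxpool_out_QuantizeInput'
# before this commit: old_output + '_quantized' -> 'maxpool_out_QuantizeInput_quantized'
# after this commit:
assert old_output.replace('_QuantizeInput', '_quantized') == 'maxpool_out_quantized'
```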

neural_compressor/model/onnx_model.py (14 additions, 4 deletions)

```diff
@@ -329,11 +329,21 @@ def get_scale_zero(self, tensor):
         if not tensor.endswith('_quantized'):
             logger.debug("Find {} in the quantized graph is not quantized.".format(tensor))
             return None, None
-        input_name_to_nodes = self._input_name_to_nodes
-        node = input_name_to_nodes[tensor][0]
-        scale = "_".join(tensor.split('_')[:-1] + ['scale'])
+        node = self._input_name_to_nodes[tensor][0]
+        parent = self._output_name_to_node[tensor] if tensor in self._output_name_to_node else None
+        direct_int8 = ['Reshape', 'Transpose', 'Squeeze', 'Unsqueeze', 'MaxPool', 'Pad']
+        if parent is not None and parent.op_type in direct_int8:
+            fp32_tensor_name = \
+                parent.input[0].replace('_quantized', '').replace('_QuantizeLinear', '').replace('_QuantizeInput', '')
+        elif node.op_type in ['Gather']:
+            fp32_tensor_name = \
+                node.output[0].replace('_quantized', '').replace('_QuantizeLinear', '').replace('_QuantizeInput', '')
+        else:
+            fp32_tensor_name = \
+                tensor.replace('_quantized', '').replace('_QuantizeLinear', '').replace('_QuantizeInput', '')
+        scale = fp32_tensor_name + '_scale'
         scale_tensor = self.get_initializer(scale)
-        zo = "_".join(tensor.split('_')[:-1] + ['zero_point'])
+        zo = fp32_tensor_name + '_zero_point'
         zo_tensor = self.get_initializer(zo)
 
         #TODO check if scale_tensor and zero_point is needed
```
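The net effect of the `get_scale_zero` change is that scale and zero-point initializer names are derived from a recovered FP32 tensor name rather than by chopping off the last `_`-separated token. A small sketch with an assumed tensor name:

```python
# Sketch with an assumed name: derive initializer names the way the new code does.
tensor = 'input_QuantizeLinear_quantized'   # hypothetical quantized tensor name
fp32_name = (tensor.replace('_quantized', '')
                   .replace('_QuantizeLinear', '')
                   .replace('_QuantizeInput', ''))
assert fp32_name == 'input'
scale_name = fp32_name + '_scale'            # 'input_scale'
zero_point_name = fp32_name + '_zero_point'  # 'input_zero_point'
# The old token-chopping logic would have produced 'input_QuantizeLinear_scale',
# which has no matching initializer in QDQ graphs.
```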

test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py (5 additions, 1 deletion)

```diff
@@ -6,6 +6,7 @@
 import torchvision
 import onnx
 import numpy as np
+from packaging.version import Version
 from collections import OrderedDict
 from onnx import onnx_pb as onnx_proto
 from onnx import helper, TensorProto, numpy_helper
@@ -731,6 +732,8 @@ def evaluate(self):
         with self.assertRaises(ValueError):
             test()
 
+    @unittest.skipIf(Version(ort.__version__) == Version("1.13.1"),
+        "This function does not work with ONNX Runtime 1.13.1 for QDQ format quantization of ONNX models.")
     def test_inspect_tensor(self):
         framework_specific_info = {"device": "cpu",
                                    "approach": "post_training_static_quant",
@@ -774,7 +777,8 @@ def test_inspect_tensor(self):
         self.assertTrue(len(fp32_tensor['activation']) == len(int8_tensor['activation']))
         self.assertTrue(sorted(fp32_tensor['activation'][0].keys()) == sorted(int8_tensor['activation'][0].keys()))
         for op in op_list:
-            self.assertTrue(sorted(fp32_tensor['activation'][0][op].keys()) == sorted(int8_tensor['activation'][0][op].keys()))
+            for x, y in zip(fp32_tensor['activation'][0][op].values(), int8_tensor['activation'][0][op].values()):
+                self.assertTrue(x.shape == y.shape)
 
         if fake_yaml == "qlinear.yaml":
             fp32_tensor = quantizer.strategy.adaptor.inspect_tensor(opt_model.model, self.cv_dataloader, op_list, inspect_type='weight')
```
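The loosened assertion reflects that FP32 and INT8 dumps may use different tensor-name keys for the same op after the QDQ name mapping, while the per-op tensor shapes should still line up. A runnable sketch of the structure the test relies on (inferred from `dump_tensor` above, not an authoritative API contract):

```python
import numpy as np

# Inferred structure: 'activation' is a list with one dict per calibration
# iteration, each mapping op_name -> {tensor_name: ndarray}.
fp32_tensor = {'activation': [{'conv': {'input': np.zeros((1, 3, 5, 5))}}]}
int8_tensor = {'activation': [{'conv': {'input_q': np.zeros((1, 3, 5, 5))}}]}

# Keys may differ between the two dumps (hence the test change), but shapes match:
for op in ['conv']:
    for x, y in zip(fp32_tensor['activation'][0][op].values(),
                    int8_tensor['activation'][0][op].values()):
        assert x.shape == y.shape
```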

test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py (14 additions, 16 deletions)

```diff
@@ -124,9 +124,9 @@ def test_dump_tensor(self):
                                 white_nodes=["conv"])
         map_dumped_tensors = augment.dump_tensor()
         assert "conv" in map_dumped_tensors["activation"][0]
-        assert "C" in map_dumped_tensors["activation"][0]["conv"]
+        assert "A" in map_dumped_tensors["activation"][0]["conv"]
         assert "conv" in map_dumped_tensors["activation"][1]
-        assert "C" in map_dumped_tensors["activation"][1]["conv"]
+        assert "A" in map_dumped_tensors["activation"][1]["conv"]
 
         model, dataloader = self.cv_session
         augment = ONNXRTAugment(ONNXModel(model),
@@ -321,6 +321,7 @@ def test_augment_graph(self):
         #                  |
         #            QuantizeLinear
 
+        Attention_input = helper.make_tensor_value_info('input_quantized', TensorProto.INT8, [7, 13])
         Attention_weight = helper.make_tensor_value_info('weight_quantized', TensorProto.INT8, [13,7])
         weight_quantized = generate_input_initializer([13, 7], np.int8, 'weight_quantized')
         Attention_bias = helper.make_tensor_value_info('bias', TensorProto.FLOAT, [13, 7])
@@ -340,7 +341,8 @@ def test_augment_graph(self):
         Q_zo = helper.make_tensor_value_info('attn_output_zero_point', TensorProto.INT8, [1])
         attn_output_zero_point = generate_input_initializer([1], np.int8, 'attn_output_zero_point')
         Output = helper.make_tensor_value_info('output', TensorProto.INT8, [13,7])
-        attention_node = onnx.helper.make_node('QAttention', ['weight_quantized',
+        attention_node = onnx.helper.make_node('QAttention', ['input_quantized',
+                                                              'weight_quantized',
                                                               'bias',
                                                               'input_scale',
                                                               'weight_scale',
@@ -354,7 +356,8 @@ def test_augment_graph(self):
                                             name='attn_output_QuantizeLinear')
         graph = helper.make_graph([attention_node, qlinear_node],
                                   'test_graph_5',
-                                  [Attention_weight,
+                                  [Attention_input,
+                                   Attention_weight,
                                    Attention_bias,
                                    Input_scale,
                                    Weight_scale,
@@ -380,14 +383,15 @@ def test_augment_graph(self):
         augment = ONNXRTAugment(ONNXModel(model), data_reader, [], white_nodes=['attention'])
         augment.augment_nodes = ['DequantizeLinear']
         augment.already_quantized = True
+
         augment.augment_graph(activation_only=True, weight_only=False)
         augmented_model = augment.augmented_model
 
         augmented_model_node_names = [node.name for node in augmented_model.graph.node]
         augmented_model_outputs = [output.name for output in augmented_model.graph.output]
         added_node_names = ['attention_quant', 'attn_output_QuantizeLinear']
-        added_outputs = ['attn_output', 'output']
-        self.assertEqual(len(augmented_model_node_names), 2)
+        added_outputs = ['input_quantized_output', 'output']
+        self.assertEqual(len(augmented_model_node_names), 3)
         self.assertEqual(len(augmented_model_outputs), 2)
         for name in added_node_names:
             self.assertTrue(name in augmented_model_node_names)
@@ -406,10 +410,6 @@ def test_augment_graph(self):
         a_scale = generate_input_initializer([1], np.float32, 'A_scale')
         A_zo = helper.make_tensor_value_info('A_zero_point', TensorProto.INT8, [1])
         a_zero_point = generate_input_initializer([1], np.int8, 'A_zero_point')
-        B_scale = helper.make_tensor_value_info('B_scale', TensorProto.FLOAT, [1])
-        b_scale = generate_input_initializer([1], np.float32, 'B_scale')
-        B_zo = helper.make_tensor_value_info('B_zero_point', TensorProto.INT8, [1])
-        b_zero_point = generate_input_initializer([1], np.int8, 'B_zero_point')
         C = helper.make_tensor_value_info('C', TensorProto.INT8, [1, 1, 5, 5])
         c = generate_input_initializer([1, 1, 5, 5], np.int8, 'C')
         C_scale = helper.make_tensor_value_info('C_scale', TensorProto.FLOAT, [1])
@@ -423,14 +423,12 @@ def test_augment_graph(self):
         D_zo = helper.make_tensor_value_info('D_zero_point', TensorProto.INT8, [1])
         d_zero_point = generate_input_initializer([1], np.int8, 'D_zero_point')
         D = helper.make_tensor_value_info('D', TensorProto.FLOAT, [1, 1, 5, 5])
-        quantize_node = onnx.helper.make_node('QuantizeLinear', ['A', 'A_scale', 'A_zero_point'], ['B'], name='A_QuantizeLinear')
-        conv_node = onnx.helper.make_node('QLinearConv', ['B', 'B_scale', 'B_zero_point', 'C', 'C_scale', 'C_zero_point', 'D_scale', 'D_zero_point', 'E'], ['D_quantized'], name='conv_quant', kernel_shape=[3, 3], pads=[1, 1, 1, 1])
+        quantize_node = onnx.helper.make_node('QuantizeLinear', ['A', 'A_scale', 'A_zero_point'], ['A_quantized'], name='A_QuantizeLinear')
+        conv_node = onnx.helper.make_node('QLinearConv', ['A_quantized', 'A_scale', 'A_zero_point', 'C_quantized', 'C_scale', 'C_zero_point', 'D_scale', 'D_zero_point', 'E'], ['D_quantized'], name='conv_quant', kernel_shape=[3, 3], pads=[1, 1, 1, 1])
         dequantize_node = onnx.helper.make_node('DequantizeLinear', ['D_quantized', 'D_scale', 'D_zero_point'], ['D'], name='D_DequantizeLinear')
         graph = helper.make_graph([quantize_node, conv_node, dequantize_node], 'test_graph_5', [A, A_scale, A_zo, C, C_scale, C_zo, E, D_scale, D_zo], [D])
         graph.initializer.add().CopyFrom(a_scale)
         graph.initializer.add().CopyFrom(a_zero_point)
-        graph.initializer.add().CopyFrom(b_scale)
-        graph.initializer.add().CopyFrom(b_zero_point)
         graph.initializer.add().CopyFrom(c)
         graph.initializer.add().CopyFrom(c_scale)
         graph.initializer.add().CopyFrom(c_zero_point)
@@ -449,8 +447,8 @@ def test_augment_graph(self):
 
         augmented_model_node_names = [node.name for node in augmented_model.graph.node]
         augmented_model_outputs = [output.name for output in augmented_model.graph.output]
-        added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', 'D_quantized_DequantizeLinear']
-        added_outputs = ['D', 'D_quantized_output']
+        added_node_names = ['A_QuantizeLinear', 'conv_quant', 'D_DequantizeLinear', 'A_quantized_DequantizeLinear']
+        added_outputs = ['D', 'A_quantized_output']
         self.assertEqual(len(augmented_model_node_names), 4)
         self.assertEqual(len(augmented_model_outputs), 2)
         for name in added_node_names:
```
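The updated expectations are consistent with the calibration change above: `augment_graph` now dumps a node's first input rather than its outputs, so for the QLinearConv node the dumped activation is `A_quantized` (hence the `A_quantized_DequantizeLinear` node and `A_quantized_output` output) instead of `D_quantized`. A hypothetical illustration of the inferred naming pattern:

```python
# For a dumped quantized tensor '<t>', augmentation appends a DequantizeLinear
# node named '<t>_DequantizeLinear' whose graph output is '<t>_output'
# (pattern inferred from the expectations in the hunk above).
tensor = 'A_quantized'
assert tensor + '_DequantizeLinear' == 'A_quantized_DequantizeLinear'
assert tensor + '_output' == 'A_quantized_output'
```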
