
Commit 8c565a4

fix corner case in ONNX backend (#1115)
1 parent d2a9905 commit 8c565a4

File tree

6 files changed: +33 −49 lines

neural_compressor/adaptor/onnxrt.py

Lines changed: 12 additions & 39 deletions
@@ -269,59 +269,29 @@ def _dump_model_op_stats(self, model):
         for precision in self.query_handler.get_precisions():
             if precision != 'fp32':
                 fp32_op_list += self.query_handler.get_op_types_by_precision(precision=precision)
+        qdq_ops = ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"]
         res = {}
         for op_type in fp32_op_list:
             res[op_type] = {'INT8':0, 'BF16': 0, 'FP16': 0, 'FP32':0}
-        for op_type in ["QuantizeLinear", "DequantizeLinear", "DynamicQuantizeLinear"]:
+        for op_type in qdq_ops:
             res[op_type] = {'INT8':0, 'BF16': 0, 'FP16': 0, 'FP32':0}
 
-
-        if self.backend in ["qlinearops", "qdq", "qoperator"] :
-            int8_op_list = ["QLinearConv", "QLinearMatMul", "QAttention",
-                            "QLinearMul", "QLinearRelu", "QLinearClip",
-                            "QLinearLeakyRelu", "QLinearSigmoid", "MaxPool","Squeeze",
-                            "EmbedLayerNormalization", "QLinearGlobalAveragePool",
-                            "QLinearAdd", "Pad", "Split", "Gather", "Reshape", "Concat",
-                            "QuantizeLinear", "DequantizeLinear", "QLinearAveragePool",
-                            "Unsqueeze", "Transpose"
-                            ]
-        else:
-            int8_op_list = ["ConvInteger", "MatMulInteger", "QAttention",
-                            "DynamicQuantizeLSTM", "Gather", "EmbedLayerNormalization",
-                            "DynamicQuantizeLinear"
-                            ]
-
         for node in model.model.graph.node:
-            possible_int8_res = [name for name in int8_op_list if node.op_type.find(name) != -1]
-
-            if any(possible_int8_res):
+            if node.name.endswith('_quant'):
                 if self.backend in ["qlinearops", "qdq", "qoperator"]:
-                    if node.op_type == "QuantizeLinear" or node.op_type == "DequantizeLinear" \
-                        or node.op_type == "DynamicQuantizeLinear":
-                        origin_op_type = node.op_type
-                    else:
-                        origin_op_type = possible_int8_res[0].split('QLinear')[-1]
+                    origin_op_type = node.op_type.split('QLinear')[-1]
                 else:
-                    origin_op_type = possible_int8_res[0].split('Integer')[0]
-
-                if node.op_type in ["Pad", "Split", "Gather", "Concat", "Reshape", "Unsqueeze",
-                                    "Squeeze", "Transpose"]:
-                    if any([output.endswith('_quantized') for output in node.output]) or \
-                        any(['_DequantizeLinear' in inp for inp in node.input]):
-                        origin_op_type = node.op_type
-                    else:
-                        if node.op_type in res:
-                            res[node.op_type]['FP32'] += 1
-                        continue
+                    origin_op_type = node.op_type.split('Integer')[0]
 
                 if origin_op_type == "QAttention":
                     origin_op_type = "Attention"
-                if origin_op_type == "DynamicQuantizeLSTM":
+                elif origin_op_type == "DynamicQuantizeLSTM":
                     origin_op_type = "LSTM"
+                elif origin_op_type == "QEmbedLayerNormalization":
+                    origin_op_type = "EmbedLayerNormalization"
                 res[origin_op_type]['INT8'] += 1
 
-            elif node.op_type in fp32_op_list and \
-                any(['_DequantizeLinear' in inp for inp in node.input]):
+            elif node.op_type in qdq_ops:
                 res[node.op_type]['INT8'] += 1
 
             elif node.op_type in fp32_op_list and node.name in self.quantize_config:

@@ -330,6 +300,9 @@ def _dump_model_op_stats(self, model):
                 else:
                     res[node.op_type][self.quantize_config[node.name].upper()] += 1
 
+            elif node.op_type in res:
+                res[node.op_type]['FP32'] += 1
+
         output_data = [[op_type, sum(res[op_type].values()), res[op_type]['INT8'],
                         res[op_type]['BF16'], res[op_type]['FP16'], res[op_type]['FP32']] for \
                         op_type in res.keys()]
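Note: the rewritten _dump_model_op_stats no longer guesses from hard-coded INT8 op lists; it keys off the '_quant' suffix that the quantizer appends to node names, then strips the backend prefix/suffix from the op type. A minimal sketch of that recovery step, assuming only the naming conventions visible in the diff (the helper name and the asserts are illustrative, not part of the adaptor):

# Sketch: map a quantized node's op_type back to the original op type.
def origin_op_type_of(op_type, backend):
    if backend in ("qlinearops", "qdq", "qoperator"):
        origin = op_type.split("QLinear")[-1]   # 'QLinearConv' -> 'Conv'
    else:
        origin = op_type.split("Integer")[0]    # 'MatMulInteger' -> 'MatMul'
    # special cases handled explicitly in the patch
    return {"QAttention": "Attention",
            "DynamicQuantizeLSTM": "LSTM",
            "QEmbedLayerNormalization": "EmbedLayerNormalization"}.get(origin, origin)

assert origin_op_type_of("QLinearConv", "qdq") == "Conv"
assert origin_op_type_of("MatMulInteger", "integerops") == "MatMul"
assert origin_op_type_of("QEmbedLayerNormalization", "qdq") == "EmbedLayerNormalization"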

neural_compressor/adaptor/ox_utils/onnxrt_mid.py

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,9 @@ def augment_graph(self, activation_only=False, weight_only=False):
             elif not onnx_version < ONNX18_VERSION:
                 tensors_to_dump.update(node.input)
                 tensors_to_dump.update(node.output)
+                if node.op_type == 'EmbedLayerNormalization' and len(node.output) > 1 and \
+                    node.output[2] in tensors_to_dump:
+                    tensors_to_dump.remove(node.output[2])
             elif weight_only:
                 for input in node.input:
                     if self.already_quantized and \
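Note: the added guard drops EmbedLayerNormalization's third output from the set of tensors dumped for calibration; that output is optional for the op, so collecting it can break the augmented graph. A minimal reproduction of the filtering with a stand-in node (the real code walks onnx.NodeProto objects; the output names here are illustrative):

# Stand-in for onnx.NodeProto: only op_type and output matter here.
class FakeNode:
    op_type = 'EmbedLayerNormalization'
    output = ['output', 'mask_index', 'embedding_sum']

node = FakeNode()
tensors_to_dump = set(node.output)
if node.op_type == 'EmbedLayerNormalization' and len(node.output) > 1 and \
        node.output[2] in tensors_to_dump:
    tensors_to_dump.remove(node.output[2])  # drop the optional third output
assert 'embedding_sum' not in tensors_to_dump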

neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ def convert(self):
         [7] mask (int32) (optional)
         '''
 
-        parents = self.quantizer.model.get_parents(node)
+        parents = [i for i in self.quantizer.model.get_parents(node) \
+                   if i.op_type == 'DequantizeLinear']
         inputs = []
         # 'input_ids'
         inputs.extend([node.input[0]])
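Note: get_parents returns every producer of the node's inputs, but convert() only wants the DequantizeLinear parents that feed the quantized embedding weights; any other parent (e.g. the path producing input_ids) must not be counted. A sketch of the filter with stand-in nodes:

from collections import namedtuple

# Stand-in for onnx.NodeProto; only op_type matters for the filter.
Node = namedtuple('Node', 'op_type')

all_parents = [Node('Gather'), Node('DequantizeLinear'), Node('DequantizeLinear')]
# Keep only the DequantizeLinear producers, as the patched convert() does.
parents = [p for p in all_parents if p.op_type == 'DequantizeLinear']
assert len(parents) == 2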

neural_compressor/adaptor/ox_utils/quantizer.py

Lines changed: 12 additions & 6 deletions
@@ -169,6 +169,11 @@ def merge_dedicated_qdq_pair(self):
                         self.replace_input.append([self.model.get_children(child)[0],
                                                    child.output[0], node.input[0]])
                         self.remove_nodes.append(node)
+            self.model.remove_nodes(self.remove_nodes)
+            self.model.graph().node.extend(self.new_nodes)
+            for node, old_input_name, new_input_name in self.replace_input:
+                self.model.replace_node_input(node, old_input_name, new_input_name)
+            self.model.update()
         elif self.mode != 'qdq' or not self.dedicated_qdq_pair:
             target_type = ['QuantizeLinear', 'DequantizeLinear']
             for op_type in target_type:

@@ -190,11 +195,11 @@ def merge_dedicated_qdq_pair(self):
                             self.replace_input.append([self.model.get_children(dq_nodes[i])[0],
                                                        dq_nodes[i].output[0],
                                                        dq_nodes[idx].output[0]])
-        self.model.remove_nodes(self.remove_nodes)
-        self.model.graph().node.extend(self.new_nodes)
-        for node, old_input_name, new_input_name in self.replace_input:
-            self.model.replace_node_input(node, old_input_name, new_input_name)
-        self.model.update()
+            self.model.remove_nodes(self.remove_nodes)
+            self.model.graph().node.extend(self.new_nodes)
+            for node, old_input_name, new_input_name in self.replace_input:
+                self.model.replace_node_input(node, old_input_name, new_input_name)
+            self.model.update()
 
     def should_cast(self, node):
         if node.name in self.config and self.config[node.name] != 'fp32': # pragma: no cover

@@ -269,7 +274,8 @@ def dfs(match_nodes, node, pattern):
 
             self.remove_nodes.append(match_nodes[1])
             if all([i.op_type in ['QuantizeLinear', 'DequantizeLinear'] \
-                for i in self.model.get_children(match_nodes[0])]):
+                for i in self.model.get_children(match_nodes[0])]) and \
+                match_nodes[0].output[0] not in self.model.output():
                 self.remove_nodes.append(match_nodes[0])
             else: # pragma: no cover
                 parent = self.model.get_parents(match_nodes[0])[0]
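Note: two fixes land in this file. First, the remove/extend/replace/update flush that used to run once after the whole if/elif now runs inside each branch, so edits queued by the dedicated-QDQ-pair path are applied before the method returns. Second, dfs may only delete match_nodes[0] when its output is not also a graph output; otherwise merging the pair would silently drop a model output. A sketch of the flush both branches now perform (the method names mirror the diff; wrapping them in a helper is my framing, not how the source factors it):

def flush_pending_edits(model, remove_nodes, new_nodes, replace_input):
    model.remove_nodes(remove_nodes)        # drop the merged QDQ nodes
    model.graph().node.extend(new_nodes)    # append any replacement nodes
    for node, old_input_name, new_input_name in replace_input:
        model.replace_node_input(node, old_input_name, new_input_name)
    model.update()                          # rebuild cached graph metadata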

neural_compressor/adaptor/ox_utils/registry.py

Lines changed: 3 additions & 2 deletions
@@ -20,7 +20,7 @@
 from .operators.qdq_base_operator import QDQOperatorBase
 from .operators.matmul import MatMulInteger, QLinearMatMul, QDQMatMul
 from .operators.attention import AttentionQuant, QDQAttention
-from .operators.embed_layernorm import EmbedLayerNormalizationQuant
+from .operators.embed_layernorm import EmbedLayerNormalizationQuant, QDQEmbedLayerNormalization
 from .operators.gather import GatherConverter, GatherQuant
 from .operators.conv import QLinearConv, ConvInteger, QDQConv
 from .operators.activation import QLinearActivation, QDQRemovableActivation, QDQActivation

@@ -92,7 +92,8 @@
     "AveragePool": QDQPool,
     "Unsqueeze" : QDQDirect8BitOp,
     "Concat": QDQConcat,
-    "Split": QDQSplit
+    "Split": QDQSplit,
+    "EmbedLayerNormalization": QDQEmbedLayerNormalization
 }
 
 CastRegistry = {
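Note: with QDQEmbedLayerNormalization registered, the QDQ quantizer can dispatch EmbedLayerNormalization nodes by op_type like any other supported op. An illustrative dispatch; the handler classes are stand-ins for the real imports, and the fallback is my assumption, not the library's documented behavior:

class QDQOperatorBase: pass
class QDQSplit(QDQOperatorBase): pass
class QDQEmbedLayerNormalization(QDQOperatorBase): pass

QDQRegistry = {
    "Split": QDQSplit,
    "EmbedLayerNormalization": QDQEmbedLayerNormalization,  # new in this commit
}

def handler_for(op_type):
    # Fall back to the base handler when an op has no dedicated class.
    return QDQRegistry.get(op_type, QDQOperatorBase)

assert handler_for("EmbedLayerNormalization") is QDQEmbedLayerNormalization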

neural_compressor/adaptor/ox_utils/util.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def split_shared_bias(model):
     for node in node_list[1:]:
         if node.op_type not in ['Conv', 'FusedConv']:
             continue
-        if node.input[2] == input_name:
+        if len(node.input) > 2 and node.input[2] == input_name:
             new_input_name = node.input[2] + '_nc_split_' + node.name
             new_input = helper.make_tensor(
                 new_input_name,
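Note: the bias input of ONNX Conv is optional (input index 2), so node.input may hold only two names; the added length check keeps split_shared_bias from raising an IndexError on bias-free convolutions. A runnable check using real onnx protos:

from onnx import helper

conv_no_bias = helper.make_node('Conv', inputs=['x', 'w'], outputs=['y'])
conv_with_bias = helper.make_node('Conv', inputs=['x', 'w', 'b'], outputs=['y2'])

for node in (conv_no_bias, conv_with_bias):
    # Without the len() guard, the bias-free Conv raises an IndexError here.
    if len(node.input) > 2 and node.input[2] == 'b':
        print(node.output[0], 'consumes the shared bias')  # prints only 'y2'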
