Remove ONNX RT Quantizer dependency (#1087)

yuwenzho · web-flow · commit 7bf81073c505 · 2022-07-27T16:06:59.000+08:00
diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
@@ -100,7 +100,7 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
             return model
         if model.model.opset_import[0].version < 11: # pragma: no cover
             logger.warning("Quantize input needs model opset 11 or newer.")
-        from onnxruntime.quantization.quant_utils import QuantizationMode
+        from neural_compressor.adaptor.ox_utils.util import QuantizationMode
         if self.backend in ["qlinearops", "qoperator"]:
             backend = QuantizationMode.QLinearOps
             if self.backend == "qlinearops":
@@ -218,7 +218,7 @@ def recover(self, model, q_config):
         if model.model.opset_import[0].version < 11: # pragma: no cover
             logger.warning("Quantize input needs model opset 11 or newer.")
 
-        from onnxruntime.quantization.quant_utils import QuantizationMode
+        from neural_compressor.adaptor.ox_utils.util import QuantizationMode
         if self.backend in ["qlinearops", "qoperator"]:
             backend = QuantizationMode.QLinearOps
         elif self.backend == "qdq":
@@ -471,7 +471,7 @@ def _pre_optimize(self, model, level=1):
         self.pre_optimized_model = model
 
     def _revert_fusedconv(self, model):
-        from onnxruntime.quantization.quant_utils import attribute_to_kwarg
+        from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
         from onnx import onnx_pb as onnx_proto
         new_nodes = []
         remove_nodes = []
@@ -813,7 +813,7 @@ def eval_func(dataloader):
 
     def diagnosis_helper(self, fp32_model, int8_model, tune_cfg=None, save_path=None):
         from neural_compressor.utils.utility import dump_data_to_local
-        from neural_compressor.model.onnx_model import find_by_name
+        from neural_compressor.adaptor.ox_utils.util import find_by_name
         if self.backend in ["qlinearops", "qoperator"]:
             supported_optype = ['Conv', 'MatMul', 'Concat', 'Attention', 'FusedConv',
                 'Add', 'Mul', 'LeakyRelu', 'Sigmoid', 'GlobalAveragePool', 'AveragePool']
diff --git a/neural_compressor/adaptor/ox_utils/operators/activation.py b/neural_compressor/adaptor/ox_utils/operators/activation.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
                                                  attribute_to_kwarg, ms_domain
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
diff --git a/neural_compressor/adaptor/ox_utils/operators/attention.py b/neural_compressor/adaptor/ox_utils/operators/attention.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import attribute_to_kwarg, ms_domain
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain
 from onnx import onnx_pb as onnx_proto
 '''
     Quantize Attention
diff --git a/neural_compressor/adaptor/ox_utils/operators/binary_op.py b/neural_compressor/adaptor/ox_utils/operators/binary_op.py
@@ -18,7 +18,7 @@
 
 import onnx
 from .base_operator import QuantOperatorBase
-from onnxruntime.quantization.quant_utils import attribute_to_kwarg, ms_domain, \
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain, \
                                                  QuantizedValueType
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
diff --git a/neural_compressor/adaptor/ox_utils/operators/concat.py b/neural_compressor/adaptor/ox_utils/operators/concat.py
@@ -18,7 +18,7 @@
 
 import onnx
 from .base_operator import QuantOperatorBase
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
         attribute_to_kwarg, ms_domain
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
diff --git a/neural_compressor/adaptor/ox_utils/operators/conv.py b/neural_compressor/adaptor/ox_utils/operators/conv.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import find_by_name, get_mul_node, \
+from neural_compressor.adaptor.ox_utils.util import find_by_name, \
                                                  QuantizedValueType, attribute_to_kwarg
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
@@ -88,17 +88,17 @@ def convert(self):
 
         scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
         if scales_mul_node is None:
-            scales_mul_node = get_mul_node([scale_0, scale_1],
-                scales_mul_op + ":0", scales_mul_op)
+            scales_mul_node = onnx.helper.make_node("Mul", [scale_0, scale_1], 
+                                        [scales_mul_op + ":0"], scales_mul_op)
             self.quantizer.new_nodes.append(scales_mul_node)
 
         scales_mul_op_output = scales_mul_node.output[0]
 
         # Add mul operation to multiply mul_scales_op result with output of ConvInteger
         # and make the output of this node the same as output of original conv node.
         output_scale_mul_op = node.name + "_output_scale_mul"
-        self.quantizer.new_nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], 
-                                                  node.output[0], output_scale_mul_op))
+        self.quantizer.new_nodes.append(onnx.helper.make_node("Mul",
+            [cast_op_output, scales_mul_op_output], [node.output[0]], output_scale_mul_op))
         self.quantizer.remove_nodes.extend(parents[1:])
         self.quantizer.remove_nodes.append(node)
 
diff --git a/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py b/neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py
@@ -20,7 +20,7 @@
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
 from onnx import onnx_pb as onnx_proto
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
                                                  attribute_to_kwarg, ms_domain
 '''
 Quantize EmbedLayerNormalization
diff --git a/neural_compressor/adaptor/ox_utils/operators/gather.py b/neural_compressor/adaptor/ox_utils/operators/gather.py
@@ -18,7 +18,7 @@
 
 import onnx
 from .base_operator import QuantOperatorBase
-from onnxruntime.quantization.quant_utils import QuantizedValueType, attribute_to_kwarg
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, attribute_to_kwarg
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
 '''
diff --git a/neural_compressor/adaptor/ox_utils/operators/gavgpool.py b/neural_compressor/adaptor/ox_utils/operators/gavgpool.py
@@ -18,7 +18,7 @@
 
 import onnx
 from .base_operator import QuantOperatorBase
-from onnxruntime.quantization.quant_utils import attribute_to_kwarg, ms_domain, \
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain, \
                                                  QuantizedValueType
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue                                                 
 class QGlobalAveragePool(QuantOperatorBase):
diff --git a/neural_compressor/adaptor/ox_utils/operators/lstm.py b/neural_compressor/adaptor/ox_utils/operators/lstm.py
@@ -20,7 +20,7 @@
 import numpy
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import attribute_to_kwarg, ms_domain, QuantType
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain, QuantType
 from onnx import onnx_pb as onnx_proto
 '''
     Quantize LSTM
diff --git a/neural_compressor/adaptor/ox_utils/operators/matmul.py b/neural_compressor/adaptor/ox_utils/operators/matmul.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import find_by_name, get_mul_node, \
+from neural_compressor.adaptor.ox_utils.util import find_by_name, \
                                                  QuantizedValueType
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
@@ -73,18 +73,18 @@ def convert(self):
 
         scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
         if scales_mul_node is None:
-            scales_mul_node = get_mul_node([scale[0], scale[1]], 
-                scales_mul_op + ":0", scales_mul_op)
+            scales_mul_node = onnx.helper.make_node("Mul", [scale[0], scale[1]],  
+                [scales_mul_op + ":0"], scales_mul_op)
             self.quantizer.new_nodes.append(scales_mul_node)
 
         scales_mul_op_output = scales_mul_node.output[0]
 
         # Add mul operation to multiply mul_scales_op result with output of MatMulInteger
         # and make the output of this node the same as output of original matmul node.
         output_scale_mul_op = node.name + "_output_scale_mul"
-        self.quantizer.new_nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], 
-                                  node.output[0], 
-                                  output_scale_mul_op))
+        self.quantizer.new_nodes.append(
+            onnx.helper.make_node("Mul", [cast_op_output, scales_mul_op_output],
+                            [node.output[0]], output_scale_mul_op))
         if parents[1].op_type == 'DequantizeLinear':
             self.quantizer.remove_nodes.append(parents[1])
         self.quantizer.remove_nodes.append(node)
diff --git a/neural_compressor/adaptor/ox_utils/operators/maxpool.py b/neural_compressor/adaptor/ox_utils/operators/maxpool.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .direct_q8 import QDQDirect8BitOp
-from onnxruntime.quantization.quant_utils import QuantizedValueType
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
 
@@ -66,4 +66,4 @@ def quantize(self):
             return
 
         # Direct 8bits op
-        super().quantize()
+        super().quantize()
diff --git a/neural_compressor/adaptor/ox_utils/operators/pad.py b/neural_compressor/adaptor/ox_utils/operators/pad.py
@@ -18,11 +18,11 @@
 
 import numpy
 import onnx
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
                                                  attribute_to_kwarg
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from neural_compressor.adaptor.ox_utils.util import QuantizedValue
+from neural_compressor.adaptor.ox_utils.util import QuantizedValue, quantize_nparray
 
 class QDQPad(QDQOperatorBase):
     def __init__(self, onnx_quantizer, onnx_node):
@@ -101,11 +101,3 @@ def convert(self):
         node.input[0] = parent.input[0]
         node.output[0] = child.output[0]
         self.quantizer.remove_nodes.extend([parent, child])
-
-def quantize_nparray(qtype, arr, scale, zero_point, low=None, high=None):
-    dtype = numpy.uint8 if qtype == "uint8" else numpy.int8
-    cliplow = max(0 if dtype == numpy.uint8 else -127, -127 if low is None else low)
-    cliphigh = min(255 if dtype == numpy.uint8 else 127, 255 if high is None else high)
-    arr_fp32 = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
-    numpy.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
-    return arr_fp32.astype(dtype)
diff --git a/neural_compressor/adaptor/ox_utils/operators/pooling.py b/neural_compressor/adaptor/ox_utils/operators/pooling.py
@@ -19,7 +19,7 @@
 import onnx
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
         attribute_to_kwarg, ms_domain
 from onnx import onnx_pb as onnx_proto
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
diff --git a/neural_compressor/adaptor/ox_utils/operators/qdq_base_operator.py b/neural_compressor/adaptor/ox_utils/operators/qdq_base_operator.py
@@ -19,8 +19,6 @@
 
 import itertools
 from .base_operator import QuantOperatorBase
-from onnxruntime.quantization.quant_utils import QuantizedValue, QuantizedValueType, \
-                                                    attribute_to_kwarg, quantize_nparray
 
 
 class QDQOperatorBase(QuantOperatorBase):
diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py
@@ -17,7 +17,7 @@
 #
 
 import onnx
-from onnxruntime.quantization.quant_utils import QuantizedValueType, \
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType, \
                                                  attribute_to_kwarg
 from .base_operator import QuantOperatorBase 
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
diff --git a/neural_compressor/adaptor/ox_utils/quantizer.py b/neural_compressor/adaptor/ox_utils/quantizer.py
@@ -27,18 +27,16 @@
 from onnx import onnx_pb as onnx_proto
 from onnx import TensorProto
 from onnx import shape_inference
-from onnxruntime.quantization.quant_utils import QuantizedValueType
-from onnxruntime.quantization.quant_utils import find_by_name, get_elem_index, get_mul_node, \
-                                generate_identified_filename, attribute_to_kwarg, type_to_name
-from onnxruntime.quantization.quant_utils import __producer__, __version__, onnx_domain
 from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
-from onnxruntime.quantization.quant_utils import QuantizationMode
 
 from neural_compressor.adaptor.ox_utils.registry import CreateQDQQuantizer, \
     CreateOpConverter, CreateCaster
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue, QuantizedInitializer, \
-    quantize_data_with_scale_zero, quantize_data, dtype_mapping, support_pair, ValueInfo, \
-    _get_qrange_for_qType, convert_np_to_float16, cast_tensor, make_quant_node, make_dquant_node
+    _get_qrange_for_qType, cast_tensor, make_quant_node, make_dquant_node
+from neural_compressor.adaptor.ox_utils.util import QuantizedValueType
+from neural_compressor.adaptor.ox_utils.util import find_by_name, dtype_to_name
+from neural_compressor.adaptor.ox_utils.util import __producer__, __version__
+from neural_compressor.adaptor.ox_utils.util import quantize_data, dtype_mapping, support_pair, ValueInfo
 from neural_compressor import options
 from neural_compressor.utils.utility import CpuInfo
 from neural_compressor.model.onnx_model import ONNXModel
@@ -791,7 +789,8 @@ def tensor_proto_to_array(initializer):
             weights = onnx.numpy_helper.to_array(initializer)
         else:
             raise ValueError('Only float type quantization is supported. \
-               Weights {} is {}. '.format(initializer.name, type_to_name[initializer.data_type]))
+                Weights {} is {}.'.format(initializer.name, 
+                    dtype_to_name(dtype_mapping, initializer.data_type)))
         return weights
 
     def _get_quantization_params(self, param_name):
diff --git a/neural_compressor/adaptor/ox_utils/registry.py b/neural_compressor/adaptor/ox_utils/registry.py
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from onnxruntime.quantization.quant_utils import QuantizationMode
+from .util import QuantizationMode
 from .operators.base_operator import QuantOperatorBase
 from .operators.qdq_base_operator import QDQOperatorBase
 from .operators.matmul import MatMulInteger, QLinearMatMul, QDQMatMul
diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py
@@ -20,7 +20,14 @@
 import numpy as np
 from onnx import helper
 from onnx import onnx_pb as onnx_proto  
-from onnxruntime.quantization.quant_utils import QuantType       
+from enum import Enum
+from pathlib import Path
+import abc
+
+__producer__ = "onnx.quantize"
+__version__ = "0.1.0"
+onnx_domain = "ai.onnx"
+ms_domain = "com.microsoft"      
 
 support_pair = {
     'uint8 uint8': True,
@@ -35,11 +42,29 @@
 
 dtype_mapping = {
     'fp32': 1,
-    'fp16': 10,
-    'int8': 3,
     'uint8': 2,
+    'int8': 3,
+    'uint16': 4,
+    'int16': 5,
+    'int32': 6,
+    'int64': 7,
+    'string': 8,
+    'bool': 9,
+    'fp16': 10,
+    'double': 11,
+    'uint32': 12,
+    'uint64': 13,
+    'complex64': 14,
+    'complex128': 15,
 }
 
+def dtype_to_name(dtype_mapping, dtype):
+    return list(dtype_mapping.keys())[list(dtype_mapping.values()).index(dtype)]
+
+class QuantType(Enum): # pragma: no cover
+    QInt8 = 0
+    QUInt8 = 1
+
 def make_quant_node(name, inputs, outputs):
     return helper.make_node("QuantizeLinear", inputs, outputs, name)
 
@@ -318,3 +343,63 @@ def __init__(self,
         self.axis = axis
         # If empty, single zero point and scales computed from a single rmin and rmax
         self.qType = qType
+
+
+class QuantizationMode(Enum): # pragma: no cover
+    IntegerOps = 0
+    QLinearOps = 1
+
+class QuantizedValueType(Enum): # pragma: no cover
+    Input = 0
+    Initializer = 1
+
+class QuantFormat(Enum): # pragma: no cover
+    QOperator = 0
+    QDQ = 1
+
+def quantize_nparray(qtype, arr, scale, zero_point, low=None, high=None):
+    dtype = np.uint8 if qtype == "uint8" else np.int8
+    cliplow = max(0 if dtype == np.uint8 else -127, -127 if low is None else low)
+    cliphigh = min(255 if dtype == np.uint8 else 127, 255 if high is None else high)
+    arr_fp32 = np.asarray((arr.astype(np.float32) / scale).round() + zero_point)
+    np.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
+    return arr_fp32.astype(dtype)
+
+def attribute_to_kwarg(attribute):
+    '''
+    Convert attribute to kwarg format for use with onnx.helper.make_node.
+    '''
+    attribute_mapping = {
+        1: attribute.f,
+        2: attribute.i,
+        3: attribute.s,
+        4: attribute.t,
+        5: attribute.g,
+        6: attribute.floats,
+        7: attribute.ints,
+        8: attribute.strings,
+        9: attribute.tensors,
+        10: attribute.graphs
+    }
+    if attribute.type in attribute_mapping:
+        value = attribute_mapping[attribute.type]
+    else: # pragma: no cover
+        raise ValueError(
+            'attribute {} has no type specified '
+            'or unsupported type {}.'.format(attribute.name, attribute.type))
+    return {attribute.name: value}
+
+def find_by_name(name, item_list):
+    '''
+    Helper function to find item by name in a list.
+    '''
+    items = []
+    for item in item_list:
+        assert hasattr(item, "name"), \
+            "{} should have a 'name' atrribute defined".format(item) # pragma: no cover
+        if item.name == name:
+            items.append(item)
+    if len(items) > 0:
+        return items[0]
+    else:
+        return None
diff --git a/neural_compressor/model/onnx_model.py b/neural_compressor/model/onnx_model.py
diff --git a/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py b/test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py
diff --git a/test/model/test_onnx_model.py b/test/model/test_onnx_model.py