from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer
from hls4ml.model.types import FixedPrecisionType, Source
from hls4ml.utils.dependency import requires
-from hls4ml.utils.einsum_utils import parse_einsum  # noqa: F401

if typing.TYPE_CHECKING:
    from hls4ml.model import ModelGraph
@@ -66,8 +65,8 @@ def _(node: Dense):
@get_kernel_inp_kif.register(Conv1D)
@get_kernel_inp_kif.register(Conv2D)
def _(layer: Conv1D | Conv2D):
-    assert layer.attributes.attributes['data_format'] == 'channels_last', 'Only channels_last format is supported'
-    kernel = layer.attributes.attributes['weight'].data
+    assert layer.attributes['data_format'] == 'channels_last', 'Only channels_last format is supported'
+    kernel = layer.attributes['weight'].data
    k_in, i_in, f_in = _get_input_kif(layer)
    k_in, i_in, f_in = pad_arrs(layer, 0, k_in, i_in, f_in)
    k_in, i_in, f_in = im2col(kernel.shape, k_in, i_in, f_in)
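For intuition, the k/i/f arrays carry per-element precision bounds (keep_negative flag, integer bits, fractional bits) of the layer input; `pad_arrs` and `im2col` rearrange them so that each output position's receptive field lines up element-wise with the flattened kernel before the DA CMVM code is generated. Below is a minimal, purely illustrative numpy sketch of that im2col step on a 1D bit-width array; the helper here is not the hls4ml `im2col` API.

```python
import numpy as np

# Illustrative only: integer-bit widths of a length-6, single-channel input.
i_in = np.array([8, 8, 4, 4, 2, 2])

def im2col_1d(arr, kernel_size):
    # Stack the receptive field of every valid output position into rows, so each
    # row lines up element-wise with the flattened kernel for that output pixel.
    n_out = arr.shape[0] - kernel_size + 1
    return np.stack([arr[j:j + kernel_size] for j in range(n_out)])

print(im2col_1d(i_in, kernel_size=3))
# [[8 8 4]
#  [8 4 4]
#  [4 4 2]
#  [4 2 2]]
```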
@@ -149,16 +148,10 @@ def transform(self, model: 'ModelGraph', node: Layer):
        node.set_attr('da_codegen', Source(fn_str))


-dense_da_stream_template = '''struct config{index} {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_out = {n_out};
-    static const unsigned io_type = nnet::io_stream;
-    static const unsigned strategy = nnet::distributed_arithmetic;
-    constexpr static auto dense_da = nnet::dense_da_{index}<typename {inp_t}::value_type, typename {out_t}::value_type>;
-}};\n'''
-
-
class FuseQuantizerIntoDALayers(OptimizerPass):
+    """Heterogeneous quantizers can be fused into the DA CMVM kernel in some cases.
+    This allows heterogeneous quantization for io_stream in some cases."""
+
    def match(self, node: Layer):
        if not isinstance(node, FixedPointQuantizer):
            return False
@@ -203,9 +196,18 @@ def transform(self, model: 'ModelGraph', node: FixedPointQuantizer):
        return True


+dense_da_stream_template = '''struct config{index} {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_out = {n_out};
+    static const unsigned io_type = nnet::io_stream;
+    static const unsigned strategy = nnet::distributed_arithmetic;
+    constexpr static auto dense_da = nnet::dense_da_{index}<typename {inp_t}::value_type, typename {out_t}::value_type>;
+}};\n'''
+
+
class DALatencyDenseTemplate(OptimizerPass):
-    # For Dense, distributed arithmetic do not call the original, regardless of the io_type
-    # FOr io_stream, a minimal config will still be generated
+    # For Dense, distributed arithmetic does not call the original impl, regardless of the io_type
+    # For io_stream, a minimal config will still be generated
    def match(self, node: Layer):
        if node.class_name != 'Dense':
            return False
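For reference, a rough sketch of what the relocated `dense_da_stream_template` renders to once `transform` formats it with a node's attributes; the index, widths, and type names below are hypothetical, and `dense_da_stream_template` is assumed to be in scope from this file.

```python
# Hypothetical attribute values standing in for **node.attributes of a Dense node.
example_cfg = dense_da_stream_template.format(
    index=4,
    n_in=16,
    n_out=8,
    inp_t='layer4_input_t',
    out_t='result_t',
)
print(example_cfg)
# struct config4 {
#     static const unsigned n_in = 16;
#     static const unsigned n_out = 8;
#     static const unsigned io_type = nnet::io_stream;
#     static const unsigned strategy = nnet::distributed_arithmetic;
#     constexpr static auto dense_da = nnet::dense_da_4<typename layer4_input_t::value_type, typename result_t::value_type>;
# };
```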
@@ -225,10 +227,10 @@ def transform(self, model: 'ModelGraph', node: Layer):
        if io_type == 'io_parallel':
            fn_name = f'dense_da_{node.index}<{inp_t}, {out_t}>'
            function_cpp = f'{namespace}::{fn_name}({inp_name}, {out_name});'
-            node.attributes.attributes['function_cpp'] = function_cpp
+            node.attributes['function_cpp'] = function_cpp
        else:
            assert io_type == 'io_stream'
-            config_cpp = dense_da_stream_template.format(inp_t=inp_t, out_t=out_t, **node.attributes.attributes)
+            config_cpp = dense_da_stream_template.format(inp_t=inp_t, out_t=out_t, **node.attributes)
            function_cpp = f'nnet::dense<{inp_t}, {out_t}, config{node.index}>({inp_name}, {out_name});'
            node.attributes['config_cpp'] = config_cpp
            node.attributes['function_cpp'] = function_cpp
@@ -300,8 +302,8 @@ def transform(self, model: 'ModelGraph', node: Layer):
            class_name = class_name[9:]

        ndim = len(ker_shape) - 2
-        function_cpp = f'nnet::conv{ndim}d_da_cl<config{node.index}, {inp_t}, {out_t}>({inp_name}, {out_name});'
-        node.attributes.attributes['function_cpp'] = function_cpp
+        function_cpp = f'nnet::conv{ndim}d_cl<config{node.index}, {inp_t}, {out_t}>({inp_name}, {out_name});'
+        node.attributes['function_cpp'] = function_cpp

        # config generation
        params = node.attributes.attributes.copy()
@@ -314,15 +316,15 @@ def transform(self, model: 'ModelGraph', node: Layer):
        params.setdefault('stride_height', -1 if ndim == 1 else 1)

        config_cpp = conv_da_parallel_template.format(inp_t=inp_t, out_t=out_t, n_pixels=n_pixels, **params)
-        node.attributes.attributes['config_cpp'] = config_cpp
+        node.attributes['config_cpp'] = config_cpp

        # Only unrolled header is required for io_parallel
        include_headers = [
            'nnet_utils/nnet_da_wrappers.h',
            f'nnet_utils/nnet_{class_name.lower()}.h',
            'nnet_utils/nnet_conv_stream.h',  # some properties defined in config need this
        ]
-        node.attributes.attributes['include_header'] = include_headers
+        node.attributes['include_header'] = include_headers

        # avoid output weights and bias; alternative entry point does not use them
        del node.attributes['weight_data']
@@ -333,6 +335,18 @@ def transform(self, model: 'ModelGraph', node: Layer):
        del node.attributes['bias_t']


+kernel_fn_template = '''
+template <typename inp_t, typename out_t>
+void einsum_dense{index}_da_kernel(
+    inp_t inp_tpose[{inp_tpose}],
+    out_t out_tpose[{out_tpose}],
+    int l0
+) {{
+    {fn_call_str}
+}}
+'''
+
+
class DistributedArithmeticEinsumCodegen(OptimizerPass):
    '''Generates C++ code for distributed arithmetic implementation of Dense layers'''
@@ -373,16 +387,13 @@ def transform(self, model: 'ModelGraph', node: Layer):
            fn_call = f'{fn_name}(&inp_tpose[({i} * {L_data} + l0) * {C}], &out_tpose[({i} * {L_data} + l0) * {L_ker}]);'
            fn_calls.append(fn_call)

-        kernel_fn = f'''
-        template <typename inp_t, typename out_t>
-        void einsum_dense{node.index}_da_kernel(
-            inp_t inp_tpose[{L_data * C * I}],
-            out_t out_tpose[{L_data * L_ker * I}],
-            int l0
-        ) {{
-            {" ".join(fn_calls)}
-        }}
-        '''
+        kernel_fn = kernel_fn_template.format(
+            index=node.index,
+            inp_tpose=L_data * C * I,
+            out_tpose=L_data * L_ker * I,
+            fn_call_str='\n    '.join(fn_calls),
+        )
+
        code_gen = '\n\n'.join(fn_strs) + '\n\n' + kernel_fn
        node.attributes['da_codegen'] = Source(code_gen)
        del node.attributes['weight_data']
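For comparison with the removed inline f-string, a hedged sketch of what `kernel_fn_template.format(...)` renders to for a hypothetical EinsumDense node; the index, sizes, and per-slice function name are made up for illustration, and `kernel_fn_template` is assumed to be in scope from this file.

```python
# Hypothetical: node.index = 7, L_data = 4, C = 3, L_ker = 2, I = 1, and one
# generated per-slice DA call (the function name is illustrative).
fn_calls = ['einsum_dense7_da_0(&inp_tpose[(0 * 4 + l0) * 3], &out_tpose[(0 * 4 + l0) * 2]);']
kernel_fn = kernel_fn_template.format(
    index=7,
    inp_tpose=4 * 3 * 1,  # L_data * C * I
    out_tpose=4 * 2 * 1,  # L_data * L_ker * I
    fn_call_str='\n    '.join(fn_calls),
)
print(kernel_fn)
# template <typename inp_t, typename out_t>
# void einsum_dense7_da_kernel(
#     inp_t inp_tpose[12],
#     out_t out_tpose[8],
#     int l0
# ) {
#     einsum_dense7_da_0(&inp_tpose[(0 * 4 + l0) * 3], &out_tpose[(0 * 4 + l0) * 2]);
# }
```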