fastmachinelearning
diff --git a/‎.github/PULL_REQUEST_TEMPLATE.md
Lines changed: 1 addition & 1 deletion b/‎.github/PULL_REQUEST_TEMPLATE.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml
Lines changed: 10 additions & 10 deletions b/‎.pre-commit-config.yaml
Lines changed: 10 additions & 10 deletions
diff --git a/‎MANIFEST.in
Lines changed: 4 additions & 2 deletions b/‎MANIFEST.in
Lines changed: 4 additions & 2 deletions
diff --git a/‎README.md
Lines changed: 1 addition & 1 deletion b/‎README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/advanced/fifo_depth.rst
Lines changed: 23 additions & 8 deletions b/‎docs/advanced/fifo_depth.rst
Lines changed: 23 additions & 8 deletions
diff --git a/‎example-models b/‎example-models
diff --git a/‎hls4ml/__init__.py
Lines changed: 0 additions & 30 deletions b/‎hls4ml/__init__.py
Lines changed: 0 additions & 30 deletions
diff --git a/‎hls4ml/backends/catapult/passes/conv_stream.py
Lines changed: 26 additions & 31 deletions b/‎hls4ml/backends/catapult/passes/conv_stream.py
Lines changed: 26 additions & 31 deletions
diff --git a/‎hls4ml/backends/catapult/passes/convolution_templates.py
Lines changed: 7 additions & 0 deletions b/‎hls4ml/backends/catapult/passes/convolution_templates.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎hls4ml/backends/fpga/fpga_backend.py
Lines changed: 30 additions & 3 deletions b/‎hls4ml/backends/fpga/fpga_backend.py
Lines changed: 30 additions & 3 deletions
diff --git a/‎hls4ml/backends/fpga/fpga_layers.py
Lines changed: 6 additions & 4 deletions b/‎hls4ml/backends/fpga/fpga_layers.py
Lines changed: 6 additions & 4 deletions
diff --git a/‎hls4ml/backends/fpga/passes/hgq_proxy_model.py
Lines changed: 3 additions & 1 deletion b/‎hls4ml/backends/fpga/passes/hgq_proxy_model.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎hls4ml/backends/oneapi/oneapi_backend.py
Lines changed: 18 additions & 1 deletion b/‎hls4ml/backends/oneapi/oneapi_backend.py
Lines changed: 18 additions & 1 deletion
diff --git a/‎hls4ml/backends/oneapi/passes/clone_templates.py
Lines changed: 1 addition & 2 deletions b/‎hls4ml/backends/oneapi/passes/clone_templates.py
Lines changed: 1 addition & 2 deletions
@@ -1,4 +1,4 @@
-A# Description
+# Description
 
 > :memo: Please include a summary of the change.
 >
 
@@ -2,20 +2,26 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte
 
 repos:
 - repo: https://github.com/psf/black
-  rev: 24.10.0
+  rev: 25.1.0
   hooks:
   - id: black
     language_version: python3
     args: ['--line-length=125',
            '--skip-string-normalization']
 
+- repo: https://github.com/tox-dev/pyproject-fmt
+  rev: v2.5.1
+  hooks:
+    - id: pyproject-fmt
+
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v5.0.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
   - id: check-merge-conflict
   - id: check-symlinks
+  - id: check-toml
   - id: check-yaml
   - id: debug-statements
   - id: end-of-file-fixer
@@ -24,24 +30,18 @@ repos:
   - id: trailing-whitespace
 
 - repo: https://github.com/PyCQA/isort
-  rev: 5.13.2
+  rev: 6.0.1
   hooks:
   - id: isort
-    args: ["--profile", "black", --line-length=125]
 
 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.19.0
+  rev: v3.19.1
   hooks:
   - id: pyupgrade
     args: ["--py36-plus"]
 
-- repo: https://github.com/asottile/setup-cfg-fmt
-  rev: v2.7.0
-  hooks:
-  - id: setup-cfg-fmt
-
 - repo: https://github.com/pycqa/flake8
-  rev: 7.1.1
+  rev: 7.1.2
   hooks:
   - id: flake8
     exclude: docs/conf.py
 
@@ -1,7 +1,9 @@
-include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml setup.py setup.cfg .clang-format
+include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml .clang-format
 graft example-models
 graft test
 graft contrib
 recursive-include hls4ml/templates *
-global-exclude .git .gitmodules .gitlab-ci.yml
+recursive-include hls4ml *.py
+recursive-include hls4ml/contrib *
+global-exclude .git .gitmodules .gitlab-ci.yml *.pyc
 include hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -65,7 +65,7 @@ hls4ml.report.read_vivado_report('my-hls-test')
 
 # FAQ
 
-List of frequently asked questions and common HLS synthesis can be found [here](https://fastmachinelearning.org/hls4ml/faq.html)
+A list of frequently asked questions and common HLS synthesis issues can be found [here](https://fastmachinelearning.org/hls4ml/intro/faq.html)
 
 # Citation
 If you use this software in a publication, please cite the software
 
@@ -5,28 +5,29 @@ FIFO Buffer Depth Optimization
 With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers.
 The implementation of the FIFO buffers contribute to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization.
 Because the neural networks can have complex architectures generally, it is hard to know a priori the correct depth of each FIFO buffer.
-By default ``hls4ml`` choses the most conservative possible depth for each FIFO buffer, which can result in a an unnecessary overutilization of resources.
+By default ``hls4ml`` chooses the most conservative possible depth for each FIFO buffer, which can result in an unnecessary over-utilization of resources.
 
-In order to reduce the impact on the resources used for FIFO buffer implementation, an optimization has been developed in `#509 <https://github.com/fastmachinelearning/hls4ml/pull/509>`_ that correctly sizes the depth of the FIFO buffers by analyzing the RTL cosimulation.
-We implemented this FIFO buffer resizing as a :py:class:`~hls4ml.backends.vivado.passes.fifo_depth_optimization` optimizer pass.
+In order to reduce the impact on the resources used for FIFO buffer implementation, an optimization flow has been developed that correctly sizes the depth
+of the FIFO buffers by analyzing the RTL co-simulation. This feature is currently available in ``Vitis`` and ``Vivado`` backends.
+
+In ``Vivado`` backend, FIFO buffer resizing is implemented as a :py:class:`~hls4ml.backends.vivado.passes.fifo_depth_optimization` optimizer pass.
 Through RTL simulation with large FIFO buffers (by default set to a depth of 100,000), we estimate the maximum occupation of each FIFO.
 Once the maximum depth is determined, the optimizer pass sets the FIFO buffer depth to that value plus 1.
 
-As an example, we show below how to use the optimizer pass, inspired by this `GitHub Gist <https://gist.github.com/nicologhielmetti/3a268be32755448920e9f7d5c78a76d8>`_.
-First, we can define a simple neural network in Keras
+Below we show an example of the use of the FIFO depth optimization. First, we can define a simple neural network in Keras:
 
 .. code-block:: Python
 
     from tensorflow.keras.layers import Dense
     from tensorflow.keras.models import Sequential
 
     model = Sequential()
-    model.add(Dense(64, input_shape=(16,), name='fc1', activation='relu')
+    model.add(Dense(64, input_shape=(16,), name='fc1', activation='relu'))
     model.add(Dense(32, name='fc2', activation='relu'))
     model.add(Dense(32, name='fc3', activation='relu'))
-    model.add(Dense(5, name='fc3', activation='softmax'))
+    model.add(Dense(5, name='fc4', activation='softmax'))
 
-Then, we can convert the model, including the flow
+Then, we can convert the model, including the flow:
 
 .. code-block:: Python
 
@@ -47,3 +48,17 @@ Then, we can convert the model, including the flow
     hls_model.build(reset=False, csim=True, synth=True, cosim=True)
 
 For more details and results, see `H. Borras et al., "Open-source FPGA-ML codesign for the MLPerf Tiny Benchmark" (2022) <https://arxiv.org/abs/2206.11791>`_.
+
+Similarly, the FIFO buffers can be optimized while using the ``Vitis`` backend with the following changes:
+
+.. code-block:: Python
+
+    config['Flows'] = ['vitis:fifo_depth_optimization']
+    hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure(profiling_fifo_depth=100_000)
+
+    hls_model = hls4ml.converters.convert_from_keras_model(model,
+                                                        io_type='io_stream',
+                                                        hls_config=config,
+                                                        output_dir='hls4mlprj_fifo_depth_opt',
+                                                        part='xc7z020clg400-1',
+                                                        backend='Vitis')
@@ -1,33 +1,3 @@
-# Temporary workaround for QKeras installation requirement, will be removed after 1.0.0
-def maybe_install_qkeras():
-    import subprocess
-    import sys
-
-    QKERAS_PKG_NAME = 'QKeras'
-    # QKERAS_PKG_SOURCE = QKERAS_PKG_NAME
-    QKERAS_PKG_SOURCE = 'qkeras@git+https://github.com/fastmachinelearning/qkeras.git'
-
-    def pip_list():
-        p = subprocess.run([sys.executable, '-m', 'pip', 'list'], check=True, capture_output=True)
-        return p.stdout.decode()
-
-    def pip_install(package):
-        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
-
-    all_pkgs = pip_list()
-    if QKERAS_PKG_NAME not in all_pkgs:
-        print('QKeras installation not found, installing one...')
-        pip_install(QKERAS_PKG_SOURCE)
-        print('QKeras installed.')
-
-
-try:
-    maybe_install_qkeras()
-except Exception:
-    print('Could not find QKeras installation, make sure you have QKeras installed.')
-
-# End of workaround
-
 from hls4ml import converters, report, utils  # noqa: F401, E402
 
 try:
 
@@ -6,7 +6,12 @@ class GenerateConvStreamingInstructions(OptimizerPass):
     '''Generates the instructions for streaming implementation of CNNs'''
 
     def match(self, node):
-        return isinstance(node, (Conv1D, SeparableConv1D, Conv2D, SeparableConv2D))
+        is_match = (
+            isinstance(node, (Conv1D, SeparableConv1D, Conv2D, SeparableConv2D))
+            and node.model.config.get_config_value('IOType').lower() == 'io_stream'
+            and node.get_attr('implementation').lower() == 'encoded'
+        )
+        return is_match
 
     def transform(self, model, node):
         node_class = node.__class__.__name__
@@ -18,35 +23,25 @@ def transform(self, model, node):
             raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
 
     def _generate_1d_instructions(self, node):
-        if node.model.config.get_config_value('IOType') == 'io_stream':
-            min_w, instructions = node.model.config.backend.compute_conv1d_instructions(
-                node.get_input_variable().shape[0],
-                node.get_input_variable().shape[1],
-                node.get_attr('filt_width'),
-                node.get_attr('stride_width'),
-            )
-            instructions_str = ','.join(str(i) for i in instructions)
-            node.set_attr('min_width', min_w)
-            node.set_attr('instructions', instructions_str)
-        else:
-            # these are unused; just put dummy values
-            node.set_attr('min_width', node.get_attr('in_width'))
-            node.set_attr('instructions', '0')
+        min_w, instructions = node.model.config.backend.compute_conv1d_instructions(
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_attr('filt_width'),
+            node.get_attr('stride_width'),
+        )
+        instructions_str = ','.join(str(i) for i in instructions)
+        node.set_attr('min_width', min_w)
+        node.set_attr('instructions', instructions_str)
 
     def _generate_2d_instructions(self, node):
-        if node.model.config.get_config_value('IOType') == 'io_stream':
-            min_h, min_w, instructions = node.model.config.backend.compute_conv2d_instructions(
-                node.get_input_variable().shape[0],
-                node.get_input_variable().shape[1],
-                node.get_input_variable().shape[2],
-                node.get_attr('filt_height'),
-                node.get_attr('stride_height'),
-            )
-            instructions_str = ','.join(str(i) for i in instructions)
-            node.set_attr('min_height', min_h)
-            node.set_attr('min_width', min_w)
-            node.set_attr('instructions', instructions_str)
-        else:
-            node.set_attr('min_height', node.get_attr('in_height'))
-            node.set_attr('min_width', node.get_attr('in_width'))
-            node.set_attr('instructions', '0')
+        min_h, min_w, instructions = node.model.config.backend.compute_conv2d_instructions(
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            node.get_attr('filt_height'),
+            node.get_attr('stride_height'),
+        )
+        instructions_str = ','.join(str(i) for i in instructions)
+        node.set_attr('min_height', min_h)
+        node.set_attr('min_width', min_w)
+        node.set_attr('instructions', instructions_str)
@@ -94,6 +94,9 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv1DBuffer'
 
+        params['min_width'] = node.get_attr('min_width', node.get_attr('in_width'))
+        params['instructions'] = node.get_attr('instructions', '0')
+
         conv_config = self.template.format(**params)
 
         mult_params = self._default_config_params(node)
@@ -210,6 +213,10 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv2DBuffer'
 
+        params['min_height'] = node.get_attr('min_height', node.get_attr('in_height'))
+        params['min_width'] = node.get_attr('min_width', node.get_attr('in_width'))
+        params['instructions'] = node.get_attr('instructions', '0')
+
         conv_config = self.template.format(**params)
 
         mult_params = self._default_config_params(node)
 
@@ -94,7 +94,7 @@ def __init__(self, name):
             attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor))
             self.attribute_map[layer] = attrs
 
-        # seperable is kind of special because it is effectively two layers that will be split
+        # separable is kind of special because it is effectively two layers that will be split
         for layer in (SeparableConv1D, SeparableConv2D):
             attrs = self.attribute_map.get(layer, [])
             attrs.append(TypeAttribute('depthwise_accum'))
@@ -755,7 +755,7 @@ def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, ke
 
         generated_code = (
             "template<class data_T, typename CONFIG_T>\n"
-            "class fill_buffer_{index} : public FillConv1DBuffer<data_T, CONFIG_T> {{\n"
+            "class fill_buffer_{index} : public nnet::FillConv1DBuffer<data_T, CONFIG_T> {{\n"
             "    public:\n"
             "    static void fill_buffer(\n"
             "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
@@ -885,7 +885,7 @@ def generate_conv2d_line_buffer_fn(
 
         generated_code = (
             "template<class data_T, typename CONFIG_T>\n"
-            "class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
+            "class fill_buffer_{index} : public nnet::FillConv2DBuffer<data_T, CONFIG_T> {{\n"
             "    public:\n"
             "    static void fill_buffer(\n"
             "        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
@@ -913,6 +913,33 @@ def generate_conv2d_line_buffer_fn(
 
         return generated_code
 
+    @staticmethod
+    def permute_config_gen(name: str, shape: tuple[int, ...], perm: tuple[int, ...]):
+        """
+        Generate new shape and perm_strides for a permute operation. Operates by mapping the output index
+        to the input index by:
+        - unravel the output index
+        - map each dimension to the corresponding stride in the input tensor, sum
+        The operation can be expressed as:
+
+        new_shape = tuple(shape[i] for i in perm)
+        strides = np.cumprod((shape[1:] + (1,))[::-1])[::-1]
+        perm_strides = [strides[i] for i in perm]
+        out[index] = inp[np.dot(np.unravel_index(index, new_shape), perm_strides)]
+
+        Args:
+            name (str): The name of the configuration.
+            shape (tuple[int, ...]): The shape of the input tensor.
+            perm (tuple[int, ...]): The permutation of the dimensions.
+
+        Returns:
+            (new_shape, perm_strides) (tuple, tuple):  the output shape and permutation strides.
+        """
+        new_shape = tuple(shape[i] for i in perm)
+        strides = np.cumprod((shape[1:] + (1,))[::-1])[::-1]
+        perm_strides = tuple(int(strides[i]) for i in perm)
+        return (new_shape, perm_strides)
+
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
 
@@ -73,12 +73,14 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5):
 class PointwiseConv1D(Conv1D):
     '''Optimized Conv1D implementation for 1x1 kernels.'''
 
-    # Nothing to do, will pick up function and config from class name
-    pass
+    def initialize(self):
+        # Do nothing, values copied
+        pass
 
 
 class PointwiseConv2D(Conv2D):
     '''Optimized Conv2D implementation for 1x1 kernels.'''
 
-    # Nothing to do, will pick up function and config from class name
-    pass
+    def initialize(self):
+        # Do nothing, values copied
+        pass
@@ -75,10 +75,12 @@ def transform(self, model, node: FixedPointQuantizer):
 class ProcessFixedPointQuantizerCall(FunctionCallTemplate):
     def __init__(self):
         super().__init__(FixedPointQuantizer, include_header=[])
-        self.template = 'nnet::{name}<{input_t}, {output_t}>({input}, {output});'
+        self.template = '{namespace}::{name}<{input_t}, {output_t}>({input}, {output});'
 
     def format(self, node):
         params = self._default_function_params(node)
+        namespace = node.model.config.writer_config.get('Namespace', None) or 'nnet'
+        params['namespace'] = namespace
 
         return self.template.format(**params)
 
 
@@ -129,13 +129,30 @@ def get_default_flow(self):
     def get_writer_flow(self):
         return self._writer_flow
 
-    def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'):
+    def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, **_):
+        """Create initial configuration of the oneAPI backend.
+
+        Args:
+            part (str, optional): The FPGA part to be used. Defaults to 'Arria10'.
+            clock_period (int, optional): The clock period. Defaults to 5.
+            io_type (str, optional): Type of implementation used. One of
+                'io_parallel' or 'io_stream'. Defaults to 'io_parallel'.
+            write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False.
+
+        Returns:
+            dict: initial configuration.
+        """
+
         config = {}
 
         config['Part'] = part if part is not None else 'Arria10'
         config['ClockPeriod'] = clock_period
         config['IOType'] = io_type
         config['HLSConfig'] = {}
+        config['WriterConfig'] = {
+            # TODO:  add namespace
+            'WriteTar': write_tar,
+        }
 
         return config
 
 
@@ -1,5 +1,4 @@
-""" The clone templates in the fpga backend are not enough for oneAPI, so this adds the missing parts
-"""
+"""The clone templates in the fpga backend are not enough for oneAPI, so this adds the missing parts"""
 
 from hls4ml.backends.fpga.passes.clone import Clone
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-A# Description`
	`1`	`+# Description`
`2`	`2`
`3`	`3`	`> :memo: Please include a summary of the change.`
`4`	`4`	`>`