Commit 0ea246c

Refactor matrix-multiplication kernel as a function pointer

1 parent f1a238d · commit 0ea246c

15 files changed: +362 −239 lines changed
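This commit replaces the per-config `resource_implementation` flag and `dense_unrolled` alias with a single compile-time "function pointer": each matrix-multiplication config struct now carries a `kernel` member alias naming the implementation to run, and every implementation (DenseLatency, the two DenseResource variants, and the generated dense_unrolled_<index> classes) exposes the same static dense() entry point via the DenseKernel base. A minimal sketch of the pattern follows; the DenseKernel/DenseLatency names and the `kernel` alias appear in the diff below, while the config values, the loop body, and the dispatching wrapper are illustrative assumptions.

```cpp
// Sketch only: how a config-selected kernel class acts as a compile-time
// function pointer. Not code from this commit.
template <class data_T, class res_T, typename CONFIG_T>
class DenseKernel {
  public:
    // Shared interface: every dense implementation provides static dense().
    static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
                      typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_out]);
};

template <class data_T, class res_T, typename CONFIG_T>
class DenseLatency : public DenseKernel<data_T, res_T, CONFIG_T> {
  public:
    static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
                      typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
        // Stand-in multiply-accumulate; the real implementation lives in the
        // nnet_utils headers and carries HLS pragmas.
        for (unsigned j = 0; j < CONFIG_T::n_out; j++) {
            typename CONFIG_T::accum_t acc = biases[j];
            for (unsigned i = 0; i < CONFIG_T::n_in; i++)
                acc += data[i] * weights[j * CONFIG_T::n_in + i];
            res[j] = acc;
        }
    }
};

// A generated config selects the implementation once, at compile time
// (values invented for the example).
struct config_example {
    static const unsigned n_in = 16;
    static const unsigned n_out = 8;
    typedef float accum_t;
    typedef float weight_t;
    typedef float bias_t;
    template <class data_T, class res_T, class CONFIG_T>
    using kernel = DenseLatency<data_T, res_T, CONFIG_T>;
};

// Call sites dispatch through the alias instead of branching on a flag.
template <class data_T, class res_T, typename CONFIG_T>
void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
           typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
           typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
    CONFIG_T::template kernel<data_T, res_T, CONFIG_T>::dense(data, res, weights, biases);
}
```

Swapping strategies then means regenerating one `using kernel = ...` line rather than threading a runtime flag through every template.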

hls4ml/backends/fpga/passes/codegen.py

Lines changed: 97 additions & 70 deletions

```diff
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from hls4ml.model.layers import Conv1D, Conv2D, Dense
+from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import Source
 
@@ -60,8 +60,8 @@ class GenerateUnrolledDenseResource(OptimizerPass):
 
     def match(self, node):
         # Only apply to layers use that use Dense Matrix Multiplication
-        # TODO - Extend (& test) for Conv1D / Separable Conv / Depthwise Conv / Recurrent layers
-        layers_with_dense = (Dense, Conv2D)
+        # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers
+        layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU)
 
         # Unrolled Dense mimicks the hardware implementation of Resource strategy -> apply after Resource optimizer
         weights_transposed = node.get_attr('_weights_transposed', False)
@@ -70,23 +70,43 @@ def match(self, node):
         rf_gt_one = node.get_attr('reuse_factor', 1) > 1
 
         # User requested unrolled implementation of Dense
-        is_unrolled = node.get_attr('dense_resource_implementation', 'standard') == 'unrolled'
+        is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled'
 
         return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled
 
     def transform(self, model, node):
-        code_str = self.__generate_unrolled_dense_resource(model, node)
-        node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
+        if isinstance(node, (LSTM, GRU)):
+            n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node)
 
-    def __generate_unrolled_dense_resource(self, model, node):
+            reuse_factor = node.get_attr('reuse_factor')
+            weights = node.weights['weight']
+            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1')
+            node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str))
+
+            recr_reuse_factor = node.get_attr('recurrent_reuse_factor')
+            recr_weights = node.weights['recurrent_weight']
+            code_str = self._generate_unrolled_function(
+                n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2'
+            )
+            node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str))
+
+        else:
+            n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
+            reuse_factor = node.get_attr('reuse_factor')
+            weights = node.weights['weight']
+
+            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index)
+            node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
+
+    def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
         """
         Generate a C++ function that mimics the Dense Resource implementation.
 
         The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
-        Latency strategy can optimize zero mutiplications
+        Latency strategy can optimize zero multiplications
         Resource strategy, on the other hand, cannot.
         When all the weights in the same BRAM block are zero, Vivado is unable to optimize it
-        With this (and additional TCL scripts) zero BRAM are optimised
+        With this (and additional TCL scripts) zero BRAM are optimized
 
         Args:
             node: Layer to generate code for
@@ -96,61 +116,58 @@ def __generate_unrolled_dense_resource(self, model, node):
 
         # Variable instantiation and function pragmas
        generated_code = (
-            "template<class data_T, class res_T, typename CONFIG_T>\n"
-            "class dense_unrolled_{index} : public DenseResourceUnrolled<data_T, res_T, CONFIG_T> {{\n"
-            "    public:\n"
-            "    static void dense_unrolled(\n"
-            "        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n"
-            "        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n"
-            "        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n"
-            "    ) {{\n"
-            "        #pragma HLS pipeline II=CONFIG_T::reuse_factor\n"
-            "\n"
-            "        constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n"
-            "        #pragma HLS function_instantiate variable=weights,biases\n"
-            "        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n"
-            "        #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n"
-            "        #pragma HLS ARRAY_PARTITION variable=biases complete\n"
-            "\n"
-            "        typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n"
-            "        #pragma HLS ARRAY_PARTITION variable=acc complete\n"
-            "\n"
-            "    InitAccum:\n"
-            "        for (int i = 0; i < CONFIG_T::n_out; i++) {{\n"
-            "            #pragma HLS UNROLL\n"
-            "            acc[i] = (typename CONFIG_T::accum_t) biases[i];\n"
-            "        }}\n"
-            "\n"
-        ).format(index=node.index)
+            'template<class data_T, class res_T, typename CONFIG_T>\n'
+            'class dense_unrolled_{suffix} : public DenseKernel<data_T, res_T, CONFIG_T> {{\n'
+            '    public:\n'
+            '    static void dense(\n'
+            '        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n'
+            '        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n'
+            '        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n'
+            '    ) {{\n'
+            '        #pragma HLS pipeline II=CONFIG_T::reuse_factor\n'
+            '\n'
+            '        constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n'
+            '        #pragma HLS function_instantiate variable=weights,biases\n'
+            '        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n'
+            '        #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n'
+            '        #pragma HLS ARRAY_PARTITION variable=biases complete\n'
+            '\n'
+            '        typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n'
+            '        #pragma HLS ARRAY_PARTITION variable=acc complete\n'
+            '\n'
+            '    InitAccum:\n'
+            '        for (int i = 0; i < CONFIG_T::n_out; i++) {{\n'
+            '            #pragma HLS UNROLL\n'
+            '            acc[i] = (typename CONFIG_T::accum_t) biases[i];\n'
+            '        }}\n'
+            '\n'
+        ).format(suffix=function_suffix)
 
         # Unrolled multiplication, according to the three cases
-        n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
-        reuse_factor = node.get_attr('reuse_factor')
-        weights = node.weights['weight']
         if reuse_factor <= n_in:
-            mult_code = self.__generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights)
+            mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights)
         elif reuse_factor > n_in and reuse_factor % n_in == 0:
-            mult_code = self.__generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
+            mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
         else:
             # This case shouldn't happen if my understanding of RF is correct
             # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
             raise Exception('Not implemented...')
 
         # Write output
-        generated_code += mult_code + "\n"
+        generated_code += mult_code + '\n'
         generated_code += (
-            "    Result:\n"
-            "    for (int i = 0; i < CONFIG_T::n_out; i++) {\n"
-            "        #pragma HLS UNROLL\n"
-            "        res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n"
-            "    }\n"
-            "    }\n"
-            "};\n"
+            '    Result:\n'
+            '        for (int i = 0; i < CONFIG_T::n_out; i++) {\n'
+            '            #pragma HLS UNROLL\n'
+            '            res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n'
+            '        }\n'
+            '    }\n'
+            '};\n'
         )
 
         return generated_code
 
-    def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
+    def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
         # Function constants
         mult_factor = min(n_in, reuse_factor)
         block_factor = int(math.ceil(n_in * n_out / reuse_factor))
@@ -162,24 +179,29 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
         # The new shape is (parallel_mult, reuse_factor)
         zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))
 
+        # Used to pad the code to make it human-readable
+        indent = '    '
+
         # Generate unrolled multiplications
-        mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n"
-        mult_code += "\t\tMULT: {\n"
-        mult_code += "\t\t\t#pragma HLS protocol\n"
+        mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n'
+        mult_code += f'{indent*2}MULT: {{\n'
+        mult_code += f'{indent*3}#pragma HLS protocol\n'
 
         for ir in range(reuse_factor):
             acc_step = 0
             out_index = 0
             w_index = ir
             in_index = ir
 
-            mult_code += f"\t\t\tM{ir}: {{\n"
+            mult_code += f'{indent*3}M{ir}: {{\n'
             for _ in range(block_factor):
                 if weights.data.flatten()[w_index] != 0:
-                    mult_code += f"\t\t\t\tacc[{out_index}] += \
-                        static_cast<typename CONFIG_T::accum_t>\
-                        (CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::\
-                        product(data[{in_index}], weights[{w_index}]));\n"
+                    mult_code += (
+                        f'{indent*4}acc[{out_index}] += '
+                        'static_cast<typename CONFIG_T::accum_t>'
+                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
+                        f'product(data[{in_index}], weights[{w_index}]));\n'
+                    )
 
                 w_index += reuse_factor
                 in_index += reuse_factor
@@ -191,13 +213,13 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
             else:
                 acc_step += 1
 
-            mult_code += "\t\t\t}\n"
+            mult_code += f'{indent*3}}}\n'
 
-        mult_code += "\t\t}\n"
+        mult_code += f'{indent*2}}}\n'
 
         return mult_code
 
-    def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
+    def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
         # Function constants
         mult_factor = min(n_in, reuse_factor)
         block_factor = int(math.ceil(n_in * n_out / reuse_factor))
@@ -208,6 +230,9 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
         # The new shape is (parallel_mult, reuse_factor)
         zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))
 
+        # Used to pad the code to make it human-readable
+        indent = '    '
+
         # Generate out indices
         outidx = [0] * reuse_factor
         outstep = 0
@@ -221,32 +246,34 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
         in_index = 0
 
         # Generate unrolled multiplications
-        mult_code = f"\t\t#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n"
-        mult_code += "\t\tMULT: {\n"
-        mult_code += "\t\t\t#pragma HLS protocol\n"
+        mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n'
+        mult_code += f'{indent*2}MULT: {{\n'
+        mult_code += f'{indent*3}#pragma HLS protocol\n'
 
         for ir in range(reuse_factor):
             w_index = ir
             out_index = outidx[ir]
 
-            mult_code += f"\t\t\tM{ir}: {{\n"
+            mult_code += f'{indent*3}M{ir}: {{\n'
             for _ in range(block_factor):
                 if weights.data.flatten()[w_index] != 0:
-                    mult_code += f"\t\t\t\tacc[{int(out_index)}] += \
-                        static_cast<typename CONFIG_T::accum_t>\
-                        (CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::\
-                        product(data[{in_index}], weights[{w_index}]));\n"
+                    mult_code += (
+                        f'{indent*4}acc[{int(out_index)}] += '
+                        'static_cast<typename CONFIG_T::accum_t>'
+                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
+                        f'product(data[{in_index}], weights[{w_index}]));\n'
+                    )
 
                 w_index += reuse_factor
                 if w_index > n_in * n_out:
                     break
                 out_index += outscale
-            mult_code += "\t\t\t}\n"
+            mult_code += f'{indent*3}}}\n'
 
             in_index += 1
             if in_index >= n_in:
                 in_index = 0
 
-        mult_code += "\t\t}\n"
+        mult_code += f'{indent*2}}}\n'
 
         return mult_code
```
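For LSTM/GRU layers the pass above now emits two generated classes per layer (suffixes `_1` and `_2`, covering the input and recurrent matrix multiplications); other layers get one. Below is a compilable toy sketch, with invented sizes and plain C++ standing in for the nnet:: helpers and HLS pragmas, of what one generated body computes for n_in = 2, n_out = 2, reuse_factor = 2 when a weight is zero: the product statement is simply never emitted, which is what lets the zeroed DSPs and BRAM blocks be pruned downstream.

```cpp
// Toy rendering of a generated unrolled kernel body (grouping and the
// transposed layout weights[out * n_in + in] are assumptions for this
// example; this is not output taken verbatim from the commit).
#include <cstdio>

struct toy_config {
    static const unsigned n_in = 2;
    static const unsigned n_out = 2;
    static const unsigned reuse_factor = 2;
    typedef float accum_t;
};

template <class CONFIG_T>
void dense_unrolled_toy(const float data[CONFIG_T::n_in], float res[CONFIG_T::n_out],
                        const float weights[CONFIG_T::n_in * CONFIG_T::n_out],
                        const float biases[CONFIG_T::n_out]) {
    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
    for (unsigned i = 0; i < CONFIG_T::n_out; i++)
        acc[i] = biases[i];  // InitAccum

    // RF = 2 splits the 4 products into groups M0/M1 of
    // block_factor = n_in * n_out / RF = 2 statements each.
    // M0:
    acc[0] += static_cast<typename CONFIG_T::accum_t>(data[0] * weights[0]);
    acc[1] += static_cast<typename CONFIG_T::accum_t>(data[0] * weights[2]);
    // M1: weights[1] == 0 in this example, so the generator skips
    // "acc[0] += data[1] * weights[1]" entirely.
    acc[1] += static_cast<typename CONFIG_T::accum_t>(data[1] * weights[3]);

    for (unsigned i = 0; i < CONFIG_T::n_out; i++)
        res[i] = acc[i];  // Result (the real code routes through cast<>)
}

int main() {
    const float data[2] = {1.0f, 2.0f};
    const float weights[4] = {0.5f, 0.0f, -1.0f, 0.25f};  // weights[1] == 0
    const float biases[2] = {0.0f, 0.0f};
    float res[2];
    dense_unrolled_toy<toy_config>(data, res, weights, biases);
    std::printf("res = {%g, %g}\n", res[0], res[1]);  // res = {0.5, -0.5}
    return 0;
}
```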

hls4ml/backends/vivado/passes/convolution_templates.py

Lines changed: 26 additions & 30 deletions

```diff
@@ -17,14 +17,13 @@
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
     static const unsigned strategy = nnet::{strategy};
-    static const unsigned resource_implementation = nnet::{dense_resource_implementation};
-    template<class data_T, class res_T, class CONFIG_T>
-    using dense_unrolled = nnet::{unrolled_function}<data_T, res_T, CONFIG_T>;
     static const unsigned n_zeros = {nzeros};
     static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
+    template<class data_T, class res_T, class CONFIG_T>
+    using kernel = nnet::{dense_function}<data_T, res_T, CONFIG_T>;
     template<class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
@@ -49,9 +48,6 @@
     static const bool store_weights_in_bram = false;
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
-    static const unsigned resource_implementation = nnet::{dense_resource_implementation};
-    template<class data_T, class res_T, class CONFIG_T>
-    using dense_unrolled = nnet::{unrolled_function}<data_T, res_T, CONFIG_T>;
     static const unsigned min_width = {min_width};
     static const ap_uint<filt_width> pixels[min_width];
     static const unsigned n_partitions = {n_partitions};
@@ -96,8 +92,6 @@ def format(self, node):
             params['fill_fn'] = f'fill_buffer_{node.index}'
         else:
             params['fill_fn'] = 'FillConv1DBuffer'
-        # TODO - Extend unrolled Dense Resource to Conv1D
-        params['unrolled_function'] = 'DenseResourceUnrolled'
 
         conv_config = self.template.format(**params)
 
@@ -108,8 +102,18 @@ def format(self, node):
         mult_params['product_type'] = get_backend('vivado').product_type(
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
-        # TODO - Extend unrolled Dense Resource to Conv1D
-        mult_params['unrolled_function'] = 'DenseResourceUnrolled'
+
+        if node.get_attr('strategy').lower() == 'latency':
+            mult_params['dense_function'] = 'DenseLatency'
+        elif node.get_attr('strategy').lower() == 'resource':
+            if int(mult_params['reuse_factor']) <= int(mult_params['n_in']):
+                mult_params['dense_function'] = 'DenseResource_rf_leq_nin'
+            else:
+                mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
+            # The 3rd case is never used
+        elif node.get_attr('strategy').lower() == 'unrolled':
+            mult_params['dense_function'] = f'dense_unrolled_{node.index}'
+
         mult_config = self.mult_template.format(**mult_params)
 
         return mult_config + '\n' + conv_config
@@ -160,9 +164,6 @@ def __init__(self):
     static const bool store_weights_in_bram = false;
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
-    static const unsigned resource_implementation = nnet::{dense_resource_implementation};
-    template<class data_T, class res_T, class CONFIG_T>
-    using dense_unrolled = nnet::{unrolled_function}<data_T, res_T, CONFIG_T>;
     static const unsigned min_height = {min_height};
     static const unsigned min_width = {min_width};
     static const ap_uint<filt_height * filt_width> pixels[min_height * min_width];
@@ -217,15 +218,6 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv2DBuffer'
 
-        if (
-            node.get_attr('dense_resource_implementation', 'standard') == 'unrolled'
-            and node.get_attr('strategy').lower() == 'resource'
-            and node.get_attr('reuse_factor') > 1
-        ):
-            params['unrolled_function'] = f'dense_unrolled_{node.index}'
-        else:
-            params['unrolled_function'] = 'DenseResourceUnrolled'
-
         conv_config = self.template.format(**params)
 
         mult_params = self._default_config_params(node)
@@ -235,14 +227,18 @@ def format(self, node):
         mult_params['product_type'] = get_backend('vivado').product_type(
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
-        if (
-            node.get_attr('dense_resource_implementation', 'standard') == 'unrolled'
-            and node.get_attr('strategy').lower() == 'resource'
-            and node.get_attr('reuse_factor') > 1
-        ):
-            mult_params['unrolled_function'] = f'dense_unrolled_{node.index}'
-        else:
-            mult_params['unrolled_function'] = 'DenseResourceUnrolled'
+
+        if node.get_attr('strategy').lower() == 'latency':
+            mult_params['dense_function'] = 'DenseLatency'
+        elif node.get_attr('strategy').lower() == 'resource':
+            if int(mult_params['reuse_factor']) <= int(mult_params['n_in']):
+                mult_params['dense_function'] = 'DenseResource_rf_leq_nin'
+            else:
+                mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
+            # The 3rd case is never used
+        elif node.get_attr('strategy').lower() == 'unrolled':
+            mult_params['dense_function'] = f'dense_unrolled_{node.index}'
+
         mult_config = self.mult_template.format(**mult_params)
 
         return mult_config + '\n' + conv_config
```
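With the flag and the dedicated `dense_unrolled` alias gone, selecting an implementation is just a matter of which name the backend writes into `{dense_function}`. A formatted multiplier config for a hypothetical layer with index 4 under the Resource strategy with RF ≤ n_in would then read roughly as below (sizes and ap_fixed types invented for the example):

```cpp
// Illustrative rendering of the updated mult template; not verbatim output.
struct config4_mult {
    static const unsigned n_in = 72;
    static const unsigned n_out = 16;
    static const unsigned reuse_factor = 8;
    static const unsigned strategy = nnet::resource;
    static const unsigned n_zeros = 0;
    static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor;
    typedef ap_fixed<16, 6> accum_t;
    typedef ap_fixed<16, 6> bias_t;
    typedef ap_fixed<16, 6> weight_t;
    template<class data_T, class res_T, class CONFIG_T>
    using kernel = nnet::DenseResource_rf_leq_nin<data_T, res_T, CONFIG_T>;
    template<class x_T, class y_T>
    using product = nnet::product::mult<x_T, y_T>;
};
```

The conv wrapper then instantiates `config4_mult::kernel<...>::dense(...)` without needing to know which variant was chosen.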
