SepConv1d/2d for io_parallel w/ Latency strategy

vloncar · vloncar · commit 0a866adba513 · 2024-05-13T19:09:47.000+02:00
diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py
@@ -1,4 +1,4 @@
-from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import Source
 
@@ -7,16 +7,27 @@ class GenerateConvIm2col(OptimizerPass):
     '''Generates tcode for im2col step of 1D/2d convolution'''
 
     def match(self, node):
-        return isinstance(node, (Conv1D, Conv2D)) and node.model.config.get_config_value('IOType') == 'io_parallel'
+        return (
+            isinstance(node, (Conv1D, Conv2D, SeparableConv1D, SeparableConv2D))
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
 
     def transform(self, model, node):
-        node_class = node.__class__.__name__
-        if '1D' in node_class:
-            self._generate_im2col_1d(node)
-        elif '2D' in node_class:
-            self._generate_im2col_2d(node)
+        node_class = node.class_name
+        if 'Separable' in node_class:
+            if '1D' in node_class:
+                self._generate_separable_im2col_1d(node)
+            elif '2D' in node_class:
+                self._generate_separable_im2col_2d(node)
+            else:
+                raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
         else:
-            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
+            if '1D' in node_class:
+                self._generate_im2col_1d(node)
+            elif '2D' in node_class:
+                self._generate_im2col_2d(node)
+            else:
+                raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
 
     def _generate_im2col_1d(self, node):
         code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
@@ -49,3 +60,56 @@ def _generate_im2col_2d(self, node):
         )
 
         node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_separable_im2col_1d(self, node):
+        '''Generates line buffer code for the depthwise and pointwise steps of 1D separable convolution'''
+        dw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            str(node.get_attr('index')) + '_dw',
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=node.get_attr('filt_width'),
+            stride=node.get_attr('stride_width'),
+            pad=(node.get_attr('pad_left'), node.get_attr('pad_right')),
+        )
+        node.set_attr('dw_line_buffer_codegen', Source(dw_code_str))
+
+        # Pointwise step runs on the depthwise result: output width, input channel count (same as the 2D variant)
+        pw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            str(node.get_attr('index')) + '_pw',
+            node.get_attr('n_partitions'),
+            node.get_output_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=1,
+        )
+        node.set_attr('pw_line_buffer_codegen', Source(pw_code_str))
+
+    def _generate_separable_im2col_2d(self, node):
+        '''Generates line buffer code for the depthwise and pointwise steps of 2D separable convolution'''
+        dw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            str(node.get_attr('index')) + '_dw',
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+            pad=(
+                node.get_attr('pad_top'),
+                node.get_attr('pad_bottom'),
+                node.get_attr('pad_left'),
+                node.get_attr('pad_right'),
+            ),
+        )
+        node.set_attr('dw_line_buffer_codegen', Source(dw_code_str))
+
+        # Pointwise step: 1x1 kernel over the output height/width, with the channel count taken from the input
+        pw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            str(node.get_attr('index')) + '_pw',
+            node.get_attr('n_partitions'),
+            node.get_output_variable().shape[0],
+            node.get_output_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(1, 1),
+        )
+        node.set_attr('pw_line_buffer_codegen', Source(pw_code_str))
diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -254,8 +254,8 @@ def __init__(self):
     '{input}, {output}, {d}, {p}, {z}, {b});'
 )
 
-sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h']
-sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h']
+sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h']
+sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h']
 
 
 class SeparableConv1DConfigTemplate(LayerConfigTemplate):
@@ -286,7 +286,10 @@ def format(self, node):
         params['index'] = str(node.index) + '_depthwise'
         params['weight_t'] = node.get_weights('depthwise').type
         params['bias_t'] = node.get_weights('zero_bias').type
-        params['fill_fn'] = 'FillConv1DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_dw'
+        else:
+            params['fill_fn'] = 'FillConv1DBuffer'
 
         if node.get_attr('unscaled'):
             params['scale_index_type'] = 'scale_index_unscaled'
@@ -323,7 +326,10 @@ def format(self, node):
         params['weight_t'] = node.get_weights('pointwise').type
         params['min_width'] = params['in_width']
         params['instructions'] = '0'
-        params['fill_fn'] = 'FillConv1DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_pw'  # pointwise step has its own '_pw' line buffer
+        else:
+            params['fill_fn'] = 'FillConv1DBuffer'
 
         if node.get_attr('unscaled'):
             params['scale_index_type'] = 'scale_index_unscaled'
@@ -402,7 +408,10 @@ def format(self, node):
         params['nzeros'] = node.get_weights('depthwise').nzeros
         params['index'] = str(node.index) + '_depthwise'
         params['weight_t'] = node.get_weights('depthwise').type
-        params['fill_fn'] = 'FillConv2DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_dw'
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
 
         if node.get_attr('unscaled_h'):
             params['scale_index_height_type'] = 'scale_index_unscaled'
@@ -447,7 +456,10 @@ def format(self, node):
         params['min_height'] = params['in_height']
         params['min_width'] = params['in_width']
         params['instructions'] = '0'
-        params['fill_fn'] = 'FillConv2DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_pw'
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
 
         if node.get_attr('unscaled_h'):
             params['scale_index_height_type'] = 'scale_index_unscaled'
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h
@@ -0,0 +1,46 @@
+#ifndef NNET_SEPARABLE_CONV1D_H_
+#define NNET_SEPARABLE_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d.h"
+#include "nnet_sepconv1d_latency.h"
+//#include "nnet_sepconv1d_resource.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) { // Latency is the only strategy with an implementation here
+        depthwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else { // any other strategy computes nothing and only trips the assert below
+        assert("Resource strategy for DepthwiseConv1D is not supported." && false);
+    }
+}
+
+template <class data_T, class dw_res_T, class res_T, typename CONFIG_T>
+void separable_conv_1d_cl(data_T data[CONFIG_T::depthwise_config::in_width * CONFIG_T::depthwise_config::n_chan],
+                          res_T res[CONFIG_T::pointwise_config::out_width * CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::weight_t
+                              depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::weight_t
+                              pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) {
+    #pragma HLS INLINE recursive
+    // Intermediate buffer (element type dw_res_T) that carries the depthwise result into the pointwise step
+    dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_width * CONFIG_T::depthwise_config::n_filt];
+    // Depthwise step first, then the pointwise step on its result
+    depthwise_conv_1d_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res, depthwise_weights,
+                                                                                depthwise_biases);
+    pointwise_conv_1d_cl<dw_res_T, res_T, typename CONFIG_T::pointwise_config>(depthwise_res, res, pointwise_weights,
+                                                                               pointwise_biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h
@@ -0,0 +1,86 @@
+#ifndef NNET_SEPARABLE_CONV1D_LATENCY_H_
+#define NNET_SEPARABLE_CONV1D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                                  typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan],
+                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    // Latency-strategy depthwise 1D convolution: per partition, fill data_buf, then multiply-accumulate each pixel
+    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; // products computed per pixel
+    constexpr unsigned mult_n_acc = CONFIG_T::filt_width;                   // products summed into each output
+    constexpr unsigned mult_n_out = CONFIG_T::n_filt;                       // outputs written per pixel
+
+    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0
+
+    typename CONFIG_T::accum_t mult[mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    typename CONFIG_T::accum_t acc[mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    #pragma HLS ARRAY_PARTITION variable=weights complete
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    // Limit multipliers to control parallelization
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit
+
+PartitionLoop:
+    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind
+
+        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);
+
+    PixelLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+
+            data_T cache;
+
+        // Multiply every buffered input of this pixel by the weight at the same index
+        Product:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+                cache = data_buf[i_pxl][i_in];
+                mult[i_in] =
+                    CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
+                        cache, weights[i_in]);
+            }
+
+        // Initialize accumulator with input biases
+        ResetAccum:
+            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
+                #pragma HLS UNROLL
+                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
+            }
+
+        // Accumulate: output i_out sums mult[i_in * mult_n_out + i_out] over the mult_n_acc filter positions
+        Accum1:
+            for (int i_in = 0; i_in < mult_n_acc; i_in++) {
+                #pragma HLS UNROLL
+            Accum2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    acc[i_out] += mult[i_in * mult_n_out + i_out];
+                }
+            }
+
+        // Cast to "res_t" type and append to the output; res advances by mult_n_out entries per pixel
+        Result:
+            for (int i_res = 0; i_res < mult_n_out; i_res++) {
+                #pragma HLS UNROLL
+                *(res++) = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
+            }
+        }
+    }
+}
+
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h
@@ -0,0 +1,51 @@
+#ifndef NNET_SEPARABLE_CONV2D_H_
+#define NNET_SEPARABLE_CONV2D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv2d.h"
+#include "nnet_sepconv2d_latency.h"
+//#include "nnet_sepconv2d_resource.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_2d_cl(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) { // Latency is the only strategy with an implementation here
+        depthwise_conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else { // any other strategy computes nothing and only trips the assert below
+        assert("Resource strategy for DepthwiseConv2D is not supported." && false);
+    }
+}
+
+template <class data_T, class dw_res_T, class res_T, typename CONFIG_T>
+void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width *
+                                      CONFIG_T::depthwise_config::n_chan],
+                          res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width *
+                                    CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::weight_t
+                              depthwise_weights[CONFIG_T::depthwise_config::filt_height *
+                                                CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::weight_t
+                              pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) {
+    #pragma HLS INLINE recursive
+    // Intermediate buffer (element type dw_res_T) that carries the depthwise result into the pointwise step
+    dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width *
+                           CONFIG_T::depthwise_config::n_filt];
+    // Depthwise step first, then the pointwise step on its result
+    depthwise_conv_2d_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res, depthwise_weights,
+                                                                                depthwise_biases);
+    pointwise_conv_2d_cl<dw_res_T, res_T, typename CONFIG_T::pointwise_config>(depthwise_res, res, pointwise_weights,
+                                                                               pointwise_biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h