Commit 6073263

add conv2dtranspose io_parallel implementation. Can still be optimized
1 parent 12ba91e

6 files changed: +343 −16

hls4ml/backends/fpga/fpga_backend.py

Lines changed: 85 additions & 0 deletions
@@ -656,6 +656,91 @@ def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in

         return generated_code

+    def _compute_conv2d_tr_im2col(self, input_shape, out_shape, kernel=(3, 3), stride=(1, 1)):
+        H, W, C = input_shape
+        kernel_h, kernel_w = kernel
+        stride_h, stride_w = stride
+        out_h, out_w = out_shape
+
+        tr_kernel_h = (kernel_h + stride_h - 1) // stride_h
+        tr_kernel_w = (kernel_w + stride_w - 1) // stride_w
+
+        input_img = np.arange(1, H * W * C + 1)
+        im_matrix = np.zeros((tr_kernel_h * tr_kernel_w * C * out_h * out_w, ))
+
+        index = 0
+        for i_oh in range(out_h):
+            for i_ow in range(out_w):
+                for i_kh in range(tr_kernel_h):
+                    input_row = i_oh - (tr_kernel_h - 1) + i_kh
+                    for i_kw in range(tr_kernel_w):
+                        for i_c in range(C):
+                            if input_row < 0 or input_row >= H:
+                                im_matrix[index] = 0
+                            else:
+                                input_col = i_ow - (tr_kernel_w - 1) + i_kw
+                                if input_col >= 0 and input_col < W:
+                                    im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
+                                else:
+                                    im_matrix[index] = 0
+                            index += 1
+
+        im_matrix = im_matrix.reshape(out_h * out_w, -1)
+        return im_matrix
+
+
+    def generate_conv2d_tr_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, out_H, out_W, kernel=(3, 3), stride=(1, 1)):
+        if isinstance(kernel, Iterable):
+            kernel_height = kernel[0]
+            kernel_width = kernel[1]
+        else:
+            kernel_height = kernel
+            kernel_width = kernel
+
+        if isinstance(stride, Iterable):
+            stride_height = stride[0]
+            stride_width = stride[1]
+        else:
+            stride_height = stride
+            stride_width = stride
+
+        im2col_matrix = self._compute_conv2d_tr_im2col(
+            (in_H, in_W, in_C),
+            (out_H, out_W),
+            (kernel_height, kernel_width),
+            (stride_height, stride_width),
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
+            "    public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v - 1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
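
A note on the construction above: `_compute_conv2d_tr_im2col` stores 1-based sentinel indices into the flattened input (0 marks an out-of-bounds tap), and `generate_conv2d_tr_line_buffer_fn` later turns each sentinel into a `data[...]` access and each 0 into a literal zero. The toy sketch below is not part of the commit; it is plain NumPy mirroring the same loop nest, for a 2x2 single-channel input with a 3x3 kernel and stride 2, where each output pixel needs only ceil(3/2) * ceil(3/2) = 4 taps:

import numpy as np

H, W, C = 2, 2, 1                  # input height, width, channels
kernel_h = kernel_w = 3
stride_h = stride_w = 2
out_h, out_w = 2, 2                # grid of buffer pixels walked per axis

tr_kernel_h = (kernel_h + stride_h - 1) // stride_h   # ceil(3/2) = 2
tr_kernel_w = (kernel_w + stride_w - 1) // stride_w   # ceil(3/2) = 2

input_img = np.arange(1, H * W * C + 1)               # sentinels 1..4; 0 = out of bounds
im_matrix = np.zeros((out_h * out_w, tr_kernel_h * tr_kernel_w * C))

for i_oh in range(out_h):
    for i_ow in range(out_w):
        row, col = i_oh * out_w + i_ow, 0
        for i_kh in range(tr_kernel_h):
            input_row = i_oh - (tr_kernel_h - 1) + i_kh
            for i_kw in range(tr_kernel_w):
                input_col = i_ow - (tr_kernel_w - 1) + i_kw
                for i_c in range(C):
                    if 0 <= input_row < H and 0 <= input_col < W:
                        im_matrix[row, col] = input_img[input_row * W * C + input_col * C + i_c]
                    col += 1

print(im_matrix)
# [[0. 0. 0. 1.]
#  [0. 0. 1. 2.]
#  [0. 1. 0. 3.]
#  [1. 2. 3. 4.]]

Only the last row is fully interior; the zeros in the other rows become constant `buffer[...][...] = 0;` writes in the generated fill_buffer, which looks like one obvious target for the "can still be optimized" note in the commit message.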

hls4ml/backends/fpga/passes/codegen.py

Lines changed: 20 additions & 3 deletions
@@ -1,11 +1,11 @@
 from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.layers import Conv1D, Conv2D, Conv1DTranspose
+from hls4ml.model.layers import Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose
 from hls4ml.model.types import Source

 class GenerateConvIm2col(OptimizerPass):
     ''' Generates code for im2col step of 1D/2D convolution '''
     def match(self, node):
-        return isinstance(node, (Conv1D, Conv2D, Conv1DTranspose)) and \
+        return isinstance(node, (Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose)) and \
             node.model.config.get_config_value('IOType') == 'io_parallel'

     def transform(self, model, node):
@@ -14,6 +14,8 @@ def transform(self, model, node):
             self._generate_im2col_1d_transpose(node)
         elif '1D' in node_class:
             self._generate_im2col_1d(node)
+        elif '2DTranspose' in node_class:
+            self._generate_im2col_2d_transpose(node)
         elif '2D' in node_class:
             self._generate_im2col_2d(node)
         else:
@@ -38,7 +40,7 @@ def _generate_im2col_1d_transpose(self, node):
             node.get_attr('n_partitions'),
             node.get_input_variable().shape[0],
             node.get_input_variable().shape[1],
-            node.get_attr('num_out'),
+            node.get_attr('proc_width'),
             kernel=node.get_attr('filt_width'),
             stride=node.get_attr('stride_width'),
         )
@@ -58,3 +60,18 @@ def _generate_im2col_2d(self, node):
         )

         node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_im2col_2d_transpose(self, node):
+        code_str = node.model.config.backend.generate_conv2d_tr_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            node.get_attr('proc_height'),
+            node.get_attr('proc_width'),
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
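
The pass itself only forwards layer attributes to the backend generator. A minimal sketch of driving that generator directly (assuming a bare `VivadoBackend` instance suffices; the shapes and partition count are illustrative, not taken from the commit):

from hls4ml.backends import VivadoBackend

backend = VivadoBackend()
code = backend.generate_conv2d_tr_line_buffer_fn(
    layer_idx=1,
    n_partitions=4,            # must evenly split the proc_height * proc_width grid
    in_H=2, in_W=2, in_C=1,
    out_H=4, out_W=4,          # proc_height, proc_width
    kernel=(3, 3),
    stride=(2, 2),
)
print(code)                    # C++ class fill_buffer_1 deriving from FillConv2DBuffer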

hls4ml/backends/vivado/passes/convolution_templates.py

Lines changed: 16 additions & 6 deletions
@@ -111,18 +111,18 @@ def format(self, node):
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
     static const unsigned min_width = {min_width};
-    static const ap_uint<filt_width> pixels[min_width];
+    static const ap_uint<trfilt_width> pixels[min_width];
     static const unsigned n_partitions = {n_partitions};
-    static const unsigned num_out = {num_out};
-    static const unsigned n_pixels = num_out / n_partitions;
+    static const unsigned proc_width = {proc_width};
+    static const unsigned n_pixels = proc_width / n_partitions;
     template<class data_T, class CONFIG_T>
     using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {config_t} mult_config;
 }};
-const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
+const ap_uint<config{index}::trfilt_width> config{index}::pixels[] = {{{instructions}}};\n"""

 conv1dtranspose_function_template = 'nnet::conv_1d_transpose_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'

@@ -282,13 +282,19 @@ def __init__(self):
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
     static const unsigned min_height = {min_height};
     static const unsigned min_width = {min_width};
-    static const ap_uint<filt_height * filt_width> pixels[min_height * min_width];
+    static const ap_uint<trfilt_height * trfilt_width> pixels[min_height * min_width];
+    static const unsigned n_partitions = {n_partitions};
+    static const unsigned proc_height = {proc_height};
+    static const unsigned proc_width = {proc_width};
+    static const unsigned n_pixels = proc_height * proc_width / n_partitions;
+    template<class data_T, class CONFIG_T>
+    using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {config_t} mult_config;
 }};
-const ap_uint<config{index}::filt_height * config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
+const ap_uint<config{index}::trfilt_height * config{index}::trfilt_width> config{index}::pixels[] = {{{instructions}}};\n"""

 conv2dtranspose_function_template = 'nnet::conv_2d_transpose_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'

@@ -310,6 +316,10 @@ def format(self, node):
         // node.get_attr('stride_height')

         params['config_t'] = 'config{}_mult'.format(node.index)
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = 'fill_buffer_{}'.format(node.index)
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
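
The `trfilt_height`/`trfilt_width` constants these templates now reference are the per-axis tap counts of the transposed kernel, i.e. ceil(filt/stride), matching the `tr_kernel_*` computation in fpga_backend.py (their definition in the HLS config header is outside this diff). A quick check of the ceiling identity used throughout the commit:

for filt, stride in [(3, 1), (3, 2), (5, 2), (4, 4)]:
    trfilt = (filt + stride - 1) // stride   # integer form of ceil(filt / stride)
    print(f'filt={filt} stride={stride} -> trfilt={trfilt}')
# filt=3 stride=1 -> trfilt=3
# filt=3 stride=2 -> trfilt=2
# filt=5 stride=2 -> trfilt=3
# filt=4 stride=4 -> trfilt=1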

hls4ml/backends/vivado/vivado_backend.py

Lines changed: 27 additions & 5 deletions
@@ -190,17 +190,18 @@ def init_conv1dtranspose(self, layer):
         layer.set_attr('strategy', 'latency')

         in_width = layer.get_input_variable().shape[0]
-        num_out = 1 + in_width + (layer.get_output_variable().shape[1] + layer.get_attr('pad_left'))//layer.get_attr('stride_width')
+        proc_width = (layer.get_output_variable().shape[0] + layer.get_attr('pad_left') + layer.get_attr('stride_width') - 1) \
+            // layer.get_attr('stride_width')
         chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
-        valid_pf = self.get_valid_conv_partition_splits(1, num_out)
+        valid_pf = self.get_valid_conv_partition_splits(1, proc_width)
         if chosen_pf not in valid_pf:
             closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
             print('WARNING: Invalid ParallelizationFactor={} in layer "{}". Using ParallelizationFactor={} instead. Valid ParallelizationFactor(s): {}.'
                 .format(chosen_pf, layer.name, closest_pf, ','.join(map(str, valid_pf))))
         else:
             closest_pf = chosen_pf
-        layer.set_attr('n_partitions', num_out // closest_pf)
-        layer.set_attr('num_out', num_out)
+        layer.set_attr('n_partitions', proc_width // closest_pf)
+        layer.set_attr('proc_width', proc_width)

         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())

@@ -247,7 +248,7 @@ def init_conv2d(self, layer):
         self._validate_conv_strategy(layer)

     @layer_optimizer(Conv2DTranspose)
-    def init_conv2d(self, layer):
+    def init_conv2dtranspose(self, layer):
         if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D
             layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0,1))

@@ -259,8 +260,29 @@ def init_conv2d(self, layer):
         else:
             layer.set_attr('strategy', 'latency')

+        in_height = layer.get_input_variable().shape[0]
+        in_width = layer.get_input_variable().shape[1]
+
+        proc_height = (layer.get_output_variable().shape[0] + layer.get_attr('pad_top') + layer.get_attr('stride_height') - 1) \
+            // layer.get_attr('stride_height')
+        proc_width = (layer.get_output_variable().shape[1] + layer.get_attr('pad_left') + layer.get_attr('stride_width') - 1) \
+            // layer.get_attr('stride_width')
+        chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
+        valid_pf = self.get_valid_conv_partition_splits(proc_height, proc_width)
+        if chosen_pf not in valid_pf:
+            closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
+            print('WARNING: Invalid ParallelizationFactor={} in layer "{}". Using ParallelizationFactor={} instead. Valid ParallelizationFactor(s): {}.'
+                .format(chosen_pf, layer.name, closest_pf, ','.join(map(str, valid_pf))))
+        else:
+            closest_pf = chosen_pf
+        layer.set_attr('n_partitions', proc_height * proc_width // closest_pf)
+        layer.set_attr('proc_height', proc_height)
+        layer.set_attr('proc_width', proc_width)
+
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())

+        self._validate_conv_strategy(layer)
+
     @layer_optimizer(SeparableConv2D)
     def init_sepconv2d(self, layer):
         if layer.model.config.is_resource_strategy(layer):
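
`proc_height` and `proc_width` are ceiling divisions: the number of stride-aligned output positions the generated fill_buffer walks per axis, with output padding folded in; `n_partitions` then divides that pixel grid by the validated parallelization factor. A small numeric sketch with illustrative values:

out_height, out_width = 8, 8
pad_top, pad_left = 1, 1
stride_height, stride_width = 2, 2

# same integer-ceil trick as above: ceil((out + pad) / stride)
proc_height = (out_height + pad_top + stride_height - 1) // stride_height   # ceil(9/2) = 5
proc_width = (out_width + pad_left + stride_width - 1) // stride_width      # ceil(9/2) = 5

parallelization_factor = 5                    # must divide the 5 * 5 = 25-pixel grid
n_partitions = proc_height * proc_width // parallelization_factor
print(proc_height, proc_width, n_partitions)  # 5 5 5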

hls4ml/templates/vivado/nnet_utils/nnet_conv2dtranspose.h

Lines changed: 5 additions & 2 deletions
@@ -2,6 +2,7 @@
 #define NNET_CONV2DTRANSPOSE_H

 #include "nnet_common.h"
+#include "nnet_conv2dtranspose_resource.h"
 #include <cstdlib>

 namespace nnet{
@@ -40,14 +41,16 @@ struct conv2dtranspose_config
 };

 template<class data_T, class res_T, typename CONFIG_T>
-void conv_2d_cl(
+void conv_2d_transpose_cl(
     data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
     res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
     typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
 )
 {
-    return; //only stream is supported currently
+    #pragma HLS INLINE region
+    // only the resource strategy is implemented for now
+    conv_2d_transpose_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
 }

 }
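
With the dispatch in place, the io_parallel path should be reachable end to end from Keras. A minimal sketch (assuming this branch's converter already maps Conv2DTranspose; layer shapes and the output directory are illustrative):

import numpy as np
import hls4ml
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.models import Sequential

model = Sequential([Conv2DTranspose(4, (3, 3), strides=(2, 2), input_shape=(4, 4, 1))])

config = hls4ml.utils.config_from_keras_model(model, granularity='name')
hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, io_type='io_parallel', output_dir='hls4ml_prj'
)
hls_model.compile()

x = np.random.rand(1, 4, 4, 1).astype(np.float32)
y_keras = model.predict(x)
y_hls = hls_model.predict(x).reshape(y_keras.shape)
print(np.abs(y_keras - y_hls).max())   # should be small, up to fixed-point error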
