
Commit 6d0c17b

add conv2dtranspose io_parallel implementation. Can still be optimized
1 parent 2e373a9 commit 6d0c17b

6 files changed: +343 −16 lines

hls4ml/backends/fpga/fpga_backend.py

Lines changed: 85 additions & 0 deletions
@@ -698,6 +698,91 @@ def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in
 
         return generated_code
 
+    def _compute_conv2d_tr_im2col(self, input_shape, out_shape, kernel=(3, 3), stride=(1, 1)):
+        H, W, C = input_shape
+        kernel_h, kernel_w = kernel
+        stride_h, stride_w = stride
+        out_h, out_w = out_shape
+
+        tr_kernel_h = (kernel_h+stride_h-1)//stride_h
+        tr_kernel_w = (kernel_w+stride_w-1)//stride_w
+
+        input_img = np.arange(1, H * W * C + 1)
+        im_matrix = np.zeros((tr_kernel_h * tr_kernel_w * C * out_h * out_w, ))
+
+        index = 0
+        for i_oh in range(out_h):
+            for i_ow in range(out_w):
+                for i_kh in range(tr_kernel_h):
+                    input_row = i_oh - (tr_kernel_h-1) + i_kh
+                    for i_kw in range(tr_kernel_w):
+                        for i_c in range(C):
+                            if (input_row < 0 or input_row >= H):
+                                im_matrix[index] = 0
+                            else:
+                                input_col = i_ow - (tr_kernel_w-1) + i_kw
+                                if (input_col >= 0 and input_col < W):
+                                    im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
+                                else:
+                                    im_matrix[index] = 0
+                            index += 1
+
+        im_matrix = im_matrix.reshape(out_h * out_w, -1)
+        return im_matrix
+
+
+    def generate_conv2d_tr_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, out_H, out_W, kernel=(3, 3), stride=(1, 1)):
+        if isinstance(kernel, Iterable):
+            kernel_height = kernel[0]
+            kernel_width = kernel[1]
+        else:
+            kernel_height = kernel
+            kernel_width = kernel
+
+        if isinstance(stride, Iterable):
+            stride_height = stride[0]
+            stride_width = stride[1]
+        else:
+            stride_height = stride
+            stride_width = stride
+
+        im2col_matrix = self._compute_conv2d_tr_im2col(
+            (in_H, in_W, in_C),
+            (out_H, out_W),
+            (kernel_height, kernel_width),
+            (stride_height, stride_width),
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
+            "   public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v-1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
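
The index map built by _compute_conv2d_tr_im2col drives the whole io_parallel path: a transposed convolution with kernel k and stride s is decomposed so that each pseudo-output pixel reads at most ceil(k/s) taps per dimension (the trfilt_height x trfilt_width window), and each matrix entry stores a 1-based flat index into the input, with 0 marking zero padding. A minimal standalone sketch of that mapping, under a hypothetical name tr_im2col (not part of hls4ml):

    import numpy as np

    def tr_im2col(in_shape, out_shape, kernel, stride):
        H, W, C = in_shape
        kh, kw = kernel
        sh, sw = stride
        oh, ow = out_shape
        # effective transposed-kernel size: ceil(k / s) per dimension
        tkh, tkw = (kh + sh - 1) // sh, (kw + sw - 1) // sw
        m = np.zeros((oh * ow, tkh * tkw * C), dtype=int)
        for p in range(oh * ow):
            i_oh, i_ow = divmod(p, ow)
            col = 0
            for i_kh in range(tkh):
                r = i_oh - (tkh - 1) + i_kh       # input row feeding this tap
                for i_kw in range(tkw):
                    c = i_ow - (tkw - 1) + i_kw   # input column feeding this tap
                    for i_c in range(C):
                        if 0 <= r < H and 0 <= c < W:
                            m[p, col] = r * W * C + c * C + i_c + 1  # 1-based; 0 = padding
                        col += 1
        return m

    # 2x2 input, 1 channel, 2x2 kernel, stride 2: the effective kernel is 1x1,
    # so each pseudo-pixel reads exactly one input -> [[1], [2], [3], [4]]
    print(tr_im2col((2, 2, 1), (2, 2), (2, 2), (2, 2)))

The generator then turns each row of this matrix into an unrolled C++ branch, emitting buffer[pixel][j] = data[idx-1] for nonzero entries and a constant 0 otherwise.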

hls4ml/backends/fpga/passes/codegen.py

Lines changed: 20 additions & 3 deletions
@@ -1,11 +1,11 @@
 from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.layers import Conv1D, Conv2D, Conv1DTranspose
+from hls4ml.model.layers import Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose
 from hls4ml.model.types import Source
 
 class GenerateConvIm2col(OptimizerPass):
     ''' Generates code for im2col step of 1D/2D convolution '''
     def match(self, node):
-        return isinstance(node, (Conv1D, Conv2D, Conv1DTranspose)) and \
+        return isinstance(node, (Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose)) and \
             node.model.config.get_config_value('IOType') == 'io_parallel'
 
     def transform(self, model, node):
@@ -14,6 +14,8 @@ def transform(self, model, node):
             self._generate_im2col_1d_transpose(node)
         elif '1D' in node_class:
             self._generate_im2col_1d(node)
+        elif '2DTranspose' in node_class:
+            self._generate_im2col_2d_transpose(node)
         elif '2D' in node_class:
             self._generate_im2col_2d(node)
         else:
@@ -38,7 +40,7 @@ def _generate_im2col_1d_transpose(self, node):
             node.get_attr('n_partitions'),
             node.get_input_variable().shape[0],
             node.get_input_variable().shape[1],
-            node.get_attr('num_out'),
+            node.get_attr('proc_width'),
             kernel=node.get_attr('filt_width'),
             stride=node.get_attr('stride_width'),
         )
@@ -58,3 +60,18 @@ def _generate_im2col_2d(self, node):
         )
 
         node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_im2col_2d_transpose(self, node):
+        code_str = node.model.config.backend.generate_conv2d_tr_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            node.get_attr('proc_height'),
+            node.get_attr('proc_width'),
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
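
One subtlety in the dispatch added to transform: the '2DTranspose' branch must be tested before the plain '2D' branch, because the substring '2D' also matches the class name Conv2DTranspose. A minimal standalone mock of that ordering (the class here is a stand-in, not the real hls4ml node type):

    class Conv2DTranspose:
        pass

    def dispatch(node):
        node_class = type(node).__name__
        if '1DTranspose' in node_class:
            return '1d_transpose'
        elif '1D' in node_class:
            return '1d'
        elif '2DTranspose' in node_class:  # must precede the '2D' test
            return '2d_transpose'
        elif '2D' in node_class:
            return '2d'
        raise Exception('Cannot generate instructions for ' + node_class)

    assert dispatch(Conv2DTranspose()) == '2d_transpose'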

hls4ml/backends/vivado/passes/convolution_templates.py

Lines changed: 16 additions & 6 deletions
@@ -111,18 +111,18 @@ def format(self, node):
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
     static const unsigned min_width = {min_width};
-    static const ap_uint<filt_width> pixels[min_width];
+    static const ap_uint<trfilt_width> pixels[min_width];
     static const unsigned n_partitions = {n_partitions};
-    static const unsigned num_out = {num_out};
-    static const unsigned n_pixels = num_out / n_partitions;
+    static const unsigned proc_width = {proc_width};
+    static const unsigned n_pixels = proc_width / n_partitions;
     template<class data_T, class CONFIG_T>
     using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {config_t} mult_config;
 }};
-const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
+const ap_uint<config{index}::trfilt_width> config{index}::pixels[] = {{{instructions}}};\n"""
 
 conv1dtranspose_function_template = 'nnet::conv_1d_transpose_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 
@@ -282,13 +282,19 @@ def __init__(self):
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
     static const unsigned min_height = {min_height};
     static const unsigned min_width = {min_width};
-    static const ap_uint<filt_height * filt_width> pixels[min_height * min_width];
+    static const ap_uint<trfilt_height * trfilt_width> pixels[min_height * min_width];
+    static const unsigned n_partitions = {n_partitions};
+    static const unsigned proc_height = {proc_height};
+    static const unsigned proc_width = {proc_width};
+    static const unsigned n_pixels = proc_height * proc_width / n_partitions;
+    template<class data_T, class CONFIG_T>
+    using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {config_t} mult_config;
 }};
-const ap_uint<config{index}::filt_height * config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
+const ap_uint<config{index}::trfilt_height * config{index}::trfilt_width> config{index}::pixels[] = {{{instructions}}};\n"""
 
 conv2dtranspose_function_template = 'nnet::conv_2d_transpose_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 
@@ -310,6 +316,10 @@ def format(self, node):
             // node.get_attr('stride_height')
 
         params['config_t'] = 'config{}_mult'.format(node.index)
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = 'fill_buffer_{}'.format(node.index)
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
         conv_config = self.template.format(**params)
 
         mult_params = self._default_config_params(node)
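
The renamed constants encode the same ceil division used on the Python side: trfilt_* is the per-dimension tap count ceil(filt / stride), and n_pixels is the number of pseudo-output pixels each generated fill_buffer partition handles. A quick sketch of the arithmetic with made-up dimensions:

    # ceil(k / s) taps per dimension, matching the trfilt_* config constants
    filt_height, filt_width = 3, 3
    stride_height, stride_width = 2, 2
    trfilt_height = (filt_height + stride_height - 1) // stride_height  # 2
    trfilt_width = (filt_width + stride_width - 1) // stride_width      # 2

    # pixels handled per partition, matching n_pixels in the config struct
    proc_height, proc_width, n_partitions = 8, 8, 16
    n_pixels = proc_height * proc_width // n_partitions                 # 4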

hls4ml/backends/vivado/vivado_backend.py

Lines changed: 27 additions & 5 deletions
@@ -196,17 +196,18 @@ def init_conv1dtranspose(self, layer):
         layer.set_attr('strategy', 'latency')
 
         in_width = layer.get_input_variable().shape[0]
-        num_out = 1 + in_width + (layer.get_output_variable().shape[1] + layer.get_attr('pad_left'))//layer.get_attr('stride_width')
+        proc_width = (layer.get_output_variable().shape[0] + layer.get_attr('pad_left') + layer.get_attr('stride_width')-1) \
+            // layer.get_attr('stride_width')
         chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
-        valid_pf = self.get_valid_conv_partition_splits(1, num_out)
+        valid_pf = self.get_valid_conv_partition_splits(1, proc_width)
         if chosen_pf not in valid_pf:
             closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
             print('WARNING: Invalid ParallelizationFactor={} in layer "{}". Using ParallelizationFactor={} instead. Valid ParallelizationFactor(s): {}.'
                 .format(chosen_pf, layer.name, closest_pf, ','.join(map(str, valid_pf))))
         else:
             closest_pf = chosen_pf
-        layer.set_attr('n_partitions', num_out // closest_pf)
-        layer.set_attr('num_out', num_out)
+        layer.set_attr('n_partitions', proc_width // closest_pf)
+        layer.set_attr('proc_width', proc_width)
 
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())
 
@@ -254,7 +255,7 @@ def init_conv2d(self, layer):
         self._validate_conv_strategy(layer)
 
     @layer_optimizer(Conv2DTranspose)
-    def init_conv2d(self, layer):
+    def init_conv2dtranspose(self, layer):
         if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D
             layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0,1))
 
@@ -266,8 +267,29 @@ def init_conv2d(self, layer):
         else:
             layer.set_attr('strategy', 'latency')
 
+        in_height = layer.get_input_variable().shape[0]
+        in_width = layer.get_input_variable().shape[1]
+
+        proc_height = (layer.get_output_variable().shape[0] + layer.get_attr('pad_top') + layer.get_attr('stride_height')-1) \
+            // layer.get_attr('stride_height')
+        proc_width = (layer.get_output_variable().shape[1] + layer.get_attr('pad_left') + layer.get_attr('stride_width')-1) \
+            // layer.get_attr('stride_width')
+        chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
+        valid_pf = self.get_valid_conv_partition_splits(proc_height, proc_width)
+        if chosen_pf not in valid_pf:
+            closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
+            print('WARNING: Invalid ParallelizationFactor={} in layer "{}". Using ParallelizationFactor={} instead. Valid ParallelizationFactor(s): {}.'
+                .format(chosen_pf, layer.name, closest_pf, ','.join(map(str, valid_pf))))
+        else:
+            closest_pf = chosen_pf
+        layer.set_attr('n_partitions', proc_height * proc_width // closest_pf)
+        layer.set_attr('proc_height', proc_height)
+        layer.set_attr('proc_width', proc_width)
+
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())
 
+        self._validate_conv_strategy(layer)
+
     @layer_optimizer(SeparableConv2D)
     def init_sepconv2d(self, layer):
         if layer.model.config.is_resource_strategy(layer):
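
The sizing logic added for Conv2DTranspose mirrors the 1D case: proc_height and proc_width measure the padded pseudo-output grid in stride-sized steps (a ceil division), and the ParallelizationFactor must evenly split that grid. A sketch of the arithmetic with made-up layer dimensions:

    # hypothetical layer: 8x8 output, stride 2x2, pad_top = pad_left = 1
    out_height, out_width = 8, 8
    pad_top, pad_left = 1, 1
    stride_height, stride_width = 2, 2

    proc_height = (out_height + pad_top + stride_height - 1) // stride_height  # 5
    proc_width = (out_width + pad_left + stride_width - 1) // stride_width     # 5

    # in the real code valid_pf comes from get_valid_conv_partition_splits;
    # here we just pick a divisor of the 5x5 pixel grid
    chosen_pf = 5
    n_partitions = proc_height * proc_width // chosen_pf  # 5 branches of 5 pixels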

hls4ml/templates/vivado/nnet_utils/nnet_conv2dtranspose.h

Lines changed: 5 additions & 2 deletions
@@ -2,6 +2,7 @@
 #define NNET_CONV2DTRANSPOSE_H
 
 #include "nnet_common.h"
+#include "nnet_conv2dtranspose_resource.h"
 #include <cstdlib>
 
 namespace nnet{
@@ -40,14 +41,16 @@ struct conv2dtranspose_config
 };
 
 template<class data_T, class res_T, typename CONFIG_T>
-void conv_2d_cl(
+void conv_2d_transpose_cl(
     data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
     res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
     typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
 )
 {
-    return; //only stream is supported currently
+    #pragma HLS INLINE region
+    //only have resource strategy as of now
+    conv_2d_transpose_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
 }
 
 }
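
End to end, these pieces make an io_parallel Conv2DTranspose build: the Vivado backend sizes the partitions, the codegen pass emits fill_buffer_{index}, the config template wires it in via fill_fn, and conv_2d_transpose_cl dispatches to the resource implementation. A hypothetical conversion that would exercise the path (standard hls4ml API calls; Conv2DTranspose support itself is what this commit adds, so this assumes the branch is installed):

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2DTranspose
    import hls4ml

    model = Sequential([
        Conv2DTranspose(4, (3, 3), strides=(2, 2), input_shape=(8, 8, 3)),
    ])
    config = hls4ml.utils.config_from_keras_model(model, granularity='name')
    hls_model = hls4ml.converters.convert_from_keras_model(
        model, hls_config=config, io_type='io_parallel',
        output_dir='conv2dtr_prj')
    hls_model.compile()  # generated project contains the fill_buffer_* class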
