Commit e3d5d49

Merge pull request #1012 from vloncar/sepconv_io_parallel

SepConv1d/2d for io_parallel with Latency strategy

2 parents: b6855fe + 3b9b649
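
Before the per-file diffs, a quick orientation: the snippet below sketches how the feature merged here might be exercised from the Python API. It is not part of the commit; the model, layer name, shapes, ParallelizationFactor and output directory are illustrative assumptions.

    import tensorflow as tf

    import hls4ml

    # Toy model with a single SeparableConv1D layer (shapes are arbitrary)
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(32, 3)),
        tf.keras.layers.SeparableConv1D(8, kernel_size=3, name='sepconv1'),
    ])

    config = hls4ml.utils.config_from_keras_model(model, granularity='name')
    config['Model']['Strategy'] = 'Latency'
    # Must evenly split the output pixels; otherwise the backend warns and
    # substitutes the closest valid value (see vivado_backend.py below)
    config['LayerName']['sepconv1']['ParallelizationFactor'] = 2

    hls_model = hls4ml.converters.convert_from_keras_model(
        model, hls_config=config, io_type='io_parallel', backend='Vivado', output_dir='sepconv1d_prj'
    )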

File tree

12 files changed: +581, -55 lines


hls4ml/backends/fpga/fpga_backend.py

Lines changed: 2 additions & 2 deletions

@@ -685,7 +685,7 @@ def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, ke
 
         The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is
         to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since
-        the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc),
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc),
         we need to do this for every convolution layer.
 
         Args:

@@ -782,7 +782,7 @@ def generate_conv2d_line_buffer_fn(
 
         The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is
         to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since
-        the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc),
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc),
         we need to do this for every convolution layer.
 
         Args:
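
The docstring above describes the key trick these changes reuse for separable layers: instead of looping over kernel positions in HLS, the im2col gather is code-generated per layer, since it depends only on that layer's geometry. A small numpy sketch of the transformation itself (illustrative, not the generated HLS code):

    import numpy as np

    def im2col_1d(x, kernel, stride=1, pad=(0, 0)):
        # x: (in_width, n_chan); one buffer row per output position
        x = np.pad(x, (pad, (0, 0)))
        out_w = (x.shape[0] - kernel) // stride + 1
        return np.stack([x[i * stride:i * stride + kernel].ravel() for i in range(out_w)])

    cols = im2col_1d(np.arange(10.0).reshape(5, 2), kernel=3)
    print(cols.shape)  # (3, 6): 3 output positions x (kernel=3 * n_chan=2) inputs each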

hls4ml/backends/fpga/passes/codegen.py

Lines changed: 72 additions & 8 deletions

@@ -1,4 +1,4 @@
-from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import Source
 
@@ -7,16 +7,27 @@ class GenerateConvIm2col(OptimizerPass):
     '''Generates code for im2col step of 1D/2d convolution'''
 
     def match(self, node):
-        return isinstance(node, (Conv1D, Conv2D)) and node.model.config.get_config_value('IOType') == 'io_parallel'
+        return (
+            isinstance(node, (Conv1D, Conv2D, SeparableConv1D, SeparableConv2D))
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
 
     def transform(self, model, node):
-        node_class = node.__class__.__name__
-        if '1D' in node_class:
-            self._generate_im2col_1d(node)
-        elif '2D' in node_class:
-            self._generate_im2col_2d(node)
+        node_class = node.class_name
+        if 'Separable' in node_class:
+            if '1D' in node_class:
+                self._generate_separable_im2col_1d(node)
+            elif '2D' in node_class:
+                self._generate_separable_im2col_2d(node)
+            else:
+                raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
         else:
-            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
+            if '1D' in node_class:
+                self._generate_im2col_1d(node)
+            elif '2D' in node_class:
+                self._generate_im2col_2d(node)
+            else:
+                raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
 
     def _generate_im2col_1d(self, node):
         code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
@@ -49,3 +60,56 @@ def _generate_im2col_2d(self, node):
         )
 
         node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_separable_im2col_1d(self, node):
+        dw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            str(node.get_attr('index')) + '_dw',
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=node.get_attr('filt_width'),
+            stride=node.get_attr('stride_width'),
+            pad=(node.get_attr('pad_left'), node.get_attr('pad_right')),
+        )
+
+        node.set_attr('dw_line_buffer_codegen', Source(dw_code_str))
+
+        pw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            str(node.get_attr('index')) + '_pw',
+            node.get_attr('n_partitions'),
+            node.get_output_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=1,
+        )
+
+        node.set_attr('pw_line_buffer_codegen', Source(pw_code_str))
+
+    def _generate_separable_im2col_2d(self, node):
+        dw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            str(node.get_attr('index')) + '_dw',
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+            pad=(
+                node.get_attr('pad_top'),
+                node.get_attr('pad_bottom'),
+                node.get_attr('pad_left'),
+                node.get_attr('pad_right'),
+            ),
+        )
+
+        node.set_attr('dw_line_buffer_codegen', Source(dw_code_str))
+
+        pw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            str(node.get_attr('index')) + '_pw',
+            node.get_attr('n_partitions'),
+            node.get_output_variable().shape[0],
+            node.get_output_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(1, 1),
+        )
+
+        node.set_attr('pw_line_buffer_codegen', Source(pw_code_str))
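
The two generated fill functions differ only in geometry: the depthwise one ('_dw') walks the layer input with the real kernel, stride and padding, while the pointwise one ('_pw') walks the depthwise output with a kernel of 1. An illustrative check of the resulting buffer shapes (the layer parameters are assumptions):

    in_width, n_chan, filt_width, stride = 32, 3, 3, 1
    pad_left = pad_right = 0

    out_width = (in_width + pad_left + pad_right - filt_width) // stride + 1
    dw_buf_shape = (out_width, filt_width * n_chan)  # filled by fill_buffer_<idx>_dw
    pw_buf_shape = (out_width, 1 * n_chan)           # filled by fill_buffer_<idx>_pw
    print(dw_buf_shape, pw_buf_shape)  # (30, 9) (30, 3)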

hls4ml/backends/vivado/passes/convolution_templates.py

Lines changed: 21 additions & 6 deletions

@@ -254,8 +254,8 @@ def __init__(self):
     '{input}, {output}, {d}, {p}, {z}, {b});'
 )
 
-sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h']
-sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h']
+sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h']
+sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h']
 
 
 class SeparableConv1DConfigTemplate(LayerConfigTemplate):

@@ -286,7 +286,10 @@ def format(self, node):
         params['index'] = str(node.index) + '_depthwise'
         params['weight_t'] = node.get_weights('depthwise').type
         params['bias_t'] = node.get_weights('zero_bias').type
-        params['fill_fn'] = 'FillConv1DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_dw'
+        else:
+            params['fill_fn'] = 'FillConv1DBuffer'
 
         if node.get_attr('unscaled'):
             params['scale_index_type'] = 'scale_index_unscaled'

@@ -317,13 +320,17 @@ def format(self, node):
 
         params['filt_width'] = 1
         params['stride_width'] = 1
+        params['pad_left'] = params['pad_right'] = 0
         params['dilation'] = node.get_attr('dilation', 1)
         params['nzeros'] = node.get_weights('pointwise').nzeros
         params['index'] = str(node.index) + '_pointwise'
         params['weight_t'] = node.get_weights('pointwise').type
         params['min_width'] = params['in_width']
         params['instructions'] = '0'
-        params['fill_fn'] = 'FillConv1DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_pw'
+        else:
+            params['fill_fn'] = 'FillConv1DBuffer'
 
         if node.get_attr('unscaled'):
             params['scale_index_type'] = 'scale_index_unscaled'

@@ -402,7 +409,10 @@ def format(self, node):
         params['nzeros'] = node.get_weights('depthwise').nzeros
         params['index'] = str(node.index) + '_depthwise'
         params['weight_t'] = node.get_weights('depthwise').type
-        params['fill_fn'] = 'FillConv2DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_dw'
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
 
         if node.get_attr('unscaled_h'):
             params['scale_index_height_type'] = 'scale_index_unscaled'

@@ -440,14 +450,19 @@ def format(self, node):
 
         params['filt_height'] = params['filt_width'] = 1
         params['stride_height'] = params['stride_width'] = 1
+        params['pad_left'] = params['pad_right'] = 0
+        params['pad_top'] = params['pad_bottom'] = 0
         params['dilation'] = node.get_attr('dilation', 1)
        params['nzeros'] = node.get_weights('pointwise').nzeros
         params['index'] = str(node.index) + '_pointwise'
         params['weight_t'] = node.get_weights('pointwise').type
         params['min_height'] = params['in_height']
         params['min_width'] = params['in_width']
         params['instructions'] = '0'
-        params['fill_fn'] = 'FillConv2DBuffer'
+        if node.model.config.get_config_value('IOType') == 'io_parallel':
+            params['fill_fn'] = f'fill_buffer_{node.index}_pw'
+        else:
+            params['fill_fn'] = 'FillConv2DBuffer'
 
         if node.get_attr('unscaled_h'):
             params['scale_index_height_type'] = 'scale_index_unscaled'
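
The pointwise phase above pins the kernel to 1x1, stride 1 and zero padding, so it cannot change the spatial size produced by the depthwise phase; only the channel count changes. A one-line sanity check of that geometry (illustrative):

    def conv_out_dim(in_dim, filt=1, stride=1, pad=0):
        return (in_dim + 2 * pad - filt) // stride + 1

    assert conv_out_dim(28) == 28  # 1x1, stride 1, no padding passes the size through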

hls4ml/backends/vivado/vivado_backend.py

Lines changed: 44 additions & 9 deletions

@@ -295,9 +295,20 @@ def init_sepconv1d(self, layer):
         else:
             layer.set_attr('strategy', 'latency')
 
-        layer.set_attr(
-            'n_partitions', 1
-        )  # TODO Once we have SeparableConv implementation for io_parallel this should be set properly
+        out_width = layer.get_output_variable().shape[0]
+        chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
+        valid_pf = self.get_valid_conv_partition_splits(1, out_width)
+        if chosen_pf not in valid_pf:
+            closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
+            valid_pf_str = ','.join(map(str, valid_pf))
+            print(
+                f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".'
+                f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.'
+            )
+        else:
+            closest_pf = chosen_pf
+        layer.set_attr('n_partitions', out_width // closest_pf)
+
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())
 
         # Set the output type of the depthwise phase

@@ -350,9 +361,21 @@ def init_sepconv2d(self, layer):
         else:
             layer.set_attr('strategy', 'latency')
 
-        layer.set_attr(
-            'n_partitions', 1
-        )  # TODO Once we have SeparableConv implementation for io_parallel this should be set properly
+        out_height = layer.get_output_variable().shape[0]
+        out_width = layer.get_output_variable().shape[1]
+        chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
+        valid_pf = self.get_valid_conv_partition_splits(out_height, out_width)
+        if chosen_pf not in valid_pf:
+            closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
+            valid_pf_str = ','.join(map(str, valid_pf))
+            print(
+                f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".'
+                f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.'
+            )
+        else:
+            closest_pf = chosen_pf
+        layer.set_attr('n_partitions', out_height * out_width // closest_pf)
+
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())
 
         # Set the output type of the depthwise phase

@@ -373,9 +396,21 @@ def init_depconv2d(self, layer):
         else:
             layer.set_attr('strategy', 'latency')
 
-        layer.set_attr(
-            'n_partitions', 1
-        )  # TODO Once we have SeparableConv implementation for io_parallel this should be set properly
+        out_height = layer.get_output_variable().shape[0]
+        out_width = layer.get_output_variable().shape[1]
+        chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)
+        valid_pf = self.get_valid_conv_partition_splits(out_height, out_width)
+        if chosen_pf not in valid_pf:
+            closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
+            valid_pf_str = ','.join(map(str, valid_pf))
+            print(
+                f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".'
+                f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.'
+            )
+        else:
+            closest_pf = chosen_pf
+        layer.set_attr('n_partitions', out_height * out_width // closest_pf)
+
         layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())
 
     def _set_pooling_accum_t(self, layer, pool_size):
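
All three initializers apply the same rule: a ParallelizationFactor is valid only if it splits the output pixels evenly, and n_partitions is the number of pixel groups the generated line-buffer function is invoked on. The standalone sketch below only approximates get_valid_conv_partition_splits and get_closest_reuse_factor (their exact tie-breaking is not shown in this diff):

    def valid_partition_splits(out_height, out_width):
        n_pixels = out_height * out_width
        return [pf for pf in range(1, n_pixels + 1) if n_pixels % pf == 0]

    out_height, out_width, requested_pf = 4, 8, 5
    valid = valid_partition_splits(out_height, out_width)  # divisors of 32
    closest = requested_pf if requested_pf in valid else min(valid, key=lambda pf: abs(pf - requested_pf))
    n_partitions = out_height * out_width // closest
    print(closest, n_partitions)  # 4 8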

hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+#ifndef NNET_SEPARABLE_CONV1D_H_
+#define NNET_SEPARABLE_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d.h"
+#include "nnet_sepconv1d_latency.h"
+//#include "nnet_sepconv1d_resource.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) {
+        depthwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        assert("Resource strategy for DepthwiseConv1D is not supported." && false);
+    }
+}
+
+template <class data_T, class dw_res_T, class res_T, typename CONFIG_T>
+void separable_conv_1d_cl(data_T data[CONFIG_T::depthwise_config::in_width * CONFIG_T::depthwise_config::n_chan],
+                          res_T res[CONFIG_T::pointwise_config::out_width * CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::weight_t
+                              depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::weight_t
+                              pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt],
+                          typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan],
+                          typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) {
+    #pragma HLS INLINE recursive
+
+    dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_width * CONFIG_T::depthwise_config::n_filt];
+
+    depthwise_conv_1d_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res, depthwise_weights,
+                                                                                depthwise_biases);
+    pointwise_conv_1d_cl<dw_res_T, res_T, typename CONFIG_T::pointwise_config>(depthwise_res, res, pointwise_weights,
+                                                                               pointwise_biases);
+}
+
+} // namespace nnet
+
+#endif
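
separable_conv_1d_cl above is a straight composition: a depthwise pass (one filter per channel, no channel mixing) feeds a pointwise 1x1 pass that mixes channels. A numpy analogue of that composition (illustrative shapes, 'valid' padding):

    import numpy as np

    def depthwise_conv1d(x, w):  # x: (in_width, n_chan), w: (filt_width, n_chan)
        k = w.shape[0]
        return np.stack([(x[i:i + k] * w).sum(axis=0) for i in range(x.shape[0] - k + 1)])

    def pointwise_conv1d(x, w):  # x: (width, n_chan), w: (n_chan, n_filt)
        return x @ w

    x = np.random.rand(8, 3)
    dw = depthwise_conv1d(x, np.random.rand(3, 3))    # (6, 3): width shrinks, channels kept
    out = pointwise_conv1d(dw, np.random.rand(3, 4))  # (6, 4): channels mixed to n_filt
    print(out.shape)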

hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+#ifndef NNET_SEPARABLE_CONV1D_LATENCY_H_
+#define NNET_SEPARABLE_CONV1D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                                  typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan],
+                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
+    constexpr unsigned mult_n_acc = CONFIG_T::filt_width;
+    constexpr unsigned mult_n_out = CONFIG_T::n_filt;
+
+    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0
+
+    typename CONFIG_T::accum_t mult[mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    typename CONFIG_T::accum_t acc[mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    #pragma HLS ARRAY_PARTITION variable=weights complete
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    // Limit multipliers to control parallelization
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit
+
+PartitionLoop:
+    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind
+
+        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);
+
+    PixelLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+
+            data_T cache;
+
+            // Do the matrix-multiply
+        Product:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+                cache = data_buf[i_pxl][i_in];
+                mult[i_in] =
+                    CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
+                        cache, weights[i_in]);
+            }
+
+            // Initialize accumulator with input biases
+        ResetAccum:
+            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
+                #pragma HLS UNROLL
+                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
+            }
+
+            // Accumulate multiplication result
+        Accum1:
+            for (int i_in = 0; i_in < mult_n_acc; i_in++) {
+                #pragma HLS UNROLL
+            Accum2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    acc[i_out] += mult[i_in * mult_n_out + i_out];
+                }
+            }
+
+            // Cast to "res_t" type
+        Result:
+            for (int i_res = 0; i_res < mult_n_out; i_res++) {
+                #pragma HLS UNROLL
+                *(res++) = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
+            }
+        }
+    }
+}
+
+} // namespace nnet
+#endif
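
The Accum1/Accum2 indexing assumes the im2col row is laid out tap-major, i.e. [tap0: ch0..chN-1, tap1: ch0..chN-1, ...], so output channel i_out accumulates mult[i_in * mult_n_out + i_out] over the filt_width taps (for depthwise convolution, n_filt equals n_chan). A numpy model of that reduction (layout inferred from the loop bounds above):

    import numpy as np

    filt_width, n_chan = 3, 4  # mult_n_acc, mult_n_out
    mult = np.arange(filt_width * n_chan, dtype=float)  # per-product results, tap-major

    acc = np.zeros(n_chan)
    for i_in in range(filt_width):      # Accum1
        for i_out in range(n_chan):     # Accum2
            acc[i_out] += mult[i_in * n_chan + i_out]

    assert np.allclose(acc, mult.reshape(filt_width, n_chan).sum(axis=0))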
