Commit 2ed0865

Reorganize codegen of unrolled implementation
1 parent 0ea246c commit 2ed0865

File tree

7 files changed: +359 additions, -264 deletions

hls4ml/backends/fpga/fpga_backend.py

Lines changed: 3 additions & 1 deletion
@@ -227,10 +227,12 @@ def get_closest_reuse_factor(self, valid_rf, chosen_rf):
         else:
             return before
 
-    def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor'):
+    def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor', include_max_rf=True):
         assert attribute is not None, 'Reuse factor attribute cannot be None'
 
         valid_rf = self.get_valid_reuse_factors(n_in, n_out)
+        if not include_max_rf:
+            valid_rf.pop()
         chosen_rf = layer.get_attr(attribute)
         if chosen_rf not in valid_rf:
             closest_rf = self.get_closest_reuse_factor(valid_rf, chosen_rf)
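
For intuition, here is a minimal self-contained sketch of what the new include_max_rf flag does. Note that get_valid_reuse_factors_sketch and pick_reuse_factor below are simplified stand-ins for the backend's get_valid_reuse_factors/set_closest_reuse_factor, with the validity rule assumed from the "rf < n_in or rf % n_in == 0" assertion referenced elsewhere in this commit:

def get_valid_reuse_factors_sketch(n_in: int, n_out: int) -> list[int]:
    # Assumed rule: a valid RF divides n_in * n_out and satisfies rf < n_in or rf % n_in == 0
    total = n_in * n_out
    return [rf for rf in range(1, total + 1) if total % rf == 0 and (rf < n_in or rf % n_in == 0)]

def pick_reuse_factor(chosen_rf, n_in, n_out, include_max_rf=True):
    valid_rf = get_valid_reuse_factors_sketch(n_in, n_out)
    if not include_max_rf:
        valid_rf.pop()  # the list is ascending, so this drops the maximum RF (n_in * n_out)
    if chosen_rf in valid_rf:
        return chosen_rf
    return min(valid_rf, key=lambda rf: abs(rf - chosen_rf))  # closest valid RF

print(pick_reuse_factor(32, n_in=4, n_out=8))                        # 32 (the maximum RF is allowed)
print(pick_reuse_factor(32, n_in=4, n_out=8, include_max_rf=False))  # 16 (closest remaining valid RF)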
hls4ml/backends/fpga/passes/codegen.py

Lines changed: 1 addition & 229 deletions
@@ -1,8 +1,4 @@
-import math
-
-import numpy as np
-
-from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense
+from hls4ml.model.layers import Conv1D, Conv2D
 from hls4ml.model.optimizer import OptimizerPass
 from hls4ml.model.types import Source
 
@@ -53,227 +49,3 @@ def _generate_im2col_2d(self, node):
         )
 
         node.set_attr('line_buffer_codegen', Source(code_str))
-
-
-class GenerateUnrolledDenseResource(OptimizerPass):
-    '''Generates C++ code for unrolled Dense resource'''
-
-    def match(self, node):
-        # Only apply to layers that use Dense Matrix Multiplication
-        # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers
-        layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU)
-
-        # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
-        weights_transposed = node.get_attr('_weights_transposed', False)
-
-        # RF = 1 will optimize DSPs anyway, so no need to unroll code
-        rf_gt_one = node.get_attr('reuse_factor', 1) > 1
-
-        # User requested unrolled implementation of Dense
-        is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled'
-
-        return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled
-
-    def transform(self, model, node):
-        if isinstance(node, (LSTM, GRU)):
-            n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node)
-
-            reuse_factor = node.get_attr('reuse_factor')
-            weights = node.weights['weight']
-            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1')
-            node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str))
-
-            recr_reuse_factor = node.get_attr('recurrent_reuse_factor')
-            recr_weights = node.weights['recurrent_weight']
-            code_str = self._generate_unrolled_function(
-                n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2'
-            )
-            node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str))
-        else:
-            n_in, n_out = node.model.config.backend.get_layer_mult_size(node)
-            reuse_factor = node.get_attr('reuse_factor')
-            weights = node.weights['weight']
-
-            code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index)
-            node.set_attr('unrolled_dense_resource_codegen', Source(code_str))
-
-    def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
-        """
-        Generate a C++ function that mimics the Dense Resource implementation.
-
-        The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
-        The Latency strategy can optimize away zero multiplications; the Resource strategy, on the other hand, cannot.
-        When all the weights in the same BRAM block are zero, Vivado is unable to optimize them.
-        With this (and additional TCL scripts), zero BRAMs are optimized away.
-
-        Args:
-            n_in, n_out, reuse_factor: Multiplier dimensions and reuse factor of the layer
-            weights: Weight variable of the layer
-            function_suffix: Suffix appended to the generated class name
-        Returns:
-            generated_code: Generated C++ function (string)
-        """
-
-        # Variable instantiation and function pragmas
-        generated_code = (
-            'template<class data_T, class res_T, typename CONFIG_T>\n'
-            'class dense_unrolled_{suffix} : public DenseKernel<data_T, res_T, CONFIG_T> {{\n'
-            '  public:\n'
-            '    static void dense(\n'
-            '        data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n'
-            '        typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n'
-            '        typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n'
-            '    ) {{\n'
-            '        #pragma HLS pipeline II=CONFIG_T::reuse_factor\n'
-            '\n'
-            '        constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n'
-            '        #pragma HLS function_instantiate variable=weights,biases\n'
-            '        #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n'
-            '        #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n'
-            '        #pragma HLS ARRAY_PARTITION variable=biases complete\n'
-            '\n'
-            '        typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n'
-            '        #pragma HLS ARRAY_PARTITION variable=acc complete\n'
-            '\n'
-            '    InitAccum:\n'
-            '        for (int i = 0; i < CONFIG_T::n_out; i++) {{\n'
-            '            #pragma HLS UNROLL\n'
-            '            acc[i] = (typename CONFIG_T::accum_t) biases[i];\n'
-            '        }}\n'
-            '\n'
-        ).format(suffix=function_suffix)
-
-        # Unrolled multiplication, according to the three cases
-        if reuse_factor <= n_in:
-            mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights)
-        elif reuse_factor > n_in and reuse_factor % n_in == 0:
-            mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
-        else:
-            # This case shouldn't happen if my understanding of RF is correct
-            # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
-            raise Exception('Not implemented...')
-
-        # Write output
-        generated_code += mult_code + '\n'
-        generated_code += (
-            '    Result:\n'
-            '        for (int i = 0; i < CONFIG_T::n_out; i++) {\n'
-            '            #pragma HLS UNROLL\n'
-            '            res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n'
-            '        }\n'
-            '    }\n'
-            '};\n'
-        )
-
-        return generated_code
-
-    def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
-        # Function constants
-        mult_factor = min(n_in, reuse_factor)
-        block_factor = int(math.ceil(n_in * n_out / reuse_factor))
-        mult_limit = int(math.ceil(n_in * n_out / mult_factor))
-        mult_scale = mult_limit // n_out
-
-        # Zero DSPs are the DSP blocks that always have zero input
-        # In this case, it is the number of rows in the transposed and reshaped weight matrix
-        # The new shape is (parallel_mult, reuse_factor)
-        zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))
-
-        # Used to pad the code to make it human-readable
-        indent = '    '
-
-        # Generate unrolled multiplications
-        mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n'
-        mult_code += f'{indent*2}MULT: {{\n'
-        mult_code += f'{indent*3}#pragma HLS protocol\n'
-
-        for ir in range(reuse_factor):
-            acc_step = 0
-            out_index = 0
-            w_index = ir
-            in_index = ir
-
-            mult_code += f'{indent*3}M{ir}: {{\n'
-            for _ in range(block_factor):
-                if weights.data.flatten()[w_index] != 0:
-                    mult_code += (
-                        f'{indent*4}acc[{out_index}] += '
-                        'static_cast<typename CONFIG_T::accum_t>'
-                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
-                        f'product(data[{in_index}], weights[{w_index}]));\n'
-                    )
-
-                w_index += reuse_factor
-                in_index += reuse_factor
-                if in_index >= n_in:
-                    in_index = ir
-                if acc_step + 1 >= mult_scale:
-                    acc_step = 0
-                    out_index += 1
-                else:
-                    acc_step += 1
-
-            mult_code += f'{indent*3}}}\n'
-
-        mult_code += f'{indent*2}}}\n'
-
-        return mult_code
-
-    def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
-        # Function constants
-        mult_factor = min(n_in, reuse_factor)
-        block_factor = int(math.ceil(n_in * n_out / reuse_factor))
-        mult_limit = int(math.ceil(n_in * n_out / mult_factor))
-
-        # Zero DSPs are the DSP blocks that always have zero input
-        # In this case, it is the number of rows in the transposed and reshaped weight matrix
-        # The new shape is (parallel_mult, reuse_factor)
-        zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))
-
-        # Used to pad the code to make it human-readable
-        indent = '    '
-
-        # Generate out indices
-        outidx = [0] * reuse_factor
-        outstep = 0
-        outscale = reuse_factor // n_in
-        for ir in range(reuse_factor):
-            outidx[ir] = outstep
-            if (ir + 1) % n_in == 0:
-                outstep += 1
-
-        # Define variables
-        in_index = 0
-
-        # Generate unrolled multiplications
-        mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n'
-        mult_code += f'{indent*2}MULT: {{\n'
-        mult_code += f'{indent*3}#pragma HLS protocol\n'
-
-        for ir in range(reuse_factor):
-            w_index = ir
-            out_index = outidx[ir]
-
-            mult_code += f'{indent*3}M{ir}: {{\n'
-            for _ in range(block_factor):
-                if weights.data.flatten()[w_index] != 0:
-                    mult_code += (
-                        f'{indent*4}acc[{int(out_index)}] += '
-                        'static_cast<typename CONFIG_T::accum_t>'
-                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
-                        f'product(data[{in_index}], weights[{w_index}]));\n'
-                    )
-
-                w_index += reuse_factor
-                if w_index > n_in * n_out:
-                    break
-                out_index += outscale
-            mult_code += f'{indent*3}}}\n'
-
-            in_index += 1
-            if in_index >= n_in:
-                in_index = 0
-
-        mult_code += f'{indent*2}}}\n'
-
-        return mult_code
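
To make the zero-DSP counting in the deleted helpers concrete, here is a small self-contained worked example with made-up sizes; it reproduces only the arithmetic above (block_factor, mult_limit, zeros), not the full code generation:

import math

import numpy as np

# Made-up layer sizes; block_factor is the number of parallel multipliers (DSPs),
# mult_limit the total multiplication budget, as in the deleted code above.
n_in, n_out, reuse_factor = 4, 4, 4
weights = np.zeros(n_in * n_out)
weights[:4] = 1.0  # only the first few weights are nonzero (e.g. a pruned layer)

block_factor = int(math.ceil(n_in * n_out / reuse_factor))            # 4 parallel DSPs
mult_limit = int(math.ceil(n_in * n_out / min(n_in, reuse_factor)))   # 4

# Rows of the (block_factor, reuse_factor) view that are all zero correspond to
# DSPs that never receive a nonzero weight; those multipliers can be removed.
zeros = int(np.sum(~weights.reshape(block_factor, reuse_factor).any(1)))  # 3

# The generated '#pragma HLS ALLOCATION operation instances=mul limit=...'
# would therefore request only mult_limit - zeros = 1 multiplier here.
print(mult_limit - zeros)

Because reuse_factor <= n_in in this example, the real pass would take the _generate_unrolled_mult_code_rf_leq_nin branch of the case split.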

hls4ml/backends/vitis/passes/feature_check.py

Lines changed: 20 additions & 3 deletions
@@ -14,7 +14,7 @@ def transform(self, model, node):
         node.set_attr('implementation', 'linebuffer')
 
 
-class ValidateStrategy(OptimizerPass):
+class ValidateResourceStrategy(OptimizerPass):
     _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense']
 
     def match(self, node):
@@ -29,6 +29,23 @@ def transform(self, model, node):
         if rf > n_in and rf % n_in > 0:
             print(
                 f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis '
-                'backend due to use of "urem" cores.\n'
-                'Consider using a different ReuseFactor or switching to "Latency" strategy.'
+                'backend due to use of "urem" cores in Vitis HLS <= 2022.1.\n'
+                'Consider using a different ReuseFactor or switching to "Latency" strategy if using older versions '
+                'of Vitis HLS.'
             )
+
+
+class ValidateUnrolledStrategy(OptimizerPass):
+    _unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM']
+
+    def match(self, node):
+        is_unrolled_layer = len([layer_cls for layer_cls in self._unrolled_layer_cls if layer_cls in node.class_name]) > 0
+        is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'unrolled'
+
+        return is_unrolled_layer and is_unrolled_strategy
+
+    def transform(self, model, node):
+        print(
+            f'WARNING: "Unrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in Vitis backend.\n'
+            'Verify that the final design satisfies the latency/II constraints.'
+        )
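
A minimal sketch of when the new ValidateUnrolledStrategy pass fires; FakeNode is a hypothetical stand-in for an hls4ml layer object, not part of the library:

class FakeNode:
    """Hypothetical stand-in exposing the two attributes the pass inspects."""

    def __init__(self, class_name, attrs=None):
        self.class_name = class_name
        self._attrs = attrs or {}

    def get_attr(self, key, default=None):
        return self._attrs.get(key, default)

_unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM']

def matches(node):
    # Substring match, so derived names like a hypothetical 'QDense' also match
    is_unrolled_layer = any(cls in node.class_name for cls in _unrolled_layer_cls)
    is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'unrolled'
    return is_unrolled_layer and is_unrolled_strategy

print(matches(FakeNode('QDense', {'strategy': 'Unrolled'})))      # True ('Dense' in 'QDense')
print(matches(FakeNode('Dense')))                                 # False (defaults to 'latency')
print(matches(FakeNode('Activation', {'strategy': 'unrolled'})))  # False (not a dense-like layer)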

hls4ml/backends/vitis/vitis_backend.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ def __init__(self):
     def _register_flows(self):
         validation_passes = [
             'vitis:validate_conv_implementation',
-            'vitis:validate_strategy',
+            'vitis:validate_resource_strategy',
+            'vitis:validate_unrolled_strategy',
         ]
         validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name)
 
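
The registered pass names appear to follow hls4ml's convention of deriving a snake_case name from the optimizer class name, which is why renaming ValidateStrategy to ValidateResourceStrategy also changes the string here; a sketch of that mapping (the helper hls4ml actually uses may differ):

import re

def pass_name(cls_name: str) -> str:
    # Insert '_' before each interior capital, then lowercase: CamelCase -> snake_case
    return re.sub(r'(?<!^)(?=[A-Z])', '_', cls_name).lower()

print('vitis:' + pass_name('ValidateResourceStrategy'))  # vitis:validate_resource_strategy
print('vitis:' + pass_name('ValidateUnrolledStrategy'))  # vitis:validate_unrolled_strategy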
