2
2
3
3
import numpy as np
4
4
5
- from hls4ml .model .layers import Conv1D , Conv2D , Dense
5
+ from hls4ml .model .layers import GRU , LSTM , Conv1D , Conv2D , Dense
6
6
from hls4ml .model .optimizer import OptimizerPass
7
7
from hls4ml .model .types import Source
8
8
@@ -60,8 +60,8 @@ class GenerateUnrolledDenseResource(OptimizerPass):
60
60
61
61
def match (self , node ):
62
62
# Only apply to layers that use Dense Matrix Multiplication
63
- # TODO - Extend (& test) for Conv1D / Separable Conv / Depthwise Conv / Recurrent layers
64
- layers_with_dense = (Dense , Conv2D )
63
+ # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers
64
+ layers_with_dense = (Dense , Conv1D , Conv2D , LSTM , GRU )
65
65
66
66
# Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
67
67
weights_transposed = node .get_attr ('_weights_transposed' , False )
@@ -70,23 +70,43 @@ def match(self, node):
70
70
rf_gt_one = node .get_attr ('reuse_factor' , 1 ) > 1
71
71
72
72
# User requested unrolled implementation of Dense
73
- is_unrolled = node .get_attr ('dense_resource_implementation ' , 'standard ' ) == 'unrolled'
73
+ is_unrolled = node .get_attr ('strategy ' , 'latency ' ) == 'unrolled'
74
74
75
75
return isinstance (node , layers_with_dense ) and weights_transposed and rf_gt_one and is_unrolled
76
76
77
77
def transform (self , model , node ):
78
- code_str = self . __generate_unrolled_dense_resource ( model , node )
79
- node .set_attr ( 'unrolled_dense_resource_codegen' , Source ( code_str ) )
78
+ if isinstance ( node , ( LSTM , GRU )):
79
+ n_in , n_out , n_in_recr , n_out_recr = node .model . config . backend . get_layer_mult_size ( node )
80
80
81
- def __generate_unrolled_dense_resource (self , model , node ):
81
+ reuse_factor = node .get_attr ('reuse_factor' )
82
+ weights = node .weights ['weight' ]
83
+ code_str = self ._generate_unrolled_function (n_in , n_out , reuse_factor , weights , str (node .index ) + '_1' )
84
+ node .set_attr ('unrolled_dense_resource_codegen_1' , Source (code_str ))
85
+
86
+ recr_reuse_factor = node .get_attr ('recurrent_reuse_factor' )
87
+ recr_weights = node .weights ['recurrent_weight' ]
88
+ code_str = self ._generate_unrolled_function (
89
+ n_in_recr , n_out_recr , recr_reuse_factor , recr_weights , str (node .index ) + '_2'
90
+ )
91
+ node .set_attr ('unrolled_dense_resource_codegen_2' , Source (code_str ))
92
+
93
+ else :
94
+ n_in , n_out = node .model .config .backend .get_layer_mult_size (node )
95
+ reuse_factor = node .get_attr ('reuse_factor' )
96
+ weights = node .weights ['weight' ]
97
+
98
+ code_str = self ._generate_unrolled_function (n_in , n_out , reuse_factor , weights , node .index )
99
+ node .set_attr ('unrolled_dense_resource_codegen' , Source (code_str ))
100
+
101
+ def _generate_unrolled_function (self , n_in , n_out , reuse_factor , weights , function_suffix ):
82
102
"""
83
103
Generate a C++ function that mimics the Dense Resource implementation.
84
104
85
105
The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
86
- Latency strategy can optimize zero mutiplications
106
+ Latency strategy can optimize zero multiplications
87
107
Resource strategy, on the other hand, cannot.
88
108
When all the weights in the same BRAM block are zero, Vivado is unable to optimize it
89
- With this (and additional TCL scripts) zero BRAM are optimised
109
+ With this (and additional TCL scripts) zero BRAM are optimized
90
110
91
111
Args:
92
112
node: Layer to generate code for
@@ -96,61 +116,58 @@ def __generate_unrolled_dense_resource(self, model, node):
96
116
97
117
# Variable instantiation and function pragmas
98
118
generated_code = (
99
- " template<class data_T, class res_T, typename CONFIG_T>\n "
100
- " class dense_unrolled_{index } : public DenseResourceUnrolled <data_T, res_T, CONFIG_T> {{\n "
101
- " public:\n "
102
- " static void dense_unrolled (\n "
103
- " data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n "
104
- " typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n "
105
- " typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n "
106
- " ) {{\n "
107
- " #pragma HLS pipeline II=CONFIG_T::reuse_factor\n "
108
- " \n "
109
- " constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n "
110
- " #pragma HLS function_instantiate variable=weights,biases\n "
111
- " #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n "
112
- " #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n "
113
- " #pragma HLS ARRAY_PARTITION variable=biases complete\n "
114
- " \n "
115
- " typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n "
116
- " #pragma HLS ARRAY_PARTITION variable=acc complete\n "
117
- " \n "
118
- " InitAccum:\n "
119
- " for (int i = 0; i < CONFIG_T::n_out; i++) {{\n "
120
- " #pragma HLS UNROLL\n "
121
- " acc[i] = (typename CONFIG_T::accum_t) biases[i];\n "
122
- " }}\n "
123
- " \n "
124
- ).format (index = node . index )
119
+ ' template<class data_T, class res_T, typename CONFIG_T>\n '
120
+ ' class dense_unrolled_{suffix } : public DenseKernel <data_T, res_T, CONFIG_T> {{\n '
121
+ ' public:\n '
122
+ ' static void dense (\n '
123
+ ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n '
124
+ ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n '
125
+ ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n '
126
+ ' ) {{\n '
127
+ ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n '
128
+ ' \n '
129
+ ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n '
130
+ ' #pragma HLS function_instantiate variable=weights,biases\n '
131
+ ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n '
132
+ ' #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n '
133
+ ' #pragma HLS ARRAY_PARTITION variable=biases complete\n '
134
+ ' \n '
135
+ ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n '
136
+ ' #pragma HLS ARRAY_PARTITION variable=acc complete\n '
137
+ ' \n '
138
+ ' InitAccum:\n '
139
+ ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n '
140
+ ' #pragma HLS UNROLL\n '
141
+ ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n '
142
+ ' }}\n '
143
+ ' \n '
144
+ ).format (suffix = function_suffix )
125
145
126
146
# Unrolled multiplication, according to the three cases
127
- n_in , n_out = node .model .config .backend .get_layer_mult_size (node )
128
- reuse_factor = node .get_attr ('reuse_factor' )
129
- weights = node .weights ['weight' ]
130
147
if reuse_factor <= n_in :
131
- mult_code = self .__generate_unrolled_mult_code_rf_leq_nin (n_in , n_out , reuse_factor , weights )
148
+ mult_code = self ._generate_unrolled_mult_code_rf_leq_nin (n_in , n_out , reuse_factor , weights )
132
149
elif reuse_factor > n_in and reuse_factor % n_in == 0 :
133
- mult_code = self .__generate_unrolled_mult_code_rf_gt_nin_rem0 (n_in , n_out , reuse_factor , weights )
150
+ mult_code = self ._generate_unrolled_mult_code_rf_gt_nin_rem0 (n_in , n_out , reuse_factor , weights )
134
151
else :
135
152
# This case shouldn't happen if my understanding of RF is correct
136
153
# The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
137
154
raise Exception ('Not implemented...' )
138
155
139
156
# Write output
140
- generated_code += mult_code + " \n "
157
+ generated_code += mult_code + ' \n '
141
158
generated_code += (
142
- " Result:\n "
143
- " for (int i = 0; i < CONFIG_T::n_out; i++) {\n "
144
- " #pragma HLS UNROLL\n "
145
- " res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n "
146
- " }\n "
147
- " }\n "
148
- " };\n "
159
+ ' Result:\n '
160
+ ' for (int i = 0; i < CONFIG_T::n_out; i++) {\n '
161
+ ' #pragma HLS UNROLL\n '
162
+ ' res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n '
163
+ ' }\n '
164
+ ' }\n '
165
+ ' };\n '
149
166
)
150
167
151
168
return generated_code
152
169
153
- def __generate_unrolled_mult_code_rf_leq_nin (self , n_in , n_out , reuse_factor , weights ):
170
+ def _generate_unrolled_mult_code_rf_leq_nin (self , n_in , n_out , reuse_factor , weights ):
154
171
# Function constants
155
172
mult_factor = min (n_in , reuse_factor )
156
173
block_factor = int (math .ceil (n_in * n_out / reuse_factor ))
@@ -162,24 +179,29 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we
162
179
# The new shape is (parallel_mult, reuse_factor)
163
180
zeros = np .sum (~ weights .data .reshape (block_factor , reuse_factor ).any (1 ))
164
181
182
+ # Used to pad the code to make it human-readable
183
+ indent = ' '
184
+
165
185
# Generate unrolled multiplications
166
- mult_code = f" \t \t #pragma HLS ALLOCATION operation instances=mul limit={ mult_limit - zeros } \n "
167
- mult_code += " \t \t MULT : {\n "
168
- mult_code += " \t \t \t #pragma HLS protocol\n "
186
+ mult_code = f' { indent * 2 } #pragma HLS ALLOCATION operation instances=mul limit={ mult_limit - zeros } \n '
187
+ mult_code += f' { indent * 2 } MULT : {{ \n '
188
+ mult_code += f' { indent * 3 } #pragma HLS protocol\n '
169
189
170
190
for ir in range (reuse_factor ):
171
191
acc_step = 0
172
192
out_index = 0
173
193
w_index = ir
174
194
in_index = ir
175
195
176
- mult_code += f" \t \t \t M { ir } : {{\n "
196
+ mult_code += f' { indent * 3 } M { ir } : {{\n '
177
197
for _ in range (block_factor ):
178
198
if weights .data .flatten ()[w_index ] != 0 :
179
- mult_code += f"\t \t \t \t acc[{ out_index } ] += \
180
- static_cast<typename CONFIG_T::accum_t>\
181
- (CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::\
182
- product(data[{ in_index } ], weights[{ w_index } ]));\n "
199
+ mult_code += (
200
+ f'{ indent * 4 } acc[{ out_index } ] += '
201
+ 'static_cast<typename CONFIG_T::accum_t>'
202
+ '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
203
+ f'product(data[{ in_index } ], weights[{ w_index } ]));\n '
204
+ )
183
205
184
206
w_index += reuse_factor
185
207
in_index += reuse_factor
@@ -191,13 +213,13 @@ def __generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, we
191
213
else :
192
214
acc_step += 1
193
215
194
- mult_code += " \t \t \t } \n "
216
+ mult_code += f' { indent * 3 } }} \n '
195
217
196
- mult_code += " \t \t } \n "
218
+ mult_code += f' { indent * 2 } }} \n '
197
219
198
220
return mult_code
199
221
200
- def __generate_unrolled_mult_code_rf_gt_nin_rem0 (self , n_in , n_out , reuse_factor , weights ):
222
+ def _generate_unrolled_mult_code_rf_gt_nin_rem0 (self , n_in , n_out , reuse_factor , weights ):
201
223
# Function constants
202
224
mult_factor = min (n_in , reuse_factor )
203
225
block_factor = int (math .ceil (n_in * n_out / reuse_factor ))
@@ -208,6 +230,9 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor
208
230
# The new shape is (parallel_mult, reuse_factor)
209
231
zeros = np .sum (~ weights .data .reshape (block_factor , reuse_factor ).any (1 ))
210
232
233
+ # Used to pad the code to make it human-readable
234
+ indent = ' '
235
+
211
236
# Generate out indices
212
237
outidx = [0 ] * reuse_factor
213
238
outstep = 0
@@ -221,32 +246,34 @@ def __generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor
221
246
in_index = 0
222
247
223
248
# Generate unrolled multiplications
224
- mult_code = f" \t \t #pragma HLS ALLOCATION operation instances=mul limit={ mult_limit - zeros } \n "
225
- mult_code += " \t \t MULT : {\n "
226
- mult_code += " \t \t \t #pragma HLS protocol\n "
249
+ mult_code = f' { indent * 2 } #pragma HLS ALLOCATION operation instances=mul limit={ mult_limit - zeros } \n '
250
+ mult_code += f' { indent * 2 } MULT : {{ \n '
251
+ mult_code += f' { indent * 3 } #pragma HLS protocol\n '
227
252
228
253
for ir in range (reuse_factor ):
229
254
w_index = ir
230
255
out_index = outidx [ir ]
231
256
232
- mult_code += f" \t \t \t M { ir } : {{\n "
257
+ mult_code += f' { indent * 3 } M { ir } : {{\n '
233
258
for _ in range (block_factor ):
234
259
if weights .data .flatten ()[w_index ] != 0 :
235
- mult_code += f"\t \t \t \t acc[{ int (out_index )} ] += \
236
- static_cast<typename CONFIG_T::accum_t>\
237
- (CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::\
238
- product(data[{ in_index } ], weights[{ w_index } ]));\n "
260
+ mult_code += (
261
+ f'{ indent * 4 } acc[{ int (out_index )} ] += '
262
+ 'static_cast<typename CONFIG_T::accum_t>'
263
+ '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
264
+ f'product(data[{ in_index } ], weights[{ w_index } ]));\n '
265
+ )
239
266
240
267
w_index += reuse_factor
241
268
if w_index > n_in * n_out :
242
269
break
243
270
out_index += outscale
244
- mult_code += " \t \t \t } \n "
271
+ mult_code += f' { indent * 3 } }} \n '
245
272
246
273
in_index += 1
247
274
if in_index >= n_in :
248
275
in_index = 0
249
276
250
- mult_code += " \t \t } \n "
277
+ mult_code += f' { indent * 2 } }} \n '
251
278
252
279
return mult_code
0 commit comments