|
1 |
| -import math |
2 |
| - |
3 |
| -import numpy as np |
4 |
| - |
5 |
| -from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense |
| 1 | +from hls4ml.model.layers import Conv1D, Conv2D |
6 | 2 | from hls4ml.model.optimizer import OptimizerPass
|
7 | 3 | from hls4ml.model.types import Source
|
8 | 4 |
|
@@ -53,227 +49,3 @@ def _generate_im2col_2d(self, node):
|
53 | 49 | )
|
54 | 50 |
|
55 | 51 | node.set_attr('line_buffer_codegen', Source(code_str))
|
56 |
| - |
57 |
| - |
58 |
class GenerateUnrolledDenseResource(OptimizerPass):
    '''Generates C++ code for unrolled Dense resource'''

    def match(self, node):
        """Return a truthy value when unrolled Dense code should be generated for ``node``.

        All of the following must hold:
          * the node is a Dense, Conv1D, Conv2D, LSTM or GRU layer,
          * its ``_weights_transposed`` attribute is set,
          * its ``reuse_factor`` attribute is greater than one,
          * its ``strategy`` attribute equals ``'unrolled'``.
        """
        # Only apply to layers that use Dense Matrix Multiplication
        # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers
        layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU)
        if not isinstance(node, layers_with_dense):
            return False

        # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer
        weights_transposed = node.get_attr('_weights_transposed', False)

        # RF = 1 will optimize DSPs anyway, so no need to unroll code
        rf_gt_one = node.get_attr('reuse_factor', 1) > 1

        # User requested unrolled implementation of Dense
        is_unrolled = node.get_attr('strategy', 'latency') == 'unrolled'

        return weights_transposed and rf_gt_one and is_unrolled

    def transform(self, model, node):
        """Generate the unrolled C++ kernel(s) for ``node`` and attach them as attributes.

        LSTM and GRU nodes get two kernels, stored in ``unrolled_dense_resource_codegen_1``
        (built from ``weight`` / ``reuse_factor``) and ``unrolled_dense_resource_codegen_2``
        (built from ``recurrent_weight`` / ``recurrent_reuse_factor``). Every other matched
        node gets a single kernel stored in ``unrolled_dense_resource_codegen``.

        Args:
            model: Model being optimized (not used directly by this pass).
            node: Layer to generate code for.
        """
        backend = node.model.config.backend

        if isinstance(node, (LSTM, GRU)):
            n_in, n_out, n_in_recr, n_out_recr = backend.get_layer_mult_size(node)

            # Kernel for the input weights
            code_str = self._generate_unrolled_function(
                n_in, n_out, node.get_attr('reuse_factor'), node.weights['weight'], str(node.index) + '_1'
            )
            node.set_attr('unrolled_dense_resource_codegen_1', Source(code_str))

            # Kernel for the recurrent weights
            code_str = self._generate_unrolled_function(
                n_in_recr,
                n_out_recr,
                node.get_attr('recurrent_reuse_factor'),
                node.weights['recurrent_weight'],
                str(node.index) + '_2',
            )
            node.set_attr('unrolled_dense_resource_codegen_2', Source(code_str))

        else:
            n_in, n_out = backend.get_layer_mult_size(node)
            code_str = self._generate_unrolled_function(
                n_in, n_out, node.get_attr('reuse_factor'), node.weights['weight'], node.index
            )
            node.set_attr('unrolled_dense_resource_codegen', Source(code_str))

    def _generate_unrolled_function(self, n_in, n_out, reuse_factor, weights, function_suffix):
        """
        Generate a C++ function that mimics the Dense Resource implementation.

        The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero.
        Latency strategy can optimize zero multiplications
        Resource strategy, on the other hand, cannot.
        When all the weights in the same BRAM block are zero, Vivado is unable to optimize it
        With this (and additional TCL scripts) zero BRAM are optimized

        Args:
            n_in: Number of inputs of the matrix multiplication
            n_out: Number of outputs of the matrix multiplication
            reuse_factor: Reuse factor the code is unrolled for
            weights: Weight variable; its ``data`` attribute is read as a NumPy array
            function_suffix: Suffix appended to ``dense_unrolled_`` to name the generated class
        Returns:
            generated_code: Generated C++ function (string)
        Raises:
            NotImplementedError: If ``reuse_factor`` is greater than ``n_in`` but not a multiple of it
        """

        # Variable instantiation and function pragmas
        # NOTE(review): the literals below are kept verbatim; their leading whitespace only affects
        # how the generated C++ is laid out - confirm it matches the intended formatting
        generated_code = (
            'template<class data_T, class res_T, typename CONFIG_T>\n'
            'class dense_unrolled_{suffix} : public DenseKernel<data_T, res_T, CONFIG_T> {{\n'
            ' public:\n'
            ' static void dense(\n'
            ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n'
            ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n'
            ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n'
            ' ) {{\n'
            ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n'
            '\n'
            ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n'
            ' #pragma HLS function_instantiate variable=weights,biases\n'
            ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n'
            ' #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM\n'
            ' #pragma HLS ARRAY_PARTITION variable=biases complete\n'
            '\n'
            ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n'
            ' #pragma HLS ARRAY_PARTITION variable=acc complete\n'
            '\n'
            ' InitAccum:\n'
            ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n'
            ' #pragma HLS UNROLL\n'
            ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n'
            ' }}\n'
            '\n'
        ).format(suffix=function_suffix)

        # Unrolled multiplication, according to the supported cases
        if reuse_factor <= n_in:
            mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights)
        elif reuse_factor % n_in == 0:
            mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights)
        else:
            # This case shouldn't happen if my understanding of RF is correct
            # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in
            raise NotImplementedError(
                'Unrolled Dense code generation requires reuse_factor <= n_in or reuse_factor % n_in == 0; '
                f'got reuse_factor={reuse_factor}, n_in={n_in}'
            )

        # Write output
        generated_code += mult_code + '\n'
        generated_code += (
            ' Result:\n'
            ' for (int i = 0; i < CONFIG_T::n_out; i++) {\n'
            ' #pragma HLS UNROLL\n'
            ' res[i] = cast<data_T, res_T, CONFIG_T>(acc[i]);\n'
            ' }\n'
            ' }\n'
            '};\n'
        )

        return generated_code

    def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights):
        """Generate the unrolled multiplications for the case ``reuse_factor <= n_in``.

        Args:
            n_in: Number of inputs of the matrix multiplication
            n_out: Number of outputs of the matrix multiplication
            reuse_factor: Reuse factor the code is unrolled for
            weights: Weight variable; ``weights.data`` must be reshapeable to (block_factor, reuse_factor)
        Returns:
            mult_code: C++ ``MULT`` block (string) with one multiply-accumulate per non-zero weight
        """
        # Function constants
        mult_factor = min(n_in, reuse_factor)
        block_factor = int(math.ceil(n_in * n_out / reuse_factor))
        mult_limit = int(math.ceil(n_in * n_out / mult_factor))
        mult_scale = mult_limit // n_out

        # Zero DSPs are the DSP blocks that always have zero input
        # In this case, it is the number of rows in the transposed and reshaped weight matrix
        # The new shape is (parallel_mult, reuse_factor)
        zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))

        # Flatten once; flatten() copies the array, so it must stay out of the loops below
        flat_weights = weights.data.flatten()

        # Used to pad the code to make it human-readable
        indent = ' '

        # Generate unrolled multiplications
        lines = [
            f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n',
            f'{indent*2}MULT: {{\n',
            f'{indent*3}#pragma HLS protocol\n',
        ]

        for ir in range(reuse_factor):
            acc_step = 0
            out_index = 0
            w_index = ir
            in_index = ir

            lines.append(f'{indent*3}M{ir}: {{\n')
            for _ in range(block_factor):
                # Zero weights contribute nothing, so no multiplication is emitted for them
                if flat_weights[w_index] != 0:
                    lines.append(
                        f'{indent*4}acc[{out_index}] += '
                        'static_cast<typename CONFIG_T::accum_t>'
                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
                        f'product(data[{in_index}], weights[{w_index}]));\n'
                    )

                w_index += reuse_factor
                in_index += reuse_factor
                if in_index >= n_in:
                    in_index = ir
                # Move to the next output after mult_scale consecutive steps
                if acc_step + 1 >= mult_scale:
                    acc_step = 0
                    out_index += 1
                else:
                    acc_step += 1

            lines.append(f'{indent*3}}}\n')

        lines.append(f'{indent*2}}}\n')

        return ''.join(lines)

    def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights):
        """Generate the unrolled multiplications for ``reuse_factor > n_in`` with ``reuse_factor % n_in == 0``.

        Args:
            n_in: Number of inputs of the matrix multiplication
            n_out: Number of outputs of the matrix multiplication
            reuse_factor: Reuse factor the code is unrolled for
            weights: Weight variable; ``weights.data`` must be reshapeable to (block_factor, reuse_factor)
        Returns:
            mult_code: C++ ``MULT`` block (string) with one multiply-accumulate per non-zero weight
        """
        # Function constants
        mult_factor = min(n_in, reuse_factor)
        block_factor = int(math.ceil(n_in * n_out / reuse_factor))
        mult_limit = int(math.ceil(n_in * n_out / mult_factor))
        n_weights = n_in * n_out

        # Zero DSPs are the DSP blocks that always have zero input
        # In this case, it is the number of rows in the transposed and reshaped weight matrix
        # The new shape is (parallel_mult, reuse_factor)
        zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1))

        # Flatten once; flatten() copies the array, so it must stay out of the loops below
        flat_weights = weights.data.flatten()

        # Used to pad the code to make it human-readable
        indent = ' '

        # Output index advances by this much for every block processed within one reuse step
        outscale = reuse_factor // n_in

        # Generate unrolled multiplications
        lines = [
            f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n',
            f'{indent*2}MULT: {{\n',
            f'{indent*3}#pragma HLS protocol\n',
        ]

        for ir in range(reuse_factor):
            w_index = ir
            # The starting output grows by one every n_in reuse steps, the input cycles through 0..n_in-1
            out_index = ir // n_in
            in_index = ir % n_in

            lines.append(f'{indent*3}M{ir}: {{\n')
            for _ in range(block_factor):
                if flat_weights[w_index] != 0:
                    lines.append(
                        f'{indent*4}acc[{out_index}] += '
                        'static_cast<typename CONFIG_T::accum_t>'
                        '(CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::'
                        f'product(data[{in_index}], weights[{w_index}]));\n'
                    )

                w_index += reuse_factor
                # n_weights itself is already one past the last valid weight index
                if w_index >= n_weights:
                    break
                out_index += outscale
            lines.append(f'{indent*3}}}\n')

        lines.append(f'{indent*2}}}\n')

        return ''.join(lines)
0 commit comments