from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer
from hls4ml.model.types import FixedPrecisionType, Source
from hls4ml.utils.dependency import requires
-from hls4ml.utils.einsum_utils import parse_einsum  # noqa: F401

if typing.TYPE_CHECKING:
    from hls4ml.model import ModelGraph
@@ -66,8 +65,8 @@ def _(node: Dense):
@get_kernel_inp_kif.register(Conv1D)
@get_kernel_inp_kif.register(Conv2D)
def _(layer: Conv1D | Conv2D):
-    assert layer.attributes.attributes['data_format'] == 'channels_last', 'Only channels_last format is supported'
-    kernel = layer.attributes.attributes['weight'].data
+    assert layer.attributes['data_format'] == 'channels_last', 'Only channels_last format is supported'
+    kernel = layer.attributes['weight'].data
    k_in, i_in, f_in = _get_input_kif(layer)
    k_in, i_in, f_in = pad_arrs(layer, 0, k_in, i_in, f_in)
    k_in, i_in, f_in = im2col(kernel.shape, k_in, i_in, f_in)
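For intuition, the k/i/f arrays carry per-element precision bounds (keep_negative flag, integer bits, fractional bits) of the layer input; `pad_arrs` and `im2col` rearrange them so that each output position's receptive field lines up element-wise with the flattened kernel before the DA CMVM code is generated. Below is a minimal, purely illustrative numpy sketch of that im2col step on a 1D bit-width array; the helper here is not the hls4ml `im2col` API.

```python
import numpy as np

# Illustrative only: integer-bit widths of a length-6, single-channel input.
i_in = np.array([8, 8, 4, 4, 2, 2])

def im2col_1d(arr, kernel_size):
    # Stack the receptive field of every valid output position into rows, so each
    # row lines up element-wise with the flattened kernel for that output pixel.
    n_out = arr.shape[0] - kernel_size + 1
    return np.stack([arr[j:j + kernel_size] for j in range(n_out)])

print(im2col_1d(i_in, kernel_size=3))
# [[8 8 4]
#  [8 4 4]
#  [4 4 2]
#  [4 2 2]]
```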
@@ -149,16 +148,10 @@ def transform(self, model: 'ModelGraph', node: Layer):
        node.set_attr('da_codegen', Source(fn_str))


-dense_da_stream_template = '''struct config{index} {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_out = {n_out};
-    static const unsigned io_type = nnet::io_stream;
-    static const unsigned strategy = nnet::distributed_arithmetic;
-    constexpr static auto dense_da = nnet::dense_da_{index}<typename {inp_t}::value_type, typename {out_t}::value_type>;
-}};\n'''
-
-
class FuseQuantizerIntoDALayers(OptimizerPass):
+    """Heterogeneous quantizers can be fused into the DA CMVM kernel in some cases.
+    This allows heterogeneous quantization for io_stream in some cases."""
+
    def match(self, node: Layer):
        if not isinstance(node, FixedPointQuantizer):
            return False
@@ -203,9 +196,18 @@ def transform(self, model: 'ModelGraph', node: FixedPointQuantizer):
        return True


+dense_da_stream_template = '''struct config{index} {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_out = {n_out};
+    static const unsigned io_type = nnet::io_stream;
+    static const unsigned strategy = nnet::distributed_arithmetic;
+    constexpr static auto dense_da = nnet::dense_da_{index}<typename {inp_t}::value_type, typename {out_t}::value_type>;
+}};\n'''
+
+
class DALatencyDenseTemplate(OptimizerPass):
-    # For Dense, distributed arithmetic do not call the original, regardless of the io_type
-    # FOr io_stream, a minimal config will still be generated
+    # For Dense, distributed arithmetic does not call the original impl, regardless of the io_type
+    # For io_stream, a minimal config will still be generated
    def match(self, node: Layer):
        if node.class_name != 'Dense':
            return False
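For reference, a rough sketch of what the relocated `dense_da_stream_template` renders to once `transform` formats it with a node's attributes; the index, widths, and type names below are hypothetical, and `dense_da_stream_template` is assumed to be in scope from this file.

```python
# Hypothetical attribute values standing in for **node.attributes of a Dense node.
example_cfg = dense_da_stream_template.format(
    index=4,
    n_in=16,
    n_out=8,
    inp_t='layer4_input_t',
    out_t='result_t',
)
print(example_cfg)
# struct config4 {
#     static const unsigned n_in = 16;
#     static const unsigned n_out = 8;
#     static const unsigned io_type = nnet::io_stream;
#     static const unsigned strategy = nnet::distributed_arithmetic;
#     constexpr static auto dense_da = nnet::dense_da_4<typename layer4_input_t::value_type, typename result_t::value_type>;
# };
```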
@@ -225,10 +227,10 @@ def transform(self, model: 'ModelGraph', node: Layer):
        if io_type == 'io_parallel':
            fn_name = f'dense_da_{node.index}<{inp_t}, {out_t}>'
            function_cpp = f'{namespace}::{fn_name}({inp_name}, {out_name});'
-            node.attributes.attributes['function_cpp'] = function_cpp
+            node.attributes['function_cpp'] = function_cpp
        else:
            assert io_type == 'io_stream'
-            config_cpp = dense_da_stream_template.format(inp_t=inp_t, out_t=out_t, **node.attributes.attributes)
+            config_cpp = dense_da_stream_template.format(inp_t=inp_t, out_t=out_t, **node.attributes)
            function_cpp = f'nnet::dense<{inp_t}, {out_t}, config{node.index}>({inp_name}, {out_name});'
            node.attributes['config_cpp'] = config_cpp
            node.attributes['function_cpp'] = function_cpp
@@ -300,8 +302,8 @@ def transform(self, model: 'ModelGraph', node: Layer):
            class_name = class_name[9:]

        ndim = len(ker_shape) - 2
-        function_cpp = f'nnet::conv{ndim}d_da_cl<config{node.index}, {inp_t}, {out_t}>({inp_name}, {out_name});'
-        node.attributes.attributes['function_cpp'] = function_cpp
+        function_cpp = f'nnet::conv{ndim}d_cl<config{node.index}, {inp_t}, {out_t}>({inp_name}, {out_name});'
+        node.attributes['function_cpp'] = function_cpp

        # config generation
        params = node.attributes.attributes.copy()
@@ -314,15 +316,15 @@ def transform(self, model: 'ModelGraph', node: Layer):
        params.setdefault('stride_height', -1 if ndim == 1 else 1)

        config_cpp = conv_da_parallel_template.format(inp_t=inp_t, out_t=out_t, n_pixels=n_pixels, **params)
-        node.attributes.attributes['config_cpp'] = config_cpp
+        node.attributes['config_cpp'] = config_cpp

        # Only unrolled header is required for io_parallel
        include_headers = [
            'nnet_utils/nnet_da_wrappers.h',
            f'nnet_utils/nnet_{class_name.lower()}.h',
            'nnet_utils/nnet_conv_stream.h',  # some properties defined in config need this
        ]
-        node.attributes.attributes['include_header'] = include_headers
+        node.attributes['include_header'] = include_headers

        # avoid output weights and bias; alternative entry point does not use them
        del node.attributes['weight_data']
@@ -333,6 +335,18 @@ def transform(self, model: 'ModelGraph', node: Layer):
        del node.attributes['bias_t']


+kernel_fn_template = '''
+template <typename inp_t, typename out_t>
+void einsum_dense{index}_da_kernel(
+    inp_t inp_tpose[{inp_tpose}],
+    out_t out_tpose[{out_tpose}],
+    int l0
+) {{
+    {fn_call_str}
+}}
+'''
+
+
class DistributedArithmeticEinsumCodegen(OptimizerPass):
    '''Generates C++ code for distributed arithmetic implementation of Dense layers'''
@@ -373,16 +387,13 @@ def transform(self, model: 'ModelGraph', node: Layer):
            fn_call = f'{fn_name}(&inp_tpose[({i} * {L_data} + l0) * {C}], &out_tpose[({i} * {L_data} + l0) * {L_ker}]);'
            fn_calls.append(fn_call)

-        kernel_fn = f'''
-        template <typename inp_t, typename out_t>
-        void einsum_dense{node.index}_da_kernel(
-            inp_t inp_tpose[{L_data * C * I}],
-            out_t out_tpose[{L_data * L_ker * I}],
-            int l0
-        ) {{
-            {" ".join(fn_calls)}
-        }}
-        '''
+        kernel_fn = kernel_fn_template.format(
+            index=node.index,
+            inp_tpose=L_data * C * I,
+            out_tpose=L_data * L_ker * I,
+            fn_call_str='\n    '.join(fn_calls),
+        )
+
        code_gen = '\n\n'.join(fn_strs) + '\n\n' + kernel_fn
        node.attributes['da_codegen'] = Source(code_gen)
        del node.attributes['weight_data']
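For comparison with the removed inline f-string, a hedged sketch of what `kernel_fn_template.format(...)` renders to for a hypothetical EinsumDense node; the index, sizes, and per-slice function name are made up for illustration, and `kernel_fn_template` is assumed to be in scope from this file.

```python
# Hypothetical: node.index = 7, L_data = 4, C = 3, L_ker = 2, I = 1, and one
# generated per-slice DA call (the function name is illustrative).
fn_calls = ['einsum_dense7_da_0(&inp_tpose[(0 * 4 + l0) * 3], &out_tpose[(0 * 4 + l0) * 2]);']
kernel_fn = kernel_fn_template.format(
    index=7,
    inp_tpose=4 * 3 * 1,  # L_data * C * I
    out_tpose=4 * 2 * 1,  # L_data * L_ker * I
    fn_call_str='\n    '.join(fn_calls),
)
print(kernel_fn)
# template <typename inp_t, typename out_t>
# void einsum_dense7_da_kernel(
#     inp_t inp_tpose[12],
#     out_t out_tpose[8],
#     int l0
# ) {
#     einsum_dense7_da_0(&inp_tpose[(0 * 4 + l0) * 3], &out_tpose[(0 * 4 + l0) * 2]);
# }
```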