@@ -21,7 +21,7 @@ void kernel_shift_tr_2d(
21
21
#pragma HLS PIPELINE II = 1
22
22
KernelShiftHeight: for (unsigned i_ih = 0 ; i_ih < CONFIG_T::trfilt_height; i_ih++) {
23
23
KernelShiftChannel: for (unsigned i_ic = 0 ; i_ic < CONFIG_T::n_chan; i_ic++) {
24
- // Shift every element in kernel_window to the left
24
+ // Shift every element in kernel_window to the left
25
25
kernel_window[i_ih * CONFIG_T::trfilt_width * CONFIG_T::n_chan + i_iw * CONFIG_T::n_chan + i_ic] = kernel_window[i_ih * CONFIG_T::trfilt_width * CONFIG_T::n_chan + (i_iw + 1 ) * CONFIG_T::n_chan + i_ic];
26
26
}
27
27
}
@@ -69,53 +69,50 @@ void shift_line_buffer_tr(const data_T& in_elem,
69
69
}
70
70
71
71
template <typename CONFIG_T>
72
- void load_tr_kern_weights (
73
- typename CONFIG_T::weight_t weights[CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
74
- typename CONFIG_T::weight_t kernel_weights[
75
- CONFIG_T::n_filt * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan
72
+ void load_trfilt_weights (
73
+ typename CONFIG_T::weight_t trfilt_weights[CONFIG_T::stride_height][CONFIG_T::stride_width][
74
+ CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_filt * CONFIG_T::n_chan
76
75
],
77
- const int weight_x_start,
78
- const int weight_y_start
76
+ typename CONFIG_T::weight_t weights[
77
+ CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan * CONFIG_T::n_filt
78
+ ]
79
79
)
80
80
{
81
- int x_indices[CONFIG_T::trfilt_width];
82
- int y_indices[CONFIG_T::trfilt_height];
83
- for (int step = 0 ; step < CONFIG_T::trfilt_width; step++) {
84
- x_indices[step] = weight_x_start - step * CONFIG_T::stride_width;
85
- }
86
- for (int step = 0 ; step < CONFIG_T::trfilt_height; step++) {
87
- y_indices[step] = weight_y_start - step * CONFIG_T::stride_height;
88
- }
89
-
90
- WeightsLoop: for (int x_step = 0 ; x_step < CONFIG_T::trfilt_width; x_step++) {
91
- #pragma HLS UNROLL
92
- #pragma HLS PIPELINE
93
- for (int y_step = 0 ; y_step < CONFIG_T::trfilt_height; y_step++) {
94
- #pragma HLS UNROLL
95
- #pragma HLS PIPELINE
96
- for (int filt_ind = 0 ; filt_ind < CONFIG_T::n_filt; filt_ind++) {
97
- #pragma HLS UNROLL
98
- #pragma HLS PIPELINE
99
- for (int chan_ind = 0 ; chan_ind < CONFIG_T::n_chan; chan_ind++) {
100
- #pragma HLS UNROLL
101
- #pragma HLS PIPELINE
102
- if (x_indices[x_step] >= CONFIG_T::filt_width || y_indices[y_step] >= CONFIG_T::filt_height) {
103
- kernel_weights[
104
- filt_ind * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
105
- y_step * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
106
- x_step * CONFIG_T::n_chan + chan_ind
107
- ] = 0 ;
108
- }
109
- else {
110
- kernel_weights[
111
- filt_ind * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
112
- y_step * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
113
- x_step * CONFIG_T::n_chan + chan_ind
114
- ] = weights[
115
- filt_ind * CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan +
116
- y_indices[y_step] * CONFIG_T::filt_width * CONFIG_T::n_chan +
117
- x_indices[x_step] * CONFIG_T::n_chan + chan_ind
118
- ];
81
+ #pragma HLS INLINE
82
+ // pull out the individual filter weights (split kernel into stride_height x stride_width kernels)
83
+ TrfiltWeightsLoop: for (unsigned i_sh = 0 ; i_sh < CONFIG_T::stride_height; i_sh++) {
84
+ #pragma HLS UNROLL
85
+ for (unsigned i_sw = 0 ; i_sw < CONFIG_T::stride_width; i_sw++) {
86
+ #pragma HLS UNROLL
87
+ for (unsigned i_fh = 0 ; i_fh < CONFIG_T::trfilt_height; i_fh++) {
88
+ #pragma HLS UNROLL
89
+ for (unsigned i_fw = 0 ; i_fw < CONFIG_T::trfilt_width; i_fw++) {
90
+ #pragma HLS UNROLL
91
+ unsigned filt_h_ind = i_sh + (CONFIG_T::trfilt_height-i_fh-1 )*CONFIG_T::stride_height;
92
+ unsigned filt_w_ind = i_sw + (CONFIG_T::trfilt_width-i_fw-1 )*CONFIG_T::stride_width;
93
+ for (unsigned i_nf = 0 ; i_nf < CONFIG_T::n_filt; i_nf++) {
94
+ #pragma HLS UNROLL
95
+ for (unsigned i_nc = 0 ; i_nc < CONFIG_T::n_chan; i_nc++) {
96
+ #pragma HLS UNROLL
97
+ if (filt_h_ind < CONFIG_T::filt_height && filt_w_ind < CONFIG_T::filt_width) {
98
+ trfilt_weights[i_sh][i_sw][
99
+ i_nf * CONFIG_T::n_chan * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width +
100
+ i_fh * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
101
+ i_fw * CONFIG_T::n_chan + i_nc
102
+ ]= weights[
103
+ i_nf * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width +
104
+ filt_h_ind * CONFIG_T::n_chan * CONFIG_T::filt_width +
105
+ filt_w_ind * CONFIG_T::n_chan + i_nc
106
+ ];
107
+ }
108
+ else {
109
+ trfilt_weights[i_sh][i_sw][
110
+ i_nf * CONFIG_T::n_chan * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width +
111
+ i_fh * CONFIG_T::trfilt_width * CONFIG_T::n_chan +
112
+ i_fw * CONFIG_T::n_chan + i_nc
113
+ ] = 0 ;
114
+ }
115
+ }
119
116
}
120
117
}
121
118
}
@@ -141,64 +138,67 @@ void compute_output_buffer_tr_2d(
141
138
static typename data_T::value_type kernel_data[CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan];
142
139
#pragma HLS ARRAY_PARTITION variable=kernel_data complete
143
140
144
- typename CONFIG_T::weight_t kernel_weights [
145
- CONFIG_T::n_filt * CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_chan
141
+ static typename CONFIG_T::weight_t trfilt_weights[CONFIG_T::stride_height][CONFIG_T::stride_width] [
142
+ CONFIG_T::trfilt_height * CONFIG_T::trfilt_width * CONFIG_T::n_filt * CONFIG_T::n_chan
146
143
];
147
144
145
+ load_trfilt_weights<CONFIG_T>(trfilt_weights, weights);
146
+
148
147
typename res_T::value_type res_out[CONFIG_T::n_filt];
149
148
#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0
150
149
151
150
static typename res_T::value_type output_buffer[
152
151
CONFIG_T::in_width*CONFIG_T::stride_width*CONFIG_T::stride_height*CONFIG_T::n_filt
153
152
];
153
+ #pragma HLS ARRAY_PARTITION variable=output_buffer complete dim = 0
154
154
155
155
res_T res_pack;
156
156
#pragma HLS DATA_PACK variable = res_pack
157
157
158
158
// Add pixel to the buffer
159
159
nnet::shift_line_buffer_tr<data_T, CONFIG_T>(in_elem, line_buffer, kernel_data);
160
160
161
- int weight_x_start = CONFIG_T::stride_width * (CONFIG_T::trfilt_width-1 );
162
- int weight_y_start = CONFIG_T::stride_height * (CONFIG_T::trfilt_height-1 );
161
+ HeightStrideLoop: for (int w_idx = 0 ; w_idx < CONFIG_T::stride_width; w_idx++) {
162
+ // #pragma HLS PIPELINE
163
+ #pragma HLS UNROLL
164
+ WidthStrideLoop: for (int h_idx = 0 ; h_idx < CONFIG_T::stride_height; h_idx++) {
165
+ #pragma HLS UNROLL
163
166
164
- WidthStrideLoop: for (int h_idx = 0 ; h_idx < CONFIG_T::stride_height; h_idx++) {
165
- weight_x_start = CONFIG_T::stride_height * (CONFIG_T::trfilt_width-1 );
166
- HeightStrideLoop: for (int w_idx = 0 ; w_idx < CONFIG_T::stride_width; w_idx++) {
167
- load_tr_kern_weights<CONFIG_T>(
168
- weights, kernel_weights, weight_x_start, weight_y_start
169
- );
167
+ #pragma HLS INLINE region
170
168
171
169
if (CONFIG_T::strategy == nnet::latency) {
172
170
dense_latency<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
173
- kernel_data, res_out, kernel_weights , biases
171
+ kernel_data, res_out, trfilt_weights[h_idx][w_idx] , biases
174
172
);
175
173
} else {
176
174
dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
177
- kernel_data, res_out, kernel_weights , biases
175
+ kernel_data, res_out, trfilt_weights[h_idx][w_idx] , biases
178
176
);
179
177
}
180
178
181
179
BufferOutputLoop: for (unsigned i_ic = 0 ; i_ic < CONFIG_T::n_filt; i_ic++) {
180
+ #pragma HLS UNROLL
182
181
output_buffer[
183
182
(pX*CONFIG_T::stride_width+w_idx)*CONFIG_T::stride_height*CONFIG_T::n_filt +
184
183
h_idx*CONFIG_T::n_filt + i_ic
185
184
] = res_out[i_ic];
185
+ // res_pack[i_ic] = res_out[i_ic];
186
186
}
187
+ // res_stream.write(res_pack);
187
188
188
- weight_x_start++;
189
189
}
190
- weight_y_start++;
191
190
}
192
191
193
- // Counter Housekeeping
194
- if (pX + 1 == CONFIG_T::in_width) // HAVE TO THINK ABOUT oX, oY STUFF. NOT AS EASY AS INCREMENTING
195
- {
192
+ // Counter housekeeping and writing out the buffered output
193
+ if (pX + 1 == CONFIG_T::in_width) {
196
194
pX = 0 ;
197
- // write all of the buffered output
198
- for (int h_idx = 0 ; h_idx < CONFIG_T::stride_height; h_idx++) {
195
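+ // write only the buffered outputs that fall inside the cropped window: rows in [pad_top, pad_top + out_height), columns in [pad_left, pad_left + out_width)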
196
+ HeightOutputLoop: for (unsigned h_idx = 0 ; h_idx < CONFIG_T::stride_height; h_idx++) {
197
+ // #pragma HLS PIPELINE
199
198
if (pY*CONFIG_T::stride_height + h_idx >= CONFIG_T::pad_top &&
200
- pY*CONFIG_T::stride_height +h_idx < CONFIG_T::pad_top + CONFIG_T::out_height) {
201
- for (int oX = CONFIG_T::pad_left; oX < CONFIG_T::pad_left + CONFIG_T::out_width; oX++) {
199
+ pY*CONFIG_T::stride_height + h_idx < CONFIG_T::pad_top + CONFIG_T::out_height) {
200
+ WidthOutputLoop: for (unsigned oX = CONFIG_T::pad_left; oX < CONFIG_T::pad_left + CONFIG_T::out_width; oX++) {
201
+ #pragma HLS PIPELINE
202
202
CastLoop: for (unsigned i_ic = 0 ; i_ic < CONFIG_T::n_filt; i_ic++) {
203
203
#pragma HLS UNROLL
204
204
res_pack[i_ic] = output_buffer[
@@ -219,7 +219,7 @@ void compute_output_buffer_tr_2d(
219
219
} else {
220
220
pX = pX + 1 ;
221
221
}
222
-
222
+
223
223
}
224
224
225
225
template <class data_T , class res_T , typename CONFIG_T>
0 commit comments