|
47 | 47 | #include "layers_common.hpp"
|
48 | 48 | #include <opencv2/dnn/shape_utils.hpp>
|
49 | 49 |
|
| 50 | +#include <opencv2/core/utils/logger.hpp> |
| 51 | + |
50 | 52 | #ifdef HAVE_OPENCL
|
51 | 53 | #include "opencl_kernels_dnn.hpp"
|
52 | 54 | #endif
|
@@ -197,58 +199,168 @@ class SliceLayerImpl : public SliceLayer
|
197 | 199 | finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
|
198 | 200 | }
|
199 | 201 | }
|
| 202 | + |
| 203 | +#if 0 |
| 204 | + std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl; |
| 205 | + for (int i = 0; i < outputs.size(); ++i) |
| 206 | + { |
| 207 | + for (int j = 0; j < finalSliceRanges[i].size(); ++j) |
| 208 | + { |
| 209 | + std::cout << finalSliceRanges[i][j]; |
| 210 | + } |
| 211 | + std::cout << std::endl; |
| 212 | + } |
| 213 | +#endif |
200 | 214 | }
|
201 | 215 |
|
202 | 216 | #ifdef HAVE_OPENCL
|
203 | 217 | bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
|
204 | 218 | {
|
205 |
| -#if 1 |
206 |
| - // TODO fix that (brokes YOLOv4-tiny) |
207 |
| - return false; |
208 |
| -#else |
209 | 219 | std::vector<UMat> inputs;
|
210 | 220 | std::vector<UMat> outputs;
|
211 | 221 |
|
212 |
| - bool use_half = (inputs_.depth() == CV_16S); |
213 | 222 | inputs_.getUMatVector(inputs);
|
214 | 223 | outputs_.getUMatVector(outputs);
|
215 | 224 |
|
216 |
| - if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) || |
217 |
| - (total(shape(outputs[0]), 2) % 4 != 0)) |
| 225 | + CV_Assert(outputs.size() == finalSliceRanges.size()); |
| 226 | + |
| 227 | + const UMat& input = inputs[0]; |
| 228 | + if (input.dims > 5) |
| 229 | + { |
| 230 | + CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU"); |
218 | 231 | return false;
|
| 232 | + } |
219 | 233 |
|
220 |
| - String opts; |
221 |
| - if (use_half) |
222 |
| - opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8"; |
223 |
| - else |
224 |
| - opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8"; |
225 |
| - const UMat& inpMat = inputs[0]; |
| 234 | + size_t WSZ = 128; |
| 235 | + |
| 236 | + const int dims = input.dims; |
| 237 | + const int elemSize = (int)input.elemSize(); |
| 238 | + String opts0 = cv::format( |
| 239 | + "-DDIMS=%d -DELEMSIZE=%d", |
| 240 | + dims, elemSize |
| 241 | + ); |
| 242 | + for (int d = 0; d < dims; d++) |
| 243 | + { |
| 244 | + opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]); |
| 245 | + } |
| 246 | + String kname = cv::format("slice_%d", dims); |
226 | 247 | for (size_t i = 0; i < outputs.size(); i++)
|
227 | 248 | {
|
228 |
| - int groups = outputs[i].size[0]; |
229 |
| - int channels = outputs[i].size[1]; |
230 |
| - int rows = outputs[i].size[2]; |
231 |
| - int cols = outputs[i].size[3]; |
232 |
| -
|
233 |
| - ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts); |
234 |
| - size_t local[] = { 128 }; |
235 |
| - size_t global[] = { (size_t)groups * channels / 4 * local[0] }; |
236 |
| - int idx = 0; |
237 |
| - kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat)); |
238 |
| - kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3])); |
239 |
| - kernel.set(idx++, (int)(rows * cols)); |
240 |
| - kernel.set(idx++, (int)inpMat.size[3]); |
241 |
| - kernel.set(idx++, (int)cols); |
242 |
| - kernel.set(idx++, (int)finalSliceRanges[i][2].start); |
243 |
| - kernel.set(idx++, (int)finalSliceRanges[i][3].start); |
244 |
| - kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i])); |
245 |
| - bool ret = kernel.run(1, global, local, false); |
| 249 | + UMat& output = outputs[i]; |
| 250 | + const std::vector<Range>& range = finalSliceRanges[i]; |
| 251 | + |
| 252 | + String opts = opts0; |
| 253 | + |
| 254 | + CV_CheckEQ(output.dims, dims, ""); |
| 255 | + for (int d = 0; d < dims; d++) |
| 256 | + { |
| 257 | + opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d", |
| 258 | + d, (int)output.step[dims - 1 - d], |
| 259 | + d, (int)output.size[dims - 1 - d], |
| 260 | + d, (int)range[dims - 1 - d].start |
| 261 | + ); |
| 262 | + CV_CheckEQ(range[d].size(), (int)output.size[d], ""); |
| 263 | + } |
| 264 | + |
| 265 | + int block_dims = 0; |
| 266 | + size_t block_size = elemSize; |
| 267 | + for (int i = dims - 1; i >= 0; --i) |
| 268 | + { |
| 269 | + if (input.step[i] != output.step[i]) |
| 270 | + break; |
| 271 | + block_size *= output.size[i]; |
| 272 | + block_dims++; |
| 273 | + } |
| 274 | + |
| 275 | + const size_t total = output.total() * elemSize; |
| 276 | + size_t num_blocks = total / block_size; |
| 277 | + |
| 278 | + if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64)) |
| 279 | + { |
| 280 | + // use 1D copy mode |
| 281 | + opts += cv::format(" -DUSE_COPY_1D=1"); |
| 282 | + |
| 283 | + opts += cv::format(" -DBLOCK_DIMS=%d", block_dims); |
| 284 | + opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims); |
| 285 | + opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size); |
| 286 | + |
| 287 | + opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size); |
| 288 | + } |
| 289 | + else |
| 290 | + { |
| 291 | + // use 2D copy mode |
| 292 | + int block_cols = block_size; |
| 293 | + int block_dims_contiguous = block_dims; |
| 294 | + size_t input_base_step = input.step[dims - 1 - block_dims_contiguous]; |
| 295 | + size_t output_base_step = output.step[dims - 1 - block_dims_contiguous]; |
| 296 | + |
| 297 | + size_t block_rows = 1; |
| 298 | + for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i) |
| 299 | + { |
| 300 | + if (input.step[i] * output_base_step != output.step[i] * input_base_step) |
| 301 | + break; |
| 302 | + block_rows *= output.size[i]; |
| 303 | + block_dims++; |
| 304 | + } |
| 305 | + |
| 306 | + block_size *= block_rows; |
| 307 | + |
| 308 | + num_blocks = total / block_size; |
| 309 | + |
| 310 | + if (block_rows > 1) |
| 311 | + { |
| 312 | + opts += cv::format(" -DBLOCK_DIMS=%d", block_dims); |
| 313 | + opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous); |
| 314 | + opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size); |
| 315 | + |
| 316 | + opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols); |
| 317 | + |
| 318 | + opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows); |
| 319 | + opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step); |
| 320 | + } |
| 321 | + else |
| 322 | + { |
| 323 | + // use 1D copy mode |
| 324 | + opts += cv::format(" -DUSE_COPY_1D=1"); |
| 325 | + |
| 326 | + opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous); |
| 327 | + opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous); |
| 328 | + opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size); |
| 329 | + |
| 330 | + opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size); |
| 331 | + } |
| 332 | + } |
| 333 | + |
| 334 | + const size_t MIN_WORK_ITEMS = 16; |
| 335 | + if (block_size <= 4 * MIN_WORK_ITEMS) |
| 336 | + WSZ = 4; |
| 337 | + else if (block_size <= 8 * MIN_WORK_ITEMS) |
| 338 | + WSZ = 8; |
| 339 | + else if (block_size <= 16 * MIN_WORK_ITEMS) |
| 340 | + WSZ = 16; |
| 341 | + else if (block_size <= 32 * MIN_WORK_ITEMS) |
| 342 | + WSZ = 32; |
| 343 | + else if (block_size <= 64 * MIN_WORK_ITEMS) |
| 344 | + WSZ = 64; |
| 345 | + |
| 346 | + opts += cv::format(" -DWSZ=%d", (int)WSZ); |
| 347 | + |
| 348 | + size_t local[] = { WSZ, 1 }; |
| 349 | + size_t global[] = { WSZ, num_blocks }; |
| 350 | + |
| 351 | + ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts); |
| 352 | + if (kernel.empty()) |
| 353 | + return false; |
| 354 | + bool ret = kernel.args( |
| 355 | + ocl::KernelArg::PtrReadOnly(input), |
| 356 | + ocl::KernelArg::PtrWriteOnly(output) |
| 357 | + ) |
| 358 | + .run(2, global, local, false); |
246 | 359 | if (!ret)
|
247 | 360 | return false;
|
248 |
| - } |
| 361 | + } // for outputs.size() |
249 | 362 |
|
250 | 363 | return true;
|
251 |
| -#endif |
252 | 364 | }
|
253 | 365 | #endif
|
254 | 366 |
|
|
0 commit comments