Skip to content

Commit 09f24a8

Browse files
committed
Merge pull request opencv#17764 from alalek:issue_17762
2 parents bf8136e + 81e027e commit 09f24a8

File tree

4 files changed

+624
-107
lines changed

4 files changed

+624
-107
lines changed

modules/dnn/perf/perf_layer.cpp

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#include "perf_precomp.hpp"
6+
#include <opencv2/dnn/shape_utils.hpp>
7+
8+
namespace opencv_test {
9+
10+
struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
{
    // Builds a single-layer "Slice" network and benchmarks net.forward().
    // inputShape/begin/end are DIMS-element arrays: the input blob shape and the
    // half-open slice interval [begin[i], end[i]) along each axis.
    template<int DIMS>
    void test_slice(const int* inputShape, const int* begin, const int* end)
    {
        const int backendId = get<0>(GetParam());
        const int targetId = get<1>(GetParam());

        // Fill the input with a deterministic, non-constant pattern
        // (values wrap at 4096 to stay exactly representable in float).
        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
        float* inputData = input.ptr<float>();
        const int totalElems = (int)input.total();
        for (int j = 0; j < totalElems; ++j)
            inputData[j] = (float)(j & 4095);

        std::vector<Range> range(DIMS);
        for (int axis = 0; axis < DIMS; ++axis)
            range[axis] = Range(begin[axis], end[axis]);

        LayerParams lp;
        lp.type = "Slice";
        lp.name = "testLayer";
        lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
        lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));

        Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);

        // Warmup pass: also sanity-check that the sliced output is non-trivial.
        {
            net.setInput(input);
            net.setPreferableBackend(backendId);
            net.setPreferableTarget(targetId);
            Mat out = net.forward();

            EXPECT_GT(cv::norm(out, NORM_INF), 0);
#if 0
            //normAssert(out, input(range));
            cout << input(range).clone().reshape(1, 1) << endl;
            cout << out.reshape(1, 1) << endl;
#endif
        }

        TEST_CYCLE()
        {
            Mat res = net.forward();
        }

        SANITY_CHECK_NOTHING();
    }
};
57+
58+
59+
60+
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
{
    // Slices the upper channel half (32..64) of a 1x64x104x104 blob,
    // as encountered in YOLOv4-tiny.
    static const int shape[4] = {1, 64, 104, 104};
    static const int sliceBegin[4] = {0, 32, 0, 0};
    static const int sliceEnd[4] = {1, 64, 104, 104};
    test_slice<4>(shape, sliceBegin, sliceEnd);
}
67+
68+
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
{
    // Slices the upper channel half (64..128) of a 1x128x52x52 blob,
    // as encountered in YOLOv4-tiny.
    static const int shape[4] = {1, 128, 52, 52};
    static const int sliceBegin[4] = {0, 64, 0, 0};
    static const int sliceEnd[4] = {1, 128, 52, 52};
    test_slice<4>(shape, sliceBegin, sliceEnd);
}
75+
76+
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
{
    // Slices the upper channel half (128..256) of a 1x256x26x26 blob,
    // as encountered in YOLOv4-tiny.
    static const int shape[4] = {1, 256, 26, 26};
    static const int sliceBegin[4] = {0, 128, 0, 0};
    static const int sliceEnd[4] = {1, 256, 26, 26};
    test_slice<4>(shape, sliceBegin, sliceEnd);
}
83+
84+
85+
PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
{
    // Spatial crop (2-pixel border trim on H/W) of a 1x128x80x100 blob,
    // as encountered in the fast-neural-style (ECCV'16) model.
    static const int shape[4] = {1, 128, 80, 100};
    static const int sliceBegin[4] = {0, 0, 2, 2};
    static const int sliceEnd[4] = {1, 128, 76, 96};
    test_slice<4>(shape, sliceBegin, sliceEnd);
}
92+
93+
// Run the Layer_Slice cases over the available DNN backend/target pairs.
// NOTE(review): the two `false` flags presumably disable optional backends
// (see dnnBackendsAndTargets() defaults in the perf test framework) — confirm.
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));

} // namespace

modules/dnn/src/layers/slice_layer.cpp

Lines changed: 145 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
#include "layers_common.hpp"
4848
#include <opencv2/dnn/shape_utils.hpp>
4949

50+
#include <opencv2/core/utils/logger.hpp>
51+
5052
#ifdef HAVE_OPENCL
5153
#include "opencl_kernels_dnn.hpp"
5254
#endif
@@ -197,58 +199,168 @@ class SliceLayerImpl : public SliceLayer
197199
finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
198200
}
199201
}
202+
203+
#if 0
204+
std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl;
205+
for (int i = 0; i < outputs.size(); ++i)
206+
{
207+
for (int j = 0; j < finalSliceRanges[i].size(); ++j)
208+
{
209+
std::cout << finalSliceRanges[i][j];
210+
}
211+
std::cout << std::endl;
212+
}
213+
#endif
200214
}
201215

202216
#ifdef HAVE_OPENCL
203217
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
204218
{
205-
#if 1
206-
// TODO fix that (brokes YOLOv4-tiny)
207-
return false;
208-
#else
209219
std::vector<UMat> inputs;
210220
std::vector<UMat> outputs;
211221

212-
bool use_half = (inputs_.depth() == CV_16S);
213222
inputs_.getUMatVector(inputs);
214223
outputs_.getUMatVector(outputs);
215224

216-
if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) ||
217-
(total(shape(outputs[0]), 2) % 4 != 0))
225+
CV_Assert(outputs.size() == finalSliceRanges.size());
226+
227+
const UMat& input = inputs[0];
228+
if (input.dims > 5)
229+
{
230+
CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
218231
return false;
232+
}
219233

220-
String opts;
221-
if (use_half)
222-
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
223-
else
224-
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
225-
const UMat& inpMat = inputs[0];
234+
size_t WSZ = 128;
235+
236+
const int dims = input.dims;
237+
const int elemSize = (int)input.elemSize();
238+
String opts0 = cv::format(
239+
"-DDIMS=%d -DELEMSIZE=%d",
240+
dims, elemSize
241+
);
242+
for (int d = 0; d < dims; d++)
243+
{
244+
opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
245+
}
246+
String kname = cv::format("slice_%d", dims);
226247
for (size_t i = 0; i < outputs.size(); i++)
227248
{
228-
int groups = outputs[i].size[0];
229-
int channels = outputs[i].size[1];
230-
int rows = outputs[i].size[2];
231-
int cols = outputs[i].size[3];
232-
233-
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
234-
size_t local[] = { 128 };
235-
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
236-
int idx = 0;
237-
kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
238-
kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
239-
kernel.set(idx++, (int)(rows * cols));
240-
kernel.set(idx++, (int)inpMat.size[3]);
241-
kernel.set(idx++, (int)cols);
242-
kernel.set(idx++, (int)finalSliceRanges[i][2].start);
243-
kernel.set(idx++, (int)finalSliceRanges[i][3].start);
244-
kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
245-
bool ret = kernel.run(1, global, local, false);
249+
UMat& output = outputs[i];
250+
const std::vector<Range>& range = finalSliceRanges[i];
251+
252+
String opts = opts0;
253+
254+
CV_CheckEQ(output.dims, dims, "");
255+
for (int d = 0; d < dims; d++)
256+
{
257+
opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d",
258+
d, (int)output.step[dims - 1 - d],
259+
d, (int)output.size[dims - 1 - d],
260+
d, (int)range[dims - 1 - d].start
261+
);
262+
CV_CheckEQ(range[d].size(), (int)output.size[d], "");
263+
}
264+
265+
int block_dims = 0;
266+
size_t block_size = elemSize;
267+
for (int i = dims - 1; i >= 0; --i)
268+
{
269+
if (input.step[i] != output.step[i])
270+
break;
271+
block_size *= output.size[i];
272+
block_dims++;
273+
}
274+
275+
const size_t total = output.total() * elemSize;
276+
size_t num_blocks = total / block_size;
277+
278+
if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
279+
{
280+
// use 1D copy mode
281+
opts += cv::format(" -DUSE_COPY_1D=1");
282+
283+
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
284+
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims);
285+
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
286+
287+
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
288+
}
289+
else
290+
{
291+
// use 2D copy mode
292+
int block_cols = block_size;
293+
int block_dims_contiguous = block_dims;
294+
size_t input_base_step = input.step[dims - 1 - block_dims_contiguous];
295+
size_t output_base_step = output.step[dims - 1 - block_dims_contiguous];
296+
297+
size_t block_rows = 1;
298+
for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i)
299+
{
300+
if (input.step[i] * output_base_step != output.step[i] * input_base_step)
301+
break;
302+
block_rows *= output.size[i];
303+
block_dims++;
304+
}
305+
306+
block_size *= block_rows;
307+
308+
num_blocks = total / block_size;
309+
310+
if (block_rows > 1)
311+
{
312+
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
313+
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
314+
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
315+
316+
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols);
317+
318+
opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows);
319+
opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step);
320+
}
321+
else
322+
{
323+
// use 1D copy mode
324+
opts += cv::format(" -DUSE_COPY_1D=1");
325+
326+
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous);
327+
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
328+
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
329+
330+
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
331+
}
332+
}
333+
334+
const size_t MIN_WORK_ITEMS = 16;
335+
if (block_size <= 4 * MIN_WORK_ITEMS)
336+
WSZ = 4;
337+
else if (block_size <= 8 * MIN_WORK_ITEMS)
338+
WSZ = 8;
339+
else if (block_size <= 16 * MIN_WORK_ITEMS)
340+
WSZ = 16;
341+
else if (block_size <= 32 * MIN_WORK_ITEMS)
342+
WSZ = 32;
343+
else if (block_size <= 64 * MIN_WORK_ITEMS)
344+
WSZ = 64;
345+
346+
opts += cv::format(" -DWSZ=%d", (int)WSZ);
347+
348+
size_t local[] = { WSZ, 1 };
349+
size_t global[] = { WSZ, num_blocks };
350+
351+
ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
352+
if (kernel.empty())
353+
return false;
354+
bool ret = kernel.args(
355+
ocl::KernelArg::PtrReadOnly(input),
356+
ocl::KernelArg::PtrWriteOnly(output)
357+
)
358+
.run(2, global, local, false);
246359
if (!ret)
247360
return false;
248-
}
361+
} // for outputs.size()
249362

250363
return true;
251-
#endif
252364
}
253365
#endif
254366

0 commit comments

Comments (0)