Skip to content

Commit 60b430f

Browse files
authored
[ARM] Add VisSegmentation NEON + OMP support (#710)
* [Android] Add VisSegmentation NEON support * [ARM] change vqaddq_u8 -> vaddq_u8 * [ARM] change vqaddq_u8 -> vaddq_u8 * [Bug Fix] add FDASSERT * update assert info * add QuantizeBlendingWeight8 * Update QuantizeBlendingWeight8 * Update VisSegmentation * [Visualize] add DefaultVisualizeType and EnableFastVisuzlie * fix typos * fix typo * Update VisSegmentation * [Android] Add omp parallel support for Android * Add omp schedule(static)
1 parent 129dda7 commit 60b430f

File tree

3 files changed

+161
-26
lines changed

3 files changed

+161
-26
lines changed

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,10 +480,6 @@ else()
480480
set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS_RELEASE -s)
481481
endif()
482482

483-
#find_package(OpenMP)
484-
#if(OpenMP_CXX_FOUND)
485-
# list(APPEND DEPEND_LIBS OpenMP::OpenMP_CXX)
486-
#endif()
487483
set_target_properties(${LIBRARY_NAME} PROPERTIES VERSION ${FASTDEPLOY_VERSION})
488484
if(MSVC)
489485
# disable warnings for dll export
@@ -493,6 +489,10 @@ endif()
493489
if (ANDROID)
494490
find_library(log-lib log)
495491
list(APPEND DEPEND_LIBS ${log-lib})
492+
find_package(OpenMP)
493+
if(OpenMP_CXX_FOUND)
494+
list(APPEND DEPEND_LIBS OpenMP::OpenMP_CXX)
495+
endif()
496496
endif()
497497

498498
target_link_libraries(${LIBRARY_NAME} ${DEPEND_LIBS})

fastdeploy/vision/visualize/segmentation.cc

Lines changed: 153 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,148 @@
1717
#include "fastdeploy/vision/visualize/visualize.h"
1818
#include "opencv2/highgui.hpp"
1919
#include "opencv2/imgproc/imgproc.hpp"
20+
#ifdef __ARM_NEON
21+
#include <arm_neon.h>
22+
#endif
2023

2124
namespace fastdeploy {
2225
namespace vision {
2326

24-
cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
25-
float weight) {
26-
auto color_map = GenerateColorMap(1000);
27+
#ifdef __ARM_NEON
28+
// Quantizes a floating-point blending weight into two integer multipliers in
// eighths, so blending can be done with an 8-bit shift and multiply:
//
//   blended ~= (old_pixel >> 3) * old_multi_factor
//            + (new_pixel >> 3) * new_multi_factor
//
// The weight is first clamped to [0, 1] (NaN is treated as 0) and then
// truncated towards zero to a multiple of 1/8:
//   if 0/8 <= w < 1/8, w ~ 0/8  -> new = 0, old = 8
//   if 1/8 <= w < 2/8, w ~ 1/8  -> new = 1, old = 7
//   if 2/8 <= w < 3/8, w ~ 2/8  -> new = 2, old = 6
//   ...
//   if 4/8 <= w < 5/8, w ~ 4/8  -> new = 4, old = 4
//   if        w >= 1,  w ~ 8/8  -> new = 8, old = 0
// The shift is always 3 bits; only the multipliers differ. A finer
// quantization (e.g. shifting right by 7 bits) would reduce most 8-bit
// values to zero, so 3 bits is used as the approximation.
//
// @param weight            Blending weight of the new (mask) color.
// @param old_multi_factor  Out: multiplier for the original pixel, 8 - new.
// @param new_multi_factor  Out: multiplier for the new pixel, in [0, 8].
static inline void QuantizeBlendingWeight8(
    float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) {
  // Clamp before converting: casting a float whose truncated value does not
  // fit in uint8_t is undefined behavior, and a factor above 8 would make
  // (8 - factor) wrap around. The `weight > 0.0f` test is false for NaN.
  float clamped = 0.0f;
  if (weight > 0.0f) {
    clamped = (weight < 1.0f) ? weight : 1.0f;
  }
  const uint8_t weight_quantize = static_cast<uint8_t>(clamped * 8.0f);
  *new_multi_factor = weight_quantize;
  *old_multi_factor = static_cast<uint8_t>(8 - weight_quantize);
}
42+
43+
static cv::Mat FastVisSegmentationNEON(
44+
const cv::Mat& im, const SegmentationResult& result,
45+
float weight, bool quantize_weight = true) {
2746
int64_t height = result.shape[0];
2847
int64_t width = result.shape[1];
2948
auto vis_img = cv::Mat(height, width, CV_8UC3);
49+
50+
int32_t size = static_cast<int32_t>(height * width);
51+
uint8_t *vis_ptr = static_cast<uint8_t*>(vis_img.data);
52+
const uint8_t *label_ptr = static_cast<const uint8_t*>(result.label_map.data());
53+
const uint8_t *im_ptr = static_cast<const uint8_t*>(im.data);
3054

31-
int64_t index = 0;
32-
for (int i = 0; i < height; i++) {
33-
for (int j = 0; j < width; j++) {
34-
int category_id = result.label_map[index++];
35-
vis_img.at<cv::Vec3b>(i, j)[0] = color_map[3 * category_id + 0];
36-
vis_img.at<cv::Vec3b>(i, j)[1] = color_map[3 * category_id + 1];
37-
vis_img.at<cv::Vec3b>(i, j)[2] = color_map[3 * category_id + 2];
55+
if (!quantize_weight) {
56+
#pragma omp parallel for num_threads(2) schedule(static)
57+
for (int i = 0; i < size - 15; i += 16) {
58+
uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes
59+
// e.g 0b00000001 << 7 -> 0b10000000 128;
60+
uint8x16x3_t vbgrx16x3;
61+
vbgrx16x3.val[0] = vshlq_n_u8(labelx16, 7);
62+
vbgrx16x3.val[1] = vshlq_n_u8(labelx16, 4);
63+
vbgrx16x3.val[2] = vshlq_n_u8(labelx16, 3);
64+
vst3q_u8(vis_ptr + i * 3, vbgrx16x3);
3865
}
66+
for (int i = size - 15; i < size; i++) {
67+
uint8_t label = label_ptr[i];
68+
vis_ptr[i * 3 + 0] = (label << 7);
69+
vis_ptr[i * 3 + 1] = (label << 4);
70+
vis_ptr[i * 3 + 2] = (label << 3);
71+
}
72+
// Blend colors use opencv
73+
cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
74+
return vis_img;
3975
}
40-
cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
76+
77+
// Quantize the weight to boost blending performance.
78+
// After that, we can directly use shift instructions
79+
// to blend the colors from input im and mask. Please
80+
// check QuantizeBlendingWeight8 for more details.
81+
uint8_t old_multi_factor, new_multi_factor;
82+
QuantizeBlendingWeight8(weight, &old_multi_factor,
83+
&new_multi_factor);
84+
if (new_multi_factor == 0) {
85+
return im; // Only keep origin image.
86+
}
87+
88+
if (new_multi_factor == 8) {
89+
// Only keep mask, no need to blending with origin image.
90+
#pragma omp parallel for num_threads(2) schedule(static)
91+
for (int i = 0; i < size - 15; i += 16) {
92+
uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes
93+
// e.g 0b00000001 << 7 -> 0b10000000 128;
94+
uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
95+
uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
96+
uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
97+
uint8x16x3_t vbgr16x3;
98+
vbgr16x3.val[0] = mbx16;
99+
vbgr16x3.val[1] = mgx16;
100+
vbgr16x3.val[2] = mrx16;
101+
vst3q_u8(vis_ptr + i * 3, vbgr16x3);
102+
}
103+
for (int i = size - 15; i < size; i++) {
104+
uint8_t label = label_ptr[i];
105+
vis_ptr[i * 3 + 0] = (label << 7);
106+
vis_ptr[i * 3 + 1] = (label << 4);
107+
vis_ptr[i * 3 + 2] = (label << 3);
108+
}
109+
return vis_img;
110+
}
111+
112+
uint8x16_t old_mulx16 = vdupq_n_u8(old_multi_factor);
113+
uint8x16_t new_mulx16 = vdupq_n_u8(new_multi_factor);
114+
// Blend the two colors together with quantize 'weight'.
115+
#pragma omp parallel for num_threads(2) schedule(static)
116+
for (int i = 0; i < size - 15; i += 16) {
117+
uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes
118+
uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes
119+
uint8x16_t ibx16 = bgrx16x3.val[0];
120+
uint8x16_t igx16 = bgrx16x3.val[1];
121+
uint8x16_t irx16 = bgrx16x3.val[2];
122+
// e.g 0b00000001 << 7 -> 0b10000000 128;
123+
uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
124+
uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
125+
uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
126+
// TODO: keep the pixels of input im if mask = 0
127+
uint8x16_t ibx16_mshr, igx16_mshr, irx16_mshr;
128+
uint8x16_t mbx16_mshr, mgx16_mshr, mrx16_mshr;
129+
// Moving 7 bits to the right tends to result in zero,
130+
// So, We choose to shift 3 bits to get an approximation
131+
ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_mulx16);
132+
igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_mulx16);
133+
irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_mulx16);
134+
mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_mulx16);
135+
mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_mulx16);
136+
mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_mulx16);
137+
uint8x16x3_t vbgr16x3;
138+
vbgr16x3.val[0] = vaddq_u8(ibx16_mshr, mbx16_mshr);
139+
vbgr16x3.val[1] = vaddq_u8(igx16_mshr, mgx16_mshr);
140+
vbgr16x3.val[2] = vaddq_u8(irx16_mshr, mrx16_mshr);
141+
// Store the blended pixels to vis img
142+
vst3q_u8(vis_ptr + i * 3, vbgr16x3);
143+
}
144+
for (int i = size - 15; i < size; i++) {
145+
uint8_t label = label_ptr[i];
146+
vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor
147+
+ ((label << 7) >> 3) * new_multi_factor;
148+
vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor
149+
+ ((label << 4) >> 3) * new_multi_factor;
150+
vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor
151+
+ ((label << 3) >> 3) * new_multi_factor;
152+
}
41153
return vis_img;
42154
}
155+
#endif
43156

44-
cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
45-
const SegmentationResult& result) {
46-
FDWARNING << "DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
47-
"deprecated, please use fastdeploy::vision:VisSegmentation "
48-
"function instead."
49-
<< std::endl;
50-
auto color_map = GetColorMap();
157+
static cv::Mat VisSegmentationCommonCpu(
158+
const cv::Mat& im, const SegmentationResult& result,
159+
float weight) {
160+
// Use the native c++ version without any optimization.
161+
auto color_map = GenerateColorMap(1000);
51162
int64_t height = result.shape[0];
52163
int64_t width = result.shape[1];
53164
auto vis_img = cv::Mat(height, width, CV_8UC3);
@@ -61,10 +172,33 @@ cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
61172
vis_img.at<cv::Vec3b>(i, j)[2] = color_map[3 * category_id + 2];
62173
}
63174
}
64-
cv::addWeighted(im, .5, vis_img, .5, 0, vis_img);
175+
cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
65176
return vis_img;
66177
}
67178

179+
// Renders the segmentation `result` on top of `im` using blending weight
// `weight` and returns the rendered image. The implementation is chosen at
// compile time depending on whether ARM NEON is available.
cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
                        float weight) {
  // TODO: Support SSE/AVX on x86_64 platforms
#ifndef __ARM_NEON
  // No NEON: plain C++ implementation.
  return VisSegmentationCommonCpu(im, result, weight);
#else
  // NEON build: vectorized implementation with the weight quantized.
  return FastVisSegmentationNEON(im, result, weight, true);
#endif
}
188+
189+
// Deprecated entry point kept for backward compatibility. Emits a
// deprecation warning and renders the segmentation result with a fixed
// blending weight of 0.5.
cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
                                   const SegmentationResult& result) {
  FDWARNING << "DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
               "deprecated, please use fastdeploy::vision::VisSegmentation "
               "function instead."
            << std::endl;
  // Delegate to the free function so the NEON / common-CPU dispatch lives in
  // one place. The name is fully qualified because an unqualified call would
  // resolve to this member function.
  return fastdeploy::vision::VisSegmentation(im, result, 0.5f);
}
201+
68202
} // namespace vision
69203
} // namespace fastdeploy
70204
#endif

fastdeploy/vision/visualize/visualize.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@
1818
namespace fastdeploy {
1919
namespace vision {
2020

21-
int Visualize::num_classes_ = 0;
22-
std::vector<int> Visualize::color_map_ = std::vector<int>();
23-
2421
static std::vector<int> global_fd_vis_color_map = std::vector<int>();
2522

2623
std::vector<int> GenerateColorMap(int num_classes) {
@@ -42,6 +39,10 @@ std::vector<int> GenerateColorMap(int num_classes) {
4239
return color_map;
4340
}
4441

42+
// This class will be deprecated; please do not use it.
43+
int Visualize::num_classes_ = 0;
44+
std::vector<int> Visualize::color_map_ = std::vector<int>();
45+
4546
const std::vector<int>& Visualize::GetColorMap(int num_classes) {
4647
if (num_classes < num_classes_) {
4748
return color_map_;

0 commit comments

Comments
 (0)