17
17
#include " fastdeploy/vision/visualize/visualize.h"
18
18
#include " opencv2/highgui.hpp"
19
19
#include " opencv2/imgproc/imgproc.hpp"
20
+ #ifdef __ARM_NEON
21
+ #include < arm_neon.h>
22
+ #endif
20
23
21
24
namespace fastdeploy {
22
25
namespace vision {
23
26
24
- cv::Mat VisSegmentation (const cv::Mat& im, const SegmentationResult& result,
25
- float weight) {
26
- auto color_map = GenerateColorMap (1000 );
27
+ #ifdef __ARM_NEON
28
// Quantizes a floating point blending weight into a pair of integer
// multipliers in eighths, so that blending can be carried out with
// 8-bit shifts and multiplies instead of floating point math:
//
//   blended ~= (old >> 3) * old_multi_factor + (new >> 3) * new_multi_factor
//
// @param weight            Blending weight of the new (mask) image. Values
//                          outside [0, 1] are clamped; NaN is treated as 0.
// @param old_multi_factor  Receives 8 - q, the multiplier of the old image.
// @param new_multi_factor  Receives q, the multiplier of the new image.
//
// q is weight * 8 truncated toward zero, i.e.
//   k/8 <= w < (k+1)/8  ->  q = k      (k = 0..7)
//   w == 1              ->  q = 8
// The two factors therefore always add up to exactly 8.
//
// Only 3 bits are shifted out (rather than 7) because shifting an 8-bit
// channel right by 7 tends to result in a zero value.
static inline void QuantizeBlendingWeight8(
    float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) {
  // Clamp first: without it a weight above 1 makes `8 - q` wrap around in
  // uint8_t, and a negative or NaN weight is an undefined float -> uint8_t
  // conversion. `!(weight > 0.0f)` is also true for NaN.
  float clamped_weight = weight;
  if (!(clamped_weight > 0.0f)) {
    clamped_weight = 0.0f;
  } else if (clamped_weight > 1.0f) {
    clamped_weight = 1.0f;
  }
  const uint8_t weight_quantize =
      static_cast<uint8_t>(clamped_weight * 8.0f);
  *new_multi_factor = weight_quantize;
  *old_multi_factor = static_cast<uint8_t>(8 - weight_quantize);
}
42
+
43
+ static cv::Mat FastVisSegmentationNEON (
44
+ const cv::Mat& im, const SegmentationResult& result,
45
+ float weight, bool quantize_weight = true ) {
27
46
int64_t height = result.shape [0 ];
28
47
int64_t width = result.shape [1 ];
29
48
auto vis_img = cv::Mat (height, width, CV_8UC3);
49
+
50
+ int32_t size = static_cast <int32_t >(height * width);
51
+ uint8_t *vis_ptr = static_cast <uint8_t *>(vis_img.data );
52
+ const uint8_t *label_ptr = static_cast <const uint8_t *>(result.label_map .data ());
53
+ const uint8_t *im_ptr = static_cast <const uint8_t *>(im.data );
30
54
31
- int64_t index = 0 ;
32
- for (int i = 0 ; i < height; i++) {
33
- for (int j = 0 ; j < width; j++) {
34
- int category_id = result.label_map [index++];
35
- vis_img.at <cv::Vec3b>(i, j)[0 ] = color_map[3 * category_id + 0 ];
36
- vis_img.at <cv::Vec3b>(i, j)[1 ] = color_map[3 * category_id + 1 ];
37
- vis_img.at <cv::Vec3b>(i, j)[2 ] = color_map[3 * category_id + 2 ];
55
+ if (!quantize_weight) {
56
+ #pragma omp parallel for num_threads(2) schedule(static)
57
+ for (int i = 0 ; i < size - 15 ; i += 16 ) {
58
+ uint8x16_t labelx16 = vld1q_u8 (label_ptr + i); // 16 bytes
59
+ // e.g 0b00000001 << 7 -> 0b10000000 128;
60
+ uint8x16x3_t vbgrx16x3;
61
+ vbgrx16x3.val [0 ] = vshlq_n_u8 (labelx16, 7 );
62
+ vbgrx16x3.val [1 ] = vshlq_n_u8 (labelx16, 4 );
63
+ vbgrx16x3.val [2 ] = vshlq_n_u8 (labelx16, 3 );
64
+ vst3q_u8 (vis_ptr + i * 3 , vbgrx16x3);
38
65
}
66
+ for (int i = size - 15 ; i < size; i++) {
67
+ uint8_t label = label_ptr[i];
68
+ vis_ptr[i * 3 + 0 ] = (label << 7 );
69
+ vis_ptr[i * 3 + 1 ] = (label << 4 );
70
+ vis_ptr[i * 3 + 2 ] = (label << 3 );
71
+ }
72
+ // Blend colors use opencv
73
+ cv::addWeighted (im, 1.0 - weight, vis_img, weight, 0 , vis_img);
74
+ return vis_img;
39
75
}
40
- cv::addWeighted (im, 1.0 - weight, vis_img, weight, 0 , vis_img);
76
+
77
+ // Quantize the weight to boost blending performance.
78
+ // After that, we can directly use shift instructions
79
+ // to blend the colors from input im and mask. Please
80
+ // check QuantizeBlendingWeight8 for more details.
81
+ uint8_t old_multi_factor, new_multi_factor;
82
+ QuantizeBlendingWeight8 (weight, &old_multi_factor,
83
+ &new_multi_factor);
84
+ if (new_multi_factor == 0 ) {
85
+ return im; // Only keep origin image.
86
+ }
87
+
88
+ if (new_multi_factor == 8 ) {
89
+ // Only keep mask, no need to blending with origin image.
90
+ #pragma omp parallel for num_threads(2) schedule(static)
91
+ for (int i = 0 ; i < size - 15 ; i += 16 ) {
92
+ uint8x16_t labelx16 = vld1q_u8 (label_ptr + i); // 16 bytes
93
+ // e.g 0b00000001 << 7 -> 0b10000000 128;
94
+ uint8x16_t mbx16 = vshlq_n_u8 (labelx16, 7 );
95
+ uint8x16_t mgx16 = vshlq_n_u8 (labelx16, 4 );
96
+ uint8x16_t mrx16 = vshlq_n_u8 (labelx16, 3 );
97
+ uint8x16x3_t vbgr16x3;
98
+ vbgr16x3.val [0 ] = mbx16;
99
+ vbgr16x3.val [1 ] = mgx16;
100
+ vbgr16x3.val [2 ] = mrx16;
101
+ vst3q_u8 (vis_ptr + i * 3 , vbgr16x3);
102
+ }
103
+ for (int i = size - 15 ; i < size; i++) {
104
+ uint8_t label = label_ptr[i];
105
+ vis_ptr[i * 3 + 0 ] = (label << 7 );
106
+ vis_ptr[i * 3 + 1 ] = (label << 4 );
107
+ vis_ptr[i * 3 + 2 ] = (label << 3 );
108
+ }
109
+ return vis_img;
110
+ }
111
+
112
+ uint8x16_t old_mulx16 = vdupq_n_u8 (old_multi_factor);
113
+ uint8x16_t new_mulx16 = vdupq_n_u8 (new_multi_factor);
114
+ // Blend the two colors together with quantize 'weight'.
115
+ #pragma omp parallel for num_threads(2) schedule(static)
116
+ for (int i = 0 ; i < size - 15 ; i += 16 ) {
117
+ uint8x16x3_t bgrx16x3 = vld3q_u8 (im_ptr + i * 3 ); // 48 bytes
118
+ uint8x16_t labelx16 = vld1q_u8 (label_ptr + i); // 16 bytes
119
+ uint8x16_t ibx16 = bgrx16x3.val [0 ];
120
+ uint8x16_t igx16 = bgrx16x3.val [1 ];
121
+ uint8x16_t irx16 = bgrx16x3.val [2 ];
122
+ // e.g 0b00000001 << 7 -> 0b10000000 128;
123
+ uint8x16_t mbx16 = vshlq_n_u8 (labelx16, 7 );
124
+ uint8x16_t mgx16 = vshlq_n_u8 (labelx16, 4 );
125
+ uint8x16_t mrx16 = vshlq_n_u8 (labelx16, 3 );
126
+ // TODO: keep the pixels of input im if mask = 0
127
+ uint8x16_t ibx16_mshr, igx16_mshr, irx16_mshr;
128
+ uint8x16_t mbx16_mshr, mgx16_mshr, mrx16_mshr;
129
+ // Moving 7 bits to the right tends to result in zero,
130
+ // So, We choose to shift 3 bits to get an approximation
131
+ ibx16_mshr = vmulq_u8 (vshrq_n_u8 (ibx16, 3 ), old_mulx16);
132
+ igx16_mshr = vmulq_u8 (vshrq_n_u8 (igx16, 3 ), old_mulx16);
133
+ irx16_mshr = vmulq_u8 (vshrq_n_u8 (irx16, 3 ), old_mulx16);
134
+ mbx16_mshr = vmulq_u8 (vshrq_n_u8 (mbx16, 3 ), new_mulx16);
135
+ mgx16_mshr = vmulq_u8 (vshrq_n_u8 (mgx16, 3 ), new_mulx16);
136
+ mrx16_mshr = vmulq_u8 (vshrq_n_u8 (mrx16, 3 ), new_mulx16);
137
+ uint8x16x3_t vbgr16x3;
138
+ vbgr16x3.val [0 ] = vaddq_u8 (ibx16_mshr, mbx16_mshr);
139
+ vbgr16x3.val [1 ] = vaddq_u8 (igx16_mshr, mgx16_mshr);
140
+ vbgr16x3.val [2 ] = vaddq_u8 (irx16_mshr, mrx16_mshr);
141
+ // Store the blended pixels to vis img
142
+ vst3q_u8 (vis_ptr + i * 3 , vbgr16x3);
143
+ }
144
+ for (int i = size - 15 ; i < size; i++) {
145
+ uint8_t label = label_ptr[i];
146
+ vis_ptr[i * 3 + 0 ] = (im_ptr[i * 3 + 0 ] >> 3 ) * old_multi_factor
147
+ + ((label << 7 ) >> 3 ) * new_multi_factor;
148
+ vis_ptr[i * 3 + 1 ] = (im_ptr[i * 3 + 1 ] >> 3 ) * old_multi_factor
149
+ + ((label << 4 ) >> 3 ) * new_multi_factor;
150
+ vis_ptr[i * 3 + 2 ] = (im_ptr[i * 3 + 2 ] >> 3 ) * old_multi_factor
151
+ + ((label << 3 ) >> 3 ) * new_multi_factor;
152
+ }
41
153
return vis_img;
42
154
}
155
+ #endif
43
156
44
- cv::Mat Visualize::VisSegmentation (const cv::Mat& im,
45
- const SegmentationResult& result) {
46
- FDWARNING << " DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
47
- " deprecated, please use fastdeploy::vision:VisSegmentation "
48
- " function instead."
49
- << std::endl;
50
- auto color_map = GetColorMap ();
157
+ static cv::Mat VisSegmentationCommonCpu (
158
+ const cv::Mat& im, const SegmentationResult& result,
159
+ float weight) {
160
+ // Use the native c++ version without any optimization.
161
+ auto color_map = GenerateColorMap (1000 );
51
162
int64_t height = result.shape [0 ];
52
163
int64_t width = result.shape [1 ];
53
164
auto vis_img = cv::Mat (height, width, CV_8UC3);
@@ -61,10 +172,33 @@ cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
61
172
vis_img.at <cv::Vec3b>(i, j)[2 ] = color_map[3 * category_id + 2 ];
62
173
}
63
174
}
64
- cv::addWeighted (im, . 5 , vis_img, . 5 , 0 , vis_img);
175
+ cv::addWeighted (im, 1.0 - weight , vis_img, weight , 0 , vis_img);
65
176
return vis_img;
66
177
}
67
178
179
+ cv::Mat VisSegmentation (const cv::Mat& im, const SegmentationResult& result,
180
+ float weight) {
181
+ // TODO: Support SSE/AVX on x86_64 platforms
182
+ #ifdef __ARM_NEON
183
+ return FastVisSegmentationNEON (im, result, weight, true );
184
+ #else
185
+ return VisSegmentationCommonCpu (im, result, weight);
186
+ #endif
187
+ }
188
+
189
+ cv::Mat Visualize::VisSegmentation (const cv::Mat& im,
190
+ const SegmentationResult& result) {
191
+ FDWARNING << " DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
192
+ " deprecated, please use fastdeploy::vision:VisSegmentation "
193
+ " function instead."
194
+ << std::endl;
195
+ #ifdef __ARM_NEON
196
+ return FastVisSegmentationNEON (im, result, 0 .5f , true );
197
+ #else
198
+ return VisSegmentationCommonCpu (im, result, 0 .5f );
199
+ #endif
200
+ }
201
+
68
202
} // namespace vision
69
203
} // namespace fastdeploy
70
204
#endif
0 commit comments