Skip to content

Commit e90ba7f

Browse files
authored
Updates for unsupported CMSIS-NN backend (#118)
* CMSIS fully connected function with Q31 output * Allow and ignore weights < 8 bit
1 parent db36ec5 commit e90ba7f

File tree

10 files changed

+257
-34
lines changed

10 files changed

+257
-34
lines changed

assets/cmsis-nn/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
CC=gcc
22
CFLAGS=-I. -ICMSIS/Core/Include -ICMSIS/NN/Include -ICMSIS/DSP/Include -Wall -D__ARM_ARCH_6M__
3-
LIB_FILES=arm_convolve_HWC_q7_basic.o arm_pool_q7_HWC.o arm_relu_q7.o arm_fully_connected_q7_q8p7_opt.o arm_convolve_HWC_q7_fast.o arm_convolve_HWC_q7_basic_nonsquare.o arm_pool_q7_HWC_nonsquare.o arm_pool_nonsquare_q7_HWC_nonsquare.o arm_relu32_q7.o arm_fully_connected_q7.o
3+
LIB_FILES=arm_convolve_HWC_q7_basic.o arm_pool_q7_HWC.o arm_relu_q7.o arm_convolve_HWC_q7_fast.o arm_convolve_HWC_q7_basic_nonsquare.o arm_pool_q7_HWC_nonsquare.o arm_pool_nonsquare_q7_HWC_nonsquare.o arm_relu32_q7.o arm_fully_connected_q7.o arm_fully_connected_q7_q31.o
44

55
.PHONY: all
66
all: main

assets/cmsis-nn/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
The ‘izer’ includes an unsupported CMSIS-NN code generator. To use it:
44

55
1. Understand it is incomplete and unsupported.
6-
2. Use only networks **without any** Conv1d, ConvTranspose2d, element-wise operations, and without input sequences (concatenation). It does not support wide (32-bit) output either. Some or more of these features could be added without too much effort, any suggestions or pull requests are welcome.
6+
2. Use only networks **without any** Conv1d, ConvTranspose2d, element-wise operations, and without input sequences (concatenation). Some or all of these features could be added without too much effort; any suggestions or pull requests are welcome.
77
3. Understand that there is no proper build environment.
88

99
### Setup
@@ -33,7 +33,7 @@ This is very similar to generating code for MAX78000 (see `gen-demos-max7800.sh`
3333
Next, go to the target folder (`cmsis-demos/cifar-10` in the above example), and execute:
3434

3535
```shell
36-
(ai8x-synthesis) $ cd cmisis-demos/cifar-19
36+
(ai8x-synthesis) $ cd cmsis-demos/cifar-10
3737
(ai8x-synthesis) $ ./makelinks.sh
3838
(ai8x-synthesis) $ make
3939
```

assets/cmsis-nn/arm_convolve_HWC_q7_basic.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
* -------------------------------------------------------------------- */
3333
#include "arm_math.h"
3434
#include "arm_nnfunctions.h"
35+
#include "arm_nnsupportfunctions.h"
3536

3637
/**
3738
* @ingroup groupNN

assets/cmsis-nn/arm_convolve_HWC_q7_basic_nonsquare.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
* -------------------------------------------------------------------- */
3333
#include "arm_math.h"
3434
#include "arm_nnfunctions.h"
35+
#include "arm_nnsupportfunctions.h"
3536

3637
/**
3738
* @ingroup groupNN

assets/cmsis-nn/arm_convolve_HWC_q7_fast.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
#include "arm_math.h"
3535
#include "arm_nnfunctions.h"
36+
#include "arm_nnsupportfunctions.h"
3637

3738
/**
3839
* @ingroup groupNN
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
/*
2+
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
3+
*
4+
* SPDX-License-Identifier: Apache-2.0
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the License); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
14+
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
/* ----------------------------------------------------------------------
20+
* Project: CMSIS NN Library
21+
* Title: arm_fully_connected_q7_q31.c
22+
* Description: Q7 fully-connected layer function with Q31 output
23+
*
24+
* $Date: March 23, 2021
25+
* $Revision: V.1.0.0
26+
*
27+
* Target Processor: Cortex-M cores
28+
*
29+
* -------------------------------------------------------------------- */
30+
31+
#include "arm_nnfunctions.h"
32+
#include "arm_nnsupportfunctions.h"
33+
34+
/**
35+
* @ingroup groupNN
36+
*/
37+
38+
/**
39+
* @addtogroup FC
40+
* @{
41+
*/
42+
43+
/**
44+
* @brief Q7-Q31 fully-connected layer function
45+
* @param[in] pV pointer to input vector
46+
* @param[in] pM pointer to matrix weights
47+
* @param[in] dim_vec length of the vector
48+
* @param[in] num_of_rows number of rows in weight matrix
49+
* @param[in] bias_shift amount of left-shift for bias
50+
* @param[in] out_shift amount of right-shift for output
51+
* @param[in] bias pointer to bias
52+
* @param[in,out] pOut pointer to output vector
53+
* @param[in,out] vec_buffer pointer to buffer space for input
54+
* @return The function returns <code>ARM_MATH_SUCCESS</code>
55+
*
56+
* @details
57+
*
58+
* <b>Buffer size:</b>
59+
*
60+
* vec_buffer size: dim_vec
61+
*
62+
* Q7_Q31 version of the fully connected layer
63+
*
64+
* Input and weights are in q7_t and the output vector is in q31_t
65+
*
66+
*/
67+
68+
arm_status arm_fully_connected_q7_q31(const q7_t *pV,
69+
const q7_t *pM,
70+
const uint16_t dim_vec,
71+
const uint16_t num_of_rows,
72+
const uint16_t bias_shift,
73+
const uint16_t out_shift,
74+
const q7_t *bias,
75+
q31_t *pOut,
76+
q15_t *vec_buffer)
77+
{
78+
79+
#if defined(ARM_MATH_DSP)
80+
/* DSP path, compiled when ARM_MATH_DSP is defined (Cortex-M4 and Cortex-M7) */
81+
82+
const q7_t *pB = pM;
83+
const q7_t *pB2;
84+
q31_t *pO = pOut;
85+
const q7_t *pBias = bias;
86+
const q15_t *pA;
87+
uint16_t rowCnt = num_of_rows >> 1;
88+
89+
/* expand the q7 input vector pV into the q15 vec_buffer (reordered, no shift) */
90+
arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
91+
92+
while (rowCnt)
93+
{
94+
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
95+
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
96+
uint16_t colCnt = dim_vec >> 2;
97+
98+
pA = vec_buffer;
99+
pB2 = pB + dim_vec;
100+
101+
while (colCnt)
102+
{
103+
q31_t inV, inM11, inM12, inM21, inM22;
104+
pB = read_and_pad_reordered(pB, &inM11, &inM12);
105+
pB2 = read_and_pad_reordered(pB2, &inM21, &inM22);
106+
107+
inV = arm_nn_read_q15x2_ia(&pA);
108+
109+
sum = __SMLAD(inV, inM11, sum);
110+
sum2 = __SMLAD(inV, inM21, sum2);
111+
112+
inV = arm_nn_read_q15x2_ia(&pA);
113+
114+
sum = __SMLAD(inV, inM12, sum);
115+
sum2 = __SMLAD(inV, inM22, sum2);
116+
117+
colCnt--;
118+
}
119+
colCnt = dim_vec & 0x3;
120+
while (colCnt)
121+
{
122+
q7_t inV = *pA++;
123+
q15_t inM = *pB++;
124+
q15_t inM2 = *pB2++;
125+
126+
sum += inV * inM;
127+
sum2 += inV * inM2;
128+
colCnt--;
129+
} /* while over the dim_vec & 0x3 left-over columns */
130+
*pO++ = (q31_t)(__SSAT((sum >> out_shift), 32));
131+
*pO++ = (q31_t)(__SSAT((sum2 >> out_shift), 32));
132+
133+
/* pB has walked one row; skip the next row, which was consumed through pB2 */
134+
pB += dim_vec;
135+
rowCnt--;
136+
}
137+
138+
/* left-over single row when num_of_rows is odd */
139+
rowCnt = num_of_rows & 0x1;
140+
141+
while (rowCnt)
142+
{
143+
uint16_t colCnt = dim_vec >> 2;
144+
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
145+
146+
pA = vec_buffer;
147+
148+
while (colCnt)
149+
{
150+
q31_t inV1, inV2, inM11, inM12;
151+
152+
pB = read_and_pad_reordered(pB, &inM11, &inM12);
153+
154+
inV1 = arm_nn_read_q15x2_ia(&pA);
155+
sum = __SMLAD(inV1, inM11, sum);
156+
157+
inV2 = arm_nn_read_q15x2_ia(&pA);
158+
sum = __SMLAD(inV2, inM12, sum);
159+
160+
colCnt--;
161+
}
162+
163+
/* left-over vector elements when dim_vec is not a multiple of 4 */
164+
colCnt = dim_vec & 0x3;
165+
while (colCnt)
166+
{
167+
q7_t inV = *pA++;
168+
q15_t inM = *pB++;
169+
sum += inV * inM;
170+
colCnt--;
171+
}
172+
173+
*pO++ = (q31_t)(__SSAT((sum >> out_shift), 32));
174+
175+
rowCnt--;
176+
}
177+
178+
#else
179+
(void)vec_buffer;
180+
int i, j;
181+
/* Reference implementation without ARM_MATH_DSP (Cortex-M0 and Cortex-M3); vec_buffer is unused here */
182+
for (i = 0; i < num_of_rows; i++)
183+
{
184+
int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
185+
for (j = 0; j < dim_vec; j++)
186+
{
187+
ip_out += pV[j] * pM[i * dim_vec + j];
188+
}
189+
pOut[i] = (q31_t)__SSAT((ip_out >> out_shift), 32);
190+
}
191+
192+
#endif /* ARM_MATH_DSP */
193+
194+
/* Both paths always return ARM_MATH_SUCCESS */
195+
return (ARM_MATH_SUCCESS);
196+
}
197+
198+
/**
199+
* @} end of FC group
200+
*/

assets/cmsis-nn/cnn.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77

88
#include "weights.h"
99

10-
arm_status
11-
arm_fully_connected_q7_q8p7_opt(const q7_t * pV,
12-
const q7_t * pM,
13-
const uint16_t dim_vec,
14-
const uint16_t num_of_rows,
15-
const uint16_t bias_shift,
16-
const uint16_t out_shift, const q7_t * bias, q15_t * pOut, q15_t * vec_buffer);
10+
11+
arm_status arm_fully_connected_q7_q31(const q7_t *pV,
12+
const q7_t *pM,
13+
const uint16_t dim_vec,
14+
const uint16_t num_of_rows,
15+
const uint16_t bias_shift,
16+
const uint16_t out_shift,
17+
const q7_t *bias,
18+
q31_t *pOut,
19+
q15_t *vec_buffer);
1720

1821

1922
void arm_softmax_q8p7_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
@@ -72,4 +75,3 @@ void arm_avepool_nonsquare_q7_HWC_nonsquare(q7_t * Im_in,
7275

7376

7477
void arm_relu32_q7(q7_t * data, uint32_t size);
75-

attic/gen-cmsisdemos.sh

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
#!/bin/sh
2-
./ai8xize.py --verbose --log --test-dir demos --prefix mnist --checkpoint-file trained/ai85-mnist-qat8-q.pth.tar --config-file networks/mnist-chw-ai85.yaml --device CMSIS-NN "$@"
3-
./ai8xize.py --verbose --log --test-dir demos --prefix cifar-10 --checkpoint-file trained/ai85-cifar10-qat8-q.pth.tar --config-file networks/cifar10-hwc-ai85.yaml --device CMSIS-NN "$@"
4-
./ai8xize.py --verbose --log --test-dir demos --prefix kws20_v2 --checkpoint-file trained/ai85-kws20_v2-qat8-q.pth.tar --config-file networks/kws20-v2-hwc.yaml --device CMSIS-NN "$@"
5-
./ai8xize.py --verbose --log --test-dir demos --prefix faceid --checkpoint-file trained/ai85-faceid-qat8-q.pth.tar --config-file networks/faceid.yaml --device CMSIS-NN "$@"
2+
TARGET=demos
3+
COMMON_ARGS="--device CMSIS-NN"
64

7-
./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Conv1D --config-file tests/test-conv1d.yaml --device CMSIS-NN
8-
./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Conv1x1 --config-file tests/test-conv1x1.yaml --device CMSIS-NN
5+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix mnist --checkpoint-file trained/ai85-mnist-qat8-q.pth.tar --config-file networks/mnist-chw-ai85.yaml $COMMON_ARGS "$@"
6+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix cifar-10 --checkpoint-file tests/ai85-cifar10-qat8-q.pth.tar --config-file networks/cifar10-hwc-ai85.yaml $COMMON_ARGS "$@"
7+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix kws20_v2 --checkpoint-file trained/ai85-kws20_v2-qat8-q.pth.tar --config-file networks/kws20-v2-hwc.yaml $COMMON_ARGS "$@"
8+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix faceid --checkpoint-file trained/ai85-faceid-qat8-q.pth.tar --config-file networks/faceid.yaml $COMMON_ARGS "$@"
9+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix cats-dogs --checkpoint-file trained/ai85-catsdogs-qat8-q.pth.tar --config-file networks/cats-dogs-chw.yaml $COMMON_ARGS "$@"
910

10-
./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Nonsquare --config-file tests/test-nonsquare.yaml --device CMSIS-NN
11-
./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-NonsquarePool --config-file tests/test-nonsquare-pool.yaml --device CMSIS-NN
12-
./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-NonsquarePoolNonsquare --config-file tests/test-nonsquare-nonsquarepool.yaml --device CMSIS-NN
11+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Conv1D --config-file tests/test-conv1d.yaml --device CMSIS-NN
12+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Conv1x1 --config-file tests/test-conv1x1.yaml --device CMSIS-NN
13+
14+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Nonsquare --config-file tests/test-nonsquare.yaml --device CMSIS-NN
15+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-NonsquarePool --config-file tests/test-nonsquare-pool.yaml --device CMSIS-NN
16+
./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-NonsquarePoolNonsquare --config-file tests/test-nonsquare-nonsquarepool.yaml --device CMSIS-NN

izer/cmsisnn.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,13 @@ def create_net( # pylint: disable=too-many-arguments,too-many-locals,too-many-b
6666
"""
6767
wprint('CMSIS-NN code generation is unsupported.')
6868

69-
if output_width[-1] != 8:
69+
if output_width[-1] != 8 and operator[-1] != op.LINEAR:
7070
wprint('CMSIS-NN network generator does not currently support `output_width` that '
71-
'is not 8. Forcing to 8 bit.') # FIXME: Support 32-bit output
71+
'is not 8 when not using Linear. Forcing to 8 bit.') # FIXME: Support 32-bit output
7272
output_width[-1] = 8
7373

74+
final_size = 7 if output_width[-1] == 8 else 31
75+
7476
input_dim_str = [None] * layers
7577
output_dim_str = [None] * layers
7678
kernel_size_str = [None] * layers
@@ -83,7 +85,8 @@ def create_net( # pylint: disable=too-many-arguments,too-many-locals,too-many-b
8385
if quantization[ll] is None:
8486
quantization[ll] = 8 # Set default
8587
elif quantization[ll] != 8: # FIXME: Support quantization
86-
eprint('CMSIS-NN network generator does not currently support `quantization` != 8.')
88+
wprint('CMSIS-NN network generator does not currently support `quantization` != 8. '
89+
'Forcing to 8 bit.')
8790

8891
if output_shift[ll] is None:
8992
output_shift[ll] = 0 # Set default
@@ -206,8 +209,8 @@ def create_net( # pylint: disable=too-many-arguments,too-many-locals,too-many-b
206209
c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
207210
c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')
208211

209-
c_file.write('int cnn_run(const q7_t *input, int input_size, '
210-
'q7_t **output, int *output_size)\n{\n')
212+
c_file.write(f'int cnn_run(const q7_t *input, int input_size, q{final_size}_t **output, '
213+
'int *output_size)\n{\n')
211214

212215
# Compute layer-by-layer output and chain results into input
213216
buffer0, buffer1 = 'buffer0', 'buffer1'
@@ -514,9 +517,17 @@ def run_eltwise(
514517
# Detect fully connected layers
515518
if operator[ll] == op.LINEAR:
516519
assert in_dim == [1, 1] and output_dim[ll] == [1, 1]
517-
c_file.write(f' arm_fully_connected_q7({source}, '
520+
if output_width[ll] == 8:
521+
fn = 'q7'
522+
shift = 7 - output_shift[ll]
523+
cast = ''
524+
else:
525+
fn = 'q7_q31'
526+
shift = 0
527+
cast = '(q31_t *) '
528+
c_file.write(f' arm_fully_connected_{fn}({source}, '
518529
f'weights_{ll}, {in_chan}, {output_chan[ll]}, 7, '
519-
f'{7 - output_shift[ll]}, bias_{ll}, {buffer1}, '
530+
f'{shift}, bias_{ll}, {cast}{buffer1}, '
520531
'col_buffer);\n')
521532
else:
522533
fn = 'fast' if in_chan % 4 == 0 and output_chan[ll] % 2 == 0 \
@@ -565,13 +576,13 @@ def run_eltwise(
565576

566577
data = data_buf[-1]
567578

568-
c_file.write(f' *output = {buffer0};\n'
579+
c_file.write(f' *output = {"" if output_width[ll] == 8 else "(q31_t *) "}{buffer0};\n'
569580
f' *output_size = {data_cmsis.size};\n\n'
570581
' return 1;\n}\n\n')
571582

572583
c_file.write('int main(void)\n{\n'
573584
' int i;\n'
574-
' q7_t *output;\n'
585+
f' q{final_size}_t *output;\n'
575586
' int output_size;\n\n'
576587
f' cnn_run(input_data, {input_size}, &output, &output_size);\n\n')
577588

@@ -582,9 +593,12 @@ def run_eltwise(
582593
' printf("!!! FAIL !!!\\n\\n");\n\n')
583594

584595
c_file.write(' printf("Output of final layer:\\n");\n'
585-
' for (i = 0; i < output_size; i++) {\n'
586-
' printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
587-
' if ((i + 1) % 32 == 0)\n printf("\\n");\n'
596+
' for (i = 0; i < output_size; i++) {\n')
597+
if final_size == 7:
598+
c_file.write(' printf("%5hhd", (int8_t) (output[i] & 0xff));\n')
599+
else:
600+
c_file.write(' printf("%8d", (int32_t) output[i]);\n')
601+
c_file.write(' if ((i + 1) % 32 == 0)\n printf("\\n");\n'
588602
' else if ((i + 1) % 4 == 0)\n printf(" ");\n'
589603
' }\n'
590604
' printf("\\n");\n'

0 commit comments

Comments
 (0)