Updates for unsupported CMSIS-NN backend (#118)

vicloginov · web-flow · commit e90ba7ffc07e · 2021-03-24T16:07:54.000-05:00
* CMSIS fully connected function with Q31 output
* Allow and ignore weights < 8 bit
diff --git a/assets/cmsis-nn/Makefile b/assets/cmsis-nn/Makefile
@@ -1,6 +1,6 @@
 CC=gcc
 CFLAGS=-I. -ICMSIS/Core/Include -ICMSIS/NN/Include -ICMSIS/DSP/Include -Wall -D__ARM_ARCH_6M__
-LIB_FILES=arm_convolve_HWC_q7_basic.o arm_pool_q7_HWC.o arm_relu_q7.o arm_fully_connected_q7_q8p7_opt.o arm_convolve_HWC_q7_fast.o arm_convolve_HWC_q7_basic_nonsquare.o arm_pool_q7_HWC_nonsquare.o arm_pool_nonsquare_q7_HWC_nonsquare.o arm_relu32_q7.o arm_fully_connected_q7.o
+LIB_FILES=arm_convolve_HWC_q7_basic.o arm_pool_q7_HWC.o arm_relu_q7.o arm_convolve_HWC_q7_fast.o arm_convolve_HWC_q7_basic_nonsquare.o arm_pool_q7_HWC_nonsquare.o arm_pool_nonsquare_q7_HWC_nonsquare.o arm_relu32_q7.o arm_fully_connected_q7.o arm_fully_connected_q7_q31.o
 
 .PHONY: all
 all: main
diff --git a/assets/cmsis-nn/README.md b/assets/cmsis-nn/README.md
@@ -3,7 +3,7 @@
 The ‘izer’ includes an unsupported CMSIS-NN code generator. To use it:
 
 1. Understand it is incomplete and unsupported.
-2. Use only networks **without any** Conv1d, ConvTranspose2d, element-wise operations, and without input sequences (concatenation). It does not support wide (32-bit) output either. Some or more of these features could be added without too much effort, any suggestions or pull requests are welcome.
+2. Use only networks **without any** Conv1d, ConvTranspose2d, element-wise operations, and without input sequences (concatenation). Some or all of these features could be added without too much effort; any suggestions or pull requests are welcome.
 3. Understand that there is no proper build environment.
 
 ### Setup
@@ -33,7 +33,7 @@ This is very similar to generating code for MAX78000 (see `gen-demos-max7800.sh`
 Next, go to the target folder (`cmsis-demos/cifar-10` in the above example), and execute:
 
 ```shell
-(ai8x-synthesis) $ cd cmisis-demos/cifar-19
+(ai8x-synthesis) $ cd cmsis-demos/cifar-10
 (ai8x-synthesis) $ ./makelinks.sh
 (ai8x-synthesis) $ make
 ```
diff --git a/assets/cmsis-nn/arm_convolve_HWC_q7_basic.c b/assets/cmsis-nn/arm_convolve_HWC_q7_basic.c
@@ -32,6 +32,7 @@
  * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
 
 /**
  *  @ingroup groupNN
diff --git a/assets/cmsis-nn/arm_convolve_HWC_q7_basic_nonsquare.c b/assets/cmsis-nn/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -32,6 +32,7 @@
  * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
 
 /**
  *  @ingroup groupNN
diff --git a/assets/cmsis-nn/arm_convolve_HWC_q7_fast.c b/assets/cmsis-nn/arm_convolve_HWC_q7_fast.c
@@ -33,6 +33,7 @@
 
 #include "arm_math.h"
 #include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
 
 /**
  *  @ingroup groupNN
diff --git a/assets/cmsis-nn/arm_fully_connected_q7_q31.c b/assets/cmsis-nn/arm_fully_connected_q7_q31.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_fully_connected_q7_q31.c
+ * Description:  Q7 fully-connected layer function with Q31 output
+ *
+ * $Date:        March 23, 2021
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup FC
+ * @{
+ */
+
+/**
+ * @brief Q7 fully-connected layer function with Q31 (32-bit) output
+ * @param[in]       pV          pointer to input vector (q7_t, dim_vec elements)
+ * @param[in]       pM          pointer to matrix weights (q7_t, dim_vec values per row)
+ * @param[in]       dim_vec     length of the vector
+ * @param[in]       num_of_rows number of rows in weight matrix (one output per row)
+ * @param[in]       bias_shift  amount of left-shift for bias
+ * @param[in]       out_shift   amount of right-shift for output
+ * @param[in]       bias        pointer to bias (q7_t, one value per row)
+ * @param[in,out]   pOut        pointer to output vector (q31_t, num_of_rows elements)
+ * @param[in,out]   vec_buffer  pointer to buffer space for input (ignored without ARM_MATH_DSP)
+ * @return     The function always returns <code>ARM_MATH_SUCCESS</code>
+ *
+ * @details
+ *
+ * <b>Buffer size:</b>
+ *
+ * vec_buffer size: dim_vec
+ *
+ *  Q7_Q31 version of the fully connected layer: each output is
+ *  ((bias << bias_shift) + NN_ROUND(out_shift) + dot(input, weight row)) >> out_shift
+ *
+ *  Input, weights and bias are in q7_t; outputs are written as q31_t
+ */
+
+arm_status arm_fully_connected_q7_q31(const q7_t *pV,
+                                      const q7_t *pM,
+                                      const uint16_t dim_vec,
+                                      const uint16_t num_of_rows,
+                                      const uint16_t bias_shift,
+                                      const uint16_t out_shift,
+                                      const q7_t *bias,
+                                      q31_t *pOut,
+                                      q15_t *vec_buffer)
+{
+
+#if defined(ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    const q7_t *pB = pM;
+    const q7_t *pB2;
+    q31_t *pO = pOut;
+    const q7_t *pBias = bias;
+    const q15_t *pA;
+    uint16_t rowCnt = num_of_rows >> 1;
+
+    /* expand the q7 vector into the q15 buffer (helper is the reordered, no-shift variant) */
+    arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
+    /* main loop: two weight rows (two outputs) per iteration */
+    while (rowCnt)
+    {
+        q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
+        q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
+        uint16_t colCnt = dim_vec >> 2;
+
+        pA = vec_buffer;
+        pB2 = pB + dim_vec;
+        /* each pass presumably covers four columns of both rows (colCnt = dim_vec >> 2) */
+        while (colCnt)
+        {
+            q31_t inV, inM11, inM12, inM21, inM22;
+            pB = read_and_pad_reordered(pB, &inM11, &inM12);
+            pB2 = read_and_pad_reordered(pB2, &inM21, &inM22);
+
+            inV = arm_nn_read_q15x2_ia(&pA);
+
+            sum = __SMLAD(inV, inM11, sum);
+            sum2 = __SMLAD(inV, inM21, sum2);
+
+            inV = arm_nn_read_q15x2_ia(&pA);
+
+            sum = __SMLAD(inV, inM12, sum);
+            sum2 = __SMLAD(inV, inM22, sum2);
+
+            colCnt--;
+        }
+        colCnt = dim_vec & 0x3; /* left-over of the vector, one element at a time */
+        while (colCnt)
+        {
+            q7_t inV = *pA++;
+            q15_t inM = *pB++;
+            q15_t inM2 = *pB2++;
+
+            sum += inV * inM;
+            sum2 += inV * inM2;
+            colCnt--;
+        } /* while over colCnt */
+        *pO++ = (q31_t)(__SSAT((sum >> out_shift), 32));
+        *pO++ = (q31_t)(__SSAT((sum2 >> out_shift), 32));
+
+        /* adjust the pointers and counters: step pB over the row that pB2 just processed */
+        pB += dim_vec;
+        rowCnt--;
+    }
+
+    /* left-over part of the rows: a single final row when num_of_rows is odd */
+    rowCnt = num_of_rows & 0x1;
+
+    while (rowCnt)
+    {
+        uint16_t colCnt = dim_vec >> 2;
+        q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
+
+        pA = vec_buffer;
+
+        while (colCnt)
+        {
+            q31_t inV1, inV2, inM11, inM12;
+
+            pB = read_and_pad_reordered(pB, &inM11, &inM12);
+
+            inV1 = arm_nn_read_q15x2_ia(&pA);
+            sum = __SMLAD(inV1, inM11, sum);
+
+            inV2 = arm_nn_read_q15x2_ia(&pA);
+            sum = __SMLAD(inV2, inM12, sum);
+
+            colCnt--;
+        }
+
+        /* left-over of the vector */
+        colCnt = dim_vec & 0x3;
+        while (colCnt)
+        {
+            q7_t inV = *pA++;
+            q15_t inM = *pB++;
+            sum += inV * inM;
+            colCnt--;
+        }
+        /* NOTE(review): __SSAT(..., 32) on a q31_t presumably cannot change the value - confirm */
+        *pO++ = (q31_t)(__SSAT((sum >> out_shift), 32));
+
+        rowCnt--;
+    }
+
+#else
+    (void)vec_buffer; /* scratch buffer is only used by the DSP path above */
+    int i, j;
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+    for (i = 0; i < num_of_rows; i++)
+    {
+        int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
+        for (j = 0; j < dim_vec; j++)
+        {
+            ip_out += pV[j] * pM[i * dim_vec + j];
+        }
+        pOut[i] = (q31_t)__SSAT((ip_out >> out_shift), 32);
+    }
+
+#endif /* ARM_MATH_DSP */
+
+    /* Always report success */
+    return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of FC group
+ */
diff --git a/assets/cmsis-nn/cnn.h b/assets/cmsis-nn/cnn.h
@@ -7,13 +7,16 @@
 
 #include "weights.h"
 
-arm_status
-arm_fully_connected_q7_q8p7_opt(const q7_t * pV,
-                                const q7_t * pM,
-                                const uint16_t dim_vec,
-                                const uint16_t num_of_rows,
-                                const uint16_t bias_shift,
-                                const uint16_t out_shift, const q7_t * bias, q15_t * pOut, q15_t * vec_buffer);
+/* Q7 fully-connected layer with q31_t output (defined in arm_fully_connected_q7_q31.c) */
+arm_status arm_fully_connected_q7_q31(const q7_t *pV,
+                                      const q7_t *pM,
+                                      const uint16_t dim_vec,
+                                      const uint16_t num_of_rows,
+                                      const uint16_t bias_shift,
+                                      const uint16_t out_shift,
+                                      const q7_t *bias,
+                                      q31_t *pOut,
+                                      q15_t *vec_buffer);
 
 
 void arm_softmax_q8p7_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
@@ -72,4 +75,3 @@ void arm_avepool_nonsquare_q7_HWC_nonsquare(q7_t * Im_in,
 
 
 void arm_relu32_q7(q7_t * data, uint32_t size);
-
diff --git a/attic/gen-cmsisdemos.sh b/attic/gen-cmsisdemos.sh
@@ -1,12 +1,16 @@
 #!/bin/sh
-./ai8xize.py --verbose --log --test-dir demos --prefix mnist --checkpoint-file trained/ai85-mnist-qat8-q.pth.tar --config-file networks/mnist-chw-ai85.yaml --device CMSIS-NN "$@"
-./ai8xize.py --verbose --log --test-dir demos --prefix cifar-10 --checkpoint-file trained/ai85-cifar10-qat8-q.pth.tar --config-file networks/cifar10-hwc-ai85.yaml --device CMSIS-NN "$@"
-./ai8xize.py --verbose --log --test-dir demos --prefix kws20_v2 --checkpoint-file trained/ai85-kws20_v2-qat8-q.pth.tar --config-file networks/kws20-v2-hwc.yaml --device CMSIS-NN "$@"
-./ai8xize.py --verbose --log --test-dir demos --prefix faceid --checkpoint-file trained/ai85-faceid-qat8-q.pth.tar --config-file networks/faceid.yaml --device CMSIS-NN "$@"
+TARGET=demos
+COMMON_ARGS="--device CMSIS-NN"
 
-./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Conv1D --config-file tests/test-conv1d.yaml --device CMSIS-NN
-./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Conv1x1 --config-file tests/test-conv1x1.yaml --device CMSIS-NN
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix mnist --checkpoint-file trained/ai85-mnist-qat8-q.pth.tar --config-file networks/mnist-chw-ai85.yaml $COMMON_ARGS "$@"
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix cifar-10 --checkpoint-file tests/ai85-cifar10-qat8-q.pth.tar --config-file networks/cifar10-hwc-ai85.yaml $COMMON_ARGS "$@"
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix kws20_v2 --checkpoint-file trained/ai85-kws20_v2-qat8-q.pth.tar --config-file networks/kws20-v2-hwc.yaml $COMMON_ARGS "$@"
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix faceid --checkpoint-file trained/ai85-faceid-qat8-q.pth.tar --config-file networks/faceid.yaml $COMMON_ARGS "$@"
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix cats-dogs --checkpoint-file trained/ai85-catsdogs-qat8-q.pth.tar --config-file networks/cats-dogs-chw.yaml $COMMON_ARGS "$@"
 
-./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-Nonsquare --config-file tests/test-nonsquare.yaml --device CMSIS-NN
-./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-NonsquarePool --config-file tests/test-nonsquare-pool.yaml --device CMSIS-NN
-./ai8xize.py --verbose --log --test-dir demos --prefix CMSIS-NonsquarePoolNonsquare --config-file tests/test-nonsquare-nonsquarepool.yaml --device CMSIS-NN
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Conv1D --config-file tests/test-conv1d.yaml --device CMSIS-NN
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Conv1x1 --config-file tests/test-conv1x1.yaml --device CMSIS-NN
+
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-Nonsquare --config-file tests/test-nonsquare.yaml --device CMSIS-NN
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-NonsquarePool --config-file tests/test-nonsquare-pool.yaml --device CMSIS-NN
+./ai8xize.py --verbose --log --test-dir $TARGET --prefix CMSIS-NonsquarePoolNonsquare --config-file tests/test-nonsquare-nonsquarepool.yaml --device CMSIS-NN
diff --git a/izer/cmsisnn.py b/izer/cmsisnn.py
@@ -66,11 +66,13 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
     """
     wprint('CMSIS-NN code generation is unsupported.')
 
-    if output_width[-1] != 8:
+    if output_width[-1] != 8 and operator[-1] != op.LINEAR:
         wprint('CMSIS-NN network generator does not currently support `output_width` that '
-               'is not 8. Forcing to 8 bit.')  # FIXME: Support 32-bit output
+               'is not 8 when not using Linear. Forcing to 8 bit.')  # FIXME: Support 32-bit output
         output_width[-1] = 8
 
+    final_size = 7 if output_width[-1] == 8 else 31
+
     input_dim_str = [None] * layers
     output_dim_str = [None] * layers
     kernel_size_str = [None] * layers
@@ -83,7 +85,8 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         if quantization[ll] is None:
             quantization[ll] = 8  # Set default
         elif quantization[ll] != 8:  # FIXME: Support quantization
-            eprint('CMSIS-NN network generator does not currently support `quantization` != 8.')
+            wprint('CMSIS-NN network generator does not currently support `quantization` != 8. '
+                   'Forcing to 8 bit.')
 
         if output_shift[ll] is None:
             output_shift[ll] = 0  # Set default
@@ -206,8 +209,8 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
         c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')
 
-        c_file.write('int cnn_run(const q7_t *input, int input_size, '
-                     'q7_t **output, int *output_size)\n{\n')
+        c_file.write(f'int cnn_run(const q7_t *input, int input_size, q{final_size}_t **output, '
+                     'int *output_size)\n{\n')
 
         # Compute layer-by-layer output and chain results into input
         buffer0, buffer1 = 'buffer0', 'buffer1'
@@ -514,9 +517,17 @@ def run_eltwise(
                     # Detect fully connected layers
                     if operator[ll] == op.LINEAR:
                         assert in_dim == [1, 1] and output_dim[ll] == [1, 1]
-                        c_file.write(f'  arm_fully_connected_q7({source}, '
+                        if output_width[ll] == 8:
+                            fn = 'q7'
+                            shift = 7 - output_shift[ll]
+                            cast = ''
+                        else:
+                            fn = 'q7_q31'
+                            shift = 0
+                            cast = '(q31_t *) '
+                        c_file.write(f'  arm_fully_connected_{fn}({source}, '
                                      f'weights_{ll}, {in_chan}, {output_chan[ll]}, 7, '
-                                     f'{7 - output_shift[ll]}, bias_{ll}, {buffer1}, '
+                                     f'{shift}, bias_{ll}, {cast}{buffer1}, '
                                      'col_buffer);\n')
                     else:
                         fn = 'fast' if in_chan % 4 == 0 and output_chan[ll] % 2 == 0 \
@@ -565,13 +576,13 @@ def run_eltwise(
 
         data = data_buf[-1]
 
-        c_file.write(f'  *output = {buffer0};\n'
+        c_file.write(f'  *output = {"" if output_width[ll] == 8 else "(q31_t *) "}{buffer0};\n'
                      f'  *output_size = {data_cmsis.size};\n\n'
                      '  return 1;\n}\n\n')
 
         c_file.write('int main(void)\n{\n'
                      '  int i;\n'
-                     '  q7_t *output;\n'
+                     f'  q{final_size}_t *output;\n'
                      '  int output_size;\n\n'
                      f'  cnn_run(input_data, {input_size}, &output, &output_size);\n\n')
 
@@ -582,9 +593,12 @@ def run_eltwise(
                      '    printf("!!! FAIL !!!\\n\\n");\n\n')
 
         c_file.write('  printf("Output of final layer:\\n");\n'
-                     '  for (i = 0; i < output_size; i++) {\n'
-                     '    printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
-                     '    if ((i + 1) % 32 == 0)\n      printf("\\n");\n'
+                     '  for (i = 0; i < output_size; i++) {\n')
+        if final_size == 7:
+            c_file.write('    printf("%5hhd", (int8_t) (output[i] & 0xff));\n')
+        else:
+            c_file.write('    printf("%8d", (int32_t) output[i]);\n')
+        c_file.write('    if ((i + 1) % 32 == 0)\n      printf("\\n");\n'
                      '    else if ((i + 1) % 4 == 0)\n      printf(" ");\n'
                      '  }\n'
                      '  printf("\\n");\n'
diff --git a/izer/izer.py b/izer/izer.py