Replace 1x1 kernels in streaming layers with 3x3 + pad (#143)

Robert Muchsel · web-flow · commit b162abcdb4af · 2021-06-30T16:45:53.000-05:00
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # MAX78000 Model Training and Synthesis
 
-_June 21, 2021_
+_June 29, 2021_
 
 The Maxim Integrated AI project is comprised of five repositories:
 
@@ -414,7 +414,7 @@ if [ $? -eq 1 ] ; then
 fi
 ```
 
-The debugger requires OpenOCD. On Windows, an OpenOCD executable is installed with the SDK. On macOS and Linux, the OpenOCD fork from [https://github.com/MaximIntegratedMicros/openocd.git](https://github.com/MaximIntegratedMicros/openocd.git) must be used. An Ubuntu Linux binary is available at https://github.com/MaximIntegratedAI/MAX78000_SDK/blob/master/Tools/OpenOCD/openocd. *Note: A copy of the configuration files and a `run-openocd-maxdap` script are contained in the `hardware` folder of the `ai8x-synthesis` project.*
+The debugger requires OpenOCD. On Windows, an OpenOCD executable is installed with the SDK. On macOS and Linux, the OpenOCD fork from [https://github.com/MaximIntegratedMicros/openocd.git](https://github.com/MaximIntegratedMicros/openocd.git) must be used. An x86_64 Ubuntu Linux binary is available at https://github.com/MaximIntegratedAI/MAX78000_SDK/blob/master/Tools/OpenOCD/openocd. *Note: A copy of the configuration files and a `run-openocd-maxdap` script are contained in the `hardware` folder of the `ai8x-synthesis` project.*
 
 `gen-demos-max78000.sh` will create code that is compatible with the SDK and copy it into the SDK’s Example directories.
 
@@ -807,6 +807,7 @@ The MAX78000 hardware does not support arbitrary network parameters. Specificall
   * Streaming is limited to 8 consecutive layers or fewer, and is limited to four FIFOs (up to 4 input channels in CHW and up to 16 channels in HWC format), see [FIFOs](#FIFOs).
   * For streaming layers, bias values may not be added correctly in all cases.
   * The *final* streaming layer must use padding.
+  * Streaming layers that use 1×1 kernels without padding are automatically replaced with equivalent layers that use 3×3 kernels with padding.
   
 * The weight memory supports up to 768 * 64 3×3 Q7 kernels (see [Number Format](#Number-Format)).
   When using 1-, 2- or 4-bit weights, the capacity increases accordingly.
diff --git a/README.pdf b/README.pdf
diff --git a/izer/backend/max7800x.py b/izer/backend/max7800x.py
@@ -268,10 +268,23 @@ def create_net(self) -> str:  # pylint: disable=too-many-locals,too-many-branche
             eprint('Streaming in the first layer requires use of a FIFO.')
         if any(streaming) and start_layer != 0:
             eprint('`--start_layer` must be 0 when using streaming.')
+
         for ll in range(min(tc.dev.MAX_STREAM_LAYERS, layers)):
             if next_sequence[ll] != -1 and next_sequence[ll] != ll + 1 and streaming[ll]:
                 eprint(f'`next_sequence` must be {ll+1} when using streaming in layer {ll}. '
                        f'Currently configured: {next_sequence[ll]}')
+
+            if tc.dev.EMULATE_1X1_STREAMING and streaming[ll] and kernel_size[ll] == [1, 1]:
+                wprint(f'Layer {ll}: Using 3x3 kernels to emulate 1x1 streaming layer')
+                # Create 3x3 weights from 1x1 weights and emulate using 3x3 kernels
+                weight33 = np.zeros((kernel[ll].shape[0], 3, 3), dtype=np.int64)
+                weight33[:, 1, 1] = kernel[ll][:, 0, 0]
+                kernel[ll] = weight33
+                assert padding[ll] == [0, 0]
+                padding[ll] = [1, 1]
+                effective_pad[ll] = [1, 1]
+                kernel_size[ll][0] = kernel_size[ll][1] = 3
+
             if not tc.dev.SUPPORT_STREAM_NONPAD_FINAL and streaming[ll] \
                and (next_sequence[ll] == -1 or not streaming[next_sequence[ll]]) \
                and (padding[ll][0] == 0 or padding[ll][1] == 0):
diff --git a/izer/test/test_conv2d_1x1.py b/izer/test/test_conv2d_1x1.py
@@ -61,6 +61,29 @@ def convolve(data, weight, expected):
     print("SUCCESS" if np.array_equal(output, expected) else "*** FAILURE ***")
     assert np.array_equal(output, expected)
 
+    # Create 3x3 weights from 1x1 weights
+    # and emulate using 3x3 kernels
+    shape33 = (weight.shape[0], weight.shape[1], 3, 3)
+    weight33 = np.zeros(shape33, dtype=np.int64)
+    weight33[:, :, 1, 1] = weight[:, :, 0, 0]
+
+    output = compute.conv2d(
+        data,
+        weight33,
+        None,
+        data.shape,
+        expected.shape,
+        kernel_size=[3, 3],
+        stride=[1, 1],
+        pad=[1, 1],
+        dilation=[1, 1],
+        fractional_stride=[1, 1],
+        output_pad=[0, 0],
+        groups=1,
+    )
+    print("PYTORCH OK" if np.array_equal(output, t) else "*** FAILURE ***")
+    assert np.array_equal(output, t)
+
 
 def test_conv2d():
     """Main program to test compute.conv2d."""
diff --git a/izer/tornadocnn.py b/izer/tornadocnn.py
@@ -44,6 +44,7 @@ class Dev:
     REQUIRE_NEW_STREAMING = False
     REQUIRE_FIFO_CPL = True
     EMULATE_ELTWISE_MP = False
+    EMULATE_1X1_STREAMING = True
     USE_PROCESSORS = True
     MODERN_SIM = False