Improve messages for reshaping and multi-pass (#115)

Robert Muchsel · web-flow · commit a20508a67161 · 2021-03-17T16:31:59.000-05:00
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # MAX78000 Model Training and Synthesis
 
-_March 15, 2021_
+_March 17, 2021_
 
 The Maxim Integrated AI project is comprised of four repositories:
 
@@ -422,6 +422,8 @@ For example, if 192-channel data is read using 64 active processors, Data Memory
 
 *Note: Multi-pass also works with channel counts that are not a multiple of 64, and can be used with less than 64 active processors.*
 
+*Note: For all multi-pass cases, the processor count per pass is rounded up to the next multiple of 4.*
+
 ### Streaming Mode
 
 The machine also implements a streaming mode. Streaming allows input data dimensions that exceed the available per-channel data memory in the accelerator.
@@ -1237,6 +1239,8 @@ The following table describes the most important command line arguments for `ai8
 
 The [quick-start guide](https://github.com/MaximIntegratedAI/MaximAI_Documentation/blob/master/Guides/YAML%20Quickstart.md) provides a short overview of the purpose and structure of the YAML network description file.
 
+The following is a detailed guide to all supported configuration options.
+
 An example network description for the ai85net5 architecture and MNIST is shown below:
 
 ```yaml
@@ -1325,7 +1329,7 @@ Data sets are for example `mnist`, `fashionmnist`, and `cifar-10`.
 
 ##### `output_map` (Optional)
 
-The global `output_map`, if specified, overrides the memory instances where the last layer outputs its results. If not specified, this will be either the `output_processors` specified for the last layer, or, if that key does not exist, default to the number of processors needed for the output channels, starting at 0.
+The global `output_map`, if specified, overrides the memory instances where the last layer outputs its results. If not specified, this will be either the `output_processors` specified for the last layer, or, if that key does not exist, default to the number of processors needed for the output channels, starting at 0. Please also see `output_processors`.
 
 Example:
 	`output_map: 0x0000000000000ff0`
@@ -1350,7 +1354,7 @@ This key allows overriding of the processing sequence. The default is `0` for th
 
 `processors` is specified as a 64-bit hexadecimal value. Dots (‘.’) and a leading ‘0x’ are ignored.
 
-*Note: When using multi-pass (i.e., using more than 64 channels), the number processors is an integer division of the channel count, rounded up. For example, 60 processors are specified for 120 channels.*
+*Note: When using multi-pass (i.e., using more than 64 channels), the number of processors is an integer division of the channel count, rounded up to the next multiple of 4. For example, 52 processors are required for 100 channels (since 100 div 2 = 50, and 52 is the next multiple of 4). For best efficiency, use channel counts that are multiples of 4.*
 
 Example for three processors 0, 4, and 8:
 	 `processors: 0x0000.0000.0000.0111`
diff --git a/README.pdf b/README.pdf
diff --git a/izer/cmsisnn.py b/izer/cmsisnn.py
@@ -337,7 +337,7 @@ def run_eltwise(
                 data = np.squeeze(data, axis=0)
 
             # Convolution or passthrough
-            if operator[ll] == op.CONV2D:
+            if operator[ll] in [op.CONV2D, op.LINEAR]:
                 if flatten[ll]:
                     in_chan *= input_dim[ll][0] * input_dim[ll][1]
                     data = data.reshape(in_chan, 1, 1)
@@ -442,7 +442,7 @@ def run_eltwise(
                              f'pool with stride {pool_stride_str[ll]}')
             else:
                 c_file.write('no pooling')
-            if operator[ll] in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
+            if operator[ll] in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D, op.LINEAR]:
                 conv_str = f', {op.string(operator[ll])} with kernel size ' \
                            f'{kernel_size_str[ll]}, ' \
                            f'stride {stride_str[ll]}, ' \
@@ -512,7 +512,8 @@ def run_eltwise(
                    and padding[ll][0] == padding[ll][1] \
                    and stride[ll][0] == stride[ll][1]:
                     # Detect fully connected layers
-                    if in_dim == [1, 1] and output_dim[ll] == [1, 1]:
+                    if operator[ll] == op.LINEAR:
+                        assert in_dim == [1, 1] and output_dim[ll] == [1, 1]
                         c_file.write(f'  arm_fully_connected_q7({source}, '
                                      f'weights_{ll}, {in_chan}, {output_chan[ll]}, 7, '
                                      f'{7 - output_shift[ll]}, bias_{ll}, {buffer1}, '
diff --git a/izer/izer.py b/izer/izer.py
@@ -369,18 +369,28 @@ def main():
                             eprint('Cannot concatenate outputs of different dimensions in layer '
                                    f'{ll}: {dim} vs {output_dim[e]}.')
                     auto_input_dim[ll] = dim
+                    prev_op = operator[in_sequences[ll][0]]
                 else:
                     auto_input_dim[ll] = output_dim[in_sequences[ll]]
+                    prev_op = operator[in_sequences[ll]]
             else:
                 auto_input_dim[ll] = output_dim[prev_sequence[ll]]
+                prev_op = operator[prev_sequence[ll]]
             if conf_input_dim[ll] is None:
                 input_dim[ll] = auto_input_dim[ll]
+                # Print warning when going from 1D to 2D without explicitly reformatting the input
+                if input_dim[ll][1] == 1 and operator[ll] in [op.CONV2D, op.CONVTRANSPOSE2D] \
+                   and prev_op == op.CONV1D:
+                    wprint(f'Using 1-dimensional data {input_dim[ll][0]}x{input_dim[ll][1]} for '
+                           f'layer {ll} with a {op.string(operator[ll])} operator. '
+                           'Use `in_dim:` to reshape the data to two dimensions, or to silence '
+                           'this message.')
             else:
                 input_dim[ll] = conf_input_dim[ll]
         if operator[ll] != op.CONV1D:
             if pool_stride[ll][0] != pool_stride[ll][1]:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support non-square '
-                       f'pooling stride (currently set to '
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support '
+                       f'non-square pooling stride (currently set to '
                        f'{pool_stride[ll][0]}x{pool_stride[ll][1]}).')
             pooled_size = [(input_dim[ll][0] + pool_stride[ll][0] - pool[ll][0]
                             - pool_dilation[ll][0] + 1) // pool_stride[ll][0],
@@ -398,12 +408,12 @@ def main():
 
         if operator[ll] != op.CONV1D:
             if stride[ll][0] != stride[ll][1]:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support non-square '
-                       f'stride (currently set to {stride[ll][0]}x{stride[ll][1]}).')
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support '
+                       f'non-square stride (currently set to {stride[ll][0]}x{stride[ll][1]}).')
             if operator[ll] != op.CONVTRANSPOSE2D and stride[ll][0] != 1:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support stride other '
-                       f'than 1 (currently set to {stride[ll][0]}x{stride[ll][1]}).')
-            if operator[ll] in [op.NONE, op.CONV2D]:
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support stride '
+                       f'other than 1 (currently set to {stride[ll][0]}x{stride[ll][1]}).')
+            if operator[ll] in [op.NONE, op.CONV2D, op.LINEAR]:
                 output_dim[ll] = [(pooled_size[0] - dilation[ll][0] * (kernel_size[ll][0] - 1)
                                    - 1 + 2 * padding[ll][0]) // stride[ll][0] + 1,
                                   (pooled_size[1] - dilation[ll][1] * (kernel_size[ll][1] - 1)
@@ -425,15 +435,15 @@ def main():
                 input_channels[ll] //= pooled_dim[ll][0] * pooled_dim[ll][1]
                 assert input_channels[ll] > 0
             if padding[ll][0] >= 3 and not tc.dev.SUPPORT_ARBITRARY_PADDING:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support `pad` >= 3 '
-                       f'(currently set to {padding[ll][0]}).')
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support '
+                       f'`pad` >= 3 (currently set to {padding[ll][0]}).')
         else:
             if padding[ll][0] >= 3 and not tc.dev.SUPPORT_ARBITRARY_PADDING:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support `pad` >= 3 '
-                       f'(currently set to {padding[ll][0]}).')
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support '
+                       f'`pad` >= 3 (currently set to {padding[ll][0]}).')
             if stride[ll][0] != 1 and not tc.dev.SUPPORT_ARBITRARY_STRIDE:
-                eprint(f'{op.string(operator[ll])} in layer {ll} does not support stride other '
-                       f'than 1 (currently set to {stride[ll][0]}).')
+                eprint(f'{op.string(operator[ll])} in layer {ll} does not support stride '
+                       f'other than 1 (currently set to {stride[ll][0]}).')
             output_dim[ll] = [(pooled_size[0] - dilation[ll][0] * (kernel_size[ll][0] - 1) - 1 +
                                2 * padding[ll][0]) // stride[ll][0] + 1,
                               1]
diff --git a/izer/max7800x.py b/izer/max7800x.py
@@ -534,7 +534,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         if not overwrite:
             eprint('The target folder', target_dir, 'exists. Use --overwrite to proceed.')
         else:
-            wprint('--overwrite specified, writing to ', target_dir, ' even though it exists.')
+            wprint('--overwrite specified, writing to', target_dir, 'even though it exists.')
 
     # Redirect stdout?
     if log:
@@ -568,28 +568,28 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
         processors_used |= bits
 
         if input_chan[ll] > tc.dev.MAX_CHANNELS:
-            eprint(f'Layer {ll} is configured for {input_chan[ll]} inputs, which exceeds '
-                   f'the system maximum of {tc.dev.MAX_CHANNELS}.')
+            eprint(f'Layer {ll} is configured for {input_chan[ll]} input channels, which '
+                   f'exceeds the system maximum of {tc.dev.MAX_CHANNELS}.')
         if output_chan[ll] > tc.dev.MAX_CHANNELS:
-            eprint(f'Layer {ll} is configured for {output_chan[ll]} outputs, which exceeds '
-                   f'the system maximum of {tc.dev.MAX_CHANNELS}.')
+            eprint(f'Layer {ll} is configured for {output_chan[ll]} output channels, which '
+                   f'exceeds the system maximum of {tc.dev.MAX_CHANNELS}.')
         if (ll != start_layer or not fast_fifo_quad) \
            and popcount(processor_map[ll]) != in_expand_thresh[ll]:
-            eprint(f'Layer {ll} has {input_chan[ll]} inputs with input expansion '
-                   f'{in_expand[ll]}, {operands[ll]} operands, threshold {in_expand_thresh[ll]}, '
-                   f'but enabled processor map 0x{processor_map[ll]:016x} '
+            eprint(f'Layer {ll} has {input_chan[ll]} input channels using {in_expand[ll]} '
+                   f'passes, and {operands[ll]} operands ({in_expand_thresh[ll]} processors '
+                   f'per pass), but the enabled processor map 0x{processor_map[ll]:016x} '
                    f'has {popcount(processor_map[ll])} bits instead of the '
                    f'expected number of {in_expand_thresh[ll]}.')
         if ll == start_layer and fast_fifo_quad \
            and popcount(processor_map_0) != in_expand_thresh[ll]:
-            eprint(f'Layer {ll} has {input_chan[ll]} inputs with input expansion '
-                   f'{in_expand[ll]}, threshold {in_expand_thresh[ll]}, but '
+            eprint(f'Layer {ll} has {input_chan[ll]} input channels using {in_expand[ll]} '
+                   f'passes ({in_expand_thresh[ll]} processors per pass), but the '
                    f'enabled processor map 0x{processor_map[ll]:016x} '
                    f'has {popcount(processor_map[ll])} bits instead of the '
                    f'expected number of {in_expand_thresh[ll]}.')
         if popcount(output_processor_map[ll]) != out_expand_thresh[ll]:
-            eprint(f'Layer {ll} has {output_chan[ll]} outputs with output expansion '
-                   f'{out_expand[ll]}, threshold {out_expand_thresh[ll]}, but '
+            eprint(f'Layer {ll} has {output_chan[ll]} output channels using {out_expand[ll]} '
+                   f'passes ({out_expand_thresh[ll]} processors per pass), but the '
                    f'processor output map 0x{output_processor_map[ll]:016x} '
                    f'has {popcount(output_processor_map[ll])} bits instead of the '
                    f'expected number of {out_expand_thresh[ll]}.')
@@ -1434,7 +1434,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
                     # Write Pointer Timeslot Offset Register
                     # Used for 1x1 convolution, and pooling without convolution
                     val = 0
-                    if operator[ll] == op.CONV2D:
+                    if operator[ll] in [op.CONV2D, op.LINEAR]:
                         if kernel_size[ll] == [1, 1] and conv_groups[ll] == 1:
                             val = 1
                         elif conv_groups[ll] > 1 and not broadcast_mode[ll]:
@@ -1615,7 +1615,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
                     if operator[ll] == op.CONV1D:
                         val |= kernel_size[ll][0] << 8 | 1 << 12
                         assert kernel_size[ll][0] < 2**4
-                    elif (operator[ll] == op.CONV2D and kernel_size[ll] == [1, 1]
+                    elif (operator[ll] in [op.CONV2D, op.LINEAR] and kernel_size[ll] == [1, 1]
                           or operator[ll] == op.NONE and operands[ll] == 1):
                         val |= 1 << 8
                     if operands[ll] > 1:
@@ -1624,7 +1624,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
                         if (pool[ll][0] > 1 or pool[ll][1] > 1) \
                            and pool_first[ll]:
                             val |= 1 << 16
-                        if operator[ll] != op.NONE:  # in [op.CONV2D, op.CONVTRANSPOSE2D]:
+                        if operator[ll] != op.NONE:  # CONV2D, LINEAR, CONVTRANSPOSE2D
                             val |= 1 << 17
                     assert 0 <= oned_sad < 2**4
                     val |= oned_sad << 4
@@ -1634,7 +1634,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
 
                     # Configure tram pointer max
                     if operator[ll] == op.CONV1D or \
-                       operator[ll] == op.CONV2D and kernel_size[ll] == [1, 1]:
+                       operator[ll] in [op.CONV2D, op.LINEAR] and kernel_size[ll] == [1, 1]:
                         if flatten_prod >= 2**4:
                             assert flatten_prod < 2**16
                             val = flatten_prod << 16 | (2 * flatten_prod + 1)
@@ -1950,7 +1950,7 @@ def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-b
                             val = override_rollover
                         elif not tc.dev.REQUIRE_NEW_STREAMING:
                             if big_data[ll]:
-                                # FIXME stream_start + max(stride[ll][1], pool_stride[ll][1])
+                                # FIXME: stream_start + max(stride[ll][1], pool_stride[ll][1])
                                 val = 12
                             else:
                                 val = stream_start + (pool[ll][0] - 1) * input_dim[ll][1] \
@@ -2477,7 +2477,7 @@ def run_eltwise(
             data = np.squeeze(data, axis=0)
 
         # Convolution or passthrough
-        if operator[ll] == op.CONV2D:
+        if operator[ll] in [op.CONV2D, op.LINEAR]:
             if flatten[ll]:
                 in_chan *= pooled_dim[ll][0] * pooled_dim[ll][1]
                 data = data.reshape(in_chan, 1, 1)
diff --git a/izer/op.py b/izer/op.py
@@ -11,6 +11,7 @@
 CONV1D = 1
 CONV2D = 2
 CONVTRANSPOSE2D = 3
+LINEAR = 4
 
 ACT_RELU = 1
 ACT_ABS = 2
@@ -32,6 +33,7 @@
     CONV1D: 'conv1d',
     CONV2D: 'conv2d',
     CONVTRANSPOSE2D: 'convtranspose2d',
+    LINEAR: 'linear',
 }
 
 ELT_NAMES = {
@@ -50,18 +52,20 @@
     ELTWISE_OR: 0b10,
 }
 
+UNKNOWN = '????'
+
 
 def string(
         op,
         elt=False,
 ):
     """
-    Return string representation of operator `op`
+    Return string representation of operator `op`.
     """
     if not elt:
-        return OP_NAMES[op] if op in OP_NAMES else '????'
+        return OP_NAMES[op] if op in OP_NAMES else UNKNOWN
     # else:
-    return ELT_NAMES[op] if op in ELT_NAMES else '????'
+    return ELT_NAMES[op] if op in ELT_NAMES else UNKNOWN
 
 
 def eltwise(
@@ -88,9 +92,9 @@ def act_string(
         act,
 ):
     """
-    Return string representation of activation `act`
+    Return string representation of activation `act`.
     """
     if act is None:
         return ACT_NAMES[NONE]
     # else:
-    return ACT_NAMES[act] if act in ACT_NAMES else '????'
+    return ACT_NAMES[act] if act in ACT_NAMES else UNKNOWN
diff --git a/izer/yamlcfg.py b/izer/yamlcfg.py
@@ -280,7 +280,7 @@ def error_exit(message, sequence):
                 padding[sequence] = [0, 0]
             elif conv in ['linear', 'fc', 'mlp']:
                 # Emulate using Conv2D with 1x1 kernels and 1x1 data
-                operator[sequence] = op.CONV2D
+                operator[sequence] = op.LINEAR
                 kernel_size[sequence] = FC_KERNEL
                 padding[sequence] = [0, 0]
             else:
diff --git a/tests/sample_test_power-ai87.npy b/tests/sample_test_power-ai87.npy
diff --git a/tests/test-power-ai87.yaml b/tests/test-power-ai87.yaml
@@ -0,0 +1,20 @@
+arch: test
+dataset: test_power-ai87
+
+layers:
+- out_offset: 0x4000
+  processors: 0xffffffffffffffff
+  operation: conv2d
+  kernel_size: 3x3
+  activation: none
+  readahead: True
+  calcx4: True
+  output_shift: -3
+- out_offset: 0x0000
+  processors: 0xffffffffffffffff
+  operation: conv2d
+  kernel_size: 3x3
+  activation: none
+  readahead: True
+  calcx4: True
+  output_shift: -3
diff --git a/tests/weights_test_power-ai87.npy b/tests/weights_test_power-ai87.npy