analogdevicesinc
diff --git a/‎README.md
Lines changed: 3 additions & 3 deletions b/‎README.md
Lines changed: 3 additions & 3 deletions
diff --git a/‎README.pdf
-480 Bytes b/‎README.pdf
-480 Bytes
diff --git a/‎izer/apbaccess.py
Lines changed: 1 addition & 1 deletion b/‎izer/apbaccess.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎izer/checkpoint.py
Lines changed: 5 additions & 5 deletions b/‎izer/checkpoint.py
Lines changed: 5 additions & 5 deletions
diff --git a/‎izer/commandline.py
Lines changed: 2 additions & 0 deletions b/‎izer/commandline.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎izer/compute.py
Lines changed: 48 additions & 68 deletions b/‎izer/compute.py
Lines changed: 48 additions & 68 deletions
diff --git a/‎izer/izer.py
Lines changed: 5 additions & 4 deletions b/‎izer/izer.py
Lines changed: 5 additions & 4 deletions
diff --git a/‎izer/kbias.py
Lines changed: 2 additions & 1 deletion b/‎izer/kbias.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎izer/kernels.py
Lines changed: 15 additions & 10 deletions b/‎izer/kernels.py
Lines changed: 15 additions & 10 deletions
@@ -1,6 +1,6 @@
 # MAX78000 Model Training and Synthesis
 
-_February 10, 2021_
+_February 11, 2021_
 
 The Maxim Integrated AI project is comprised of four repositories:
 
@@ -688,7 +688,7 @@ The MAX78000 hardware does not support arbitrary network parameters. Specificall
 * `ConvTranspose2d`:
 
   * Kernel sizes must be 3×3.
-  * Padding can be 0, 1, or 2.
+  * Padding must be 1 *(Note: hardware supports additional padding modes, but there is no direct equivalent in PyTorch)*.
   * Stride is fixed to [2, 2]. Output padding is fixed to 1.
 
 * A programmable layer-specific shift operator is available at the output of a convolution, see [`output_shift` (Optional)](#output_shift \(Optional\)).
@@ -1558,7 +1558,7 @@ Example:
 
 For layers that use a bias, this key can specify one or more bias memories that should be used. By default, the software uses a “Fit First Descending (FFD)” allocation algorithm that considers largest bias lengths first, and then the layer number, and places each bias in the available group with the most available space, descending to the smallest bias length.
 
-“Available groups” is layer specific and is a list of the groups that have enabled processors for the respective layer. `bias_group` must reference one or more of the available groups. This check can be overridden using the command line option `--ignore-bias-groups` that allows any group or list of groups for any layer.
+“Available groups” is layer specific and is a list of the groups that have enabled processors for the respective layer. `bias_group` must reference one or more of the available groups.
 
 `bias_group` can be a list of integers or a single integer.
 
 
@@ -175,7 +175,7 @@ def write_mem(
                                 for (addr, val) in self.data_mem[group][proc][mem]:
                                     f.write(f'@{addr:04x} {val}\n')
 
-        if self.kernel_mem is not None and not self.zero_sram:
+        if self.kernel_mem is not None:
             try:
                 target_dir = target_dir = os.path.join(base_directory, test_name, 'masks')
                 os.makedirs(target_dir, exist_ok=False)
 
@@ -115,7 +115,7 @@ def load(
             weight_min.append(w_min)
             weight_max.append(w_max)
 
-            if op == 'conv2d' and operator[seq] == opn.CONVTRANSPOSE2D:
+            if operator[seq] == opn.CONVTRANSPOSE2D:
                 # For ConvTranspose2d, flip the weights as follows:
                 w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)
 
@@ -211,8 +211,8 @@ def load(
     if verbose:
         print(f'Checkpoint for epoch {checkpoint["epoch"]}, model {checkpoint["arch"]} - '
               'weight and bias data:')
-        print(' InCh OutCh  Weights         Quant Shift  Min Max   Size '
-              'Key                                 Bias       Quant  Min Max Size Key')
+        print(' InCh OutCh  Weights         Quant Shift  Min  Max   Size '
+              'Key                                 Bias       Quant  Min  Max Size Key')
         for ll in range(layers):
             if ll < len(weights) and weights[ll] is not None:
                 weight_shape = str(weights[ll].shape)
@@ -227,10 +227,10 @@ def load(
                 print(f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                       f'{weight_shape:15} '
                       f'{quant[ll]:5} {output_shift_shape:5} '
-                      f'{weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
+                      f'{weight_min[ll]:4} {weight_max[ll]:4} {weight_size[ll]:6} '
                       f'{weight_keys[ll]:35} '
                       f'{bias_shape:10} '
-                      f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
+                      f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:4} {bias_size[ll]:4} '
                       f'{bias_keys[ll]:25}')
         print(f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes')
 
 
@@ -198,6 +198,8 @@ def get_parser():
                        help="specify start offset for weights (debug, default: 0)")
     group.add_argument('--ignore-bias-groups', action='store_true', default=False,
                        help="do not force `bias_group` to use an active group (default: false)")
+    group.add_argument('--kernel-format', default='{0:4}', metavar='S',
+                       help="print format for kernels (default: '{0:4}')")
 
     # RTL sim
     group = parser.add_argument_group('RTL simulation')
 
@@ -66,7 +66,7 @@ def conv2d(
         fractional_stride,
         output_pad,
         groups=1,
-        debug=False,
+        debug=False,  # pylint: disable=unused-argument
 ):
     """
     Compute a 2D convolution.
@@ -77,80 +77,24 @@ def conv2d(
     in_channels = input_size[0]
     out_channels = output_size[0]
 
-    if debug:
-        # Slow route using pure Python
-        ref = np.full(shape=output_size, fill_value=np.nan, dtype=np.int64)
-        debug_print('k,c,x,y,weight,data,prod,cacc,acc')
-
-        for k in range(out_channels):
-            for y in range(-pad[0],
-                           input_size[1] - dilation[0] * (kernel_size[0] - 1) + pad[0],
-                           stride[0]):
-                for y_frac in range(fractional_stride[0]):
-                    for x in range(-pad[1],
-                                   input_size[2] - dilation[1] * (kernel_size[1] - 1) + pad[1],
-                                   stride[1]):
-                        for x_frac in range(fractional_stride[1]):
-                            val = np.int64(0)
-                            c = 0
-                            while True:
-                                dc = c if groups == 1 else c + k * (in_channels // groups)
-                                sval = np.int(0)
-                                for h in range(kernel_size[0]):
-                                    for w in range(kernel_size[1]):
-                                        ypos = (y + pad[0])*fractional_stride[0] - pad[0] \
-                                            + y_frac + h * dilation[0]
-                                        yd, yr = divmod(ypos, fractional_stride[0])
-                                        xpos = (x + pad[1])*fractional_stride[1] - pad[1] \
-                                            + x_frac + w * dilation[1]
-                                        xd, xr = divmod(xpos, fractional_stride[1])
-                                        if yr == 0 and 0 <= yd < input_size[1] and \
-                                           xr == 0 and 0 <= xd < input_size[2]:
-                                            prod = weight[k][c][h][w] * data[dc][yd][xd]
-                                            sval += prod
-                                            val += prod
-                                            stats.true_macc += 1
-                                            debug_print(
-                                                f'{k},{c},{x},{y},{weight[k][c][h][w]},'
-                                                f'{data[dc][yd][xd]},{prod},{sval},{val}'
-                                            )
-                                c += 16
-                                if c >= in_channels // groups:
-                                    c = (c + 1) % 16
-                                    if c in (0, in_channels // groups):
-                                        break
-
-                            if bias is not None:
-                                val += bias[k]
-                                debug_print(
-                                    f'     adding bias: {bias[k]} -> result: {val}'
-                                )
-
-                            ref[k][
-                                ((y + pad[0])*fractional_stride[0] + y_frac) // stride[0]
-                            ][
-                                ((x + pad[1])*fractional_stride[1] + x_frac) // stride[1]
-                            ] = val
-
-    # Fast computation using NumPy
-
     # Stretch data for fractionally-strided convolution
     if fractional_stride[0] > 1 or fractional_stride[1] > 1:
         ndata = np.zeros((data.shape[0],
-                          data.shape[1] * fractional_stride[0],
-                          data.shape[2] * fractional_stride[1]),
+                          data.shape[1] * fractional_stride[0] - 1,
+                          data.shape[2] * fractional_stride[1] - 1),
                          dtype=data.dtype)
         ndata[:, 0::fractional_stride[0], 0::fractional_stride[1]] = data
         data = ndata
 
-    # Create zero padding around data and stretch weights for dilation.
+    # Create zero padding around data
     if pad[0] or pad[1] or output_pad[0] or output_pad[1]:
         data = np.pad(data, pad_width=((0, 0),
-                                       (pad[0], pad[0]),
-                                       (pad[1], pad[1])),
+                                       (pad[0], pad[0] + output_pad[0]),
+                                       (pad[1], pad[1] + output_pad[1])),
                       mode='constant', constant_values=0)
 
     if dilation[0] > 1 or dilation[1] > 1:
+        # Stretch weights for dilation
         nweight = np.zeros((weight.shape[0], weight.shape[1],
                             (kernel_size[0] - 1) * dilation[0] + 1,
                             (kernel_size[1] - 1) * dilation[1] + 1),
@@ -182,15 +126,51 @@ def conv2d(
         for k in range(out_channels):
             output[k] += bias[k]
 
-    if debug:
-        if not (ref == output).all():
-            eprint('NumPy <-> Python mismatch in compute.conv2d')
-
-    assert output.shape == tuple(output_size), f'Shape mismatch: {output.shape} vs {output_size}'
+    assert output.shape == tuple(output_size), \
+        f'Shape mismatch: NumPy result {output.shape} vs expected {output_size}'
 
     return output
 
 
+def convtranspose2d(
+        data,
+        weight,
+        bias,
+        input_size,
+        output_size,
+        kernel_size,
+        stride,
+        pad,
+        dilation,
+        fractional_stride,
+        output_pad,
+        groups=1,
+        debug=False,
+):
+    """
+    Compute a transposed 2D convolution.
+    """
+
+    return conv2d(
+        data,
+        weight,
+        bias,
+        input_size,
+        output_size,
+        kernel_size,
+        stride,
+        (
+            dilation[0] * (kernel_size[0] - 1) - pad[0],
+            dilation[1] * (kernel_size[1] - 1) - pad[1]
+        ),
+        dilation,
+        fractional_stride,
+        output_pad,
+        groups,
+        debug,
+    )
+
+
 def conv1d(
         data,
         weight,
 
@@ -229,6 +229,7 @@ def main():
     pool = params['pool'][:layers]
     pool_stride = params['pool_stride'][:layers]
     padding = params['padding'][:layers]
+    output_padding = params['output_padding'][:layers]
     stride = params['stride'][:layers]
     dilation = params['dilation'][:layers]
     big_data = params['big_data'][:layers]
@@ -399,14 +400,12 @@ def main():
                                   (pooled_size[1] - dilation[ll][1] * (kernel_size[ll][1] - 1)
                                    - 1 + 2 * padding[ll][1]) // stride[ll][1] + 1]
             elif operator[ll] == op.CONVTRANSPOSE2D:
-                # output padding is always 1
-                output_padding = 1
                 output_dim[ll] = [(pooled_size[0] - 1) * stride[ll][0] - 2 * padding[ll][0]
                                   + dilation[ll][0] * (kernel_size[ll][0] - 1)
-                                  + output_padding + 1,
+                                  + output_padding[ll][0] + 1,
                                   (pooled_size[1] - 1) * stride[ll][1] - 2 * padding[ll][1]
                                   + dilation[ll][1] * (kernel_size[ll][1] - 1)
-                                  + output_padding + 1]
+                                  + output_padding[ll][1] + 1]
             else:  # Element-wise
                 output_dim[ll] = [pooled_size[0], pooled_size[1]]
             if flatten[ll]:
@@ -594,6 +593,8 @@ def main():
             fifo_go=args.fifo_go,
             pretend_zero_sram=args.pretend_zero_sram,
             ignore_bias_groups=args.ignore_bias_groups,
+            output_padding=output_padding,
+            kernel_format=args.kernel_format,
         )
         if not args.embedded_code and args.autogen.lower() != 'none':
             rtlsim.append_regression(
 
@@ -32,6 +32,7 @@ def load(
         processor_map,
         output_processor_map,
         out_expand,
+        groups_used,
         debug,  # pylint: disable=unused-argument
 ):
     """
@@ -74,7 +75,7 @@ def load(
             if ll == 0 and streaming[ll] and not tc.dev.SUPPORT_STREAM_BIAS:
                 bias_len[ll] += 1  # Work around a problem on AI85
 
-            bias_map += [(ll, group_map[ll] if bias_group_map[ll] is None else bias_group_map[ll],
+            bias_map += [(ll, groups_used if bias_group_map[ll] is None else bias_group_map[ll],
                           bias_len[ll])]
             continue
 
 
@@ -79,6 +79,7 @@ def load(  # pylint: disable=too-many-branches,too-many-statements
         api=False,
         start_offs=0,
         bypass=None,
+        zero_sram=False,
 ):
     """
     Stack `kernel` values and write them to C code (for `embedded_code` if `True` or
@@ -133,12 +134,15 @@ def load(  # pylint: disable=too-many-branches,too-many-statements
             in_exp = 1
             in_chan = in_expand_thresh[ll]
         elif calcx4[ll]:
+            # FIXME for output channels % 4 != 0
             kernel_reshaped = kernel[ll].reshape(
-                output_chan[ll],
+                output_chan[ll] // 4,
+                4,
                 in_expand[ll],
-                -1,
-            ).swapaxes(0, 1).reshape(
-                kernel[ll].shape,
+                in_expand_thresh[ll],
+                kernel_size[ll][0] * kernel_size[ll][1],
+            ).transpose(0, 2, 1, 3, -1).reshape(
+                kernel[ll].shape
             )
             in_exp = in_expand[ll]
             in_chan = input_chan[ll]
@@ -151,15 +155,15 @@ def load(  # pylint: disable=too-many-branches,too-many-statements
             kernel_reshaped = kernel_reshaped.copy().clip(-1, 0)
 
         if np.ndim(kernel_reshaped) > 2:
-            if kernel_reshaped.shape[-2] != kernel_size[ll][0] \
-               or kernel_reshaped.shape[-1] != kernel_size[ll][1]:
+            if kernel_reshaped.shape[-1] != kernel_size[ll][0] \
+               or kernel_reshaped.shape[-2] != kernel_size[ll][1]:
                 eprint(f'The configured kernel dimensions ({kernel_size[ll][0]}x'
-                       f'{kernel_size[ll][1]}) for layer {ll} do not match the binary weights '
-                       f'({kernel_reshaped.shape[-2]}x{kernel_reshaped.shape[-1]})!')
+                       f'{kernel_size[ll][1]}) for layer {ll} do not match the weights file '
+                       f'({kernel_reshaped.shape[-1]}x{kernel_reshaped.shape[-2]})!')
         else:
             if kernel_reshaped.shape[-1] != kernel_size[ll][0]:
                 eprint(f'The configured kernel dimensions ({kernel_size[ll][0]}) '
-                       f'for layer {ll} do not match the binary weights '
+                       f'for layer {ll} do not match the weights file '
                        f'({kernel_reshaped.shape[-1]})!')
 
         proc_map = processor_map[ll]
@@ -419,7 +423,8 @@ def add_kernel_data(ll, p, col_target, b):
                 ll = kernel_map[p][col]
                 if ll != _INVALID_VALUE:
                     k = kernel_data[p][col]
-                    apb.write_kern(ll, p, col, k, calcx4=calcx4[ll])
+                    if not zero_sram or np.any(k != 0):
+                        apb.write_kern(ll, p, col, k, calcx4=calcx4[ll])
         apb.function_footer()  # load_weights()
 
     if embedded_code or mexpress: