Fix bug and add new sandwich methods

Frederik Rahbaek Warburg · Frederik Rahbaek Warburg · commit c59cefa5c9b7 · 2022-04-06T13:53:33.000+02:00
diff --git a/stochman/nnj.py b/stochman/nnj.py
@@ -76,10 +76,8 @@ def _jacobian_wrt_input_mult_left_vec(self, x: Tensor, val: Tensor, jac_in: Tens
     def _jacobian_wrt_input_transpose_mult_left_vec(self, x: Tensor, val: Tensor, jac_in: Tensor) -> Tensor:
         return F.linear(jac_in.movedim(1, -1), self.weight.T, bias=None).movedim(-1, 1)
 
-    def _sandwich_full_wrt_input(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
-        return torch.einsum("nm,bnj,jk->bmk", self.weight, tmp, self.weight)
-
-    def _sandwich_full_wrt_weight(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
+    def _jacobian_wrt_weight_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        
         b, c = x.shape
         diag_elements = torch.diagonal(tmp, dim1=1, dim2=2)
         feat_k2 = (x**2).unsqueeze(1)
@@ -92,6 +90,9 @@ def _sandwich_full_wrt_weight(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tens
 
         return h_k
 
+    def _jacobian_wrt_input_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        return torch.einsum("nm,bnj,jk->bmk", self.weight, tmp, self.weight)
+
 
 class PosLinear(AbstractJacobian, nn.Linear):
     def forward(self, x: Tensor):
@@ -123,6 +124,14 @@ def _jacobian_wrt_input_mult_left_vec(self, x: Tensor, val: Tensor, jac_in: Tens
             .movedim(dims2, dims1)
         )
 
+    def _jacobian_wrt_weight_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        # non-parametric, so return an empty list
+        return []
+
+    def _jacobian_wrt_input_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        raise NotImplementedError
+
+
 
 class Conv1d(AbstractJacobian, nn.Conv1d):
     def _jacobian_wrt_input_mult_left_vec(self, x: Tensor, val: Tensor, jac_in: Tensor) -> Tensor:
@@ -358,7 +367,7 @@ def _jacobian_wrt_weight_T_mult_right(
                         dilation=self.dilation,
                         groups=self.groups,
                     )
-                    .reshape(b, *tmp_single_batch.shape[4:], c1, kernel_h, kernel_w)
+                    .reshape(c2, *tmp_single_batch.shape[4:], c1, kernel_h, kernel_w)
                     .movedim((-3, -2, -1), (1, 2, 3))
                 )
 
@@ -381,7 +390,7 @@ def _jacobian_wrt_weight_T_mult_right(
                     dilation=self.dilation,
                     groups=self.groups,
                 )
-                .reshape(b, *tmp.shape[4:], c1, kernel_h, kernel_w)
+                .reshape(c2, *tmp.shape[4:], c1, kernel_h, kernel_w)
                 .movedim((-3, -2, -1), (1, 2, 3))
             )
 
@@ -390,6 +399,18 @@ def _jacobian_wrt_weight_T_mult_right(
 
         return Jt_tmp
 
+    def _jacobian_wrt_input_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        if diag:
+            return self._jacobian_wrt_input_diag_sandwich(x, val, tmp)
+        else:
+            return self._jacobian_wrt_input_full_sandwich(x, val, tmp)
+
+    def _jacobian_wrt_weight_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        if diag:
+            return self._jacobian_wrt_weight_diag_sandwich(x, val, tmp)
+        else:
+            return self._jacobian_wrt_weight_full_sandwich(x, val, tmp)
+
     def _jacobian_wrt_input_full_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
         return self._jacobian_wrt_input_mult_left(x, val, self._jacobian_wrt_input_T_mult_right(x, val, tmp))
 
@@ -662,14 +683,21 @@ def _jacobian_wrt_input_mult_left_vec(self, x: Tensor, val: Tensor, jac_in: Tens
         jac_in = jac_in[arange_repeated, idx, :, :, :].reshape(*val.shape, *jac_in_orig_shape[4:])
         return jac_in
 
-    def _sandwich_full_wrt_input(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
+    def _jacobian_wrt_weight_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        # non-parametric, so return an empty list
+        return []
+
+    def _jacobian_wrt_input_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        return self._jacobian_wrt_input_full_sandwich(x, val, tmp)
+
+    def _jacobian_wrt_input_full_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
 
         new_tmp = torch.zeros_like(x)
         new_tmp[self.idx] = tmp
 
         return new_tmp
 
-    def _sandwich_diag_wrt_input(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
+    def _jacobian_wrt_input_diag_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
         pass
 
 
@@ -758,15 +786,25 @@ def _jacobian(self, x: Tensor, val: Tensor) -> Tensor:
         jac = 1.0 - val**2
         return jac
 
-    def _sandwich_full_wrt_input(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
+    def _jacobian_wrt_weight_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        # non-parametric, so return an empty list
+        return []
+
+    def _jacobian_wrt_input_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor, diag: bool = False) -> Tensor:
+        if diag:
+            return self._jacobian_wrt_input_diag_sandwich(x, val, tmp)
+        else:
+            return self._jacobian_wrt_input_full_sandwich(x, val, tmp)
+
+    def _jacobian_wrt_input_full_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
 
         jac = self._jacobian(x, val)
         jac = torch.diag_embed(jac.view(x.shape[0], -1))
         tmp = torch.einsum("bnm,bnj,bjk->bmk", jac, tmp, jac)
 
         return tmp
 
-    def _sandwich_diag_wrt_input(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
+    def _jacobian_wrt_input_diag_sandwich(self, x: Tensor, val: Tensor, tmp: Tensor) -> Tensor:
 
         jac = self._jacobian(x, val)
         jac = jac.view(x.shape[0], -1)