
Commit b5c8268: improved documentation
1 parent 1b73d88

File tree: 5 files changed, +162 -30 lines changed

docs/api/conv.md
Lines changed: 7 additions & 2 deletions

@@ -1,18 +1,23 @@
+
 ::: orthogonium.layers.conv.AOC.ortho_conv
     rendering:
         show_root_toc_entry: True
     selection:
         inherited_members: True
 
-
-
 ::: orthogonium.layers.conv.SLL.sll_layer
     rendering:
         show_root_toc_entry: True
     selection:
         inherited_members: True
 
 ::: orthogonium.layers.conv.AOL.aol
+    rendering:
+        show_root_toc_entry: True
+    selection:
+        inherited_members: True
+
+::: orthogonium.layers.conv.adaptiveSOC.ortho_conv
     rendering:
         show_root_toc_entry: True
     selection:

orthogonium/layers/conv/AOC/ortho_conv.py
Lines changed: 18 additions & 5 deletions

@@ -38,11 +38,17 @@ def AdaptiveOrthoConv2d(
     - Supports native striding, dilation, grouped convolutions, and flexible padding.
 
     Behavior:
-    -------------
+    ---------
     - When kernel_size == stride, the layer is an `RKOConv2d`.
     - When stride == 1, the layer is a `FastBlockConv2d`.
     - Otherwise, the layer is a `BcopRkoConv2d`.
 
+    Note:
+        - This implementation also works under zero padding; its Lipschitz constant remains tight, but it loses
+          orthogonality on the image border.
+        - Unit testing validated a tolerance of 1e-4 under various orthogonalization schemes (see
+          reparametrizers); only Cholesky-based methods were validated at a looser tolerance of 5e-2.
+
     Arguments:
         in_channels (int): Number of input channels.
         out_channels (int): Number of output channels.
@@ -63,9 +69,9 @@ def AdaptiveOrthoConv2d(
 
 
     References:
-        `[1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        [1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
 
     if kernel_size < stride:
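For reference, a minimal usage sketch of the factory documented above (not part of the commit; the import path is inferred from this file's location, and the norm check assumes circular padding with bias disabled):

```python
import torch
from orthogonium.layers.conv.AOC.ortho_conv import AdaptiveOrthoConv2d

# stride == 1, so per the Behavior section the factory dispatches to FastBlockConv2d
conv = AdaptiveOrthoConv2d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    padding="same",
    padding_mode="circular",  # circular padding keeps orthogonality on the borders
    bias=False,               # no bias, so norm preservation can be checked directly
)

x = torch.randn(4, 64, 32, 32)
y = conv(x)
# an orthogonal square layer preserves norms, up to the documented 1e-4 tolerance
print((y.norm() / x.norm()).item())  # expected to be close to 1.0
```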
@@ -124,6 +130,13 @@ def AdaptiveOrthoConvTranspose2d(
     - When stride == 1, the layer is a `FastBlockConvTranspose2D`.
     - Otherwise, the layer is a `BcopRkoConvTranspose2d`.
 
+
+    Note:
+        - This implementation also works under zero padding; its Lipschitz constant remains tight, but it loses
+          orthogonality on the image border.
+        - The current implementation of torch.nn.ConvTranspose2d does not support circular padding. One can
+          implement it manually by adding a padding layer before and setting padding = (0, 0).
+
     Arguments:
         in_channels (int): Number of input channels.
         out_channels (int): Number of output channels.
@@ -145,9 +158,9 @@ def AdaptiveOrthoConvTranspose2d(
 
 
     References:
-        `[1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        [1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
 
     if kernel_size < stride:
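The circular padding note above can be made concrete. A hedged sketch (again not part of the commit; the pad width assumes kernel_size // 2 per side, and the output may need cropping since padding=(0, 0) performs no trimming):

```python
import torch
import torch.nn.functional as F
from orthogonium.layers.conv.AOC.ortho_conv import AdaptiveOrthoConvTranspose2d

conv_t = AdaptiveOrthoConvTranspose2d(
    in_channels=32,
    out_channels=32,
    kernel_size=3,
    stride=1,
    padding=(0, 0),  # padding handled manually below, as the Note suggests
)

x = torch.randn(1, 32, 16, 16)
x = F.pad(x, (1, 1, 1, 1), mode="circular")  # manual circular padding, kernel_size // 2 per side
y = conv_t(x)  # output is slightly larger than the input; crop if exact shapes are needed
```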

orthogonium/layers/conv/SLL/sll_layer.py
Lines changed: 13 additions & 10 deletions

@@ -101,9 +101,9 @@ def __init__(self, cin, cout, inner_dim_factor, kernel_size=3, stride=2, **kwargs):
 
 
     References:
-        `Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
     super().__init__()
     inner_kernel_size = kernel_size - (stride - 1)
@@ -191,9 +191,10 @@ def __init__(self, cin, inner_dim_factor, kernel_size=3, **kwargs):
 
 
     References:
-        `Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
     """
     super().__init__()
@@ -250,9 +251,10 @@ def __init__(self, in_features, out_features, inner_dim, **kwargs):
 
 
     References:
-        `Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
     """
     super().__init__()
@@ -320,12 +322,13 @@ def __init__(
 
 
     References:
-        `[1] Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        [1] Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
-        `[2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
     super().__init__()

orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
Lines changed: 114 additions & 13 deletions

@@ -27,12 +27,65 @@ def AdaptiveSOCConv2d(
     ortho_params: OrthoParams = OrthoParams(),
 ) -> nn.Conv2d:
     """
-    factory function to create an Orthogonal Convolutional layer
-    choosing the appropriate class depending on the kernel size and stride.
+    Factory function to create an orthogonal convolutional layer, selecting the appropriate class based on kernel
+    size and stride. This is a modified implementation of the `Skew orthogonal convolution` [1], with significant
+    modifications from the original paper:
 
-    When kernel_size == stride, the layer is a RKOConv2d.
-    When stride == 1, the layer is a FlashBCOP.
-    Otherwise, the layer is a BcopRkoConv2d.
+    - This implementation provides an explicit kernel (larger than the original kernel size), so the forward pass
+      is done in a single iteration, as described in [2].
+    - This implementation avoids the use of channel padding to handle the case where cin != cout. Similarly,
+      stride is handled natively using the adaptive scheme.
+    - The "fantastic four" method is replaced by AOL, which reduces the number of iterations required to converge.
+
+    It aims to be more scalable to large networks and large image sizes, while enforcing orthogonality in the
+    convolutional layers. This layer also intends to be compatible with all the features of the `nn.Conv2d` class
+    (e.g., striding, dilation, grouping, etc.). This method has an explicit kernel, which means that the forward
+    operation is equivalent to a standard convolutional layer, but the weights are constrained to be orthogonal.
+
+    Note:
+        - This implementation changes the size of the kernel, which also changes the padding semantics. Please
+          adjust the padding according to the kernel size and the number of iterations.
+        - Current unit testing uses a tolerance of 8e-2, so this layer can be expected to be 1.08-Lipschitz
+          continuous. Similarly, the stable rank is evaluated loosely (it must be greater than 0.5).
+
+    Key Features:
+    -------------
+    - Enforces orthogonality, preserving gradient norms.
+    - Supports native striding, dilation, grouped convolutions, and flexible padding.
+
+    Behavior:
+    ---------
+    - When kernel_size == stride, the layer is an `RKOConv2d`.
+    - When stride == 1, the layer is a `FastBlockConv2d`.
+    - Otherwise, the layer is a `BcopRkoConv2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the convolution. Default is 1.
+        padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        padding_mode (str, optional): Padding mode. Default is "circular".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).
+
+    Raises:
+        `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
+
+    References:
+        [1] Singla, S., & Feizi, S. (2021, July). Skew orthogonal convolutions. In International Conference
+        on Machine Learning (pp. 9756-9766). PMLR. <https://arxiv.org/abs/2105.11417>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
+        <https://arxiv.org/abs/2501.07930>
     """
     if kernel_size < stride:
         raise ValueError(
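A usage sketch for this factory (not part of the commit; the import path is inferred from this file's location, and the weight-shape check assumes the returned layer exposes its explicit kernel as the docstring describes):

```python
import torch
from orthogonium.layers.conv.adaptiveSOC.ortho_conv import AdaptiveSOCConv2d

conv = AdaptiveSOCConv2d(
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    stride=1,
    padding="same",
    padding_mode="circular",
)

x = torch.randn(2, 16, 32, 32)
y = conv(x)  # the forward runs in a single pass thanks to the explicit kernel
# the explicit kernel is larger than the requested 3x3 (see the padding note above)
print(conv.weight.shape)
```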
@@ -72,16 +125,64 @@ def AdaptiveSOCConvTranspose2d(
     ortho_params: OrthoParams = OrthoParams(),
 ) -> nn.ConvTranspose2d:
     """
-    factory function to create an Orthogonal Convolutional Transpose layer
-    choosing the appropriate class depending on the kernel size and stride.
+    Factory function to create an orthogonal transposed convolutional layer, selecting the appropriate class based
+    on kernel size and stride. This is a modified implementation of the `Skew orthogonal convolution` [1], with
+    significant modifications from the original paper:
+
+    - This implementation provides an explicit kernel (larger than the original kernel size), so the forward pass
+      is done in a single iteration, as described in [2].
+    - This implementation avoids the use of channel padding to handle the case where cin != cout. Similarly,
+      stride is handled natively using the adaptive scheme.
+    - The "fantastic four" method is replaced by AOL, which reduces the number of iterations required to converge.
+
+    It aims to be more scalable to large networks and large image sizes, while enforcing orthogonality in the
+    convolutional layers. This layer also intends to be compatible with all the features of the `nn.Conv2d` class
+    (e.g., striding, dilation, grouping, etc.). This method has an explicit kernel, which means that the forward
+    operation is equivalent to a standard convolutional layer, but the weights are constrained to be orthogonal.
+
+    Note:
+        - This implementation changes the size of the kernel, which also changes the padding semantics. Please
+          adjust the padding according to the kernel size and the number of iterations.
+        - Current unit testing uses a tolerance of 8e-2, so this layer can be expected to be 1.08-Lipschitz
+          continuous. Similarly, the stable rank is evaluated loosely (it must be greater than 0.5).
+
+    Key Features:
+    -------------
+    - Enforces orthogonality, preserving gradient norms.
+    - Supports native striding, dilation, grouped convolutions, and flexible padding.
+
+    Behavior:
+    ---------
+    - When kernel_size == stride, the layer is an `RKOConv2d`.
+    - When stride == 1, the layer is a `FastBlockConv2d`.
+    - Otherwise, the layer is a `BcopRkoConv2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the convolution. Default is 1.
+        padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        padding_mode (str, optional): Padding mode. Default is "circular".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.ConvTranspose2d`.
+
+    Raises:
+        `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
 
-    As we handle native striding with explicit kernel. It unlocks
-    the possibility to use the same parametrization for transposed convolutions.
-    This class uses the same interface as the ConvTranspose2d class.
-    Unfortunately, circular padding is not supported for the transposed convolution.
-    But unit testing have shown that the convolution is still orthogonal when
-    `out_channels * (stride**2) > in_channels`.
+    References:
+        [1] Singla, S., & Feizi, S. (2021, July). Skew orthogonal convolutions. In International Conference
+        on Machine Learning (pp. 9756-9766). PMLR. <https://arxiv.org/abs/2105.11417>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
+        <https://arxiv.org/abs/2501.07930>
     """
     if kernel_size < stride:
         raise ValueError(
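A quick sketch of the guard rail documented under Raises, plus one valid configuration (not part of the commit; padding is passed explicitly as an integer here, since transposed convolutions in torch do not accept string padding):

```python
import torch
from orthogonium.layers.conv.adaptiveSOC.ortho_conv import AdaptiveSOCConvTranspose2d

# kernel_size < stride cannot be orthogonal, so the factory raises ValueError
try:
    AdaptiveSOCConvTranspose2d(in_channels=8, out_channels=8, kernel_size=1, stride=2)
except ValueError as err:
    print("rejected:", err)

# kernel_size == stride is valid (the RKO-style branch per the Behavior section)
up = AdaptiveSOCConvTranspose2d(in_channels=8, out_channels=8, kernel_size=2, stride=2, padding=0)
y = up(torch.randn(1, 8, 16, 16))
print(y.shape)  # spatial dimensions are doubled by the stride-2 transposed convolution
```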

orthogonium/layers/conv/singular_values/get_sv.py
Lines changed: 10 additions & 0 deletions

@@ -14,6 +14,7 @@ def get_conv_sv(
     """
     Computes the Lipschitz constant (and optional 'stability rank') of a convolution layer. This uses the layer
     parameters to decide the correct function to call, depending on the padding mode, the shape of the kernel,
     and the stride.
+    Under the hood it uses the methods described in [1] and [2].
 
     Parameters:
         layer (torch.nn.Module): Convolutional layer to compute the Lipschitz constant for. It must have a weight
@@ -36,6 +37,15 @@ def get_conv_sv(
     There is currently an issue when estimating the Lipschitz constant of a layer with circular padding and
     asymmetric padding (i.e., even kernel size and no stride). The function may return a Lipschitz constant lower
     than the actual value.
+
+    References:
+        [1] Delattre, B., Barthélemy, Q., Araujo, A., & Allauzen, A. (2023, July).
+        Efficient bound of Lipschitz constant for convolutional layers by gram iteration.
+        In International Conference on Machine Learning (pp. 7513-7532). PMLR.
+        <https://arxiv.org/abs/2305.16173>
+        [2] Delattre, B., Barthélemy, Q., & Allauzen, A. (2024).
+        Spectral Norm of Convolutional Layers with Circular and Zero Paddings.
+        <https://arxiv.org/abs/2402.00240>
     """
 
     def _compute_grouped_lip(
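A hedged usage sketch (not part of the commit; only the `layer` parameter of `get_conv_sv` is visible in this hunk, so any further keyword arguments are omitted, and a plain `nn.Conv2d` is used since the docstring only requires a layer exposing a weight tensor):

```python
import torch.nn as nn
from orthogonium.layers.conv.singular_values.get_sv import get_conv_sv

# a standard convolution with circular padding; get_conv_sv picks the right
# estimator from the padding mode, the kernel shape, and the stride
layer = nn.Conv2d(16, 32, kernel_size=3, padding=1, padding_mode="circular")
result = get_conv_sv(layer)  # Lipschitz constant, possibly with the 'stability rank'
print(result)
```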
