
Commit b5c8268: improved documentation
1 parent 1b73d88

File tree: 5 files changed, +162 -30 lines changed

docs/api/conv.md
Lines changed: 7 additions & 2 deletions

@@ -1,18 +1,23 @@
+
 ::: orthogonium.layers.conv.AOC.ortho_conv
     rendering:
         show_root_toc_entry: True
     selection:
         inherited_members: True
 
-
-
 ::: orthogonium.layers.conv.SLL.sll_layer
     rendering:
         show_root_toc_entry: True
     selection:
         inherited_members: True
 
 ::: orthogonium.layers.conv.AOL.aol
+    rendering:
+        show_root_toc_entry: True
+    selection:
+        inherited_members: True
+
+::: orthogonium.layers.conv.adaptiveSOC.ortho_conv
     rendering:
         show_root_toc_entry: True
     selection:

orthogonium/layers/conv/AOC/ortho_conv.py
Lines changed: 18 additions & 5 deletions

@@ -38,11 +38,17 @@ def AdaptiveOrthoConv2d(
     - Supports native striding, dilation, grouped convolutions, and flexible padding.
 
     Behavior:
-    -------------
+    ---------
     - When kernel_size == stride, the layer is an `RKOConv2d`.
     - When stride == 1, the layer is a `FastBlockConv2d`.
     - Otherwise, the layer is a `BcopRkoConv2d`.
 
+    Note:
+        - This implementation also works under zero padding; its Lipschitz constant remains tight, but it loses
+          orthogonality on the image border.
+        - Unit testing validated a tolerance of 1e-4 under various orthogonalization schemes (see
+          reparametrizers); only Cholesky-based methods were validated at a looser tolerance of 5e-2.
+
     Arguments:
         in_channels (int): Number of input channels.
         out_channels (int): Number of output channels.
@@ -63,9 +69,9 @@ def AdaptiveOrthoConv2d(
 
 
     References:
-        `[1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        [1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
 
     if kernel_size < stride:
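For reference, a minimal usage sketch of the factory documented above (not part of the commit; the import path is inferred from this file's location, and the norm check assumes circular padding with bias disabled):

```python
import torch
from orthogonium.layers.conv.AOC.ortho_conv import AdaptiveOrthoConv2d

# stride == 1, so per the Behavior section the factory dispatches to FastBlockConv2d
conv = AdaptiveOrthoConv2d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    padding="same",
    padding_mode="circular",  # circular padding keeps orthogonality on the borders
    bias=False,               # no bias, so norm preservation can be checked directly
)

x = torch.randn(4, 64, 32, 32)
y = conv(x)
# an orthogonal square layer preserves norms, up to the documented 1e-4 tolerance
print((y.norm() / x.norm()).item())  # expected to be close to 1.0
```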
@@ -124,6 +130,13 @@ def AdaptiveOrthoConvTranspose2d(
     - When stride == 1, the layer is a `FastBlockConvTranspose2D`.
     - Otherwise, the layer is a `BcopRkoConvTranspose2d`.
 
+
+    Note:
+        - This implementation also works under zero padding; its Lipschitz constant remains tight, but it loses
+          orthogonality on the image border.
+        - The current implementation of torch.nn.ConvTranspose2d does not support circular padding. One can
+          implement it manually by adding a padding layer before and setting padding = (0, 0).
+
     Arguments:
         in_channels (int): Number of input channels.
         out_channels (int): Number of output channels.
@@ -145,9 +158,9 @@ def AdaptiveOrthoConvTranspose2d(
 
 
     References:
-        `[1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        [1] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
 
     if kernel_size < stride:
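The circular padding note above can be made concrete. A hedged sketch (again not part of the commit; the pad width assumes kernel_size // 2 per side, and the output may need cropping since padding=(0, 0) performs no trimming):

```python
import torch
import torch.nn.functional as F
from orthogonium.layers.conv.AOC.ortho_conv import AdaptiveOrthoConvTranspose2d

conv_t = AdaptiveOrthoConvTranspose2d(
    in_channels=32,
    out_channels=32,
    kernel_size=3,
    stride=1,
    padding=(0, 0),  # padding handled manually below, as the Note suggests
)

x = torch.randn(1, 32, 16, 16)
x = F.pad(x, (1, 1, 1, 1), mode="circular")  # manual circular padding, kernel_size // 2 per side
y = conv_t(x)  # output is slightly larger than the input; crop if exact shapes are needed
```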

orthogonium/layers/conv/SLL/sll_layer.py
Lines changed: 13 additions & 10 deletions

@@ -101,9 +101,9 @@ def __init__(self, cin, cout, inner_dim_factor, kernel_size=3, stride=2, **kwargs):
 
 
     References:
-        `Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
     super().__init__()
     inner_kernel_size = kernel_size - (stride - 1)
@@ -191,9 +191,10 @@ def __init__(self, cin, inner_dim_factor, kernel_size=3, **kwargs):
 
 
     References:
-        `Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
     """
     super().__init__()
@@ -250,9 +251,10 @@ def __init__(self, in_features, out_features, inner_dim, **kwargs):
 
 
     References:
-        `Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
     """
     super().__init__()
@@ -320,12 +322,13 @@ def __init__(
 
 
     References:
-        `[1] Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
+        [1] Araujo, A., Havens, A. J., Delattre, B., Allauzen, A., & Hu, B.
         A Unified Algebraic Perspective on Lipschitz Neural Networks.
-        In The Eleventh International Conference on Learning Representations.<https://arxiv.org/abs/2303.03169>`_
-        `[2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        In The Eleventh International Conference on Learning Representations.
+        <https://arxiv.org/abs/2303.03169>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
         An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
-        <https://arxiv.org/abs/2501.07930>`_
+        <https://arxiv.org/abs/2501.07930>
     """
     super().__init__()

orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
Lines changed: 114 additions & 13 deletions

@@ -27,12 +27,65 @@ def AdaptiveSOCConv2d(
     ortho_params: OrthoParams = OrthoParams(),
 ) -> nn.Conv2d:
     """
-    factory function to create an Orthogonal Convolutional layer
-    choosing the appropriate class depending on the kernel size and stride.
+    Factory function to create an orthogonal convolutional layer, selecting the appropriate class based on kernel
+    size and stride. This is a modified implementation of the `Skew orthogonal convolution` [1], with significant
+    modifications from the original paper:
 
-    When kernel_size == stride, the layer is a RKOConv2d.
-    When stride == 1, the layer is a FlashBCOP.
-    Otherwise, the layer is a BcopRkoConv2d.
+    - This implementation provides an explicit kernel (larger than the original kernel size), so the forward pass
+      is done in a single iteration, as described in [2].
+    - This implementation avoids the use of channel padding to handle the case where cin != cout. Similarly,
+      stride is handled natively using the adaptive scheme.
+    - The "fantastic four" method is replaced by AOL, which reduces the number of iterations required to converge.
+
+    It aims to be more scalable to large networks and large image sizes, while enforcing orthogonality in the
+    convolutional layers. This layer also intends to be compatible with all the features of the `nn.Conv2d` class
+    (e.g., striding, dilation, grouping, etc.). This method has an explicit kernel, which means that the forward
+    operation is equivalent to a standard convolutional layer, but the weights are constrained to be orthogonal.
+
+    Note:
+        - This implementation changes the size of the kernel, which also changes the padding semantics. Please
+          adjust the padding according to the kernel size and the number of iterations.
+        - Current unit testing uses a tolerance of 8e-2, so this layer can be expected to be 1.08-Lipschitz
+          continuous. Similarly, the stable rank is evaluated loosely (it must be greater than 0.5).
+
+    Key Features:
+    -------------
+    - Enforces orthogonality, preserving gradient norms.
+    - Supports native striding, dilation, grouped convolutions, and flexible padding.
+
+    Behavior:
+    ---------
+    - When kernel_size == stride, the layer is an `RKOConv2d`.
+    - When stride == 1, the layer is a `FastBlockConv2d`.
+    - Otherwise, the layer is a `BcopRkoConv2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the convolution. Default is 1.
+        padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        padding_mode (str, optional): Padding mode. Default is "circular".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).
+
+    Raises:
+        `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
+
+    References:
+        [1] Singla, S., & Feizi, S. (2021, July). Skew orthogonal convolutions. In International Conference
+        on Machine Learning (pp. 9756-9766). PMLR. <https://arxiv.org/abs/2105.11417>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
+        <https://arxiv.org/abs/2501.07930>
     """
     if kernel_size < stride:
         raise ValueError(
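A usage sketch for this factory (not part of the commit; the import path is inferred from this file's location, and the weight-shape check assumes the returned layer exposes its explicit kernel as the docstring describes):

```python
import torch
from orthogonium.layers.conv.adaptiveSOC.ortho_conv import AdaptiveSOCConv2d

conv = AdaptiveSOCConv2d(
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    stride=1,
    padding="same",
    padding_mode="circular",
)

x = torch.randn(2, 16, 32, 32)
y = conv(x)  # the forward runs in a single pass thanks to the explicit kernel
# the explicit kernel is larger than the requested 3x3 (see the padding note above)
print(conv.weight.shape)
```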
@@ -72,16 +125,64 @@ def AdaptiveSOCConvTranspose2d(
     ortho_params: OrthoParams = OrthoParams(),
 ) -> nn.ConvTranspose2d:
     """
-    factory function to create an Orthogonal Convolutional Transpose layer
-    choosing the appropriate class depending on the kernel size and stride.
+    Factory function to create an orthogonal transposed convolutional layer, selecting the appropriate class based
+    on kernel size and stride. This is a modified implementation of the `Skew orthogonal convolution` [1], with
+    significant modifications from the original paper:
+
+    - This implementation provides an explicit kernel (larger than the original kernel size), so the forward pass
+      is done in a single iteration, as described in [2].
+    - This implementation avoids the use of channel padding to handle the case where cin != cout. Similarly,
+      stride is handled natively using the adaptive scheme.
+    - The "fantastic four" method is replaced by AOL, which reduces the number of iterations required to converge.
+
+    It aims to be more scalable to large networks and large image sizes, while enforcing orthogonality in the
+    convolutional layers. This layer also intends to be compatible with all the features of the `nn.Conv2d` class
+    (e.g., striding, dilation, grouping, etc.). This method has an explicit kernel, which means that the forward
+    operation is equivalent to a standard convolutional layer, but the weights are constrained to be orthogonal.
+
+    Note:
+        - This implementation changes the size of the kernel, which also changes the padding semantics. Please
+          adjust the padding according to the kernel size and the number of iterations.
+        - Current unit testing uses a tolerance of 8e-2, so this layer can be expected to be 1.08-Lipschitz
+          continuous. Similarly, the stable rank is evaluated loosely (it must be greater than 0.5).
+
+    Key Features:
+    -------------
+    - Enforces orthogonality, preserving gradient norms.
+    - Supports native striding, dilation, grouped convolutions, and flexible padding.
+
+    Behavior:
+    ---------
+    - When kernel_size == stride, the layer is an `RKOConv2d`.
+    - When stride == 1, the layer is a `FastBlockConv2d`.
+    - Otherwise, the layer is a `BcopRkoConv2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the convolution. Default is 1.
+        padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        padding_mode (str, optional): Padding mode. Default is "circular".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.ConvTranspose2d`.
+
+    Raises:
+        `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
 
-    As we handle native striding with explicit kernel. It unlocks
-    the possibility to use the same parametrization for transposed convolutions.
-    This class uses the same interface as the ConvTranspose2d class.
-    Unfortunately, circular padding is not supported for the transposed convolution.
-    But unit testing have shown that the convolution is still orthogonal when
-    `out_channels * (stride**2) > in_channels`.
+    References:
+        [1] Singla, S., & Feizi, S. (2021, July). Skew orthogonal convolutions. In International Conference
+        on Machine Learning (pp. 9756-9766). PMLR. <https://arxiv.org/abs/2105.11417>
+        [2] Boissin, T., Mamalet, F., Fel, T., Picard, A. M., Massena, T., & Serrurier, M. (2025).
+        An Adaptive Orthogonal Convolution Scheme for Efficient and Flexible CNN Architectures.
+        <https://arxiv.org/abs/2501.07930>
     """
     if kernel_size < stride:
         raise ValueError(
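A quick sketch of the guard rail documented under Raises, plus one valid configuration (not part of the commit; padding is passed explicitly as an integer here, since transposed convolutions in torch do not accept string padding):

```python
import torch
from orthogonium.layers.conv.adaptiveSOC.ortho_conv import AdaptiveSOCConvTranspose2d

# kernel_size < stride cannot be orthogonal, so the factory raises ValueError
try:
    AdaptiveSOCConvTranspose2d(in_channels=8, out_channels=8, kernel_size=1, stride=2)
except ValueError as err:
    print("rejected:", err)

# kernel_size == stride is valid (the RKO-style branch per the Behavior section)
up = AdaptiveSOCConvTranspose2d(in_channels=8, out_channels=8, kernel_size=2, stride=2, padding=0)
y = up(torch.randn(1, 8, 16, 16))
print(y.shape)  # spatial dimensions are doubled by the stride-2 transposed convolution
```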

orthogonium/layers/conv/singular_values/get_sv.py
Lines changed: 10 additions & 0 deletions

@@ -14,6 +14,7 @@ def get_conv_sv(
     """
     Computes the Lipschitz constant (and optional 'stability rank') of a convolution layer. This uses the layer
     parameters to decide the correct function to call, depending on the padding mode, the shape of the kernel,
     and the stride.
+    Under the hood it uses the methods described in [1] and [2].
 
     Parameters:
         layer (torch.nn.Module): Convolutional layer to compute the Lipschitz constant for. It must have a weight
@@ -36,6 +37,15 @@ def get_conv_sv(
     There is currently an issue when estimating the Lipschitz constant of a layer with circular padding and
     asymmetric padding (i.e., even kernel size and no stride). The function may return a Lipschitz constant lower
     than the actual value.
+
+    References:
+        [1] Delattre, B., Barthélemy, Q., Araujo, A., & Allauzen, A. (2023, July).
+        Efficient bound of Lipschitz constant for convolutional layers by gram iteration.
+        In International Conference on Machine Learning (pp. 7513-7532). PMLR.
+        <https://arxiv.org/abs/2305.16173>
+        [2] Delattre, B., Barthélemy, Q., & Allauzen, A. (2024).
+        Spectral Norm of Convolutional Layers with Circular and Zero Paddings.
+        <https://arxiv.org/abs/2402.00240>
     """
 
     def _compute_grouped_lip(
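A hedged usage sketch (not part of the commit; only the `layer` parameter of `get_conv_sv` is visible in this hunk, so any further keyword arguments are omitted, and a plain `nn.Conv2d` is used since the docstring only requires a layer exposing a weight tensor):

```python
import torch.nn as nn
from orthogonium.layers.conv.singular_values.get_sv import get_conv_sv

# a standard convolution with circular padding; get_conv_sv picks the right
# estimator from the padding mode, the kernel shape, and the stride
layer = nn.Conv2d(16, 32, kernel_size=3, padding=1, padding_mode="circular")
result = get_conv_sv(layer)  # Lipschitz constant, possibly with the 'stability rank'
print(result)
```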
