5 changes: 4 additions & 1 deletion README.md
@@ -29,6 +29,9 @@
<a href="#">
<img src="https://img.shields.io/badge/License-MIT-efefef">
</a>
<a href="https://thib-s.github.io/orthogonium/">
<img alt="Documentation" src="https://img.shields.io/badge/Docs-here-0000ff">
</a>
</div>
<br>

@@ -39,7 +42,7 @@ build orthogonal layers, with a focus on convolutional layers. We noticed that
significant role in the final performance: a more efficient implementation
allows larger networks and more training steps within the same compute
budget. So our implementation differs from original papers in order to
be faster, to consume less memory or be more flexible.
be faster, to consume less memory or be more flexible. Feel free to read the [documentation](https://thib-s.github.io/orthogonium/)!

# 📃 What is included in this library?

5 changes: 5 additions & 0 deletions docs/api/activations.md
@@ -0,0 +1,5 @@
::: orthogonium.layers.custom_activations
rendering:
show_root_toc_entry: True
selection:
inherited_members: True
5 changes: 5 additions & 0 deletions docs/api/losses.md
@@ -0,0 +1,5 @@
::: orthogonium.losses
rendering:
show_root_toc_entry: True
selection:
inherited_members: True
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -7,6 +7,8 @@ nav:
- convolutions: api/conv.md
- linear layers: api/linear.md
- reparametrizers: api/reparametrizers.md
- activations: api/activations.md
- losses: api/losses.md
# - layers.conv.AOC module: api/aoc.md
# - layers.conv.adaptiveSOC module: api/adaptiveSOC.md
# - layers.conv.SLL module: api/sll.md
27 changes: 2 additions & 25 deletions orthogonium/layers/conv/AOC/fast_block_ortho_conv.py
@@ -156,7 +156,6 @@ def __init__(
out_channels,
kernel_size,
groups,
contiguous_optimization=False,
):
"""This module is used to generate orthogonal kernels for the BCOP layer. It takes
as input a matrix PQ of shape (groups, 2*kernel_size, c, c//2) and returns a kernel
@@ -167,9 +166,6 @@ def __init__(
out_channels (int): number of output channels
kernel_size (int): size of the kernel
groups (int): number of groups
contiguous_optimization (bool, optional): if True, the kernel will have twice the
number of channels. This is used to increase expressiveness, but at the price
of orthogonality (not Lipschitzness). Defaults to False.
"""
super(BCOPTrivializer, self).__init__()
self.kernel_size = kernel_size
@@ -178,9 +174,6 @@ def __init__(
self.in_channels = in_channels
self.min_channels = min(in_channels, out_channels)
self.max_channels = max(in_channels, out_channels)
if contiguous_optimization:
self.max_channels *= 2
self.contiguous_optimization = contiguous_optimization
self.transpose = out_channels < in_channels
self.num_kernels = 2 * kernel_size

@@ -249,12 +242,6 @@ def forward(self, PQ):
res = c11
for i in range(c22.shape[0]): # c22.shape[0] == 1 if k-1 is a power of two
res = fast_matrix_conv(res, c22[i], self.groups)
# if contiguous optimization is enabled, we constructed a conv with twice the number
# of channels, we need to remove the extra channels
if self.contiguous_optimization:
res = res[
: self.max_channels // 2, : self.min_channels // self.groups, :, :
]
# since it is less expensive to compute the transposed kernel when co < ci
# we transpose the kernel if needed
if self.transpose:
@@ -288,24 +275,15 @@ def attach_bcop_weight(
num_kernels = (
2 * kernel_size
) # the number of projectors needed to create the kernel
contiguous_optimization = ortho_params.contiguous_optimization
# register projectors matrices
layer.register_parameter(
weight_name,
torch.nn.Parameter(
torch.Tensor(
groups,
num_kernels,
(
2 * max_channels // groups
if contiguous_optimization
else max_channels // groups
),
(
max_channels // groups
if contiguous_optimization
else max_channels // (groups * 2)
),
(max_channels // groups),
(max_channels // (groups * 2)),
),
requires_grad=True,
),
@@ -343,7 +321,6 @@ def attach_bcop_weight(
out_channels,
kernel_size,
groups,
contiguous_optimization=contiguous_optimization,
),
unsafe=True,
)
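The `forward` method above assembles the final BCOP kernel by repeatedly merging small orthogonal factors with `fast_matrix_conv`. The identity this relies on is that two stacked (unstrided, unpadded) convolutions collapse into a single convolution whose kernel is the channel-contracted convolution of the two kernels. The sketch below only illustrates that identity: `compose_kernels` is a hypothetical helper written for this note (it ignores grouping and striding) and is not the library's `fast_matrix_conv`.

```python
import torch
import torch.nn.functional as F

def compose_kernels(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # a: (c_mid, c_in, ka, ka) is applied first, b: (c_out, c_mid, kb, kb) second.
    # Returns the (c_out, c_in, ka + kb - 1, ka + kb - 1) kernel of the composition.
    kb = b.shape[-1]
    # Full 2D convolution of the two kernels, contracted over the middle channels:
    # treat `a` as a batch of c_in images with c_mid channels and convolve it with a
    # flipped `b` (F.conv2d computes a correlation, hence the flip).
    k = F.conv2d(a.transpose(0, 1), b.flip([-1, -2]), padding=kb - 1)
    return k.transpose(0, 1)

a = torch.randn(8, 4, 3, 3)    # c_in=4  -> c_mid=8
b = torch.randn(6, 8, 3, 3)    # c_mid=8 -> c_out=6
x = torch.randn(1, 4, 16, 16)

two_convs = F.conv2d(F.conv2d(x, a), b)          # apply the two kernels in sequence
one_conv = F.conv2d(x, compose_kernels(a, b))    # apply the merged 5x5 kernel once
print(torch.allclose(two_convs, one_conv, atol=1e-4))  # True (up to float32 error)
```

In the BCOP construction above, `2 * kernel_size` such factors (see `num_kernels` in the diff) are merged this way, which is why a full k x k orthogonal kernel can be built from small orthogonal blocks.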
104 changes: 54 additions & 50 deletions orthogonium/layers/conv/AOC/ortho_conv.py
@@ -27,32 +27,34 @@ def AdaptiveOrthoConv2d(
"""
Factory function to create an orthogonal convolutional layer, selecting the appropriate class based on kernel size and stride.

**Key Features:**
- Enforces orthogonality, preserving gradient norms.
- Supports native striding, dilation, grouped convolutions, and flexible padding.

**Behavior:**
- When kernel_size == stride, the layer is an `RKOConv2d`.
- When stride == 1, the layer is a `FastBlockConv2d`.
- Otherwise, the layer is a `BcopRkoConv2d`.

**Arguments:**
- `in_channels` (int): Number of input channels.
- `out_channels` (int): Number of output channels.
- `kernel_size` (_size_2_t): Size of the convolution kernel.
- `stride` (_size_2_t, optional): Stride of the convolution. Default is 1.
- `padding` (str or _size_2_t, optional): Padding mode or size. Default is "same".
- `dilation` (_size_2_t, optional): Dilation rate. Default is 1.
- `groups` (int, optional): Number of blocked connections from input to output channels. Default is 1.
- `bias` (bool, optional): Whether to include a learnable bias. Default is True.
- `padding_mode` (str, optional): Padding mode. Default is "circular".
- `ortho_params` (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.

**Returns:**
- A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).

**Raises:**
- `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
Key Features:
-------------
- Enforces orthogonality, preserving gradient norms.
- Supports native striding, dilation, grouped convolutions, and flexible padding.

Behavior:
---------
- When kernel_size == stride, the layer is an `RKOConv2d`.
- When stride == 1, the layer is a `FastBlockConv2d`.
- Otherwise, the layer is a `BcopRkoConv2d`.

Arguments:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_size (_size_2_t): Size of the convolution kernel.
stride (_size_2_t, optional): Stride of the convolution. Default is 1.
padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
dilation (_size_2_t, optional): Dilation rate. Default is 1.
groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
bias (bool, optional): Whether to include a learnable bias. Default is True.
padding_mode (str, optional): Padding mode. Default is "circular".
ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.

Returns:
A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).

Raises:
`ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
"""

if kernel_size < stride:
@@ -95,30 +97,32 @@ def AdaptiveOrthoConvTranspose2d(
"""
Factory function to create an orthogonal convolutional transpose layer, adapting based on kernel size and stride.

**Key Features:**
- Ensures orthogonality in transpose convolutions for stable gradient propagation.
- Supports dilation, grouped operations, and efficient kernel construction.

**Behavior:**
- When kernel_size == stride, the layer is an `RkoConvTranspose2d`.
- When stride == 1, the layer is a `FastBlockConvTranspose2D`.
- Otherwise, the layer is a `BcopRkoConvTranspose2d`.

**Arguments:**
- `in_channels` (int): Number of input channels.
- `out_channels` (int): Number of output channels.
- `kernel_size` (_size_2_t): Size of the convolution kernel.
- `stride` (_size_2_t, optional): Stride of the transpose convolution. Default is 1.
- `padding` (_size_2_t, optional): Padding size. Default is 0.
- `output_padding` (_size_2_t, optional): Additional size for output. Default is 0.
- `groups` (int, optional): Number of groups. Default is 1.
- `bias` (bool, optional): Whether to include a learnable bias. Default is True.
- `dilation` (_size_2_t, optional): Dilation rate. Default is 1.
- `padding_mode` (str, optional): Padding mode. Default is "zeros".
- `ortho_params` (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.

**Returns:**
- A configured instance of `nn.ConvTranspose2d` (one of `RkoConvTranspose2d`, `FastBlockConvTranspose2D`, or `BcopRkoConvTranspose2d`).
Key Features:
-------------
- Ensures orthogonality in transpose convolutions for stable gradient propagation.
- Supports dilation, grouped operations, and efficient kernel construction.

Behavior:
---------
- When kernel_size == stride, the layer is an `RkoConvTranspose2d`.
- When stride == 1, the layer is a `FastBlockConvTranspose2D`.
- Otherwise, the layer is a `BcopRkoConvTranspose2d`.

Arguments:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_size (_size_2_t): Size of the convolution kernel.
stride (_size_2_t, optional): Stride of the transpose convolution. Default is 1.
padding (_size_2_t, optional): Padding size. Default is 0.
output_padding (_size_2_t, optional): Additional size for output. Default is 0.
groups (int, optional): Number of groups. Default is 1.
bias (bool, optional): Whether to include a learnable bias. Default is True.
dilation (_size_2_t, optional): Dilation rate. Default is 1.
padding_mode (str, optional): Padding mode. Default is "zeros".
ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.

Returns:
A configured instance of `nn.ConvTranspose2d` (one of `RkoConvTranspose2d`, `FastBlockConvTranspose2D`, or `BcopRkoConvTranspose2d`).

**Raises:**
- `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
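As a reading aid for the two factories documented above, here is a minimal usage sketch. It assumes the module path shown in the diff header (`orthogonium/layers/conv/AOC/ortho_conv.py`) and only uses arguments listed in the docstrings; the norm check is approximate, since orthogonality holds up to the numerical accuracy of the underlying reparametrization.

```python
import torch
from orthogonium.layers.conv.AOC.ortho_conv import (
    AdaptiveOrthoConv2d,
    AdaptiveOrthoConvTranspose2d,
)

# stride == 1, so this dispatches to FastBlockConv2d (see "Behavior" above).
conv = AdaptiveOrthoConv2d(
    in_channels=16,
    out_channels=16,
    kernel_size=3,
    stride=1,
    padding="same",
    padding_mode="circular",
    bias=False,
)

x = torch.randn(2, 16, 32, 32)
y = conv(x)
# With circular padding, stride 1 and equal channel counts the layer acts as an
# orthogonal operator, so the input norm should be (approximately) preserved.
print(y.norm().item(), x.norm().item())

# kernel_size == stride, so this dispatches to RkoConvTranspose2d.
up = AdaptiveOrthoConvTranspose2d(16, 32, kernel_size=2, stride=2, bias=False)
print(up(x).shape)  # torch.Size([2, 32, 64, 64])
```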
26 changes: 7 additions & 19 deletions orthogonium/layers/conv/adaptiveSOC/fast_skew_ortho_conv.py
@@ -138,14 +138,17 @@ def attach_soc_weight(
weight_name (str): name of the weight
kernel_shape (tuple): shape of the kernel (out_channels, in_channels/groups, kernel_size, kernel_size)
groups (int): number of groups
bjorck_params (BjorckParams, optional): parameters of the Bjorck orthogonalization. Defaults to BjorckParams().
exp_params (ExpParams): parameters for the exponential algorithm.

Returns:
torch.Tensor: a handle to the attached weight
"""
out_channels, in_channels, kernel_size, k2 = kernel_shape
in_channels *= groups # compute the real number of input channels
assert kernel_size == k2, "only square kernels are supported for the moment"
assert (
kernel_size == k2
), "only square kernels are supported (to compute skew symmetric kernels)"
assert kernel_size % 2 == 1, "kernel size must be odd"
max_channels = max(in_channels, out_channels)
layer.register_parameter(
weight_name,
@@ -238,8 +241,6 @@ def __init__(
raise ValueError(
"kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
)
if (in_channels % groups != 0) and (out_channels % groups != 0):
raise ValueError("in_channels and out_channels must be divisible by groups")
self.padding = padding
self.stride = stride
self.kernel_size = kernel_size
@@ -252,11 +253,6 @@ def __init__(
groups,
exp_params=exp_params,
)
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
nn.init.zeros_(self.bias)
else:
self.register_parameter("bias", None)

def singular_values(self):
"""Compute the singular values of the convolutional layer using the FFT+SVD method.
@@ -341,8 +337,6 @@ def __init__(
raise ValueError(
"kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
)
if (in_channels % groups != 0) and (out_channels % groups != 0):
raise ValueError("in_channels and out_channels must be divisible by groups")
if ((self.max_channels // groups) < 2) and (kernel_size != stride):
raise ValueError("inner conv must have at least 2 channels")
if out_channels * (stride**2) < in_channels:
@@ -367,12 +361,6 @@ def __init__(
exp_params=exp_params,
)

if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
nn.init.zeros_(self.bias)
else:
self.register_parameter("bias", None)

def singular_values(self):
if self.padding_mode != "circular":
print(
@@ -387,8 +375,8 @@ def singular_values(self):
self.groups,
self.in_channels // self.groups,
self.out_channels // self.groups,
self.kernel_size,
self.kernel_size,
self.weight.shape[-2],
self.weight.shape[-1],
)
.numpy(),
self._input_shape,
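The `singular_values` methods above hand the kernel to an FFT-based routine when padding is circular (note the switch from `self.kernel_size` to `self.weight.shape[-2]`/`[-1]`, presumably because the exponential-based construction can store a kernel larger than the requested `kernel_size`). Below is a minimal NumPy sketch of the underlying idea, in the spirit of Sedghi et al.'s *The Singular Values of Convolutional Layers*: a stride-1 circular convolution is block-diagonalised by the 2D DFT, so its singular values are the union, over all frequencies, of the singular values of a small `c_out x c_in` matrix. This is an illustration only, not the library's exact helper, and it ignores striding and grouping.

```python
import numpy as np

def circular_conv_singular_values(kernel: np.ndarray, input_shape: tuple) -> np.ndarray:
    """All singular values of a stride-1 circular 2D convolution.

    kernel: (c_out, c_in, k, k) convolution kernel
    input_shape: (h, w) spatial size of the input feature map
    """
    h, w = input_shape
    # 2D FFT of the zero-padded kernel over the spatial dimensions.
    transforms = np.fft.fft2(kernel, s=(h, w), axes=(-2, -1))   # (c_out, c_in, h, w)
    # At each frequency (u, v) the convolution acts as the matrix transforms[:, :, u, v].
    transforms = np.transpose(transforms, (2, 3, 0, 1))         # (h, w, c_out, c_in)
    return np.linalg.svd(transforms, compute_uv=False)          # (h, w, min(c_out, c_in))

kernel = np.random.randn(4, 4, 3, 3)
svs = circular_conv_singular_values(kernel, (16, 16))
print(svs.min(), svs.max())  # for an orthogonal layer both should be close to 1
```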
2 changes: 1 addition & 1 deletion orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
@@ -40,7 +40,7 @@ def AdaptiveSOCConv2d(
)
if kernel_size == stride:
convclass = RKOConv2d
elif (stride == 1) or (in_channels >= out_channels):
elif stride == 1:
convclass = FastSOC
else:
convclass = SOCRkoConv2d
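The one-line change above drops the `in_channels >= out_channels` shortcut, so `AdaptiveSOCConv2d` now dispatches purely on kernel size and stride, mirroring the AOC factory. A plain restatement of the resulting rule, for illustration only and using the class names visible in the diff:

```python
def adaptive_soc_dispatch(kernel_size: int, stride: int) -> str:
    # Mirrors the selection logic shown in the diff above.
    if kernel_size < stride:
        raise ValueError("kernel size must be at least as large as the stride")
    if kernel_size == stride:
        return "RKOConv2d"
    if stride == 1:
        return "FastSOC"          # no longer depends on the in/out channel ordering
    return "SOCRkoConv2d"

print(adaptive_soc_dispatch(3, 1))  # FastSOC
print(adaptive_soc_dispatch(4, 2))  # SOCRkoConv2d
print(adaptive_soc_dispatch(2, 2))  # RKOConv2d
```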