diff --git a/README.md b/README.md
index 363ea59..04d6db5 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,9 @@
     <a href="#">
         <img src="https://img.shields.io/badge/License-MIT-efefef">
     </a>
+    <a href="https://thib-s.github.io/orthogonium/">
+        <img alt="Documentation" src="https://img.shields.io/badge/Docs-here-0000ff">
+    </a>
 </div>
 <br>
 
@@ -39,7 +42,7 @@ build orthogonal layers, with a focus on convolutional layers . We noticed that
 significant role in the final performance : a more efficient implementation 
 allows larger networks and more training steps within the same compute 
 budget. So our implementation differs from original papers in order to 
-be faster, to consume less memory or be more flexible.
+be faster, to consume less memory or be more flexible. Feel free to read the [documentation](https://thib-s.github.io/orthogonium/)!
 
 # 📃 What is included in this library ?
 
diff --git a/docs/api/activations.md b/docs/api/activations.md
new file mode 100644
index 0000000..e58bd7a
--- /dev/null
+++ b/docs/api/activations.md
@@ -0,0 +1,5 @@
+::: orthogonium.layers.custom_activations
+    rendering:
+        show_root_toc_entry: True
+    selection:
+        inherited_members: True
diff --git a/docs/api/losses.md b/docs/api/losses.md
new file mode 100644
index 0000000..1b714be
--- /dev/null
+++ b/docs/api/losses.md
@@ -0,0 +1,5 @@
+::: orthogonium.losses
+    rendering:
+        show_root_toc_entry: True
+    selection:
+        inherited_members: True
diff --git a/mkdocs.yml b/mkdocs.yml
index 4545550..6fdaf6e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -7,6 +7,8 @@ nav:
       - convolutions: api/conv.md
       - linear layers: api/linear.md
       - reparametrizers: api/reparametrizers.md
+      - activations: api/activations.md
+      - losses: api/losses.md
 #      - layers.conv.AOC module: api/aoc.md
 #      - layers.conv.adaptiveSOC module: api/adaptiveSOC.md
 #      - layers.conv.SLL module: api/sll.md
diff --git a/orthogonium/layers/conv/AOC/fast_block_ortho_conv.py b/orthogonium/layers/conv/AOC/fast_block_ortho_conv.py
index 277ba5b..10f3a83 100644
--- a/orthogonium/layers/conv/AOC/fast_block_ortho_conv.py
+++ b/orthogonium/layers/conv/AOC/fast_block_ortho_conv.py
@@ -156,7 +156,6 @@ def __init__(
         out_channels,
         kernel_size,
         groups,
-        contiguous_optimization=False,
     ):
         """This module is used to generate orthogonal kernels for the BCOP layer. It takes
         as input a matrix PQ of shape (groups, 2*kernel_size, c, c//2) and returns a kernel
@@ -167,9 +166,6 @@ def __init__(
             out_channels (int): number of output channels
             kernel_size (int): size of the kernel
             groups (int): number of groups
-            contiguous_optimization (bool, optional): if True, the kernel will have twice the
-                number of channels. This is used to increase expressiveness, but at the price
-                of orthogonality (not Lipschitzness). Defaults to False.
         """
         super(BCOPTrivializer, self).__init__()
         self.kernel_size = kernel_size
@@ -178,9 +174,6 @@ def __init__(
         self.in_channels = in_channels
         self.min_channels = min(in_channels, out_channels)
         self.max_channels = max(in_channels, out_channels)
-        if contiguous_optimization:
-            self.max_channels *= 2
-        self.contiguous_optimization = contiguous_optimization
         self.transpose = out_channels < in_channels
         self.num_kernels = 2 * kernel_size
 
@@ -249,12 +242,6 @@ def forward(self, PQ):
         res = c11
         for i in range(c22.shape[0]):  # c22.shape[0] == 1 if k-1 is a power of two
             res = fast_matrix_conv(res, c22[i], self.groups)
-        # if contiguous optimization is enabled, we constructed a conv with twice the number
-        # of channels, we need to remove the extra channels
-        if self.contiguous_optimization:
-            res = res[
-                : self.max_channels // 2, : self.min_channels // self.groups, :, :
-            ]
         # since it is less expensive to compute the transposed kernel when co < ci
         # we transpose the kernel if needed
         if self.transpose:
@@ -288,7 +275,6 @@ def attach_bcop_weight(
     num_kernels = (
         2 * kernel_size
     )  # the number of projectors needed to create the kernel
-    contiguous_optimization = ortho_params.contiguous_optimization
     # register projectors matrices
     layer.register_parameter(
         weight_name,
@@ -296,16 +282,8 @@ def attach_bcop_weight(
             torch.Tensor(
                 groups,
                 num_kernels,
-                (
-                    2 * max_channels // groups
-                    if contiguous_optimization
-                    else max_channels // groups
-                ),
-                (
-                    max_channels // groups
-                    if contiguous_optimization
-                    else max_channels // (groups * 2)
-                ),
+                (max_channels // groups),
+                (max_channels // (groups * 2)),
             ),
             requires_grad=True,
         ),
@@ -343,7 +321,6 @@ def attach_bcop_weight(
             out_channels,
             kernel_size,
             groups,
-            contiguous_optimization=contiguous_optimization,
         ),
         unsafe=True,
     )
diff --git a/orthogonium/layers/conv/AOC/ortho_conv.py b/orthogonium/layers/conv/AOC/ortho_conv.py
index 5b4eb21..a997039 100644
--- a/orthogonium/layers/conv/AOC/ortho_conv.py
+++ b/orthogonium/layers/conv/AOC/ortho_conv.py
@@ -27,32 +27,34 @@ def AdaptiveOrthoConv2d(
     """
     Factory function to create an orthogonal convolutional layer, selecting the appropriate class based on kernel size and stride.
 
-    **Key Features:**
-    - Enforces orthogonality, preserving gradient norms.
-    - Supports native striding, dilation, grouped convolutions, and flexible padding.
-
-    **Behavior:**
-    - When kernel_size == stride, the layer is an `RKOConv2d`.
-    - When stride == 1, the layer is a `FastBlockConv2d`.
-    - Otherwise, the layer is a `BcopRkoConv2d`.
-
-    **Arguments:**
-    - `in_channels` (int): Number of input channels.
-    - `out_channels` (int): Number of output channels.
-    - `kernel_size` (_size_2_t): Size of the convolution kernel.
-    - `stride` (_size_2_t, optional): Stride of the convolution. Default is 1.
-    - `padding` (str or _size_2_t, optional): Padding mode or size. Default is "same".
-    - `dilation` (_size_2_t, optional): Dilation rate. Default is 1.
-    - `groups` (int, optional): Number of blocked connections from input to output channels. Default is 1.
-    - `bias` (bool, optional): Whether to include a learnable bias. Default is True.
-    - `padding_mode` (str, optional): Padding mode. Default is "circular".
-    - `ortho_params` (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
-
-    **Returns:**
-    - A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).
-
-    **Raises:**
-    - `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
+    Key Features:
+    -------------
+        - Enforces orthogonality, preserving gradient norms.
+        - Supports native striding, dilation, grouped convolutions, and flexible padding.
+
+    Behavior:
+    ---------
+        - When kernel_size == stride, the layer is an `RKOConv2d`.
+        - When stride == 1, the layer is a `FastBlockConv2d`.
+        - Otherwise, the layer is a `BcopRkoConv2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the convolution. Default is 1.
+        padding (str or _size_2_t, optional): Padding mode or size. Default is "same".
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        groups (int, optional): Number of blocked connections from input to output channels. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        padding_mode (str, optional): Padding mode. Default is "circular".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.Conv2d` (one of `RKOConv2d`, `FastBlockConv2d`, or `BcopRkoConv2d`).
+
+    Raises:
+        `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
     """
 
     if kernel_size < stride:
@@ -95,30 +97,32 @@ def AdaptiveOrthoConvTranspose2d(
     """
     Factory function to create an orthogonal convolutional transpose layer, adapting based on kernel size and stride.
 
-    **Key Features:**
-    - Ensures orthogonality in transpose convolutions for stable gradient propagation.
-    - Supports dilation, grouped operations, and efficient kernel construction.
-
-    **Behavior:**
-    - When kernel_size == stride, the layer is an `RkoConvTranspose2d`.
-    - When stride == 1, the layer is a `FastBlockConvTranspose2D`.
-    - Otherwise, the layer is a `BcopRkoConvTranspose2d`.
-
-    **Arguments:**
-    - `in_channels` (int): Number of input channels.
-    - `out_channels` (int): Number of output channels.
-    - `kernel_size` (_size_2_t): Size of the convolution kernel.
-    - `stride` (_size_2_t, optional): Stride of the transpose convolution. Default is 1.
-    - `padding` (_size_2_t, optional): Padding size. Default is 0.
-    - `output_padding` (_size_2_t, optional): Additional size for output. Default is 0.
-    - `groups` (int, optional): Number of groups. Default is 1.
-    - `bias` (bool, optional): Whether to include a learnable bias. Default is True.
-    - `dilation` (_size_2_t, optional): Dilation rate. Default is 1.
-    - `padding_mode` (str, optional): Padding mode. Default is "zeros".
-    - `ortho_params` (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
-
-    **Returns:**
-    - A configured instance of `nn.ConvTranspose2d` (one of `RkoConvTranspose2d`, `FastBlockConvTranspose2D`, or `BcopRkoConvTranspose2d`).
+    Key Features:
+    -------------
+        - Ensures orthogonality in transpose convolutions for stable gradient propagation.
+        - Supports dilation, grouped operations, and efficient kernel construction.
+
+    Behavior:
+    ---------
+        - When kernel_size == stride, the layer is an `RkoConvTranspose2d`.
+        - When stride == 1, the layer is a `FastBlockConvTranspose2D`.
+        - Otherwise, the layer is a `BcopRkoConvTranspose2d`.
+
+    Arguments:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (_size_2_t): Size of the convolution kernel.
+        stride (_size_2_t, optional): Stride of the transpose convolution. Default is 1.
+        padding (_size_2_t, optional): Padding size. Default is 0.
+        output_padding (_size_2_t, optional): Additional size for output. Default is 0.
+        groups (int, optional): Number of groups. Default is 1.
+        bias (bool, optional): Whether to include a learnable bias. Default is True.
+        dilation (_size_2_t, optional): Dilation rate. Default is 1.
+        padding_mode (str, optional): Padding mode. Default is "zeros".
+        ortho_params (OrthoParams, optional): Parameters to control orthogonality. Default is `OrthoParams()`.
+
+    Returns:
+        A configured instance of `nn.ConvTranspose2d` (one of `RkoConvTranspose2d`, `FastBlockConvTranspose2D`, or `BcopRkoConvTranspose2d`).
 
     **Raises:**
     - `ValueError`: If kernel_size < stride, as orthogonality cannot be enforced.
diff --git a/orthogonium/layers/conv/adaptiveSOC/fast_skew_ortho_conv.py b/orthogonium/layers/conv/adaptiveSOC/fast_skew_ortho_conv.py
index 8ba1041..26685be 100644
--- a/orthogonium/layers/conv/adaptiveSOC/fast_skew_ortho_conv.py
+++ b/orthogonium/layers/conv/adaptiveSOC/fast_skew_ortho_conv.py
@@ -138,14 +138,17 @@ def attach_soc_weight(
         weight_name (str): name of the weight
         kernel_shape (tuple): shape of the kernel (out_channels, in_channels/groups, kernel_size, kernel_size)
         groups (int): number of groups
-        bjorck_params (BjorckParams, optional): parameters of the Bjorck orthogonalization. Defaults to BjorckParams().
+        exp_params (ExpParams): parameters for the exponential algorithm.
 
     Returns:
         torch.Tensor: a handle to the attached weight
     """
     out_channels, in_channels, kernel_size, k2 = kernel_shape
     in_channels *= groups  # compute the real number of input channels
-    assert kernel_size == k2, "only square kernels are supported for the moment"
+    assert (
+        kernel_size == k2
+    ), "only square kernels are supported (to compute skew symmetric kernels)"
+    assert kernel_size % 2 == 1, "kernel size must be odd"
     max_channels = max(in_channels, out_channels)
     layer.register_parameter(
         weight_name,
@@ -238,8 +241,6 @@ def __init__(
             raise ValueError(
                 "kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
             )
-        if (in_channels % groups != 0) and (out_channels % groups != 0):
-            )
         self.padding = padding
         self.stride = stride
         self.kernel_size = kernel_size
@@ -252,11 +253,6 @@ def __init__(
             groups,
             exp_params=exp_params,
         )
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-            nn.init.zeros_(self.bias)
-        else:
-            self.register_parameter("bias", None)
 
     def singular_values(self):
         """Compute the singular values of the convolutional layer using the FFT+SVD method.
@@ -341,8 +337,6 @@ def __init__(
             raise ValueError(
                 "kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
             )
-        if (in_channels % groups != 0) and (out_channels % groups != 0):
-            )
         if ((self.max_channels // groups) < 2) and (kernel_size != stride):
             raise ValueError("inner conv must have at least 2 channels")
         if out_channels * (stride**2) < in_channels:
@@ -367,12 +361,6 @@ def __init__(
             exp_params=exp_params,
         )
 
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-            nn.init.zeros_(self.bias)
-        else:
-            self.register_parameter("bias", None)
-
     def singular_values(self):
         if self.padding_mode != "circular":
             print(
@@ -387,8 +375,8 @@ def singular_values(self):
                 self.groups,
                 self.in_channels // self.groups,
                 self.out_channels // self.groups,
-                self.kernel_size,
-                self.kernel_size,
+                self.weight.shape[-2],
+                self.weight.shape[-1],
             )
             .numpy(),
             self._input_shape,
diff --git a/orthogonium/layers/conv/adaptiveSOC/ortho_conv.py b/orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
index 92e471a..d00eb10 100644
--- a/orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
+++ b/orthogonium/layers/conv/adaptiveSOC/ortho_conv.py
@@ -40,7 +40,7 @@ def AdaptiveSOCConv2d(
         )
     if kernel_size == stride:
         convclass = RKOConv2d
-    elif (stride == 1) or (in_channels >= out_channels):
+    elif stride == 1:
         convclass = FastSOC
     else:
         convclass = SOCRkoConv2d
diff --git a/orthogonium/layers/conv/adaptiveSOC/soc_x_rko_conv.py b/orthogonium/layers/conv/adaptiveSOC/soc_x_rko_conv.py
index d2cdb34..0ac5894 100644
--- a/orthogonium/layers/conv/adaptiveSOC/soc_x_rko_conv.py
+++ b/orthogonium/layers/conv/adaptiveSOC/soc_x_rko_conv.py
@@ -67,8 +67,6 @@ def __init__(
             raise ValueError(
                 "kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
             )
-        if (in_channels % groups != 0) and (out_channels % groups != 0):
-            )
         if ((self.max_channels // groups) < 2) and (kernel_size != stride):
             raise ValueError("inner conv must have at least 2 channels")
         self.padding = padding
@@ -77,14 +75,26 @@ def __init__(
         self.groups = groups
         self.intermediate_channels = max(in_channels, out_channels // stride**2)
         del self.weight
+        int_kernel_size = kernel_size - (stride - 1)
+        if int_kernel_size % 2 == 0:
+            if int_kernel_size <= 2:
+                int_kernel_size += 1
+            else:
+                int_kernel_size -= 1
+            # warn user that kernel size changed
+            warnings.warn(
+                f"kernel size changed from {kernel_size} to {int_kernel_size} "
+                f"as even kernel size is not supported for SOC.",
+                RuntimeWarning,
+            )
         attach_soc_weight(
             self,
             "weight_1",
             (
                 self.intermediate_channels,
                 in_channels // groups,
-                kernel_size - (stride - 1),
-                kernel_size - (stride - 1),
+                int_kernel_size,
+                int_kernel_size,
             ),
             groups,
             exp_params=exp_params,
@@ -98,12 +108,6 @@ def __init__(
             ortho_params=ortho_params,
         )
 
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-            nn.init.zeros_(self.bias)
-        else:
-            self.register_parameter("bias", None)
-
     @property
     def weight(self):
         if self.training:
@@ -160,7 +164,9 @@ def singular_values(self):
         )
         sv_min = sv_min * svs_2.min()
         sv_max = sv_max * svs_2.max()
-        stable_rank = 0.5 * stable_rank + 0.5 * ((np.mean(svs_2) ** 2) / (svs_2.max() ** 2))
+        stable_rank = 0.5 * stable_rank + 0.5 * (
+            (np.mean(svs_2) ** 2) / (svs_2.max() ** 2)
+        )
         return sv_min, sv_max, stable_rank
 
     def forward(self, X):
@@ -218,8 +224,6 @@ def __init__(
             raise ValueError(
                 "kernel size must be smaller than stride. The set of orthonal convolutions is empty in this setting."
             )
-        if (in_channels % groups != 0) and (out_channels % groups != 0):
-            )
         if ((self.max_channels // groups) < 2) and (kernel_size != stride):
             raise ValueError("inner conv must have at least 2 channels")
         self.padding = padding
@@ -239,14 +243,26 @@ def __init__(
             #     RuntimeWarning,
             # )
         del self.weight
+        int_kernel_size = kernel_size - (stride - 1)
+        if int_kernel_size % 2 == 0:
+            if int_kernel_size <= 2:
+                int_kernel_size += 1
+            else:
+                int_kernel_size -= 1
+            # warn user that kernel size changed
+            warnings.warn(
+                f"kernel size changed from {kernel_size} to {int_kernel_size} "
+                f"as even kernel size is not supported for SOC.",
+                RuntimeWarning,
+            )
         attach_soc_weight(
             self,
             "weight_1",
             (
                 self.intermediate_channels,
                 out_channels // groups,
-                kernel_size - (stride - 1),
-                kernel_size - (stride - 1),
+                int_kernel_size,
+                int_kernel_size,
             ),
             groups,
             exp_params=exp_params,
@@ -260,12 +276,6 @@ def __init__(
             ortho_params=ortho_params,
         )
 
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-            nn.init.zeros_(self.bias)
-        else:
-            self.register_parameter("bias", None)
-
     def singular_values(self):
         if self.padding_mode != "circular":
             print(
@@ -280,8 +290,8 @@ def singular_values(self):
                 self.groups,
                 self.intermediate_channels // self.groups,
                 self.out_channels // self.groups,
-                self.kernel_size,
-                self.kernel_size,
+                self.weight_1.shape[-2],
+                self.weight_1.shape[-1],
             )
             .numpy(),
             self._input_shape,
@@ -299,7 +309,9 @@ def singular_values(self):
         )
         sv_min = sv_min * svs_2.min()
         sv_max = sv_max * svs_2.max()
-        stable_rank = 0.5 * stable_rank + 0.5 * ((np.mean(svs_2) ** 2) / (svs_2.max() ** 2))
+        stable_rank = 0.5 * stable_rank + 0.5 * (
+            (np.mean(svs_2) ** 2) / (svs_2.max() ** 2)
+        )
         return sv_min, sv_max, stable_rank
 
     @property
diff --git a/orthogonium/layers/custom_activations.py b/orthogonium/layers/custom_activations.py
index 0204e1a..b2e19f8 100644
--- a/orthogonium/layers/custom_activations.py
+++ b/orthogonium/layers/custom_activations.py
@@ -10,6 +10,14 @@
 
 class Abs(nn.Module):
     def __init__(self):
+        """
+        Initializes an instance of the Abs class.
+
+        This method is automatically called when a new object of the Abs class
+        is instantiated. It calls the initializer of its superclass to ensure
+        proper initialization of inherited class functionality, setting up
+        the required base structures or attributes.
+        """
         super(Abs, self).__init__()
 
     def forward(self, z):
@@ -18,6 +26,15 @@ def forward(self, z):
 
 class MaxMin(nn.Module):
     def __init__(self, axis=1):
+        """
+        This class implements the MaxMin activation function, which is a
+        pairwise activation function that returns the maximum and minimum (ordered)
+        of each pair of elements in the input tensor.
+
+        Parameters
+            axis : int, default=1 the axis along which to apply the activation function.
+
+        """
         self.axis = axis
         super(MaxMin, self).__init__()
 
@@ -29,6 +46,22 @@ def forward(self, z):
 
 class HouseHolder(nn.Module):
     def __init__(self, channels, axis=1):
+        """
+        An activation that applies a parameterized transformation via the Householder
+        reflection technique. It is initialized with the number of input channels, which must
+        be even, and an axis that determines the dimension along which operations are applied.
+        This is a corrected version of the original implementation from Singla et al. (2019),
+        which features a 1/sqrt(2) scaling factor to be 1-Lipschitz.
+
+        Attributes:
+            theta (torch.nn.Parameter): Learnable parameter that determines the transformation
+                applied via Householder reflection.
+            axis (int): Dimension along which the operation is performed.
+
+        Args:
+            channels (int): Total number of input channels. Must be an even number.
+            axis (int): Dimension along which the transformation is applied. Default is 1.
+        """
         super(HouseHolder, self).__init__()
         assert (channels % 2) == 0
         eff_channels = channels // 2
@@ -54,6 +87,38 @@ def forward(self, z):
 
 class HouseHolder_Order_2(nn.Module):
     def __init__(self, channels, axis=1):
+        """
+        Represents a layer or module that performs operations using Householder
+        transformations of order 2, parameterized by angles corresponding to
+        each group of channels. This is a corrected version of the original
+        implementation from Singla et al. (2019), which features a 1/sqrt(2)
+        scaling factor to be 1-Lipschitz.
+
+        Attributes:
+            num_groups (int): The number of groups, which is half the number
+            of channels provided as input.
+
+            axis (int): The axis along which the computation is performed.
+
+            theta0 (torch.nn.Parameter): A tensor parameter of shape `(num_groups,)`
+            representing the first set of angles (in radians) used in the
+            parameterization.
+
+            theta1 (torch.nn.Parameter): A tensor parameter of shape `(num_groups,)`
+            representing the second set of angles (in radians) used in the
+            parameterization.
+
+            theta2 (torch.nn.Parameter): A tensor parameter of shape `(num_groups,)`
+            representing the third set of angles (in radians) used in the
+            parameterization.
+
+        Args:
+            channels (int): The total number of input channels. Must be an even
+            number, as it will be split into groups.
+
+            axis (int, optional): Specifies the axis for computations. Defaults
+            to 1.
+        """
         super(HouseHolder_Order_2, self).__init__()
         assert (channels % 2) == 0
         self.num_groups = channels // 2
diff --git a/orthogonium/layers/linear/ortho_linear.py b/orthogonium/layers/linear/ortho_linear.py
index 14c48d6..4ba192e 100644
--- a/orthogonium/layers/linear/ortho_linear.py
+++ b/orthogonium/layers/linear/ortho_linear.py
@@ -15,6 +15,36 @@ def __init__(
         bias: bool = True,
         ortho_params: OrthoParams = OrthoParams(),
     ):
+        """
+        Initializes an orthogonal linear layer with customizable orthogonalization parameters.
+
+        Attributes:
+            in_features : int
+                Number of input features.
+            out_features : int
+                Number of output features.
+            bias : bool
+                Whether to include a bias term in the layer. Default is True.
+            ortho_params : OrthoParams
+                Parameters for orthogonalization and spectral normalization. Default is the
+                default instance of OrthoParams.
+
+        Parameters:
+            in_features : int
+                The size of each input sample.
+            out_features : int
+                The size of each output sample.
+            bias : bool
+                Indicates if the layer should include a learnable bias parameter.
+            ortho_params : OrthoParams
+                An object containing orthogonalization and normalization configurations.
+
+        Notes
+        -----
+        The layer is initialized with orthogonal weights using `torch.nn.init.orthogonal_`.
+        Weight parameters are further parametrized for both spectral normalization and
+        orthogonal constraints using the provided `OrthoParams` object.
+        """
         super(OrthoLinear, self).__init__(in_features, out_features, bias=bias)
         torch.nn.init.orthogonal_(self.weight)
         parametrize.register_parametrization(
@@ -42,7 +72,25 @@ def __init__(
         *args,
         **kwargs,
     ):
-        """LInear layer where each output unit is normalized to have Frobenius norm 1"""
+        """
+        A custom PyTorch Linear layer that ensures weights are normalized to unit norm along a specified dimension.
+
+        This class extends the torch.nn.Linear module and modifies the weight
+        matrix to maintain orthogonal initialization and unit norm
+        normalization during training. In this specific case, each output can be viewed as the result of a 1-Lipschitz
+        function. This means that the whole function is more than 1-Lipschitz but that each output taken independently
+        is 1-Lipschitz.
+
+        Attributes:
+            weight: The learnable weight tensor with orthogonal initialization
+                and enforced unit norm parametrization.
+
+        Args:
+            *args: Variable length positional arguments passed to the base
+                Linear class.
+            **kwargs: Variable length keyword arguments passed to the base
+                Linear class.
+        """
         super(UnitNormLinear, self).__init__(*args, **kwargs)
         torch.nn.init.orthogonal_(self.weight)
         parametrize.register_parametrization(
diff --git a/orthogonium/losses.py b/orthogonium/losses.py
index d1ebce6..a9ff82b 100644
--- a/orthogonium/losses.py
+++ b/orthogonium/losses.py
@@ -8,6 +8,26 @@
 
 
 def check_last_linear_layer_type(model):
+    """
+    Determines the type of the last linear layer in a given model.
+
+    This function inspects the architecture of the model and identifies the last
+    linear layer of specific types (nn.Linear, OrthoLinear, UnitNormLinear). It
+    then returns a string indicating the type of the last linear layer based on
+    its class. This makes it possible to determine the parameter to use for
+    computing the VRA of a model's output.
+
+    Args:
+        model: The model containing layers to be inspected.
+
+    Returns:
+        str: A string indicating the type of the last linear layer.
+             The possible values are:
+                 - "global" if the layer is of type OrthoLinear.
+                 - "classwise" if the layer is of type UnitNormLinear.
+                 - "unknown" if the layer is of any other type or if no
+                   linear layer is found.
+    """
     # Find the last linear layer in the model
     last_linear_layer = None
     layers = list(model.children())
@@ -102,6 +122,24 @@ def VRA(
 
 class LossXent(nn.Module):
     def __init__(self, n_classes, offset=2.12132, temperature=0.25):
+        """
+        A custom loss function class for cross-entropy calculation.
+
+        This class initializes a cross-entropy loss criterion along with additional
+        parameters, such as an offset and a temperature factor, to allow a finer control over
+        the accuracy/robustness tradeoff during training.
+
+        Attributes:
+            criterion (nn.CrossEntropyLoss): The PyTorch cross-entropy loss criterion.
+            n_classes (int): The number of classes present in the dataset.
+            offset (float): An offset value for customizing the loss computation.
+            temperature (float): A temperature factor for scaling logits during loss calculation.
+
+        Parameters:
+            n_classes (int): The number of classes in the dataset.
+            offset (float, optional): The offset value for loss computation. Default is 2.12132.
+            temperature (float, optional): The temperature scaling factor. Default is 0.25.
+        """
         super(LossXent, self).__init__()
         self.criterion = nn.CrossEntropyLoss()
         self.n_classes = n_classes
@@ -118,6 +156,15 @@ def __call__(self, outputs, labels):
 
 class CosineLoss(nn.Module):
     def __init__(self):
+        """
+        A class that implements the Cosine Loss for measuring the cosine similarity
+        between predictions and targets. Designed for use in scenarios involving
+        angle-based loss calculations or similarity measurements.
+
+        Attributes:
+            None
+
+        """
         super(CosineLoss, self).__init__()
 
     def forward(self, yp, yt):
diff --git a/orthogonium/model_factory/models_factory.py b/orthogonium/model_factory/models_factory.py
index f03a36f..5fd991f 100644
--- a/orthogonium/model_factory/models_factory.py
+++ b/orthogonium/model_factory/models_factory.py
@@ -188,7 +188,6 @@ def forward(self, x):
 #             eps=1e-6,
 #             bjorck_iters=6,
 #             beta=0.5,
-#             contiguous_optimization=False,
 #         ),
 #     ),
 #     act=ClassParam(MaxMin),
diff --git a/orthogonium/reparametrizers.py b/orthogonium/reparametrizers.py
index 29278ff..5cd4228 100644
--- a/orthogonium/reparametrizers.py
+++ b/orthogonium/reparametrizers.py
@@ -308,7 +308,9 @@ def __init__(self, weight_shape, niters=7):
         exponential, it produces an orthogonal matrix. This approach is particularly useful
         in contexts where smooth transitions between matrices are required.
 
-        Non-square matrices
+        Non-square matrices are padded to the largest dimension to ensure that the matrix can
+        be converted to a skew-symmetric matrix. The resulting matrix is cropped to the original
+        dimension.
 
         Args:
             weight_shape (tuple): The shape of the weight matrix.
@@ -411,8 +413,6 @@ class OrthoParams:
             configured to use BatchedBjorckOrthogonalization with specific
             parameters. This callable can be provided either as a `functool.partial` or as a
             `orthogonium.ClassParam`. It will recieve the shape of the weight tensor as its argument.
-        contiguous_optimization (bool): Determines whether to perform
-            optimization ensuring contiguous operations. Default is False.
     """
 
     # spectral_normalizer: Callable[Tuple[int, ...], nn.Module] = BatchedIdentity
@@ -428,7 +428,6 @@ class OrthoParams:
         # BatchedCholeskyOrthogonalization,
         # BatchedQROrthogonalization,
     )
-    contiguous_optimization: bool = False
 
 
 DEFAULT_ORTHO_PARAMS = OrthoParams()
@@ -437,33 +436,27 @@ class OrthoParams:
     orthogonalizer=ClassParam(
         BatchedBjorckOrthogonalization, beta=0.5, niters=12, pass_through=True
     ),
-    contiguous_optimization=False,
 )
 DEFAULT_TEST_ORTHO_PARAMS = OrthoParams(
     spectral_normalizer=ClassParam(BatchedPowerIteration, power_it_niter=4, eps=1e-4),  # type: ignore
     orthogonalizer=ClassParam(BatchedBjorckOrthogonalization, beta=0.5, niters=25),
     # orthogonalizer=ClassParam(BatchedQROrthogonalization),
     # orthogonalizer=ClassParam(BatchedExponentialOrthogonalization, niters=12),  # type: ignore
-    contiguous_optimization=False,
 )
 EXP_ORTHO_PARAMS = OrthoParams(
     spectral_normalizer=ClassParam(BatchedPowerIteration, power_it_niter=3, eps=1e-6),  # type: ignore
     orthogonalizer=ClassParam(BatchedExponentialOrthogonalization, niters=12),  # type: ignore
-    contiguous_optimization=False,
 )
 QR_ORTHO_PARAMS = OrthoParams(
     spectral_normalizer=ClassParam(BatchedPowerIteration, power_it_niter=3, eps=1e-3),  # type: ignore
     orthogonalizer=ClassParam(BatchedQROrthogonalization),  # type: ignore
-    contiguous_optimization=False,
 )
 CHOLESKY_ORTHO_PARAMS = OrthoParams(
     spectral_normalizer=BatchedIdentity,  # type: ignore
     orthogonalizer=ClassParam(BatchedCholeskyOrthogonalization),  # type: ignore
-    contiguous_optimization=False,
 )
 
 CHOLESKY_STABLE_ORTHO_PARAMS = OrthoParams(
     spectral_normalizer=BatchedIdentity,
     orthogonalizer=ClassParam(BatchedCholeskyOrthogonalization, stable=True),
-    contiguous_optimization=False,
 )
diff --git a/tests/test_orthogonality_conv_soc.py b/tests/test_orthogonality_conv_soc.py
new file mode 100644
index 0000000..d0d553e
--- /dev/null
+++ b/tests/test_orthogonality_conv_soc.py
@@ -0,0 +1,605 @@
+import numpy as np
+import pytest
+import torch
+from orthogonium.layers.conv.adaptiveSOC import (
+    AdaptiveSOCConv2d,
+    AdaptiveSOCConvTranspose2d,
+)
+from orthogonium.layers.conv.adaptiveSOC.soc_x_rko_conv import SOCRkoConv2d
+from orthogonium.layers.conv.adaptiveSOC.fast_skew_ortho_conv import FastSOC
+
+
+device = "cpu"  #  torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def _compute_sv_impulse_response_layer(layer, img_shape):
+    """Return (sigma_min, sigma_max, mean/max) of the layer's impulse-response matrix.
+
+    One unit impulse per input coordinate (shape img_shape[0:3]) is pushed through
+    `layer`; singular values are taken on the flattened outputs. If the SVD fails,
+    the fallback `(2-norm of outputs, 0, 0)` is returned instead.
+    """
+    with torch.no_grad():
+        layer = layer.to(device)
+        n_inputs = img_shape[0] * img_shape[1] * img_shape[2]
+        inputs = torch.eye(n_inputs).view(n_inputs, *img_shape[:3]).to(device)
+        outputs = layer(inputs)
+        try:
+            svs = torch.linalg.svdvals(outputs.view(outputs.shape[0], -1))
+            svs = svs.cpu()
+            return svs.min(), svs.max(), svs.mean() / svs.max()
+        except (RuntimeError, np.linalg.LinAlgError):
+            # torch.linalg raises its own LinAlgError (a RuntimeError subclass), which
+            # the numpy exception class alone would never catch.
+            print("SVD failed returning only largest singular value")
+            return torch.norm(outputs.view(outputs.shape[0], -1), p=2).max(), 0, 0
+
+
+def check_orthogonal_layer(
+    orthoconv,
+    groups,
+    input_channels,
+    kernel_size,
+    output_channels,
+    expected_kernel_shape,
+    tol=5e-4,
+    sigma_min_requirement=0.95,
+):
+    imsize = 8
+    # Test backpropagation and weight update (25 SGD steps on random 8x8 inputs)
+    try:
+        orthoconv = orthoconv.to(device)
+        orthoconv.train()
+        opt = torch.optim.SGD(orthoconv.parameters(), lr=0.001)
+        for i in range(25):
+            opt.zero_grad()
+            inp = torch.randn(1, input_channels, imsize, imsize).to(device)
+            output = orthoconv(inp)
+            loss = -output.mean()
+            loss.backward()
+            opt.step()
+        orthoconv.eval()  # so impulse response test checks the eval mode
+    except Exception as e:
+        pytest.fail(f"Backpropagation or weight update failed with: {e}")
+    # # check that orthoconv.weight has the correct shape
+    # if orthoconv.weight.data.shape != expected_kernel_shape:
+    #     pytest.fail(
+    #         f"BCOP weight has incorrect shape: {orthoconv.weight.shape} vs {(output_channels, input_channels // groups, kernel_size, kernel_size)}"
+    #     )
+    # Test singular_values function
+    try:
+        sigma_min, sigma_max, stable_rank = orthoconv.singular_values()  # values reported by the layer itself
+    except np.linalg.LinAlgError as e:
+        pytest.skip(f"SVD failed with: {e}")
+    sigma_min_ir, sigma_max_ir, stable_rank_ir = _compute_sv_impulse_response_layer(
+        orthoconv, (input_channels, imsize, imsize)
+    )
+    print(f"input_shape = {inp.shape}, output_shape = {output.shape}")
+    print(
+        f"({input_channels}->{output_channels}, g{groups}, k{kernel_size}), "
+        f"sigma_max:"
+        f" {sigma_max:.3f}/{sigma_max_ir:.3f}, "
+        f"sigma_min:"
+        f" {sigma_min:.3f}/{sigma_min_ir:.3f}, "
+        f"stable_rank: {stable_rank:.3f}/{stable_rank_ir:.3f}"
+    )
+    # check that the largest impulse-response singular value does not exceed 1 + tol
+    assert sigma_max_ir < (1 + tol), "sigma_max is not less than 1"
+    # assert (sigma_min_ir < (1 + tol)) and (
+    #     sigma_min_ir > sigma_min_requirement
+    # ), "sigma_min is not close to 1"
+    # assert abs(stable_rank_ir - 1) < tol, "stable_rank is not close to 1"
+    # check that the singular values are close to the impulse response values
+    # assert (
+    #     sigma_max > sigma_max_ir - 1e-2
+    # ), f"sigma_max must be greater to its IR value (1%): {sigma_max} vs {sigma_max_ir}"
+    assert (
+        abs(sigma_max - sigma_max_ir) < tol
+    ), f"sigma_max is not close to its IR value: {sigma_max} vs {sigma_max_ir}"
+    # assert (
+    #     abs(sigma_min - sigma_min_ir) < tol
+    # ), f"sigma_min is not close to its IR value: {sigma_min} vs {sigma_min_ir}"
+    # assert (
+    #     abs(stable_rank - stable_rank_ir) < tol
+    # ), f"stable_rank is not close to its IR value: {stable_rank} vs {stable_rank_ir}"
+
+
+@pytest.mark.parametrize("kernel_size", [1, 3])
+@pytest.mark.parametrize("input_channels", [8, 16])
+@pytest.mark.parametrize("output_channels", [8, 16])
+@pytest.mark.parametrize("stride", [1])
+@pytest.mark.parametrize("groups", [1, 2, 4])
+def test_standard_configs(kernel_size, input_channels, output_channels, stride, groups):
+    """
+    Test AdaptiveSOCConv2d over combinations of kernel size, channels, stride and groups.
+    """
+    # Test instantiation
+    try:
+        orthoconv = AdaptiveSOCConv2d(
+            kernel_size=kernel_size,
+            in_channels=input_channels,
+            out_channels=output_channels,
+            stride=stride,
+            groups=groups,
+            bias=False,
+            padding=(kernel_size // 2, kernel_size // 2),
+            padding_mode="circular",
+        )
+    except Exception as e:
+        if kernel_size < stride:
+            # we expect this configuration to raise a RuntimeError
+            # pytest.skip(f"BCOP instantiation failed with: {e}")
+            return
+        else:
+            pytest.fail(f"BCOP instantiation failed with: {e}")
+    check_orthogonal_layer(
+        orthoconv,
+        groups,
+        input_channels,
+        kernel_size,
+        output_channels,
+        (
+            output_channels,
+            input_channels // groups,
+            kernel_size,
+            kernel_size,
+        ),
+        tol=5e-2,
+        sigma_min_requirement=0.0,
+    )
+
+
+#
+# @pytest.mark.parametrize("kernel_size", [3])
+# @pytest.mark.parametrize("input_channels", [8, 16])
+# @pytest.mark.parametrize(
+#     "output_channels", [8, 16]
+# )  # dilated convolutions are not supported for output_channels < input_channels
+# @pytest.mark.parametrize("stride", [1])
+# @pytest.mark.parametrize("groups", [1, 2, 4])
+# def test_dilation(kernel_size, input_channels, output_channels, stride, groups):
+#     """
+#     test combinations of kernel size, input channels, output channels, stride and groups
+#     """
+#     # Test instantiation
+#     try:
+#         orthoconv = AdaptiveSOCConv2d(
+#             kernel_size=kernel_size,
+#             in_channels=input_channels,
+#             out_channels=output_channels,
+#             stride=stride,
+#             dilation=2,
+#             groups=groups,
+#             bias=False,
+#             padding="same",
+#             padding_mode="circular",
+#         )
+#     except Exception as e:
+#         if kernel_size < stride:
+#             # we expect this configuration to raise a RuntimeError
+#             # pytest.skip(f"BCOP instantiation failed with: {e}")
+#             return
+#         else:
+#             pytest.fail(f"BCOP instantiation failed with: {e}")
+#     check_orthogonal_layer(
+#         orthoconv,
+#         groups,
+#         input_channels,
+#         kernel_size,
+#         output_channels,
+#         (
+#             output_channels,
+#             input_channels // groups,
+#             kernel_size,
+#             kernel_size,
+#         ),
+#     )
+#
+#
+# @pytest.mark.parametrize("kernel_size", [2, 4])
+# @pytest.mark.parametrize("input_channels", [8, 16])
+# @pytest.mark.parametrize(
+#     "output_channels", [8, 16]
+# )  # dilated+strided convolutions are not supported for output_channels < input_channels
+# @pytest.mark.parametrize("stride", [2])
+# @pytest.mark.parametrize("dilation", [2, 3])
+# @pytest.mark.parametrize("groups", [1, 2, 4])
+# def test_dilation_strided(
+#     kernel_size, input_channels, output_channels, stride, dilation, groups
+# ):
+#     """
+#     test combinations of kernel size, input channels, output channels, stride and groups
+#     """
+#     # Test instantiation
+#     try:
+#         orthoconv = AdaptiveSOCConv2d(
+#             kernel_size=kernel_size,
+#             in_channels=input_channels,
+#             out_channels=output_channels,
+#             stride=stride,
+#             dilation=dilation,
+#             groups=groups,
+#             bias=False,
+#             padding=(
+#                 int(np.ceil((dilation * (kernel_size - 1) + 1 - stride) / 2)),
+#                 int(np.ceil((dilation * (kernel_size - 1) + 1 - stride) / 2)),
+#             ),
+#             padding_mode="circular",
+#         )
+#     except Exception as e:
+#         if (output_channels >= input_channels) and (
+#             ((dilation % stride) == 0) and (stride > 1)
+#         ):
+#             # we expect this configuration to raise a ValueError
+#             # pytest.skip(f"BCOP instantiation failed with: {e}")
+#             return
+#         if (kernel_size == stride) and (((dilation % stride) == 0) and (stride > 1)):
+#             return
+#         else:
+#             pytest.fail(f"BCOP instantiation failed with: {e}")
+#     check_orthogonal_layer(
+#         orthoconv,
+#         groups,
+#         input_channels,
+#         kernel_size,
+#         output_channels,
+#         (
+#             output_channels,
+#             input_channels // groups,
+#             kernel_size,
+#             kernel_size,
+#         ),
+#     )
+
+
+@pytest.mark.parametrize("kernel_size", [4])
+@pytest.mark.parametrize("input_channels", [2, 4, 16])
+@pytest.mark.parametrize("output_channels", [2, 4, 16])
+@pytest.mark.parametrize("stride", [2])
+@pytest.mark.parametrize("groups", [1])
+def test_strided(kernel_size, input_channels, output_channels, stride, groups):
+    """
+    More extensive testing with striding enabled.
+    A larger range of cin and cout is used to track errors when cin < cout / stride**2
+    (i.e. spatial dimensions are reduced but channel dimensions increase, so that
+    the overall dimension actually increases).
+    """
+    # Test instantiation
+    try:
+        orthoconv = AdaptiveSOCConv2d(
+            kernel_size=kernel_size,
+            in_channels=input_channels,
+            out_channels=output_channels,
+            stride=stride,
+            groups=groups,
+            bias=False,
+            padding=((kernel_size - 1) // 2, (kernel_size - 1) // 2),
+            padding_mode="circular",
+        )
+    except Exception as e:
+        if kernel_size < stride:
+            # we expect this configuration to raise a RuntimeError
+            # pytest.skip(f"BCOP instantiation failed with: {e}")
+            return
+        else:
+            pytest.fail(f"BCOP instantiation failed with: {e}")
+    check_orthogonal_layer(
+        orthoconv,
+        groups,
+        input_channels,
+        kernel_size,
+        output_channels,
+        (
+            output_channels,
+            input_channels // groups,
+            kernel_size,
+            kernel_size,
+        ),
+        tol=5e-2,
+        sigma_min_requirement=0.0,
+    )
+
+
+# @pytest.mark.parametrize("kernel_size", [2, 4])
+# @pytest.mark.parametrize("input_channels", [8, 16])
+# @pytest.mark.parametrize("output_channels", [8, 16])
+# @pytest.mark.parametrize("stride", [1])
+# @pytest.mark.parametrize("groups", [1, 2, 4])
+# def test_even_kernels(kernel_size, input_channels, output_channels, stride, groups):
+#     """
+#     test specific to even kernel size
+#     """
+#     # Test instantiation
+#     try:
+#         orthoconv = AdaptiveSOCConv2d(
+#             kernel_size=kernel_size,
+#             in_channels=input_channels,
+#             out_channels=output_channels,
+#             stride=stride,
+#             groups=groups,
+#             bias=False,
+#             padding="same",
+#             padding_mode="circular",
+#         )
+#     except Exception as e:
+#         if kernel_size < stride:
+#             # we expect this configuration to raise a RuntimeError
+#             # pytest.skip(f"BCOP instantiation failed with: {e}")
+#             return
+#         else:
+#             pytest.fail(f"BCOP instantiation failed with: {e}")
+#     check_orthogonal_layer(
+#         orthoconv,
+#         groups,
+#         input_channels,
+#         kernel_size,
+#         output_channels,
+#         (
+#             output_channels,
+#             input_channels // groups,
+#             kernel_size,
+#             kernel_size,
+#         ),
+#     )
+
+
+# @pytest.mark.parametrize("kernel_size", [1, 2])
+# @pytest.mark.parametrize("input_channels", [4, 8, 32])
+# @pytest.mark.parametrize("output_channels", [4, 8, 32])
+# @pytest.mark.parametrize("groups", [1, 2])
+# def test_rko(kernel_size, input_channels, output_channels, groups):
+#     """
+#     test case where stride == kernel size
+#     """
+#     # Test instantiation
+#     try:
+#         rkoconv = AdaptiveSOCConv2d(
+#             kernel_size=kernel_size,
+#             in_channels=input_channels,
+#             out_channels=output_channels,
+#             stride=kernel_size,
+#             groups=groups,
+#             bias=False,
+#             padding=(0, 0),
+#             padding_mode="zeros",
+#         )
+#     except Exception as e:
+#         pytest.fail(f"BCOP instantiation failed with: {e}")
+#     check_orthogonal_layer(
+#         rkoconv,
+#         groups,
+#         input_channels,
+#         kernel_size,
+#         output_channels,
+#         (
+#             output_channels,
+#             input_channels // groups,
+#             kernel_size,
+#             kernel_size,
+#         ),
+#     )
+
+
+@pytest.mark.parametrize("kernel_size", [1, 3])
+@pytest.mark.parametrize("input_channels", [1, 2])
+@pytest.mark.parametrize("output_channels", [1, 2])
+@pytest.mark.parametrize("stride", [1])
+@pytest.mark.parametrize("groups", [1])
+def test_depthwise(kernel_size, input_channels, output_channels, stride, groups):
+    """
+    Test AdaptiveSOCConv2d with very small channel counts (1 or 2), stride 1, groups 1.
+    """
+    # Test instantiation (NOTE(review): groups is always 1 here despite the test name)
+    try:
+        orthoconv = AdaptiveSOCConv2d(
+            kernel_size=kernel_size,
+            in_channels=input_channels,
+            out_channels=output_channels,
+            stride=stride,
+            groups=groups,
+            bias=False,
+            padding=(kernel_size // 2, kernel_size // 2),
+            padding_mode="circular",
+        )
+    except Exception as e:
+        if kernel_size < stride:
+            # we expect this configuration to raise a RuntimeError
+            # pytest.skip(f"BCOP instantiation failed with: {e}")
+            return
+        else:
+            pytest.fail(f"BCOP instantiation failed with: {e}")
+    check_orthogonal_layer(
+        orthoconv,
+        groups,
+        input_channels,
+        kernel_size,
+        output_channels,
+        (
+            output_channels,
+            input_channels // groups,
+            kernel_size,
+            kernel_size,
+        ),
+        tol=5e-2,
+        sigma_min_requirement=0.0,
+    )
+
+
+# def test_invalid_kernel_smaller_than_stride():
+#     """
+#     A test to ensure that kernel_size < stride raises an expected ValueError
+#     """
+#     with pytest.raises(ValueError, match=r"kernel size must be smaller than stride"):
+#         AdaptiveSOCConv2d(
+#             in_channels=8,
+#             out_channels=4,
+#             kernel_size=2,
+#             stride=3,  # Invalid: kernel_size < stride
+#             groups=1,
+#             padding=0,
+#         )
+#     with pytest.raises(ValueError, match=r"kernel size must be smaller than stride"):
+#         SOCRkoConv2d(
+#             in_channels=8,
+#             out_channels=4,
+#             kernel_size=2,
+#             stride=3,  # Invalid: kernel_size < stride
+#             groups=1,
+#             padding=0,
+#         )
+#     with pytest.raises(ValueError, match=r"kernel size must be smaller than stride"):
+#         FastSOC(
+#             in_channels=8,
+#             out_channels=4,
+#             kernel_size=2,
+#             stride=3,  # Invalid: kernel_size < stride
+#             groups=1,
+#             padding=0,
+#         )
+#
+#
+# def test_invalid_dilation_with_stride():
+#     """
+#     A test to ensure dilation > 1 while stride > 1 raises an expected ValueError
+#     """
+#     with pytest.raises(
+#         ValueError,
+#         match=r"dilation must be 1 when stride is not 1",
+#     ):
+#         AdaptiveSOCConv2d(
+#             in_channels=8,
+#             out_channels=16,
+#             kernel_size=3,
+#             stride=2,
+#             dilation=2,  # Invalid: dilation > 1 while stride > 1
+#             groups=1,
+#             padding=0,
+#         )
+#     with pytest.raises(
+#         ValueError,
+#         match=r"dilation must be 1 when stride is not 1",
+#     ):
+#         SOCRkoConv2d(
+#             in_channels=8,
+#             out_channels=16,
+#             kernel_size=3,
+#             stride=2,
+#             dilation=2,  # Invalid: dilation > 1 while stride > 1
+#             groups=1,
+#             padding=0,
+#         )
+#     with pytest.raises(
+#         ValueError,
+#         match=r"dilation must be 1 when stride is not 1",
+#     ):
+#         FastSOC(
+#             in_channels=8,
+#             out_channels=16,
+#             kernel_size=3,
+#             stride=2,
+#             dilation=2,  # Invalid: dilation > 1 while stride > 1
+#             groups=1,
+#             padding=0,
+#         )
+
+
+@pytest.mark.parametrize("kernel_size", [1, 3])
+@pytest.mark.parametrize("input_channels", [4, 8])
+@pytest.mark.parametrize("output_channels", [4, 8])
+@pytest.mark.parametrize("stride", [1])
+@pytest.mark.parametrize("groups", [1, 2])
+def test_convtranspose(kernel_size, input_channels, output_channels, stride, groups):
+    # Test instantiation (stride-1 transposed convolution, zero padding)
+    padding = (0, 0)
+    padding_mode = "zeros"
+    try:
+
+        orthoconvtranspose = AdaptiveSOCConvTranspose2d(
+            kernel_size=kernel_size,
+            in_channels=input_channels,
+            out_channels=output_channels,
+            stride=stride,
+            groups=groups,
+            bias=False,
+            padding=padding,
+            padding_mode=padding_mode,
+        )
+    except Exception as e:
+        if kernel_size < stride:
+            # we expect this configuration to raise a RuntimeError
+            # pytest.skip(f"BCOP instantiation failed with: {e}")
+            return
+        else:
+            pytest.fail(f"BCOP instantiation failed with: {e}")
+    if (
+        kernel_size > 1
+        and kernel_size != stride
+        and output_channels * (stride**2) < input_channels
+    ):
+        pytest.skip("this case is not handled yet")
+    check_orthogonal_layer(
+        orthoconvtranspose,
+        groups,
+        input_channels,
+        kernel_size,
+        output_channels,
+        (
+            input_channels,
+            output_channels // groups,
+            kernel_size,
+            kernel_size,
+        ),
+        tol=5e-2,
+        sigma_min_requirement=0.0,
+    )
+@pytest.mark.parametrize("kernel_size", [2, 4])
+@pytest.mark.parametrize("input_channels", [4, 8])
+@pytest.mark.parametrize("output_channels", [4, 8])
+@pytest.mark.parametrize("stride", [2])
+@pytest.mark.parametrize("groups", [1, 2])
+def test_convtranspose_strided(
+    kernel_size, input_channels, output_channels, stride, groups
+):
+    """Check AdaptiveSOCConvTranspose2d with stride 2 (kernel sizes 2 and 4)."""
+    # Named distinctly from the stride-1 test_convtranspose: a second function with
+    # the same name would replace it at module level, so pytest would never run it.
+    try:
+        orthoconvtranspose = AdaptiveSOCConvTranspose2d(
+            kernel_size=kernel_size,
+            in_channels=input_channels,
+            out_channels=output_channels,
+            stride=stride,
+            groups=groups,
+            bias=False,
+            padding=(0, 0),
+            padding_mode="zeros",
+        )
+    except Exception as e:
+        if kernel_size < stride:
+            # we expect this configuration to raise a RuntimeError
+            return
+        else:
+            pytest.fail(f"BCOP instantiation failed with: {e}")
+    if (
+        kernel_size > 1
+        and kernel_size != stride
+        and output_channels * (stride**2) < input_channels
+    ):
+        pytest.skip("this case is not handled yet")
+    check_orthogonal_layer(
+        orthoconvtranspose,
+        groups,
+        input_channels,
+        kernel_size,
+        output_channels,
+        (
+            input_channels,
+            output_channels // groups,
+            kernel_size,
+            kernel_size,
+        ),
+        tol=5e-2,
+        sigma_min_requirement=0.0,
+    )