open-edge-platform · sapiovesanunivision · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025
@@ -203,8 +203,9 @@ def get_encoder_decoder_outputs(self, x: torch.Tensor) -> tuple[list[torch.Tenso
                 - en: List of fused encoder features reshaped to spatial dimensions
                 - de: List of fused decoder features reshaped to spatial dimensions
         """
+        h_patches = x.shape[2] // self.encoder.patch_size
+        w_patches = x.shape[3] // self.encoder.patch_size
         x = self.encoder.prepare_tokens(x)
-
         encoder_features = []
         decoder_features = []
 
@@ -237,8 +238,8 @@ def get_encoder_decoder_outputs(self, x: torch.Tensor) -> tuple[list[torch.Tenso
         de = [self._fuse_feature([decoder_features[idx] for idx in idxs]) for idxs in self.fuse_layer_decoder]
 
         # Process features for spatial output
-        en = self._process_features_for_spatial_output(en, side)
-        de = self._process_features_for_spatial_output(de, side)
+        en = self._process_features_for_spatial_output(en, h_patches, w_patches)
+        de = self._process_features_for_spatial_output(de, h_patches, w_patches)
         return en, de
 
     def forward(self, batch: torch.Tensor, global_step: int | None = None) -> torch.Tensor | InferenceBatch:
@@ -262,7 +263,7 @@ def forward(self, batch: torch.Tensor, global_step: int | None = None) -> torch.
 
         """
         en, de = self.get_encoder_decoder_outputs(batch)
-        image_size = batch.shape[2]
+        image_size = (batch.shape[2], batch.shape[3])
 
         if self.training:
             if global_step is None:
@@ -376,7 +377,7 @@ def _get_architecture_config(encoder_name: str, target_layers: list[int] | None)
     def _process_features_for_spatial_output(
         self,
         features: list[torch.Tensor],
-        side: int,
+        h_patches: int, w_patches: int
     ) -> list[torch.Tensor]:
         """Process features for spatial output by removing tokens and reshaping.
 
@@ -393,7 +394,7 @@ def _process_features_for_spatial_output(
 
         # Reshape to spatial dimensions
         batch_size = features[0].shape[0]
-        return [f.permute(0, 2, 1).reshape([batch_size, -1, side, side]).contiguous() for f in features]
+        return [f.permute(0, 2, 1).reshape([batch_size, -1, h_patches, w_patches]).contiguous() for f in features]
 
 
 class DecoderViTBlock(nn.Module):