Switch to ReLU6, add [-1, 1] input rescaling, re-init the non-pretrained backbone & default to no backbone pretraining.

datumbox · datumbox · commit 8b9ca5330cc8 · 2021-05-04T14:17:03.000+01:00
diff --git a/torchvision/models/detection/ssdlite.py b/torchvision/models/detection/ssdlite.py
@@ -26,15 +26,15 @@ def _prediction_block(in_channels: int, out_channels: int, kernel_size: int,
     return nn.Sequential(
         # 3x3 depthwise with stride 1 and padding 1
         ConvBNActivation(in_channels, in_channels, kernel_size=kernel_size, groups=in_channels,
-                         norm_layer=norm_layer, activation_layer=nn.ReLU),
+                         norm_layer=norm_layer, activation_layer=nn.ReLU6),
 
         # 1x1 projetion to output channels
         nn.Conv2d(in_channels, out_channels, 1)
     )
 
 
 def _extra_block(in_channels: int, out_channels: int, norm_layer: Callable[..., nn.Module]) -> nn.Sequential:
-    activation = nn.ReLU
+    activation = nn.ReLU6
     intermediate_channels = out_channels // 2
     return nn.Sequential(
         # 1x1 projection to half output channels
@@ -93,7 +93,8 @@ def __init__(self, in_channels: List[int], num_anchors: List[int], norm_layer: C
 
 
 class SSDLiteFeatureExtractorMobileNet(nn.Module):
-    def __init__(self, backbone: nn.Module, c4_pos: int, norm_layer: Callable[..., nn.Module], **kwargs: Any):
+    def __init__(self, backbone: nn.Module, c4_pos: int, norm_layer: Callable[..., nn.Module], rescaling: bool,
+                 **kwargs: Any):
         super().__init__()
         # non-public config parameters
         min_depth = kwargs.pop('_min_depth', 16)
@@ -115,8 +116,13 @@ def __init__(self, backbone: nn.Module, c4_pos: int, norm_layer: Callable[..., n
         _normal_init(extra)
 
         self.extra = extra
+        self.rescaling = rescaling
 
     def forward(self, x: Tensor) -> Dict[str, Tensor]:
+        # Rescale from [0, 1] to [-1, 1]
+        if self.rescaling:
+            x = 2.0 * x - 1.0
+
         # Get feature maps from backbone and extra. Can't be refactored due to JIT limitations.
         output = []
         for block in self.features:
@@ -131,9 +137,12 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]:
 
 
 def _mobilenet_extractor(backbone_name: str, progress: bool, pretrained: bool, trainable_layers: int,
-                         norm_layer: Callable[..., nn.Module], **kwargs: Any):
+                         norm_layer: Callable[..., nn.Module], rescaling: bool, **kwargs: Any):
     backbone = mobilenet.__dict__[backbone_name](pretrained=pretrained, progress=progress,
                                                  norm_layer=norm_layer, **kwargs).features
+    if not pretrained:
+        # Change the default initialization scheme if not pretrained
+        _normal_init(backbone)
 
     # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
     # The first and last blocks are always included because they are the C0 (conv1) and Cn.
@@ -148,11 +157,11 @@ def _mobilenet_extractor(backbone_name: str, progress: bool, pretrained: bool, t
         for parameter in b.parameters():
             parameter.requires_grad_(False)
 
-    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer, **kwargs)
+    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer, rescaling, **kwargs)
 
 
 def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
-                                  pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None,
+                                  pretrained_backbone: bool = False, trainable_backbone_layers: Optional[int] = None,
                                   norm_layer: Optional[Callable[..., nn.Module]] = None,
                                   **kwargs: Any):
     trainable_backbone_layers = _validate_trainable_layers(
@@ -161,11 +170,13 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
     if pretrained:
         pretrained_backbone = False
 
+    rescaling = not pretrained_backbone
+
     if norm_layer is None:
         norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
 
     backbone = _mobilenet_extractor("mobilenet_v3_large", progress, pretrained_backbone, trainable_backbone_layers,
-                                    norm_layer, _width_mult=1.0)
+                                    norm_layer, rescaling, _width_mult=1.0)
 
     size = (320, 320)
     anchor_generator = DefaultBoxGenerator([[2, 3] for _ in range(6)], min_ratio=0.2, max_ratio=0.95)
@@ -181,7 +192,8 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
     }
     kwargs = {**defaults, **kwargs}
     model = SSD(backbone, anchor_generator, size, num_classes,
-                head=SSDLiteHead(out_channels, num_anchors, num_classes, norm_layer), **kwargs)
+                head=SSDLiteHead(out_channels, num_anchors, num_classes, norm_layer),
+                image_mean=[0., 0., 0.], image_std=[1., 1., 1.], **kwargs)
 
     if pretrained:
         weights_name = 'ssdlite320_mobilenet_v3_large_coco'