@@ -87,7 +87,7 @@ def forward(self, x):
87
87
return output
88
88
89
89
90
- def vgg16_mfm_backbone (pretrained , trainable_layers = 3 ):
90
+ def _vgg16_mfm_backbone (pretrained , trainable_layers = 3 ):
91
91
backbone = vgg .vgg16 (pretrained = pretrained ).features
92
92
93
93
# Gather the indices of maxpools. These are the locations of output blocks.
@@ -102,50 +102,77 @@ def vgg16_mfm_backbone(pretrained, trainable_layers=3):
102
102
for parameter in b .parameters ():
103
103
parameter .requires_grad_ (False )
104
104
105
+ # Patch ceil_mode for all maxpool layers of the backbone to get the same outputs as Fig. 2 of the SSD paper
106
+ for layer in backbone :
107
+ if isinstance (layer , nn .MaxPool2d ):
108
+ layer .ceil_mode = True
109
+
105
110
# Multiple Feature map definition - page 4, Fig 2 of SSD paper
111
+ def build_feature_map_block (layers , out_channels ):
112
+ block = nn .Sequential (* layers )
113
+ block .out_channels = out_channels
114
+ return block
115
+
106
116
feature_maps = nn .ModuleList ([
107
117
# Conv4_3 map
108
- nn .Sequential (
109
- * backbone [:23 ], # until conv4_3
118
+ build_feature_map_block (
119
+ backbone [:23 ], # until conv4_3
120
+ # TODO: add L2 normalization + scaling?
121
+ 512
110
122
),
111
123
# FC7 map
112
- nn .Sequential (
113
- * backbone [23 :], # until maxpool5 # TODO: replace maxpool 5 as in the paper?
114
- nn .Conv2d (in_channels = 512 , out_channels = 1024 , kernel_size = 3 , padding = 1 ), # FC6
115
- nn .ReLU (inplace = True ),
116
- nn .Conv2d (in_channels = 1024 , out_channels = 1024 , kernel_size = 1 ), # FC7
117
- nn .ReLU (inplace = True )
124
+ build_feature_map_block (
125
+ (
126
+ * backbone [23 :- 1 ], # until conv5_3
127
+ nn .MaxPool2d (kernel_size = 3 , stride = 1 , padding = 1 , ceil_mode = True ), # modified maxpool5
128
+ nn .Conv2d (in_channels = 512 , out_channels = 1024 , kernel_size = 3 , padding = 6 , dilation = 6 ), # FC6 with atrous
129
+ nn .ReLU (inplace = True ),
130
+ nn .Conv2d (in_channels = 1024 , out_channels = 1024 , kernel_size = 1 ), # FC7
131
+ nn .ReLU (inplace = True )
132
+ ),
133
+ 1024
118
134
),
119
135
# Conv8_2 map
120
- nn .Sequential (
121
- nn .Conv2d (1024 , 256 , kernel_size = 1 ),
122
- nn .ReLU (inplace = True ),
123
- nn .Conv2d (256 , 512 , kernel_size = 3 , padding = 1 , stride = 2 ),
124
- nn .ReLU (inplace = True ),
136
+ build_feature_map_block (
137
+ (
138
+ nn .Conv2d (1024 , 256 , kernel_size = 1 ),
139
+ nn .ReLU (inplace = True ),
140
+ nn .Conv2d (256 , 512 , kernel_size = 3 , padding = 1 , stride = 2 ),
141
+ nn .ReLU (inplace = True ),
142
+ ),
143
+ 512 ,
125
144
),
126
145
# Conv9_2 map
127
- nn .Sequential (
128
- nn .Conv2d (512 , 128 , kernel_size = 1 ),
129
- nn .ReLU (inplace = True ),
130
- nn .Conv2d (128 , 256 , kernel_size = 3 , padding = 1 , stride = 2 ),
131
- nn .ReLU (inplace = True ),
146
+ build_feature_map_block (
147
+ (
148
+ nn .Conv2d (512 , 128 , kernel_size = 1 ),
149
+ nn .ReLU (inplace = True ),
150
+ nn .Conv2d (128 , 256 , kernel_size = 3 , padding = 1 , stride = 2 ),
151
+ nn .ReLU (inplace = True ),
152
+ ),
153
+ 256 ,
132
154
),
133
155
# Conv10_2 map
134
- nn .Sequential (
135
- nn .Conv2d (256 , 128 , kernel_size = 1 ),
136
- nn .ReLU (inplace = True ),
137
- nn .Conv2d (128 , 256 , kernel_size = 3 , padding = 1 ),
138
- nn .ReLU (inplace = True ),
156
+ build_feature_map_block (
157
+ (
158
+ nn .Conv2d (256 , 128 , kernel_size = 1 ),
159
+ nn .ReLU (inplace = True ),
160
+ nn .Conv2d (128 , 256 , kernel_size = 3 ),
161
+ nn .ReLU (inplace = True ),
162
+ ),
163
+ 256 ,
139
164
),
140
165
# Conv11_2 map
141
- nn .Sequential (
142
- nn .Conv2d (256 , 128 , kernel_size = 1 ),
143
- nn .ReLU (inplace = True ),
144
- nn .Conv2d (128 , 256 , kernel_size = 3 , padding = 1 ),
145
- nn .ReLU (inplace = True ),
166
+ build_feature_map_block (
167
+ (
168
+ nn .Conv2d (256 , 128 , kernel_size = 1 ),
169
+ nn .ReLU (inplace = True ),
170
+ nn .Conv2d (128 , 256 , kernel_size = 3 ),
171
+ nn .ReLU (inplace = True ),
172
+ ),
173
+ 256 ,
146
174
),
147
175
])
148
- # TODO: keep track of block output sizes in a variable. Perhaps define a new block class that has it as attribute?
149
176
150
177
return MultiFeatureMap (feature_maps )
151
178
@@ -159,7 +186,7 @@ def ssd_vgg16(pretrained=False, progress=True,
159
186
# no need to download the backbone if pretrained is set
160
187
pretrained_backbone = False
161
188
162
- backbone = vgg16_mfm_backbone (pretrained_backbone , trainable_layers = trainable_backbone_layers )
189
+ backbone = _vgg16_mfm_backbone (pretrained_backbone , trainable_layers = trainable_backbone_layers )
163
190
model = SSD (backbone , num_classes , ** kwargs )
164
191
if pretrained :
165
192
pass # TODO: load pre-trained COCO weights
0 commit comments