From 52575a0f0ad574e36188529a2fa8c6f189878480 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 5 Sep 2022 11:53:17 +0100 Subject: [PATCH 1/3] S3D weight deployment --- references/video_classification/README.md | 15 +++++++++++++++ torchvision/models/video/s3d.py | 14 ++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 9bd1b9cc285..0bf670a42db 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -97,6 +97,21 @@ Video resnet models: --val-crop-size 112 112 ``` +### S3D + +The S3D model was trained similarly to the above but with the following changes on the default configuration: +``` +--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \ +--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224 +``` + +We used 64 GPUs to train the architecture. + +To estimate the validation statistics of the model, we run the reference script with the following configuration: +``` +--batch-size=16 --test-only --clip-len 128 --clips-per-video 1 +``` + ### Additional video modelling resources - [Video Model Zoo](https://github.com/facebookresearch/VMZ) diff --git a/torchvision/models/video/s3d.py b/torchvision/models/video/s3d.py index f80d849683c..2be68f15494 100644 --- a/torchvision/models/video/s3d.py +++ b/torchvision/models/video/s3d.py @@ -104,7 +104,7 @@ class S3D(nn.Module): def __init__( self, num_classes: int = 400, - dropout: float = 0.0, + dropout: float = 0.2, norm_layer: Optional[Callable[..., torch.nn.Module]] = None, ) -> None: super().__init__() @@ -153,28 +153,26 @@ def forward(self, x): class S3D_Weights(WeightsEnum): KINETICS400_V1 = Weights( - url="https://download.pytorch.org/models/s3d-1bd8ae63.pth", + url="https://download.pytorch.org/models/s3d-d76dad2f.pth", transforms=partial( VideoClassification, crop_size=(224, 224), resize_size=(256, 256), - mean=(0.5, 0.5, 0.5), - std=(0.5, 0.5, 0.5), ), meta={ "min_size": (224, 224), "min_temporal_size": 14, "categories": _KINETICS400_CATEGORIES, - "recipe": "https://github.com/pytorch/vision/pull/6412#issuecomment-1219687434", + "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification#s3d", "_docs": ( - "The weights are ported from a community repository. The accuracies are estimated on clip-level " + "The weights aim to approximate the accuracy of the paper. The accuracies are estimated on clip-level " "with parameters `frame_rate=15`, `clips_per_video=1`, and `clip_len=128`." ), "num_params": 8320048, "_metrics": { "Kinetics-400": { - "acc@1": 67.315, - "acc@5": 87.593, + "acc@1": 68.345, + "acc@5": 88.050, } }, }, From 81a153b69343f0b14c0599f2c5daafd719f89d6b Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 5 Sep 2022 13:04:25 +0100 Subject: [PATCH 2/3] Update accuracies. --- torchvision/models/video/s3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/video/s3d.py b/torchvision/models/video/s3d.py index 2be68f15494..f7d364c665f 100644 --- a/torchvision/models/video/s3d.py +++ b/torchvision/models/video/s3d.py @@ -171,7 +171,7 @@ class S3D_Weights(WeightsEnum): "num_params": 8320048, "_metrics": { "Kinetics-400": { - "acc@1": 68.345, + "acc@1": 68.368, "acc@5": 88.050, } }, From 3ca5ac43e3a955084313f96aa85e74aadd48be68 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 5 Sep 2022 13:21:04 +0100 Subject: [PATCH 3/3] Address review comments. --- references/video_classification/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 0bf670a42db..cbd303275e5 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -81,6 +81,7 @@ Video resnet models: ``` # number of frames per clip --clip_len 16 \ +--frame-rate 15 \ # allow for temporal jittering --clips_per_video 5 \ --batch-size 24 \