pytorch
diff --git a/‎.circleci/config.yml
Lines changed: 109 additions & 929 deletions b/‎.circleci/config.yml
Lines changed: 109 additions & 929 deletions
diff --git a/‎.circleci/regenerate.py
Lines changed: 1 addition & 1 deletion b/‎.circleci/regenerate.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.rst
Lines changed: 3 additions & 1 deletion b/‎README.rst
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/source/datasets.rst
Lines changed: 2 additions & 0 deletions b/‎docs/source/datasets.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎packaging/windows/internal/cuda_install.bat
Lines changed: 4 additions & 0 deletions b/‎packaging/windows/internal/cuda_install.bat
Lines changed: 4 additions & 0 deletions
diff --git a/‎references/classification/README.md
Lines changed: 38 additions & 6 deletions b/‎references/classification/README.md
Lines changed: 38 additions & 6 deletions
diff --git a/‎references/classification/sampler.py
Lines changed: 3 additions & 3 deletions b/‎references/classification/sampler.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎setup.py
Lines changed: 53 additions & 0 deletions b/‎setup.py
Lines changed: 53 additions & 0 deletions
diff --git a/‎test/test_datasets.py
Lines changed: 73 additions & 0 deletions b/‎test/test_datasets.py
Lines changed: 73 additions & 0 deletions
diff --git a/‎test/test_video_gpu_decoder.py
Lines changed: 41 additions & 0 deletions b/‎test/test_video_gpu_decoder.py
Lines changed: 41 additions & 0 deletions
@@ -21,7 +21,7 @@
 from jinja2 import select_autoescape
 
 
-PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"]
+PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
 
 RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"
 
 
@@ -21,7 +21,9 @@ supported Python versions.
 +--------------------------+--------------------------+---------------------------------+
 | ``torch``                | ``torchvision``          | ``python``                      |
 +==========================+==========================+=================================+
-| ``main`` / ``nightly``   | ``main`` / ``nightly``   | ``>=3.6``, ``<=3.9``            |
+| ``main`` / ``nightly``   | ``main`` / ``nightly``   | ``>=3.7``, ``<=3.9``            |
++--------------------------+--------------------------+---------------------------------+
+| ``1.10.1``               | ``0.11.2``               | ``>=3.6``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
 | ``1.10.0``               | ``0.11.1``               | ``>=3.6``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
 
@@ -38,6 +38,7 @@ You can also create your own datasets using the provided :ref:`base classes <bas
     Cityscapes
     CocoCaptions
     CocoDetection
+    DTD
     EMNIST
     FakeData
     FashionMNIST
@@ -46,6 +47,7 @@ You can also create your own datasets using the provided :ref:`base classes <bas
     Flickr30k
     FlyingChairs
     FlyingThings3D
+    Food101
     HD1K
     HMDB51
     ImageNet
 
@@ -212,6 +212,10 @@ if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
 echo Installing CUDA toolkit...
 7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda"
 pushd "%SRC_DIR%\temp_build\cuda"
+sc config wuauserv start= disabled
+sc stop wuauserv
+sc query wuauserv
+
 start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs"
 echo %errorlevel%
 
 
@@ -145,25 +145,57 @@ Here `$MODEL` is one of `regnet_x_32gf`, `regnet_y_16gf` and `regnet_y_32gf`.
 
 ### Vision Transformer
 
-#### Base models
+#### vit_b_16
 ```
 torchrun --nproc_per_node=8 train.py\
-    --model $MODEL --epochs 300 --batch-size 64 --opt adamw --lr 0.003 --wd 0.3\
+    --model vit_b_16 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
     --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
     --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
     --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
 ```
-Here `$MODEL` is one of `vit_b_16` and `vit_b_32`.
 
-#### Large models
+Note that the above command corresponds to training on a single node with 8 GPUs.
+To generate the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+and `--batch-size 64`.
+
+#### vit_b_32
 ```
 torchrun --nproc_per_node=8 train.py\
-    --model $MODEL --epochs 300 --batch-size 16 --opt adamw --lr 0.003 --wd 0.3\
+    --model vit_b_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
+    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
+    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment imagenet\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+To generate the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
+and `--batch-size 256`.
+
+#### vit_l_16
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_l_16 --epochs 600 --batch-size 128 --lr 0.5 --lr-scheduler cosineannealinglr\
+    --lr-warmup-method linear --lr-warmup-epochs 5 --label-smoothing 0.1 --mixup-alpha 0.2\
+    --auto-augment ta_wide --random-erase 0.1 --weight-decay 0.00002 --norm-weight-decay 0.0\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema --val-resize-size 232
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+To generate the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
+and `--batch-size 64`.
+
+#### vit_l_32
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_l_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
     --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
     --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
     --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
 ```
-Here `$MODEL` is one of `vit_l_16` and `vit_l_32`.
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+To generate the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+and `--batch-size 64`.
 
 ## Mixed precision training
 Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [torch.cuda.amp](https://pytorch.org/docs/stable/amp.html?highlight=amp#module-torch.cuda.amp).
 
@@ -36,10 +36,10 @@ def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0,
         self.repetitions = repetitions
 
     def __iter__(self):
-        # Deterministically shuffle based on epoch
-        g = torch.Generator()
-        g.manual_seed(self.seed + self.epoch)
         if self.shuffle:
+            # Deterministically shuffle based on epoch
+            g = torch.Generator()
+            g.manual_seed(self.seed + self.epoch)
             indices = torch.randperm(len(self.dataset), generator=g).tolist()
         else:
             indices = list(range(len(self.dataset)))
 
@@ -427,6 +427,59 @@ def get_extensions():
             )
         )
 
+    # Locating video codec
+    # CUDA_HOME should be set to the cuda root directory.
+    # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the location to
+    # video codec header files and libraries respectively.
+    video_codec_found = (
+        extension is CUDAExtension
+        and CUDA_HOME is not None
+        and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in vision_include])
+        and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in vision_include])
+        and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in library_dirs])
+    )
+
+    print(f"video codec found: {video_codec_found}")
+
+    if (
+        video_codec_found
+        and has_ffmpeg
+        and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir])
+    ):
+        gpu_decoder_path = os.path.join(extensions_dir, "io", "decoder", "gpu")
+        gpu_decoder_src = glob.glob(os.path.join(gpu_decoder_path, "*.cpp"))
+        cuda_libs = os.path.join(CUDA_HOME, "lib64")
+        cuda_inc = os.path.join(CUDA_HOME, "include")
+
+        ext_modules.append(
+            extension(
+                "torchvision.Decoder",
+                gpu_decoder_src,
+                include_dirs=include_dirs + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir,
+                library_dirs=ffmpeg_library_dir + library_dirs + [cuda_libs],
+                libraries=[
+                    "avcodec",
+                    "avformat",
+                    "avutil",
+                    "swresample",
+                    "swscale",
+                    "nvcuvid",
+                    "cuda",
+                    "cudart",
+                    "z",
+                    "pthread",
+                    "dl",
+                ],
+                extra_compile_args=extra_compile_args,
+            )
+        )
+    else:
+        print(
+            "The installed version of ffmpeg is missing the header file 'bsf.h' which is "
+            "required for GPU video decoding. Please install the latest ffmpeg from conda-forge channel:"
+            " `conda install -c conda-forge ffmpeg`."
+        )
+
     return ext_modules
 
 
 
@@ -2169,6 +2169,79 @@ def inject_fake_data(self, tmpdir, config):
         return num_sequences * (num_examples_per_sequence - 1)
 
 
+class Food101TestCase(datasets_utils.ImageDatasetTestCase):
+    # Exercises datasets.Food101 on a tiny fake "food-101" tree, once per split.
+    DATASET_CLASS = datasets.Food101
+    FEATURE_TYPES = (PIL.Image.Image, int)
+
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
+
+    def inject_fake_data(self, tmpdir: str, config):
+        """Create fake images plus the split's JSON listing; return the listed sample count."""
+        root = pathlib.Path(tmpdir) / "food-101"
+        images_dir = root / "images"
+        meta_dir = root / "meta"
+        images_dir.mkdir(parents=True)
+        meta_dir.mkdir()
+
+        class_names = ("apple_pie", "crab_cakes", "gyoza")
+        images_per_class = 5
+        picks_per_class = 3 if config["split"] == "train" else 2
+
+        listing = {}
+        for class_name in class_names:
+            created = datasets_utils.create_image_folder(
+                images_dir,
+                class_name,
+                file_name_fn=lambda idx: f"{idx}.jpg",
+                num_examples=images_per_class,
+            )
+            # Entries are paths relative to the image folder, suffix dropped, joined with "/".
+            chosen = random.choices(created, k=picks_per_class)
+            listing[class_name] = ["/".join(p.relative_to(images_dir).with_suffix("").parts) for p in chosen]
+
+        with open(meta_dir / f"{config['split']}.json", "w") as file:
+            json.dump(listing, file)
+
+        return len(class_names) * picks_per_class
+
+
+class DTDTestCase(datasets_utils.ImageDatasetTestCase):
+    # Exercises datasets.DTD on a fake "dtd/dtd" tree for each configured split/partition pair.
+    DATASET_CLASS = datasets.DTD
+    FEATURE_TYPES = (PIL.Image.Image, int)
+
+    # Each fold is treated exactly the same, so three partitions stand in for the whole matrix.
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "val"), partition=(1, 5, 10))
+
+    def inject_fake_data(self, tmpdir: str, config):
+        """Create fake images and one ``labels/<split><partition>.txt`` file; return its entry count."""
+        dataset_root = pathlib.Path(tmpdir) / "dtd" / "dtd"
+        images_dir = dataset_root / "images"
+        images_per_class = 3
+
+        created = []
+        for texture in ("banded", "marbled", "zigzagged"):
+            created += datasets_utils.create_image_folder(
+                images_dir,
+                texture,
+                file_name_fn=lambda idx: f"{texture}_{idx:04d}.jpg",
+                num_examples=images_per_class,
+            )
+
+        labels_dir = dataset_root / "labels"
+        labels_dir.mkdir()
+        # Ids are each file's path relative to its grandparent directory, always with "/" separators.
+        all_ids = [str(path.relative_to(path.parents[1])).replace(os.sep, "/") for path in created]
+        listed_ids = random.choices(all_ids, k=len(created) // 2)
+
+        listing_path = labels_dir / f"{config['split']}{config['partition']}.txt"
+        with open(listing_path, "w") as file:
+            file.write("\n".join(listed_ids) + "\n")
+
+        return len(listed_ids)
+
+
 class FER2013TestCase(datasets_utils.ImageDatasetTestCase):
     DATASET_CLASS = datasets.FER2013
     ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
 
@@ -0,0 +1,41 @@
+import os
+
+import pytest
+import torch
+from torchvision.io import _HAS_VIDEO_DECODER, VideoReader
+
+try:
+    import av
+except ImportError:
+    av = None
+
+VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
+
+test_videos = [
+    "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+    "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
+    "v_SoccerJuggling_g23_c01.avi",
+    "v_SoccerJuggling_g24_c01.avi",
+    "R6llTwEh07w.mp4",
+    "SOX5yA1l24A.mp4",
+    "WUzgd7C1pWA.mp4",
+]
+
+
+@pytest.mark.skipif(_HAS_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder")
+class TestVideoGPUDecoder:
+    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
+    def test_frame_reading(self):
+        # Each GPU-decoded frame must stay within a mean absolute difference of 0.1 of PyAV's frame.
+        for video_name in test_videos:
+            video_path = os.path.join(VIDEO_DIR, video_name)
+            gpu_reader = VideoReader(video_path, device="cuda:0")
+            with av.open(video_path) as container:
+                for av_frame in container.decode(container.streams.video[0]):
+                    expected = torch.tensor(av_frame.to_ndarray().flatten()).float()
+                    actual = gpu_reader._reformat(next(gpu_reader)["data"]).float()
+                    assert torch.mean(torch.abs(expected - actual)) < 0.1
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])