Skip to content

Training starts correctly but suddenly stops after a while #241

@gvalvano

Description

@gvalvano

Describe the bug
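Training starts correctly, but after a while (shortly after the evaluation at iteration 3000 in the log below) it aborts with `AttributeError: 'NoneType' object has no attribute 'shape'`, raised inside a DataLoader worker process while loading a training image.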

Reproduction

The config file:

dataset_type = 'UnconditionalImageDataset'
train_pipeline = [
    dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
    dict(
        type='Resize',
        keys=['real_img'],
        scale=(1126, 1126),
        interpolation='bicubic'),
    dict(
        type='Crop',
        keys=['real_img'],
        crop_size=(1024, 1024),
        random_crop=True),
    dict(
        type='Normalize',
        keys=['real_img'],
        mean=[127.5, 127.5, 127.5],
        std=[127.5, 127.5, 127.5],
        to_rgb=False),
    dict(type='Flip', keys=['real_img'], direction='horizontal'),
    dict(type='ImageToTensor', keys=['real_img']),
    dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
]
val_pipeline = [
    dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
    dict(
        type='Resize',
        keys=['real_img'],
        scale=(1024, 1024),
        interpolation='bicubic'),
    dict(
        type='Normalize',
        keys=['real_img'],
        mean=[127.5, 127.5, 127.5],
        std=[127.5, 127.5, 127.5],
        to_rgb=False),
    dict(type='ImageToTensor', keys=['real_img']),
    dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=1,
    train=dict(
        type='RepeatDataset',
        times=100,
        dataset=dict(
            type='UnconditionalImageDataset',
            imgs_root='../data/images_collection/',
            pipeline=[
                dict(
                    type='LoadImageFromFile',
                    key='real_img',
                    io_backend='disk'),
                dict(
                    type='Resize',
                    keys=['real_img'],
                    scale=(1126, 1126),
                    interpolation='bicubic'),
                dict(
                    type='Crop',
                    keys=['real_img'],
                    crop_size=(1024, 1024),
                    random_crop=True),
                dict(
                    type='Normalize',
                    keys=['real_img'],
                    mean=[127.5, 127.5, 127.5],
                    std=[127.5, 127.5, 127.5],
                    to_rgb=False),
                dict(type='Flip', keys=['real_img'], direction='horizontal'),
                dict(type='ImageToTensor', keys=['real_img']),
                dict(
                    type='Collect',
                    keys=['real_img'],
                    meta_keys=['real_img_path'])
            ])),
    val=dict(
        type='UnconditionalImageDataset',
        imgs_root='../data/images_collection/',
        pipeline=[
            dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
            dict(
                type='Resize',
                keys=['real_img'],
                scale=(1024, 1024),
                interpolation='bicubic'),
            dict(
                type='Normalize',
                keys=['real_img'],
                mean=[127.5, 127.5, 127.5],
                std=[127.5, 127.5, 127.5],
                to_rgb=False),
            dict(type='ImageToTensor', keys=['real_img']),
            dict(
                type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
        ]))
d_reg_interval = 16
g_reg_interval = 4
g_reg_ratio = 0.8
d_reg_ratio = 0.9411764705882353
model = dict(
    type='StaticUnconditionalGAN',
    generator=dict(
        type='StyleGANv2Generator', out_size=1024, style_channels=512),
    discriminator=dict(type='StyleGAN2Discriminator', in_size=1024),
    gan_loss=dict(type='GANLoss', gan_type='wgan-logistic-ns'),
    disc_auxiliary_loss=dict(
        type='R1GradientPenalty',
        loss_weight=80.0,
        interval=16,
        norm_mode='HWC',
        data_info=dict(real_data='real_imgs', discriminator='disc')),
    gen_auxiliary_loss=dict(
        type='GeneratorPathRegularizer',
        loss_weight=8.0,
        pl_batch_shrink=2,
        interval=4,
        data_info=dict(generator='gen', num_batches='batch_size')))
train_cfg = dict(use_ema=True)
test_cfg = None
optimizer = dict(
    generator=dict(type='Adam', lr=0.0016, betas=(0, 0.9919919678228657)),
    discriminator=dict(
        type='Adam', lr=0.0018823529411764706, betas=(0, 0.9905854573074332)))
checkpoint_config = dict(interval=2000, by_epoch=False, max_keep_ckpts=5)
log_config = dict(
    interval=500,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')])
custom_hooks = [
    dict(
        type='VisualizeUnconditionalSamples',
        output_dir='training_samples',
        interval=5000),
    dict(
        type='ExponentialMovingAverageHook',
        module_keys=('generator_ema', ),
        interval=1,
        interp_cfg=dict(momentum=0.9977843871238888),
        priority='VERY_HIGH')
]
runner = dict(
    type='DynamicIterBasedRunner',
    is_dynamic_ddp=False,
    pass_training_status=True)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 10000)]
find_unused_parameters = True
cudnn_benchmark = True
opencv_num_threads = 0
mp_start_method = 'fork'
metrics = dict(
    fid50k=dict(
        type='FID',
        num_images=10000,
        inception_pkl=
        'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl',
        bgr2rgb=True),
    pr50k3=dict(type='PR', num_images=500, k=3),
    is50k=dict(type='IS', num_images=10000),
    ppl_zfull=dict(type='PPL', space='Z', sampling='full', num_images=10000),
    ppl_wfull=dict(type='PPL', space='W', sampling='full', num_images=10000),
    ppl_zend=dict(type='PPL', space='Z', sampling='end', num_images=10000),
    ppl_wend=dict(type='PPL', space='W', sampling='end', num_images=500),
    ms_ssim10k=dict(type='MS_SSIM', num_images=10000),
    swd16k=dict(type='SWD', num_images=10000))
run_id = 'stylegan'
data_root = '../data/images_collection/'
n_epochs = 500
n_images = 21505
total_iters = 10752500
image_resolution = 1024
inception_file_name = 'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl'
batch_size = 2
lr_config = None
ema_half_life = 10.0
_load = dict(type='LoadImageFromFile', key='real_img', io_backend='disk')
_normalize = dict(
    type='Normalize',
    keys=['real_img'],
    mean=[127.5, 127.5, 127.5],
    std=[127.5, 127.5, 127.5],
    to_rgb=False)
_resize = dict(
    type='Resize',
    keys=['real_img'],
    scale=(1126, 1126),
    interpolation='bicubic')
_resize_val = dict(
    type='Resize',
    keys=['real_img'],
    scale=(1024, 1024),
    interpolation='bicubic')
_crop = dict(
    type='Crop', keys=['real_img'], crop_size=(1024, 1024), random_crop=True)
_flip = dict(type='Flip', keys=['real_img'], direction='horizontal')
_convert = dict(type='ImageToTensor', keys=['real_img'])
_collect = dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
evaluation = dict(
    type='GenerativeEvalHook',
    interval=200,
    metrics=dict(
        type='FID',
        num_images=1000,
        inception_pkl=
        'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl',
        bgr2rgb=True),
    sample_kwargs=dict(sample_model='ema'))
use_ddp_wrapper = True
work_dir = 'results/logs/stylegan'
gpu_ids = range(0, 1)
  1. Did you make any modifications to the code or config? Did you understand what you have modified? --> Yes
  2. What dataset did you use? --> Custom, under ../data/images_collection/

Environment

  1. Please run python mmgen/utils/collect_env.py to collect necessary environment information and paste it here.
/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py:125: RuntimeWarning: 'mmgen.utils.collect_env' found in sys.modules after import of package 'mmgen.utils', but prior to execution of 'mmgen.utils.collect_env'; this may result in unpredictable behaviour
  warn(RuntimeWarning(msg))
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
sys.platform: linux
Python: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) [GCC 9.4.0]
CUDA available: True
CUDA_HOME: /usr
NVCC: Cuda compilation tools, release 9.1, V9.1.85
GPU 0: TITAN Xp
GPU 1: GeForce GTX 1080 Ti
GCC: gcc (Ubuntu 5.5.0-12ubuntu1) 5.5.0 20171010
PyTorch: 1.10.2
PyTorch compiling details: PyTorch built with:
  - GCC 7.3
  - C++ Version: 201402
  - Intel(R) oneAPI Math Kernel Library Version 2022.0-Product Build 20211112 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 10.2
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
  - CuDNN 7.6.5
  - Magma 2.5.2
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=10.2, CUDNN_VERSION=7.6.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.2, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, 

TorchVision: 0.11.3
OpenCV: 4.5.5
MMCV: 1.4.4
MMGen: 0.5.0+
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: 10.2
  2. You may add additional information that may be helpful for locating the problem, such as
    • How you installed PyTorch [e.g., pip, conda, source] --> conda
    • Other environment variables that may be related (such as $PATH, $LD_LIBRARY_PATH, $PYTHONPATH, etc.) --> N.A

Error traceback
If applicable, paste the error traceback here.

[...]
2022-02-03 10:34:04,432 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:34:04,433 - mmgen - INFO - Sample 1000 fake images for evaluation
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1000/1000, 19.6 task/s, elapsed: 51s, ETA:     0s
2022-02-03 10:34:59,248 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:34:59,249 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:34:59,251 - mmgen - INFO - Iter(val) [2800]       fid: 487.0529, fid_mean: 355.8218994140625, fid_cov: 131.2310
2022-02-03 10:38:12,054 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:38:12,054 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:38:12,055 - mmgen - INFO - Sample 1000 fake images for evaluation
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1000/1000, 19.4 task/s, elapsed: 51s, ETA:     0s
2022-02-03 10:39:07,205 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:39:07,206 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:39:07,207 - mmgen - INFO - Exp name: stylegan.py
2022-02-03 10:39:07,207 - mmgen - INFO - Iter(val) [3000]       fid: 541.2982, fid_mean: 405.7856750488281, fid_cov: 135.5125
Traceback (most recent call last):
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/path/to/project/MMGeneration/tools/train.py", line 204, in <module>
    main()
  File "/path/to/project/MMGeneration/tools/train.py", line 200, in main
    meta=meta)
  File "/path/to/project/MMGeneration/mmgen/apis/train.py", line 199, in train_model
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
  File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 285, in run
    iter_runner(iter_loaders[i], **kwargs)
  File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 197, in train
    data_batch = next(self.data_loader)
  File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 103, in __next__
    data = next(self.iter_loader)
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
    data = self._next_data()
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1203, in _next_data
    return self._process_data(data)
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1229, in _process_data
    data.reraise()
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/_utils.py", line 434, in reraise
    raise exception
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/path/to/project/MMGeneration/mmgen/datasets/dataset_wrappers.py", line 31, in __getitem__
    return self.dataset[idx % self._ori_len]
  File "/path/to/project/MMGeneration/mmgen/datasets/unconditional_image_dataset.py", line 74, in __getitem__
    return self.prepare_train_data(idx)
  File "/path/to/project/MMGeneration/mmgen/datasets/unconditional_image_dataset.py", line 55, in prepare_train_data
    return self.pipeline(results)
  File "/path/to/project/MMGeneration/mmgen/datasets/pipelines/compose.py", line 57, in __call__
    data = t(data)
  File "/path/to/project/MMGeneration/mmgen/datasets/pipelines/loading.py", line 67, in __call__
    results[f'{self.key}_ori_shape'] = img.shape
AttributeError: 'NoneType' object has no attribute 'shape'

Bug fix
If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!

Metadata

Metadata

Assignees

Labels

kind/bug (something isn't working), priority/P0 (highest priority)

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions