-
Notifications
You must be signed in to change notification settings - Fork 232
Training starts correctly but suddenly stops after a while #241
Copy link
Copy link
Open
Labels
Milestone
Description
Describe the bug
The training starts correctly, but it suddenly stops with AttributeError after a while.
Reproduction
The config file:
dataset_type = 'UnconditionalImageDataset'
train_pipeline = [
dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
dict(
type='Resize',
keys=['real_img'],
scale=(1126, 1126),
interpolation='bicubic'),
dict(
type='Crop',
keys=['real_img'],
crop_size=(1024, 1024),
random_crop=True),
dict(
type='Normalize',
keys=['real_img'],
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=False),
dict(type='Flip', keys=['real_img'], direction='horizontal'),
dict(type='ImageToTensor', keys=['real_img']),
dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
]
val_pipeline = [
dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
dict(
type='Resize',
keys=['real_img'],
scale=(1024, 1024),
interpolation='bicubic'),
dict(
type='Normalize',
keys=['real_img'],
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=False),
dict(type='ImageToTensor', keys=['real_img']),
dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=1,
train=dict(
type='RepeatDataset',
times=100,
dataset=dict(
type='UnconditionalImageDataset',
imgs_root='../data/images_collection/',
pipeline=[
dict(
type='LoadImageFromFile',
key='real_img',
io_backend='disk'),
dict(
type='Resize',
keys=['real_img'],
scale=(1126, 1126),
interpolation='bicubic'),
dict(
type='Crop',
keys=['real_img'],
crop_size=(1024, 1024),
random_crop=True),
dict(
type='Normalize',
keys=['real_img'],
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=False),
dict(type='Flip', keys=['real_img'], direction='horizontal'),
dict(type='ImageToTensor', keys=['real_img']),
dict(
type='Collect',
keys=['real_img'],
meta_keys=['real_img_path'])
])),
val=dict(
type='UnconditionalImageDataset',
imgs_root='../data/images_collection/',
pipeline=[
dict(type='LoadImageFromFile', key='real_img', io_backend='disk'),
dict(
type='Resize',
keys=['real_img'],
scale=(1024, 1024),
interpolation='bicubic'),
dict(
type='Normalize',
keys=['real_img'],
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=False),
dict(type='ImageToTensor', keys=['real_img']),
dict(
type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
]))
d_reg_interval = 16
g_reg_interval = 4
g_reg_ratio = 0.8
d_reg_ratio = 0.9411764705882353
model = dict(
type='StaticUnconditionalGAN',
generator=dict(
type='StyleGANv2Generator', out_size=1024, style_channels=512),
discriminator=dict(type='StyleGAN2Discriminator', in_size=1024),
gan_loss=dict(type='GANLoss', gan_type='wgan-logistic-ns'),
disc_auxiliary_loss=dict(
type='R1GradientPenalty',
loss_weight=80.0,
interval=16,
norm_mode='HWC',
data_info=dict(real_data='real_imgs', discriminator='disc')),
gen_auxiliary_loss=dict(
type='GeneratorPathRegularizer',
loss_weight=8.0,
pl_batch_shrink=2,
interval=4,
data_info=dict(generator='gen', num_batches='batch_size')))
train_cfg = dict(use_ema=True)
test_cfg = None
optimizer = dict(
generator=dict(type='Adam', lr=0.0016, betas=(0, 0.9919919678228657)),
discriminator=dict(
type='Adam', lr=0.0018823529411764706, betas=(0, 0.9905854573074332)))
checkpoint_config = dict(interval=2000, by_epoch=False, max_keep_ckpts=5)
log_config = dict(
interval=500,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
custom_hooks = [
dict(
type='VisualizeUnconditionalSamples',
output_dir='training_samples',
interval=5000),
dict(
type='ExponentialMovingAverageHook',
module_keys=('generator_ema', ),
interval=1,
interp_cfg=dict(momentum=0.9977843871238888),
priority='VERY_HIGH')
]
runner = dict(
type='DynamicIterBasedRunner',
is_dynamic_ddp=False,
pass_training_status=True)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 10000)]
find_unused_parameters = True
cudnn_benchmark = True
opencv_num_threads = 0
mp_start_method = 'fork'
metrics = dict(
fid50k=dict(
type='FID',
num_images=10000,
inception_pkl=
'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl',
bgr2rgb=True),
pr50k3=dict(type='PR', num_images=500, k=3),
is50k=dict(type='IS', num_images=10000),
ppl_zfull=dict(type='PPL', space='Z', sampling='full', num_images=10000),
ppl_wfull=dict(type='PPL', space='W', sampling='full', num_images=10000),
ppl_zend=dict(type='PPL', space='Z', sampling='end', num_images=10000),
ppl_wend=dict(type='PPL', space='W', sampling='end', num_images=500),
ms_ssim10k=dict(type='MS_SSIM', num_images=10000),
swd16k=dict(type='SWD', num_images=10000))
run_id = 'stylegan'
data_root = '../data/images_collection/'
n_epochs = 500
n_images = 21505
total_iters = 10752500
image_resolution = 1024
inception_file_name = 'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl'
batch_size = 2
lr_config = None
ema_half_life = 10.0
_load = dict(type='LoadImageFromFile', key='real_img', io_backend='disk')
_normalize = dict(
type='Normalize',
keys=['real_img'],
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=False)
_resize = dict(
type='Resize',
keys=['real_img'],
scale=(1126, 1126),
interpolation='bicubic')
_resize_val = dict(
type='Resize',
keys=['real_img'],
scale=(1024, 1024),
interpolation='bicubic')
_crop = dict(
type='Crop', keys=['real_img'], crop_size=(1024, 1024), random_crop=True)
_flip = dict(type='Flip', keys=['real_img'], direction='horizontal')
_convert = dict(type='ImageToTensor', keys=['real_img'])
_collect = dict(type='Collect', keys=['real_img'], meta_keys=['real_img_path'])
evaluation = dict(
type='GenerativeEvalHook',
interval=200,
metrics=dict(
type='FID',
num_images=1000,
inception_pkl=
'results/pretrained_models/inception/ffhq-1024-50k-rgb.pkl',
bgr2rgb=True),
sample_kwargs=dict(sample_model='ema'))
use_ddp_wrapper = True
work_dir = 'results/logs/stylegan'
gpu_ids = range(0, 1)
- Did you make any modifications on the code or config? Did you understand what you have modified? --> Yes
- What dataset did you use? --> Custom, under
../data/images_collection/
Environment
- Please run `python mmgen/utils/collect_env.py` to collect necessary environment information and paste it here.
/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py:125: RuntimeWarning: 'mmgen.utils.collect_env' found in sys.modules after import of package 'mmgen.utils', but prior to execution of 'mmgen.utils.collect_env'; this may result in unpredictable behaviour
warn(RuntimeWarning(msg))
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
sys.platform: linux
Python: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) [GCC 9.4.0]
CUDA available: True
CUDA_HOME: /usr
NVCC: Cuda compilation tools, release 9.1, V9.1.85
GPU 0: TITAN Xp
GPU 1: GeForce GTX 1080 Ti
GCC: gcc (Ubuntu 5.5.0-12ubuntu1) 5.5.0 20171010
PyTorch: 1.10.2
PyTorch compiling details: PyTorch built with:
- GCC 7.3
- C++ Version: 201402
- Intel(R) oneAPI Math Kernel Library Version 2022.0-Product Build 20211112 for Intel(R) 64 architecture applications
- Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)
- OpenMP 201511 (a.k.a. OpenMP 4.5)
- LAPACK is enabled (usually provided by MKL)
- NNPACK is enabled
- CPU capability usage: AVX512
- CUDA Runtime 10.2
- NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
- CuDNN 7.6.5
- Magma 2.5.2
- Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=10.2, CUDNN_VERSION=7.6.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.2, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON,
TorchVision: 0.11.3
OpenCV: 4.5.5
MMCV: 1.4.4
MMGen: 0.5.0+
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: 10.2
- You may add additional information that may be helpful for locating the problem, such as
- How you installed PyTorch [e.g., pip, conda, source] --> conda
- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) --> N/A
Error traceback
If applicable, paste the error traceback here.
[...]
2022-02-03 10:34:04,432 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:34:04,433 - mmgen - INFO - Sample 1000 fake images for evaluation
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1000/1000, 19.6 task/s, elapsed: 51s, ETA: 0s
2022-02-03 10:34:59,248 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:34:59,249 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:34:59,251 - mmgen - INFO - Iter(val) [2800] fid: 487.0529, fid_mean: 355.8218994140625, fid_cov: 131.2310
2022-02-03 10:38:12,054 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:38:12,054 - mmgen - INFO - Switch to evaluation style mode: single
2022-02-03 10:38:12,055 - mmgen - INFO - Sample 1000 fake images for evaluation
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1000/1000, 19.4 task/s, elapsed: 51s, ETA: 0s
2022-02-03 10:39:07,205 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:39:07,206 - mmgen - INFO - Switch to train style mode: mix
2022-02-03 10:39:07,207 - mmgen - INFO - Exp name: stylegan.py
2022-02-03 10:39:07,207 - mmgen - INFO - Iter(val) [3000] fid: 541.2982, fid_mean: 405.7856750488281, fid_cov: 135.5125
Traceback (most recent call last):
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/path/to/project/MMGeneration/tools/train.py", line 204, in <module>
main()
File "/path/to/project/MMGeneration/tools/train.py", line 200, in main
meta=meta)
File "/path/to/project/MMGeneration/mmgen/apis/train.py", line 199, in train_model
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 285, in run
iter_runner(iter_loaders[i], **kwargs)
File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 197, in train
data_batch = next(self.data_loader)
File "/path/to/project/MMGeneration/mmgen/core/runners/dynamic_iterbased_runner.py", line 103, in __next__
data = next(self.iter_loader)
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1203, in _next_data
return self._process_data(data)
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1229, in _process_data
data.reraise()
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/path/to/miniconda/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/path/to/project/MMGeneration/mmgen/datasets/dataset_wrappers.py", line 31, in __getitem__
return self.dataset[idx % self._ori_len]
File "/path/to/project/MMGeneration/mmgen/datasets/unconditional_image_dataset.py", line 74, in __getitem__
return self.prepare_train_data(idx)
File "/path/to/project/MMGeneration/mmgen/datasets/unconditional_image_dataset.py", line 55, in prepare_train_data
return self.pipeline(results)
File "/path/to/project/MMGeneration/mmgen/datasets/pipelines/compose.py", line 57, in __call__
data = t(data)
File "/path/to/project/MMGeneration/mmgen/datasets/pipelines/loading.py", line 67, in __call__
results[f'{self.key}_ori_shape'] = img.shape
AttributeError: 'NoneType' object has no attribute 'shape'
Bug fix
If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
Reactions are currently unavailable