
Commit 21bd6eb

Add UViM project (+misc. changes)
Co-authored-by: André Susano Pinto <[email protected]>
1 parent 6ff6d08 commit 21bd6eb


51 files changed: +15447 −91 lines

README.md

Lines changed: 3 additions & 2 deletions
@@ -45,13 +45,14 @@ codebase:
   Resources: [config](big_vision/configs/vit_s16_i1k.py)
 - [UViM: A Unified Modeling Approach for Vision with Learned Guiding Codes](https://arxiv.org/abs/2205.10337), by
   Alexander Kolesnikov^*, André Susano Pinto^*, Lucas Beyer*, Xiaohua Zhai*, Jeremiah Harmsen*, Neil Houlsby*
+  Resources: [readme](big_vision/configs/proj/uvim/README.md) [configs](big_vision/configs/proj/uvim), [colabs](big_vision/configs/proj/uvim).

 ### Multimodal research

 - [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991), by
   Xiaohua Zhai*, Xiao Wang*, Basil Mustafa*, Andreas Steiner*, Daniel Keysers,
   Alexander Kolesnikov, and Lucas Beyer*\
-  Resources: [trainer](trainers/proj/image_text/contrastive.py), [config](configs/proj/image_text/lit_coco.py), [colab](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/lit.ipynb).
+  Resources: [trainer](big_vision/trainers/proj/image_text/contrastive.py), [config](big_vision/configs/proj/image_text/lit_coco.py), [colab](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/lit.ipynb).

 ### Knowledge distillation

@@ -114,12 +115,12 @@ We have since added the following key features and projects:
 - Patient and consistent distillation.
 - Scaling ViT.
 - MLP-Mixer.
+- UViM.

 Features and projects we plan to release in the near future, in no particular
 order:
 - ImageNet-21k in TFDS.
 - Loading misc public models used in our publications (NFNet, MoCov3, DINO).
-- UViM.
 - Memory-efficient Polyak-averaging implementation.
 - Advanced JAX compute and memory profiling. We are using internal tools for
   this, but may eventually add support for the publicly available ones.

big_vision/configs/bit_i1k.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def get_config(runlocal=False):

   config.seed = 0
   config.batch_size = 4096 if not runlocal else 32
-  config.num_epochs = 90
+  config.total_epochs = 90

   pp_common = '|onehot(1000, key="{lbl}", key_result="labels")'
   pp_common += '|value_range(-1, 1)|keep("image", "labels")'

big_vision/configs/bit_i21k.py

Lines changed: 9 additions & 5 deletions
@@ -36,11 +36,14 @@ def get_config():

   config.trial = 0
   config.batch_size = 4096
-  config.num_epochs = 90
+  config.total_epochs = 90

-  pp_common = f'|value_range(-1, 1)|onehot({config.num_classes})'
-  config.pp_train = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common
-  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common
+  pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
+  pp_common_i21k = pp_common.format(onehot_args=f'{config.num_classes}')
+  pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')
+  config.pp_train = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common_i21k
+  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common_i21k
+  pp_eval_i1k = 'decode|resize_small(256)|central_crop(224)' + pp_common_i1k
   config.shuffle_buffer_size = 250_000 # Per host, so small-ish is ok.

   config.log_training_steps = 50

@@ -63,7 +66,6 @@ def get_config():
   eval_common = dict(
       type='classification',
       dataset=config.dataset,
-      data_dir=config.dataset_dir,
       pp_fn=pp_eval,
       loss_name=config.loss,
       log_steps=1000, # Very fast O(seconds) so it's fine to run it often.

@@ -72,6 +74,8 @@ def get_config():
   config.evals.test = {**eval_common, 'split': 'full[:25_600]'}
   config.evals.val = {**eval_common, 'split': 'full[25_600:51_200]'}
   config.evals.train = {**eval_common, 'split': 'full[51_200:76_800]'}
+
+  # Few-shot evaluators
   config.evals.fewshot = get_fewshot_lsr()
   config.evals.fewshot.log_steps = 25_000
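
To make the new templating concrete, here is a short sketch of what the two preprocessing strings expand to. The ImageNet-21k class count below is an assumption for illustration only; in the config it comes from `config.num_classes`, which is set outside this hunk.

```
# Sketch only: expand the templated pp strings from the hunk above.
pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'

num_classes = 21843  # assumed ImageNet-21k class count; defined elsewhere in the config
pp_common_i21k = pp_common.format(onehot_args=f'{num_classes}')
pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')

print(pp_common_i21k)
# |value_range(-1, 1)|onehot(21843)|keep("image", "labels")
print(pp_common_i1k)
# |value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")
```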

big_vision/configs/common.py

Lines changed: 5 additions & 0 deletions
@@ -117,3 +117,8 @@ def autotype(x):
     return float(x) # Returns as float.
   except ValueError:
     return x # Returns as str.
+
+
+def pack_arg(**kw):
+  """Packs key-word args as a string to be parsed by `parse_arg()`."""
+  return ','.join([f'{k}={v}' for k, v in kw.items()])
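
A quick illustration of the new helper (the keyword names are made up; `parse_arg` is the existing counterpart in the same module):

```
import big_vision.configs.common as bvcc

# pack_arg joins keyword arguments into the comma-separated "k=v" form that
# config files later unpack via bvcc.parse_arg(arg, <defaults>).
arg_str = bvcc.pack_arg(singlehost=True, res=512)
assert arg_str == 'singlehost=True,res=512'
```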

big_vision/configs/mlp_mixer_i1k.py

Lines changed: 3 additions & 3 deletions
@@ -55,7 +55,7 @@ def get_config(mode=None):
   )

   config.batch_size = 4096
-  config.num_epochs = 300
+  config.total_epochs = 300

   config.shuffle_buffer_size = 250_000 # Per host, so small-ish is ok.


@@ -107,10 +107,10 @@ def get_config(mode=None):
   config.fewshot = get_fewshot_lsr()

   if mode == 'gpu8':
-    config.num_epochs = 60
+    config.total_epochs = 60
     config.batch_size = 512
     config.cache_raw = False
   if mode == 'regression_test':
-    config.num_epochs = 60
+    config.total_epochs = 60

   return config

big_vision/configs/proj/uvim/README.md

Lines changed: 84 additions & 0 deletions

# UViM: A Unified Modeling Approach for Vision with Learned Guiding Codes

*by Alexander Kolesnikov, André Susano Pinto, Lucas Beyer, Xiaohua Zhai, Jeremiah Harmsen, Neil Houlsby*

We provide pretrained UViM models from the [original paper](https://arxiv.org/abs/2205.10337),
as well as instructions on how to reproduce the core paper experiments.

## Pretrained models

The table below contains UViM models (stage I and II) trained for three
different tasks: panoptic segmentation, colorization and depth prediction.

| task | model | dataset | accuracy | download link |
| --------------------- | ------------------- | ------- | ---------- | ------------- |
| Panoptic segmentation | UViM Stage I model | [COCO(2017)](https://cocodataset.org/#home) | 75.8 PQ | [link](https://storage.googleapis.com/big_vision/uvim/panoptic_stageI_params.npz) |
| Panoptic segmentation | UViM Stage II model | [COCO(2017)](https://cocodataset.org/#home) | 43.1 PQ | [link](https://storage.googleapis.com/big_vision/uvim/panoptic_stageII_params.npz) |
| Colorization | UViM Stage I model | [ILSVRC-2012](https://www.image-net.org/) | 15.59 FID | [link](https://storage.googleapis.com/big_vision/uvim/color_stageI_params.npz) |
| Colorization | UViM Stage II model | [ILSVRC-2012](https://www.image-net.org/) | 16.99 FID | [link](https://storage.googleapis.com/big_vision/uvim/color_stageII_params.npz) |
| Depth | UViM Stage I model | [NYU Depth V2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html) | 0.155 RMSE | [link](https://storage.googleapis.com/big_vision/uvim/depth_stageI_params.npz) |
| Depth | UViM Stage II model | [NYU Depth V2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html) | 0.463 RMSE | [link](https://storage.googleapis.com/big_vision/uvim/depth_stageII_params.npz) |

All of these models can be interactively explored in our [colabs](configs/proj/uvim).
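
The checkpoints linked above are `.npz` parameter archives. As a minimal inspection sketch, assuming standard NumPy serialization (big_vision has its own model-loading utilities, which the colabs use):

```
# Sketch only: fetch a checkpoint and list a few parameter arrays.
import numpy as np
import urllib.request

url = 'https://storage.googleapis.com/big_vision/uvim/panoptic_stageI_params.npz'
urllib.request.urlretrieve(url, 'panoptic_stageI_params.npz')

params = np.load('panoptic_stageI_params.npz')
for name in list(params.files)[:10]:
  print(name, params[name].shape)
```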
## Running on a single-host TPU machine

Below we provide instructions on how to run UViM training (stage I and
stage II) using a single TPU host with 8 TPU accelerators. These instructions
can be easily adapted to a GPU host and multi-host TPU setup; see the main
`big_vision` [README file](README.md).

We assume that the user has already created and `ssh`-ed to the TPU host
machine. The next step is to clone the `big_vision` repository:
`git clone https://github.com/google-research/big_vision.git`.

The next steps are to create a Python virtual environment and install the
Python dependencies:
```
virtualenv bv
source bv/bin/activate
cd big_vision/
pip3 install --upgrade pip
pip3 install -r big_vision/requirements.txt
pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
```

After this, invoke the helper tool to download and prepare data:
`python3 -m big_vision.tools.download_tfds_datasets coco/2017_panoptic nyu_depth_v2`.
For preparing the ImageNet dataset, consult the main codebase README.

> :warning: TPU machines have 100 GB of disk space. It may not be enough to
> store all training data (though only panoptic or only depth data may fit).
> Consider preparing the data on a separate machine and then copying it to
> the TPU machine's extra persistent disk or to a Google Cloud Bucket. See the
> instructions for [creating an extra persistent disk](https://cloud.google.com/tpu/docs/users-guide-tpu-vm).
> Remember to set the correct data home directory, e.g. `export DISK=/mnt/disk/persist; export TFDS_DATA_DIR=$DISK/tensorflow_datasets`.

Our panoptic evaluator uses the raw variant of the COCO data, so we move it into a
separate folder. Note that `tfds` has already pre-downloaded the panoptic data,
except for one small json file that we fetch manually:
```
mkdir $DISK/coco_data
cd $DISK/coco_data
mv $TFDS_DATA_DIR/downloads/extracted/ZIP.image.cocod.org_annot_panop_annot_train<REPLACE_ME_WITH_THE_HASH_CODE>.zip/annotations/* .
wget https://raw.githubusercontent.com/cocodataset/panopticapi/master/panoptic_coco_categories.json
export COCO_DATA_DIR=$DISK/coco_data
```

For the FID evaluator, which is used for the colorization model, set the path to the
directory with the image id files, e.g.
`export FID_DATA_DIR=<ROOT>/big_vision/evaluators/proj/uvim/coltran_fid_data`.

As an example, stage I panoptic training can be invoked as follows (note the
`:singlehost` config parameter, which selects a lightweight configuration
suitable for a single host):
```
python3 -m big_vision.trainers.proj.uvim.vqvae --config big_vision/configs/proj/uvim/vqvae_coco_panoptic.py:singlehost --workdir workdirs/`date '+%m-%d_%H%M'`
```
or stage II training:
```
python3 -m big_vision.trainers.proj.uvim.train --config big_vision/configs/proj/uvim/train_coco_panoptic_pretrained.py:singlehost --workdir workdirs/`date '+%m-%d_%H%M'`
```

## Acknowledgments
The sampling code in the `models/proj/uvim/decode.py` module is based on contributions
from Anselm Levskaya, Ilya Tolstikhin and Maxim Neumann.

big_vision/configs/proj/uvim/train_coco_panoptic_pretrained.py

Lines changed: 166 additions & 0 deletions

# Copyright 2022 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""A config for training a UViM stage II model for the panoptic task.

This config is expected to reproduce the paper's result and achieve
approximately 43.7 PQ points on the COCO holdout data.

We also provide a low-resource variant of this config, which can be enabled
by adding the `:singlehost` postfix to the config name. This one is expected
to achieve 39.4 PQ points on the COCO holdout data.
"""

import big_vision.configs.common as bvcc
from ml_collections import ConfigDict

VTT_MODELS = {
    'base': dict(num_layers=12, num_heads=12, mlp_dim=3072, emb_dim=768),
    'large': dict(num_layers=24, num_heads=16, mlp_dim=4096, emb_dim=1024),
}

VQVAE_MODELS = {
    'base': dict(enc_depth=6, dec_depth=12, num_heads=12, mlp_dim=3072, width=768),
}

RES = 512
PATCH_SIZE = 16
LABEL_RES = 512
LABEL_PATCH_SIZE = 16


def get_config(arg=''):
  """Config for training."""
  arg = bvcc.parse_arg(arg, runlocal=False, singlehost=False)
  config = ConfigDict()

  config.pp_train = (
      f'decode|coco_panoptic|concat(["semantics","instances"], "labels")|'
      f'randu("fliplr")|det_fliplr(key="image")|det_fliplr(key="labels")|'
      f'inception_box|crop_box(key="image")|crop_box(key="labels")|'
      f'resize({LABEL_RES}, inkey="image", outkey="image_ctx")|'
      f'resize({RES})|resize({LABEL_RES},key="labels",method="nearest")|'
      f'value_range(-1, 1, key="image_ctx")|'
      f'value_range(-1, 1)|make_canonical|keep("image","image_ctx","labels")'
  )
  pp_eval = (
      f'decode|coco_panoptic|concat(["semantics","instances"], "labels")|'
      f'resize({LABEL_RES}, inkey="image", outkey="image_ctx")|'
      f'resize({RES})|resize({LABEL_RES},key="labels",method="nearest")|'
      f'value_range(-1, 1, key="image_ctx")|'
      f'value_range(-1, 1)|make_canonical|keep("image","image_ctx","labels")'
  )
  pp_predict = (
      f'resize({LABEL_RES}, inkey="image", outkey="image_ctx")|resize({RES})|'
      f'value_range(-1, 1, key="image_ctx")|value_range(-1, 1)|'
      f'keep("image","image_ctx","image/id")'  # image/id used for rng seeds.
  )

  config.dataset = 'coco/2017_panoptic'
  config.train_split = 'train[4096:]'

  config.batch_size = 512
  config.total_epochs = 200

  config.log_training_steps = 50
  config.shuffle_buffer_size = 50_000
  config.ckpt_steps = 1000
  config.keep_ckpt_steps = 5000
  config.ckpt_timeout = 1
  config.prefetch_to_device = 2
  config.trial = 0

  # Optimizer section
  config.optax_name = 'big_vision.scale_by_adafactor'
  config.optax = dict(beta2_cap=0.95)

  config.lr = 0.001
  config.wd = 0.000001
  config.lr_mults = [
      ('pos_embedding_encoder.*', 0.1),
      ('EmbedPatches.*', 0.1),
      ('encoder.*', 0.1),
      ('decoder.*', 1.0)
  ]
  config.schedule = dict(decay_type='cosine', warmup_steps=4_000)

  # Oracle section
  config.oracle = ConfigDict()
  config.oracle.task = 'proj.uvim.panoptic_task'
  config.oracle.model_init = 'gs://big_vision/uvim/panoptic_stageI_params.npz'
  config.oracle.model_name = 'proj.uvim.vit'
  config.oracle.model = ConfigDict(VQVAE_MODELS['base'])
  config.oracle.model.input_size = (LABEL_RES, LABEL_RES)
  config.oracle.model.patch_size = (LABEL_PATCH_SIZE, LABEL_PATCH_SIZE)
  config.oracle.model.code_len = 256
  config.oracle.model.dict_size = 4096
  config.oracle.model.codeword_dim = 768
  config.oracle.model.with_encoder_ctx = True
  config.oracle.model.with_decoder_ctx = True
  config.oracle.model.code_dropout = 'random'
  config.oracle.model.bottleneck_resize = True
  config.oracle.model.inputs = {
      'semantics': (133 + 1, LABEL_PATCH_SIZE**2),  # +1 for void label
      'instances': (100, LABEL_PATCH_SIZE**2),  # COCO: actually 98 train/78 validation.
  }
  config.oracle.model.outputs = config.oracle.model.inputs

  # Model section
  config.model_name = 'proj.uvim.vtt'
  # config.model_init = {'encoder': 'howto-i21k-B/8'}
  config.model_init = {'encoder': 'howto-i21k-L/16'}
  config.model = ConfigDict(VTT_MODELS['large'])
  config.model.patches = ConfigDict({'size': (PATCH_SIZE, PATCH_SIZE)})
  config.model.vocab_size = config.oracle.model.get_ref('dict_size') + 1
  config.model.posemb_type = 'learn'
  config.model.input_size = (RES, RES)
  config.model.seq_len = config.oracle.model.get_ref('code_len')

  # Evaluation section
  config.evals = {}
  config.evals.val = ConfigDict()
  config.evals.val.type = 'proj.uvim.compute_mean'
  config.evals.val.pred = 'validation'
  config.evals.val.dataset = config.dataset
  config.evals.val.split = 'train[:4096]'
  config.evals.val.pp_fn = pp_eval
  config.evals.val.log_steps = 1000

  base = {
      'type': 'proj.uvim.coco_panoptic',
      'pp_fn': pp_predict,
      'log_steps': 10_000,
      # Filters objects that occupy less than 0.03^2 fraction of all pixels.
      # 'predict_kwargs': {'min_fraction': 0.03 ** 2},
  }
  config.evals.coco_panoptic_train = dict(**base, split='train[4096:8192]')
  config.evals.coco_panoptic_holdout = dict(**base, split='train[:4096]')
  config.evals.coco_panoptic = dict(**base, split='validation')

  # config.evals.save_pred = dict(type='proj.uvim.save_predictions')
  # config.evals.save_pred.pp = pp_eval.replace('decode|', '')
  # config.evals.save_pred.log_steps = 100_000
  # config.evals.save_pred.dataset = config.dataset
  # config.evals.save_pred.split = 'validation[:1024]'
  # config.evals.save_pred.outfile = 'inference.npz'

  if arg.singlehost:
    config.batch_size = 32
    config.total_epochs = 50
  elif arg.runlocal:
    config.batch_size = 4
    config.shuffle_buffer_size = 10
    config.evals.val.split = 'train[:16]'
  return config
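
To connect the `:singlehost` postfix from the docstring to the block above, here is a rough usage sketch. The module import path and the exact override string accepted by `parse_arg` are assumptions based on this commit's README and the `pack_arg`/`parse_arg` helpers, not verified API:

```
# Hypothetical sketch of how the low-resource variant is selected.
# The launcher's `--config <path>.py:singlehost` suffix is passed to get_config()
# as the `arg` string, which bvcc.parse_arg() turns into attribute-style flags.
from big_vision.configs.proj.uvim import train_coco_panoptic_pretrained  # path assumed

config = train_coco_panoptic_pretrained.get_config('singlehost=True')
print(config.batch_size)    # expected 32 with the single-host overrides applied
print(config.total_epochs)  # expected 50
```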
