
Commit 6b188ba

Merge pull request #670 from mlcommons/dev
dev -> main
2 parents: f9791d0 + 24632ad

653 files changed: +297,414 −47 lines


CHANGELOG.md

Lines changed: 15 additions & 0 deletions
```diff
@@ -1,5 +1,20 @@
 # Change Log
 
+## algoperf-benchmark-0.1.2 (2024-03-04)
+Workload variant additions and fixes:
+- Add Deepspeech workload variant
+- Fix bugs in Imagenet ResNet, WMT and Criteo1tb variants
+
+Add prize qualification logs for external tuning ruleset.
+Note: FastMRI trials with dropout are not yet added due to https://github.com/mlcommons/algorithmic-efficiency/issues/664.
+
+Add missing functionality to Docker startup script for self_tuning ruleset.
+Add self_tuning ruleset option to script that runs all workloads for scoring.
+
+Dataset setup fixes.
+
+Fix tests that check training differences in PyTorch and JAX on GPU.
+
 ## algoperf-benchmark-0.1.1 (2024-01-19)
 Bug fixes to FastMRI metric calculation and targets.
```

GETTING_STARTED.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -381,4 +381,13 @@ python score_submissions.py --submission_directory <directory_with_submissions>
 
 We provide the scores and performance profiles for the [paper baseline algorithms](/reference_algorithms/paper_baselines/) in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179).
 
+## Package Submission for Self-Reporting
+To prepare your submission for self-reporting, run:
+
+```
+python3 package_logs.py --experiment_dir <experiment_dir> --destination_dir <destination_dir>
+```
+
+The destination directory will contain the logs packed in studies and trials required for self-reporting.
+
 **Good Luck!**
````

algorithmic_efficiency/workloads/criteo1tb/criteo1tb_jax/models.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -88,7 +88,7 @@ def scaled_init(key, shape, dtype=jnp.float_):
               stddev=jnp.sqrt(1.0 / mlp_top_dims[layer_idx])))(
                   top_mlp_input)
       x = nn.relu(x)
-      if self.dropout_rate > 0.0 and layer_idx == num_layers_top - 2:
+      if self.dropout_rate and layer_idx == num_layers_top - 2:
         x = nn.Dropout(rate=self.dropout_rate, deterministic=not train)(x)
       top_mlp_input += x
     # In the DLRM model the last layer width is always 1. We can hardcode that
```
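A note on this change: the truthiness test also handles a `None` dropout rate, which the explicit comparison does not (presumably the motivation here). In Python 3, ordering comparisons against `None` raise `TypeError`, while `if None:` and `if 0.0:` both simply skip the branch. A minimal standalone sketch:

```python
# Minimal sketch: why `if rate:` is safer than `if rate > 0.0:` when the
# rate may be None. Both None and 0.0 are falsy, so dropout is skipped.
for rate in (None, 0.0, 0.1):
    if rate:
        print(f'rate={rate}: dropout applied')
    else:
        print(f'rate={rate}: dropout skipped')

# The old comparison raises for None in Python 3:
try:
    None > 0.0
except TypeError as err:
    print(f'None > 0.0 fails: {err}')
```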

algorithmic_efficiency/workloads/criteo1tb/workload.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -97,7 +97,7 @@ def max_allowed_runtime_sec(self) -> int:
 
   @property
   def eval_period_time_sec(self) -> int:
-    return 2 * 600  # 20 mins.
+    return 2 * 60  # 2 mins.
 
   def _build_input_queue(
       self,
```
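For reference, the old value worked out to 2 × 600 = 1200 seconds (20 minutes), while the new one is 2 × 60 = 120 seconds, so the comment now matches the returned value.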

algorithmic_efficiency/workloads/fastmri/fastmri_jax/workload.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -33,8 +33,10 @@ def init_model_fn(
         use_tanh=self.use_tanh,
         use_layer_norm=self.use_layer_norm,
         dropout_rate=dropout_rate)
-
-    variables = jax.jit(self._model.init)({'params': rng}, fake_batch)
+    params_rng, dropout_rng = jax.random.split(rng)
+    variables = jax.jit(
+        self._model.init)({'params': params_rng, 'dropout': dropout_rng},
+                          fake_batch)
     params = variables['params']
     self._param_shapes = param_utils.jax_param_shapes(params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)
```
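Context for this change: Flax modules that contain `nn.Dropout` draw randomness from a separate `'dropout'` PRNG collection, so `Module.init` needs both keys when dropout is live at initialization time. A self-contained sketch of the pattern (the `MiniModel` module below is illustrative, not the workload's actual U-Net):

```python
import jax
import jax.numpy as jnp
from flax import linen as nn

class MiniModel(nn.Module):  # illustrative stand-in for the FastMRI model
  dropout_rate: float = 0.1

  @nn.compact
  def __call__(self, x, train=True):
    x = nn.Dense(16)(x)
    # Dropout pulls randomness from the 'dropout' PRNG collection.
    x = nn.Dropout(rate=self.dropout_rate, deterministic=not train)(x)
    return nn.Dense(1)(x)

rng = jax.random.PRNGKey(0)
params_rng, dropout_rng = jax.random.split(rng)
model = MiniModel()
# init needs both streams because dropout is active (train=True) at init.
variables = jax.jit(model.init)(
    {'params': params_rng, 'dropout': dropout_rng}, jnp.ones((2, 8)))
```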

algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/workload.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -327,7 +327,7 @@ def _eval_model_on_split(self,
                           global_step: int = 0) -> Dict[str, float]:
     """Run a full evaluation of the model."""
     del global_step
-    if model_state is not None:
+    if model_state is not None and len(model_state) > 0:
       # Sync batch statistics across replicas before evaluating.
       model_state = self.sync_batch_stats(model_state)
```
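Worth noting: the extra guard matters because an empty state dict is not `None`, so the old check alone would still attempt to sync batch statistics that do not exist (see the Deepspeech change below, which can now return `{}` as the model state). A two-line illustration:

```python
model_state = {}
print(model_state is not None)                            # True: old guard would sync
print(model_state is not None and len(model_state) > 0)   # False: new guard skips
```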

algorithmic_efficiency/workloads/librispeech_deepspeech/librispeech_jax/workload.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -47,7 +47,8 @@ def init_model_fn(
     variables = model_init_fn({'params': params_rng, 'dropout': dropout_rng},
                               *fake_input_batch)
 
-    model_state = variables['batch_stats']
+    model_state = variables[
+        'batch_stats'] if not self.layernorm_everywhere else {}
     params = variables['params']
     self._param_shapes = param_utils.jax_param_shapes(params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)
```
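This is where the empty model state handled by the Conformer change above originates: with `layernorm_everywhere` set, the model presumably contains no BatchNorm layers, so `variables` has no `'batch_stats'` collection to index, and an empty dict is returned instead.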

algorithmic_efficiency/workloads/wmt/wmt_jax/workload.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -121,7 +121,7 @@ def predict_step(self,
                    max_decode_len: int,
                    beam_size: int = 4) -> spec.Tensor:
     """Predict translation with fast decoding beam search on a batch."""
-    config = models.TransformerConfig(deterministic=True, decode=True)
+    config = replace(self._eval_model.config, decode=True)
     # Prepare transformer fast-decoder call for beam search: for beam search, we
     # need to set up our decoder model to handle a batch size equal to
     # batch_size * beam_size, where each batch item's data is expanded in-place
```
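Here `replace` is presumably `dataclasses.replace`: rather than constructing a fresh `models.TransformerConfig` (which would silently reset every field other than the two passed in), it copies the eval model's existing config and overrides only `decode`. A minimal sketch of the pattern, using an illustrative stand-in config:

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class TransformerConfig:  # illustrative stand-in, not the repo's class
  deterministic: bool = False
  decode: bool = False
  num_layers: int = 6

eval_config = TransformerConfig(deterministic=True, num_layers=12)
decode_config = replace(eval_config, decode=True)
assert decode_config.num_layers == 12  # non-default field is preserved
assert decode_config.deterministic     # carried over from the eval config
```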

algorithmic_efficiency/workloads/workloads.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -26,7 +26,7 @@
     },
     'criteo1tb_embed_init': {
         'workload_path': 'criteo1tb/criteo1tb',
-        'workload_class_name': 'Criteo1TbDlrmSmallEmbeddingInitWorkload'
+        'workload_class_name': 'Criteo1TbDlrmSmallEmbedInitWorkload'
    },
    'criteo1tb_resnet': {
        'workload_path': 'criteo1tb/criteo1tb',
```

datasets/dataset_setup.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -706,13 +706,13 @@ def main(_):
         'to download the FastMRI dataset.\nSign up for the URLs at '
         'https://fastmri.med.nyu.edu/.')
 
-    updated_data_dir = download_fastmri(data_dir,
-                                        knee_singlecoil_train_url,
-                                        knee_singlecoil_val_url,
-                                        knee_singlecoil_test_url)
+    download_fastmri(data_dir,
+                     knee_singlecoil_train_url,
+                     knee_singlecoil_val_url,
+                     knee_singlecoil_test_url)
 
     logging.info('fastMRI download completed. Extracting...')
-    setup_fastmri(data_dir, updated_data_dir)
+    setup_fastmri(data_dir)
 
   if FLAGS.all or FLAGS.imagenet:
     flags.mark_flag_as_required('imagenet_train_url')
```
