Commit 1fb26f9
Merge branch 'rnnt-hybrid-export' of https://github.com/trias702/NeMo into rnnt-hybrid-export
2 parents: 478945d + d75784f
File tree: 7 files changed, +29 -12 lines

docs/source/asr/models.rst (2 additions, 2 deletions)

@@ -218,7 +218,7 @@ You may find FastConformer variants of cache-aware streaming models under ``<NeM
 Note cache-aware streaming models are being exported without caching support by default.
 To include caching support, `model.set_export_config({'cache_support' : 'True'})` should be called before export.
 Or, if ``<NeMo_git_root>/scripts/export.py`` is being used:
-`python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --config cache_support=True`
+`python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --export-config cache_support=True`

 .. _LSTM-Transducer_model:

@@ -299,7 +299,7 @@ Similar example configs for FastConformer variants of Hybrid models can be found
 Note Hybrid models are being exported as RNNT (encoder and decoder+joint parts) by default.
 To export as CTC (single encoder+decoder graph), `model.set_export_config({'decoder_type' : 'ctc'})` should be called before export.
 Or, if ``<NeMo_git_root>/scripts/export.py`` is being used:
-`python export.py hybrid_transducer.nemo hybrid_transducer.onnx --config decoder_type=ctc`
+`python export.py hybrid_transducer.nemo hybrid_transducer.onnx --export-config decoder_type=ctc`

 .. _Conformer-HAT_model:

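The `set_export_config` calls in the docs hunks can be pictured with a minimal stand-in. The `_ExportableStub` class and its merge-only behavior below are assumptions for illustration only, not NeMo's actual implementation (the real method validates keys per model class):

```python
# Minimal stand-in for a NeMo exportable model; this stub simply merges
# the supplied dict into export_config, which is enough to show the flow.
class _ExportableStub:
    def __init__(self):
        self.export_config = {}

    def set_export_config(self, args):
        # The real NeMo method validates and coerces keys; this stub just merges.
        self.export_config.update(args)

# As in the docs above, option values are passed as strings before export:
asr_model = _ExportableStub()
asr_model.set_export_config({'cache_support': 'True'})

hybrid_model = _ExportableStub()
hybrid_model.set_export_config({'decoder_type': 'ctc'})
```

Note that both documented examples pass string values (`'True'`, `'ctc'`); any interpretation of those strings happens inside each model's own `set_export_config`.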
docs/source/core/export.rst (1 addition, 1 deletion)

@@ -207,7 +207,7 @@ An example can be found in ``<NeMo_git_root>/nemo/collections/asr/models/rnnt_mo
 Here is example on now `set_export_config()` call is being tied to command line arguments in ``<NeMo_git_root>/scripts/export.py`` :

 .. code-block:: Python
-    python scripts/export.py hybrid_conformer.nemo hybrid_conformer.onnx --config decoder_type=ctc
+    python scripts/export.py hybrid_conformer.nemo hybrid_conformer.onnx --export-config decoder_type=ctc

 Exportable Model Code
 ~~~~~~~~~~~~~~~~~~~~~

nemo/collections/tts/models/base.py (12 additions, 0 deletions)

@@ -68,6 +68,18 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]':
         list_of_models.extend(subclass_models)
         return list_of_models

+    def set_export_config(self, args):
+        for k in ['enable_volume', 'enable_ragged_batches']:
+            if k in args:
+                self.export_config[k] = bool(args[k])
+                args.pop(k)
+        if 'num_speakers' in args:
+            self.export_config['num_speakers'] = int(args['num_speakers'])
+            args.pop('num_speakers')
+        if 'emb_range' in args:
+            raise Exception('embedding range is not user-settable')
+        super().set_export_config(args)
+

 class Vocoder(ModelPT, ABC):
     """
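The override added to the TTS base model can be exercised in isolation. The class name and the omission of the `super()` delegation below are simplifications for illustration; only the coercion logic mirrors the diff:

```python
# Standalone sketch of the set_export_config logic added above.
# SpectrogramGeneratorSketch is a hypothetical stand-in, not the real class.
class SpectrogramGeneratorSketch:
    def __init__(self):
        self.export_config = {}

    def set_export_config(self, args):
        # Boolean-style toggles are coerced with bool()
        for k in ['enable_volume', 'enable_ragged_batches']:
            if k in args:
                self.export_config[k] = bool(args[k])
                args.pop(k)
        # num_speakers is coerced to int
        if 'num_speakers' in args:
            self.export_config['num_speakers'] = int(args['num_speakers'])
            args.pop('num_speakers')
        # emb_range is rejected outright
        if 'emb_range' in args:
            raise Exception('embedding range is not user-settable')
        # (the real method then delegates remaining keys to super())

model = SpectrogramGeneratorSketch()
model.set_export_config({'enable_volume': 'True', 'num_speakers': '2'})
```

One behavior worth noting: `bool()` on any non-empty string is `True`, so passing the string `'False'` would still enable a toggle here; these keys effectively act as presence flags.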

scripts/export.py (9 additions, 3 deletions)

@@ -63,7 +63,7 @@ def get_args(argv):
     parser.add_argument("--device", default="cuda", help="Device to export for")
     parser.add_argument("--check-tolerance", type=float, default=0.01, help="tolerance for verification")
     parser.add_argument(
-        "--config",
+        "--export-config",
         metavar="KEY=VALUE",
         nargs='+',
         help="Set a number of key-value pairs to model.export_config dictionary "

@@ -142,8 +142,14 @@ def nemo_export(argv):
     if args.cache_support:
         model.set_export_config({"cache_support": "True"})

-    if args.config:
-        kv = dict(map(lambda s: s.split('='), args.config))
+    if args.export_config:
+        kv = {}
+        for key_value in args.export_config:
+            lst = key_value.split("=")
+            if len(lst) != 2:
+                raise Exception("Use correct format for --export_config: k=v")
+            k, v = lst
+            kv[k] = v
         model.set_export_config(kv)

     autocast = nullcontext
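The new KEY=VALUE handling can be sketched as a standalone helper; `parse_export_config` is an illustrative name, not a function in the script, but its body mirrors the loop in the diff:

```python
def parse_export_config(pairs):
    """Parse --export-config KEY=VALUE arguments into a dict,
    mirroring the validation added in scripts/export.py."""
    kv = {}
    for key_value in pairs:
        lst = key_value.split("=")
        if len(lst) != 2:
            # Malformed pairs (no '=', or more than one) are rejected explicitly
            raise Exception("Use correct format for --export_config: k=v")
        k, v = lst
        kv[k] = v
    return kv

config = parse_export_config(["decoder_type=ctc", "cache_support=True"])
```

Compared with the old one-liner `dict(map(lambda s: s.split('='), args.config))`, the explicit loop reports a clear error for malformed pairs instead of failing with an opaque unpacking or dict-construction error.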

tests/collections/tts/test_tts_exportables.py (1 addition, 2 deletions)

@@ -54,8 +54,7 @@ def radtts_model():
     model = RadTTSModel(cfg=cfg.model)
     app_state.is_model_being_restored = False
     model.eval()
-    model.export_config['enable_ragged_batches'] = True
-    model.export_config['enable_volume'] = True
+    model.set_export_config({'enable_ragged_batches': 'True', 'enable_volume': 'True'})
     return model

tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb (1 addition, 1 deletion)

@@ -389,7 +389,7 @@
   "source": [
    "# Further Reading\n",
    "\n",
-   "There are two ways to incorporate VAD into ASR pipeline. The first strategy is to drop the frames that are predicted as `non-speech` by VAD, as already discussed in this tutorial. The second strategy is to keep all the frames and mask the `non-speech` frames with zero-signal values. Also, instead of using segment-VAD as shown in this tutorial, we can use frame-VAD model for faster inference and better accuracy. For more information, please refer to the script [speech_to_text_with_vad.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr_vad/speech_to_text_with_vad.py)."
+   "There are two ways to incorporate VAD into ASR pipeline. The first strategy is to drop the frames that are predicted as `non-speech` by VAD, as already discussed in this tutorial. The second strategy is to keep all the frames and mask the `non-speech` frames with zero-signal values. Also, instead of using segment-VAD as shown in this tutorial, we can use frame-VAD model for faster inference and better accuracy. For more information, please refer to the script [speech_to_text_with_vad.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/asr_vad/speech_to_text_with_vad.py)."
   ]
  }
 ],

tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb (3 additions, 3 deletions)

@@ -85,7 +85,7 @@
    "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n",
    "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n",
    "GITHUB_ACCOUNT = \"NVIDIA\"\n",
-   "BRANCH = \"main\"\n",
+   "BRANCH = 'main'\n",
    "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n",
    "\n",
    "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n",

@@ -536,7 +536,7 @@
   "id": "b1K6paeee2Iu"
  },
  "source": [
-  "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occured in given corpus of abstracts.\n",
+  "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in given corpus of abstracts.\n",
   "\n",
   "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-\n",
   "gram mappings (see Index of custom vocabulary in Fig. 1)\n",

@@ -1273,7 +1273,7 @@
   "### Filtering by Dynamic Programming(DP) score\n",
   "\n",
   "What else can be done?\n",
-  "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probalities of matching n-grams along this path.\n",
+  "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probabilities of matching n-grams along this path.\n",
   "Let's look at an example. "
