Skip to content

Commit 3a0f680

Browse files
authored
[TTS] Add output audio format to preprocessing (NVIDIA-NeMo#6889)
* [TTS] Add output audio format to preprocessing

  Signed-off-by: Ryan <[email protected]>

* [TTS] Add format validation

  Signed-off-by: Ryan <[email protected]>

* [TTS] Fix data tutorial

  Signed-off-by: Ryan <[email protected]>

---------

Signed-off-by: Ryan <[email protected]>
1 parent ab5ebaa commit 3a0f680

File tree

2 files changed

+27
-1
lines changed

2 files changed

+27
-1
lines changed

scripts/dataset_processing/tts/preprocess_audio.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
--num_workers=1 \
3333
--trim_config_path="<nemo_root_path>/examples/tts/conf/trim/energy.yaml" \
3434
--output_sample_rate=22050 \
35+
--output_format=flac \
3536
--volume_level=0.95 \
3637
--min_duration=0.5 \
3738
--max_duration=20.0 \
@@ -97,6 +98,12 @@ def get_args():
9798
parser.add_argument(
9899
"--output_sample_rate", default=0, type=int, help="If provided, rate to resample the audio to."
99100
)
101+
parser.add_argument(
102+
"--output_format",
103+
default="wav",
104+
type=str,
105+
help="If provided, format output audio will be saved as. If not provided, will keep original format.",
106+
)
100107
parser.add_argument(
101108
"--volume_level", default=0.0, type=float, help="If provided, peak volume to normalize audio to."
102109
)
@@ -123,12 +130,18 @@ def _process_entry(
123130
overwrite_audio: bool,
124131
audio_trimmer: AudioTrimmer,
125132
output_sample_rate: int,
133+
output_format: str,
126134
volume_level: float,
127135
) -> Tuple[dict, float, float]:
128136
audio_filepath = Path(entry["audio_filepath"])
129137

130138
audio_path, audio_path_rel = get_abs_rel_paths(input_path=audio_filepath, base_path=input_audio_dir)
139+
140+
if not output_format:
141+
output_format = audio_path.suffix
142+
131143
output_path = output_audio_dir / audio_path_rel
144+
output_path = output_path.with_suffix(output_format)
132145
output_path.parent.mkdir(exist_ok=True, parents=True)
133146

134147
if output_path.exists() and not overwrite_audio:
@@ -159,6 +172,9 @@ def _process_entry(
159172

160173
if os.path.isabs(audio_filepath):
161174
entry["audio_filepath"] = str(output_path)
175+
else:
176+
output_filepath = audio_path_rel.with_suffix(output_format)
177+
entry["audio_filepath"] = str(output_filepath)
162178

163179
return entry, original_duration, output_duration
164180

@@ -175,6 +191,7 @@ def main():
175191
num_workers = args.num_workers
176192
max_entries = args.max_entries
177193
output_sample_rate = args.output_sample_rate
194+
output_format = args.output_format
178195
volume_level = args.volume_level
179196
min_duration = args.min_duration
180197
max_duration = args.max_duration
@@ -192,6 +209,11 @@ def main():
192209
else:
193210
audio_trimmer = None
194211

212+
if output_format:
213+
if output_format.upper() not in sf.available_formats():
214+
raise ValueError(f"Unsupported output audio format: {output_format}")
215+
output_format = f".{output_format}"
216+
195217
output_audio_dir.mkdir(exist_ok=True, parents=True)
196218

197219
entries = read_manifest(input_manifest_path)
@@ -207,6 +229,7 @@ def main():
207229
overwrite_audio=overwrite_audio,
208230
audio_trimmer=audio_trimmer,
209231
output_sample_rate=output_sample_rate,
232+
output_format=output_format,
210233
volume_level=volume_level,
211234
)
212235
for entry in tqdm(entries)

tutorials/tts/FastPitch_Data_Preparation.ipynb

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -452,6 +452,8 @@
452452
"num_workers = 4\n",
453453
"# Downsample data from 48khz to 44.1khz for compatibility\n",
454454
"output_sample_rate = 44100\n",
455+
"# Format of output audio files. Use \"flac\" to compress to a smaller file size.\n",
456+
"output_format = \"flac\"\n",
455457
"# Method for silence trimming. Can use \"energy.yaml\" or \"vad.yaml\".\n",
456458
"# We use VAD for VCTK because the audio has background noise.\n",
457459
"trim_config_path = NEMO_CONFIG_DIR / \"trim\" / \"vad.yaml\"\n",
@@ -475,6 +477,7 @@
475477
" f\"--output_audio_dir={output_audio_dir}\",\n",
476478
" f\"--num_workers={num_workers}\",\n",
477479
" f\"--output_sample_rate={output_sample_rate}\",\n",
480+
" f\"--output_format={output_format}\",\n",
478481
" f\"--trim_config_path={trim_config_path}\",\n",
479482
" f\"--volume_level={volume_level}\",\n",
480483
" f\"--min_duration={min_duration}\",\n",
@@ -532,7 +535,7 @@
532535
"source": [
533536
"audio_file = \"p228_009.wav\"\n",
534537
"audio_filepath = input_audio_dir / audio_file\n",
535-
"processed_audio_filepath = output_audio_dir / audio_file\n",
538+
"processed_audio_filepath = output_audio_dir / audio_file.replace(\".wav\", \".flac\")\n",
536539
"\n",
537540
"print(\"Original audio.\")\n",
538541
"ipd.display(ipd.Audio(audio_filepath))\n",

0 commit comments

Comments
 (0)