diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index 8bcbd70509..e92c556218 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -91,7 +91,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Build documentation"
-        # export BUILD_GALLERY=true  # TODO put this back?
+        export BUILD_GALLERY=true
         (cd docs && make html)
         echo "::endgroup::"
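Note on the flag being re-enabled above: `BUILD_GALLERY` controls whether the sphinx-gallery tutorials are actually executed during the docs build. As a rough sketch of how such an env-var switch is commonly wired into a Sphinx `conf.py` (hypothetical code, not part of this diff; `plot_gallery`, `examples_dirs`, and `gallery_dirs` are real sphinx-gallery option names, but the paths here are assumptions):

```python
# Hypothetical conf.py excerpt: gate gallery execution on an env var.
import os

run_gallery = os.getenv("BUILD_GALLERY", "false").lower() in ("1", "true")

sphinx_gallery_conf = {
    "examples_dirs": ["../../examples/tutorials"],  # assumed source location
    "gallery_dirs": ["tutorials"],                  # assumed output location
    # With plot_gallery=False, sphinx-gallery renders the tutorial source
    # without executing it, so CI skips model downloads and inference.
    "plot_gallery": run_gallery,
}
```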
diff --git a/examples/tutorials/tacotron2_pipeline_tutorial.py b/examples/tutorials/tacotron2_pipeline_tutorial.py
index 0117a04e0b..0a4386220b 100644
--- a/examples/tutorials/tacotron2_pipeline_tutorial.py
+++ b/examples/tutorials/tacotron2_pipeline_tutorial.py
@@ -61,296 +61,296 @@
 import torch
 import torchaudio
 
-torch.random.manual_seed(0)
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# torch.random.manual_seed(0)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
 
-print(torch.__version__)
-print(torchaudio.__version__)
-print(device)
+# print(torch.__version__)
+# print(torchaudio.__version__)
+# print(device)
 
-######################################################################
-#
-
-import IPython
-import matplotlib.pyplot as plt
-
-
-######################################################################
-# Text Processing
-# ---------------
-#
-
-
-######################################################################
-# Character-based encoding
-# ~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# In this section, we will go through how the character-based encoding
-# works.
-#
-# Since the pre-trained Tacotron2 model expects a specific set of
-# symbols, the same functionality is available in ``torchaudio``. However,
-# we will first manually implement the encoding to aid in understanding.
-#
-# First, we define the set of symbols
-# ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we map
-# each character of the input text into the index of the corresponding
-# symbol in the table. Symbols that are not in the table are ignored.
-
-symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
-look_up = {s: i for i, s in enumerate(symbols)}
-symbols = set(symbols)
-
-
-def text_to_sequence(text):
-    text = text.lower()
-    return [look_up[s] for s in text if s in symbols]
-
-
-text = "Hello world! Text to speech!"
-print(text_to_sequence(text))
-
-
-######################################################################
-# As mentioned above, the symbol table and indices must match
-# what the pretrained Tacotron2 model expects. ``torchaudio`` provides
-# the same transform along with the pretrained model. You can
-# instantiate and use such a transform as follows.
-#
-
-processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor()
-
-text = "Hello world! Text to speech!"
-processed, lengths = processor(text)
-
-print(processed)
-print(lengths)
-
-
-######################################################################
-# Note: The output of our manual encoding matches the ``torchaudio``
-# ``text_processor`` output (meaning we correctly re-implemented what the
-# library does internally). The processor takes either a single text or a
-# list of texts as input.
-# When a list of texts is provided, the returned ``lengths`` variable
-# represents the valid length of each processed token sequence in the
-# output batch.
-#
-# The intermediate representation can be retrieved as follows:
-#
-
-print([processor.tokens[i] for i in processed[0, : lengths[0]]])
-
-
-######################################################################
-# Phoneme-based encoding
-# ~~~~~~~~~~~~~~~~~~~~~~
-#
-# Phoneme-based encoding is similar to character-based encoding, but it
-# uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
-# model.
-#
-# The details of the G2P model are out of the scope of this tutorial;
-# we will just look at what the conversion looks like.
-#
-# Similar to the case of character-based encoding, the encoding process
-# is expected to match what a pretrained Tacotron2 model is trained on.
-# ``torchaudio`` has an interface to create the process.
-#
-# The following code illustrates how to make and use the process. Behind
-# the scenes, a G2P model is created using the ``DeepPhonemizer`` package,
-# and the pretrained weights published by the author of ``DeepPhonemizer``
-# are fetched.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-
-text = "Hello world! Text to speech!"
-with torch.inference_mode():
-    processed, lengths = processor(text)
-
-print(processed)
-print(lengths)
-
-
-######################################################################
-# Notice that the encoded values are different from the example of
-# character-based encoding.
-#
-# The intermediate representation looks like the following.
-#
+# ######################################################################
+# #
 
-print([processor.tokens[i] for i in processed[0, : lengths[0]]])
+# import IPython
+# import matplotlib.pyplot as plt
 
 
-######################################################################
-# Spectrogram Generation
-# ----------------------
-#
-# ``Tacotron2`` is the model we use to generate a spectrogram from the
-# encoded text. For the details of the model, please refer to `the
-# paper <https://arxiv.org/abs/1712.05884>`__.
-#
-# It is easy to instantiate a Tacotron2 model with pretrained weights;
-# however, note that the input to Tacotron2 models needs to be processed
-# by the matching text processor.
-#
-# :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` bundles the matching
-# models and processors together so that it is easy to create the pipeline.
-#
-# For the available bundles and their usage, please refer to
-# :py:class:`~torchaudio.pipelines.Tacotron2TTSBundle`.
-#
+# ######################################################################
+# # Text Processing
+# # ---------------
+# #
 
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-
-text = "Hello world! Text to speech!"
+# ######################################################################
+# # Character-based encoding
+# # ~~~~~~~~~~~~~~~~~~~~~~~~
+# #
+# # In this section, we will go through how the character-based encoding
+# # works.
+# #
+# # Since the pre-trained Tacotron2 model expects a specific set of
+# # symbols, the same functionality is available in ``torchaudio``. However,
+# # we will first manually implement the encoding to aid in understanding.
+# #
+# # First, we define the set of symbols
+# # ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we map
+# # each character of the input text into the index of the corresponding
+# # symbol in the table. Symbols that are not in the table are ignored.
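Side note on the encoding described above: a minimal pure-Python sketch of the same mapping applied to a batch, to make the ``lengths`` contract from the earlier note concrete. The helper below is hypothetical (not part of the diff), and padding with index 0 (``'_'``) is an assumption; the real processor returns tensors.

```python
# Hypothetical helper: encode a batch of texts with the manual table.
symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
look_up = {s: i for i, s in enumerate(symbols)}


def encode_batch(texts):
    # Characters missing from the table (e.g. digits) are simply dropped.
    seqs = [[look_up[c] for c in t.lower() if c in look_up] for t in texts]
    lengths = [len(s) for s in seqs]
    width = max(lengths)
    # Pad every row to the same width so the batch is rectangular.
    return [s + [0] * (width - len(s)) for s in seqs], lengths


batch, lengths = encode_batch(["Hello world!", "Text to speech!"])
print(lengths)  # valid (unpadded) length of each row in the batch
```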
+
+# symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
+# look_up = {s: i for i, s in enumerate(symbols)}
+# symbols = set(symbols)
+
+
+# def text_to_sequence(text):
+#     text = text.lower()
+#     return [look_up[s] for s in text if s in symbols]
+
+
+# text = "Hello world! Text to speech!"
+# print(text_to_sequence(text))
+
+
+# ######################################################################
+# # As mentioned above, the symbol table and indices must match
+# # what the pretrained Tacotron2 model expects. ``torchaudio`` provides
+# # the same transform along with the pretrained model. You can
+# # instantiate and use such a transform as follows.
+# #
 
-with torch.inference_mode():
-    processed, lengths = processor(text)
-    processed = processed.to(device)
-    lengths = lengths.to(device)
-    spec, _, _ = tacotron2.infer(processed, lengths)
+# processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor()
+
+# text = "Hello world! Text to speech!"
+# processed, lengths = processor(text)
 
-_ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+# print(processed)
+# print(lengths)
 
 
-######################################################################
-# Note that the ``Tacotron2.infer`` method performs multinomial sampling;
-# therefore, the process of generating the spectrogram involves randomness.
-#
+# ######################################################################
+# # Note: The output of our manual encoding matches the ``torchaudio``
+# # ``text_processor`` output (meaning we correctly re-implemented what the
+# # library does internally). The processor takes either a single text or a
+# # list of texts as input.
+# # When a list of texts is provided, the returned ``lengths`` variable
+# # represents the valid length of each processed token sequence in the
+# # output batch.
+# #
+# # The intermediate representation can be retrieved as follows:
+# #
 
-def plot():
-    fig, ax = plt.subplots(3, 1)
-    for i in range(3):
-        with torch.inference_mode():
-            spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-        print(spec[0].shape)
-        ax[i].imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+# print([processor.tokens[i] for i in processed[0, : lengths[0]]])
 
 
+# ######################################################################
+# # Phoneme-based encoding
+# # ~~~~~~~~~~~~~~~~~~~~~~
+# #
+# # Phoneme-based encoding is similar to character-based encoding, but it
+# # uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
+# # model.
+# #
+# # The details of the G2P model are out of the scope of this tutorial;
+# # we will just look at what the conversion looks like.
+# #
+# # Similar to the case of character-based encoding, the encoding process
+# # is expected to match what a pretrained Tacotron2 model is trained on.
+# # ``torchaudio`` has an interface to create the process.
+# #
+# # The following code illustrates how to make and use the process. Behind
+# # the scenes, a G2P model is created using the ``DeepPhonemizer`` package,
+# # and the pretrained weights published by the author of ``DeepPhonemizer``
+# # are fetched.
+# #
+
+# bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+
+# processor = bundle.get_text_processor()
 
-plot()
+
+# text = "Hello world! Text to speech!"
+# with torch.inference_mode():
+#     processed, lengths = processor(text)
+
+# print(processed)
+# print(lengths)
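For readers curious about the G2P step mentioned above, here is a standalone glimpse using ``DeepPhonemizer`` directly. This only mirrors what the bundle's text processor does internally; the checkpoint filename is an assumption for illustration, and the package must be installed separately (``pip install deep-phonemizer``).

```python
# Sketch: grapheme-to-phoneme conversion with DeepPhonemizer directly.
from dp.phonemizer import Phonemizer

# Assumed checkpoint path; DeepPhonemizer publishes pretrained forward
# models, but the exact file used by torchaudio may differ.
phonemizer = Phonemizer.from_checkpoint("en_us_cmudict_ipa_forward.pt")
print(phonemizer("Hello world! Text to speech!", lang="en_us"))
```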
+
+
+# ######################################################################
+# # Notice that the encoded values are different from the example of
+# # character-based encoding.
+# #
+# # The intermediate representation looks like the following.
+# #
+
+# print([processor.tokens[i] for i in processed[0, : lengths[0]]])
+
+
+# ######################################################################
+# # Spectrogram Generation
+# # ----------------------
+# #
+# # ``Tacotron2`` is the model we use to generate a spectrogram from the
+# # encoded text. For the details of the model, please refer to `the
+# # paper <https://arxiv.org/abs/1712.05884>`__.
+# #
+# # It is easy to instantiate a Tacotron2 model with pretrained weights;
+# # however, note that the input to Tacotron2 models needs to be processed
+# # by the matching text processor.
+# #
+# # :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` bundles the matching
+# # models and processors together so that it is easy to create the pipeline.
+# #
+# # For the available bundles and their usage, please refer to
+# # :py:class:`~torchaudio.pipelines.Tacotron2TTSBundle`.
+# #
+
+# bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+# processor = bundle.get_text_processor()
+# tacotron2 = bundle.get_tacotron2().to(device)
 
-######################################################################
-# Waveform Generation
-# -------------------
-#
-# Once the spectrogram is generated, the last step is to recover the
-# waveform from the spectrogram using a vocoder.
-#
-# ``torchaudio`` provides vocoders based on ``GriffinLim`` and
-# ``WaveRNN``.
-#
+# text = "Hello world! Text to speech!"
+# with torch.inference_mode():
+#     processed, lengths = processor(text)
+#     processed = processed.to(device)
+#     lengths = lengths.to(device)
+#     spec, _, _ = tacotron2.infer(processed, lengths)
+
+
+# _ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
 
 
-######################################################################
-# WaveRNN Vocoder
-# ~~~~~~~~~~~~~~~
-#
-# Continuing from the previous section, we can instantiate the matching
-# WaveRNN model from the same bundle.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+# ######################################################################
+# # Note that the ``Tacotron2.infer`` method performs multinomial sampling;
+# # therefore, the process of generating the spectrogram involves
+# # randomness.
+# #
+
+
+# def plot():
+#     fig, ax = plt.subplots(3, 1)
+#     for i in range(3):
+#         with torch.inference_mode():
+#             spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
+#         print(spec[0].shape)
+#         ax[i].imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+
+
+# plot()
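Since ``Tacotron2.infer`` samples stochastically (see the note above), doc builds or tests that need identical spectrograms across runs can seed the RNG first. A small sketch reusing the tutorial's ``tacotron2``, ``processed``, and ``lengths`` variables:

```python
import torch

# Seeding immediately before inference makes the sampled decoding steps,
# and hence the generated spectrogram, reproducible across runs.
torch.manual_seed(0)
with torch.inference_mode():
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
```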
+
+
+# ######################################################################
+# # Waveform Generation
+# # -------------------
+# #
+# # Once the spectrogram is generated, the last step is to recover the
+# # waveform from the spectrogram using a vocoder.
+# #
+# # ``torchaudio`` provides vocoders based on ``GriffinLim`` and
+# # ``WaveRNN``.
+# #
+
+
+# ######################################################################
+# # WaveRNN Vocoder
+# # ~~~~~~~~~~~~~~~
+# #
+# # Continuing from the previous section, we can instantiate the matching
+# # WaveRNN model from the same bundle.
+# #
+
+# bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
+
+# processor = bundle.get_text_processor()
+# tacotron2 = bundle.get_tacotron2().to(device)
+# vocoder = bundle.get_vocoder().to(device)
+
+# text = "Hello world! Text to speech!"
+
+# with torch.inference_mode():
+#     processed, lengths = processor(text)
+#     processed = processed.to(device)
+#     lengths = lengths.to(device)
+#     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
+#     waveforms, lengths = vocoder(spec, spec_lengths)
+
+# ######################################################################
+# #
+
+
+# def plot(waveforms, spec, sample_rate):
+#     waveforms = waveforms.cpu().detach()
+
+#     fig, [ax1, ax2] = plt.subplots(2, 1)
+#     ax1.plot(waveforms[0])
+#     ax1.set_xlim(0, waveforms.size(-1))
+#     ax1.grid(True)
+#     ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
+#     return IPython.display.Audio(waveforms[0:1], rate=sample_rate)
+
+
+# plot(waveforms, spec, vocoder.sample_rate)
+
+
+# ######################################################################
+# # Griffin-Lim Vocoder
+# # ~~~~~~~~~~~~~~~~~~~
+# #
+# # Using the Griffin-Lim vocoder is the same as using WaveRNN. You can
+# # instantiate the vocoder object with the
+# # :py:func:`~torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder`
+# # method and pass the spectrogram.
+# #
 
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-vocoder = bundle.get_vocoder().to(device)
-
-text = "Hello world! Text to speech!"
-
-with torch.inference_mode():
-    processed, lengths = processor(text)
-    processed = processed.to(device)
-    lengths = lengths.to(device)
-    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-    waveforms, lengths = vocoder(spec, spec_lengths)
-
-######################################################################
-#
+# bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
+# processor = bundle.get_text_processor()
+# tacotron2 = bundle.get_tacotron2().to(device)
+# vocoder = bundle.get_vocoder().to(device)
 
-def plot(waveforms, spec, sample_rate):
-    waveforms = waveforms.cpu().detach()
+
+# with torch.inference_mode():
+#     processed, lengths = processor(text)
+#     processed = processed.to(device)
+#     lengths = lengths.to(device)
+#     spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
+# waveforms, lengths = vocoder(spec, spec_lengths)
 
-    fig, [ax1, ax2] = plt.subplots(2, 1)
-    ax1.plot(waveforms[0])
-    ax1.set_xlim(0, waveforms.size(-1))
-    ax1.grid(True)
-    ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
-    return IPython.display.Audio(waveforms[0:1], rate=sample_rate)
+
+# ######################################################################
+# #
+
+# plot(waveforms, spec, vocoder.sample_rate)
 
-plot(waveforms, spec, vocoder.sample_rate)
-
-
-######################################################################
-# Griffin-Lim Vocoder
-# ~~~~~~~~~~~~~~~~~~~
-#
-# Using the Griffin-Lim vocoder is the same as using WaveRNN. You can
-# instantiate the vocoder object with the
-# :py:func:`~torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder`
-# method and pass the spectrogram.
-#
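For intuition about what a Griffin-Lim style vocoder does under the hood, a purely illustrative sketch built from public ``torchaudio.transforms`` classes follows. The parameter values are assumptions, the random tensor stands in for a real mel spectrogram, and the actual bundle wires this up (with the correct settings) via ``get_vocoder()``:

```python
import torch
import torchaudio

# Mel bins -> linear-frequency bins, then iterative phase recovery.
inv_mel = torchaudio.transforms.InverseMelScale(n_stft=1025, n_mels=80, sample_rate=22050)
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=2048, n_iter=32)

mel = torch.rand(1, 80, 200)    # stand-in for a Tacotron2 mel spectrogram
linear = inv_mel(mel)           # (1, 1025, 200)
waveform = griffin_lim(linear)  # (1, time)
```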
-
-bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-vocoder = bundle.get_vocoder().to(device)
-
-with torch.inference_mode():
-    processed, lengths = processor(text)
-    processed = processed.to(device)
-    lengths = lengths.to(device)
-    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-waveforms, lengths = vocoder(spec, spec_lengths)
-
-######################################################################
-#
-
-plot(waveforms, spec, vocoder.sample_rate)
-
-
-######################################################################
-# Waveglow Vocoder
-# ~~~~~~~~~~~~~~~~
-#
-# Waveglow is a vocoder published by NVIDIA. The pretrained weights are
-# published on Torch Hub. One can instantiate the model using the
-# ``torch.hub`` module.
-#
-
-# Workaround to load model mapped on GPU
-# https://stackoverflow.com/a/61840832
-waveglow = torch.hub.load(
-    "NVIDIA/DeepLearningExamples:torchhub",
-    "nvidia_waveglow",
-    model_math="fp32",
-    pretrained=False,
-)
-checkpoint = torch.hub.load_state_dict_from_url(
-    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
-    progress=False,
-    map_location=device,
-)
-state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}
-
-waveglow.load_state_dict(state_dict)
-waveglow = waveglow.remove_weightnorm(waveglow)
-waveglow = waveglow.to(device)
-waveglow.eval()
-
-with torch.no_grad():
-    waveforms = waveglow.infer(spec)
-
-######################################################################
-#
-
-plot(waveforms, spec, 22050)
+# ######################################################################
+# # Waveglow Vocoder
+# # ~~~~~~~~~~~~~~~~
+# #
+# # Waveglow is a vocoder published by NVIDIA. The pretrained weights are
+# # published on Torch Hub. One can instantiate the model using the
+# # ``torch.hub`` module.
+# #
+
+# # Workaround to load model mapped on GPU
+# # https://stackoverflow.com/a/61840832
+# waveglow = torch.hub.load(
+#     "NVIDIA/DeepLearningExamples:torchhub",
+#     "nvidia_waveglow",
+#     model_math="fp32",
+#     pretrained=False,
+# )
+# checkpoint = torch.hub.load_state_dict_from_url(
+#     "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
+#     progress=False,
+#     map_location=device,
+# )
+# state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}
+
+# waveglow.load_state_dict(state_dict)
+# waveglow = waveglow.remove_weightnorm(waveglow)
+# waveglow = waveglow.to(device)
+# waveglow.eval()
+
+# with torch.no_grad():
+#     waveforms = waveglow.infer(spec)
+
+# ######################################################################
+# #
+
+# plot(waveforms, spec, 22050)
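A natural follow-up once any of the vocoders above has produced ``waveforms`` is to persist the audio; this is an optional sketch, not part of the diff. ``torchaudio.save`` expects a ``(channels, time)`` CPU tensor plus a sample rate (22050 Hz for these LJSpeech-trained models, matching the literal used in the last ``plot`` call).

```python
import torchaudio

# Save the first waveform in the batch as a mono WAV file.
torchaudio.save("output.wav", waveforms[0:1].cpu(), sample_rate=22050)
```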