Update descriptions of lengths parameters (#1890)

mthrok · web-flow · commit 211270db1564 · 2021-10-16T18:36:30.000-04:00
diff --git a/torchaudio/models/tacotron2.py b/torchaudio/models/tacotron2.py
@@ -1080,7 +1080,7 @@ def infer(self, tokens: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tenso
                 If ``None``, it is assumed that the all the tokens are valid. Default: ``None``
 
         Returns:
-            Tensor, Tensor, and Tensor:
+            (Tensor, Tensor, Tensor):
                 Tensor
                     The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                 Tensor
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
@@ -50,22 +50,29 @@ def extract_features(
         Args:
             waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
             lengths (Tensor or None, optional):
-                Indicates the valid length of each audio sample in the batch.
+                Indicates the valid length of each audio in the batch.
                 Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that the entire audio waveform
+                length is valid.
             num_layers (int or None, optional):
                 If given, limit the number of intermediate layers to go through.
                 Providing `1` will stop the computation after going through one
                 intermediate layers. If not given, the outputs from all the
                 intermediate layers are returned.
 
         Returns:
-            List of Tensors and an optional Tensor:
+            (List[Tensor], Optional[Tensor]):
             List of Tensors
                 Features from requested layers.
-                Each Tensor is of shape: `(batch, frames, feature dimention)`
+                Each Tensor is of shape: `(batch, time frame, feature dimension)`
             Tensor or None
                 If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
-                is retuned. It indicates the valid length of each feature in the batch.
+                is returned.
+                It indicates the valid length in time axis of each feature Tensor.
         """
         x, lengths = self.feature_extractor(waveforms, lengths)
         x = self.encoder.extract_features(x, lengths, num_layers)
@@ -81,17 +88,24 @@ def forward(
         Args:
             waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
             lengths (Tensor or None, optional):
-                Indicates the valid length of each audio sample in the batch.
+                Indicates the valid length of each audio in the batch.
                 Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that all the audio in ``waveforms``
+                have valid length. Default: ``None``.
 
         Returns:
-            Tensor and an optional Tensor:
+            (Tensor, Optional[Tensor]):
             Tensor
                 The sequences of probability distribution (in logit) over labels.
                 Shape: `(batch, frames, num labels)`.
             Tensor or None
                 If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
-                is retuned. It indicates the valid length of each feature in the batch.
+                is returned.
+                It indicates the valid length in time axis of the output Tensor.
         """
         x, lengths = self.feature_extractor(waveforms, lengths)
         x = self.encoder(x, lengths)
diff --git a/torchaudio/models/wavernn.py b/torchaudio/models/wavernn.py
@@ -341,16 +341,23 @@ def infer(self, specgram: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Ten
             specgram (Tensor):
                 Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
             lengths (Tensor or None, optional):
-                Indicates the valid length in of each spectrogram in time axis.
-                Shape: `(n_batch, )`.
+                Indicates the valid length of each spectrogram in the batch.
+                Shape: `(n_batch, )`.
+                When the ``specgram`` contains spectrograms with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths.
+                If ``None``, it is assumed that all the spectrograms in ``specgram``
+                have valid length. Default: ``None``.
 
         Returns:
-            Tensor and optional Tensor:
+            (Tensor, Optional[Tensor]):
             Tensor
                 The inferred waveform of size `(n_batch, 1, n_time)`.
                 1 stands for a single channel.
             Tensor or None
-                The valid lengths of each waveform in the batch. Size `(n_batch, )`.
+                If ``lengths`` argument was provided, a Tensor of shape `(n_batch, )`
+                is returned.
+                It indicates the valid length in time axis of the output Tensor.
         """
 
         device = specgram.device