@@ -50,22 +50,29 @@ def extract_features(
50
50
Args:
51
51
waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
52
52
lengths (Tensor or None, optional):
53
- Indicates the valid length of each audio sample in the batch.
53
+ Indicates the valid length of each audio in the batch.
54
54
Shape: `(batch, )`.
55
+ When the ``waveforms`` contains audios with different durations,
56
+ by providing ``lengths`` argument, the model will compute
57
+ the corresponding valid output lengths and apply proper mask in
58
+ transformer attention layer.
59
+ If ``None``, it is assumed that the entire audio waveform
60
+ length is valid.
55
61
num_layers (int or None, optional):
56
62
If given, limit the number of intermediate layers to go through.
57
63
Providing `1` will stop the computation after going through one
58
64
intermediate layer. If not given, the outputs from all the
59
65
intermediate layers are returned.
60
66
61
67
Returns:
62
- List of Tensors and an optional Tensor:
68
+ (List[Tensor], Optional[Tensor]):
63
69
List of Tensors
64
70
Features from requested layers.
65
- Each Tensor is of shape: `(batch, frames , feature dimention )`
71
+ Each Tensor is of shape: `(batch, time frame, feature dimension)`
66
72
Tensor or None
67
73
If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
68
- is retuned. It indicates the valid length of each feature in the batch.
74
+ is returned.
75
+ It indicates the valid length in time axis of each feature Tensor.
69
76
"""
70
77
x , lengths = self .feature_extractor (waveforms , lengths )
71
78
x = self .encoder .extract_features (x , lengths , num_layers )
@@ -81,17 +88,24 @@ def forward(
81
88
Args:
82
89
waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
83
90
lengths (Tensor or None, optional):
84
- Indicates the valid length of each audio sample in the batch.
91
+ Indicates the valid length of each audio in the batch.
85
92
Shape: `(batch, )`.
93
+ When the ``waveforms`` contains audios with different durations,
94
+ by providing ``lengths`` argument, the model will compute
95
+ the corresponding valid output lengths and apply proper mask in
96
+ transformer attention layer.
97
+ If ``None``, it is assumed that all the audios in ``waveforms``
98
+ have valid length. Default: ``None``.
86
99
87
100
Returns:
88
- Tensor and an optional Tensor:
101
+ (Tensor, Optional[Tensor]):
89
102
Tensor
90
103
The sequences of probability distribution (in logit) over labels.
91
104
Shape: `(batch, frames, num labels)`.
92
105
Tensor or None
93
106
If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
94
- is retuned. It indicates the valid length of each feature in the batch.
107
+ is returned.
108
+ It indicates the valid length in time axis of the output Tensor.
95
109
"""
96
110
x , lengths = self .feature_extractor (waveforms , lengths )
97
111
x = self .encoder (x , lengths )
0 commit comments