WIP: adding encoding and bits_per_sample option

mthrok · mthrok · commit b119836bcd97 · 2021-02-02T23:27:02.000Z
diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py
@@ -175,16 +175,19 @@ def _save(
         channels_first: bool = True,
         compression: Optional[float] = None,
         format: Optional[str] = None,
-        dtype: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
 ):
     if hasattr(filepath, 'write'):
         if format is None:
             raise RuntimeError('`format` is required when saving to file object.')
         torchaudio._torchaudio.save_audio_fileobj(
-            filepath, src, sample_rate, channels_first, compression, format, dtype)
+            filepath, src, sample_rate, channels_first, compression,
+            format, encoding, bits_per_sample)
     else:
         torch.ops.torchaudio.sox_io_save_audio_file(
-            os.fspath(filepath), src, sample_rate, channels_first, compression, format, dtype)
+            os.fspath(filepath), src, sample_rate, channels_first, compression,
+            format, encoding, bits_per_sample)
 
 
 @_mod_utils.requires_module('torchaudio._torchaudio')
@@ -195,7 +198,8 @@ def save(
         channels_first: bool = True,
         compression: Optional[float] = None,
         format: Optional[str] = None,
-        dtype: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
 ):
     """Save audio data to file.
 
@@ -248,16 +252,11 @@ def save(
             ``dtype=None`` means no conversion is performed.
             ``dtype`` parameter is only effective for ``float32`` Tensor.
     """
-    if src.dtype == torch.float32 and dtype is None:
-        warnings.warn(
-            '`dtype` default value will be changed to `int16` in 0.9 release.'
-            'Specify `dtype` to suppress this warning.'
-        )
     if not torch.jit.is_scripting():
-        _save(filepath, src, sample_rate, channels_first, compression, format, dtype)
+        _save(filepath, src, sample_rate, channels_first, compression, format, encoding, bits_per_sample)
         return
     torch.ops.torchaudio.sox_io_save_audio_file(
-        filepath, src, sample_rate, channels_first, compression, format, dtype)
+        filepath, src, sample_rate, channels_first, compression, format, encoding, bits_per_sample)
 
 
 @_mod_utils.requires_module('torchaudio._torchaudio')
diff --git a/torchaudio/csrc/sox/io.cpp b/torchaudio/csrc/sox/io.cpp
@@ -46,32 +46,31 @@ namespace {
 
 std::string get_encoding(sox_encoding_t encoding) {
   switch (encoding) {
-    case SOX_ENCODING_UNKNOWN:
-      return "UNKNOWN";
     case SOX_ENCODING_SIGN2:
-      return "PCM_S";
+      return ENCODING_PCM_SIGNED;
     case SOX_ENCODING_UNSIGNED:
-      return "PCM_U";
+      return ENCODING_PCM_UNSIGNED;
     case SOX_ENCODING_FLOAT:
-      return "PCM_F";
+      return ENCODING_PCM_FLOAT;
     case SOX_ENCODING_FLAC:
-      return "FLAC";
+      return ENCODING_FLAC;
     case SOX_ENCODING_ULAW:
-      return "ULAW";
+      return ENCODING_ULAW;
     case SOX_ENCODING_ALAW:
-      return "ALAW";
+      return ENCODING_ALAW;
     case SOX_ENCODING_MP3:
-      return "MP3";
+      return ENCODING_MP3;
     case SOX_ENCODING_VORBIS:
-      return "VORBIS";
+      return ENCODING_VORBIS;
     case SOX_ENCODING_AMR_WB:
-      return "AMR_WB";
+      return ENCODING_AMR_WB;
     case SOX_ENCODING_AMR_NB:
-      return "AMR_NB";
+      return ENCODING_AMR_NB;
     case SOX_ENCODING_OPUS:
-      return "OPUS";
+      return ENCODING_OPUS;
+    case SOX_ENCODING_UNKNOWN: // listed explicitly; shares the default branch
     default:
-      return "UNKNOWN";
+      return ENCODING_UNKNOWN; // also returned for any encoding without its own case
   }
 }
 
@@ -148,34 +147,26 @@ void save_audio_file(
     torch::Tensor tensor,
     int64_t sample_rate,
     bool channels_first,
-    c10::optional<double> compression,
-    c10::optional<std::string> format,
-    c10::optional<std::string> dtype) {
+    c10::optional<double>& compression,
+    c10::optional<std::string>& format,
+    c10::optional<std::string>& encoding,
+    c10::optional<int64_t>& bits_per_sample) {
   validate_input_tensor(tensor);
 
-  if (tensor.dtype() != torch::kFloat32 && dtype.has_value()) {
-    throw std::runtime_error(
-        "dtype conversion only supported for float32 tensors");
-  }
-  const auto tgt_dtype =
-      (tensor.dtype() == torch::kFloat32 && dtype.has_value())
-      ? get_dtype_from_str(dtype.value())
-      : tensor.dtype();
-
   const auto filetype = [&]() {
     if (format.has_value())
       return format.value();
     return get_filetype(path);
   }();
+
   if (filetype == "amr-nb") {
     const auto num_channels = tensor.size(channels_first ? 0 : 1);
     TORCH_CHECK(
         num_channels == 1, "amr-nb format only supports single channel audio.");
-    tensor = (unnormalize_wav(tensor) / 65536).to(torch::kInt16);
   }
   const auto signal_info =
       get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-  const auto encoding_info = get_encodinginfo_for_save(filetype, tgt_dtype, compression);
+  const auto encoding_info = get_encodinginfo_for_save(filetype, compression, encoding, bits_per_sample);
 
   SoxFormat sf(sox_open_write(
       path.c_str(),
@@ -289,31 +280,22 @@ void save_audio_fileobj(
     torch::Tensor tensor,
     int64_t sample_rate,
     bool channels_first,
-    c10::optional<double> compression,
+    c10::optional<double>& compression,
     std::string filetype,
-    c10::optional<std::string> dtype) {
+    c10::optional<std::string>& encoding,
+    c10::optional<int64_t>& bits_per_sample) {
   validate_input_tensor(tensor);
 
-  if (tensor.dtype() != torch::kFloat32 && dtype.has_value()) {
-    throw std::runtime_error(
-        "dtype conversion only supported for float32 tensors");
-  }
-  const auto tgt_dtype =
-      (tensor.dtype() == torch::kFloat32 && dtype.has_value())
-      ? get_dtype_from_str(dtype.value())
-      : tensor.dtype();
-
   if (filetype == "amr-nb") {
     const auto num_channels = tensor.size(channels_first ? 0 : 1);
     if (num_channels != 1) {
       throw std::runtime_error(
           "amr-nb format only supports single channel audio.");
     }
-    tensor = (unnormalize_wav(tensor) / 65536).to(torch::kInt16);
   }
   const auto signal_info =
       get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-  const auto encoding_info = get_encodinginfo_for_save(filetype, tgt_dtype, compression);
+  const auto encoding_info = get_encodinginfo_for_save(filetype, compression, encoding, bits_per_sample);
 
   AutoReleaseBuffer buffer;
 
diff --git a/torchaudio/csrc/sox/io.h b/torchaudio/csrc/sox/io.h
@@ -48,9 +48,10 @@ void save_audio_file(
     torch::Tensor tensor,
     int64_t sample_rate,
     bool channels_first,
-    c10::optional<double> compression,
-    c10::optional<std::string> format,
-    c10::optional<std::string> dtype);
+    c10::optional<double>& compression,
+    c10::optional<std::string>& format,
+    c10::optional<std::string>& encoding,
+    c10::optional<int64_t>& bits_per_sample);
 
 #ifdef TORCH_API_INCLUDE_EXTENSION_H
 
@@ -71,9 +72,10 @@ void save_audio_fileobj(
     torch::Tensor tensor,
     int64_t sample_rate,
     bool channels_first,
-    c10::optional<double> compression,
+    c10::optional<double>& compression,
     std::string filetype,
-    c10::optional<std::string> dtype);
+    c10::optional<std::string>& encoding,
+    c10::optional<int64_t>& bits_per_sample);
 
 #endif // TORCH_API_INCLUDE_EXTENSION_H
 
diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp
@@ -220,31 +220,134 @@ const std::string get_filetype(const std::string path) {
   return ext;
 }
 
-sox_encoding_t get_encoding(
-    const std::string filetype,
-    const caffe2::TypeMeta dtype) {
-  if (filetype == "mp3")
-    return SOX_ENCODING_MP3;
-  if (filetype == "flac")
-    return SOX_ENCODING_FLAC;
-  if (filetype == "ogg" || filetype == "vorbis")
-    return SOX_ENCODING_VORBIS;
-  if (filetype == "wav" || filetype == "amb") {
-    if (dtype == torch::kUInt8)
-      return SOX_ENCODING_UNSIGNED;
-    if (dtype == torch::kInt16)
-      return SOX_ENCODING_SIGN2;
-    if (dtype == torch::kInt32)
-      return SOX_ENCODING_SIGN2;
-    if (dtype == torch::kFloat32)
-      return SOX_ENCODING_FLOAT;
-    throw std::runtime_error("Unsupported dtype.");
+namespace {
+
+// Selects the SoX encoding and bit depth used when saving wav/amb audio.
+// `format` is only used to build warning and error messages.
+// Without `encoding`, an 8-bit request yields unsigned PCM and any other depth
+// (16 when unset) yields signed PCM. A depth that the requested encoding cannot
+// use is replaced, with a one-time warning.
+// Throws std::runtime_error when `encoding` is not supported by these formats.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
+    const std::string& format,
+    const c10::optional<std::string>& encoding,
+    const c10::optional<int64_t>& bits_per_sample) {
+  if (!encoding.has_value()) {
+    const auto val = static_cast<unsigned>(bits_per_sample.value_or(16));
+    if (val == 8)
+      return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+    return std::make_tuple<>(SOX_ENCODING_SIGN2, val);
   }
-  if (filetype == "sph")
-    return SOX_ENCODING_SIGN2;
-  if (filetype == "amr-nb")
-    return SOX_ENCODING_AMR_NB;
-  throw std::runtime_error("Unsupported file type: " + filetype);
+  if (encoding == ENCODING_PCM_SIGNED) {
+    auto val = static_cast<unsigned>(bits_per_sample.value_or(16));
+    if (val == 8) {
+      // TORCH_WARN_ONCE joins its arguments into one message; it performs no
+      // printf-style "%s" substitution, so `format` is passed as an argument.
+      TORCH_WARN_ONCE(format, " does not support 8-bit signed PCM encoding. Using 16-bit.");
+      val = 16;
+    }
+    return std::make_tuple<>(SOX_ENCODING_SIGN2, val);
+  }
+  // Each remaining encoding has exactly one usable bit depth here; any other
+  // request is overridden after a warning.
+  if (encoding == ENCODING_PCM_UNSIGNED) {
+    if (bits_per_sample.value_or(8) != 8)
+      TORCH_WARN_ONCE(format, " only supports 8-bit for unsigned PCM encoding. Using 8-bit.");
+    return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+  }
+  if (encoding == ENCODING_PCM_FLOAT) {
+    if (bits_per_sample.value_or(32) != 32)
+      TORCH_WARN_ONCE(format, " only supports 32-bit for floating point PCM encoding. Using 32-bit.");
+    return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
+  }
+  if (encoding == ENCODING_ULAW) {
+    if (bits_per_sample.value_or(8) != 8)
+      TORCH_WARN_ONCE(format, " only supports 8-bit for mu-law encoding. Using 8-bit.");
+    return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
+  }
+  if (encoding == ENCODING_ALAW) {
+    if (bits_per_sample.value_or(8) != 8)
+      TORCH_WARN_ONCE(format, " only supports 8-bit for a-law encoding. Using 8-bit.");
+    return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
+  }
+  std::ostringstream message;
+  message << format << " format does not support encoding: " << encoding.value();
+  throw std::runtime_error(message.str());
+}
+
+// Resolves the SoX encoding and bit depth used when saving audio as `format`.
+// mp3, ogg/vorbis and amr-nb ignore both options (with a warning); flac ignores
+// `encoding` and caps the depth at 24; wav/amb and sph map the requested encoding.
+// Throws std::runtime_error for an unsupported format or encoding.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding(
+    const std::string& format,
+    const c10::optional<std::string>& encoding,
+    const c10::optional<int64_t>& bits_per_sample) {
+  if (format == "mp3") {
+    if (encoding.has_value()) {
+      TORCH_WARN_ONCE("mp3 does not support `encoding` option. Ignoring.");
+    }
+    if (bits_per_sample.has_value()) {
+      TORCH_WARN_ONCE("mp3 does not support `bits_per_sample` option. Ignoring.");
+    }
+    return std::make_tuple<>(SOX_ENCODING_MP3, 16);
+  }
+  if (format == "ogg" || format == "vorbis") {
+    if (encoding.has_value()) {
+      TORCH_WARN_ONCE("ogg/vorbis does not support `encoding` option. Ignoring.");
+    }
+    if (bits_per_sample.has_value()) {
+      TORCH_WARN_ONCE("ogg/vorbis does not support `bits_per_sample` option. Ignoring.");
+    }
+    return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
+  }
+  if (format == "amr-nb") {
+    if (encoding.has_value()) {
+      TORCH_WARN_ONCE("amr-nb does not support `encoding` option. Ignoring.");
+    }
+    if (bits_per_sample.has_value()) {
+      TORCH_WARN_ONCE("amr-nb does not support `bits_per_sample` option. Ignoring.");
+    }
+    return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
+  }
+  if (format == "wav" || format == "amb") {
+    return get_save_encoding_for_wav(format, encoding, bits_per_sample);
+  }
+  if (format == "flac") {
+    if (encoding.has_value()) {
+      TORCH_WARN_ONCE("flac does not support `encoding` option. Ignoring.");
+    }
+    auto bps = static_cast<unsigned>(bits_per_sample.value_or(24));
+    if (bps > 24) {
+      TORCH_WARN_ONCE("flac does not support bits_per_sample larger than 24. Using 24.");
+      bps = 24;
+    }
+    return std::make_tuple<>(SOX_ENCODING_FLAC, bps);
+  }
+  if (format == "sph") {
+    if (!encoding.has_value() || encoding == ENCODING_PCM_SIGNED) {
+      const auto val = static_cast<unsigned>(bits_per_sample.value_or(16));
+      return std::make_tuple<>(SOX_ENCODING_SIGN2, val);
+    }
+    if (encoding == ENCODING_PCM_UNSIGNED || encoding == ENCODING_PCM_FLOAT) {
+      // Fall back to signed PCM, which is what the warning tells the caller.
+      TORCH_WARN_ONCE("sph does not support unsigned integer PCM or floating point PCM. Using signed integer PCM");
+      const auto val = static_cast<unsigned>(bits_per_sample.value_or(16));
+      return std::make_tuple<>(SOX_ENCODING_SIGN2, val);
+    }
+    if (encoding == ENCODING_ULAW) {
+      if (bits_per_sample.value_or(8) != 8)
+        TORCH_WARN_ONCE("sph only supports 8-bit for mu-law encoding. Using 8-bit.");
+      return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
+    }
+    if (encoding == ENCODING_ALAW) {
+      // NOTE(review): unlike mu-law above, a non-8-bit a-law depth is passed through unchanged; confirm intended.
+      const auto val = static_cast<unsigned>(bits_per_sample.value_or(8));
+      return std::make_tuple<>(SOX_ENCODING_ALAW, val);
+    }
+    throw std::runtime_error("sph format does not support encoding: " + encoding.value());
+  }
+  throw std::runtime_error("Unsupported format: " + format);
 }
 
 unsigned get_precision(
@@ -278,6 +381,8 @@ unsigned get_precision(
   throw std::runtime_error("Unsupported file type: " + filetype);
 }
 
+} // namespace
+
 sox_signalinfo_t get_signalinfo(
     const torch::Tensor* waveform,
     const int64_t sample_rate,
@@ -326,12 +431,14 @@ sox_encodinginfo_t get_tensor_encodinginfo(
 }
 
 sox_encodinginfo_t get_encodinginfo_for_save(
-    const std::string filetype,
-    const caffe2::TypeMeta dtype,
-    c10::optional<double>& compression) {
+    const std::string& format,
+    const c10::optional<double>& compression,
+    const c10::optional<std::string>& encoding,
+    const c10::optional<int64_t>& bits_per_sample) {
+  auto enc = get_save_encoding(format, encoding, bits_per_sample);
   return sox_encodinginfo_t{
-      /*encoding=*/get_encoding(filetype, dtype),
-      /*bits_per_sample=*/get_precision(filetype, dtype),
+      /*encoding=*/std::get<0>(enc),
+      /*bits_per_sample=*/std::get<1>(enc),
       /*compression=*/compression.value_or(HUGE_VAL),
       /*reverse_bytes=*/sox_option_default,
       /*reverse_nibbles=*/sox_option_default,
diff --git a/torchaudio/csrc/sox/utils.h b/torchaudio/csrc/sox/utils.h