Просмотр исходного кода

fix: add torchaudio 2.9 compatibility for list_audio_backends removal (#1148)

torchaudio.list_audio_backends() was deprecated in 2.8 and removed in 2.9.
Added try/except to handle both old and new torchaudio versions.

For torchaudio 2.9+, attempts to detect ffmpeg availability by trying to
import the ffmpeg-related module, falling back to soundfile if unavailable.

Fixes #1118
Harikrishna KP 2 недели назад
Родитель
Commit
75a9afe46a
2 изменённых файла с 32 добавлениями и 11 удалениями
  1. 16 5
      fish_speech/inference_engine/reference_loader.py
  2. 16 6
      tools/vqgan/extract_vq.py

+ 16 - 5
fish_speech/inference_engine/reference_loader.py

@@ -31,11 +31,22 @@ class ReferenceLoader:
         self.encode_reference: Callable
         self.encode_reference: Callable
 
 
         # Define the torchaudio backend
         # Define the torchaudio backend
-        backends = torchaudio.list_audio_backends()
-        if "ffmpeg" in backends:
-            self.backend = "ffmpeg"
-        else:
-            self.backend = "soundfile"
+        # list_audio_backends() was removed in torchaudio 2.9
+        try:
+            backends = torchaudio.list_audio_backends()
+            if "ffmpeg" in backends:
+                self.backend = "ffmpeg"
+            else:
+                self.backend = "soundfile"
+        except AttributeError:
+            # torchaudio 2.9+ removed list_audio_backends()
+            # Try ffmpeg first, fallback to soundfile
+            try:
+                import torchaudio.io._load_audio_fileobj  # noqa: F401
+
+                self.backend = "ffmpeg"
+            except (ImportError, ModuleNotFoundError):
+                self.backend = "soundfile"
 
 
     def load_by_id(
     def load_by_id(
         self,
         self,

+ 16 - 6
tools/vqgan/extract_vq.py

@@ -23,12 +23,22 @@ OmegaConf.register_new_resolver("eval", eval)
 # This file is used to convert the audio files to text files using the Whisper model.
 # This file is used to convert the audio files to text files using the Whisper model.
 # It's mainly used to generate the training data for the VQ model.
 # It's mainly used to generate the training data for the VQ model.
 
 
-backends = torchaudio.list_audio_backends()
-
-if "ffmpeg" in backends:
-    backend = "ffmpeg"
-else:
-    backend = "soundfile"
+# Determine audio backend - list_audio_backends() was removed in torchaudio 2.9
+try:
+    backends = torchaudio.list_audio_backends()
+    if "ffmpeg" in backends:
+        backend = "ffmpeg"
+    else:
+        backend = "soundfile"
+except AttributeError:
+    # torchaudio 2.9+ removed list_audio_backends()
+    # Try ffmpeg first, fallback to soundfile
+    try:
+        import torchaudio.io._load_audio_fileobj  # Check if ffmpeg backend is available
+
+        backend = "ffmpeg"
+    except (ImportError, ModuleNotFoundError):
+        backend = "soundfile"
 
 
 RANK = int(os.environ.get("SLURM_PROCID", 0))
 RANK = int(os.environ.get("SLURM_PROCID", 0))
 WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))
 WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))