2 년 전 · df811976d5
--- a/data_server/src/main.rs
+++ b/data_server/src/main.rs
@@ -252,7 +252,7 @@ struct Args {
 
															     files: Vec<String>,
														
 
															     /// Causal sampling
														
 
															-    #[clap(short, long, default_value = "false")]
														
 
															+    #[clap(short, long, default_value = "true")]
														
 
															     causal: bool,
														
 
															     /// Address to bind to
														
--- a/tools/vqgan/extract_vq.py
+++ b/tools/vqgan/extract_vq.py
@@ -63,6 +63,7 @@ def get_model(
 
															     return model
														
 
															+@torch.inference_mode()
														
 
															 def process_batch(files: list[Path], model) -> float:
														
 
															     wavs = []
														
 
															     audio_lengths = []
														
@@ -87,8 +88,7 @@ def process_batch(files: list[Path], model) -> float:
 
															     audio_lengths = torch.tensor(audio_lengths, device=model.device, dtype=torch.long)
														
 
															     # Calculate lengths
														
 
															-    with torch.no_grad():
														
 
															-        indices, feature_lengths = model.encode(audios, audio_lengths)
														
 
															+    indices, feature_lengths = model.encode(audios, audio_lengths)
														
 
															     # Save to disk
														
 
															     outputs = indices.cpu().numpy()
														
@@ -111,7 +111,7 @@ def process_batch(files: list[Path], model) -> float:
 
															 @click.option("--config-name", default="vqgan_pretrain")
														
 
															 @click.option(
														
 
															     "--checkpoint-path",
														
 
															-    default="checkpoints/vqgan-v1.pth",
														
 
															+    default="checkpoints/vq-gan-group-fsq-8x1024-wn-20x768-30kh.pth",
														
 
															 )
														
 
															 @click.option("--batch-size", default=64)
														
 
															 @click.option("--filelist", default=None, type=Path)
														
--- a/tools/vqgan/inference.py
+++ b/tools/vqgan/inference.py
@@ -5,15 +5,12 @@ import librosa
 
															 import numpy as np
														
 
															 import soundfile as sf
														
 
															 import torch
														
 
															-import torch.nn.functional as F
														
 
															-from einops import rearrange
														
 
															 from hydra import compose, initialize
														
 
															 from hydra.utils import instantiate
														
 
															 from lightning import LightningModule
														
 
															 from loguru import logger
														
 
															 from omegaconf import OmegaConf
														
 
															-from fish_speech.models.vqgan.utils import sequence_mask
														
 
															 from fish_speech.utils.file import AUDIO_EXTENSIONS
														
 
															 # register eval resolver
														
@@ -85,10 +82,6 @@ def main(input_path, output_path, config_name, checkpoint_path):
 
															     else:
														
 
															         raise ValueError(f"Unknown input type: {input_path}")
														
 
															-    # random destroy 10% of indices
														
 
															-    # mask = torch.rand_like(indices, dtype=torch.float) > 0.9
														
 
															-    # indices[mask] = torch.randint(0, 1000, mask.shape, device=indices.device, dtype=indices.dtype)[mask]
														
 
															-
														
 
															     # Restore
														
 
															     feature_lengths = torch.tensor([indices.shape[1]], device=model.device)
														
 
															     fake_audios = model.decode(