Prechádzať zdrojové kódy

Fix timbre problem (#1009)

* [feature]add dataset class

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [dev]combine agent and tts infer

* [feature]:update inference

* [feature]:update uv.lock

* [Merge]:merge upstream/main

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:fix infer bugs

* [docs]:update introduction and optimize front appearance

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [docs]:update README for OpenAudio-S1

* [docs]:update docs

* [docs]:Update video

* [docs]:fix video

* [docs]:fix video

* [fix]:fix timbre problem

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Whale and Dolphin 10 mesiacov pred
rodič
commit
9021a57dce
4 zmenil súbory, kde vykonal 128 pridanie a 27 odobranie
  1. 105 25
      .gitignore
  2. 1 0
      data
  3. 20 2
      fish_speech/models/text2semantic/inference.py
  4. 2 0
      uv.lock

+ 105 - 25
.gitignore

@@ -1,33 +1,113 @@
+# =============================================================================
+# Fish Speech - .gitignore
+# =============================================================================
+
+# Operating System Files
+# -----------------------
 .DS_Store
-.pgx.*
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# IDEs and Editors
+# ----------------
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Python
+# ------
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
+# --------------------
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+/fishenv/
+
+# Project Dependencies
+# --------------------
 .pdm-python
 /fish_speech.egg-info
-__pycache__
-/results
-/data
-/*.test.sh
+
+# Data and Model Files
+# --------------------
+/data/
+/results/
+/checkpoints/
+/references/
+/demo-audios/
+/example/
+filelists/
 *.filelist
-filelists
+
+# Audio Files
+# -----------
+*.wav
+*.mp3
+*.flac
+*.ogg
+*.m4a
+
+# Data Files
+# ----------
+*.npy
+*.npz
+*.pkl
+*.pickle
+*.lab
 /fish_speech/text/cmudict_cache.pickle
-/checkpoints
-/.vscode
-/data_server/target
-/*.npy
-/*.wav
-/*.mp3
-/*.lab
-/results
-/data
-/.idea
+
+# Cache and Temporary Files
+# --------------------------
+/.cache/
+/.gradio/
+/.locale/
+.pgx.*
+*log
+*.log
+
+# External Tools
+# --------------
 ffmpeg.exe
 ffprobe.exe
+/faster_whisper/
+
+# Server Related
+# --------------
+/data_server/target/
+
+# Test Files
+# ----------
+/*.test.sh
 asr-label*
-/.cache
-/fishenv
-/.locale
-/demo-audios
-/references
-/example
-/faster_whisper
-/.gradio
-*log

+ 1 - 0
data

@@ -0,0 +1 @@
+/mnt/users/whaledolphin/data

+ 20 - 2
fish_speech/models/text2semantic/inference.py

@@ -339,7 +339,7 @@ def generate_long(
     temperature: float = 0.8,
     compile: bool = False,
     iterative_prompt: bool = True,
-    chunk_length: int = 150,
+    chunk_length: int = 512,
     prompt_text: Optional[str | list[str]] = None,
     prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
 ):
@@ -365,6 +365,24 @@ def generate_long(
     texts = split_text(text, chunk_length) if iterative_prompt else [text]
     max_length = model.config.max_seq_len
 
+    # if use_prompt:
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=prompt_text[0]),
+    #             VQPart(codes=prompt_tokens[0]),
+    #         ],
+    #         add_end=True,
+    #     )
+
+    # for text in texts:
+    #     content_sequence = ContentSequence(modality=None)
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=text),
+    #         ],
+    #         add_end=True,
+    #     )
+
     if use_prompt:
         for t, c in zip(prompt_text, prompt_tokens):
             base_content_sequence.append(
@@ -385,7 +403,7 @@ def generate_long(
 
     encoded = []
     for text in texts:
-        content_sequence = ContentSequence(modality=None)
+        content_sequence = ContentSequence(modality="text")
         content_sequence.append(TextPart(text=text))
         encoded.append(
             content_sequence.encode_for_inference(

+ 2 - 0
uv.lock

@@ -942,6 +942,7 @@ dependencies = [
     { name = "cachetools" },
     { name = "datasets" },
     { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
     { name = "einops" },
     { name = "einx", extra = ["torch"] },
     { name = "faster-whisper" },
@@ -985,6 +986,7 @@ requires-dist = [
     { name = "cachetools" },
     { name = "datasets", specifier = "==2.18.0" },
     { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
     { name = "einops", specifier = ">=0.7.0" },
     { name = "einx", extras = ["torch"], specifier = "==0.2.2" },
     { name = "faster-whisper" },