Prechádzať zdrojové kódy

Fix timbre problem (#1009)

* [feature]add dataset class

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [dev]combine agent and tts infer

* [feature]:update inference

* [feature]:update uv.lock

* [Merge]:merge upstream/main

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:remove unused files

* [fix]:fix infer bugs

* [docs]:update introduction and optimize front appearance

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [docs]:update README for OpenAudio-S1

* [docs]:update docs

* [docs]:Update video

* [docs]:fix video

* [docs]:fix video

* [fix]:fix timbre problem

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Whale and Dolphin 10 mesiacov pred
rodič
commit
9021a57dce
4 zmenil súbory, kde vykonal 128 pridanie a 27 odobranie
  1. 105 25
      .gitignore
  2. 1 0
      data
  3. 20 2
      fish_speech/models/text2semantic/inference.py
  4. 2 0
      uv.lock

+ 105 - 25
.gitignore

@@ -1,33 +1,113 @@
+# =============================================================================
+# Fish Speech - .gitignore
+# =============================================================================
+
+# Operating System Files
+# -----------------------
 .DS_Store
-.pgx.*
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# IDEs and Editors
+# ----------------
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Python
+# ------
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
+# --------------------
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+/fishenv/
+
+# Project Dependencies
+# --------------------
 .pdm-python
 /fish_speech.egg-info
-__pycache__
-/results
-/data
-/*.test.sh
+
+# Data and Model Files
+# --------------------
+/data/
+/results/
+/checkpoints/
+/references/
+/demo-audios/
+/example/
+filelists/
 *.filelist
-filelists
+
+# Audio Files
+# -----------
+*.wav
+*.mp3
+*.flac
+*.ogg
+*.m4a
+
+# Data Files
+# ----------
+*.npy
+*.npz
+*.pkl
+*.pickle
+*.lab
 /fish_speech/text/cmudict_cache.pickle
-/checkpoints
-/.vscode
-/data_server/target
-/*.npy
-/*.wav
-/*.mp3
-/*.lab
-/results
-/data
-/.idea
+
+# Cache and Temporary Files
+# --------------------------
+/.cache/
+/.gradio/
+/.locale/
+.pgx.*
+*log
+*.log
+
+# External Tools
+# --------------
 ffmpeg.exe
 ffprobe.exe
+/faster_whisper/
+
+# Server Related
+# --------------
+/data_server/target/
+
+# Test Files
+# ----------
+/*.test.sh
 asr-label*
-/.cache
-/fishenv
-/.locale
-/demo-audios
-/references
-/example
-/faster_whisper
-/.gradio
-*log

+ 1 - 0
data

@@ -0,0 +1 @@
+/mnt/users/whaledolphin/data

+ 20 - 2
fish_speech/models/text2semantic/inference.py

@@ -339,7 +339,7 @@ def generate_long(
     temperature: float = 0.8,
     compile: bool = False,
     iterative_prompt: bool = True,
-    chunk_length: int = 150,
+    chunk_length: int = 512,
     prompt_text: Optional[str | list[str]] = None,
     prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
 ):
@@ -365,6 +365,24 @@ def generate_long(
     texts = split_text(text, chunk_length) if iterative_prompt else [text]
     max_length = model.config.max_seq_len
 
+    # if use_prompt:
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=prompt_text[0]),
+    #             VQPart(codes=prompt_tokens[0]),
+    #         ],
+    #         add_end=True,
+    #     )
+
+    # for text in texts:
+    #     content_sequence = ContentSequence(modality=None)
+    #     base_content_sequence.append(
+    #         [
+    #             TextPart(text=text),
+    #         ],
+    #         add_end=True,
+    #     )
+
     if use_prompt:
         for t, c in zip(prompt_text, prompt_tokens):
             base_content_sequence.append(
@@ -385,7 +403,7 @@ def generate_long(
 
     encoded = []
     for text in texts:
-        content_sequence = ContentSequence(modality=None)
+        content_sequence = ContentSequence(modality="text")
         content_sequence.append(TextPart(text=text))
         encoded.append(
             content_sequence.encode_for_inference(

+ 2 - 0
uv.lock

@@ -942,6 +942,7 @@ dependencies = [
     { name = "cachetools" },
     { name = "datasets" },
     { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
     { name = "einops" },
     { name = "einx", extra = ["torch"] },
     { name = "faster-whisper" },
@@ -985,6 +986,7 @@ requires-dist = [
     { name = "cachetools" },
     { name = "datasets", specifier = "==2.18.0" },
     { name = "descript-audio-codec" },
+    { name = "descript-audiotools" },
     { name = "einops", specifier = ">=0.7.0" },
     { name = "einx", extras = ["torch"], specifier = "==0.2.2" },
     { name = "faster-whisper" },