1 år sedan · 7b0802db0b
--- a/Start_Agent.md
+++ b/Start_Agent.md
@@ -1,17 +1,27 @@
 
				 # How To Start?
			
 
				 
			
 
				-### Environment Prepare
			
 
				+### Download Model
			
 
				 
			
 
				-If you haven't install the environment of Fish-speech, please use:
			
 
				+You can get the model by:
			
 
				 
			
 
				 ```bash
			
 
				-pip install -e .[stable]
			
 
				+huggingface-cli download fishaudio/fish-agent-v0.1-3b --local-dir checkpoints/fish-agent-v0.1-3b
			
 
				 ```
			
 
				 
			
 
				-Then use:
			
 
				+Put them in the 'checkpoints' folder.
			
 
				+
			
 
				+You also need the VQGAN weight in the fish-speech-1.4 repo.
			
 
				+
			
 
				+So there will be 2 folder in the checkpoints.
			
 
				+
			
 
				+The ``checkpoints/fish-speech-1.4`` and ``checkpoints/fish-agent-v0.1-3b``
			
 
				+
			
 
				+### Environment Prepare
			
 
				+
			
 
				+If you haven't install the environment of Fish-speech, please use:
			
 
				 
			
 
				 ```bash
			
 
				-pip install livekit livekit-agents
			
 
				+pip install -e .[stable]
			
 
				 ```
			
 
				 
			
 
				 ### Launch The Agent Demo.
			
@@ -19,7 +29,7 @@ pip install livekit livekit-agents
 
				 Please use the command below under the main folder:
			
 
				 
			
 
				 ```bash
			
 
				-python -m tools.api --llama-checkpoint-path checkpoints/fish-agent-3b-pretrain/ --mode agent --compile
			
 
				+python -m tools.api --llama-checkpoint-path checkpoints/fish-agent-v0.1-3b/ --mode agent --compile
			
 
				 ```
			
 
				 
			
 
				 The ``--compile`` args only support Python < 3.12 , which will greatly speed up the token generation.
			
--- a/tools/fish_e2e.py
+++ b/tools/fish_e2e.py
@@ -1,18 +1,17 @@
 
				 import base64
			
 
				+import ctypes
			
 
				 import io
			
 
				 import json
			
 
				 import os
			
 
				 import struct
			
 
				 from dataclasses import dataclass
			
 
				 from enum import Enum
			
 
				-from typing import AsyncGenerator
			
 
				+from typing import AsyncGenerator, Union
			
 
				 
			
 
				 import httpx
			
 
				 import numpy as np
			
 
				 import ormsgpack
			
 
				 import soundfile as sf
			
 
				-from livekit import rtc
			
 
				-from livekit.agents.llm.chat_context import ChatContext
			
 
				 
			
 
				 from .schema import (
			
 
				     ServeMessage,
			
@@ -24,6 +23,49 @@ from .schema import (
 
				 )
			
 
				 
			
 
				 
			
 
				+class CustomAudioFrame:
			
 
				+    def __init__(self, data, sample_rate, num_channels, samples_per_channel):
			
 
				+        if len(data) < num_channels * samples_per_channel * ctypes.sizeof(
			
 
				+            ctypes.c_int16
			
 
				+        ):
			
 
				+            raise ValueError(
			
 
				+                "data length must be >= num_channels * samples_per_channel * sizeof(int16)"
			
 
				+            )
			
 
				+
			
 
				+        self._data = bytearray(data)
			
 
				+        self._sample_rate = sample_rate
			
 
				+        self._num_channels = num_channels
			
 
				+        self._samples_per_channel = samples_per_channel
			
 
				+
			
 
				+    @property
			
 
				+    def data(self):
			
 
				+        return memoryview(self._data).cast("h")
			
 
				+
			
 
				+    @property
			
 
				+    def sample_rate(self):
			
 
				+        return self._sample_rate
			
 
				+
			
 
				+    @property
			
 
				+    def num_channels(self):
			
 
				+        return self._num_channels
			
 
				+
			
 
				+    @property
			
 
				+    def samples_per_channel(self):
			
 
				+        return self._samples_per_channel
			
 
				+
			
 
				+    @property
			
 
				+    def duration(self):
			
 
				+        return self.samples_per_channel / self.sample_rate
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return (
			
 
				+            f"CustomAudioFrame(sample_rate={self.sample_rate}, "
			
 
				+            f"num_channels={self.num_channels}, "
			
 
				+            f"samples_per_channel={self.samples_per_channel}, "
			
 
				+            f"duration={self.duration:.3f})"
			
 
				+        )
			
 
				+
			
 
				+
			
 
				 class FishE2EEventType(Enum):
			
 
				     SPEECH_SEGMENT = 1
			
 
				     TEXT_SEGMENT = 2
			
@@ -36,7 +78,7 @@ class FishE2EEventType(Enum):
 
				 @dataclass
			
 
				 class FishE2EEvent:
			
 
				     type: FishE2EEventType
			
 
				-    frame: rtc.AudioFrame = None
			
 
				+    frame: np.ndarray = None
			
 
				     text: str = None
			
 
				     vq_codes: list[list[int]] = None
			
 
				 
			
@@ -81,7 +123,7 @@ class FishE2EAgent:
 
				         user_audio_data: np.ndarray | None,
			
 
				         sample_rate: int,
			
 
				         num_channels: int,
			
 
				-        chat_ctx: ChatContext | None = None,
			
 
				+        chat_ctx: dict | None = None,
			
 
				     ) -> AsyncGenerator[bytes, None]:
			
 
				 
			
 
				         if system_audio_data is not None:
			
@@ -163,7 +205,7 @@ class FishE2EAgent:
 
				             audio_data = np.frombuffer(decode_data["audios"][0], dtype=np.float16)
			
 
				             audio_data = (audio_data * 32768).astype(np.int16).tobytes()
			
 
				 
			
 
				-            audio_frame = rtc.AudioFrame(
			
 
				+            audio_frame = CustomAudioFrame(
			
 
				                 data=audio_data,
			
 
				                 samples_per_channel=len(audio_data) // 2,
			
 
				                 sample_rate=44100,