commons.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536
  1. from typing import Annotated, Literal, Optional
  2. from pydantic import BaseModel, Field, conint
  3. class ServeReferenceAudio(BaseModel):
  4. audio: bytes
  5. text: str
  6. class ServeTTSRequest(BaseModel):
  7. text: str
  8. chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
  9. # Audio format
  10. format: Literal["wav", "pcm", "mp3"] = "wav"
  11. mp3_bitrate: Literal[64, 128, 192] = 128
  12. # References audios for in-context learning
  13. references: list[ServeReferenceAudio] = []
  14. # Reference id
  15. # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
  16. # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
  17. reference_id: str | None = None
  18. seed: int | None = None
  19. use_memory_cache: Literal["on-demand", "never"] = "never"
  20. # Normalize text for en & zh, this increase stability for numbers
  21. normalize: bool = True
  22. mp3_bitrate: Optional[int] = 64
  23. opus_bitrate: Optional[int] = -1000
  24. # Balance mode will reduce latency to 300ms, but may decrease stability
  25. latency: Literal["normal", "balanced"] = "normal"
  26. # not usually used below
  27. streaming: bool = False
  28. max_new_tokens: int = 1024
  29. top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
  30. repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
  31. temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7