# base_chunk.py (912 B)
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any
  3. @dataclass
  4. class Chunk:
  5. chunk_id: int
  6. doc_id: str
  7. text: str
  8. tokens: int
  9. dataset_id: int
  10. topic: str = ""
  11. domain: str = ""
  12. task_type: str = ""
  13. topic_purity: float = 1.0
  14. text_type: int = 1
  15. summary: str = ""
  16. status: int = 1
  17. keywords: List[str] = field(default_factory=list)
  18. concepts: List[str] = field(default_factory=list)
  19. questions: List[str] = field(default_factory=list)
  20. entities: List[str] = field(default_factory=list)
  21. @dataclass
  22. class ChunkerConfig:
  23. target_tokens: int = 256
  24. max_tokens: int = 512
  25. min_tokens: int = 64
  26. boundary_threshold: float = 0.8
  27. min_sent_per_chunk: int = 3
  28. max_sent_per_chunk: int = 10
  29. enable_adaptive_boundary: bool = True
  30. enable_kg: bool = True
  31. topic_purity_floor: float = 0.8
  32. kg_topk: int = 3