# base_chunk.py
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List
  3. @dataclass
  4. class Chunk:
  5. chunk_id: int
  6. doc_id: str
  7. text: str
  8. tokens: int
  9. topic: str = ""
  10. domain: str = ""
  11. task_type: str = ""
  12. topic_purity: float = 1.0
  13. summary: str = ""
  14. keywords: List[str] = field(default_factory=list)
  15. concepts: List[str] = field(default_factory=list)
  16. questions: List[str] = field(default_factory=list)
  17. @dataclass
  18. class ChunkerConfig:
  19. target_tokens: int = 256
  20. boundary_threshold: float = 0.8
  21. min_sent_per_chunk: int = 3
  22. max_sent_per_chunk: int = 10
  23. enable_adaptive_boundary: bool = True
  24. enable_kg: bool = True
  25. topic_purity_floor: float = 0.8
  26. kg_topk: int = 3