123456789101112131415161718192021222324252627282930 |
- from typing import List, Dict, Any
- from dataclasses import dataclass, field, asdict
@dataclass
class Chunk:
    """A single chunk of document text together with its annotations.

    The first four fields are required and must be supplied positionally or
    by keyword in the order declared; every other field has a default so a
    chunk can be created first and annotated later.

    NOTE(review): the per-field descriptions below are inferred from the
    field names only -- confirm them against the code that fills them in.
    """

    # --- required identity / content fields (no defaults) ---
    chunk_id: int   # presumably the chunk's index or identifier
    doc_id: str     # presumably the identifier of the source document
    text: str       # the chunk's text
    tokens: int     # presumably the token count of ``text``

    # --- optional classification labels; default to the empty string ---
    topic: str = ""
    domain: str = ""
    task_type: str = ""

    # Defaults to 1.0; the valid range is not established by this file.
    topic_purity: float = 1.0

    summary: str = ""

    # List-valued annotations. ``default_factory=list`` gives every instance
    # its own fresh list, so appending to one chunk never affects another.
    keywords: List[str] = field(default_factory=list)
    concepts: List[str] = field(default_factory=list)
    questions: List[str] = field(default_factory=list)
    entities: List[str] = field(default_factory=list)
@dataclass
class ChunkerConfig:
    """Tunable settings for the chunker; every field has a default.

    ``ChunkerConfig()`` yields the default configuration, and individual
    settings can be overridden by keyword (or positionally, in the order
    declared below).

    NOTE(review): how each setting is consumed is not visible in this part
    of the file; the groupings below are inferred from the field names and
    should be confirmed against the chunking code.
    """

    # --- size / boundary settings ---
    target_tokens: int = 256
    boundary_threshold: float = 0.8
    min_sent_per_chunk: int = 3
    max_sent_per_chunk: int = 10

    # --- feature switches (both on by default) ---
    enable_adaptive_boundary: bool = True
    enable_kg: bool = True

    # --- remaining thresholds / limits ---
    topic_purity_floor: float = 0.8
    kg_topk: int = 3
|