from typing import List, Dict, Any
from dataclasses import dataclass, field, asdict


@dataclass
class Chunk:
    """Record holding one chunk of text together with its annotations.

    The first five fields are required and positional; every other field
    has a default.  This module only declares the fields -- it does not
    define how they are computed, so the notes below that go beyond the
    declared type and default are assumptions to be confirmed against the
    code that populates the record.
    """

    # --- required fields (no defaults) ---
    chunk_id: int
    doc_id: str
    text: str
    tokens: int  # presumably the token count of `text` -- TODO confirm
    dataset_id: int

    # --- optional labels; an empty string is the default ---
    topic: str = ""
    domain: str = ""
    task_type: str = ""

    topic_purity: float = 1.0
    text_type: int = 1  # integer code; its meaning is not defined here
    summary: str = ""
    status: int = 1  # integer code; its meaning is not defined here

    # List fields use default_factory so each instance gets its own list
    # rather than sharing a single mutable default across instances.
    keywords: List[str] = field(default_factory=list)
    concepts: List[str] = field(default_factory=list)
    questions: List[str] = field(default_factory=list)
    entities: List[str] = field(default_factory=list)


@dataclass
class ChunkerConfig:
    """Settings bundle for a chunker; every field has a default.

    No validation is performed on construction, so callers are responsible
    for passing mutually consistent values (for example, a minimum that
    does not exceed the matching maximum).  How each setting is applied is
    decided by the consuming code, which is not part of this definition.
    """

    # Token-size settings.  The defaults satisfy min <= target <= max.
    target_tokens: int = 256
    max_tokens: int = 2048
    min_tokens: int = 64

    boundary_threshold: float = 0.8

    # Sentence-count settings per chunk.
    min_sent_per_chunk: int = 3
    max_sent_per_chunk: int = 10

    # Feature switches, both on by default.
    enable_adaptive_boundary: bool = True
    enable_kg: bool = True  # presumably "knowledge graph" -- TODO confirm

    topic_purity_floor: float = 0.8
    kg_topk: int = 3