from typing import List

from applications.config import Chunk
from applications.api import fetch_deepseek_completion


class LLMClassifier:
    """Enrich text chunks with metadata extracted by an LLM.

    The classifier builds a Chinese-language instruction prompt asking the
    model for a JSON object (topic, summary, domain, task type, keywords,
    concepts, questions, entities) and copies those fields onto a new
    ``Chunk``.
    """

    @staticmethod
    def generate_prompt(chunk_text: str) -> str:
        """Return the full analysis prompt for *chunk_text*.

        The prompt consists of the fixed instruction block (field
        descriptions, the allowed ``domain`` values and a JSON example)
        followed by the chunk text on its own line.

        Args:
            chunk_text: The text to be analysed; appended verbatim.

        Returns:
            The instruction block with surrounding whitespace stripped,
            a newline, and then ``chunk_text``.
        """
        # The literal is kept flush-left so that no indentation leaks into
        # the text that is sent to the model.
        raw_prompt = """
你是一个文本分析助手。
请严格按照以下要求分析我提供的文本,并输出 **JSON 格式**结果:

### 输出字段说明
1. **topic**:一句话概括文本主题
2. **summary**:50字以内简要说明文本内容
3. **domain**:从下列枚举表中选择一个最合适的领域(必须严格选取一个,不能生成新词)
   - ["AI 技术","机器学习","自然语言处理","计算机视觉","知识图谱","数据科学","软件工程","数据库","云计算","网络安全","区块链","量子计算",
      "数学","物理","化学","生物","医学","心理学","教育",
      "金融","会计","经济学","管理学","市场营销","投资/基金",
      "法律","政治","社会学","历史","哲学","语言学","文学","艺术",
      "体育","娱乐","军事","环境科学","地理","其他"]
4. **task_type**:文本主要任务类型(如:解释、教学、动作描述、方法提出)
5. **keywords**:不超过 3 个,偏向外部检索用标签(概括性强,利于搜索)
6. **concepts**:不超过 3 个,偏向内部知识点(技术/学术内涵,和 keywords 明显区分)
7. **questions**:文本中显式或隐含的问题(无则返回空数组)
8. **entities**:文本中出现的命名实体(如人名、地名、机构名、系统名、模型名等,无则返回空数组)

### 输出格式示例
```json
{
  "topic": "RAG 技术与主题感知分块",
  "summary": "介绍RAG在复杂问答中的应用,并提出分块方法。",
  "domain": "自然语言处理",
  "task_type": "方法提出",
  "keywords": ["RAG", "文本分块", "问答系统"],
  "concepts": ["检索增强生成", "语义边界检测", "主题感知分块"],
  "questions": ["如何优化RAG在问答场景中的效果?"],
  "entities": ["RAG"]
}
```

下面是文本:
"""
        # Close the example's code fence (above) and put the chunk on its own
        # line so the model can tell the instructions apart from the payload.
        return raw_prompt.strip() + "\n" + chunk_text

    async def classify_chunk(self, chunk: Chunk) -> Chunk:
        """Ask the LLM to analyse *chunk* and return an annotated copy.

        Args:
            chunk: The chunk to classify. Its ``text`` is stripped of
                surrounding whitespace before being sent to the model.

        Returns:
            A new ``Chunk`` that carries over ``chunk_id``, ``doc_id``,
            ``tokens``, ``topic_purity`` and ``dataset_id`` from the input,
            uses the stripped text, and is populated with the fields read
            from the model response. Scalar fields missing from the response
            are ``None``; list fields that are missing or null become ``[]``.
        """
        text = chunk.text.strip()
        prompt = self.generate_prompt(text)

        # NOTE(review): assumes fetch_deepseek_completion returns a dict when
        # output_type="json" -- confirm against the helper; a non-dict result
        # would fail on the .get() calls below.
        response = await fetch_deepseek_completion(
            model="DeepSeek-V3", prompt=prompt, output_type="json"
        )

        return Chunk(
            chunk_id=chunk.chunk_id,
            doc_id=chunk.doc_id,
            text=text,
            tokens=chunk.tokens,
            topic_purity=chunk.topic_purity,
            dataset_id=chunk.dataset_id,
            summary=response.get("summary"),
            topic=response.get("topic"),
            domain=response.get("domain"),
            task_type=response.get("task_type"),
            # `or []` rather than a .get() default: the default only applies
            # when the key is absent, but the model may emit an explicit
            # null, which would otherwise put None into a list field.
            concepts=response.get("concepts") or [],
            keywords=response.get("keywords") or [],
            questions=response.get("questions") or [],
            entities=response.get("entities") or [],
        )