12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- from typing import List
- from applications.config import Chunk
- from applications.api import fetch_deepseek_completion
class LLMClassifier:
    """Label text chunks with LLM-generated metadata.

    Each chunk's text is embedded in a fixed Chinese-language instruction
    prompt that asks the model for a JSON object with the keys ``topic``,
    ``summary``, ``domain``, ``task_type``, ``keywords``, ``concepts`` and
    ``questions``. The reply is copied onto a new ``Chunk``.
    """

    # Keys of the model reply that are passed through as single values.
    _SCALAR_FIELDS = ("summary", "topic", "domain", "task_type")
    # Keys of the model reply that fall back to an empty list when missing.
    _LIST_FIELDS = ("concepts", "keywords", "questions")

    # Instruction text sent ahead of the chunk text. This is runtime data
    # consumed by the model, so it is kept exactly as authored.
    # NOTE(review): "显示" in item 7 looks like a typo for "显式" (the same
    # line uses "显式" later); left unchanged to keep the prompt stable.
    _PROMPT_HEADER = "\n".join(
        (
            "你是一个文本分析助手。",
            "我会给你一段文本,请你输出以下信息:",
            "1. **主题标签 (topic)**:一句话概括文本主题",
            "2. **关键词 (keywords)**:3-5 个,便于检索",
            "3. **摘要 (summary)**:50字以内简要说明",
            "4. **领域 (domain)**:该文本所属领域(如:AI 技术、体育、金融)",
            "5. **任务类型 (task_type)**:文本主要任务类型(如:解释、教学、动作描述、方法提出)",
            "6. **核心知识点 (concepts)**:涉及的核心知识点或概念",
            "7. **显示/隐式问题 (questions)**:文本中隐含或显式的问题",
            "请用 JSON 格式输出,例如:",
            "{",
            '"topic": "RAG 技术与分块策略",',
            '"summary": "介绍RAG技术并提出主题感知的分块方法。",',
            '"domain": "AI 技术",',
            '"task_type": "方法提出",',
            '"keywords": ["RAG", "检索增强", "文本分块", "知识图谱"],',
            '"concepts": ["RAG", "文本分块", "知识图谱"],',
            '"questions": ["如何提升RAG的检索效果?"]',
            "}",
            "下面是文本:",
        )
    )

    @staticmethod
    def generate_prompt(chunk_text: str) -> str:
        """Return the classification prompt for *chunk_text*.

        The chunk text is appended directly after the instruction header,
        with no separator in between.
        """
        return LLMClassifier._PROMPT_HEADER + chunk_text

    @staticmethod
    def _normalize_response(response: object) -> dict:
        """Turn a raw model reply into keyword arguments for ``Chunk``.

        A reply that is not a dict (for example ``None`` or a JSON array) is
        treated as an empty reply instead of raising ``AttributeError``.
        Scalar fields default to ``None``. List fields default to a fresh
        empty list, including when the model sent an explicit ``null``.
        """
        data = response if isinstance(response, dict) else {}
        labels = {name: data.get(name) for name in LLMClassifier._SCALAR_FIELDS}
        for name in LLMClassifier._LIST_FIELDS:
            # `or []` also covers an explicit JSON null, which
            # `dict.get(name, [])` would pass through as None.
            labels[name] = data.get(name) or []
        return labels

    async def classify_chunk(self, chunk: Chunk) -> Chunk:
        """Classify one chunk and return a new, labelled ``Chunk``.

        The input chunk is not modified. The returned chunk carries the
        stripped text, the original ids, ``tokens`` and ``topic_purity``,
        plus the fields extracted from the model reply.
        """
        text = chunk.text.strip()
        response = await fetch_deepseek_completion(
            model="DeepSeek-V3",
            prompt=self.generate_prompt(text),
            output_type="json",
        )
        return Chunk(
            chunk_id=chunk.chunk_id,
            doc_id=chunk.doc_id,
            text=text,
            tokens=chunk.tokens,
            topic_purity=chunk.topic_purity,
            **self._normalize_response(response),
        )

    async def classify_chunk_by_topic(self, chunk_list: List[Chunk]) -> List[Chunk]:
        """Classify every chunk in *chunk_list*, preserving input order.

        Chunks are processed one at a time: each request is awaited before
        the next one is issued. An empty input yields an empty list.
        """
        return [await self.classify_chunk(chunk) for chunk in chunk_list]
|