# Server / rag_server
from typing import List

from applications.config import Chunk
from applications.api import fetch_deepseek_completion


class LLMClassifier:
    """Annotate text chunks with metadata produced by an LLM.

    A chunk's text is appended to a fixed analysis prompt, sent to the
    completion API, and the fields of the JSON reply are copied onto a new
    ``Chunk`` alongside the identifying fields of the input chunk.
    """

    # Instruction text placed in front of every chunk. This is runtime data
    # for the model, so it is kept verbatim (including trailing spaces).
    _PROMPT_HEADER = """
你是一个文本分析助手。  
我会给你一段文本，请你输出以下信息：  
1. **主题标签 (topic)**：一句话概括文本主题  
2. **关键词 (keywords)**：3-5 个，便于检索  
3. **摘要 (summary)**：50字以内简要说明  
4. **领域 (domain)**：该文本所属领域（如：AI 技术、体育、金融）
5. **任务类型 (task_type)**：文本主要任务类型（如：解释、教学、动作描述、方法提出）  
6. **核心知识点 (concepts)**：涉及的核心知识点或概念  
7. **显示/隐式问题 (questions)**：文本中隐含或显式的问题  

请用 JSON 格式输出，例如：
{
    "topic": "RAG 技术与分块策略",
    "summary": "介绍RAG技术并提出主题感知的分块方法。", 
    "domain": "AI 技术",
    "task_type": "方法提出",
    "keywords": ["RAG", "检索增强", "文本分块", "知识图谱"],
    "concepts": ["RAG", "文本分块", "知识图谱"],
    "questions": ["如何提升RAG的检索效果？"]
}

下面是文本：
""".strip()

    @staticmethod
    def generate_prompt(chunk_text: str) -> str:
        """Return the analysis prompt with *chunk_text* appended.

        Args:
            chunk_text: The text the model should analyse. It is appended
                directly after the instruction header, with no separator.

        Returns:
            The full prompt string to send to the model.
        """
        return LLMClassifier._PROMPT_HEADER + chunk_text

    @staticmethod
    def _as_list(value) -> list:
        """Coerce a JSON field that should be a list into an actual list.

        ``dict.get(key, [])`` only applies its default when the key is
        missing; a reply containing ``"keywords": null`` would still yield
        ``None``. This helper also wraps a bare scalar so list-typed chunk
        fields never receive a string.
        """
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, str):
            # An empty string carries no information; treat it like null.
            return [value] if value else []
        return [value]

    async def classify_chunk(self, chunk: Chunk) -> Chunk:
        """Ask the model to describe one chunk and return the enriched copy.

        Args:
            chunk: The chunk to analyse. Its text is stripped of surrounding
                whitespace before being sent, and the stripped text is what
                the returned chunk carries.

        Returns:
            A new ``Chunk`` with ``chunk_id``, ``doc_id``, ``tokens`` and
            ``topic_purity`` copied from the input, and ``summary``,
            ``topic``, ``domain``, ``task_type``, ``concepts``, ``keywords``
            and ``questions`` taken from the model reply. Missing scalar
            fields become ``None``; missing or null list fields become ``[]``.
        """
        text = chunk.text.strip()
        # NOTE(review): assumes the API helper returns a dict-like object
        # when output_type="json" -- confirm against its implementation.
        response = await fetch_deepseek_completion(
            model="DeepSeek-V3",
            prompt=self.generate_prompt(text),
            output_type="json",
        )
        return Chunk(
            chunk_id=chunk.chunk_id,
            doc_id=chunk.doc_id,
            text=text,
            tokens=chunk.tokens,
            topic_purity=chunk.topic_purity,
            summary=response.get("summary"),
            topic=response.get("topic"),
            domain=response.get("domain"),
            task_type=response.get("task_type"),
            concepts=self._as_list(response.get("concepts")),
            keywords=self._as_list(response.get("keywords")),
            questions=self._as_list(response.get("questions")),
        )

    async def classify_chunk_by_topic(self, chunk_list: List[Chunk]) -> List[Chunk]:
        """Classify every chunk in *chunk_list*, preserving input order.

        Chunks are processed one at a time: each request is awaited before
        the next one is issued.

        Args:
            chunk_list: The chunks to analyse.

        Returns:
            The enriched chunks, in the same order as the input.
        """
        return [await self.classify_chunk(chunk) for chunk in chunk_list]