|
@@ -0,0 +1,160 @@
|
|
|
|
|
+from typing import List, Dict, Optional, Any
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def map_prompt(question: str, format_context: str) -> str:
    """Build the Map-step user prompt for a single document chunk.

    The prompt asks the model to extract citable fact points that relate
    directly to ``question`` from one formatted chunk and to answer with
    JSON only.

    Args:
        question: The user question, interpolated verbatim into the prompt.
        format_context: One already-formatted chunk, interpolated verbatim.

    Returns:
        The prompt text. The example object shown to the model is valid
        JSON (no inline comments, no trailing commas, all strings closed)
        so the model is not nudged towards producing unparseable output.
    """
    # Literal braces of the JSON example are doubled because this is an
    # f-string; only {question} and {format_context} are substituted.
    return f"""
【任务说明】:针对下方单个文档片段,提炼与“问题”直接相关的‘可被引用的事实要点’
【问题】: {question}
【输入】:{format_context}
【输出】:输出只输出 JSON,不需要多余问题,输出结果如下:
    {{
        "id": "输入 text 中的 DOC id",
        "claims": [
            {{"point": "事实要点1(尽量原文转述/精准改写)"}},
            {{"point": "事实要点2"}}
        ],
        "conflicts_or_limits": ["该片段的限制/含糊点(如时间、定义口径、版本号等)"]
    }}
    """
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def reduce_prompt(question: str, mapped_results_json_list: str) -> str:
    """Build the Reduce-step user prompt that merges several Map results.

    The prompt instructs the model to deduplicate/merge equivalent points,
    flag conflicts, and produce a final Markdown answer with citations.

    Args:
        question: The user question, interpolated verbatim into the prompt.
        mapped_results_json_list: The collected Map outputs, interpolated
            verbatim (it is rendered with ``str()`` semantics by the
            f-string, so pass already-serialized text).

    Returns:
        The prompt text.
    """
    # The numbered task items previously carried stray leading double
    # quotes (left over from a string-concatenation version); they are
    # plain list items now.
    return f"""
【任务】:合并多份 Map 结果,完成以下三点:
    1) 去重并合并同义要点;
    2) 标注并归纳冲突点;
    3) 输出最终回答(含引用)。
【问题】:{question}
【Map 结果】:{mapped_results_json_list}
【输出(Markdown)】:
    - 简要结论(2-4句)
    - 关键要点(每点附主要引用,如 [C12] [C4], 用 map 结果中的 id来表示)
    - 证据信息不一致(如有):列出冲突内容、涉及的 doc_id、可能原因
    - 信息缺口(如有)
"""
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def verify_prompt(question: str, draft: str, formatted_contexts: str) -> str:
    """Build the verification prompt that checks a draft answer claim by claim.

    The prompt asks the model to label each key claim of ``draft`` as
    supported / contradicted / insufficient using only the given contexts,
    and to return JSON only.

    Args:
        question: The user question, interpolated verbatim.
        draft: The draft answer to verify, interpolated verbatim.
        formatted_contexts: The already-formatted evidence text,
            interpolated verbatim.

    Returns:
        The prompt text, ending with a JSON template for the expected
        output.
    """
    # Every literal brace of the JSON template must be doubled inside an
    # f-string; single braces would be parsed as replacement fields and
    # break the function.
    return f"""
【任务】: 对“初稿答案”的关键断言逐条核验,严格限定仅使用上下文,请标注每条断言是否被证据‘支持/相矛盾/证据不足’,必要时修正结论
【问题】
    {question}
【初稿答案】
    {draft}
【上下文】
    {formatted_contexts}
【输出(只输出 JSON)】
{{
    "verdicts": [
        {{
            "claim": "断言内容", "status": "supported|contradicted|insufficient", "citations": ["[Cid]"]}}
    ],
    "final_answer": "(如需修正,请给出修正后的简明答案,并附引用)"
}}
"""
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def build_rag_prompt(
    question: str,
    contexts: List[Dict],
    mode: str = "single",  # one of: single | map | reduce | rerank | verify | map_reduce
    max_chars_per_chunk: int = 800,
    draft_answer: Optional[str] = None,
    mapped_results_json_list: Optional[str] = None,
) -> Dict[str, Any]:
    """Generate the prompts needed for the aggregation stage of a RAG pipeline.

    Args:
        question: The user question.
        contexts: Retrieved chunks. Each dict must provide ``"id"`` and
            ``"content"``; an optional ``"score"`` is shown rounded to four
            decimals (a score of 0 is shown too).
        mode: Which prompt(s) to build, see below.
        max_chars_per_chunk: Maximum characters kept per chunk; longer
            content is cut and terminated with an ellipsis character.
        draft_answer: Draft to check in ``verify`` mode; a placeholder text
            is used when it is missing or empty.
        mapped_results_json_list: Collected Map outputs for ``reduce`` /
            ``map_reduce``; a placeholder text is used when it is missing
            or empty, so the result can serve as a template.

    Returns:
        A dict whose shape depends on ``mode``:

        - single:     {"system": str, "user": str}
        - map:        {"system": str, "user_list": List[str]}  (one per chunk)
        - reduce:     {"system": str, "user": str}
        - rerank:     {"system": str, "user": str}
        - verify:     {"system": str, "user": str}
        - map_reduce: {"map": <map result>, "reduce": <reduce result>}

    Raises:
        ValueError: If ``mode`` is not one of the supported values.

    Usage:
        1) Single-step aggregation: send the returned system/user to the LLM.
        2) Map-Reduce: call the LLM once per entry of ``user_list`` to get
           JSON, then feed the merged JSON list to the reduce prompt.
    """

    # ---- helpers ----
    def _trim(text: str, limit: int) -> str:
        # Collapse newlines so each chunk body stays on one line.
        text = text.strip().replace("\n", " ")
        return text if len(text) <= limit else text[: max(0, limit - 1)] + "…"

    def _format_each_chunk(chunk: Dict) -> str:
        bits = [f"DOC id={chunk['id']}"]
        score = chunk.get("score")
        # Compare against None so a legitimate score of 0 is not dropped.
        if score is not None:
            bits.append(f"score={round(score, 4)}")
        prefix = "[" + " ".join(bits) + "]"
        return f"{prefix}\n{_trim(chunk['content'], max_chars_per_chunk)}\n"

    def _format_contexts(chunks: List[Dict]) -> str:
        return "\n".join(_format_each_chunk(c) for c in chunks).strip()

    # ---- shared system constraints (sent to the model in Chinese) ----
    system_text = (
        "你是一位“基于证据的助手”。你必须只使用我提供的【上下文】来回答:\n"
        "- 不得使用外部常识或臆测;\n"
        "- 若上下文不足,请明确输出“信息不足”,并指出缺失的信息类型;\n"
        "- 对关键结论附 [C{id}] 形式的出处;\n"
        "- 如存在冲突证据,请列出冲突并给出谨慎结论与采信依据;\n"
        "- 用中文回答,保持简洁、结构化。"
    )

    def _build_map() -> Dict[str, Any]:
        # Map step: one prompt per chunk, extracting citable fact points.
        user_list = [
            map_prompt(question, _format_each_chunk(chunk)) for chunk in contexts
        ]
        return {"system": system_text, "user_list": user_list}

    def _build_reduce() -> Dict[str, Any]:
        # Reduce step: aggregate the Map outputs. Fall back to a placeholder
        # instead of rendering the literal text "None" into the prompt.
        mapped = mapped_results_json_list or "(此处为 Map 结果的 JSON 列表)"
        return {"system": system_text, "user": reduce_prompt(question, mapped)}

    if mode == "single":
        user_text = (
            f"【问题】\n{question}\n\n"
            f"【上下文(已按相关性排序)】\n{_format_contexts(contexts)}\n\n"
            "【请按以下结构作答】\n"
            "1) 简要结论(2-4句)\n"
            "2) 关键要点(每点附1-2个引用,如 [C3])\n"
            "3) 证据信息不一致(如有)\n"
            "4) 信息缺口(如有)"
        )
        return {"system": system_text, "user": user_text}

    if mode == "map":
        return _build_map()

    if mode == "reduce":
        return _build_reduce()

    if mode == "map_reduce":
        # Combined skeleton: both stages in one result.
        return {"map": _build_map(), "reduce": _build_reduce()}

    if mode == "rerank":
        # Self-rerank (the "Rank" of Rank-Then-Read): score and sort only.
        rerank_system = "你是一位严谨的重排器。请只输出 JSON。"
        rerank_user = (
            f"请比较下列候选段与问题“{question}”的相关性,仅打分并排序(不做总结)。\n"
            "评分标准(由高到低):直接回答性 > 主题一致性 > 细节重合度 > 时间匹配。\n\n"
            f"【候选段】\n{_format_contexts(contexts)}\n\n"
            "【只输出 JSON,格式如下(按 score 从高到低)】\n"
            '[{"id":"DOC_ID","score":X.X}]'
        )
        return {"system": rerank_system, "user": rerank_user}

    if mode == "verify":
        # Chain-of-Verification: check the draft answer claim by claim.
        draft = draft_answer or "(此处为初稿答案)"
        verify_user = verify_prompt(question, draft, _format_contexts(contexts))
        return {"system": system_text, "user": verify_user}

    raise ValueError(f"不支持的模式:{mode}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+__all__ = ["build_rag_prompt"]
|