# sug_v6_1_2_5.py

import asyncio
import json
import os
import argparse
from datetime import datetime

from pydantic import BaseModel, Field

from agents import Agent, Runner
from lib.my_trace import set_trace
from lib.utils import read_file_as_string
from lib.client import get_model
from script.search_recommendations.xiaohongshu_search_recommendations import XiaohongshuSearchRecommendations
from script.search.xiaohongshu_search import XiaohongshuSearch

MODEL_NAME = "google/gemini-2.5-flash"
# ============================================================================
# Data models
# ============================================================================

class QueryState(BaseModel):
    """Tracks the state of a single query."""
    query: str
    level: int  # current level in the query tree
    no_suggestion_rounds: int = 0  # consecutive rounds without suggestions
    relevance_score: float = 0.0  # relevance to the original need
    parent_query: str | None = None  # parent query
    strategy: str | None = None  # generation strategy: direct_sug, rewrite, add_word
    is_terminated: bool = False  # terminated (no longer processed)

class WordLibrary(BaseModel):
    """Dynamic word library built from query segmentation."""
    words: set[str] = Field(default_factory=set)
    word_sources: dict[str, str] = Field(default_factory=dict)  # word -> source (note_id or "initial")

    def add_word(self, word: str, source: str = "unknown"):
        """Add a single word to the library."""
        if word and word.strip():
            word = word.strip()
            self.words.add(word)
            if word not in self.word_sources:
                self.word_sources[word] = source

    def add_words(self, words: list[str], source: str = "unknown"):
        """Add words in batch."""
        for word in words:
            self.add_word(word, source)

    def get_unused_word(self, current_query: str) -> str | None:
        """Return a word from the library that does not appear in the current query."""
        for word in self.words:
            if word not in current_query:
                return word
        return None

    def model_dump(self):
        """Serialize to a JSON-friendly dict (sets are not JSON serializable)."""
        return {
            "words": list(self.words),
            "word_sources": self.word_sources,
        }
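
# A minimal usage sketch for WordLibrary (values are illustrative, not from a
# real run):
#
#   lib = WordLibrary()
#   lib.add_words(["视频", "剪辑", "教程"], source="initial")
#   lib.get_unused_word("视频剪辑")  # -> "教程" (a word absent from the query)
#   lib.model_dump()                # -> {"words": [...], "word_sources": {...}}
#
# Note that get_unused_word does substring matching and iterates over a set, so
# which unused word comes back is not deterministic across runs.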

class RunContext(BaseModel):
    """Run context shared across the whole pipeline."""
    version: str
    input_files: dict[str, str]
    q_with_context: str
    q_context: str
    q: str
    log_url: str
    log_dir: str
    # New fields
    word_library: dict = Field(default_factory=dict)  # stored as dict because sets are not JSON serializable
    query_states: list[dict] = Field(default_factory=list)
    steps: list[dict] = Field(default_factory=list)
    # Query evolution graph
    query_graph: dict = Field(default_factory=dict)  # records query evolution paths and relations
    # Final results
    satisfied_notes: list[dict] = Field(default_factory=list)
    final_output: str | None = None

# ============================================================================
# Agent definitions
# ============================================================================

# Agent 1: word segmentation expert
class WordSegmentation(BaseModel):
    """Word segmentation result."""
    words: list[str] = Field(..., description="分词结果列表")
    reasoning: str = Field(..., description="分词理由")


word_segmentation_instructions = """
你是分词专家。给定一个query,将其拆分成有意义的最小单元。
## 分词原则
1. 保留有搜索意义的词汇
2. 拆分成独立的概念
3. 保留专业术语的完整性
4. 去除虚词(的、吗、呢等)
## 输出要求
返回分词列表和分词理由。
""".strip()

word_segmenter = Agent[None](
    name="分词专家",
    instructions=word_segmentation_instructions,
    model=get_model(MODEL_NAME),
    output_type=WordSegmentation,
)
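
# Every agent below is invoked through the same Runner.run pattern used later
# in this file; a minimal sketch (the input string is illustrative):
#
#   result = await Runner.run(word_segmenter, "视频剪辑教程")
#   seg: WordSegmentation = result.final_output  # typed per output_type
#   print(seg.words, seg.reasoning)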

# Agent 2: query relevance evaluator
class RelevanceEvaluation(BaseModel):
    """Relevance evaluation result."""
    relevance_score: float = Field(..., description="相关性分数 0-1")
    is_improved: bool = Field(..., description="是否比之前更好")
    reason: str = Field(..., description="评估理由")


relevance_evaluation_instructions = """
你是Query相关度评估专家。
## 任务
评估当前query与原始需求的匹配程度。
## 评估标准
- 主题相关性
- 要素覆盖度
- 意图匹配度
## 输出
- relevance_score: 0-1的相关性分数
- is_improved: 如果提供了previous_score,判断是否有提升
- reason: 详细理由
""".strip()

relevance_evaluator = Agent[None](
    name="Query相关度评估专家",
    instructions=relevance_evaluation_instructions,
    model=get_model(MODEL_NAME),
    output_type=RelevanceEvaluation,
)

# Agent 3: query rewriter
class QueryRewrite(BaseModel):
    """Query rewrite result."""
    rewritten_query: str = Field(..., description="改写后的query")
    rewrite_type: str = Field(..., description="改写类型:abstract或synonym")
    reasoning: str = Field(..., description="改写理由")


query_rewrite_instructions = """
你是Query改写专家。
## 改写策略
1. **向上抽象**:将具体概念泛化到更高层次
- 例:iPhone 13 → 智能手机
2. **同义改写**:使用同义词或相关表达
- 例:购买 → 入手、获取
## 输出要求
返回改写后的query、改写类型和理由。
""".strip()

query_rewriter = Agent[None](
    name="Query改写专家",
    instructions=query_rewrite_instructions,
    model=get_model(MODEL_NAME),
    output_type=QueryRewrite,
)

# Agent 4: word-insertion position evaluator
class WordInsertion(BaseModel):
    """Word insertion result."""
    new_query: str = Field(..., description="加词后的新query")
    insertion_position: str = Field(..., description="插入位置描述")
    reasoning: str = Field(..., description="插入理由")


word_insertion_instructions = """
你是加词位置评估专家。
## 任务
将新词加到当前query的最合适位置,保持语义通顺。
## 原则
1. 保持语法正确
2. 语义连贯
3. 符合搜索习惯
## 输出
返回新query、插入位置描述和理由。
""".strip()

word_inserter = Agent[None](
    name="加词位置评估专家",
    instructions=word_insertion_instructions,
    model=get_model(MODEL_NAME),
    output_type=WordInsertion,
)

# Agent 5: result match evaluator
class ResultEvaluation(BaseModel):
    """Result evaluation."""
    match_level: str = Field(..., description="匹配等级:satisfied, partial, unsatisfied")
    relevance_score: float = Field(..., description="相关性分数 0-1")
    missing_aspects: list[str] = Field(default_factory=list, description="缺失的方面")
    reason: str = Field(..., description="评估理由")


result_evaluation_instructions = """
你是Result匹配度评估专家。
## 任务
评估搜索结果(帖子)与原始需求的匹配程度。
## 评估等级
1. **satisfied**: 完全满足需求
2. **partial**: 部分满足,但有缺失
3. **unsatisfied**: 基本不满足
## 输出要求
- match_level: 匹配等级
- relevance_score: 相关性分数
- missing_aspects: 如果是partial,列出缺失的方面
- reason: 详细理由
""".strip()

result_evaluator = Agent[None](
    name="Result匹配度评估专家",
    instructions=result_evaluation_instructions,
    model=get_model(MODEL_NAME),
    output_type=ResultEvaluation,
)

# Agent 6: query improver (driven by missing aspects)
class QueryImprovement(BaseModel):
    """Query improvement result."""
    improved_query: str = Field(..., description="改造后的query")
    added_aspects: list[str] = Field(..., description="添加的方面")
    reasoning: str = Field(..., description="改造理由")


query_improvement_instructions = """
你是Query改造专家。
## 任务
根据搜索结果的缺失部分,改造query使其包含这些内容。
## 原则
1. 针对性补充缺失方面
2. 保持query简洁
3. 符合搜索习惯
## 输出
返回改造后的query、添加的方面和理由。
""".strip()

query_improver = Agent[None](
    name="Query改造专家",
    instructions=query_improvement_instructions,
    model=get_model(MODEL_NAME),
    output_type=QueryImprovement,
)

# Agent 7: keyword extractor
class KeywordExtraction(BaseModel):
    """Keyword extraction result."""
    keywords: list[str] = Field(..., description="提取的关键词列表")
    reasoning: str = Field(..., description="提取理由")


keyword_extraction_instructions = """
你是关键词提取专家。
## 任务
从帖子标题和描述中提取核心关键词。
## 提取原则
1. 提取有搜索价值的词汇
2. 去除虚词和通用词
3. 保留专业术语
4. 提取3-10个关键词
## 输出
返回关键词列表和提取理由。
""".strip()

keyword_extractor = Agent[None](
    name="关键词提取专家",
    instructions=keyword_extraction_instructions,
    model=get_model(MODEL_NAME),
    output_type=KeywordExtraction,
)

# ============================================================================
# Helper functions
# ============================================================================

def add_step(context: RunContext, step_name: str, step_type: str, data: dict):
    """Append a step record to the run context."""
    step = {
        "step_number": len(context.steps) + 1,
        "step_name": step_name,
        "step_type": step_type,
        "timestamp": datetime.now().isoformat(),
        "data": data,
    }
    context.steps.append(step)
    return step
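
# Example of a step record appended to context.steps (values illustrative):
#
#   {
#       "step_number": 3,
#       "step_name": "Suggestion分支 - 视频剪辑",
#       "step_type": "suggestion_branch",
#       "timestamp": "2025-01-01T12:00:00",
#       "data": {...}
#   }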

def add_query_to_graph(context: RunContext, query_state: QueryState, iteration: int, evaluation_reason: str = "", is_selected: bool = True, parent_level: int | None = None):
    """Add a query node to the evolution graph.

    Args:
        context: run context
        query_state: query state
        iteration: iteration number
        evaluation_reason: evaluation reason (optional)
        is_selected: whether the query was selected into the work queue (default True)
        parent_level: level of the parent node (used to build parent_id)
    """
    # Node IDs use the "query_level" format
    query_id = f"{query_state.query}_{query_state.level}"
    # Initialize the graph structure
    if "nodes" not in context.query_graph:
        context.query_graph["nodes"] = {}
        context.query_graph["edges"] = []
        context.query_graph["iterations"] = {}
    # Add the query node (type: query)
    context.query_graph["nodes"][query_id] = {
        "type": "query",
        "query": query_state.query,
        "level": query_state.level,
        "relevance_score": query_state.relevance_score,
        "strategy": query_state.strategy,
        "parent_query": query_state.parent_query,
        "iteration": iteration,
        "is_terminated": query_state.is_terminated,
        "no_suggestion_rounds": query_state.no_suggestion_rounds,
        "evaluation_reason": evaluation_reason,  # why it was scored this way
        "is_selected": is_selected,  # whether it entered the work queue
    }
    # Add the parent -> child edge
    if query_state.parent_query and parent_level is not None:
        # Parent node ID: parent_query_parent_level
        parent_id = f"{query_state.parent_query}_{parent_level}"
        if parent_id in context.query_graph["nodes"]:
            context.query_graph["edges"].append({
                "from": parent_id,
                "to": query_id,
                "edge_type": "query_to_query",
                "strategy": query_state.strategy,
                "score_improvement": query_state.relevance_score - context.query_graph["nodes"][parent_id]["relevance_score"],
            })
    # Group by iteration
    if iteration not in context.query_graph["iterations"]:
        context.query_graph["iterations"][iteration] = []
    context.query_graph["iterations"][iteration].append(query_id)
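
# Sketch of the query_graph structure these helpers build up (values are
# illustrative; note IDs come from the search API):
#
#   {
#       "nodes": {
#           "视频剪辑_1": {"type": "query", "level": 1, "relevance_score": 0.8, ...},
#           "<note_id>": {"type": "note", "match_level": "partial", ...}
#       },
#       "edges": [
#           {"from": "视频剪辑_1", "to": "视频剪辑教程_2", "edge_type": "query_to_query", ...},
#           {"from": "视频剪辑_1", "to": "<note_id>", "edge_type": "query_to_note", ...}
#       ],
#       "iterations": {1: ["视频剪辑_1", ...]}
#   }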

def add_note_to_graph(context: RunContext, query: str, query_level: int, note: dict):
    """Add a note node to the evolution graph and link it to its query.

    Args:
        context: run context
        query: query text
        query_level: level of the query
        note: note (post) data
    """
    note_id = note["note_id"]
    # Initialize the graph structure
    if "nodes" not in context.query_graph:
        context.query_graph["nodes"] = {}
        context.query_graph["edges"] = []
        context.query_graph["iterations"] = {}
    # Add the note node (type: note) with its full metadata
    context.query_graph["nodes"][note_id] = {
        "type": "note",
        "note_id": note_id,
        "title": note["title"],
        "desc": note.get("desc", ""),  # full description, not truncated
        "note_url": note.get("note_url", ""),
        "image_list": note.get("image_list", []),  # image list
        "interact_info": note.get("interact_info", {}),  # likes, collects, comments, shares
        "match_level": note["evaluation"]["match_level"],
        "relevance_score": note["evaluation"]["relevance_score"],
        "evaluation_reason": note["evaluation"].get("reason", ""),  # evaluation reason
        "found_by_query": query,
    }
    # Add the Query -> Note edge, using the query_level-format ID
    query_id = f"{query}_{query_level}"
    if query_id in context.query_graph["nodes"]:
        context.query_graph["edges"].append({
            "from": query_id,
            "to": note_id,
            "edge_type": "query_to_note",
            "match_level": note["evaluation"]["match_level"],
            "relevance_score": note["evaluation"]["relevance_score"],
        })

def process_note_data(note: dict) -> dict:
    """Normalize a raw note returned by the search API."""
    note_card = note.get("note_card", {})
    image_list = note_card.get("image_list", [])
    interact_info = note_card.get("interact_info", {})
    user_info = note_card.get("user", {})
    return {
        "note_id": note.get("id", ""),
        "title": note_card.get("display_title", ""),
        "desc": note_card.get("desc", ""),
        "image_list": image_list,
        "interact_info": {
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "shared_count": interact_info.get("shared_count", 0),
        },
        "user": {
            "nickname": user_info.get("nickname", ""),
            "user_id": user_info.get("user_id", ""),
        },
        "type": note_card.get("type", "normal"),
        "note_url": f"https://www.xiaohongshu.com/explore/{note.get('id', '')}",
    }
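
# process_note_data assumes each raw search item looks roughly like the sketch
# below (only the fields read above matter; everything else is ignored):
#
#   {
#       "id": "...",
#       "note_card": {
#           "display_title": "...",
#           "desc": "...",
#           "image_list": [...],
#           "interact_info": {"liked_count": 0, "collected_count": 0, ...},
#           "user": {"nickname": "...", "user_id": "..."},
#           "type": "normal"
#       }
#   }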

# ============================================================================
# Core pipeline functions
# ============================================================================

async def initialize_word_library(original_query: str, context: RunContext) -> WordLibrary:
    """Initialize the word library."""
    print("\n[初始化] 创建分词库...")
    # Segment the query with the word-segmentation agent
    result = await Runner.run(word_segmenter, original_query)
    segmentation: WordSegmentation = result.final_output
    word_lib = WordLibrary()
    word_lib.add_words(segmentation.words, source="initial")
    print(f"初始分词库: {list(word_lib.words)}")
    print(f"分词理由: {segmentation.reasoning}")
    # Persist to the context
    context.word_library = word_lib.model_dump()
    add_step(context, "初始化分词库", "word_library_init", {
        "agent": "分词专家",
        "input": original_query,
        "output": {
            "words": segmentation.words,
            "reasoning": segmentation.reasoning,
        },
        "result": {
            "word_library": list(word_lib.words),
        },
    })
    return word_lib

async def evaluate_query_relevance(
    query: str,
    original_need: str,
    previous_score: float | None = None,
    context: RunContext | None = None,
) -> RelevanceEvaluation:
    """Evaluate how relevant a query is to the original need."""
    eval_input = f"""
<原始需求>
{original_need}
</原始需求>
<当前Query>
{query}
</当前Query>
{"<之前的相关度分数>" + str(previous_score) + "</之前的相关度分数>" if previous_score is not None else ""}
请评估当前query与原始需求的相关度。
"""
    result = await Runner.run(relevance_evaluator, eval_input)
    evaluation: RelevanceEvaluation = result.final_output
    return evaluation

async def process_suggestions(
    query: str,
    query_state: QueryState,
    original_need: str,
    word_lib: WordLibrary,
    context: RunContext,
    xiaohongshu_api: XiaohongshuSearchRecommendations,
    iteration: int,
) -> list[QueryState]:
    """Process the suggestion branch and return the new query states it produces."""
    print(f"\n [Suggestion分支] 处理query: {query}")
    # Collect every agent call made while processing this branch
    agent_calls = []
    # 1. Fetch suggestions
    suggestions = xiaohongshu_api.get_recommendations(keyword=query)
    if not suggestions:
        print(" → 没有获取到suggestion")
        query_state.no_suggestion_rounds += 1
        # Record the step
        add_step(context, f"Suggestion分支 - {query}", "suggestion_branch", {
            "query": query,
            "query_level": query_state.level,
            "suggestions_count": 0,
            "no_suggestion_rounds": query_state.no_suggestion_rounds,
            "new_queries_generated": 0,
        })
        return []
    print(f" → 获取到 {len(suggestions)} 个suggestions")
    query_state.no_suggestion_rounds = 0  # reset the counter
    # 2. Evaluate every suggestion
    new_queries = []
    suggestion_evaluations = []
    for sug in suggestions:  # process all suggestions
        # Score the suggestion against the ORIGINAL need (original_need), not the
        # current query, so generated suggestions stay anchored to the user's core need.
        sug_eval = await evaluate_query_relevance(sug, original_need, query_state.relevance_score, context)
        sug_eval_record = {
            "suggestion": sug,
            "relevance_score": sug_eval.relevance_score,
            "is_improved": sug_eval.is_improved,
            "reason": sug_eval.reason,
        }
        suggestion_evaluations.append(sug_eval_record)
        # Create a query state (every suggestion becomes a query node)
        sug_state = QueryState(
            query=sug,
            level=query_state.level + 1,
            relevance_score=sug_eval.relevance_score,
            parent_query=query,
            strategy="调用sug",
        )
        # Only suggestions that improve on the current query enter the work queue
        is_selected = sug_eval.is_improved and sug_eval.relevance_score > query_state.relevance_score
        # Add every suggestion to the evolution graph (improved or not)
        add_query_to_graph(
            context,
            sug_state,
            iteration,
            evaluation_reason=sug_eval.reason,
            is_selected=is_selected,
            parent_level=query_state.level,  # parent node's level
        )
        if is_selected:
            print(f" ✓ {sug} (分数: {sug_eval.relevance_score:.2f}, 提升: {sug_eval.is_improved})")
            new_queries.append(sug_state)
        else:
            print(f" ✗ {sug} (分数: {sug_eval.relevance_score:.2f}, 未提升)")
    # 3. Rewrite strategy: abstract upward
    if len(new_queries) < 3:  # not enough direct suggestions; try rewriting
        rewrite_input_abstract = f"""
<当前Query>
{query}
</当前Query>
<改写要求>
类型: abstract (向上抽象)
</改写要求>
请改写这个query。
"""
        result = await Runner.run(query_rewriter, rewrite_input_abstract)
        rewrite: QueryRewrite = result.final_output
        # Record the rewrite agent call
        rewrite_agent_call = {
            "agent": "Query改写专家",
            "action": "向上抽象改写",
            "input": {
                "query": query,
                "rewrite_type": "abstract",
            },
            "output": {
                "rewritten_query": rewrite.rewritten_query,
                "rewrite_type": rewrite.rewrite_type,
                "reasoning": rewrite.reasoning,
            },
        }
        agent_calls.append(rewrite_agent_call)
        # Evaluate the rewritten query
        rewrite_eval = await evaluate_query_relevance(rewrite.rewritten_query, original_need, query_state.relevance_score, context)
        # Create the rewritten query state
        new_state = QueryState(
            query=rewrite.rewritten_query,
            level=query_state.level + 1,
            relevance_score=rewrite_eval.relevance_score,
            parent_query=query,
            strategy="抽象改写",
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=rewrite_eval.reason,
            is_selected=rewrite_eval.is_improved,
            parent_level=query_state.level,  # parent node's level
        )
        if rewrite_eval.is_improved:
            print(f" ✓ 改写(抽象): {rewrite.rewritten_query} (分数: {rewrite_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f" ✗ 改写(抽象): {rewrite.rewritten_query} (分数: {rewrite_eval.relevance_score:.2f}, 未提升)")
    # 3.2. Rewrite strategy: synonyms
    if len(new_queries) < 4:  # still not enough; try a synonym rewrite
        rewrite_input_synonym = f"""
<当前Query>
{query}
</当前Query>
<改写要求>
类型: synonym (同义改写)
使用同义词或相关表达来改写query,保持语义相同但表达方式不同。
</改写要求>
请改写这个query。
"""
        result = await Runner.run(query_rewriter, rewrite_input_synonym)
        rewrite_syn: QueryRewrite = result.final_output
        # Record the synonym-rewrite agent call
        rewrite_syn_agent_call = {
            "agent": "Query改写专家",
            "action": "同义改写",
            "input": {
                "query": query,
                "rewrite_type": "synonym",
            },
            "output": {
                "rewritten_query": rewrite_syn.rewritten_query,
                "rewrite_type": rewrite_syn.rewrite_type,
                "reasoning": rewrite_syn.reasoning,
            },
        }
        agent_calls.append(rewrite_syn_agent_call)
        # Evaluate the rewritten query
        rewrite_syn_eval = await evaluate_query_relevance(rewrite_syn.rewritten_query, original_need, query_state.relevance_score, context)
        # Create the rewritten query state
        new_state = QueryState(
            query=rewrite_syn.rewritten_query,
            level=query_state.level + 1,
            relevance_score=rewrite_syn_eval.relevance_score,
            parent_query=query,
            strategy="同义改写",
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=rewrite_syn_eval.reason,
            is_selected=rewrite_syn_eval.is_improved,
            parent_level=query_state.level,  # parent node's level
        )
        if rewrite_syn_eval.is_improved:
            print(f" ✓ 改写(同义): {rewrite_syn.rewritten_query} (分数: {rewrite_syn_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f" ✗ 改写(同义): {rewrite_syn.rewritten_query} (分数: {rewrite_syn_eval.relevance_score:.2f}, 未提升)")
    # 4. Word-insertion strategy
    unused_word = word_lib.get_unused_word(query)
    if unused_word and len(new_queries) < 5:
        insertion_input = f"""
<当前Query>
{query}
</当前Query>
<要添加的词>
{unused_word}
</要添加的词>
请将这个词加到query的最合适位置。
"""
        result = await Runner.run(word_inserter, insertion_input)
        insertion: WordInsertion = result.final_output
        # Record the word-insertion agent call
        insertion_agent_call = {
            "agent": "加词位置评估专家",
            "action": "加词",
            "input": {
                "query": query,
                "word_to_add": unused_word,
            },
            "output": {
                "new_query": insertion.new_query,
                "insertion_position": insertion.insertion_position,
                "reasoning": insertion.reasoning,
            },
        }
        agent_calls.append(insertion_agent_call)
        # Evaluate the query with the word inserted
        insertion_eval = await evaluate_query_relevance(insertion.new_query, original_need, query_state.relevance_score, context)
        # Create the query state for the expanded query
        new_state = QueryState(
            query=insertion.new_query,
            level=query_state.level + 1,
            relevance_score=insertion_eval.relevance_score,
            parent_query=query,
            strategy="加词",
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=insertion_eval.reason,
            is_selected=insertion_eval.is_improved,
            parent_level=query_state.level,  # parent node's level
        )
        if insertion_eval.is_improved:
            print(f" ✓ 加词: {insertion.new_query} (分数: {insertion_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f" ✗ 加词: {insertion.new_query} (分数: {insertion_eval.relevance_score:.2f}, 未提升)")
    # Record the full suggestion-branch result (hierarchical)
    add_step(context, f"Suggestion分支 - {query}", "suggestion_branch", {
        "query": query,
        "query_level": query_state.level,
        "query_relevance": query_state.relevance_score,
        "suggestions_count": len(suggestions),
        "suggestions_evaluated": len(suggestion_evaluations),
        "suggestion_evaluations": suggestion_evaluations,  # keep every evaluation
        "agent_calls": agent_calls,  # detailed record of every agent call
        "new_queries_generated": len(new_queries),
        "new_queries": [{"query": nq.query, "score": nq.relevance_score, "strategy": nq.strategy} for nq in new_queries],
        "no_suggestion_rounds": query_state.no_suggestion_rounds,
    })
    return new_queries

async def process_search_results(
    query: str,
    query_state: QueryState,
    original_need: str,
    word_lib: WordLibrary,
    context: RunContext,
    xiaohongshu_search: XiaohongshuSearch,
    relevance_threshold: float,
    iteration: int,
) -> tuple[list[dict], list[QueryState]]:
    """Process the search-result branch.

    Returns: (notes that satisfy the need, new queries to keep iterating on)
    """
    print(f"\n [Result分支] 搜索query: {query}")
    # Collect every agent call made while processing this branch
    agent_calls = []
    # 1. Check whether the query clears the relevance threshold
    if query_state.relevance_score < relevance_threshold:
        print(f" ✗ 相关度 {query_state.relevance_score:.2f} 低于门槛 {relevance_threshold},跳过搜索")
        return [], []
    print(f" ✓ 相关度 {query_state.relevance_score:.2f} 达到门槛,执行搜索")
    # 2. Run the search
    try:
        search_result = xiaohongshu_search.search(keyword=query)
        result_str = search_result.get("result", "{}")
        if isinstance(result_str, str):
            result_data = json.loads(result_str)
        else:
            result_data = result_str
        notes = result_data.get("data", {}).get("data", [])
        print(f" → 搜索到 {len(notes)} 个帖子")
    except Exception as e:
        print(f" ✗ 搜索失败: {e}")
        return [], []
    if not notes:
        return [], []
    # 3. Evaluate every note
    satisfied_notes = []
    partial_notes = []
    for note in notes:  # evaluate all notes
        note_data = process_note_data(note)
        title = note_data["title"] or ""
        desc = note_data["desc"] or ""
        # Skip notes with neither title nor description
        if not title and not desc:
            continue
        # Evaluate the note
        eval_input = f"""
<原始需求>
{original_need}
</原始需求>
<帖子>
标题: {title}
描述: {desc}
</帖子>
请评估这个帖子与原始需求的匹配程度。
"""
        result = await Runner.run(result_evaluator, eval_input)
        evaluation: ResultEvaluation = result.final_output
        # Record the result-evaluation agent call
        result_eval_agent_call = {
            "agent": "Result匹配度评估专家",
            "action": "评估帖子匹配度",
            "input": {
                "note_id": note_data.get("note_id"),
                "title": title,
                "desc": desc,  # full description
            },
            "output": {
                "match_level": evaluation.match_level,
                "relevance_score": evaluation.relevance_score,
                "missing_aspects": evaluation.missing_aspects,
                "reason": evaluation.reason,
            },
        }
        agent_calls.append(result_eval_agent_call)
        note_data["evaluation"] = {
            "match_level": evaluation.match_level,
            "relevance_score": evaluation.relevance_score,
            "missing_aspects": evaluation.missing_aspects,
            "reason": evaluation.reason,
        }
        # Add every evaluated note to the evolution graph (satisfied, partial and unsatisfied)
        add_note_to_graph(context, query, query_state.level, note_data)
        if evaluation.match_level == "satisfied":
            satisfied_notes.append(note_data)
            print(f" ✓ 满足: {title[:30] if len(title) > 30 else title}... (分数: {evaluation.relevance_score:.2f})")
        elif evaluation.match_level == "partial":
            partial_notes.append(note_data)
            print(f" ~ 部分: {title[:30] if len(title) > 30 else title}... (缺失: {', '.join(evaluation.missing_aspects[:2])})")
        else:  # unsatisfied
            print(f" ✗ 不满足: {title[:30] if len(title) > 30 else title}... (分数: {evaluation.relevance_score:.2f})")
    # 4. Satisfied notes: no longer feed their keywords back into the word library
    #    (prevents the library from growing without bound)
    new_queries = []
    if satisfied_notes:
        print(f"\n ✓ 找到 {len(satisfied_notes)} 个满足的帖子,不再提取关键词入库")
        # Keyword extraction is disabled to keep the word library stable:
        # for note in satisfied_notes[:3]:
        #     extract_input = f"""
        # <帖子>
        # 标题: {note['title']}
        # 描述: {note['desc']}
        # </帖子>
        #
        # 请提取核心关键词。
        # """
        #     result = await Runner.run(keyword_extractor, extract_input)
        #     extraction: KeywordExtraction = result.final_output
        #     # Add new words to the library, tagging their source
        #     note_id = note.get('note_id', 'unknown')
        #     for keyword in extraction.keywords:
        #         if keyword not in word_lib.words:
        #             word_lib.add_word(keyword, source=f"note:{note_id}")
        #             print(f" + 新词入库: {keyword} (来源: {note_id})")
    # 5. Partially matching notes: improve the query
    if partial_notes and len(satisfied_notes) < 5:  # not enough satisfied notes; improve based on partial matches
        print(f"\n 基于 {len(partial_notes)} 个部分匹配帖子改造query...")
        # Collect all missing aspects
        all_missing = []
        for note in partial_notes:
            all_missing.extend(note["evaluation"]["missing_aspects"])
        if all_missing:
            improvement_input = f"""
<当前Query>
{query}
</当前Query>
<缺失的方面>
{', '.join(set(all_missing))}
</缺失的方面>
请改造query使其包含这些缺失的内容。
"""
            result = await Runner.run(query_improver, improvement_input)
            improvement: QueryImprovement = result.final_output
            # Record the query-improvement agent call
            improvement_agent_call = {
                "agent": "Query改造专家",
                "action": "基于缺失方面改造Query",
                "input": {
                    "query": query,
                    "missing_aspects": list(set(all_missing)),
                },
                "output": {
                    "improved_query": improvement.improved_query,
                    "added_aspects": improvement.added_aspects,
                    "reasoning": improvement.reasoning,
                },
            }
            agent_calls.append(improvement_agent_call)
            # Evaluate the improved query
            improved_eval = await evaluate_query_relevance(improvement.improved_query, original_need, query_state.relevance_score, context)
            # Create the improved query state
            new_state = QueryState(
                query=improvement.improved_query,
                level=query_state.level + 1,
                relevance_score=improved_eval.relevance_score,
                parent_query=query,
                strategy="基于部分匹配改进",
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=improved_eval.reason,
                is_selected=improved_eval.is_improved,
                parent_level=query_state.level,  # parent node's level
            )
            if improved_eval.is_improved:
                print(f" ✓ 改进: {improvement.improved_query} (添加: {', '.join(improvement.added_aspects[:2])})")
                new_queries.append(new_state)
            else:
                print(f" ✗ 改进: {improvement.improved_query} (分数: {improved_eval.relevance_score:.2f}, 未提升)")
    # 6. Result-branch rewrite strategies (abstract and synonym).
    # If the results are poor and there are too few new queries, rewrite the current query.
    if len(satisfied_notes) < 3 and len(new_queries) < 2:
        print(f"\n 搜索结果不理想,尝试改写query...")
        # 6.1 Abstract upward
        if len(new_queries) < 3:
            rewrite_input_abstract = f"""
<当前Query>
{query}
</当前Query>
<改写要求>
类型: abstract (向上抽象)
</改写要求>
请改写这个query。
"""
            result = await Runner.run(query_rewriter, rewrite_input_abstract)
            rewrite: QueryRewrite = result.final_output
            # Record the result-branch abstract-rewrite agent call
            rewrite_agent_call = {
                "agent": "Query改写专家",
                "action": "向上抽象改写(Result分支)",
                "input": {
                    "query": query,
                    "rewrite_type": "abstract",
                },
                "output": {
                    "rewritten_query": rewrite.rewritten_query,
                    "rewrite_type": rewrite.rewrite_type,
                    "reasoning": rewrite.reasoning,
                },
            }
            agent_calls.append(rewrite_agent_call)
            # Evaluate the rewritten query
            rewrite_eval = await evaluate_query_relevance(rewrite.rewritten_query, original_need, query_state.relevance_score, context)
            # Create the rewritten query state
            new_state = QueryState(
                query=rewrite.rewritten_query,
                level=query_state.level + 1,
                relevance_score=rewrite_eval.relevance_score,
                parent_query=query,
                strategy="结果分支-抽象改写",
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=rewrite_eval.reason,
                is_selected=rewrite_eval.is_improved,
                parent_level=query_state.level,  # parent node's level
            )
            if rewrite_eval.is_improved:
                print(f" ✓ 改写(抽象): {rewrite.rewritten_query} (分数: {rewrite_eval.relevance_score:.2f})")
                new_queries.append(new_state)
            else:
                print(f" ✗ 改写(抽象): {rewrite.rewritten_query} (分数: {rewrite_eval.relevance_score:.2f}, 未提升)")
        # 6.2 Synonym rewrite
        if len(new_queries) < 4:
            rewrite_input_synonym = f"""
<当前Query>
{query}
</当前Query>
<改写要求>
类型: synonym (同义改写)
使用同义词或相关表达来改写query,保持语义相同但表达方式不同。
</改写要求>
请改写这个query。
"""
            result = await Runner.run(query_rewriter, rewrite_input_synonym)
            rewrite_syn: QueryRewrite = result.final_output
            # Record the result-branch synonym-rewrite agent call
            rewrite_syn_agent_call = {
                "agent": "Query改写专家",
                "action": "同义改写(Result分支)",
                "input": {
                    "query": query,
                    "rewrite_type": "synonym",
                },
                "output": {
                    "rewritten_query": rewrite_syn.rewritten_query,
                    "rewrite_type": rewrite_syn.rewrite_type,
                    "reasoning": rewrite_syn.reasoning,
                },
            }
            agent_calls.append(rewrite_syn_agent_call)
            # Evaluate the rewritten query
            rewrite_syn_eval = await evaluate_query_relevance(rewrite_syn.rewritten_query, original_need, query_state.relevance_score, context)
            # Create the rewritten query state
            new_state = QueryState(
                query=rewrite_syn.rewritten_query,
                level=query_state.level + 1,
                relevance_score=rewrite_syn_eval.relevance_score,
                parent_query=query,
                strategy="结果分支-同义改写",
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=rewrite_syn_eval.reason,
                is_selected=rewrite_syn_eval.is_improved,
                parent_level=query_state.level,  # parent node's level
            )
            if rewrite_syn_eval.is_improved:
                print(f" ✓ 改写(同义): {rewrite_syn.rewritten_query} (分数: {rewrite_syn_eval.relevance_score:.2f})")
                new_queries.append(new_state)
            else:
                print(f" ✗ 改写(同义): {rewrite_syn.rewritten_query} (分数: {rewrite_syn_eval.relevance_score:.2f}, 未提升)")
    # Record the full result-branch outcome (hierarchical). All early returns happen
    # before this point, so `notes` is always bound here.
    add_step(context, f"Result分支 - {query}", "result_branch", {
        "query": query,
        "query_level": query_state.level,
        "query_relevance": query_state.relevance_score,
        "relevance_threshold": relevance_threshold,
        "passed_threshold": query_state.relevance_score >= relevance_threshold,
        "notes_count": len(notes),
        "satisfied_count": len(satisfied_notes),
        "partial_count": len(partial_notes),
        "satisfied_notes": [
            {
                "note_id": note["note_id"],
                "title": note["title"],
                "score": note["evaluation"]["relevance_score"],
                "match_level": note["evaluation"]["match_level"],
            }
            for note in satisfied_notes  # keep every satisfied note
        ],
        "agent_calls": agent_calls,  # detailed record of every agent call
        "new_queries_generated": len(new_queries),
        "new_queries": [{"query": nq.query, "score": nq.relevance_score, "strategy": nq.strategy} for nq in new_queries],
    })
    return satisfied_notes, new_queries

async def iterative_search_loop(
    context: RunContext,
    max_iterations: int = 20,
    max_concurrent_queries: int = 5,
    relevance_threshold: float = 0.6,
) -> list[dict]:
    """Main loop: iterative search.

    Args:
        context: run context
        max_iterations: maximum number of iterations
        max_concurrent_queries: maximum number of queries processed per round
        relevance_threshold: relevance threshold for running a search

    Returns:
        list of notes that satisfy the need
    """
    print(f"\n{'='*60}")
    print(f"开始迭代搜索循环")
    print(f"{'='*60}")
    # 0. Add the original question as the root node
    root_query_state = QueryState(
        query=context.q,
        level=0,
        relevance_score=1.0,  # the original question is fully relevant by definition
        strategy="根节点",
    )
    add_query_to_graph(context, root_query_state, 0, evaluation_reason="原始问题,作为搜索的根节点", is_selected=True)
    print(f"[根节点] 原始问题: {context.q}")
    # 1. Initialize the word library
    word_lib = await initialize_word_library(context.q, context)
    # 2. Initialize the query queue by scoring every initial word
    all_words = list(word_lib.words)
    query_queue = []
    print(f"\n评估所有初始分词的相关度...")
    word_scores = []
    for word in all_words:
        # Score each word against the original question
        eval_result = await evaluate_query_relevance(word, context.q, None, context)
        word_scores.append({
            'word': word,
            'score': eval_result.relevance_score,
            'eval': eval_result,
        })
        print(f" {word}: {eval_result.relevance_score:.2f}")
    # Sort by relevance; all words are used
    word_scores.sort(key=lambda x: x['score'], reverse=True)
    # Add every word to the evolution graph (all of them are selected)
    for item in word_scores:
        is_selected = True  # every initial word is selected
        query_state = QueryState(
            query=item['word'],
            level=1,
            relevance_score=item['score'],
            strategy="初始分词",
            parent_query=context.q,  # the parent is the original question
        )
        # Adding to the graph also creates the edge from parent_query to this query
        add_query_to_graph(context, query_state, 0, evaluation_reason=item['eval'].reason, is_selected=is_selected, parent_level=0)  # parent is the root (level 0)
        # Only selected words enter the queue
        if is_selected:
            query_queue.append(query_state)
    print(f"\n初始query队列(按相关度排序): {[(q.query, f'{q.relevance_score:.2f}') for q in query_queue]}")
    print(f" (共评估了 {len(word_scores)} 个分词,全部加入队列)")
    # 3. API instances
    xiaohongshu_api = XiaohongshuSearchRecommendations()
    xiaohongshu_search = XiaohongshuSearch()
    # 4. Main loop
    all_satisfied_notes = []
    iteration = 0
    while query_queue and iteration < max_iterations:
        iteration += 1
        print(f"\n{'='*60}")
        print(f"迭代 {iteration}: 队列中有 {len(query_queue)} 个query")
        print(f"{'='*60}")
        # Cap the number of queries processed this round
        current_batch = query_queue[:max_concurrent_queries]
        query_queue = query_queue[max_concurrent_queries:]
        notes_before = len(all_satisfied_notes)  # for the per-round delta printed below
        # Record the queries processed this round
        add_step(context, f"迭代 {iteration}", "iteration", {
            "iteration": iteration,
            "queue_size": len(query_queue) + len(current_batch),
            "processing_queries": [q.query for q in current_batch],
        })
        new_queries_from_sug = []
        new_queries_from_result = []
        # Process each query
        for query_state in current_batch:
            print(f"\n处理Query [{query_state.level}]: {query_state.query} (分数: {query_state.relevance_score:.2f})")
            # Check the termination condition
            if query_state.is_terminated or query_state.no_suggestion_rounds >= 2:
                print(f" ✗ 已终止或连续2轮无suggestion,跳过该query")
                query_state.is_terminated = True
                continue
            # Run the two branches concurrently
            sug_task = process_suggestions(
                query_state.query, query_state, context.q, word_lib, context, xiaohongshu_api, iteration
            )
            result_task = process_search_results(
                query_state.query, query_state, context.q, word_lib, context,
                xiaohongshu_search, relevance_threshold, iteration
            )
            # Wait for both branches to finish
            sug_queries, (satisfied_notes, result_queries) = await asyncio.gather(
                sug_task,
                result_task,
            )
            # process_suggestions already updated query_state.no_suggestion_rounds;
            # if neither branch produced a new query, terminate this query.
            if not sug_queries and not result_queries:
                query_state.is_terminated = True
                print(f" ⚠ 两个分支均未产生新query,标记该query为终止")
            new_queries_from_sug.extend(sug_queries)
            new_queries_from_result.extend(result_queries)
            all_satisfied_notes.extend(satisfied_notes)
        # Update the queue
        all_new_queries = new_queries_from_sug + new_queries_from_result
        # Do NOT add these to the evolution graph again here: process_suggestions and
        # process_search_results already did, and re-adding would overwrite fields
        # such as evaluation_reason.
        query_queue.extend(all_new_queries)
        # Deduplicate (by query text) and drop terminated queries
        seen = set()
        unique_queue = []
        for q in query_queue:
            if q.query not in seen and not q.is_terminated:
                seen.add(q.query)
                unique_queue.append(q)
        query_queue = unique_queue
        # Sort by relevance
        query_queue.sort(key=lambda x: x.relevance_score, reverse=True)
        print(f"\n本轮结果:")
        print(f" 新增满足帖子: {len(all_satisfied_notes) - notes_before}")
        print(f" 累计满足帖子: {len(all_satisfied_notes)}")
        print(f" 新增queries: {len(all_new_queries)}")
        print(f" 队列剩余: {len(query_queue)}")
        # Persist the word library to the context
        context.word_library = word_lib.model_dump()
        # Stop early once enough satisfying notes are found
        if len(all_satisfied_notes) >= 20:
            print(f"\n已找到足够的满足帖子 ({len(all_satisfied_notes)}个),提前结束")
            break
    print(f"\n{'='*60}")
    print(f"迭代搜索完成")
    print(f" 总迭代次数: {iteration}")
    print(f" 最终满足帖子数: {len(all_satisfied_notes)}")
    print(f" 最终分词库大小: {len(word_lib.words)}")
    print(f"{'='*60}")
    # Record the final result
    add_step(context, "迭代搜索完成", "loop_complete", {
        "total_iterations": iteration,
        "total_satisfied_notes": len(all_satisfied_notes),
        "final_word_library_size": len(word_lib.words),
        "final_word_library": list(word_lib.words),
    })
    return all_satisfied_notes

# ============================================================================
# Main entry point
# ============================================================================

async def main(input_dir: str, max_iterations: int = 20, visualize: bool = False):
    """Main entry point."""
    current_time, log_url = set_trace()
    # Read the inputs
    input_context_file = os.path.join(input_dir, 'context.md')
    input_q_file = os.path.join(input_dir, 'q.md')
    q_context = read_file_as_string(input_context_file)
    q = read_file_as_string(input_q_file)
    q_with_context = f"""
<需求上下文>
{q_context}
</需求上下文>
<当前问题>
{q}
</当前问题>
""".strip()
    # Version info
    version = os.path.basename(__file__)
    version_name = os.path.splitext(version)[0]
    # Log directory
    log_dir = os.path.join(input_dir, "output", version_name, current_time)
    # Create the run context
    run_context = RunContext(
        version=version,
        input_files={
            "input_dir": input_dir,
            "context_file": input_context_file,
            "q_file": input_q_file,
        },
        q_with_context=q_with_context,
        q_context=q_context,
        q=q,
        log_dir=log_dir,
        log_url=log_url,
    )
    # Run the iterative search
    satisfied_notes = await iterative_search_loop(
        run_context,
        max_iterations=max_iterations,
        max_concurrent_queries=3,
        relevance_threshold=0.6,
    )
    # Store the results
    run_context.satisfied_notes = satisfied_notes
    # Format the output
    output = f"原始问题:{run_context.q}\n"
    output += f"找到满足需求的帖子:{len(satisfied_notes)} 个\n"
    output += f"分词库大小:{len(run_context.word_library.get('words', []))} 个词\n"
    output += "\n" + "="*60 + "\n"
    if satisfied_notes:
        output += "【满足需求的帖子】\n\n"
        for idx, note in enumerate(satisfied_notes, 1):
            output += f"{idx}. {note['title']}\n"
            output += f" 相关度: {note['evaluation']['relevance_score']:.2f}\n"
            output += f" URL: {note['note_url']}\n\n"
    else:
        output += "未找到满足需求的帖子\n"
    run_context.final_output = output
    print(f"\n{'='*60}")
    print("最终结果")
    print(f"{'='*60}")
    print(output)
    # Save the logs
    os.makedirs(run_context.log_dir, exist_ok=True)
    context_file_path = os.path.join(run_context.log_dir, "run_context.json")
    context_dict = run_context.model_dump()
    with open(context_file_path, "w", encoding="utf-8") as f:
        json.dump(context_dict, f, ensure_ascii=False, indent=2)
    print(f"\nRunContext saved to: {context_file_path}")
    steps_file_path = os.path.join(run_context.log_dir, "steps.json")
    with open(steps_file_path, "w", encoding="utf-8") as f:
        json.dump(run_context.steps, f, ensure_ascii=False, indent=2)
    print(f"Steps log saved to: {steps_file_path}")
    # Save the query evolution graph
    query_graph_file_path = os.path.join(run_context.log_dir, "query_graph.json")
    with open(query_graph_file_path, "w", encoding="utf-8") as f:
        json.dump(run_context.query_graph, f, ensure_ascii=False, indent=2)
    print(f"Query graph saved to: {query_graph_file_path}")
    # Visualization
    if visualize:
        import subprocess
        output_html = os.path.join(run_context.log_dir, "visualization.html")
        print(f"\n🎨 生成可视化HTML...")
        result = subprocess.run([
            "python", "sug_v6_1_2_3.visualize.py",
            steps_file_path,
            "-o", output_html,
        ])
        if result.returncode == 0:
            print(f"✅ 可视化已生成: {output_html}")
        else:
            print(f"❌ 可视化生成失败")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="搜索query优化工具 - v6.1.2.5 迭代循环版")
    parser.add_argument(
        "--input-dir",
        type=str,
        default="input/简单扣图",
        help="输入目录路径,默认: input/简单扣图",
    )
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=20,
        help="最大迭代次数,默认: 20",
    )
    parser.add_argument(
        "--visualize",
        action="store_true",
        default=False,
        help="运行完成后自动生成可视化HTML",
    )
    args = parser.parse_args()
    asyncio.run(main(args.input_dir, max_iterations=args.max_iterations, visualize=args.visualize))
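
# Example invocations (the --input-dir default matches the repo layout; other
# paths and values are illustrative):
#
#   python sug_v6_1_2_5.py
#   python sug_v6_1_2_5.py --input-dir input/简单扣图 --max-iterations 10 --visualize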