# sug_v6_1_2_5.py

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime
from typing import Literal

from pydantic import BaseModel, Field

from agents import Agent, Runner
from lib.client import get_model
from lib.my_trace import set_trace
from lib.utils import read_file_as_string
from script.search_recommendations.xiaohongshu_search_recommendations import XiaohongshuSearchRecommendations
from script.search.xiaohongshu_search import XiaohongshuSearch

# Model used by every agent in this script
MODEL_NAME = "google/gemini-2.5-flash"

# ============================================================================
# Data models
# ============================================================================
class QueryState(BaseModel):
    """Tracks the state of a single query."""
    query: str
    level: int  # current depth in the query tree
    no_suggestion_rounds: int = 0  # consecutive rounds that returned no suggestions
    relevance_score: float = 0.0  # relevance to the original need
    parent_query: str | None = None  # parent query
    strategy: str | None = None  # generation strategy: direct_sug, rewrite, add_word
    is_terminated: bool = False  # terminated queries are not processed further

class WordLibrary(BaseModel):
    """Dynamic word library built from segmentation and (optionally) note keywords."""
    words: set[str] = Field(default_factory=set)
    word_sources: dict[str, str] = Field(default_factory=dict)  # word -> source (a note_id or "initial")

    def add_word(self, word: str, source: str = "unknown"):
        """Add a single word to the library."""
        if word and word.strip():
            word = word.strip()
            self.words.add(word)
            if word not in self.word_sources:
                self.word_sources[word] = source

    def add_words(self, words: list[str], source: str = "unknown"):
        """Add several words at once."""
        for word in words:
            self.add_word(word, source)

    def get_unused_word(self, current_query: str) -> str | None:
        """Return a library word that does not already appear in the current query."""
        for word in self.words:
            if word not in current_query:
                return word
        return None

    def model_dump(self, **kwargs):  # accept pydantic's kwargs even though they are unused here
        """Serialize to a plain dict (a set is not JSON-serializable)."""
        return {
            "words": list(self.words),
            "word_sources": self.word_sources
        }
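
# Minimal usage sketch for WordLibrary (illustrative only; nothing in the main
# flow calls this): the library deduplicates words and keeps the first source.
def _demo_word_library() -> None:
    lib = WordLibrary()
    lib.add_words(["修图", "教程"], source="initial")
    lib.add_word("修图", source="note:abc")  # duplicate: kept once, original source wins
    assert lib.get_unused_word("修图软件") == "教程"  # "修图" already appears in the query
    print(lib.model_dump())  # {'words': [...], 'word_sources': {'修图': 'initial', ...}}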

class RunContext(BaseModel):
    """Context for a single run."""
    version: str
    input_files: dict[str, str]
    q_with_context: str
    q_context: str
    q: str
    log_url: str
    log_dir: str
    # Added fields
    word_library: dict = Field(default_factory=dict)  # stored as a dict because sets are not JSON-serializable
    query_states: list[dict] = Field(default_factory=list)
    steps: list[dict] = Field(default_factory=list)
    # Query evolution graph: records how queries were derived and how they relate
    query_graph: dict = Field(default_factory=dict)
    # Final results
    satisfied_notes: list[dict] = Field(default_factory=list)
    final_output: str | None = None

# ============================================================================
# Agent definitions
# ============================================================================
# Agent 1: word segmentation expert
class WordSegmentation(BaseModel):
    """Segmentation result."""
    words: list[str] = Field(..., description="List of segmented words")
    reasoning: str = Field(..., description="Rationale for the segmentation")

word_segmentation_instructions = """
You are a word segmentation expert. Given a query, split it into the smallest meaningful units.
## Segmentation principles
1. Keep words that carry search value
2. Split the query into independent concepts
3. Keep technical terms intact
4. Drop function words (的, 吗, 呢, etc.)
## Output requirements
Return the list of words and the reasoning behind the segmentation.
""".strip()

word_segmenter = Agent[None](
    name="Word Segmentation Expert",
    instructions=word_segmentation_instructions,
    model=get_model(MODEL_NAME),
    output_type=WordSegmentation,
)
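
# Invocation sketch (this mirrors how every expert below is called; the example
# query is illustrative): `Runner.run` is awaited and the structured result is
# read from `result.final_output`.
#
#     result = await Runner.run(word_segmenter, "手机修图软件推荐")
#     segmentation: WordSegmentation = result.final_output
#     print(segmentation.words, segmentation.reasoning)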

# Agent 2: query relevance evaluator
class RelevanceEvaluation(BaseModel):
    """Relevance evaluation."""
    relevance_score: float = Field(..., description="Relevance score, 0-1")
    is_improved: bool = Field(..., description="Whether this query is better than the previous one")
    reason: str = Field(..., description="Rationale for the evaluation")

relevance_evaluation_instructions = """
You are a query relevance evaluator.
## Task
Assess how well the current query matches the original need.
## Criteria
- Topical relevance
- Coverage of the required elements
- Intent match
## Output
- relevance_score: relevance score in [0, 1]
- is_improved: if a previous_score is provided, whether the new query improves on it
- reason: detailed rationale
""".strip()

relevance_evaluator = Agent[None](
    name="Query Relevance Evaluator",
    instructions=relevance_evaluation_instructions,
    model=get_model(MODEL_NAME),
    output_type=RelevanceEvaluation,
)

# Agent 3: query rewriting expert
class QueryRewrite(BaseModel):
    """Query rewrite result."""
    rewritten_query: str = Field(..., description="The rewritten query")
    rewrite_type: str = Field(..., description="Rewrite type: abstract or synonym")
    reasoning: str = Field(..., description="Rationale for the rewrite")

query_rewrite_instructions = """
You are a query rewriting expert.
## Rewrite strategies
1. **Abstract upward**: generalize a specific concept to a higher level
   - Example: iPhone 13 → smartphone (智能手机)
2. **Synonym rewrite**: use synonyms or related expressions
   - Example: 购买 (buy) → 入手, 获取
## Output requirements
Return the rewritten query, the rewrite type, and the reasoning.
""".strip()

query_rewriter = Agent[None](
    name="Query Rewriting Expert",
    instructions=query_rewrite_instructions,
    model=get_model(MODEL_NAME),
    output_type=QueryRewrite,
)

# Agent 4: word-insertion expert
class WordInsertion(BaseModel):
    """Word-insertion result."""
    new_query: str = Field(..., description="The query after inserting the word")
    insertion_position: str = Field(..., description="Description of where the word was inserted")
    reasoning: str = Field(..., description="Rationale for the insertion")

word_insertion_instructions = """
You are a word-insertion expert.
## Task
Insert the new word at the most natural position in the current query, keeping it fluent.
## Principles
1. Keep the grammar correct
2. Keep the semantics coherent
3. Match how people actually search
## Output
Return the new query, a description of the insertion position, and the reasoning.
""".strip()

word_inserter = Agent[None](
    name="Word Insertion Expert",
    instructions=word_insertion_instructions,
    model=get_model(MODEL_NAME),
    output_type=WordInsertion,
)

# Agent 5: result match evaluator
class ResultEvaluation(BaseModel):
    """Result evaluation."""
    match_level: str = Field(..., description="Match level: satisfied, partial, or unsatisfied")
    relevance_score: float = Field(..., description="Relevance score, 0-1")
    missing_aspects: list[str] = Field(default_factory=list, description="Aspects the result is missing")
    reason: str = Field(..., description="Rationale for the evaluation")

result_evaluation_instructions = """
You are a result match evaluator.
## Task
Assess how well a search result (a note) matches the original need.
## Match levels
1. **satisfied**: fully satisfies the need
2. **partial**: partially satisfies it, with gaps
3. **unsatisfied**: essentially does not satisfy it
## Output requirements
- match_level: the match level
- relevance_score: relevance score
- missing_aspects: if partial, list the missing aspects
- reason: detailed rationale
""".strip()

result_evaluator = Agent[None](
    name="Result Match Evaluator",
    instructions=result_evaluation_instructions,
    model=get_model(MODEL_NAME),
    output_type=ResultEvaluation,
)

# Agent 6: query improvement expert (driven by missing aspects)
class QueryImprovement(BaseModel):
    """Query improvement result."""
    improved_query: str = Field(..., description="The improved query")
    added_aspects: list[str] = Field(..., description="Aspects added to the query")
    reasoning: str = Field(..., description="Rationale for the improvement")

query_improvement_instructions = """
You are a query improvement expert.
## Task
Given the aspects missing from the search results, rework the query so it covers them.
## Principles
1. Target the missing aspects specifically
2. Keep the query concise
3. Match how people actually search
## Output
Return the improved query, the added aspects, and the reasoning.
""".strip()

query_improver = Agent[None](
    name="Query Improvement Expert",
    instructions=query_improvement_instructions,
    model=get_model(MODEL_NAME),
    output_type=QueryImprovement,
)

# Agent 7: keyword extraction expert
class KeywordExtraction(BaseModel):
    """Keyword extraction result."""
    keywords: list[str] = Field(..., description="List of extracted keywords")
    reasoning: str = Field(..., description="Rationale for the extraction")

keyword_extraction_instructions = """
You are a keyword extraction expert.
## Task
Extract the core keywords from a note's title and description.
## Extraction principles
1. Extract words that carry search value
2. Drop function words and generic words
3. Keep technical terms
4. Extract 3-10 keywords
## Output
Return the keyword list and the reasoning.
""".strip()

keyword_extractor = Agent[None](
    name="Keyword Extraction Expert",
    instructions=keyword_extraction_instructions,
    model=get_model(MODEL_NAME),
    output_type=KeywordExtraction,
)

# ============================================================================
# Helper functions
# ============================================================================
def add_step(context: RunContext, step_name: str, step_type: str, data: dict):
    """Append a step record to the run context."""
    step = {
        "step_number": len(context.steps) + 1,
        "step_name": step_name,
        "step_type": step_type,
        "timestamp": datetime.now().isoformat(),
        "data": data
    }
    context.steps.append(step)
    return step
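
# A step record is a plain dict appended to context.steps, e.g. (values illustrative):
#     {
#         "step_number": 1,
#         "step_name": "Initialize word library",
#         "step_type": "word_library_init",
#         "timestamp": "2025-01-01T12:00:00.000000",
#         "data": {"agent": "Word Segmentation Expert", ...},
#     }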

def add_query_to_graph(context: RunContext, query_state: QueryState, iteration: int, evaluation_reason: str = "", is_selected: bool = True):
    """Add a query node to the evolution graph.

    Args:
        context: run context
        query_state: query state
        iteration: iteration number
        evaluation_reason: evaluation rationale (optional)
        is_selected: whether the query was selected into the processing queue (default True)
    """
    query_id = query_state.query  # the query text itself serves as the node ID
    # Initialize the graph structure on first use
    if "nodes" not in context.query_graph:
        context.query_graph["nodes"] = {}
        context.query_graph["edges"] = []
        context.query_graph["iterations"] = {}
    # Add the query node (type: query)
    context.query_graph["nodes"][query_id] = {
        "type": "query",
        "query": query_state.query,
        "level": query_state.level,
        "relevance_score": query_state.relevance_score,
        "strategy": query_state.strategy,
        "parent_query": query_state.parent_query,
        "iteration": iteration,
        "is_terminated": query_state.is_terminated,
        "no_suggestion_rounds": query_state.no_suggestion_rounds,
        "evaluation_reason": evaluation_reason,
        "is_selected": is_selected
    }
    # Add the parent -> child edge
    if query_state.parent_query:
        parent_id = query_state.parent_query
        if parent_id in context.query_graph["nodes"]:
            context.query_graph["edges"].append({
                "from": parent_id,
                "to": query_id,
                "edge_type": "query_to_query",
                "strategy": query_state.strategy,
                "score_improvement": query_state.relevance_score - context.query_graph["nodes"][parent_id]["relevance_score"]
            })
    # Group nodes by iteration
    if iteration not in context.query_graph["iterations"]:
        context.query_graph["iterations"][iteration] = []
    context.query_graph["iterations"][iteration].append(query_id)
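
# Resulting graph shape (illustrative; node and edge fields as built above):
#     context.query_graph = {
#         "nodes": {
#             "修图": {"type": "query", "level": 1, ...},
#             "<note_id>": {"type": "note", ...},
#         },
#         "edges": [
#             {"from": "修图", "to": "修图 教程", "edge_type": "query_to_query", ...},
#             {"from": "修图", "to": "<note_id>", "edge_type": "query_to_note", ...},
#         ],
#         "iterations": {0: ["修图"], 1: ["修图 教程"]},
#     }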

def add_note_to_graph(context: RunContext, query: str, note: dict):
    """Add a note node to the evolution graph and connect it to the query that found it."""
    note_id = note["note_id"]
    # Initialize the graph structure on first use
    if "nodes" not in context.query_graph:
        context.query_graph["nodes"] = {}
        context.query_graph["edges"] = []
        context.query_graph["iterations"] = {}
    # Add the note node (type: note) with its full metadata
    context.query_graph["nodes"][note_id] = {
        "type": "note",
        "note_id": note_id,
        "title": note["title"],
        "desc": note.get("desc", ""),  # full description, not truncated
        "note_url": note.get("note_url", ""),
        "image_list": note.get("image_list", []),  # image list
        "interact_info": note.get("interact_info", {}),  # likes, collects, comments, shares
        "match_level": note["evaluation"]["match_level"],
        "relevance_score": note["evaluation"]["relevance_score"],
        "evaluation_reason": note["evaluation"].get("reason", ""),
        "found_by_query": query
    }
    # Add the query -> note edge
    if query in context.query_graph["nodes"]:
        context.query_graph["edges"].append({
            "from": query,
            "to": note_id,
            "edge_type": "query_to_note",
            "match_level": note["evaluation"]["match_level"],
            "relevance_score": note["evaluation"]["relevance_score"]
        })

def process_note_data(note: dict) -> dict:
    """Normalize a raw note returned by the search API."""
    note_card = note.get("note_card", {})
    image_list = note_card.get("image_list", [])
    interact_info = note_card.get("interact_info", {})
    user_info = note_card.get("user", {})
    return {
        "note_id": note.get("id", ""),
        "title": note_card.get("display_title", ""),
        "desc": note_card.get("desc", ""),
        "image_list": image_list,
        "interact_info": {
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "shared_count": interact_info.get("shared_count", 0)
        },
        "user": {
            "nickname": user_info.get("nickname", ""),
            "user_id": user_info.get("user_id", "")
        },
        "type": note_card.get("type", "normal"),
        "note_url": f"https://www.xiaohongshu.com/explore/{note.get('id', '')}"
    }
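
# Shape sketch for process_note_data (the raw payload below is synthetic and only
# mirrors the fields this function reads; it is not a real API response):
def _demo_process_note_data() -> None:
    raw = {
        "id": "65f0c0de000000000703a1b2",
        "note_card": {
            "display_title": "示例标题",
            "desc": "示例描述",
            "interact_info": {"liked_count": 12},
            "user": {"nickname": "demo", "user_id": "u1"},
        },
    }
    flat = process_note_data(raw)
    # Missing fields fall back to defaults, and note_url is derived from the id.
    assert flat["note_url"].endswith(raw["id"])
    assert flat["interact_info"]["collected_count"] == 0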

# ============================================================================
# Core flow functions
# ============================================================================
async def initialize_word_library(original_query: str, context: RunContext) -> WordLibrary:
    """Initialize the word library."""
    print("\n[init] Building the word library...")
    # Segment the query with the segmentation agent
    result = await Runner.run(word_segmenter, original_query)
    segmentation: WordSegmentation = result.final_output
    word_lib = WordLibrary()
    word_lib.add_words(segmentation.words, source="initial")
    print(f"Initial word library: {list(word_lib.words)}")
    print(f"Segmentation reasoning: {segmentation.reasoning}")
    # Persist to the context
    context.word_library = word_lib.model_dump()
    add_step(context, "Initialize word library", "word_library_init", {
        "agent": "Word Segmentation Expert",
        "input": original_query,
        "output": {
            "words": segmentation.words,
            "reasoning": segmentation.reasoning
        },
        "result": {
            "word_library": list(word_lib.words)
        }
    })
    return word_lib

async def evaluate_query_relevance(
    query: str,
    original_need: str,
    previous_score: float | None = None,
    context: RunContext | None = None  # currently unused; kept for signature compatibility
) -> RelevanceEvaluation:
    """Evaluate how relevant a query is to the original need."""
    eval_input = f"""
<original_need>
{original_need}
</original_need>
<current_query>
{query}
</current_query>
{"<previous_relevance_score>" + str(previous_score) + "</previous_relevance_score>" if previous_score is not None else ""}
Please evaluate how relevant the current query is to the original need.
"""
    result = await Runner.run(relevance_evaluator, eval_input)
    evaluation: RelevanceEvaluation = result.final_output
    return evaluation
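
# Call sketch (async; the query strings and score below are illustrative):
#     evaluation = await evaluate_query_relevance(
#         "修图 教程", original_need="手机修图软件推荐", previous_score=0.55
#     )
#     if evaluation.is_improved and evaluation.relevance_score > 0.55:
#         ...  # keep the rewritten query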

async def process_suggestions(
    query: str,
    query_state: QueryState,
    original_need: str,
    word_lib: WordLibrary,
    context: RunContext,
    xiaohongshu_api: XiaohongshuSearchRecommendations,
    iteration: int
) -> list[QueryState]:
    """Process the suggestion branch; return the newly generated query states."""
    print(f"\n  [suggestion branch] Processing query: {query}")
    # Collect every agent call made in this branch
    agent_calls = []
    # 1. Fetch suggestions
    suggestions = xiaohongshu_api.get_recommendations(keyword=query)
    if not suggestions or len(suggestions) == 0:
        print("    → no suggestions returned")
        query_state.no_suggestion_rounds += 1
        # Log the step
        add_step(context, f"Suggestion branch - {query}", "suggestion_branch", {
            "query": query,
            "query_level": query_state.level,
            "suggestions_count": 0,
            "no_suggestion_rounds": query_state.no_suggestion_rounds,
            "new_queries_generated": 0
        })
        return []
    print(f"    → got {len(suggestions)} suggestions")
    query_state.no_suggestion_rounds = 0  # reset the counter
    # 2. Evaluate each suggestion
    new_queries = []
    suggestion_evaluations = []
    for sug in suggestions[:5]:  # cap how many suggestions we process
        # Evaluate the suggestion against the ORIGINAL need (original_need), not the
        # current query, so generated suggestions stay anchored to the user's core need.
        sug_eval = await evaluate_query_relevance(sug, original_need, query_state.relevance_score, context)
        sug_eval_record = {
            "suggestion": sug,
            "relevance_score": sug_eval.relevance_score,
            "is_improved": sug_eval.is_improved,
            "reason": sug_eval.reason
        }
        suggestion_evaluations.append(sug_eval_record)
        # Create a query state (every suggestion becomes a query node)
        sug_state = QueryState(
            query=sug,
            level=query_state.level + 1,
            relevance_score=sug_eval.relevance_score,
            parent_query=query,
            strategy="direct_sug"
        )
        # Only suggestions that improve on the current query enter the processing queue
        is_selected = sug_eval.is_improved and sug_eval.relevance_score > query_state.relevance_score
        # Every suggestion goes into the evolution graph, improved or not
        add_query_to_graph(
            context,
            sug_state,
            iteration,
            evaluation_reason=sug_eval.reason,
            is_selected=is_selected
        )
        if is_selected:
            print(f"    ✓ {sug} (score: {sug_eval.relevance_score:.2f}, improved: {sug_eval.is_improved})")
            new_queries.append(sug_state)
        else:
            print(f"    ✗ {sug} (score: {sug_eval.relevance_score:.2f}, not improved)")
    # 3. Rewrite strategy: abstract upward
    if len(new_queries) < 3:  # not enough direct suggestions, try an abstract rewrite
        rewrite_input_abstract = f"""
<current_query>
{query}
</current_query>
<rewrite_requirements>
Type: abstract (generalize upward)
</rewrite_requirements>
Please rewrite this query.
"""
        result = await Runner.run(query_rewriter, rewrite_input_abstract)
        rewrite: QueryRewrite = result.final_output
        # Record the rewrite agent call
        rewrite_agent_call = {
            "agent": "Query Rewriting Expert",
            "action": "abstract rewrite",
            "input": {
                "query": query,
                "rewrite_type": "abstract"
            },
            "output": {
                "rewritten_query": rewrite.rewritten_query,
                "rewrite_type": rewrite.rewrite_type,
                "reasoning": rewrite.reasoning
            }
        }
        agent_calls.append(rewrite_agent_call)
        # Evaluate the rewritten query
        rewrite_eval = await evaluate_query_relevance(rewrite.rewritten_query, original_need, query_state.relevance_score, context)
        # Create the rewritten query state
        new_state = QueryState(
            query=rewrite.rewritten_query,
            level=query_state.level + 1,
            relevance_score=rewrite_eval.relevance_score,
            parent_query=query,
            strategy="rewrite_abstract"
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=rewrite_eval.reason,
            is_selected=rewrite_eval.is_improved
        )
        if rewrite_eval.is_improved:
            print(f"    ✓ rewrite (abstract): {rewrite.rewritten_query} (score: {rewrite_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f"    ✗ rewrite (abstract): {rewrite.rewritten_query} (score: {rewrite_eval.relevance_score:.2f}, not improved)")
    # 3.2. Rewrite strategy: synonyms
    if len(new_queries) < 4:  # still not enough, try a synonym rewrite
        rewrite_input_synonym = f"""
<current_query>
{query}
</current_query>
<rewrite_requirements>
Type: synonym
Rewrite the query using synonyms or related expressions; keep the meaning but change the wording.
</rewrite_requirements>
Please rewrite this query.
"""
        result = await Runner.run(query_rewriter, rewrite_input_synonym)
        rewrite_syn: QueryRewrite = result.final_output
        # Record the synonym-rewrite agent call
        rewrite_syn_agent_call = {
            "agent": "Query Rewriting Expert",
            "action": "synonym rewrite",
            "input": {
                "query": query,
                "rewrite_type": "synonym"
            },
            "output": {
                "rewritten_query": rewrite_syn.rewritten_query,
                "rewrite_type": rewrite_syn.rewrite_type,
                "reasoning": rewrite_syn.reasoning
            }
        }
        agent_calls.append(rewrite_syn_agent_call)
        # Evaluate the rewritten query
        rewrite_syn_eval = await evaluate_query_relevance(rewrite_syn.rewritten_query, original_need, query_state.relevance_score, context)
        # Create the rewritten query state
        new_state = QueryState(
            query=rewrite_syn.rewritten_query,
            level=query_state.level + 1,
            relevance_score=rewrite_syn_eval.relevance_score,
            parent_query=query,
            strategy="rewrite_synonym"
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=rewrite_syn_eval.reason,
            is_selected=rewrite_syn_eval.is_improved
        )
        if rewrite_syn_eval.is_improved:
            print(f"    ✓ rewrite (synonym): {rewrite_syn.rewritten_query} (score: {rewrite_syn_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f"    ✗ rewrite (synonym): {rewrite_syn.rewritten_query} (score: {rewrite_syn_eval.relevance_score:.2f}, not improved)")
    # 4. Word-insertion strategy
    unused_word = word_lib.get_unused_word(query)
    if unused_word and len(new_queries) < 5:
        insertion_input = f"""
<current_query>
{query}
</current_query>
<word_to_add>
{unused_word}
</word_to_add>
Please insert this word at the most natural position in the query.
"""
        result = await Runner.run(word_inserter, insertion_input)
        insertion: WordInsertion = result.final_output
        # Record the word-insertion agent call
        insertion_agent_call = {
            "agent": "Word Insertion Expert",
            "action": "insert word",
            "input": {
                "query": query,
                "word_to_add": unused_word
            },
            "output": {
                "new_query": insertion.new_query,
                "insertion_position": insertion.insertion_position,
                "reasoning": insertion.reasoning
            }
        }
        agent_calls.append(insertion_agent_call)
        # Evaluate the query with the inserted word
        insertion_eval = await evaluate_query_relevance(insertion.new_query, original_need, query_state.relevance_score, context)
        # Create the query state
        new_state = QueryState(
            query=insertion.new_query,
            level=query_state.level + 1,
            relevance_score=insertion_eval.relevance_score,
            parent_query=query,
            strategy="add_word"
        )
        # Add to the evolution graph whether or not it improved
        add_query_to_graph(
            context,
            new_state,
            iteration,
            evaluation_reason=insertion_eval.reason,
            is_selected=insertion_eval.is_improved
        )
        if insertion_eval.is_improved:
            print(f"    ✓ add word: {insertion.new_query} (score: {insertion_eval.relevance_score:.2f})")
            new_queries.append(new_state)
        else:
            print(f"    ✗ add word: {insertion.new_query} (score: {insertion_eval.relevance_score:.2f}, not improved)")
    # Log the full suggestion-branch result (hierarchical)
    add_step(context, f"Suggestion branch - {query}", "suggestion_branch", {
        "query": query,
        "query_level": query_state.level,
        "query_relevance": query_state.relevance_score,
        "suggestions_count": len(suggestions),
        "suggestions_evaluated": len(suggestion_evaluations),
        "suggestion_evaluations": suggestion_evaluations[:10],  # keep only the first 10
        "agent_calls": agent_calls,  # detailed record of every agent call
        "new_queries_generated": len(new_queries),
        "new_queries": [{"query": nq.query, "score": nq.relevance_score, "strategy": nq.strategy} for nq in new_queries],
        "no_suggestion_rounds": query_state.no_suggestion_rounds
    })
    return new_queries

async def process_search_results(
    query: str,
    query_state: QueryState,
    original_need: str,
    word_lib: WordLibrary,
    context: RunContext,
    xiaohongshu_search: XiaohongshuSearch,
    relevance_threshold: float,
    iteration: int
) -> tuple[list[dict], list[QueryState]]:
    """
    Process the search-result branch.
    Returns: (notes that satisfy the need, new queries to keep iterating on)
    """
    print(f"\n  [result branch] Searching query: {query}")
    # Collect every agent call made in this branch
    agent_calls = []
    # 1. Check the query's relevance against the threshold
    if query_state.relevance_score < relevance_threshold:
        print(f"    ✗ relevance {query_state.relevance_score:.2f} is below the threshold {relevance_threshold}; skipping the search")
        return [], []
    print(f"    ✓ relevance {query_state.relevance_score:.2f} meets the threshold; searching")
    # 2. Run the search
    try:
        search_result = xiaohongshu_search.search(keyword=query)
        result_str = search_result.get("result", "{}")
        if isinstance(result_str, str):
            result_data = json.loads(result_str)
        else:
            result_data = result_str
        notes = result_data.get("data", {}).get("data", [])
        print(f"    → search returned {len(notes)} notes")
    except Exception as e:
        print(f"    ✗ search failed: {e}")
        return [], []
    if not notes:
        return [], []
    # 3. Evaluate each note
    satisfied_notes = []
    partial_notes = []
    for note in notes[:10]:  # cap how many notes we evaluate
        note_data = process_note_data(note)
        title = note_data["title"] or ""
        desc = note_data["desc"] or ""
        # Skip notes with neither a title nor a description
        if not title and not desc:
            continue
        # Evaluate the note
        eval_input = f"""
<original_need>
{original_need}
</original_need>
<note>
Title: {title}
Description: {desc}
</note>
Please evaluate how well this note matches the original need.
"""
        result = await Runner.run(result_evaluator, eval_input)
        evaluation: ResultEvaluation = result.final_output
        # Record the result-evaluation agent call
        result_eval_agent_call = {
            "agent": "Result Match Evaluator",
            "action": "evaluate note match",
            "input": {
                "note_id": note_data.get("note_id"),
                "title": title,
                "desc": desc[:200]  # truncate long descriptions
            },
            "output": {
                "match_level": evaluation.match_level,
                "relevance_score": evaluation.relevance_score,
                "missing_aspects": evaluation.missing_aspects,
                "reason": evaluation.reason
            }
        }
        agent_calls.append(result_eval_agent_call)
        note_data["evaluation"] = {
            "match_level": evaluation.match_level,
            "relevance_score": evaluation.relevance_score,
            "missing_aspects": evaluation.missing_aspects,
            "reason": evaluation.reason
        }
        # Every evaluated note goes into the graph (satisfied, partial, and unsatisfied)
        add_note_to_graph(context, query, note_data)
        if evaluation.match_level == "satisfied":
            satisfied_notes.append(note_data)
            print(f"    ✓ satisfied: {title[:30]}... (score: {evaluation.relevance_score:.2f})")
        elif evaluation.match_level == "partial":
            partial_notes.append(note_data)
            print(f"    ~ partial: {title[:30]}... (missing: {', '.join(evaluation.missing_aspects[:2])})")
        else:  # unsatisfied
            print(f"    ✗ unsatisfied: {title[:30]}... (score: {evaluation.relevance_score:.2f})")
    # 4. Satisfied notes: keyword extraction into the word library is disabled
    #    so the library cannot grow without bound.
    new_queries = []
    if satisfied_notes:
        print(f"\n    ✓ found {len(satisfied_notes)} satisfying notes; skipping keyword extraction")
        # Keyword-extraction logic kept for reference, disabled to keep the library stable:
        # for note in satisfied_notes[:3]:
        #     extract_input = f"""
        # <note>
        # Title: {note['title']}
        # Description: {note['desc']}
        # </note>
        #
        # Please extract the core keywords.
        # """
        #     result = await Runner.run(keyword_extractor, extract_input)
        #     extraction: KeywordExtraction = result.final_output
        #
        #     # Add the new words to the library, tagged with their source
        #     note_id = note.get('note_id', 'unknown')
        #     for keyword in extraction.keywords:
        #         if keyword not in word_lib.words:
        #             word_lib.add_word(keyword, source=f"note:{note_id}")
        #             print(f"    + new word: {keyword} (source: {note_id})")
    # 5. Partially matching notes: improve the query
    if partial_notes and len(satisfied_notes) < 5:  # not enough satisfied notes: improve from the partial matches
        print(f"\n    Improving the query from {len(partial_notes)} partially matching notes...")
        # Collect every missing aspect
        all_missing = []
        for note in partial_notes:
            all_missing.extend(note["evaluation"]["missing_aspects"])
        if all_missing:
            improvement_input = f"""
<current_query>
{query}
</current_query>
<missing_aspects>
{', '.join(set(all_missing[:5]))}
</missing_aspects>
Please rework the query so it covers these missing aspects.
"""
            result = await Runner.run(query_improver, improvement_input)
            improvement: QueryImprovement = result.final_output
            # Record the query-improvement agent call
            improvement_agent_call = {
                "agent": "Query Improvement Expert",
                "action": "improve query from missing aspects",
                "input": {
                    "query": query,
                    "missing_aspects": list(set(all_missing[:5]))
                },
                "output": {
                    "improved_query": improvement.improved_query,
                    "added_aspects": improvement.added_aspects,
                    "reasoning": improvement.reasoning
                }
            }
            agent_calls.append(improvement_agent_call)
            # Evaluate the improved query
            improved_eval = await evaluate_query_relevance(improvement.improved_query, original_need, query_state.relevance_score, context)
            # Create the improved query state
            new_state = QueryState(
                query=improvement.improved_query,
                level=query_state.level + 1,
                relevance_score=improved_eval.relevance_score,
                parent_query=query,
                strategy="improve_from_partial"
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=improved_eval.reason,
                is_selected=improved_eval.is_improved
            )
            if improved_eval.is_improved:
                print(f"    ✓ improved: {improvement.improved_query} (added: {', '.join(improvement.added_aspects[:2])})")
                new_queries.append(new_state)
            else:
                print(f"    ✗ improved: {improvement.improved_query} (score: {improved_eval.relevance_score:.2f}, not improved)")
    # 6. Result-branch rewrites (abstract and synonym):
    # if the results are weak and few new queries were produced, rewrite the current query
    if len(satisfied_notes) < 3 and len(new_queries) < 2:
        print("\n    Results are weak; trying query rewrites...")
        # 6.1 Abstract upward
        if len(new_queries) < 3:
            rewrite_input_abstract = f"""
<current_query>
{query}
</current_query>
<rewrite_requirements>
Type: abstract (generalize upward)
</rewrite_requirements>
Please rewrite this query.
"""
            result = await Runner.run(query_rewriter, rewrite_input_abstract)
            rewrite: QueryRewrite = result.final_output
            # Record the abstract-rewrite agent call (result branch)
            rewrite_agent_call = {
                "agent": "Query Rewriting Expert",
                "action": "abstract rewrite (result branch)",
                "input": {
                    "query": query,
                    "rewrite_type": "abstract"
                },
                "output": {
                    "rewritten_query": rewrite.rewritten_query,
                    "rewrite_type": rewrite.rewrite_type,
                    "reasoning": rewrite.reasoning
                }
            }
            agent_calls.append(rewrite_agent_call)
            # Evaluate the rewritten query
            rewrite_eval = await evaluate_query_relevance(rewrite.rewritten_query, original_need, query_state.relevance_score, context)
            # Create the rewritten query state
            new_state = QueryState(
                query=rewrite.rewritten_query,
                level=query_state.level + 1,
                relevance_score=rewrite_eval.relevance_score,
                parent_query=query,
                strategy="result_rewrite_abstract"
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=rewrite_eval.reason,
                is_selected=rewrite_eval.is_improved
            )
            if rewrite_eval.is_improved:
                print(f"    ✓ rewrite (abstract): {rewrite.rewritten_query} (score: {rewrite_eval.relevance_score:.2f})")
                new_queries.append(new_state)
            else:
                print(f"    ✗ rewrite (abstract): {rewrite.rewritten_query} (score: {rewrite_eval.relevance_score:.2f}, not improved)")
        # 6.2 Synonym rewrite
        if len(new_queries) < 4:
            rewrite_input_synonym = f"""
<current_query>
{query}
</current_query>
<rewrite_requirements>
Type: synonym
Rewrite the query using synonyms or related expressions; keep the meaning but change the wording.
</rewrite_requirements>
Please rewrite this query.
"""
            result = await Runner.run(query_rewriter, rewrite_input_synonym)
            rewrite_syn: QueryRewrite = result.final_output
            # Record the synonym-rewrite agent call (result branch)
            rewrite_syn_agent_call = {
                "agent": "Query Rewriting Expert",
                "action": "synonym rewrite (result branch)",
                "input": {
                    "query": query,
                    "rewrite_type": "synonym"
                },
                "output": {
                    "rewritten_query": rewrite_syn.rewritten_query,
                    "rewrite_type": rewrite_syn.rewrite_type,
                    "reasoning": rewrite_syn.reasoning
                }
            }
            agent_calls.append(rewrite_syn_agent_call)
            # Evaluate the rewritten query
            rewrite_syn_eval = await evaluate_query_relevance(rewrite_syn.rewritten_query, original_need, query_state.relevance_score, context)
            # Create the rewritten query state
            new_state = QueryState(
                query=rewrite_syn.rewritten_query,
                level=query_state.level + 1,
                relevance_score=rewrite_syn_eval.relevance_score,
                parent_query=query,
                strategy="result_rewrite_synonym"
            )
            # Add to the evolution graph whether or not it improved
            add_query_to_graph(
                context,
                new_state,
                iteration,
                evaluation_reason=rewrite_syn_eval.reason,
                is_selected=rewrite_syn_eval.is_improved
            )
            if rewrite_syn_eval.is_improved:
                print(f"    ✓ rewrite (synonym): {rewrite_syn.rewritten_query} (score: {rewrite_syn_eval.relevance_score:.2f})")
                new_queries.append(new_state)
            else:
                print(f"    ✗ rewrite (synonym): {rewrite_syn.rewritten_query} (score: {rewrite_syn_eval.relevance_score:.2f}, not improved)")
    # Log the full result-branch outcome (hierarchical)
    add_step(context, f"Result branch - {query}", "result_branch", {
        "query": query,
        "query_level": query_state.level,
        "query_relevance": query_state.relevance_score,
        "relevance_threshold": relevance_threshold,
        "passed_threshold": query_state.relevance_score >= relevance_threshold,
        "notes_count": len(notes),  # every early return happens above, so notes is always bound here
        "satisfied_count": len(satisfied_notes),
        "partial_count": len(partial_notes),
        "satisfied_notes": [
            {
                "note_id": note["note_id"],
                "title": note["title"],
                "score": note["evaluation"]["relevance_score"],
                "match_level": note["evaluation"]["match_level"]
            }
            for note in satisfied_notes[:10]  # keep only the first 10
        ],
        "agent_calls": agent_calls,  # detailed record of every agent call
        "new_queries_generated": len(new_queries),
        "new_queries": [{"query": nq.query, "score": nq.relevance_score, "strategy": nq.strategy} for nq in new_queries]
    })
    return satisfied_notes, new_queries

async def iterative_search_loop(
    context: RunContext,
    max_iterations: int = 20,
    max_concurrent_queries: int = 5,
    relevance_threshold: float = 0.6
) -> list[dict]:
    """
    Main loop: iterative search.

    Args:
        context: run context
        max_iterations: maximum number of iterations
        max_concurrent_queries: maximum number of queries processed per batch
        relevance_threshold: relevance threshold a query must meet before searching

    Returns:
        The list of notes that satisfy the need.
    """
    print(f"\n{'='*60}")
    print("Starting the iterative search loop")
    print(f"{'='*60}")
    # 0. Add the original question as the root node
    root_query_state = QueryState(
        query=context.q,
        level=0,
        relevance_score=1.0,  # the original question is fully relevant by definition
        strategy="root"
    )
    add_query_to_graph(context, root_query_state, 0, evaluation_reason="Original question; root node of the search", is_selected=True)
    print(f"[root] Original question: {context.q}")
    # 1. Initialize the word library
    word_lib = await initialize_word_library(context.q, context)
    # 2. Initialize the query queue: pick the most relevant words
    all_words = list(word_lib.words)
    query_queue = []
    print("\nEvaluating the relevance of every initial word...")
    word_scores = []
    for word in all_words:
        # Evaluate each word against the original question
        eval_result = await evaluate_query_relevance(word, context.q, None, context)
        word_scores.append({
            'word': word,
            'score': eval_result.relevance_score,
            'eval': eval_result
        })
        print(f"  {word}: {eval_result.relevance_score:.2f}")
    # Sort by relevance and keep the top 3
    word_scores.sort(key=lambda x: x['score'], reverse=True)
    selected_words = word_scores[:3]
    # Add every word to the evolution graph (including the unselected ones)
    for item in word_scores:
        is_selected = item in selected_words
        query_state = QueryState(
            query=item['word'],
            level=1,
            relevance_score=item['score'],
            strategy="initial",
            parent_query=context.q  # the parent is the original question
        )
        # Adding to the graph also creates the edge from parent_query to this query
        add_query_to_graph(context, query_state, 0, evaluation_reason=item['eval'].reason, is_selected=is_selected)
        # Only the selected words enter the queue
        if is_selected:
            query_queue.append(query_state)
    print(f"\nInitial query queue (selected by relevance): {[(q.query, f'{q.relevance_score:.2f}') for q in query_queue]}")
    print(f"  (evaluated {len(word_scores)} words, selected the top {len(query_queue)})")
    # 3. API instances
    xiaohongshu_api = XiaohongshuSearchRecommendations()
    xiaohongshu_search = XiaohongshuSearch()
    # 4. Main loop
    all_satisfied_notes = []
    iteration = 0
    while query_queue and iteration < max_iterations:
        iteration += 1
        print(f"\n{'='*60}")
        print(f"Iteration {iteration}: {len(query_queue)} queries in the queue")
        print(f"{'='*60}")
        # Cap the batch size
        current_batch = query_queue[:max_concurrent_queries]
        query_queue = query_queue[max_concurrent_queries:]
        # Log which queries this round processes
        add_step(context, f"Iteration {iteration}", "iteration", {
            "iteration": iteration,
            "queue_size": len(query_queue) + len(current_batch),
            "processing_queries": [q.query for q in current_batch]
        })
        new_queries_from_sug = []
        new_queries_from_result = []
        batch_satisfied_notes = []  # accumulated across the batch so the round summary is correct
        # Process each query
        for query_state in current_batch:
            print(f"\nProcessing query [{query_state.level}]: {query_state.query} (score: {query_state.relevance_score:.2f})")
            # Check the termination conditions
            if query_state.is_terminated or query_state.no_suggestion_rounds >= 2:
                print("  ✗ terminated, or 2 consecutive rounds without suggestions; skipping")
                query_state.is_terminated = True
                continue
            # Run both branches concurrently
            sug_task = process_suggestions(
                query_state.query, query_state, context.q, word_lib, context, xiaohongshu_api, iteration
            )
            result_task = process_search_results(
                query_state.query, query_state, context.q, word_lib, context,
                xiaohongshu_search, relevance_threshold, iteration
            )
            # Wait for both branches
            sug_queries, (satisfied_notes, result_queries) = await asyncio.gather(
                sug_task,
                result_task
            )
            # process_suggestions has already updated query_state.no_suggestion_rounds when
            # it returned nothing; if neither branch produced new queries, terminate this one.
            if not sug_queries and not result_queries:
                query_state.is_terminated = True
                print("  ⚠ neither branch produced new queries; terminating this query")
            new_queries_from_sug.extend(sug_queries)
            new_queries_from_result.extend(result_queries)
            batch_satisfied_notes.extend(satisfied_notes)
        all_satisfied_notes.extend(batch_satisfied_notes)
        # Update the queue
        all_new_queries = new_queries_from_sug + new_queries_from_result
        # The branch functions already added each new query to the evolution graph
        # together with its evaluation reason, so they are not re-added here.
        query_queue.extend(all_new_queries)
        # Deduplicate by query text and drop terminated queries
        seen = set()
        unique_queue = []
        for q in query_queue:
            if q.query not in seen and not q.is_terminated:
                seen.add(q.query)
                unique_queue.append(q)
        query_queue = unique_queue
        # Sort by relevance
        query_queue.sort(key=lambda x: x.relevance_score, reverse=True)
        print("\nRound summary:")
        print(f"  new satisfied notes: {len(batch_satisfied_notes)}")
        print(f"  total satisfied notes: {len(all_satisfied_notes)}")
        print(f"  new queries: {len(all_new_queries)}")
        print(f"  queue remaining: {len(query_queue)}")
        # Persist the word library to the context
        context.word_library = word_lib.model_dump()
        # Stop early once enough satisfying notes have been found
        if len(all_satisfied_notes) >= 20:
            print(f"\nFound enough satisfying notes ({len(all_satisfied_notes)}); stopping early")
            break
    print(f"\n{'='*60}")
    print("Iterative search finished")
    print(f"  total iterations: {iteration}")
    print(f"  satisfied notes: {len(all_satisfied_notes)}")
    print(f"  final word library size: {len(word_lib.words)}")
    print(f"{'='*60}")
    # Save the final summary
    add_step(context, "Iterative search finished", "loop_complete", {
        "total_iterations": iteration,
        "total_satisfied_notes": len(all_satisfied_notes),
        "final_word_library_size": len(word_lib.words),
        "final_word_library": list(word_lib.words)
    })
    return all_satisfied_notes

# ============================================================================
# Main entry point
# ============================================================================
async def main(input_dir: str, max_iterations: int = 20, visualize: bool = False):
    """Entry point."""
    current_time, log_url = set_trace()
    # Read the inputs
    input_context_file = os.path.join(input_dir, 'context.md')
    input_q_file = os.path.join(input_dir, 'q.md')
    q_context = read_file_as_string(input_context_file)
    q = read_file_as_string(input_q_file)
    q_with_context = f"""
<need_context>
{q_context}
</need_context>
<current_question>
{q}
</current_question>
""".strip()
    # Version info
    version = os.path.basename(__file__)
    version_name = os.path.splitext(version)[0]
    # Log directory
    log_dir = os.path.join(input_dir, "output", version_name, current_time)
    # Create the run context
    run_context = RunContext(
        version=version,
        input_files={
            "input_dir": input_dir,
            "context_file": input_context_file,
            "q_file": input_q_file,
        },
        q_with_context=q_with_context,
        q_context=q_context,
        q=q,
        log_dir=log_dir,
        log_url=log_url,
    )
    # Run the iterative search
    satisfied_notes = await iterative_search_loop(
        run_context,
        max_iterations=max_iterations,
        max_concurrent_queries=3,
        relevance_threshold=0.6
    )
    # Store the results
    run_context.satisfied_notes = satisfied_notes
    # Format the output
    output = f"Original question: {run_context.q}\n"
    output += f"Notes that satisfy the need: {len(satisfied_notes)}\n"
    output += f"Word library size: {len(run_context.word_library.get('words', []))} words\n"
    output += "\n" + "=" * 60 + "\n"
    if satisfied_notes:
        output += "[Satisfying notes]\n\n"
        for idx, note in enumerate(satisfied_notes[:10], 1):
            output += f"{idx}. {note['title']}\n"
            output += f"   relevance: {note['evaluation']['relevance_score']:.2f}\n"
            output += f"   URL: {note['note_url']}\n\n"
    else:
        output += "No satisfying notes found\n"
    run_context.final_output = output
    print(f"\n{'='*60}")
    print("Final result")
    print(f"{'='*60}")
    print(output)
    # Save the logs
    os.makedirs(run_context.log_dir, exist_ok=True)
    context_file_path = os.path.join(run_context.log_dir, "run_context.json")
    context_dict = run_context.model_dump()
    with open(context_file_path, "w", encoding="utf-8") as f:
        json.dump(context_dict, f, ensure_ascii=False, indent=2)
    print(f"\nRunContext saved to: {context_file_path}")
    steps_file_path = os.path.join(run_context.log_dir, "steps.json")
    with open(steps_file_path, "w", encoding="utf-8") as f:
        json.dump(run_context.steps, f, ensure_ascii=False, indent=2)
    print(f"Steps log saved to: {steps_file_path}")
    # Save the query evolution graph
    query_graph_file_path = os.path.join(run_context.log_dir, "query_graph.json")
    with open(query_graph_file_path, "w", encoding="utf-8") as f:
        json.dump(run_context.query_graph, f, ensure_ascii=False, indent=2)
    print(f"Query graph saved to: {query_graph_file_path}")
    # Visualization
    if visualize:
        import subprocess
        output_html = os.path.join(run_context.log_dir, "visualization.html")
        print("\n🎨 Generating the visualization HTML...")
        result = subprocess.run([
            sys.executable,  # use the current interpreter rather than whatever "python" resolves to
            "sug_v6_1_2_3.visualize.py",  # the v6.1.2.3 visualizer script is reused as-is
            steps_file_path,
            "-o", output_html
        ])
        if result.returncode == 0:
            print(f"✅ Visualization written to: {output_html}")
        else:
            print("❌ Visualization generation failed")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Search-query optimization tool - v6.1.2.5, iterative-loop edition")
    parser.add_argument(
        "--input-dir",
        type=str,
        default="input/简单扣图",
        help="Input directory path (default: input/简单扣图)"
    )
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=20,
        help="Maximum number of iterations (default: 20)"
    )
    parser.add_argument(
        "--visualize",
        action="store_true",
        default=False,
        help="Generate the visualization HTML after the run"
    )
    args = parser.parse_args()
    asyncio.run(main(args.input_dir, max_iterations=args.max_iterations, visualize=args.visualize))
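
# CLI usage sketch (arguments illustrative):
#     python sug_v6_1_2_5.py --input-dir input/简单扣图 --max-iterations 10 --visualize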