#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Search System V2

Full pipeline with LLM evaluation and extended search.
"""
import json
import logging
import copy
import time
import os
import argparse
import subprocess
from typing import Dict, List, Any, Optional, Set, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations

from openrouter_client import OpenRouterClient
from llm_evaluator import LLMEvaluator
from xiaohongshu_search import XiaohongshuSearch

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('enhanced_search_v2.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class EnhancedSearchV2:
    """Enhanced search system V2."""

    def __init__(
        self,
        how_json_path: str,
        dimension_associations_path: str,
        optimized_clustered_data_path: str,
        openrouter_api_key: Optional[str] = None,
        output_dir: str = "output_v2",
        top_n: int = 10,
        max_total_searches: Optional[int] = None,
        search_max_workers: int = 3
    ):
        """
        Initialize the system.

        Args:
            how_json_path: Path to the How-deconstruction file
            dimension_associations_path: Path to the dimension-associations file
            optimized_clustered_data_path: Path to the persona feature library
            openrouter_api_key: OpenRouter API key
            output_dir: Output directory
            top_n: Keep the N highest-scoring search words per original feature (default 10)
            max_total_searches: Global cap on the number of searches (default None, unlimited)
            search_max_workers: Search concurrency (default 3)
        """
        self.how_json_path = how_json_path
        self.dimension_associations_path = dimension_associations_path
        self.optimized_clustered_data_path = optimized_clustered_data_path
        self.output_dir = output_dir
        self.top_n = top_n
        self.max_total_searches = max_total_searches
        self.search_max_workers = search_max_workers

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Load the data files
        logger.info("Loading data files...")
        self.how_data = self._load_json(how_json_path)
        self.dimension_associations = self._load_json(dimension_associations_path)
        self.optimized_clustered_data = self._load_json(optimized_clustered_data_path)

        # Initialize components
        logger.info("Initializing components...")
        self.openrouter_client = OpenRouterClient(
            api_key=openrouter_api_key,
            model="google/gemini-2.5-flash",
            retry_delay=5  # Longer retry delay to avoid rate limiting
        )
        self.llm_evaluator = LLMEvaluator(self.openrouter_client)
        self.search_client = XiaohongshuSearch()
        logger.info("System initialized")

    def _load_json(self, file_path: str) -> Any:
        """Load a JSON file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load {file_path}: {e}")
            raise

    def _save_json(self, data: Any, file_path: str):
        """Save data to a JSON file."""
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Saved: {file_path}")
        except Exception as e:
            logger.error(f"Failed to save {file_path}: {e}")
            raise

    # ========== Stage 1: select features with 0.5 <= similarity < 0.8 ==========

    def stage1_filter_features(self) -> List[Dict[str, Any]]:
        """
        Stage 1: select medium-similarity features.

        Filter condition: 0.5 <= max similarity < 0.8.

        Returns:
            List of selected features
        """
        logger.info("=" * 60)
        logger.info("Stage 1: select medium-similarity features (0.5 <= similarity < 0.8)")
        logger.info("=" * 60)

        results = []
        how_result = self.how_data.get('how解构结果', {})

        total_features = 0
        filtered_out_low = 0   # < 0.5
        filtered_out_high = 0  # >= 0.8
        selected_count = 0

        # Walk the three dimensions
        for level_name, level_list in how_result.items():
            if not isinstance(level_list, list):
                continue
            logger.info(f"\nProcessing {level_name}...")
            for item_idx, item in enumerate(level_list):
                item_name = item.get('名称', f'未命名-{item_idx}')
                how_steps = item.get('how步骤列表', [])
                for step in how_steps:
                    features = step.get('特征列表', [])
                    for feature in features:
                        feature_name = feature.get('特征名称', '')
                        match_results = feature.get('匹配结果', [])
                        total_features += 1
                        if not match_results:
                            continue

                        # Find the highest similarity
                        max_similarity = max(
                            (m.get('匹配结果', {}).get('相似度', 0) for m in match_results),
                            default=0
                        )

                        # Apply the filter
                        if max_similarity < 0.5:
                            filtered_out_low += 1
                            continue
                        elif max_similarity >= 0.8:
                            filtered_out_high += 1
                            continue

                        # 0.5 <= max_similarity < 0.8: keep it.
                        # Sort matches by similarity (descending) and take the top 3.
                        sorted_matches = sorted(
                            match_results,
                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
                            reverse=True
                        )
                        top3_matches = sorted_matches[:3]

                        # Build the top-3 match info list
                        top3_match_info = []
                        for match in top3_matches:
                            feature_classification = match.get('特征分类', [])
                            classification_path = self._build_classification_path(feature_classification)
                            # If the path is empty and the match is a classification,
                            # search the library to complete the path
                            if not classification_path and match.get('特征类型') == '分类':
                                feature_name_to_search = match.get('人设特征名称', '')
                                classification_path = self._search_classification_path(feature_name_to_search)
                            is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
                            top3_match_info.append({
                                '人设特征名称': match.get('人设特征名称'),
                                '人设特征层级': match.get('人设特征层级'),
                                '特征类型': match.get('特征类型'),
                                '特征分类': feature_classification,
                                '相似度': match.get('匹配结果', {}).get('相似度', 0),
                                '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                '是分类': is_classification,
                                '所属分类路径': classification_path
                            })

                        result_item = {
                            '原始特征名称': feature_name,
                            '来源层级': level_name,
                            '权重': feature.get('权重', 0),
                            '所属点名称': item_name,
                            '最高匹配信息': top3_match_info[0],  # Keep the first one for Stage 2
                            'top3匹配信息': top3_match_info      # New field
                        }
                        results.append(result_item)
                        selected_count += 1

                        # Show the top-3 matches
                        top3_names = [m['人设特征名称'] for m in top3_match_info]
                        logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")

        # Summary
        logger.info("\n" + "=" * 60)
        logger.info("Stage 1 complete")
        logger.info(f"  Total features: {total_features}")
        logger.info(f"  Filtered out (<0.5): {filtered_out_low}")
        logger.info(f"  Filtered out (>=0.8): {filtered_out_high}")
        logger.info(f"  Kept (0.5-0.8): {selected_count}")
        logger.info("=" * 60)

        # Save results
        output_path = os.path.join(self.output_dir, "stage1_filtered_features.json")
        self._save_json(results, output_path)
        return results
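
    # Illustrative shape of one Stage 1 output item (field values here are
    # hypothetical; the keys mirror result_item above):
    # {
    #   "原始特征名称": "治愈氛围",
    #   "来源层级": "灵感点列表",
    #   "权重": 0.6,
    #   "所属点名称": "...",
    #   "最高匹配信息": {"人设特征名称": "...", "相似度": 0.72, "是分类": false, ...},
    #   "top3匹配信息": [ ... up to 3 entries, sorted by similarity ... ]
    # }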

    def _build_classification_path(self, feature_classification: List[str]) -> str:
        """
        Build a classification path.

        Args:
            feature_classification: Classification array of the feature

        Returns:
            Classification path
        """
        if not feature_classification:
            return ""
        # Step 1: strip the "实质" suffix from middle elements
        cleaned = []
        for i, item in enumerate(feature_classification):
            if i == len(feature_classification) - 1:  # Keep the last element
                cleaned.append(item)
            elif item.endswith("实质") and i != 0:  # Strip "实质" from middle elements
                cleaned.append(item[:-2])
            else:
                cleaned.append(item)
        # Step 2: reverse the array
        reversed_list = list(reversed(cleaned))
        # Step 3: join into a path
        path = "/".join(reversed_list)
        return path
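
    # Worked example (hypothetical input, derived from the steps above):
    #   ['布偶猫', '猫咪宠物实质', '实质']
    #   step 1 -> ['布偶猫', '猫咪宠物', '实质']   (middle "实质" suffix stripped)
    #   step 2 -> ['实质', '猫咪宠物', '布偶猫']   (reversed)
    #   step 3 -> "实质/猫咪宠物/布偶猫"           (joined with "/")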

    def _is_classification(self, persona_feature_name: str, classification_path: str) -> bool:
        """
        Decide whether a name is a classification or a feature.

        Args:
            persona_feature_name: Persona feature name
            classification_path: Classification path

        Returns:
            True if it is a classification, False if it is a feature
        """
        # Look it up in optimized_clustered_data:
        # - if it appears in the node's feature list, it is a feature;
        # - if it exists as a child node with children, it is a classification.

        # Navigate to the node
        node = self._navigate_to_node(classification_path)
        if not node:
            return False

        # Check the feature list
        features = node.get('特征列表', [])
        for f in features:
            if f.get('特征名称') == persona_feature_name:
                return False  # Found in the feature list: it is a feature

        # Check whether it exists as a child node
        if persona_feature_name in node:
            sub_node = node[persona_feature_name]
            if isinstance(sub_node, dict):
                return True  # It is a child node: a classification

        return False  # Default: treat it as a feature
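
    # Illustrative decision on a hypothetical node:
    #   node = {'特征列表': [{'特征名称': '布偶猫'}], '猫咪宠物': {...}}
    #   _is_classification('布偶猫', path)   -> False (listed in '特征列表')
    #   _is_classification('猫咪宠物', path) -> True  (exists as a dict child node)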

    def _navigate_to_node(self, path: str) -> Optional[Dict[str, Any]]:
        """
        Navigate to the node at the given path.

        Args:
            path: Path such as "实质/猫咪宠物"

        Returns:
            The node, or None if not found
        """
        if not path:
            return None
        parts = path.split('/')
        first_part = parts[0]

        # Determine the top-level key
        top_level_map = {
            '意图': '目的点',
            '要素': '目的点',
            '实质': None,
            '形式': None,
            '场景': None
        }
        top_keys = []
        if first_part in top_level_map:
            mapped = top_level_map[first_part]
            if mapped:
                top_keys.append(mapped)
        if not top_keys:
            top_keys = ['灵感点列表', '关键点列表', '目的点']

        # Try each top-level container
        for top_key in top_keys:
            current = self.optimized_clustered_data.get(top_key)
            if not current:
                continue
            # Walk down level by level
            found = True
            for part in parts:
                if isinstance(current, dict) and part in current:
                    current = current[part]
                else:
                    found = False
                    break
            if found and isinstance(current, dict):
                return current
        return None
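
    # Example walk (hypothetical data): for "实质/猫咪宠物" the first segment
    # "实质" maps to None, so all three top-level containers are tried in turn,
    # e.g. optimized_clustered_data['灵感点列表']['实质']['猫咪宠物'].
    # The first container in which every segment resolves to a dict is returned.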

    def _recursive_search(
        self,
        obj: Dict[str, Any],
        target_name: str,
        current_path: str = ""
    ) -> Optional[str]:
        """
        Recursively search for a classification node.

        Args:
            obj: Object currently being searched
            target_name: Target classification name
            current_path: Current path

        Returns:
            Full path if found, otherwise None
        """
        if not isinstance(obj, dict):
            return None
        # Walk all keys
        for key in obj.keys():
            # Skip metadata and feature lists
            if key in ['_meta', '特征列表']:
                continue
            # Check for a match
            if target_name in key or key in target_name:
                # Match found: return the path
                if current_path:
                    return f"{current_path}/{key}"
                else:
                    return key
            # Recurse into child nodes
            if isinstance(obj[key], dict):
                next_path = f"{current_path}/{key}" if current_path else key
                result = self._recursive_search(obj[key], target_name, next_path)
                if result:
                    return result
        return None
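
    # Note: the bidirectional substring test above is deliberately fuzzy, so a
    # target like "猫咪" matches the key "猫咪宠物" and vice versa; the first
    # hit in depth-first order wins.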

    def _search_classification_path(self, classification_name: str) -> str:
        """
        Search optimized_clustered_data for a classification node's path.

        Args:
            classification_name: Classification name, e.g. "实体物品实质"

        Returns:
            Full path such as "实质/实体物品", or "" if not found
        """
        if not classification_name:
            return ""
        # Clean the name: strip common suffixes
        clean_name = classification_name
        for suffix in ['实质', '意图', '形式', '要素']:
            if clean_name.endswith(suffix) and len(clean_name) > len(suffix):
                clean_name = clean_name[:-len(suffix)]
                break
        logger.info(f"  Searching classification: {classification_name} → cleaned to: {clean_name}")
        # Search the three top-level containers
        for top_key in ['灵感点列表', '关键点列表', '目的点']:
            top_data = self.optimized_clustered_data.get(top_key, {})
            if not top_data:
                continue
            # Recursive search
            path = self._recursive_search(top_data, clean_name, "")
            if path:
                logger.info(f"  ✓ Found path: {path}")
                return path
        logger.warning(f"  ✗ Classification path not found: {classification_name}")
        return ""

    # ========== Stage 2: collect associated classifications + tags + sub-classifications ==========

    def stage2_find_associations(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 2: find associated classifications and collect their names, tags,
        and sub-classifications.

        Improvement: associations are looked up for each of the top-3 base words.

        Args:
            filtered_features: Features selected in Stage 1

        Returns:
            Feature list enriched with association info
        """
        logger.info("=" * 60)
        logger.info("Stage 2: find associated classifications (per base word)")
        logger.info("=" * 60)
        for idx, feature in enumerate(filtered_features, 1):
            logger.info(f"\n[{idx}/{len(filtered_features)}] Processing: {feature['原始特征名称']}")

            # Get the top-3 base words
            top3_info = feature.get('top3匹配信息', [])
            if not top3_info:
                logger.warning("  No top-3 match info, skipping")
                feature['找到的关联_按base_word'] = {}
                continue
            logger.info(f"  Found {len(top3_info)} base words")

            # Look up associations for each base word
            associations_by_base_word = {}
            for base_idx, base_info in enumerate(top3_info, 1):
                base_word = base_info.get('人设特征名称', '')
                is_classification = base_info['是分类']
                classification_path = base_info['所属分类路径']
                source_level = base_info['人设特征层级']
                logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word}")
                if is_classification:
                    search_path = classification_path
                    logger.info(f"    Matched a classification: {search_path}")
                else:
                    search_path = classification_path
                    logger.info(f"    Matched a feature, using its classification: {search_path}")

                # Find associations
                associations = self._find_associations(search_path, source_level)

                # Collect association info
                base_word_associations = []
                for assoc in associations:
                    target_path = assoc['目标分类']
                    # Collect info about the target classification
                    classification_info = self._collect_classification_info(target_path)
                    if classification_info:
                        base_word_associations.append({
                            '来源方向': assoc['来源方向'],
                            '关联类型': assoc['关联类型'],
                            '目标分类路径': target_path,
                            '共同帖子数': assoc['共同帖子数'],
                            'Jaccard相似度': assoc['Jaccard相似度'],
                            '分类名称': classification_info['classification_name'],
                            '标签列表': classification_info['tags'],
                            '子分类列表': classification_info['sub_classifications']
                        })
                associations_by_base_word[base_word] = base_word_associations
                logger.info(f"    Found {len(base_word_associations)} associations")

            # Store the results
            feature['找到的关联_按base_word'] = associations_by_base_word
            # Backward compatibility: keep the associations of the highest match
            # (i.e. the first base word)
            first_base_word = top3_info[0].get('人设特征名称', '')
            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])
            total_associations = sum(len(v) for v in associations_by_base_word.values())
            logger.info(f"  {total_associations} associations in total ({len(associations_by_base_word)} base words)")

        # Save results
        output_path = os.path.join(self.output_dir, "stage2_associations.json")
        self._save_json(filtered_features, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 2 complete")
        logger.info("=" * 60)
        return filtered_features

    def _find_associations(self, classification_path: str, source_level: str) -> List[Dict[str, Any]]:
        """
        Find associated nodes.

        Args:
            classification_path: Classification path
            source_level: Source level

        Returns:
            List of associated nodes
        """
        associations = []
        # Determine the dimension name
        if '灵感点' in source_level:
            dimension_key = '灵感点维度'
        elif '关键点' in source_level:
            dimension_key = '关键点维度'
        elif '目的点' in source_level:
            dimension_key = '目的点维度'
        else:
            return associations

        # Get the dimension data
        single_dim = self.dimension_associations.get('单维度关联分析', {})
        dimension_data = single_dim.get(dimension_key, {})
        if not dimension_data:
            return associations

        # Walk all directions
        for direction_key, direction_data in dimension_data.items():
            if direction_key == '说明':
                continue
            # Look up the source classification
            if classification_path in direction_data:
                source_data = direction_data[classification_path]
                # Collect the associated nodes
                for assoc_key in source_data.keys():
                    if assoc_key.startswith('与') and assoc_key.endswith('的关联'):
                        assoc_list = source_data[assoc_key]
                        for assoc_item in assoc_list:
                            associations.append({
                                '来源方向': direction_key,
                                '关联类型': assoc_key,
                                '目标分类': assoc_item.get('目标分类'),
                                '目标层级': assoc_item.get('目标层级'),
                                '共同帖子数': assoc_item.get('共同帖子数'),
                                'Jaccard相似度': assoc_item.get('Jaccard相似度'),
                                '共同帖子ID': assoc_item.get('共同帖子ID', [])
                            })
        return associations
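
    # Expected shape of dimension_associations, inferred from the lookups above
    # (concrete paths and numbers are hypothetical):
    # {
    #   "单维度关联分析": {
    #     "灵感点维度": {
    #       "<direction>": {
    #         "实质/猫咪宠物": {
    #           "与关键点的关联": [
    #             {"目标分类": "形式/特写镜头", "目标层级": "关键点列表",
    #              "共同帖子数": 12, "Jaccard相似度": 0.31, "共同帖子ID": ["..."]}
    #           ]
    #         }
    #       }
    #     }
    #   }
    # }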

    def _collect_classification_info(self, classification_path: str) -> Optional[Dict[str, Any]]:
        """
        Collect classification info: name + tags + sub-classifications.

        Args:
            classification_path: Classification path

        Returns:
            Classification info
        """
        node = self._navigate_to_node(classification_path)
        if not node:
            return None
        # Classification name (last path segment)
        classification_name = classification_path.split('/')[-1]
        # Tags (the node's feature list)
        tags = [f.get('特征名称', '') for f in node.get('特征列表', [])]
        # Sub-classifications (child nodes, excluding _meta and the feature list)
        sub_classifications = [
            key for key in node.keys()
            if isinstance(node[key], dict) and key not in ['_meta', '特征列表']
        ]
        return {
            'classification_name': classification_name,
            'tags': tags,
            'sub_classifications': sub_classifications
        }
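
    # For a node at "实质/猫咪宠物" (hypothetical data) this would return, e.g.:
    # {
    #     'classification_name': '猫咪宠物',
    #     'tags': ['布偶猫', '英短'],
    #     'sub_classifications': ['猫咪日常', '猫咪用品']
    # }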

    # ========== Stage 3: select high-similarity matches (>0.8) ==========

    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 3: select high-similarity matches (>0.8).

        Improvement: candidate words are selected independently for each base
        word. Within that base word's association scope, find matches in the
        How-deconstruction with similarity > 0.8.

        Args:
            associations_data: Association data from Stage 2

        Returns:
            Data enriched with high-similarity candidates
        """
        logger.info("=" * 60)
        logger.info("Stage 3: select high-similarity matches (>0.8, per base word)")
        logger.info("=" * 60)
        for idx, feature_result in enumerate(associations_data, 1):
            original_feature_name = feature_result['原始特征名称']
            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {original_feature_name}")

            # Get the top-3 base words
            top3_info = feature_result.get('top3匹配信息', [])
            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
            if not top3_info or not associations_by_base_word:
                logger.warning("  No top-3 match info or association data, skipping")
                feature_result['高相似度候选_按base_word'] = {}
                continue
            logger.info(f"  Found {len(top3_info)} base words")

            # Select candidate words independently for each base word
            candidates_by_base_word = {}
            for base_idx, base_info in enumerate(top3_info, 1):
                base_word = base_info.get('人设特征名称', '')
                logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word}")

                # Step 1: collect this base word's association scope
                base_word_associations = associations_by_base_word.get(base_word, [])
                base_word_scope = self._collect_scope_from_associations(base_word_associations)
                logger.info(f"    Scope contains {len(base_word_scope)} classifications/tags")
                if not base_word_scope:
                    logger.warning("    Empty scope, skipping")
                    candidates_by_base_word[base_word] = []
                    continue

                # Step 2: walk the How-deconstruction and find high-similarity matches
                high_sim_candidates = []
                total_checked = 0
                high_sim_found = 0
                how_result = self.how_data.get('how解构结果', {})
                for level_name, level_list in how_result.items():
                    if not isinstance(level_list, list):
                        continue
                    for item in level_list:
                        for step in item.get('how步骤列表', []):
                            for feature in step.get('特征列表', []):
                                matches = feature.get('匹配结果', [])
                                total_checked += len(matches)
                                # Keep matches with similarity > 0.8 that fall
                                # inside this base word's scope
                                for match in matches:
                                    sim = match.get('匹配结果', {}).get('相似度', 0)
                                    persona_feature_name = match.get('人设特征名称', '')
                                    if sim > 0.8 and persona_feature_name in base_word_scope:
                                        high_sim_found += 1
                                        high_sim_candidates.append({
                                            '人设特征名称': persona_feature_name,
                                            '相似度': sim,
                                            '特征类型': match.get('特征类型', ''),
                                            '特征分类': match.get('特征分类', []),
                                            '人设特征层级': match.get('人设特征层级', ''),
                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                            '来源原始特征': feature.get('特征名称', '')
                                        })
                logger.info(f"    Checked {total_checked} matches")
                logger.info(f"    Found {high_sim_found} matches with similarity > 0.8")

                # Sort by similarity (descending) and deduplicate
                seen_names = set()
                unique_candidates = []
                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
                for candidate in high_sim_candidates:
                    name = candidate['人设特征名称']
                    if name not in seen_names:
                        seen_names.add(name)
                        unique_candidates.append(candidate)
                candidates_by_base_word[base_word] = unique_candidates
                logger.info(f"    {len(unique_candidates)} candidates after deduplication")

                # Show the first 5
                if unique_candidates:
                    logger.info("    Top 5:")
                    for c in unique_candidates[:5]:
                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")

            # Store the results
            feature_result['高相似度候选_按base_word'] = candidates_by_base_word
            # Backward compatibility: keep the first base word's candidates
            first_base_word = top3_info[0].get('人设特征名称', '')
            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])
            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
            logger.info(f"  Selected {total_candidates} candidates in total ({len(candidates_by_base_word)} base words)")

        # Save results
        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
        self._save_json(associations_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 3 complete")
        logger.info("=" * 60)
        return associations_data

    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
        """
        Collect all classification names and tags from an association list into
        a scope set.

        Args:
            associations: Association list

        Returns:
            Set of all classification names and tags
        """
        scope = set()
        for assoc in associations:
            # Add the classification name
            scope.add(assoc['分类名称'])
            # Add all tags
            tags = assoc.get('标签列表', [])
            scope.update(tags)
        return scope
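
    # E.g. two associations (hypothetical) with 分类名称 "猫咪宠物" / "特写镜头"
    # and 标签列表 ['布偶猫'] / [] yield the scope
    # {'猫咪宠物', '特写镜头', '布偶猫'}.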

    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
        """
        Collect all classification names and tags found in Stage 2 into a scope
        set (kept for compatibility with the old flow).

        Args:
            feature_result: Feature result data

        Returns:
            Set of all classification names and tags
        """
        associations = feature_result.get('找到的关联', [])
        return self._collect_scope_from_associations(associations)

    def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
        """
        Look up a feature list by path.

        Args:
            target_classification: Target classification path

        Returns:
            Feature list
        """
        node = self._navigate_to_node(target_classification)
        if not node:
            return []
        features = node.get('特征列表', [])
        # Deep-copy so later stages can mutate the entries safely
        return copy.deepcopy(features)

    # ========== Stage 4: multi-word combinations + LLM evaluation ==========

    def stage4_generate_and_evaluate_search_words(
        self,
        features_data: List[Dict[str, Any]],
        max_workers: int = 4,
        max_candidates: int = 20,
        max_combo_length: int = 4
    ) -> List[Dict[str, Any]]:
        """
        Stage 4: multi-word combinations + LLM evaluation.

        From the Stage 1 base words and the Stage 3 high-similarity candidates,
        generate all 2-to-N-word combinations and let the LLM pick the top 10.

        Args:
            features_data: Stage 3 data (with high-similarity candidates)
            max_workers: Number of original features evaluated concurrently (default 4)
            max_candidates: Maximum number of candidate words per combination pool (default 20)
            max_combo_length: Maximum words per combination (default 4, i.e. base word + 3 candidates)

        Returns:
            Data enriched with LLM evaluations
        """
        logger.info("=" * 60)
        logger.info("Stage 4: multi-word combinations + LLM evaluation")
        logger.info(f"  Max candidate words: {max_candidates}")
        logger.info(f"  Max combination length: {max_combo_length} words")
        logger.info(f"  Concurrency: {max_workers} original features")
        logger.info("=" * 60)
        total_features = len(features_data)

        # Process different original features in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            futures = []
            for idx, feature_result in enumerate(features_data, 1):
                future = executor.submit(
                    self._process_single_feature_combinations,
                    idx,
                    total_features,
                    feature_result,
                    max_candidates,
                    max_combo_length
                )
                futures.append((future, feature_result))
            # Wait for completion and collect results
            for future, feature_result in futures:
                try:
                    _ = future.result()  # Results are written back into feature_result
                except Exception as e:
                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")

        # Save results
        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 4 complete")
        logger.info("=" * 60)
        return features_data

    def _process_single_feature_combinations(
        self,
        idx: int,
        total: int,
        feature_result: Dict[str, Any],
        max_candidates: int,
        max_combo_length: int
    ) -> None:
        """
        Generate and evaluate combinations for one original feature.

        Improvement: each base word uses its own candidate words instead of a
        shared pool.

        Steps:
            1. Get the top-3 base words from Stage 1's top3匹配信息
            2. For each base word:
               a. Get candidates from Stage 3's 高相似度候选_按base_word
               b. Generate combinations
               c. LLM evaluation
               d. Select the top 10
            3. Save the grouped results

        Args:
            idx: Feature index
            total: Total number of features
            feature_result: Feature result data
            max_candidates: Maximum number of candidate words per combination pool
            max_combo_length: Maximum words per combination
        """
        original_feature = feature_result['原始特征名称']
        logger.info(f"\n[{idx}/{total}] Processing: {original_feature}")

        # Step 1: get the top-3 base words
        top3_info = feature_result.get('top3匹配信息', [])
        if not top3_info:
            logger.info("  No top-3 match info, skipping")
            feature_result['组合评估结果_分组'] = []
            return
        logger.info(f"  Found {len(top3_info)} base words")

        # Step 2: get the candidates grouped by base word
        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
        if not candidates_by_base_word:
            logger.warning("  No per-base-word candidates, skipping")
            feature_result['组合评估结果_分组'] = []
            return

        # Step 3: process each base word independently
        grouped_results = []
        for base_idx, base_info in enumerate(top3_info, 1):
            base_word = base_info.get('人设特征名称', '')
            base_similarity = base_info.get('相似度', 0)
            if not base_word:
                continue
            logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word} (similarity: {base_similarity:.3f})")

            # Get this base word's candidate words
            base_candidates = candidates_by_base_word.get(base_word, [])
            candidates = base_candidates[:max_candidates]
            candidate_words = [c['人设特征名称'] for c in candidates]
            if not candidate_words:
                logger.warning("    No candidate words for this base word, skipping")
                grouped_results.append({
                    'base_word': base_word,
                    'base_word_similarity': base_similarity,
                    'base_word_info': base_info,
                    'top10_searches': [],
                    'available_words': []
                })
                continue
            logger.info(f"    Candidate words: {len(candidate_words)} (cap: {max_candidates})")

            # Generate combinations: the base word plus 1..(max_combo_length-1)
            # candidate words
            combinations_for_base = []
            for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
                for combo in combinations(candidate_words, length):
                    search_phrase = base_word + ' ' + ' '.join(combo)
                    combinations_for_base.append({
                        'search_word': search_phrase,
                        'base_word': base_word,
                        'candidate_words': list(combo),
                        'combo_length': length + 1  # +1 for the base word
                    })
            logger.info(f"    Generated {len(combinations_for_base)} combinations")
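
            # Illustrative expansion (hypothetical words): base word "露营" with
            # candidates ["帐篷", "咖啡"] and max_combo_length=3 yields
            #   "露营 帐篷", "露营 咖啡", "露营 帐篷 咖啡"
            # i.e. the sum of C(n, k) for k = 1..max_combo_length-1 phrases.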

            # LLM evaluation
            logger.info("    Starting LLM evaluation...")
            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
                original_feature=original_feature,
                search_words=[c['search_word'] for c in combinations_for_base],
                batch_size=50
            )

            # Select the top 10 (assumes the evaluator returns results sorted
            # by score, highest first)
            top_10 = evaluated[:10]
            max_score = top_10[0]['score'] if top_10 else 0.0
            logger.info(f"    Evaluation done, top-10 best score: {max_score:.3f}")

            # Save the grouped result; each base word keeps its own available_words
            grouped_results.append({
                'base_word': base_word,
                'base_word_similarity': base_similarity,
                'base_word_info': base_info,
                'top10_searches': top_10,
                'available_words': candidate_words  # This base word's own candidates
            })

        # Write the results back
        feature_result['组合评估结果_分组'] = grouped_results
        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
        logger.info(f"  Done: {len(grouped_results)} base words, {total_searches} search words")

    # ========== Stage 5: execute searches ==========

    def _execute_single_search(
        self,
        idx: int,
        total: int,
        search_word: str,
        feature_ref: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Execute a single search task (for concurrent execution).

        Args:
            idx: Search index
            total: Total number of searches
            search_word: Search word
            feature_ref: Feature reference (results are written into it)

        Returns:
            Search result info
        """
        logger.info(f"[{idx}/{total}] Searching: {search_word}")
        try:
            result = self.search_client.search(
                keyword=search_word,
                content_type='不限',
                sort_type='综合',
                max_retries=3,
                use_cache=True  # Enable the search cache
            )
            note_count = len(result.get('data', {}).get('data', []))
            logger.info(f"  ✓ Success, fetched {note_count} notes")
            # Write the result back
            feature_ref['search_result'] = result
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'success',
                'note_count': note_count,
                'search_params': {
                    'keyword': search_word,
                    'content_type': '不限',
                    'sort_type': '综合'
                }
            }
            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
        except Exception as e:
            logger.error(f"  ✗ Failed: {e}")
            feature_ref['search_result'] = None
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'failed',
                'note_count': 0,
                'error': str(e)
            }
            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}

    def stage5_execute_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0,
        top_n: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Stage 5: run the Xiaohongshu searches.

        Args:
            features_data: Stage 4 data
            search_delay: Delay between searches (unused in the concurrent flow)
            top_n: Top-N search words per original feature (unused in the
                grouped flow, where every base word's top 10 is executed)

        Returns:
            Data enriched with search results
        """
        logger.info("=" * 60)
        logger.info("Stage 5: run Xiaohongshu searches")
        logger.info("=" * 60)

        # Collect search words grouped by original feature (read from Stage 4's
        # 组合评估结果_分组)
        feature_search_groups = {}
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            if original_feature not in feature_search_groups:
                feature_search_groups[original_feature] = []
            # Read from Stage 4's grouped results (new structure)
            grouped_results = feature_result.get('组合评估结果_分组', [])
            if grouped_results:
                # Grouped structure: execute every base word's top 10
                for group in grouped_results:
                    base_word = group.get('base_word', '')
                    base_similarity = group.get('base_word_similarity', 0)
                    for eval_item in group.get('top10_searches', []):
                        sw = eval_item.get('search_word')
                        if not sw:
                            continue
                        score = eval_item.get('score', 0.0)
                        feature_search_groups[original_feature].append({
                            'search_word': sw,
                            'score': score,
                            'base_word': base_word,
                            'base_word_similarity': base_similarity,
                            'feature_ref': eval_item  # Search results are written into the evaluation item
                        })
            else:
                # Fall back to the old structure (组合评估结果)
                for eval_item in feature_result.get('组合评估结果', []):
                    sw = eval_item.get('search_word')
                    if not sw:
                        continue
                    score = eval_item.get('score', 0.0)
                    feature_search_groups[original_feature].append({
                        'search_word': sw,
                        'score': score,
                        'feature_ref': eval_item
                    })

        # Collect all search tasks (in the grouped structure every base word's
        # top 10 is executed; no further filtering)
        all_searches = []
        total_count = 0
        for original_feature, search_list in feature_search_groups.items():
            total_count += len(search_list)
            all_searches.extend(search_list)
            logger.info(f"  {original_feature}: {len(search_list)} search words")

        # Apply the global search cap
        if self.max_total_searches and len(all_searches) > self.max_total_searches:
            logger.info(f"  Applying global cap: reducing from {len(all_searches)} to {self.max_total_searches}")
            all_searches = all_searches[:self.max_total_searches]
        logger.info(f"\n{len(all_searches)} search tasks in total")
        logger.info(f"  Running searches concurrently (workers: {self.search_max_workers})")

        # Run the searches with a thread pool
        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
            # Submit all search tasks
            futures = []
            for idx, item in enumerate(all_searches, 1):
                future = executor.submit(
                    self._execute_single_search,
                    idx,
                    len(all_searches),
                    item['search_word'],
                    item['feature_ref']
                )
                futures.append(future)
            # Wait for all searches to finish
            for future in as_completed(futures):
                try:
                    future.result()  # Raises if the task failed; results are already in feature_ref
                except Exception as e:
                    logger.error(f"  Search task failed: {e}")

        # Save results
        output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 5 complete")
        logger.info("=" * 60)
        return features_data

    # ========== Stage 6: LLM evaluation of search results ==========

    def stage6_evaluate_search_results(
        self,
        features_data: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Stage 6: evaluate search results with the LLM (multimodal).

        Args:
            features_data: Stage 5 data

        Returns:
            Data enriched with result evaluations
        """
        logger.info("=" * 60)
        logger.info("Stage 6: LLM evaluation of search results")
        logger.info("=" * 60)

        # Collect all feature nodes that need evaluation
        features_to_evaluate = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    if feature.get('search_result') and feature['search_metadata']['status'] == 'success':
                        features_to_evaluate.append({
                            'original_feature': original_feature,
                            'feature_node': feature
                        })
        logger.info(f"{len(features_to_evaluate)} search results to evaluate")

        # Evaluate in parallel (modest concurrency)
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            for item in features_to_evaluate:
                future = executor.submit(
                    self._evaluate_single_search_result,
                    item['original_feature'],
                    item['feature_node']
                )
                futures.append((future, item))
            # Collect results
            for idx, (future, item) in enumerate(futures, 1):
                try:
                    evaluation = future.result()
                    item['feature_node']['result_evaluation'] = evaluation
                    logger.info(f"  [{idx}/{len(futures)}] {item['feature_node']['search_word']}: "
                                f"relevance={evaluation['overall_relevance']:.3f}")
                except Exception as e:
                    logger.error(f"  Evaluation failed: {item['feature_node']['search_word']}, error: {e}")
                    item['feature_node']['result_evaluation'] = None

        # Save results
        output_path = os.path.join(self.output_dir, "stage6_with_evaluations.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 6 complete")
        logger.info("=" * 60)
        return features_data

    def _evaluate_single_search_result(
        self,
        original_feature: str,
        feature_node: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Evaluate a single search result (using parallel evaluation).

        Args:
            original_feature: Original feature
            feature_node: Feature node

        Returns:
            Evaluation result
        """
        search_word = feature_node.get('search_word', '')
        notes = feature_node['search_result'].get('data', {}).get('data', [])
        return self.llm_evaluator.evaluate_search_results_parallel(
            original_feature=original_feature,
            search_word=search_word,
            notes=notes,
            max_notes=20,
            max_workers=20  # Evaluate up to 20 notes concurrently
        )

    # ========== Stage 7: extended searches ==========

    def stage7_extended_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0
    ) -> List[Dict[str, Any]]:
        """
        Stage 7: run extended searches based on the evaluation results.

        Args:
            features_data: Stage 6 data
            search_delay: Delay between searches

        Returns:
            Data enriched with extended searches
        """
        logger.info("=" * 60)
        logger.info("Stage 7: extended searches")
        logger.info("=" * 60)

        # Collect the extension tasks
        extension_tasks = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    result_eval = feature.get('result_evaluation')
                    if not result_eval:
                        continue
                    extracted_elements = result_eval.get('extracted_elements', [])
                    if not extracted_elements:
                        continue
                    # Create one extended search per extracted element
                    base_search_word = feature.get('search_word', '')
                    for element in extracted_elements:
                        extended_keyword = f"{base_search_word} {element}"
                        extension_tasks.append({
                            'extended_keyword': extended_keyword,
                            'original_feature': original_feature,
                            'feature_node': feature,
                            'element': element
                        })
        logger.info(f"{len(extension_tasks)} extended search tasks in total")

        # Run the extended searches
        for idx, task in enumerate(extension_tasks, 1):
            extended_kw = task['extended_keyword']
            logger.info(f"[{idx}/{len(extension_tasks)}] Extended search: {extended_kw}")
            try:
                result = self.search_client.search(
                    keyword=extended_kw,
                    content_type='不限',
                    sort_type='综合',
                    max_retries=3,
                    use_cache=True  # Enable the search cache
                )
                note_count = len(result.get('data', {}).get('data', []))
                logger.info(f"  ✓ Success, fetched {note_count} notes")

                # Evaluate the extended search results
                logger.info("  Evaluating extended search results...")
                evaluation = self.llm_evaluator.evaluate_search_results(
                    original_feature=task['original_feature'],
                    search_word=extended_kw,
                    notes=result.get('data', {}).get('data', []),
                    max_notes=20,
                    max_images_per_note=2
                )

                # Store the extended search result
                feature_node = task['feature_node']
                if 'extended_searches' not in feature_node:
                    feature_node['extended_searches'] = []
                feature_node['extended_searches'].append({
                    'extended_keyword': extended_kw,
                    'based_on_element': task['element'],
                    'search_result': result,
                    'search_metadata': {
                        'searched_at': datetime.now().isoformat(),
                        'status': 'success',
                        'note_count': note_count
                    },
                    'result_evaluation': evaluation
                })
                logger.info(f"  Evaluation done, relevance={evaluation['overall_relevance']:.3f}")
            except Exception as e:
                logger.error(f"  ✗ Failed: {e}")
            # Delay between searches
            if idx < len(extension_tasks):
                time.sleep(search_delay)

        # Save results
        output_path = os.path.join(self.output_dir, "stage7_final_results.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 7 complete")
        logger.info("=" * 60)
        return features_data

    # ========== Main pipeline ==========

    def run_full_pipeline(self):
        """Run the full pipeline."""
        logger.info("\n" + "=" * 60)
        logger.info("Starting the full pipeline")
        logger.info("=" * 60)
        try:
            # Stage 1
            stage1_results = self.stage1_filter_features()
            # Stage 2
            stage2_results = self.stage2_find_associations(stage1_results)
            # Stage 3 - new method: select high-similarity matches
            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
            # Stage 4
            stage4_results = self.stage4_generate_and_evaluate_search_words(
                stage3_results,
                max_workers=8,      # Raised concurrency from 4 to 8
                max_combo_length=3  # Lowered combination length from 4 to 3
            )
            # Stage 5
            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)
            # Stage 6 - currently disabled (code kept)
            # stage6_results = self.stage6_evaluate_search_results(stage5_results)
            # Stage 7 - currently disabled (code kept)
            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
            logger.info("\n" + "=" * 60)
            logger.info("✓ Full pipeline finished (Stages 1-5)")
            logger.info("=" * 60)

            # Generate the visualization automatically
            logger.info("\n" + "=" * 60)
            logger.info("Generating visualization...")
            logger.info("=" * 60)
            try:
                result = subprocess.run(
                    ['python3', 'visualize_stage5_results.py'],
                    capture_output=True,
                    text=True,
                    timeout=60
                )
                if result.returncode == 0:
                    logger.info("✓ Visualization generated")
                    logger.info(result.stdout)
                else:
                    logger.error(f"Visualization failed: {result.stderr}")
            except subprocess.TimeoutExpired:
                logger.error("Visualization timed out")
            except Exception as e:
                logger.error(f"Visualization error: {e}")

            return stage5_results
        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            raise


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description='Enhanced search system V2')
    parser.add_argument(
        '--how-json',
        default='69114f150000000007001f30_how copy.json',
        help='Path to the How-deconstruction file'
    )
    parser.add_argument(
        '--dimension-associations',
        default='dimension_associations_analysis.json',
        help='Path to the dimension-associations file'
    )
    parser.add_argument(
        '--optimized-clustered',
        default='optimized_clustered_data_gemini-3-pro-preview.json',
        help='Path to the persona feature library'
    )
    parser.add_argument(
        '--api-key',
        default=None,
        help='OpenRouter API key (defaults to the environment variable)'
    )
    parser.add_argument(
        '--output-dir',
        default='output_v2',
        help='Output directory'
    )
    parser.add_argument(
        '--top-n',
        type=int,
        default=10,
        help='Keep the N highest-scoring search words per original feature (default 10)'
    )
    parser.add_argument(
        '--max-total-searches',
        type=int,
        default=None,
        help='Global cap on the number of searches (default None, unlimited)'
    )
    parser.add_argument(
        '--search-workers',
        type=int,
        default=3,
        help='Search concurrency (default 3)'
    )
    args = parser.parse_args()

    # Create the system instance
    system = EnhancedSearchV2(
        how_json_path=args.how_json,
        dimension_associations_path=args.dimension_associations,
        optimized_clustered_data_path=args.optimized_clustered,
        openrouter_api_key=args.api_key,
        output_dir=args.output_dir,
        top_n=args.top_n,
        max_total_searches=args.max_total_searches,
        search_max_workers=args.search_workers
    )

    # Run the full pipeline
    system.run_full_pipeline()


if __name__ == '__main__':
    main()
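
# Example invocation (illustrative; the file names are the argparse defaults above):
#   python3 enhanced_search_v2.py \
#       --how-json '69114f150000000007001f30_how copy.json' \
#       --output-dir output_v2 \
#       --max-total-searches 50 \
#       --search-workers 3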