#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Search System V2
Full pipeline with LLM evaluation and extended search support
"""
import json
import logging
import copy
import time
import os
import argparse
import subprocess
from typing import Dict, List, Any, Optional, Set, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations

from openrouter_client import OpenRouterClient
from llm_evaluator import LLMEvaluator
from xiaohongshu_search import XiaohongshuSearch

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('enhanced_search_v2.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
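
# Pipeline overview (summarized from the stage methods on the class below):
#   Stage 1: keep features whose best match scores 0.5 <= similarity < 0.8
#   Stage 2: collect associated classifications (names, tags, sub-classifications)
#   Stage 3: pull high-similarity (>0.8) matches that fall inside the Stage 2 scope
#   Stage 4: build multi-word combinations and rank them with the LLM
#   Stage 5: run Xiaohongshu searches for the top-ranked combinations
#   Stage 6: LLM evaluation of search results (currently disabled in run_full_pipeline)
#   Stage 7: extended searches per extracted element (currently disabled)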


class EnhancedSearchV2:
    """Enhanced search system V2"""

    def __init__(
        self,
        how_json_path: str,
        dimension_associations_path: str,
        optimized_clustered_data_path: str,
        openrouter_api_key: Optional[str] = None,
        output_dir: str = "output_v2",
        top_n: int = 10,
        max_total_searches: Optional[int] = None,
        search_max_workers: int = 3
    ):
        """
        Initialize the system.

        Args:
            how_json_path: path to the "how" deconstruction file
            dimension_associations_path: path to the dimension-associations file
            optimized_clustered_data_path: path to the persona feature library
            openrouter_api_key: OpenRouter API key
            output_dir: output directory
            top_n: keep the N highest-scored search words per original feature (default 10)
            max_total_searches: global cap on the number of searches (default None, unlimited)
            search_max_workers: search concurrency (default 3)
        """
        self.how_json_path = how_json_path
        self.dimension_associations_path = dimension_associations_path
        self.optimized_clustered_data_path = optimized_clustered_data_path
        self.output_dir = output_dir
        self.top_n = top_n
        self.max_total_searches = max_total_searches
        self.search_max_workers = search_max_workers
        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)
        # Load the data files
        logger.info("Loading data files...")
        self.how_data = self._load_json(how_json_path)
        self.dimension_associations = self._load_json(dimension_associations_path)
        self.optimized_clustered_data = self._load_json(optimized_clustered_data_path)
        # Initialize components
        logger.info("Initializing components...")
        self.openrouter_client = OpenRouterClient(
            api_key=openrouter_api_key,
            model="google/gemini-2.5-flash",
            retry_delay=5  # longer retry delay to avoid rate limiting
        )
        self.llm_evaluator = LLMEvaluator(self.openrouter_client)
        self.search_client = XiaohongshuSearch()
        logger.info("System initialized")

    def _load_json(self, file_path: str) -> Any:
        """Load a JSON file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load file {file_path}: {e}")
            raise

    def _save_json(self, data: Any, file_path: str):
        """Save data as a JSON file."""
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Saved: {file_path}")
        except Exception as e:
            logger.error(f"Failed to save file {file_path}: {e}")
            raise

    # ========== Stage 1: filter features with 0.5 <= similarity < 0.8 ==========
    def stage1_filter_features(self) -> List[Dict[str, Any]]:
        """
        Stage 1: filter features with medium match quality.

        Criterion: 0.5 <= highest similarity < 0.8

        Returns:
            The list of selected features
        """
        logger.info("=" * 60)
        logger.info("Stage 1: filtering medium-match features (0.5 <= similarity < 0.8)")
        logger.info("=" * 60)
        results = []
        how_result = self.how_data.get('how解构结果', {})
        total_features = 0
        filtered_out_low = 0   # < 0.5
        filtered_out_high = 0  # >= 0.8
        selected_count = 0
        # Iterate over the three dimensions
        for level_name, level_list in how_result.items():
            if not isinstance(level_list, list):
                continue
            logger.info(f"\nProcessing {level_name}...")
            for item_idx, item in enumerate(level_list):
                item_name = item.get('名称', f'未命名-{item_idx}')
                how_steps = item.get('how步骤列表', [])
                for step in how_steps:
                    features = step.get('特征列表', [])
                    for feature in features:
                        feature_name = feature.get('特征名称', '')
                        match_results = feature.get('匹配结果', [])
                        total_features += 1
                        if not match_results:
                            continue
                        # Find the highest similarity
                        max_similarity = max(
                            (m.get('匹配结果', {}).get('相似度', 0) for m in match_results),
                            default=0
                        )
                        # Apply the filter
                        if max_similarity < 0.5:
                            filtered_out_low += 1
                            continue
                        elif max_similarity >= 0.8:
                            filtered_out_high += 1
                            continue
                        # 0.5 <= max_similarity < 0.8: keep it
                        best_match = max(
                            match_results,
                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0)
                        )
                        # Decide whether the match is a classification or a feature
                        feature_classification = best_match.get('特征分类', [])
                        classification_path = self._build_classification_path(feature_classification)
                        # If the path is empty and the match is a classification, search for the full path
                        if not classification_path and best_match.get('特征类型') == '分类':
                            feature_name_to_search = best_match.get('人设特征名称', '')
                            classification_path = self._search_classification_path(feature_name_to_search)
                        is_classification = self._is_classification(best_match.get('人设特征名称', ''), classification_path)
                        result_item = {
                            '原始特征名称': feature_name,
                            '来源层级': level_name,
                            '权重': feature.get('权重', 0),
                            '所属点名称': item_name,
                            '最高匹配信息': {
                                '人设特征名称': best_match.get('人设特征名称'),
                                '人设特征层级': best_match.get('人设特征层级'),
                                '特征类型': best_match.get('特征类型'),
                                '特征分类': feature_classification,
                                '相似度': best_match.get('匹配结果', {}).get('相似度', 0),
                                '匹配说明': best_match.get('匹配结果', {}).get('说明', ''),
                                '是分类': is_classification,
                                '所属分类路径': classification_path
                            }
                        }
                        results.append(result_item)
                        selected_count += 1
                        logger.info(f"  ✓ {feature_name} → {best_match.get('人设特征名称')} "
                                    f"(similarity: {max_similarity:.3f}, "
                                    f"{'classification' if is_classification else 'feature'})")
        # Summary statistics
        logger.info("\n" + "=" * 60)
        logger.info("Stage 1 complete")
        logger.info(f"  Total features: {total_features}")
        logger.info(f"  Filtered out (<0.5): {filtered_out_low}")
        logger.info(f"  Filtered out (>=0.8): {filtered_out_high}")
        logger.info(f"  Kept (0.5-0.8): {selected_count}")
        logger.info("=" * 60)
        # Save results
        output_path = os.path.join(self.output_dir, "stage1_filtered_features.json")
        self._save_json(results, output_path)
        return results

    def _build_classification_path(self, feature_classification: List[str]) -> str:
        """
        Build a classification path.

        Args:
            feature_classification: the feature classification array

        Returns:
            The classification path
        """
        if not feature_classification:
            return ""
        # Step 1: strip the "实质" suffix from middle elements
        cleaned = []
        for i, item in enumerate(feature_classification):
            if i == len(feature_classification) - 1:  # keep the last element
                cleaned.append(item)
            elif item.endswith("实质") and i != 0:  # strip "实质" from middle elements
                cleaned.append(item[:-2])
            else:
                cleaned.append(item)
        # Step 2: reverse the array
        reversed_list = list(reversed(cleaned))
        # Step 3: join into a path
        path = "/".join(reversed_list)
        return path
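
    # Illustrative trace (hypothetical input, derived from the three steps above):
    #   _build_classification_path(['猫咪', '宠物实质', '实质'])
    #   → cleaned  ['猫咪', '宠物', '实质']   (middle "实质" suffix stripped)
    #   → reversed ['实质', '宠物', '猫咪']
    #   → path     '实质/宠物/猫咪'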

    def _is_classification(self, persona_feature_name: str, classification_path: str) -> bool:
        """
        Decide whether a match is a classification or a feature.

        Args:
            persona_feature_name: the persona feature name
            classification_path: the classification path

        Returns:
            True if it is a classification, False if it is a feature
        """
        # Look it up in optimized_clustered_data:
        # if it appears in a node's 特征列表, it is a feature;
        # if it exists as a child node with its own dict, it is a classification.
        # Navigate to the node
        node = self._navigate_to_node(classification_path)
        if not node:
            return False
        # Check the feature list
        features = node.get('特征列表', [])
        for f in features:
            if f.get('特征名称') == persona_feature_name:
                return False  # found in the feature list: it is a feature
        # Check whether it exists as a child node
        if persona_feature_name in node:
            sub_node = node[persona_feature_name]
            if isinstance(sub_node, dict):
                return True  # it is a child node: a classification
        return False  # default: treat as a feature

    def _navigate_to_node(self, path: str) -> Optional[Dict[str, Any]]:
        """
        Navigate to the node at the given path.

        Args:
            path: a path such as "实质/猫咪宠物"

        Returns:
            The node, or None if not found
        """
        if not path:
            return None
        parts = path.split('/')
        first_part = parts[0]
        # Determine the top-level key
        top_level_map = {
            '意图': '目的点',
            '要素': '目的点',
            '实质': None,
            '形式': None,
            '场景': None
        }
        top_keys = []
        if first_part in top_level_map:
            mapped = top_level_map[first_part]
            if mapped:
                top_keys.append(mapped)
        if not top_keys:
            top_keys = ['灵感点列表', '关键点列表', '目的点']
        # Try each top-level key
        for top_key in top_keys:
            current = self.optimized_clustered_data.get(top_key)
            if not current:
                continue
            # Walk the path level by level
            found = True
            for part in parts:
                if isinstance(current, dict) and part in current:
                    current = current[part]
                else:
                    found = False
                    break
            if found and isinstance(current, dict):
                return current
        return None
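
    # Example lookup (hypothetical data; mirrors the walk above): for the path
    # "实质/猫咪宠物", '实质' maps to None in top_level_map, so all three top-level
    # keys ('灵感点列表', '关键点列表', '目的点') are tried in order, and the first
    # one containing data['实质']['猫咪宠物'] as a dict is returned.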

    def _recursive_search(
        self,
        obj: Dict[str, Any],
        target_name: str,
        current_path: str = ""
    ) -> Optional[str]:
        """
        Recursively search for a classification node.

        Args:
            obj: the object currently being searched
            target_name: the target classification name
            current_path: the current path

        Returns:
            The full path if found, otherwise None
        """
        if not isinstance(obj, dict):
            return None
        # Iterate over all keys
        for key in obj.keys():
            # Skip metadata and feature lists
            if key in ['_meta', '特征列表']:
                continue
            # Match if either name contains the other
            if target_name in key or key in target_name:
                # Found a match: return the path
                if current_path:
                    return f"{current_path}/{key}"
                else:
                    return key
            # Recurse into child nodes
            if isinstance(obj[key], dict):
                next_path = f"{current_path}/{key}" if current_path else key
                result = self._recursive_search(obj[key], target_name, next_path)
                if result:
                    return result
        return None

    def _search_classification_path(self, classification_name: str) -> str:
        """
        Search optimized_clustered_data for a classification node's path.

        Args:
            classification_name: the classification name, e.g. "实体物品实质"

        Returns:
            The full path, e.g. "实质/实体物品"; an empty string if not found
        """
        if not classification_name:
            return ""
        # Clean the name: strip common suffixes
        clean_name = classification_name
        for suffix in ['实质', '意图', '形式', '要素']:
            if clean_name.endswith(suffix) and len(clean_name) > len(suffix):
                clean_name = clean_name[:-len(suffix)]
                break
        logger.info(f"  Searching classification: {classification_name} → cleaned: {clean_name}")
        # Search the three top-level lists
        for top_key in ['灵感点列表', '关键点列表', '目的点']:
            top_data = self.optimized_clustered_data.get(top_key, {})
            if not top_data:
                continue
            # Recursive search
            path = self._recursive_search(top_data, clean_name, "")
            if path:
                logger.info(f"  ✓ Found path: {path}")
                return path
        logger.warning(f"  ✗ Classification path not found: {classification_name}")
        return ""

    # ========== Stage 2: collect associated classifications + tags + sub-classifications ==========
    def stage2_find_associations(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 2: find associated classifications and collect their names, tags, and sub-classifications.

        Args:
            filtered_features: features selected in Stage 1

        Returns:
            The feature list with association info attached
        """
        logger.info("=" * 60)
        logger.info("Stage 2: finding associated classifications")
        logger.info("=" * 60)
        for idx, feature in enumerate(filtered_features, 1):
            logger.info(f"\n[{idx}/{len(filtered_features)}] Processing: {feature['原始特征名称']}")
            match_info = feature['最高匹配信息']
            is_classification = match_info['是分类']
            classification_path = match_info['所属分类路径']
            source_level = match_info['人设特征层级']
            if is_classification:
                # The match is a classification: use its path directly
                search_path = classification_path
                logger.info(f"  Matched a classification: {search_path}")
            else:
                # The match is a feature: use its parent classification's path
                search_path = classification_path
                logger.info(f"  Matched a feature; using its classification: {search_path}")
            # Find associations
            associations = self._find_associations(search_path, source_level)
            # Collect association info
            feature['找到的关联'] = []
            for assoc in associations:
                target_path = assoc['目标分类']
                logger.info(f"  Processing association: {target_path}")
                # Collect classification info
                classification_info = self._collect_classification_info(target_path)
                if classification_info:
                    feature['找到的关联'].append({
                        '来源方向': assoc['来源方向'],
                        '关联类型': assoc['关联类型'],
                        '目标分类路径': target_path,
                        '共同帖子数': assoc['共同帖子数'],
                        'Jaccard相似度': assoc['Jaccard相似度'],
                        '分类名称': classification_info['classification_name'],
                        '标签列表': classification_info['tags'],
                        '子分类列表': classification_info['sub_classifications']
                    })
            logger.info(f"  Found {len(feature['找到的关联'])} associations")
        # Save results
        output_path = os.path.join(self.output_dir, "stage2_associations.json")
        self._save_json(filtered_features, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 2 complete")
        logger.info("=" * 60)
        return filtered_features

    def _find_associations(self, classification_path: str, source_level: str) -> List[Dict[str, Any]]:
        """
        Find associated nodes.

        Args:
            classification_path: the classification path
            source_level: the source level

        Returns:
            A list of associated nodes
        """
        associations = []
        # Determine the dimension name
        if '灵感点' in source_level:
            dimension_key = '灵感点维度'
        elif '关键点' in source_level:
            dimension_key = '关键点维度'
        elif '目的点' in source_level:
            dimension_key = '目的点维度'
        else:
            return associations
        # Get the dimension data
        single_dim = self.dimension_associations.get('单维度关联分析', {})
        dimension_data = single_dim.get(dimension_key, {})
        if not dimension_data:
            return associations
        # Iterate over all directions
        for direction_key, direction_data in dimension_data.items():
            if direction_key == '说明':
                continue
            # Look up the source classification
            if classification_path in direction_data:
                source_data = direction_data[classification_path]
                # Collect associated nodes
                for assoc_key in source_data.keys():
                    if assoc_key.startswith('与') and assoc_key.endswith('的关联'):
                        assoc_list = source_data[assoc_key]
                        for assoc_item in assoc_list:
                            associations.append({
                                '来源方向': direction_key,
                                '关联类型': assoc_key,
                                '目标分类': assoc_item.get('目标分类'),
                                '目标层级': assoc_item.get('目标层级'),
                                '共同帖子数': assoc_item.get('共同帖子数'),
                                'Jaccard相似度': assoc_item.get('Jaccard相似度'),
                                '共同帖子ID': assoc_item.get('共同帖子ID', [])
                            })
        return associations
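
    # Shape of dimension_associations assumed by the lookup above (reconstructed
    # from the access pattern; values are illustrative, key middles schematic):
    #   {"单维度关联分析": {"灵感点维度": {"<direction>": {"<source path>": {
    #       "与<target>的关联": [{"目标分类": "...", "目标层级": "...",
    #                            "共同帖子数": 3, "Jaccard相似度": 0.12,
    #                            "共同帖子ID": ["..."]}]}}}}}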

    def _collect_classification_info(self, classification_path: str) -> Optional[Dict[str, Any]]:
        """
        Collect classification info: name + tags + sub-classifications.

        Args:
            classification_path: the classification path

        Returns:
            The classification info
        """
        node = self._navigate_to_node(classification_path)
        if not node:
            return None
        # Classification name (the last path segment)
        classification_name = classification_path.split('/')[-1]
        # Tags (the feature list)
        tags = [f.get('特征名称', '') for f in node.get('特征列表', [])]
        # Sub-classifications (child nodes, excluding _meta and 特征列表)
        sub_classifications = [
            key for key in node.keys()
            if isinstance(node[key], dict) and key not in ['_meta', '特征列表']
        ]
        return {
            'classification_name': classification_name,
            'tags': tags,
            'sub_classifications': sub_classifications
        }

    # ========== Stage 3: filter high-similarity matches (>0.8) ==========
    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 3: filter high-similarity matches (>0.8).

        Walk every original feature in the "how" deconstruction and collect
        high-quality matches whose similarity is >0.8 and whose persona feature
        name falls inside the Stage 2 association scope.

        Args:
            associations_data: association data from Stage 2

        Returns:
            The data with high-similarity candidates attached
        """
        logger.info("=" * 60)
        logger.info("Stage 3: filtering high-similarity matches (>0.8)")
        logger.info("=" * 60)
        for idx, feature_result in enumerate(associations_data, 1):
            original_feature_name = feature_result['原始特征名称']
            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {original_feature_name}")
            # Step 1: collect the Stage 2 scope (classification names + tags)
            stage2_scope = self._collect_stage2_scope(feature_result)
            logger.info(f"  Stage 2 scope contains {len(stage2_scope)} classifications/tags")
            # Step 2: walk all original features in the "how" deconstruction for high-similarity matches
            high_sim_candidates = []
            total_checked = 0
            high_sim_found = 0
            how_result = self.how_data.get('how解构结果', {})
            for level_name, level_list in how_result.items():
                if not isinstance(level_list, list):
                    continue
                for item in level_list:
                    for step in item.get('how步骤列表', []):
                        for feature in step.get('特征列表', []):
                            # All matches for this feature
                            matches = feature.get('匹配结果', [])
                            total_checked += len(matches)
                            # Keep matches with similarity >0.8 that fall inside the Stage 2 scope
                            for match in matches:
                                sim = match.get('匹配结果', {}).get('相似度', 0)
                                persona_feature_name = match.get('人设特征名称', '')
                                if sim > 0.8 and persona_feature_name in stage2_scope:
                                    high_sim_found += 1
                                    # Record provenance
                                    high_sim_candidates.append({
                                        '人设特征名称': persona_feature_name,
                                        '相似度': sim,
                                        '特征类型': match.get('特征类型', ''),
                                        '特征分类': match.get('特征分类', []),
                                        '人设特征层级': match.get('人设特征层级', ''),
                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                        '来源原始特征': feature.get('特征名称', '')  # which original feature it came from
                                    })
            logger.info(f"  Checked {total_checked} matches")
            logger.info(f"  Found {high_sim_found} matches with similarity >0.8")
            # Sort by similarity (descending) and dedupe (keep the highest score per persona feature name)
            seen_names = set()
            unique_candidates = []
            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
            for candidate in high_sim_candidates:
                name = candidate['人设特征名称']
                if name not in seen_names:
                    seen_names.add(name)
                    unique_candidates.append(candidate)
            # Attach to the result
            feature_result['高相似度候选'] = unique_candidates
            logger.info(f"  {len(unique_candidates)} high-similarity candidates after dedup")
            # Show the top 5
            if unique_candidates:
                logger.info("  Top 5:")
                for c in unique_candidates[:5]:
                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")
        # Save results
        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
        self._save_json(associations_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 3 complete")
        logger.info("=" * 60)
        return associations_data

    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
        """
        Collect all classification names and tags found in Stage 2 into a scope set.

        Args:
            feature_result: the feature result data

        Returns:
            A set containing all classification names and tags
        """
        scope = set()
        for assoc in feature_result.get('找到的关联', []):
            # Add the classification name
            scope.add(assoc['分类名称'])
            # Add all tags
            tags = assoc.get('标签列表', [])
            scope.update(tags)
        return scope
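
    # Note: the scope is a flat set mixing classification names ('分类名称') and
    # tag names ('标签列表'); Stage 3 tests each match's '人设特征名称' against
    # this set, so both kinds of name count as "inside the scope".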

    def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
        """
        Find the feature list at a given path.

        Args:
            target_classification: the target classification path

        Returns:
            The feature list
        """
        node = self._navigate_to_node(target_classification)
        if not node:
            return []
        features = node.get('特征列表', [])
        # Return a deep copy
        return copy.deepcopy(features)
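
    # Note: this helper is not called anywhere else in this file as written;
    # stages 6-7 below read search results from the same per-association
    # '特征列表' layout, so it is presumably kept as a utility for that path.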

    # ========== Stage 4: multi-word combinations + LLM evaluation ==========
    def stage4_generate_and_evaluate_search_words(
        self,
        features_data: List[Dict[str, Any]],
        max_workers: int = 4,
        max_candidates: int = 20,
        max_combo_length: int = 4
    ) -> List[Dict[str, Any]]:
        """
        Stage 4: multi-word combinations + LLM evaluation.

        Based on the Stage 1 base word and the Stage 3 high-similarity candidates,
        generate all 2-to-N word combinations and pick the Top 10 via LLM evaluation.

        Args:
            features_data: Stage 3 data (with high-similarity candidates)
            max_workers: number of original features evaluated concurrently (default 4)
            max_candidates: maximum number of candidate words per combination pool (default 20)
            max_combo_length: maximum words per combination (default 4, i.e. base word + 3 candidates)

        Returns:
            The data with LLM evaluations attached
        """
        logger.info("=" * 60)
        logger.info("Stage 4: multi-word combinations + LLM evaluation")
        logger.info(f"  Max candidate words: {max_candidates}")
        logger.info(f"  Max combination length: {max_combo_length} words")
        logger.info(f"  Concurrency: {max_workers} original features")
        logger.info("=" * 60)
        total_features = len(features_data)
        # Process different original features in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            futures = []
            for idx, feature_result in enumerate(features_data, 1):
                future = executor.submit(
                    self._process_single_feature_combinations,
                    idx,
                    total_features,
                    feature_result,
                    max_candidates,
                    max_combo_length
                )
                futures.append((future, feature_result))
            # Wait for completion and collect results
            for future, feature_result in futures:
                try:
                    _ = future.result()  # wait; results are written back into feature_result
                except Exception as e:
                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")
        # Save results
        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 4 complete")
        logger.info("=" * 60)
        return features_data

    def _process_single_feature_combinations(
        self,
        idx: int,
        total: int,
        feature_result: Dict[str, Any],
        max_candidates: int,
        max_combo_length: int
    ) -> None:
        """
        Generate and evaluate combinations for a single original feature.

        Steps:
            1. Get base_word from Stage 1's 最高匹配信息
            2. Get candidates from Stage 3's 高相似度候选 (top max_candidates)
            3. Generate 2-N word combinations
            4. LLM batch evaluation
            5. Select Top 10 and write back

        Args:
            idx: feature index
            total: total number of features
            feature_result: the feature result data
            max_candidates: maximum number of candidate words per combination pool
            max_combo_length: maximum words per combination
        """
        original_feature = feature_result['原始特征名称']
        logger.info(f"\n[{idx}/{total}] Processing: {original_feature}")
        # Step 1: get the base word
        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
        if not base_word:
            logger.info("  No base word, skipping")
            feature_result['组合评估结果'] = []
            return
        logger.info(f"  Base word: {base_word}")
        # Step 2: get candidate words (from the high-similarity candidates)
        high_sim_candidates = feature_result.get('高相似度候选', [])
        # Cap the number of candidates
        candidates = high_sim_candidates[:max_candidates]
        candidate_words = [c['人设特征名称'] for c in candidates]
        if not candidate_words:
            logger.info("  No candidate words, skipping")
            feature_result['组合评估结果'] = []
            return
        logger.info(f"  Candidate words: {len(candidate_words)} (cap: {max_candidates})")
        # Step 3: generate all combinations
        all_combinations = []
        # Candidate combinations of 1 to max_combo_length-1 words (base_word is added on top)
        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
            for combo in combinations(candidate_words, length):
                # Search phrase = base word + candidate combination
                search_phrase = base_word + ' ' + ' '.join(combo)
                all_combinations.append({
                    'search_word': search_phrase,
                    'base_word': base_word,
                    'candidate_words': list(combo),
                    'combo_length': length + 1  # +1 because base_word is included
                })
        logger.info(f"  Generated {len(all_combinations)} combinations")
        # Step 4: LLM batch evaluation
        logger.info("  Starting LLM evaluation...")
        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
            original_feature=original_feature,
            search_words=[c['search_word'] for c in all_combinations],
            batch_size=50
        )
        # Step 5: take the Top 10 (assumes the evaluator returns results sorted by score)
        top_10 = evaluated[:10]
        # Write back
        feature_result['组合评估结果'] = top_10
        max_score = top_10[0]['score'] if top_10 else 0.0
        logger.info(f"  Evaluation done; Top 10 best score: {max_score:.3f}")
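
    # Combination count: with C candidate words and a maximum length L (base word
    # included), the pool holds sum(comb(C, k) for k in range(1, min(L, C + 1)))
    # phrases. With run_full_pipeline's settings (max_candidates=20,
    # max_combo_length=3): comb(20, 1) + comb(20, 2) = 20 + 190 = 210 combinations
    # per original feature.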

    # ========== Stage 5: execute searches ==========
    def _execute_single_search(
        self,
        idx: int,
        total: int,
        search_word: str,
        feature_ref: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Execute a single search task (for concurrent execution).

        Args:
            idx: search index
            total: total number of searches
            search_word: the search word
            feature_ref: feature reference (results are written into it)

        Returns:
            Search result info
        """
        logger.info(f"[{idx}/{total}] Searching: {search_word}")
        try:
            result = self.search_client.search(
                keyword=search_word,
                content_type='不限',
                sort_type='综合',
                max_retries=3,
                use_cache=True  # enable the search cache
            )
            note_count = len(result.get('data', {}).get('data', []))
            logger.info(f"  ✓ Success, got {note_count} notes")
            # Write back the result
            feature_ref['search_result'] = result
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'success',
                'note_count': note_count,
                'search_params': {
                    'keyword': search_word,
                    'content_type': '不限',
                    'sort_type': '综合'
                }
            }
            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
        except Exception as e:
            logger.error(f"  ✗ Failed: {e}")
            feature_ref['search_result'] = None
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'failed',
                'note_count': 0,
                'error': str(e)
            }
            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}

    def stage5_execute_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0,
        top_n: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Stage 5: execute Xiaohongshu searches.

        Args:
            features_data: Stage 4 data
            search_delay: delay between searches (unused; searches now run concurrently)
            top_n: keep the N highest-scored search words per original feature

        Returns:
            The data with search results attached
        """
        logger.info("=" * 60)
        logger.info("Stage 5: executing Xiaohongshu searches")
        logger.info("=" * 60)
        # Group search words by original feature (read from Stage 4's 组合评估结果)
        feature_search_groups = {}
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            if original_feature not in feature_search_groups:
                feature_search_groups[original_feature] = []
            # Read from the Stage 4 combination evaluations
            for eval_item in feature_result.get('组合评估结果', []):
                sw = eval_item.get('search_word')
                if not sw:
                    continue
                score = eval_item.get('score', 0.0)
                feature_search_groups[original_feature].append({
                    'search_word': sw,
                    'score': score,
                    'feature_ref': eval_item  # reference to the evaluation item; search results are written into it
                })
        # Take the Top N per group
        all_searches = []
        total_before_filter = 0
        total_filtered = 0
        for original_feature, search_list in feature_search_groups.items():
            total_before_filter += len(search_list)
            # Sort by score, descending
            sorted_list = sorted(search_list, key=lambda x: x['score'], reverse=True)
            # Keep the top_n entries
            selected = sorted_list[:top_n]
            all_searches.extend(selected)
            filtered = len(sorted_list) - len(selected)
            total_filtered += filtered
            logger.info(f"  {original_feature}: selected Top {len(selected)} of {len(sorted_list)} search words ({filtered} filtered out)")
        # Apply the global search cap
        if self.max_total_searches and len(all_searches) > self.max_total_searches:
            logger.info(f"  Applying global cap: reducing from {len(all_searches)} to {self.max_total_searches}")
            all_searches = all_searches[:self.max_total_searches]
        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
        logger.info(f"  Running searches concurrently (workers: {self.search_max_workers})")
        # Execute searches concurrently
        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
            # Submit all search tasks
            futures = []
            for idx, item in enumerate(all_searches, 1):
                future = executor.submit(
                    self._execute_single_search,
                    idx,
                    len(all_searches),
                    item['search_word'],
                    item['feature_ref']
                )
                futures.append(future)
            # Wait for all searches to finish
            for future in as_completed(futures):
                try:
                    future.result()
                    # Results are already written into feature_ref; nothing else to do
                except Exception as e:
                    logger.error(f"  Search task failed: {e}")
        # Save results
        output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 5 complete")
        logger.info("=" * 60)
        return features_data
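
    # Data-flow note: 'feature_ref' above aliases entries of Stage 4's
    # '组合评估结果', so Stage 5 attaches 'search_result'/'search_metadata' to
    # those evaluation items. Stages 6 and 7 below instead look for search
    # results on each association's '特征列表' entries, a layout the current
    # Stage 5 does not produce; both stages are disabled in run_full_pipeline
    # and retained unchanged.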

    # ========== Stage 6: LLM evaluation of search results ==========
    def stage6_evaluate_search_results(
        self,
        features_data: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Stage 6: evaluate search results with the LLM (multimodal).

        Args:
            features_data: Stage 5 data

        Returns:
            The data with result evaluations attached
        """
        logger.info("=" * 60)
        logger.info("Stage 6: LLM evaluation of search results")
        logger.info("=" * 60)
        # Collect all feature nodes that need evaluation
        features_to_evaluate = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    if feature.get('search_result') and feature['search_metadata']['status'] == 'success':
                        features_to_evaluate.append({
                            'original_feature': original_feature,
                            'feature_node': feature
                        })
        logger.info(f"{len(features_to_evaluate)} search results to evaluate")
        # Evaluate in parallel (moderate concurrency)
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            for item in features_to_evaluate:
                future = executor.submit(
                    self._evaluate_single_search_result,
                    item['original_feature'],
                    item['feature_node']
                )
                futures.append((future, item))
            # Collect results
            for idx, (future, item) in enumerate(futures, 1):
                try:
                    evaluation = future.result()
                    item['feature_node']['result_evaluation'] = evaluation
                    logger.info(f"  [{idx}/{len(futures)}] {item['feature_node']['search_word']}: "
                                f"relevance={evaluation['overall_relevance']:.3f}")
                except Exception as e:
                    logger.error(f"  Evaluation failed: {item['feature_node']['search_word']}, error: {e}")
                    item['feature_node']['result_evaluation'] = None
        # Save results
        output_path = os.path.join(self.output_dir, "stage6_with_evaluations.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 6 complete")
        logger.info("=" * 60)
        return features_data

    def _evaluate_single_search_result(
        self,
        original_feature: str,
        feature_node: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Evaluate a single search result (using parallel evaluation).

        Args:
            original_feature: the original feature
            feature_node: the feature node

        Returns:
            The evaluation result
        """
        search_word = feature_node.get('search_word', '')
        notes = feature_node['search_result'].get('data', {}).get('data', [])
        return self.llm_evaluator.evaluate_search_results_parallel(
            original_feature=original_feature,
            search_word=search_word,
            notes=notes,
            max_notes=20,
            max_workers=20  # evaluate up to 20 notes concurrently
        )

    # ========== Stage 7: extended searches ==========
    def stage7_extended_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0
    ) -> List[Dict[str, Any]]:
        """
        Stage 7: run extended searches based on the evaluation results (one per extracted element).

        Args:
            features_data: Stage 6 data
            search_delay: delay between searches

        Returns:
            The data with extended searches attached
        """
        logger.info("=" * 60)
        logger.info("Stage 7: extended searches")
        logger.info("=" * 60)
        # Collect the extended search tasks
        extension_tasks = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    result_eval = feature.get('result_evaluation')
                    if not result_eval:
                        continue
                    extracted_elements = result_eval.get('extracted_elements', [])
                    if not extracted_elements:
                        continue
                    # Create one extended search per extracted element
                    base_search_word = feature.get('search_word', '')
                    for element in extracted_elements:
                        extended_keyword = f"{base_search_word} {element}"
                        extension_tasks.append({
                            'extended_keyword': extended_keyword,
                            'original_feature': original_feature,
                            'feature_node': feature,
                            'element': element
                        })
        logger.info(f"{len(extension_tasks)} extended search tasks in total")
        # Run the extended searches
        for idx, task in enumerate(extension_tasks, 1):
            extended_kw = task['extended_keyword']
            logger.info(f"[{idx}/{len(extension_tasks)}] Extended search: {extended_kw}")
            try:
                result = self.search_client.search(
                    keyword=extended_kw,
                    content_type='不限',
                    sort_type='综合',
                    max_retries=3,
                    use_cache=True  # enable the search cache
                )
                note_count = len(result.get('data', {}).get('data', []))
                logger.info(f"  ✓ Success, got {note_count} notes")
                # Evaluate the extended search results
                logger.info("  Evaluating extended search results...")
                evaluation = self.llm_evaluator.evaluate_search_results(
                    original_feature=task['original_feature'],
                    search_word=extended_kw,
                    notes=result.get('data', {}).get('data', []),
                    max_notes=20,
                    max_images_per_note=2
                )
                # Store the extended search result
                feature_node = task['feature_node']
                if 'extended_searches' not in feature_node:
                    feature_node['extended_searches'] = []
                feature_node['extended_searches'].append({
                    'extended_keyword': extended_kw,
                    'based_on_element': task['element'],
                    'search_result': result,
                    'search_metadata': {
                        'searched_at': datetime.now().isoformat(),
                        'status': 'success',
                        'note_count': note_count
                    },
                    'result_evaluation': evaluation
                })
                logger.info(f"  Evaluation done, relevance={evaluation['overall_relevance']:.3f}")
            except Exception as e:
                logger.error(f"  ✗ Failed: {e}")
            # Delay between searches
            if idx < len(extension_tasks):
                time.sleep(search_delay)
        # Save results
        output_path = os.path.join(self.output_dir, "stage7_final_results.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 7 complete")
        logger.info("=" * 60)
        return features_data

    # ========== Main pipeline ==========
    def run_full_pipeline(self):
        """Run the full pipeline."""
        logger.info("\n" + "=" * 60)
        logger.info("Starting the full pipeline")
        logger.info("=" * 60)
        try:
            # Stage 1
            stage1_results = self.stage1_filter_features()
            # Stage 2
            stage2_results = self.stage2_find_associations(stage1_results)
            # Stage 3 - new approach: filter high-similarity matches
            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
            # Stage 4
            stage4_results = self.stage4_generate_and_evaluate_search_words(
                stage3_results,
                max_workers=8,      # concurrency raised from 4 to 8
                max_combo_length=3  # combination length lowered from 4 to 3
            )
            # Stage 5
            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)
            # Stage 6 - temporarily disabled (code retained)
            # stage6_results = self.stage6_evaluate_search_results(stage5_results)
            # Stage 7 - temporarily disabled (code retained)
            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
            logger.info("\n" + "=" * 60)
            logger.info("✓ Full pipeline finished (Stages 1-5)")
            logger.info("=" * 60)
            # Generate the visualization automatically
            logger.info("\n" + "=" * 60)
            logger.info("Generating visualization...")
            logger.info("=" * 60)
            try:
                result = subprocess.run(
                    ['python3', 'visualize_stage5_results.py'],
                    capture_output=True,
                    text=True,
                    timeout=60
                )
                if result.returncode == 0:
                    logger.info("✓ Visualization generated")
                    logger.info(result.stdout)
                else:
                    logger.error(f"Visualization failed: {result.stderr}")
            except subprocess.TimeoutExpired:
                logger.error("Visualization timed out")
            except Exception as e:
                logger.error(f"Visualization error: {e}")
            return stage5_results
        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            raise


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description='Enhanced Search System V2')
    parser.add_argument(
        '--how-json',
        default='69114f150000000007001f30_how copy.json',
        help='Path to the "how" deconstruction file'
    )
    parser.add_argument(
        '--dimension-associations',
        default='dimension_associations_analysis.json',
        help='Path to the dimension-associations file'
    )
    parser.add_argument(
        '--optimized-clustered',
        default='optimized_clustered_data_gemini-3-pro-preview.json',
        help='Path to the persona feature library'
    )
    parser.add_argument(
        '--api-key',
        default=None,
        help='OpenRouter API key (defaults to the environment variable)'
    )
    parser.add_argument(
        '--output-dir',
        default='output_v2',
        help='Output directory'
    )
    parser.add_argument(
        '--top-n',
        type=int,
        default=10,
        help='Keep the N highest-scored search words per original feature (default 10)'
    )
    parser.add_argument(
        '--max-total-searches',
        type=int,
        default=None,
        help='Global cap on the number of searches (default None, unlimited)'
    )
    parser.add_argument(
        '--search-workers',
        type=int,
        default=3,
        help='Search concurrency (default 3)'
    )
    args = parser.parse_args()
    # Create the system instance
    system = EnhancedSearchV2(
        how_json_path=args.how_json,
        dimension_associations_path=args.dimension_associations,
        optimized_clustered_data_path=args.optimized_clustered,
        openrouter_api_key=args.api_key,
        output_dir=args.output_dir,
        top_n=args.top_n,
        max_total_searches=args.max_total_searches,
        search_max_workers=args.search_workers
    )
    # Run the full pipeline
    system.run_full_pipeline()


if __name__ == '__main__':
    main()
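
# Example invocation (all flags are defined above; the values are illustrative):
#   python3 enhanced_search_v2.py \
#       --output-dir output_v2 --top-n 5 --max-total-searches 100 --search-workers 3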