enhanced_search_v2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Search System V2

Full pipeline with LLM evaluation and extended-search support.
"""
import json
import logging
import copy
import time
import os
import argparse
import subprocess
from typing import Dict, List, Any, Optional, Set, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from openrouter_client import OpenRouterClient
from llm_evaluator import LLMEvaluator
from xiaohongshu_search import XiaohongshuSearch
from stage7_analyzer import Stage7DeconstructionAnalyzer

# Configure logging: write to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('enhanced_search_v2.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
class EnhancedSearchV2:
    """Enhanced search system V2."""

    def __init__(
        self,
        how_json_path: str,
        dimension_associations_path: str,
        intra_associations_path: str,
        optimized_clustered_data_path: str,
        openrouter_api_key: Optional[str] = None,
        output_dir: str = "output_v2",
        top_n: int = 10,
        max_total_searches: Optional[int] = None,
        search_max_workers: int = 3,
        max_searches_per_feature: Optional[int] = None,
        max_searches_per_base_word: Optional[int] = None,
        combination_source: str = "how_based",
        enable_stage6: bool = False,
        stage6_max_workers: int = 10,
        stage6_max_notes: int = 20,
        enable_stage7: bool = False,
        stage7_only: bool = False,
        stage7_max_workers: int = 5,
        stage7_max_notes: Optional[int] = None,
        stage7_skip: int = 0,
        stage7_sort_by: str = 'score',
        stage7_api_url: str = "http://192.168.245.150:7000/what/analysis/single",
        stage7_min_score: float = 0.8
    ):
        """
        Initialize the system.

        Args:
            how_json_path: Path to the "how" deconstruction file.
            dimension_associations_path: Path to the cross-dimension association file.
            intra_associations_path: Path to the intra-dimension association file.
            optimized_clustered_data_path: Path to the persona feature library.
            openrouter_api_key: OpenRouter API key.
            output_dir: Output directory.
            top_n: Keep the N highest-scored search words per original feature (default 10).
            max_total_searches: Global cap on the number of searches (default None, unlimited).
            search_max_workers: Search concurrency (default 3).
            max_searches_per_feature: Max searches per original feature (default None, unlimited).
            max_searches_per_base_word: Max searches per base word (default None, unlimited).
            combination_source: Where combination words come from (default "how_based"):
                - "how_based": take candidates with similarity >= 0.8 from the how file (new, default)
                - "association": take candidates from association analysis (legacy)
            enable_stage6: Whether to run Stage 6 evaluation (default False).
            stage6_max_workers: Stage 6 evaluation concurrency (default 10).
            stage6_max_notes: Max notes evaluated per search result (default 20).
            enable_stage7: Whether to run Stage 7 deep deconstruction (default False).
            stage7_only: Run only Stage 7, starting from Stage 6 results (default False).
            stage7_max_workers: Stage 7 concurrency (default 5).
            stage7_max_notes: Max notes processed by Stage 7 (default None, unlimited).
            stage7_skip: Number of leading notes Stage 7 skips (default 0).
            stage7_sort_by: Stage 7 sort order: score/time/engagement (default "score").
            stage7_api_url: Stage 7 deconstruction API endpoint.
            stage7_min_score: Minimum score Stage 7 will process (default 0.8 on a 0-1 scale).
        """
        self.how_json_path = how_json_path
        self.dimension_associations_path = dimension_associations_path
        self.intra_associations_path = intra_associations_path
        self.optimized_clustered_data_path = optimized_clustered_data_path
        self.output_dir = output_dir
        self.top_n = top_n
        self.max_total_searches = max_total_searches
        self.search_max_workers = search_max_workers
        self.max_searches_per_feature = max_searches_per_feature
        self.max_searches_per_base_word = max_searches_per_base_word
        self.combination_source = combination_source
        self.enable_stage6 = enable_stage6
        self.stage6_max_workers = stage6_max_workers
        self.stage6_max_notes = stage6_max_notes
        self.enable_stage7 = enable_stage7
        self.stage7_only = stage7_only

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Load data files
        logger.info("Loading data files...")
        self.how_data = self._load_json(how_json_path)
        self.dimension_associations = self._load_json(dimension_associations_path)
        self.intra_associations = self._load_json(intra_associations_path)
        self.optimized_clustered_data = self._load_json(optimized_clustered_data_path)

        # Initialize components
        logger.info("Initializing components...")
        self.openrouter_client = OpenRouterClient(
            api_key=openrouter_api_key,
            model="google/gemini-2.5-flash",
            retry_delay=5  # longer retry delay to avoid rate limiting
        )
        self.llm_evaluator = LLMEvaluator(self.openrouter_client)
        self.search_client = XiaohongshuSearch()

        # Initialize the Stage 7 analyzer
        self.stage7_analyzer = Stage7DeconstructionAnalyzer(
            api_url=stage7_api_url,
            max_workers=stage7_max_workers,
            max_notes=stage7_max_notes,
            min_score=stage7_min_score,
            skip_count=stage7_skip,
            sort_by=stage7_sort_by,
            output_dir=output_dir,
            enable_image_download=False,  # use original image URLs directly, no proxying
            image_server_url="http://localhost:8765",  # image server URL (deprecated)
            image_download_dir="downloaded_images"  # image download directory (deprecated)
        )
        logger.info("System initialized")
    def _load_json(self, file_path: str) -> Any:
        """Load a JSON file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load file {file_path}: {e}")
            raise

    def _save_json(self, data: Any, file_path: str):
        """Save data as a JSON file."""
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Saved: {file_path}")
        except Exception as e:
            logger.error(f"Failed to save file {file_path}: {e}")
            raise
    # ========== Stage 1: select features with 0.5 <= similarity < 0.8 ==========
    def stage1_filter_features(self) -> List[Dict[str, Any]]:
        """
        Stage 1: select medium-similarity features.

        Selection criterion: 0.5 <= highest similarity < 0.8.

        Returns:
            The list of selected features.
        """
        logger.info("=" * 60)
        logger.info("Stage 1: selecting medium-similarity features (0.5 <= similarity < 0.8)")
        logger.info("=" * 60)

        results = []
        how_result = self.how_data.get('how解构结果', {})
        total_features = 0
        filtered_out_low = 0   # similarity < 0.5
        filtered_out_high = 0  # similarity >= 0.8
        selected_count = 0

        # Walk all three dimensions
        for level_name, level_list in how_result.items():
            if not isinstance(level_list, list):
                continue
            logger.info(f"\nProcessing {level_name}...")
            for item_idx, item in enumerate(level_list):
                item_name = item.get('名称', f'未命名-{item_idx}')
                how_steps = item.get('how步骤列表', [])
                for step in how_steps:
                    features = step.get('特征列表', [])
                    for feature in features:
                        feature_name = feature.get('特征名称', '')
                        match_results = feature.get('匹配结果', [])
                        total_features += 1
                        if not match_results:
                            continue

                        # Find the highest similarity among the matches
                        max_similarity = max(
                            (m.get('匹配结果', {}).get('相似度', 0) for m in match_results),
                            default=0
                        )

                        # Apply the selection criterion
                        if max_similarity < 0.5:
                            filtered_out_low += 1
                            continue
                        elif max_similarity >= 0.8:
                            filtered_out_high += 1
                            continue

                        # 0.5 <= max_similarity < 0.8: keep.
                        # Sort matches by similarity (descending) and take the top 3.
                        sorted_matches = sorted(
                            match_results,
                            key=lambda x: x.get('匹配结果', {}).get('相似度', 0),
                            reverse=True
                        )
                        top3_matches = sorted_matches[:3]

                        # Build the top-3 match info list
                        top3_match_info = []
                        for match in top3_matches:
                            feature_classification = match.get('特征分类', [])
                            classification_path = self._build_classification_path(feature_classification)
                            # If the path is empty and the match is a classification,
                            # search the feature library to fill in the path.
                            if not classification_path and match.get('特征类型') == '分类':
                                feature_name_to_search = match.get('人设特征名称', '')
                                classification_path = self._search_classification_path(feature_name_to_search)
                            is_classification = self._is_classification(match.get('人设特征名称', ''), classification_path)
                            top3_match_info.append({
                                '人设特征名称': match.get('人设特征名称'),
                                '人设特征层级': match.get('人设特征层级'),
                                '特征类型': match.get('特征类型'),
                                '特征分类': feature_classification,
                                '相似度': match.get('匹配结果', {}).get('相似度', 0),
                                '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                '是分类': is_classification,
                                '所属分类路径': classification_path
                            })

                        result_item = {
                            '原始特征名称': feature_name,
                            '来源层级': level_name,
                            '权重': feature.get('权重', 0),
                            '所属点名称': item_name,
                            '最高匹配信息': top3_match_info[0],  # keep the first for Stage 2
                            'top3匹配信息': top3_match_info      # new field
                        }
                        results.append(result_item)
                        selected_count += 1

                        # Log the top-3 matches
                        top3_names = [m['人设特征名称'] for m in top3_match_info]
                        logger.info(f"  ✓ {feature_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")

        # Summary statistics
        logger.info("\n" + "=" * 60)
        logger.info("Stage 1 complete")
        logger.info(f"  Total features: {total_features}")
        logger.info(f"  Filtered out (<0.5): {filtered_out_low}")
        logger.info(f"  Filtered out (>=0.8): {filtered_out_high}")
        logger.info(f"  Kept (0.5-0.8): {selected_count}")
        logger.info("=" * 60)

        # Save the results
        output_path = os.path.join(self.output_dir, "stage1_filtered_features.json")
        self._save_json(results, output_path)
        return results
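    # Shape of one Stage 1 result item (field values below are hypothetical;
    # the keys match the construction above):
    # {
    #     '原始特征名称': '熬夜加班',
    #     '来源层级': '关键点列表',
    #     '权重': 0.6,
    #     '所属点名称': '...',
    #     '最高匹配信息': {...},  # first entry of top3匹配信息, kept for Stage 2
    #     'top3匹配信息': [{'人设特征名称': '...', '相似度': 0.72,
    #                      '是分类': False, '所属分类路径': '实质/...'}, ...]
    # }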
    def _build_classification_path(self, feature_classification: List[str]) -> str:
        """
        Build a classification path.

        Args:
            feature_classification: Feature classification array.

        Returns:
            The classification path.
        """
        if not feature_classification:
            return ""

        # Step 1: strip the "实质" suffix from middle elements
        cleaned = []
        for i, item in enumerate(feature_classification):
            if i == len(feature_classification) - 1:  # keep the last element
                cleaned.append(item)
            elif item.endswith("实质") and i != 0:  # strip "实质" from middle elements
                cleaned.append(item[:-2])
            else:
                cleaned.append(item)

        # Step 2: reverse the array
        reversed_list = list(reversed(cleaned))

        # Step 3: join into a path
        path = "/".join(reversed_list)
        return path
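    # Worked example (hypothetical input):
    #   ['疲惫与熬夜状态', '生理状态与行为实质', '实质']
    #   → strip "实质" from the middle element → ['疲惫与熬夜状态', '生理状态与行为', '实质']
    #   → reverse and join → '实质/生理状态与行为/疲惫与熬夜状态'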
    def _is_classification(self, persona_feature_name: str, classification_path: str) -> bool:
        """
        Decide whether a name is a classification or a feature.

        Args:
            persona_feature_name: Persona feature name.
            classification_path: Classification path.

        Returns:
            True if it is a classification, False if it is a feature.
        """
        # Look it up in optimized_clustered_data:
        # if it appears in a node's feature list, it is a feature;
        # if it exists as a child node with children, it is a classification.

        # Navigate to the node
        node = self._navigate_to_node(classification_path)
        if not node:
            return False

        # Check the node's feature list
        features = node.get('特征列表', [])
        for f in features:
            if f.get('特征名称') == persona_feature_name:
                return False  # found in the feature list: it is a feature

        # Check whether it exists as a child node
        if persona_feature_name in node:
            sub_node = node[persona_feature_name]
            if isinstance(sub_node, dict):
                return True  # it is a child node: a classification

        return False  # default: treat as a feature
    def _navigate_to_node(self, path: str) -> Optional[Dict[str, Any]]:
        """
        Navigate to the node at the given path.

        Args:
            path: Path such as "实质/猫咪宠物".

        Returns:
            The node, or None if not found.
        """
        if not path:
            return None

        parts = path.split('/')
        first_part = parts[0]

        # Determine the top-level key
        top_level_map = {
            '意图': '目的点',
            '要素': '目的点',
            '实质': None,
            '形式': None,
            '场景': None
        }
        top_keys = []
        if first_part in top_level_map:
            mapped = top_level_map[first_part]
            if mapped:
                top_keys.append(mapped)
        if not top_keys:
            top_keys = ['灵感点列表', '关键点列表', '目的点']

        # Try each candidate top level
        for top_key in top_keys:
            current = self.optimized_clustered_data.get(top_key)
            if not current:
                continue
            # Navigate level by level
            found = True
            for part in parts:
                if isinstance(current, dict) and part in current:
                    current = current[part]
                else:
                    found = False
                    break
            if found and isinstance(current, dict):
                return current
        return None
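    # Assumed shape of optimized_clustered_data (inferred from the navigation
    # above; the concrete nested keys are hypothetical):
    # {
    #     '灵感点列表': {'实质': {'猫咪宠物': {'特征列表': [...], '某子分类': {...}}, ...}, ...},
    #     '关键点列表': {...},
    #     '目的点': {...}
    # }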
    def _recursive_search(
        self,
        obj: Dict[str, Any],
        target_name: str,
        current_path: str = ""
    ) -> Optional[str]:
        """
        Recursively search for a classification node.

        Args:
            obj: Object currently being searched.
            target_name: Target classification name.
            current_path: Current path.

        Returns:
            The full path if found, otherwise None.
        """
        if not isinstance(obj, dict):
            return None

        # Walk all keys
        for key in obj.keys():
            # Skip metadata and feature lists
            if key in ['_meta', '特征列表']:
                continue
            # Check for a match (substring containment in either direction)
            if target_name in key or key in target_name:
                # Match found: return the path
                if current_path:
                    return f"{current_path}/{key}"
                else:
                    return key
            # Recurse into child nodes
            if isinstance(obj[key], dict):
                next_path = f"{current_path}/{key}" if current_path else key
                result = self._recursive_search(obj[key], target_name, next_path)
                if result:
                    return result
        return None
    def _search_classification_path(self, classification_name: str) -> str:
        """
        Search optimized_clustered_data for a classification node's path.

        Args:
            classification_name: Classification name, e.g. "实体物品实质".

        Returns:
            The full path, e.g. "实质/实体物品"; an empty string if not found.
        """
        if not classification_name:
            return ""

        # Clean the name: strip common suffixes
        clean_name = classification_name
        for suffix in ['实质', '意图', '形式', '要素']:
            if clean_name.endswith(suffix) and len(clean_name) > len(suffix):
                clean_name = clean_name[:-len(suffix)]
                break

        logger.info(f"  Searching classification: {classification_name} → cleaned to: {clean_name}")

        # Search the three top-level lists
        for top_key in ['灵感点列表', '关键点列表', '目的点']:
            top_data = self.optimized_clustered_data.get(top_key, {})
            if not top_data:
                continue
            # Recursive search
            path = self._recursive_search(top_data, clean_name, "")
            if path:
                logger.info(f"  ✓ Found path: {path}")
                return path

        logger.warning(f"  ✗ Classification path not found: {classification_name}")
        return ""
    # ========== Stage 2: collect associated classifications + tags + sub-classifications ==========
    def stage2_find_associations(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 2: find associated classifications and collect their names, tags,
        and sub-classifications.

        Improvement: look up associations for each of the top-3 base words.

        Args:
            filtered_features: Features selected in Stage 1.

        Returns:
            The feature list with association info attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 2: finding associated classifications (per base word)")
        logger.info("=" * 60)

        for idx, feature in enumerate(filtered_features, 1):
            logger.info(f"\n[{idx}/{len(filtered_features)}] Processing: {feature['原始特征名称']}")

            # Get the top-3 base words
            top3_info = feature.get('top3匹配信息', [])
            if not top3_info:
                logger.warning("  No top-3 match info, skipping")
                feature['找到的关联_按base_word'] = {}
                continue
            logger.info(f"  Found {len(top3_info)} base words")

            # Look up associations for each base word
            associations_by_base_word = {}
            for base_idx, base_info in enumerate(top3_info, 1):
                base_word = base_info.get('人设特征名称', '')
                is_classification = base_info['是分类']
                classification_path = base_info['所属分类路径']
                source_level = base_info['人设特征层级']
                logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word}")

                if is_classification:
                    search_path = classification_path
                    logger.info(f"    Matched a classification: {search_path}")
                else:
                    search_path = classification_path
                    logger.info(f"    Matched a feature; using its classification: {search_path}")

                # Find cross-dimension associations
                associations = self._find_associations(search_path, source_level)
                logger.info(f"    Found {len(associations)} cross-dimension associations")

                # Find intra-dimension associations
                intra_associations = self._find_intra_dimension_associations(search_path, source_level)
                logger.info(f"    Found {len(intra_associations)} intra-dimension associations")

                # Merge both kinds of associations
                all_associations = associations + intra_associations

                # Collect association info
                base_word_associations = []
                for assoc in all_associations:
                    target_path = assoc['目标分类']
                    # Collect classification info
                    classification_info = self._collect_classification_info(target_path)
                    if classification_info:
                        # Check whether this is an intra-dimension association
                        is_intra = assoc['关联类型'] == '维度内组合关联'
                        base_word_associations.append({
                            '来源方向': assoc['来源方向'],
                            '关联类型': assoc['关联类型'],
                            '目标分类路径': target_path,
                            '共同帖子数': assoc.get('点数', assoc.get('共同帖子数', 0)),
                            'Jaccard相似度': assoc.get('Jaccard相似度', 0.0) if not is_intra else 0.0,
                            '分类名称': classification_info['classification_name'],
                            '标签列表': classification_info['tags'],
                            '子分类列表': classification_info['sub_classifications']
                        })

                associations_by_base_word[base_word] = base_word_associations
                logger.info(f"    Total {len(base_word_associations)} associations "
                            f"(cross-dimension: {len(associations)}, intra-dimension: {len(intra_associations)})")

            # Save the per-base-word results
            feature['找到的关联_按base_word'] = associations_by_base_word

            # Backward compatibility: keep the associations of the first base word
            first_base_word = top3_info[0].get('人设特征名称', '')
            feature['找到的关联'] = associations_by_base_word.get(first_base_word, [])

            total_associations = sum(len(v) for v in associations_by_base_word.values())
            logger.info(f"  Found {total_associations} associations in total "
                        f"({len(associations_by_base_word)} base words)")

        # Save the results
        output_path = os.path.join(self.output_dir, "stage2_associations.json")
        self._save_json(filtered_features, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 2 complete")
        logger.info("=" * 60)
        return filtered_features
    def _find_associations(self, classification_path: str, source_level: str) -> List[Dict[str, Any]]:
        """
        Find associated nodes.

        Args:
            classification_path: Classification path.
            source_level: Source level.

        Returns:
            List of associated nodes.
        """
        associations = []

        # Determine the dimension name
        if '灵感点' in source_level:
            dimension_key = '灵感点维度'
        elif '关键点' in source_level:
            dimension_key = '关键点维度'
        elif '目的点' in source_level:
            dimension_key = '目的点维度'
        else:
            return associations

        # Get the dimension data
        single_dim = self.dimension_associations.get('单维度关联分析', {})
        dimension_data = single_dim.get(dimension_key, {})
        if not dimension_data:
            return associations

        # Walk all directions
        for direction_key, direction_data in dimension_data.items():
            if direction_key == '说明':
                continue
            # Look up the source classification
            if classification_path in direction_data:
                source_data = direction_data[classification_path]
                # Collect the associated nodes
                for assoc_key in source_data.keys():
                    if assoc_key.startswith('与') and assoc_key.endswith('的关联'):
                        assoc_list = source_data[assoc_key]
                        for assoc_item in assoc_list:
                            associations.append({
                                '来源方向': direction_key,
                                '关联类型': assoc_key,
                                '目标分类': assoc_item.get('目标分类'),
                                '目标层级': assoc_item.get('目标层级'),
                                '共同帖子数': assoc_item.get('共同帖子数'),
                                'Jaccard相似度': assoc_item.get('Jaccard相似度'),
                                '共同帖子ID': assoc_item.get('共同帖子ID', [])
                            })
        return associations
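    # Assumed shape of dimension_associations (inferred from the lookups above;
    # the direction name and values are hypothetical):
    # {
    #     '单维度关联分析': {
    #         '灵感点维度': {
    #             '<方向>': {
    #                 '<分类路径>': {
    #                     '与关键点的关联': [
    #                         {'目标分类': '...', '目标层级': '...', '共同帖子数': 3,
    #                          'Jaccard相似度': 0.12, '共同帖子ID': [...]}
    #                     ]
    #                 }
    #             }
    #         }
    #     }
    # }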
    def _find_intra_dimension_associations(
        self,
        classification_path: str,
        source_level: str
    ) -> List[Dict[str, Any]]:
        """
        Find intra-dimension associations.

        Within a single dimension, look up combination associations between leaf
        classifications. For example, if A and B often appear in the same post,
        they have an intra-dimension association.

        Args:
            classification_path: Classification path, e.g.
                "实质/身份与情绪/生理状态与行为/疲惫与熬夜状态".
            source_level: Source level, e.g. "关键点列表".

        Returns:
            A list of associations; each one contains:
            - 来源方向: "<dimension>-维度内"
            - 关联类型: 维度内组合关联
            - 目标分类: full path of the associated classification
            - 组合键: unique key of the combination (e.g. "夸张极致表现|疲惫与熬夜状态")
            - 点数: how many times the combination occurs
            - 目标层级: target level (same as the source level)
        """
        if not self.intra_associations:
            return []

        associations = []

        # Step 1: extract the leaf classification name (last path segment)
        if not classification_path:
            return []
        leaf_name = classification_path.split('/')[-1]

        # Step 2: determine the dimension
        dimension = None
        if '灵感点' in source_level:
            dimension = '灵感点'
        elif '关键点' in source_level:
            dimension = '关键点'
        elif '目的点' in source_level:
            dimension = '目的点'
        if not dimension:
            return []

        # Step 3: look up the combinations
        clusters = self.intra_associations.get('叶子分类组合聚类', {}).get(dimension, {})
        if not clusters:
            return []

        # Step 4: walk all combinations that contain the current leaf classification
        for combo_key, cluster in clusters.items():
            combo_parts = combo_key.split('|')
            # Skip combinations that do not contain the current leaf
            if leaf_name not in combo_parts:
                continue
            # Extract feature info from the point details
            for point in cluster.get('点详情列表', []):
                for feature in point.get('特征列表', []):
                    other_leaf = feature.get('叶子分类', '')
                    other_path = feature.get('完整路径', '')
                    # Skip the leaf itself
                    if other_leaf == leaf_name or not other_path:
                        continue
                    # Add an intra-dimension association
                    # (same structure as cross-dimension associations)
                    associations.append({
                        '来源方向': f'{dimension}-维度内',
                        '关联类型': '维度内组合关联',
                        '目标分类': other_path,  # key kept consistent with cross-dimension associations
                        '组合键': combo_key,
                        '点数': cluster.get('点数', 0),
                        '目标层级': source_level  # same level within the same dimension
                    })
        return associations
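    # Assumed shape of intra_associations (inferred from the lookups above;
    # the concrete combination key and counts are hypothetical):
    # {
    #     '叶子分类组合聚类': {
    #         '关键点': {
    #             '夸张极致表现|疲惫与熬夜状态': {
    #                 '点数': 5,
    #                 '点详情列表': [
    #                     {'特征列表': [{'叶子分类': '夸张极致表现',
    #                                   '完整路径': '形式/.../夸张极致表现'}, ...]}
    #                 ]
    #             }
    #         }
    #     }
    # }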
    def _collect_classification_info(self, classification_path: str) -> Optional[Dict[str, Any]]:
        """
        Collect classification info: name + tags + sub-classifications.

        Args:
            classification_path: Classification path.

        Returns:
            The classification info.
        """
        node = self._navigate_to_node(classification_path)
        if not node:
            return None

        # Classification name (last path segment)
        classification_name = classification_path.split('/')[-1]

        # Tags (the node's feature list)
        tags = [f.get('特征名称', '') for f in node.get('特征列表', [])]

        # Sub-classifications (child nodes, excluding _meta and the feature list)
        sub_classifications = [
            key for key in node.keys()
            if isinstance(node[key], dict) and key not in ['_meta', '特征列表']
        ]

        return {
            'classification_name': classification_name,
            'tags': tags,
            'sub_classifications': sub_classifications
        }
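    # Example return value (hypothetical data):
    # {
    #     'classification_name': '疲惫与熬夜状态',
    #     'tags': ['熬夜', '黑眼圈'],
    #     'sub_classifications': ['加班场景']
    # }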
    # ========== Stage 3: select high-similarity matches (>0.8) ==========
    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Stage 3: select high-similarity matches (>0.8).

        Improvement: select candidate words independently for each base word.
        Within each base word's association scope, find matches in the how
        deconstruction whose similarity is >0.8.

        Args:
            associations_data: Association data from Stage 2.

        Returns:
            The data with high-similarity candidates attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 3: selecting high-similarity matches (>0.8, per base word)")
        logger.info("=" * 60)

        for idx, feature_result in enumerate(associations_data, 1):
            original_feature_name = feature_result['原始特征名称']
            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {original_feature_name}")

            # Get the top-3 base words
            top3_info = feature_result.get('top3匹配信息', [])
            associations_by_base_word = feature_result.get('找到的关联_按base_word', {})
            if not top3_info or not associations_by_base_word:
                logger.warning("  No top-3 match info or association data, skipping")
                feature_result['高相似度候选_按base_word'] = {}
                continue
            logger.info(f"  Found {len(top3_info)} base words")

            # Select candidates independently for each base word
            candidates_by_base_word = {}
            for base_idx, base_info in enumerate(top3_info, 1):
                base_word = base_info.get('人设特征名称', '')
                logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word}")

                # Step 1: collect this base word's association scope
                base_word_associations = associations_by_base_word.get(base_word, [])
                base_word_scope = self._collect_scope_from_associations(base_word_associations)
                logger.info(f"    Scope contains {len(base_word_scope)} classifications/tags")
                if not base_word_scope:
                    logger.warning("    Empty scope, skipping")
                    candidates_by_base_word[base_word] = []
                    continue

                # Step 2: walk the how deconstruction and find high-similarity matches
                high_sim_candidates = []
                total_checked = 0
                high_sim_found = 0
                how_result = self.how_data.get('how解构结果', {})
                for level_name, level_list in how_result.items():
                    if not isinstance(level_list, list):
                        continue
                    for item in level_list:
                        for step in item.get('how步骤列表', []):
                            for feature in step.get('特征列表', []):
                                matches = feature.get('匹配结果', [])
                                total_checked += len(matches)
                                # Keep matches with similarity >0.8 that fall
                                # inside this base word's scope
                                for match in matches:
                                    sim = match.get('匹配结果', {}).get('相似度', 0)
                                    persona_feature_name = match.get('人设特征名称', '')
                                    if sim > 0.8 and persona_feature_name in base_word_scope:
                                        high_sim_found += 1
                                        high_sim_candidates.append({
                                            '人设特征名称': persona_feature_name,
                                            '相似度': sim,
                                            '特征类型': match.get('特征类型', ''),
                                            '特征分类': match.get('特征分类', []),
                                            '人设特征层级': match.get('人设特征层级', ''),
                                            '来源路径': self._build_classification_path(match.get('特征分类', [])),
                                            '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                            '来源原始特征': feature.get('特征名称', '')
                                        })
                logger.info(f"    Checked {total_checked} matches")
                logger.info(f"    Found {high_sim_found} matches with similarity >0.8")

                # Sort by similarity (descending) and deduplicate
                seen_names = set()
                unique_candidates = []
                high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
                for candidate in high_sim_candidates:
                    name = candidate['人设特征名称']
                    if name not in seen_names:
                        seen_names.add(name)
                        unique_candidates.append(candidate)
                candidates_by_base_word[base_word] = unique_candidates
                logger.info(f"    {len(unique_candidates)} candidates after deduplication")

                # Show the first 5
                if unique_candidates:
                    logger.info("    Top 5:")
                    for c in unique_candidates[:5]:
                        logger.info(f"      • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")

            # Save the per-base-word results
            feature_result['高相似度候选_按base_word'] = candidates_by_base_word

            # Backward compatibility: keep the first base word's candidates
            first_base_word = top3_info[0].get('人设特征名称', '')
            feature_result['高相似度候选'] = candidates_by_base_word.get(first_base_word, [])

            total_candidates = sum(len(v) for v in candidates_by_base_word.values())
            logger.info(f"  Selected {total_candidates} candidates in total "
                        f"({len(candidates_by_base_word)} base words)")

        # Save the results
        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
        self._save_json(associations_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 3 complete")
        logger.info("=" * 60)
        return associations_data
    def _collect_scope_from_associations(self, associations: List[Dict[str, Any]]) -> Set[str]:
        """
        Collect all classification names and tags from an association list into
        a scope set.

        Args:
            associations: Association list.

        Returns:
            A set containing all classification names and tags.
        """
        scope = set()
        for assoc in associations:
            # Add the classification name
            scope.add(assoc['分类名称'])
            # Add all tags
            tags = assoc.get('标签列表', [])
            scope.update(tags)
        return scope
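    # Example (hypothetical input):
    #   [{'分类名称': '疲惫与熬夜状态', '标签列表': ['熬夜', '黑眼圈'], ...}]
    #   → {'疲惫与熬夜状态', '熬夜', '黑眼圈'}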
    def stage23_extract_candidates_from_how(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        New approach: extract candidates with similarity >= 0.8 from the how file.

        Replaces Stages 2-3 while producing the same data structure.

        Processing steps:
        1. Walk every match result in how_data['how解构结果'].
        2. Keep persona feature names with similarity >= 0.8.
        3. Deduplicate (keeping the highest similarity).
        4. Sort by similarity, descending.
        5. Give every base word a copy of the same candidate list.
        6. Build the '高相似度候选_按base_word' structure.

        Args:
            filtered_features: Feature list selected in Stage 1.

        Returns:
            A feature list with exactly the same structure as Stage 3 output.
        """
        logger.info("=" * 60)
        logger.info("Stages 2-3 (new approach): extracting high-similarity candidates from the how file")
        logger.info("=" * 60)

        # Step 1: extract candidates from the whole how file
        candidates_dict = {}  # {persona feature name: candidate info}
        how_result = self.how_data.get('how解构结果', {})

        # Walk the three dimensions
        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
            features_list = how_result.get(dimension, [])
            for item in features_list:
                item_name = item.get('名称', '')
                how_steps = item.get('how步骤列表', [])
                for step in how_steps:
                    for feature in step.get('特征列表', []):
                        feature_name = feature.get('特征名称', '')
                        matches = feature.get('匹配结果', [])
                        for match in matches:
                            # Read the similarity from the nested match result
                            similarity = match.get('匹配结果', {}).get('相似度', 0)
                            persona_feature_name = match.get('人设特征名称', '')
                            # Keep matches with similarity >= 0.8
                            if similarity >= 0.8 and persona_feature_name:
                                # Deduplicate, keeping the highest similarity
                                if persona_feature_name not in candidates_dict or \
                                        similarity > candidates_dict[persona_feature_name]['相似度']:
                                    candidates_dict[persona_feature_name] = {
                                        '人设特征名称': persona_feature_name,
                                        '相似度': similarity,
                                        '特征类型': match.get('特征类型', ''),
                                        '特征分类': match.get('特征分类', []),
                                        '人设特征层级': match.get('人设特征层级', ''),
                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
                                        '来源原始特征': feature_name
                                    }

        # Step 2: convert to a list sorted by similarity, descending
        global_candidates = sorted(
            candidates_dict.values(),
            key=lambda x: x['相似度'],
            reverse=True
        )
        logger.info(f"Extracted {len(global_candidates)} unique high-similarity candidates from the how file")

        # Show the top 10 candidates
        if global_candidates:
            logger.info("Top 10 candidates:")
            for i, candidate in enumerate(global_candidates[:10], 1):
                logger.info(f"  {i}. {candidate['人设特征名称']} (similarity: {candidate['相似度']:.3f})")

        # Step 3: build the output structure for each feature
        results = []
        for idx, feature_data in enumerate(filtered_features, 1):
            original_feature_name = feature_data.get('原始特征名称', '')
            logger.info(f"\n[{idx}/{len(filtered_features)}] Processing: {original_feature_name}")
            top3_matches = feature_data.get('top3匹配信息', [])

            # Extract the three base words
            base_words = [match.get('人设特征名称', '') for match in top3_matches[:3]]
            logger.info(f"  Base words: {', '.join(base_words)}")

            # All base words share the same candidate list
            high_similarity_by_base = {}
            for base_word in base_words:
                if base_word:
                    high_similarity_by_base[base_word] = global_candidates.copy()
            logger.info(f"  Assigned {len(global_candidates)} candidates to each base word")

            result = {
                '原始特征名称': original_feature_name,
                '来源层级': feature_data.get('来源层级', ''),  # keep metadata
                '权重': feature_data.get('权重', 0),  # keep metadata
                'top3匹配信息': top3_matches,
                '找到的关联_按base_word': {},  # the new approach needs no association analysis
                '高相似度候选_按base_word': high_similarity_by_base
            }
            results.append(result)

        # Save the results
        output_path = os.path.join(self.output_dir, 'stage3_high_similarity_how_based.json')
        self._save_json(results, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stages 2-3 (new approach) complete")
        logger.info(f"  Candidates extracted: {len(global_candidates)}")
        logger.info(f"  Features processed: {len(results)}")
        logger.info("=" * 60)
        return results
    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
        """
        Collect every classification name and tag found in Stage 2 into a scope
        set (kept for compatibility with the legacy method).

        Args:
            feature_result: Feature result data.

        Returns:
            A set containing all classification names and tags.
        """
        associations = feature_result.get('找到的关联', [])
        return self._collect_scope_from_associations(associations)
    def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
        """
        Look up a feature list by path.

        Args:
            target_classification: Target classification path.

        Returns:
            The feature list.
        """
        node = self._navigate_to_node(target_classification)
        if not node:
            return []
        features = node.get('特征列表', [])
        # Deep-copy so callers can mutate the result safely
        return copy.deepcopy(features)
    # ========== Stage 4: multi-word combinations + LLM evaluation ==========
    def stage4_generate_and_evaluate_search_words(
        self,
        features_data: List[Dict[str, Any]],
        max_workers: int = 4,
        max_candidates: int = 20,
        max_combo_length: int = 4
    ) -> List[Dict[str, Any]]:
        """
        Stage 4: multi-word combinations + LLM evaluation.

        From the Stage 1 base words and the Stage 3 high-similarity candidates,
        generate all 2-to-N word combinations and let the LLM pick the top 10.

        Args:
            features_data: Stage 3 data (with high-similarity candidates).
            max_workers: Number of original features evaluated concurrently (default 4).
            max_candidates: Max candidate words entering combination (default 20).
            max_combo_length: Max words per combination (default 4: base word + 3 candidates).

        Returns:
            The data with LLM evaluations attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 4: multi-word combinations + LLM evaluation")
        logger.info(f"  Max candidates: {max_candidates}")
        logger.info(f"  Max combination length: {max_combo_length} words")
        logger.info(f"  Concurrency: {max_workers} original features")
        logger.info("=" * 60)

        total_features = len(features_data)

        # Process different original features in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            futures = []
            for idx, feature_result in enumerate(features_data, 1):
                future = executor.submit(
                    self._process_single_feature_combinations,
                    idx,
                    total_features,
                    feature_result,
                    max_candidates,
                    max_combo_length
                )
                futures.append((future, feature_result))
            # Wait for completion; results are written back into feature_result
            for future, feature_result in futures:
                try:
                    _ = future.result()
                except Exception as e:
                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")

        # Save the results
        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 4 complete")
        logger.info("=" * 60)
        return features_data
    def _process_single_feature_combinations(
        self,
        idx: int,
        total: int,
        feature_result: Dict[str, Any],
        max_candidates: int,
        max_combo_length: int
    ) -> None:
        """
        Generate and evaluate combinations for a single original feature.

        Improvement: each base word uses its own candidates rather than a
        shared pool.

        Steps:
        1. Get the top-3 base words from Stage 1's top3匹配信息.
        2. For each base word:
           a. Get candidates from Stage 3's 高相似度候选_按base_word.
           b. Generate combinations.
           c. Run LLM evaluation.
           d. Select the top 10.
        3. Save the grouped results.

        Args:
            idx: Feature index.
            total: Total feature count.
            feature_result: Feature result data.
            max_candidates: Max candidate words entering combination.
            max_combo_length: Max words per combination.
        """
        original_feature = feature_result['原始特征名称']
        logger.info(f"\n[{idx}/{total}] Processing: {original_feature}")

        # Step 1: get the top-3 base words
        top3_info = feature_result.get('top3匹配信息', [])
        if not top3_info:
            logger.info("  No top-3 match info, skipping")
            feature_result['组合评估结果_分组'] = []
            return
        logger.info(f"  Found {len(top3_info)} base words")

        # Step 2: get the per-base-word candidates
        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
        if not candidates_by_base_word:
            logger.warning("  No per-base-word candidates, skipping")
            feature_result['组合评估结果_分组'] = []
            return

        # Step 3: process each base word independently
        grouped_results = []
        for base_idx, base_info in enumerate(top3_info, 1):
            base_word = base_info.get('人设特征名称', '')
            base_similarity = base_info.get('相似度', 0)
            if not base_word:
                continue
            logger.info(f"  [{base_idx}/{len(top3_info)}] Base word: {base_word} (similarity: {base_similarity:.3f})")

            # Get this base word's candidates
            base_candidates = candidates_by_base_word.get(base_word, [])
            candidates = base_candidates[:max_candidates]
            candidate_words = [c['人设特征名称'] for c in candidates]
            if not candidate_words:
                logger.warning("    No candidates for this base word, skipping")
                grouped_results.append({
                    'base_word': base_word,
                    'base_word_similarity': base_similarity,
                    'base_word_info': base_info,
                    'top10_searches': [],
                    'available_words': []
                })
                continue
            logger.info(f"    Candidate count: {len(candidate_words)} (cap: {max_candidates})")

            # Generate queries with the LLM (new approach: let the LLM build
            # queries directly from the candidate words)
            logger.info(f"    Generating queries with the LLM (base word: {base_word})...")
            evaluated = self.llm_evaluator.generate_queries_from_candidates(
                original_feature=original_feature,
                base_word=base_word,
                candidate_words=candidate_words,
                max_queries=10
            )

            # Take the top 10 (the generator already caps the count)
            top_10 = evaluated[:10]
            logger.info(f"    Generation done, {len(top_10)} queries")

            # Save the grouped result; each base word keeps its own available_words
            grouped_results.append({
                'base_word': base_word,
                'base_word_similarity': base_similarity,
                'base_word_info': base_info,
                'top10_searches': top_10,
                'available_words': candidate_words  # this base word's own candidates
            })

        # Write the results back
        feature_result['组合评估结果_分组'] = grouped_results
        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
        logger.info(f"  Done! {len(grouped_results)} base words, {total_searches} search words")
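    # Assumed shape of generate_queries_from_candidates() output (inferred from
    # how Stage 5 reads the items; the values are hypothetical):
    # [{'search_word': '熬夜加班 黑眼圈', 'score': 0.92, ...}, ...]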
    # ========== Stage 5: execute searches ==========
    def _execute_single_search(
        self,
        idx: int,
        total: int,
        search_word: str,
        feature_ref: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Execute a single search task (used for concurrent execution).

        Args:
            idx: Search index.
            total: Total search count.
            search_word: Search word.
            feature_ref: Feature reference the result is written into.

        Returns:
            Search result info.
        """
        logger.info(f"[{idx}/{total}] Searching: {search_word}")
        try:
            result = self.search_client.search(
                keyword=search_word,
                content_type='不限',
                sort_type='综合',
                max_retries=3,
                use_cache=True  # enable the search cache
            )
            note_count = len(result.get('data', {}).get('data', []))
            logger.info(f"  ✓ Success, got {note_count} notes")
            # Write the result back
            feature_ref['search_result'] = result
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'success',
                'note_count': note_count,
                'search_params': {
                    'keyword': search_word,
                    'content_type': '不限',  # record the parameters actually used above
                    'sort_type': '综合'
                }
            }
            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
        except Exception as e:
            logger.error(f"  ✗ Failed: {e}")
            feature_ref['search_result'] = None
            feature_ref['search_metadata'] = {
                'searched_at': datetime.now().isoformat(),
                'status': 'failed',
                'note_count': 0,
                'error': str(e)
            }
            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
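    # Assumed shape of the XiaohongshuSearch response (inferred from the
    # result.get('data', {}).get('data', []) accesses throughout this file;
    # the note fields are hypothetical):
    # {'data': {'data': [{<note fields, e.g. title/desc/images>}, ...]}}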
    def stage5_execute_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0,
        top_n: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Stage 5: run the Xiaohongshu searches.

        Args:
            features_data: Stage 4 data.
            search_delay: Delay between searches.
            top_n: Keep the N highest-scored search words per original feature.

        Returns:
            The data with search results attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 5: running Xiaohongshu searches")
        logger.info("=" * 60)

        # Collect search words grouped by original feature
        # (read from Stage 4's 组合评估结果_分组)
        feature_search_groups = {}
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            if original_feature not in feature_search_groups:
                feature_search_groups[original_feature] = []

            # Read from Stage 4's grouped results (new structure)
            grouped_results = feature_result.get('组合评估结果_分组', [])
            if grouped_results:
                # Grouped structure: run every base word's top 10
                for group in grouped_results:
                    base_word = group.get('base_word', '')
                    base_similarity = group.get('base_word_similarity', 0)
                    base_word_searches = []
                    for eval_item in group.get('top10_searches', []):
                        sw = eval_item.get('search_word')
                        if not sw:
                            continue
                        score = eval_item.get('score', 0.0)
                        base_word_searches.append({
                            'search_word': sw,
                            'score': score,
                            'base_word': base_word,
                            'base_word_similarity': base_similarity,
                            'feature_ref': eval_item  # reference used to write back the search result
                        })
                    # Apply the per-base-word search cap
                    if self.max_searches_per_base_word and len(base_word_searches) > self.max_searches_per_base_word:
                        logger.info(f"  Applying base-word cap: {base_word} reduced from "
                                    f"{len(base_word_searches)} to {self.max_searches_per_base_word}")
                        base_word_searches = base_word_searches[:self.max_searches_per_base_word]
                    feature_search_groups[original_feature].extend(base_word_searches)
            else:
                # Compatibility with the legacy structure (组合评估结果)
                for eval_item in feature_result.get('组合评估结果', []):
                    sw = eval_item.get('search_word')
                    if not sw:
                        continue
                    score = eval_item.get('score', 0.0)
                    feature_search_groups[original_feature].append({
                        'search_word': sw,
                        'score': score,
                        'feature_ref': eval_item
                    })

            # Apply the per-feature search cap
            if self.max_searches_per_feature and len(feature_search_groups[original_feature]) > self.max_searches_per_feature:
                logger.info(f"  Applying feature cap: {original_feature} reduced from "
                            f"{len(feature_search_groups[original_feature])} to {self.max_searches_per_feature}")
                feature_search_groups[original_feature] = feature_search_groups[original_feature][:self.max_searches_per_feature]

        # Collect all search tasks (with the grouped structure, every base
        # word's top 10 is executed without further filtering)
        all_searches = []
        total_count = 0
        for original_feature, search_list in feature_search_groups.items():
            total_count += len(search_list)
            all_searches.extend(search_list)
            logger.info(f"  {original_feature}: {len(search_list)} search words")

        # Apply the global search cap
        if self.max_total_searches and len(all_searches) > self.max_total_searches:
            logger.info(f"  Applying global cap: reduced from {len(all_searches)} to {self.max_total_searches}")
            all_searches = all_searches[:self.max_total_searches]

        logger.info(f"\n{len(all_searches)} search tasks in total")
        logger.info(f"  Running searches concurrently (workers: {self.search_max_workers})")

        # Run the searches with a ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
            # Submit all search tasks
            futures = []
            for idx, item in enumerate(all_searches, 1):
                future = executor.submit(
                    self._execute_single_search,
                    idx,
                    len(all_searches),
                    item['search_word'],
                    item['feature_ref']
                )
                futures.append(future)
            # Wait for all searches to finish
            for future in as_completed(futures):
                try:
                    result = future.result()
                    # The result has already been written into feature_ref
                except Exception as e:
                    logger.error(f"  Search task failed: {e}")

        # Save the results
        output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 5 complete")
        logger.info("=" * 60)
        return features_data
    # ========== Stage 6: LLM evaluation of search results ==========
    def stage6_evaluate_search_results(
        self,
        features_data: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Stage 6: evaluate search results with the LLM (multimodal).

        Args:
            features_data: Stage 5 data.

        Returns:
            The data with result evaluations attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 6: LLM evaluation of search results")
        logger.info("=" * 60)

        # Collect all feature nodes that need evaluation
        features_to_evaluate = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    if feature.get('search_result') and feature['search_metadata']['status'] == 'success':
                        features_to_evaluate.append({
                            'original_feature': original_feature,
                            'feature_node': feature
                        })
        logger.info(f"{len(features_to_evaluate)} search results to evaluate")

        # Evaluate in parallel (moderate concurrency)
        with ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            for item in features_to_evaluate:
                future = executor.submit(
                    self._evaluate_single_search_result,
                    item['original_feature'],
                    item['feature_node']
                )
                futures.append((future, item))
            # Collect the results
            for idx, (future, item) in enumerate(futures, 1):
                try:
                    evaluation = future.result()
                    item['feature_node']['result_evaluation'] = evaluation
                    logger.info(f"  [{idx}/{len(futures)}] {item['feature_node']['search_word']}: "
                                f"relevance={evaluation['overall_relevance']:.3f}")
                except Exception as e:
                    logger.error(f"  Evaluation failed: {item['feature_node']['search_word']}, error: {e}")
                    item['feature_node']['result_evaluation'] = None

        # Save the results
        output_path = os.path.join(self.output_dir, "stage6_with_evaluations.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 6 complete")
        logger.info("=" * 60)
        return features_data
    def _evaluate_single_search_result(
        self,
        original_feature: str,
        feature_node: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Evaluate a single search result (parallel per-note evaluation).

        Args:
            original_feature: Original feature.
            feature_node: Feature node.

        Returns:
            The evaluation result.
        """
        search_word = feature_node.get('search_word', '')
        notes = feature_node['search_result'].get('data', {}).get('data', [])
        return self.llm_evaluator.evaluate_search_results_parallel(
            original_feature=original_feature,
            search_word=search_word,
            notes=notes,
            max_notes=20,
            max_workers=20  # 20 concurrent per-note evaluations
        )
    def stage6_evaluate_search_results_with_filter(
        self,
        features_data: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Stage 6: evaluate search results with the LLM (two-layer filtering).

        Walks all search results and applies a two-layer evaluation:
        1. Layer 1: filter out results irrelevant to the search query.
        2. Layer 2: score the match against the target feature
           (0.8-1.0 / 0.6-0.79 / 0.5-0.59 / <=0.4).

        Args:
            features_data: Stage 5 data.

        Returns:
            The data with evaluation results attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 6: LLM evaluation of search results (two-layer filtering)")
        logger.info(f"  Concurrency: {self.stage6_max_workers}")
        logger.info(f"  Max notes evaluated per search: {self.stage6_max_notes}")
        logger.info("=" * 60)

        # Collect all search items that need evaluation
        search_items_to_evaluate = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            # Read search results from the grouped Stage 4 structure
            grouped_results = feature_result.get('组合评估结果_分组', [])
            if grouped_results:
                for group in grouped_results:
                    for eval_item in group.get('top10_searches', []):
                        # Check that a successful search result exists
                        if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
                            search_items_to_evaluate.append({
                                'original_feature': original_feature,
                                'search_item': eval_item,
                                'base_word': group.get('base_word', '')
                            })
            else:
                # Compatibility with the legacy structure
                for eval_item in feature_result.get('组合评估结果', []):
                    if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
                        search_items_to_evaluate.append({
                            'original_feature': original_feature,
                            'search_item': eval_item,
                            'base_word': ''
                        })
        logger.info(f"{len(search_items_to_evaluate)} search results to evaluate")

        # Evaluate all search results in parallel
        with ThreadPoolExecutor(max_workers=self.stage6_max_workers) as executor:
            futures = []
            for idx, item in enumerate(search_items_to_evaluate, 1):
                future = executor.submit(
                    self._evaluate_single_search_with_filter,
                    idx,
                    len(search_items_to_evaluate),
                    item['original_feature'],
                    item['search_item'],
                    item['base_word']
                )
                futures.append((future, item))
            # Collect the results
            success_count = 0
            failed_count = 0
            for future, item in futures:
                try:
                    evaluation = future.result()
                    item['search_item']['evaluation_with_filter'] = evaluation
                    success_count += 1
                except Exception as e:
                    logger.error(f"  Evaluation failed: {item['search_item'].get('search_word', 'unknown')}, error: {e}")
                    item['search_item']['evaluation_with_filter'] = None
                    failed_count += 1
        logger.info(f"\nEvaluation finished: {success_count} succeeded, {failed_count} failed")

        # Save the results
        output_path = os.path.join(self.output_dir, "stage6_with_evaluations.json")
        self._save_json(features_data, output_path)
        logger.info("\n" + "=" * 60)
        logger.info("Stage 6 complete")
        logger.info("=" * 60)
        return features_data
    def _evaluate_single_search_with_filter(
        self,
        idx: int,
        total: int,
        original_feature: str,
        search_item: Dict[str, Any],
        base_word: str
    ) -> Dict[str, Any]:
        """
        Evaluate a single search result (two-layer filtering).

        Args:
            idx: Index.
            total: Total count.
            original_feature: Original feature.
            search_item: Search item (contains search_word and search_result).
            base_word: Base word.

        Returns:
            The evaluation result.
        """
        search_word = search_item.get('search_word', '')
        notes = search_item['search_result'].get('data', {}).get('data', [])
        logger.info(f"[{idx}/{total}] Evaluating: {search_word} (notes: {len(notes)})")

        # Run the evaluator's batch evaluation
        evaluation = self.llm_evaluator.batch_evaluate_notes_with_filter(
            search_query=search_word,
            target_feature=original_feature,
            notes=notes,
            max_notes=self.stage6_max_notes,
            max_workers=self.stage6_max_workers
        )

        # Log summary statistics
        filtered_count = evaluation.get('filtered_count', 0)
        evaluated_count = evaluation.get('evaluated_count', 0)
        match_dist = evaluation.get('match_distribution', {})
        logger.info(f"  ✓ Done: filtered {filtered_count}, evaluated {evaluated_count}, "
                    f"full matches {match_dist.get('完全匹配(0.8-1.0)', 0)}, "
                    f"similar matches {match_dist.get('相似匹配(0.6-0.79)', 0)}")
        return evaluation
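    # Assumed shape of the batch_evaluate_notes_with_filter() result (inferred
    # from the .get() accesses above; the counts are hypothetical):
    # {
    #     'filtered_count': 4,
    #     'evaluated_count': 16,
    #     'match_distribution': {'完全匹配(0.8-1.0)': 3, '相似匹配(0.6-0.79)': 7, ...}
    # }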
    # ========== Stage 7: Extended searches ==========

    def stage7_extended_searches(
        self,
        features_data: List[Dict[str, Any]],
        search_delay: float = 2.0
    ) -> List[Dict[str, Any]]:
        """
        Stage 7: Run extended searches based on the evaluation results
        (possibly several per feature).

        Args:
            features_data: Data produced by stage 6.
            search_delay: Delay between searches, in seconds.

        Returns:
            The data with extended search results attached.
        """
        logger.info("=" * 60)
        logger.info("Stage 7: Extended searches")
        logger.info("=" * 60)

        # Collect the extension tasks
        extension_tasks = []
        for feature_result in features_data:
            original_feature = feature_result['原始特征名称']
            for assoc in feature_result.get('找到的关联', []):
                for feature in assoc.get('特征列表', []):
                    result_eval = feature.get('result_evaluation')
                    if not result_eval:
                        continue
                    extracted_elements = result_eval.get('extracted_elements', [])
                    if not extracted_elements:
                        continue
                    # Create one extended search per extracted element
                    base_search_word = feature.get('search_word', '')
                    for element in extracted_elements:
                        extended_keyword = f"{base_search_word} {element}"
                        extension_tasks.append({
                            'extended_keyword': extended_keyword,
                            'original_feature': original_feature,
                            'feature_node': feature,
                            'element': element
                        })

        logger.info(f"Collected {len(extension_tasks)} extended-search tasks")

        # Execute the extended searches
        for idx, task in enumerate(extension_tasks, 1):
            extended_kw = task['extended_keyword']
            logger.info(f"[{idx}/{len(extension_tasks)}] Extended search: {extended_kw}")
            try:
                result = self.search_client.search(
                    keyword=extended_kw,
                    content_type='不限',
                    sort_type='综合',
                    max_retries=3,
                    use_cache=True  # enable the search cache
                )
                note_count = len(result.get('data', {}).get('data', []))
                logger.info(f"  ✓ Success, fetched {note_count} notes")

                # Evaluate the extended search results
                logger.info("  Evaluating extended search results...")
                evaluation = self.llm_evaluator.evaluate_search_results(
                    original_feature=task['original_feature'],
                    search_word=extended_kw,
                    notes=result.get('data', {}).get('data', []),
                    max_notes=20,
                    max_images_per_note=2
                )

                # Attach the extended search result to its feature node
                feature_node = task['feature_node']
                if 'extended_searches' not in feature_node:
                    feature_node['extended_searches'] = []
                feature_node['extended_searches'].append({
                    'extended_keyword': extended_kw,
                    'based_on_element': task['element'],
                    'search_result': result,
                    'search_metadata': {
                        'searched_at': datetime.now().isoformat(),
                        'status': 'success',
                        'note_count': note_count
                    },
                    'result_evaluation': evaluation
                })
                logger.info(f"  Evaluation done, relevance={evaluation.get('overall_relevance', 0.0):.3f}")
            except Exception as e:
                logger.error(f"  ✗ Failed: {e}")

            # Throttle between searches
            if idx < len(extension_tasks):
                time.sleep(search_delay)

        # Save the results
        output_path = os.path.join(self.output_dir, "stage7_final_results.json")
        self._save_json(features_data, output_path)

        logger.info("\n" + "=" * 60)
        logger.info("Stage 7 complete")
        logger.info("=" * 60)
        return features_data
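
    # Keyword expansion above is a plain space-join of the base search word
    # and one extracted element. For example (values are illustrative, not
    # from any real run): base_search_word '露营 穿搭' with extracted element
    # '防晒' yields the extended keyword '露营 穿搭 防晒'.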
    # ========== Main pipeline ==========

    def run_full_pipeline(self):
        """Run the full pipeline."""
        logger.info("\n" + "=" * 60)
        logger.info("Starting full pipeline")
        logger.info("=" * 60)
        try:
            # Stage-7-only mode: resume from saved Stage 6 results
            if self.stage7_only:
                logger.info("Run mode: Stage 7 only (starting from Stage 6 results)")
                stage6_path = os.path.join(self.output_dir, "stage6_with_evaluations.json")
                if not os.path.exists(stage6_path):
                    raise FileNotFoundError(f"Stage 6 results not found: {stage6_path}")
                with open(stage6_path, 'r', encoding='utf-8') as f:
                    stage6_results = json.load(f)
                stage7_results = self.stage7_analyzer.run(stage6_results)
                return stage7_results

            # Normal flow: start from Stage 1
            stage1_results = self.stage1_filter_features()

            # Stages 2-3: pick the candidate-word source per combination_source
            if self.combination_source == "how_based":
                # New approach: extract candidate words directly from the how
                # file (skips Stage 2 and emits Stage 3-format data directly)
                logger.info(f"\nCombination-word source: {self.combination_source} (new approach)")
                stage3_results = self.stage23_extract_candidates_from_how(stage1_results)
            else:
                # Legacy approach: association-based analysis
                logger.info(f"\nCombination-word source: {self.combination_source} (legacy approach)")
                # Stage 2
                stage2_results = self.stage2_find_associations(stage1_results)
                # Stage 3
                stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)

            # Stage 4
            stage4_results = self.stage4_generate_and_evaluate_search_words(
                stage3_results,
                max_workers=8,      # concurrency raised from 4 to 8
                max_combo_length=3  # combo length lowered from 4 to 3
            )

            # Stage 5
            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)

            # Stage 6 - conditional (uses the new two-layer filtered evaluation)
            if self.enable_stage6:
                stage6_results = self.stage6_evaluate_search_results_with_filter(stage5_results)
            else:
                stage6_results = stage5_results
                logger.info("\n" + "=" * 60)
                logger.info("Stage 6: skipped (not enabled)")
                logger.info("=" * 60)

            # Stage 7 - deep deconstruction analysis (conditional)
            if self.enable_stage7:
                stage7_results = self.stage7_analyzer.run(stage6_results)
                final_results = stage7_results
            else:
                final_results = stage6_results

            logger.info("\n" + "=" * 60)
            if self.enable_stage7:
                logger.info("✓ Full pipeline finished (Stages 1-7)")
            elif self.enable_stage6:
                logger.info("✓ Full pipeline finished (Stages 1-6)")
            else:
                logger.info("✓ Full pipeline finished (Stages 1-5)")
            logger.info("=" * 60)

            # Generate visualizations automatically
            logger.info("\n" + "=" * 60)
            logger.info("Generating visualizations...")
            logger.info("=" * 60)
            try:
                # Pick the visualization script based on whether Stage 6 ran
                viz_script = 'visualize_stage6_results.py' if self.enable_stage6 else 'visualize_stage5_results.py'
                logger.info(f"  Using visualization script: {viz_script}")
                result = subprocess.run(
                    ['python3', viz_script],
                    capture_output=True,
                    text=True,
                    timeout=60
                )
                if result.returncode == 0:
                    logger.info("✓ Visualization generated")
                    logger.info(result.stdout)
                else:
                    logger.error(f"Visualization failed: {result.stderr}")
            except subprocess.TimeoutExpired:
                logger.error("Visualization timed out")
            except Exception as e:
                logger.error(f"Visualization error: {e}")

            return final_results
        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            raise
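

# A minimal programmatic-usage sketch. It assumes the constructor parameters
# omitted here have defaults; see main() below for the full keyword set, which
# mirrors the CLI flags one-to-one:
#
#   system = EnhancedSearchV2(
#       how_json_path='690d977d0000000007036331_how.json',
#       output_dir='output_v2',
#       enable_stage6=True,
#   )
#   final_results = system.run_full_pipeline()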
def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(description='Enhanced search system V2')
    parser.add_argument(
        '--how-json',
        default='690d977d0000000007036331_how.json',
        help='Path to the how-deconstruction file'
    )
    parser.add_argument(
        '--dimension-associations',
        default='dimension_associations_analysis.json',
        help='Path to the cross-dimension associations file'
    )
    parser.add_argument(
        '--intra-associations',
        default='intra_dimension_associations_analysis.json',
        help='Path to the intra-dimension associations file'
    )
    parser.add_argument(
        '--optimized-clustered',
        default='optimized_clustered_data_gemini-3-pro-preview.json',
        help='Path to the persona feature library'
    )
    parser.add_argument(
        '--api-key',
        default=None,
        help='OpenRouter API key (defaults to the environment variable)'
    )
    parser.add_argument(
        '--output-dir',
        default='output_v2',
        help='Output directory'
    )
    parser.add_argument(
        '--top-n',
        type=int,
        default=10,
        help='Keep the N highest-scored search words per original feature (default 10)'
    )
    parser.add_argument(
        '--max-total-searches',
        type=int,
        default=None,
        help='Global cap on the total number of searches (default None, unlimited)'
    )
    parser.add_argument(
        '--search-workers',
        type=int,
        default=3,
        help='Number of concurrent search workers (default 3)'
    )
    parser.add_argument(
        '--max-searches-per-feature',
        type=int,
        default=None,
        help='Maximum searches per original feature (default None, unlimited)'
    )
    parser.add_argument(
        '--max-searches-per-base-word',
        type=int,
        default=None,
        help='Maximum searches per base_word (default None, unlimited)'
    )
    parser.add_argument(
        '--combination-source',
        type=str,
        choices=['how_based', 'association'],
        default='how_based',
        help='Source of combination words (default how_based): how_based = extract candidates with similarity >= 0.8 from the how file (new approach); association = extract candidates from association analysis (legacy approach)'
    )
    parser.add_argument(
        '--enable-stage6',
        action='store_true',
        help='Enable Stage 6 evaluation (default False)'
    )
    parser.add_argument(
        '--stage6-max-workers',
        type=int,
        default=10,
        help='Stage 6 evaluation concurrency (default 10)'
    )
    parser.add_argument(
        '--stage6-max-notes',
        type=int,
        default=20,
        help='Maximum number of notes to evaluate per search result (default 20)'
    )
    parser.add_argument(
        '--enable-stage7',
        action='store_true',
        help='Enable Stage 7 deep deconstruction analysis'
    )
    parser.add_argument(
        '--stage7-only',
        action='store_true',
        help='Run Stage 7 only (starting from saved Stage 6 results)'
    )
    parser.add_argument(
        '--stage7-max-workers',
        type=int,
        default=5,
        help='Stage 7 concurrency (default 5)'
    )
    parser.add_argument(
        '--stage7-max-notes',
        type=int,
        default=None,
        help='Maximum number of exact-match notes Stage 7 will process (default None, unlimited)'
    )
    parser.add_argument(
        '--stage7-skip',
        type=int,
        default=0,
        help='Skip the first N exact-match notes in Stage 7 (default 0)'
    )
    parser.add_argument(
        '--stage7-sort-by',
        type=str,
        choices=['score', 'time', 'engagement'],
        default='score',
        help='Stage 7 sort order: score, time, or engagement'
    )
    parser.add_argument(
        '--stage7-api-url',
        type=str,
        default='http://192.168.245.150:7000/what/analysis/single',
        help='Stage 7 deconstruction API endpoint'
    )
    parser.add_argument(
        '--stage7-min-score',
        type=float,
        default=0.8,
        help='Minimum score threshold for Stage 7 (default 0.8, on a 0-1 scale)'
    )
    args = parser.parse_args()

    # Build the system instance
    system = EnhancedSearchV2(
        how_json_path=args.how_json,
        dimension_associations_path=args.dimension_associations,
        intra_associations_path=args.intra_associations,
        optimized_clustered_data_path=args.optimized_clustered,
        openrouter_api_key=args.api_key,
        output_dir=args.output_dir,
        top_n=args.top_n,
        max_total_searches=args.max_total_searches,
        search_max_workers=args.search_workers,
        max_searches_per_feature=args.max_searches_per_feature,
        max_searches_per_base_word=args.max_searches_per_base_word,
        enable_stage6=args.enable_stage6,
        stage6_max_workers=args.stage6_max_workers,
        stage6_max_notes=args.stage6_max_notes,
        enable_stage7=args.enable_stage7,
        stage7_only=args.stage7_only,
        stage7_max_workers=args.stage7_max_workers,
        stage7_max_notes=args.stage7_max_notes,
        stage7_skip=args.stage7_skip,
        stage7_sort_by=args.stage7_sort_by,
        stage7_api_url=args.stage7_api_url,
        stage7_min_score=args.stage7_min_score
    )

    # Run the full pipeline
    system.run_full_pipeline()


if __name__ == '__main__':
    main()
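
# Example invocations (illustrative; all flags shown are defined in main() above):
#
#   # Full run with Stage 6 evaluation and Stage 7 analysis enabled:
#   python3 enhanced_search_v2.py --enable-stage6 --enable-stage7 --top-n 5
#
#   # Resume Stage 7 from previously saved Stage 6 results:
#   python3 enhanced_search_v2.py --stage7-only --stage7-max-notes 50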