match_inspiration_features.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
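"""
Inspiration-point feature matching script.

Extracts the features of each inspiration point from the deconstruction task
list, matches them against the persona's inspiration features, and scores
every pair with ``compare_phrases`` from ``lib.semantic_similarity``.
"""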
  8. import json
  9. import asyncio
  10. from pathlib import Path
  11. from typing import Dict, List
  12. import sys
  13. # 添加项目根目录到路径
  14. project_root = Path(__file__).parent.parent.parent
  15. sys.path.insert(0, str(project_root))
  16. from lib.semantic_similarity import compare_phrases
  17. # 全局并发限制
  18. MAX_CONCURRENT_REQUESTS = 20
  19. semaphore = None
  20. def get_semaphore():
  21. """获取全局信号量"""
  22. global semaphore
  23. if semaphore is None:
  24. semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
  25. return semaphore
  26. async def match_single_pair(
  27. feature_name: str,
  28. persona_name: str,
  29. model_name: str = None
  30. ) -> Dict:
  31. """
  32. 匹配单个特征对(带并发限制)
  33. Args:
  34. feature_name: 要匹配的特征名称
  35. persona_name: 人设特征名称
  36. model_name: 使用的模型名称
  37. Returns:
  38. 单个匹配结果,格式:
  39. {
  40. "人设特征名称": "xxx",
  41. "匹配结果": {
  42. "相似度": 0.75,
  43. "说明": "..."
  44. }
  45. }
  46. """
  47. sem = get_semaphore()
  48. async with sem:
  49. print(f" 匹配: {feature_name} <-> {persona_name}")
  50. similarity_result = await compare_phrases(
  51. phrase_a=feature_name,
  52. phrase_b=persona_name,
  53. )
  54. return {
  55. "人设特征名称": persona_name,
  56. "匹配结果": similarity_result
  57. }
  58. async def match_feature_with_persona(
  59. feature_name: str,
  60. persona_features: List[Dict],
  61. model_name: str = None
  62. ) -> List[Dict]:
  63. """
  64. 将一个特征与人设特征列表进行匹配(并发执行)
  65. Args:
  66. feature_name: 要匹配的特征名称
  67. persona_features: 人设特征列表
  68. model_name: 使用的模型名称
  69. Returns:
  70. 匹配结果列表
  71. """
  72. # 创建所有匹配任务
  73. tasks = [
  74. match_single_pair(feature_name, persona_feature["特征名称"], model_name)
  75. for persona_feature in persona_features
  76. ]
  77. # 并发执行所有匹配
  78. match_results = await asyncio.gather(*tasks)
  79. return list(match_results)
  80. async def match_single_feature(
  81. feature_name: str,
  82. persona_features: List[Dict],
  83. model_name: str = None
  84. ) -> Dict:
  85. """
  86. 匹配单个特征与所有人设特征
  87. Args:
  88. feature_name: 特征名称
  89. persona_features: 人设特征列表
  90. model_name: 使用的模型名称
  91. Returns:
  92. 特征匹配结果
  93. """
  94. print(f" 特征: {feature_name}")
  95. match_results = await match_feature_with_persona(
  96. feature_name=feature_name,
  97. persona_features=persona_features,
  98. model_name=model_name
  99. )
  100. return {
  101. "特征名称": feature_name,
  102. "匹配结果": match_results
  103. }
  104. async def process_single_inspiration_point(
  105. inspiration_point: Dict,
  106. persona_features: List[Dict],
  107. model_name: str = None
  108. ) -> Dict:
  109. """
  110. 处理单个灵感点的特征匹配(并发执行)
  111. Args:
  112. inspiration_point: 灵感点数据
  113. persona_features: 人设灵感特征列表
  114. model_name: 使用的模型名称
  115. Returns:
  116. 包含 how 步骤列表的灵感点数据
  117. """
  118. point_name = inspiration_point.get("名称", "")
  119. feature_list = inspiration_point.get("特征列表", [])
  120. print(f" 处理灵感点: {point_name}")
  121. print(f" 特征数量: {len(feature_list)}")
  122. # 并发匹配所有特征
  123. tasks = [
  124. match_single_feature(feature_name, persona_features, model_name)
  125. for feature_name in feature_list
  126. ]
  127. feature_match_results = await asyncio.gather(*tasks)
  128. # 构建 how 步骤
  129. how_step = {
  130. "步骤名称": "灵感特征分别匹配人设特征",
  131. "特征列表": list(feature_match_results)
  132. }
  133. # 返回更新后的灵感点
  134. result = inspiration_point.copy()
  135. result["how步骤列表"] = [how_step]
  136. return result
  137. async def process_single_task(
  138. task: Dict,
  139. task_index: int,
  140. total_tasks: int,
  141. persona_inspiration_features: List[Dict],
  142. model_name: str = None
  143. ) -> Dict:
  144. """
  145. 处理单个任务
  146. Args:
  147. task: 任务数据
  148. task_index: 任务索引(从1开始)
  149. total_tasks: 总任务数
  150. persona_inspiration_features: 人设灵感特征列表
  151. model_name: 使用的模型名称
  152. Returns:
  153. 包含 how 解构结果的任务
  154. """
  155. post_id = task.get("帖子id", "")
  156. print(f"\n处理任务 [{task_index}/{total_tasks}]: {post_id}")
  157. # 获取灵感点列表
  158. what_result = task.get("what解构结果", {})
  159. inspiration_list = what_result.get("灵感点列表", [])
  160. print(f" 灵感点数量: {len(inspiration_list)}")
  161. # 并发处理所有灵感点
  162. tasks = [
  163. process_single_inspiration_point(
  164. inspiration_point=inspiration_point,
  165. persona_features=persona_inspiration_features,
  166. model_name=model_name
  167. )
  168. for inspiration_point in inspiration_list
  169. ]
  170. updated_inspiration_list = await asyncio.gather(*tasks)
  171. # 构建 how 解构结果
  172. how_result = {
  173. "灵感点列表": list(updated_inspiration_list)
  174. }
  175. # 更新任务
  176. updated_task = task.copy()
  177. updated_task["how解构结果"] = how_result
  178. return updated_task
  179. async def process_task_list(
  180. task_list: List[Dict],
  181. persona_features_dict: Dict,
  182. model_name: str = None
  183. ) -> List[Dict]:
  184. """
  185. 处理整个解构任务列表(并发执行)
  186. Args:
  187. task_list: 解构任务列表
  188. persona_features_dict: 人设特征字典(包含灵感点、目的点、关键点)
  189. model_name: 使用的模型名称
  190. Returns:
  191. 包含 how 解构结果的任务列表
  192. """
  193. persona_inspiration_features = persona_features_dict.get("灵感点", [])
  194. print(f"人设灵感特征数量: {len(persona_inspiration_features)}")
  195. # 并发处理所有任务
  196. tasks = [
  197. process_single_task(
  198. task=task,
  199. task_index=i,
  200. total_tasks=len(task_list),
  201. persona_inspiration_features=persona_inspiration_features,
  202. model_name=model_name
  203. )
  204. for i, task in enumerate(task_list, 1)
  205. ]
  206. updated_task_list = await asyncio.gather(*tasks)
  207. return list(updated_task_list)
  208. async def main():
  209. """主函数"""
  210. # 输入输出路径
  211. script_dir = Path(__file__).parent
  212. project_root = script_dir.parent.parent
  213. data_dir = project_root / "data" / "data_1118"
  214. task_list_file = data_dir / "当前帖子_解构任务列表.json"
  215. persona_features_file = data_dir / "特征名称_帖子来源.json"
  216. output_dir = data_dir / "当前帖子_how解构结果"
  217. # 创建输出目录
  218. output_dir.mkdir(parents=True, exist_ok=True)
  219. print(f"读取解构任务列表: {task_list_file}")
  220. with open(task_list_file, "r", encoding="utf-8") as f:
  221. task_list_data = json.load(f)
  222. print(f"读取人设特征: {persona_features_file}")
  223. with open(persona_features_file, "r", encoding="utf-8") as f:
  224. persona_features_data = json.load(f)
  225. # 获取任务列表
  226. task_list = task_list_data.get("解构任务列表", [])
  227. print(f"\n总任务数: {len(task_list)}")
  228. # 处理任务列表
  229. updated_task_list = await process_task_list(
  230. task_list=task_list,
  231. persona_features_dict=persona_features_data,
  232. model_name=None # 使用默认模型
  233. )
  234. # 分文件保存结果
  235. print(f"\n保存结果到: {output_dir}")
  236. for task in updated_task_list:
  237. post_id = task.get("帖子id", "unknown")
  238. output_file = output_dir / f"{post_id}_how.json"
  239. print(f" 保存: {output_file.name}")
  240. with open(output_file, "w", encoding="utf-8") as f:
  241. json.dump(task, f, ensure_ascii=False, indent=4)
  242. print("\n完成!")
  243. # 打印统计信息
  244. total_inspiration_points = sum(
  245. len(task["how解构结果"]["灵感点列表"])
  246. for task in updated_task_list
  247. )
  248. total_features = sum(
  249. len(point["特征列表"])
  250. for task in updated_task_list
  251. for point in task["how解构结果"]["灵感点列表"]
  252. )
  253. print(f"\n统计:")
  254. print(f" 处理的帖子数: {len(updated_task_list)}")
  255. print(f" 处理的灵感点数: {total_inspiration_points}")
  256. print(f" 处理的特征数: {total_features}")
  257. if __name__ == "__main__":
  258. asyncio.run(main())