data_converter.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Data format conversion module.

Converts API input into the pipeline format, and pipeline output into the
API response format.
"""
import logging
from typing import Dict, List, Any, Optional

logger = logging.getLogger(__name__)


def convert_api_input_to_pipeline_format(
    original_target: str,
    persona_features: List[Dict[str, str]],
    candidate_words: List[str]
) -> List[Dict[str, Any]]:
    """
    Convert API input into the format the pipeline expects (simulating the
    output of stages 1-2).

    Args:
        original_target: original target name
        persona_features: list of persona features, each containing persona_feature_name
        candidate_words: list of candidate words

    Returns:
        A list of records in the candidate_words.json format.
    """
    # Build the top-3 match info structure
    top3_match_info = []
    for idx, persona_feature in enumerate(persona_features[:3]):  # take at most 3
        persona_feature_name = persona_feature.get('persona_feature_name', '')
        if not persona_feature_name:
            continue
        # Build match info (simulating the stage-1 output format)
        match_info = {
            '人设特征名称': persona_feature_name,
            '人设特征层级': '',  # not present in the API input, left empty
            '特征类型': '',  # not present in the API input, left empty
            '特征分类': [],  # not present in the API input, left empty
            '相似度': 0.75 - idx * 0.05,  # simulated similarity; the first entry is highest
            '匹配说明': '',  # not present in the API input, left empty
            '是分类': False,  # not present in the API input, defaults to False
            '所属分类路径': ''  # not present in the API input, left empty
        }
        top3_match_info.append(match_info)
    if not top3_match_info:
        logger.warning("No valid persona features; cannot build match info")
        return []
    # Build the high-similarity candidate structure (simulating the stage-2 output
    # format) by converting the candidate words into the format the pipeline expects
    global_candidates = []
    for candidate_word in candidate_words:
        candidate_item = {
            '候选词': candidate_word,
            '候选词类型': 'persona',  # mark as a persona candidate word
            '相似度': 1.0,  # the API input carries no similarity, default to 1.0
            '特征类型': '',
            '特征分类': [],
            '人设特征层级': '',
            '来源层级': 'persona',
            '来源路径': '',
            '匹配说明': '',
            '来源原始特征': original_target
        }
        global_candidates.append(candidate_item)
    # Build the 高相似度候选_按base_word structure;
    # every base_word shares the same candidate word list
    high_similarity_by_base = {}
    for match_info in top3_match_info:
        base_word = match_info['人设特征名称']
        if base_word:
            high_similarity_by_base[base_word] = global_candidates.copy()
    # Build the final record (candidate_words.json format)
    result = {
        '原始特征名称': original_target,
        '来源层级': '',  # not present in the API input, left empty
        '权重': 1.0,  # default weight
        'top3匹配信息': top3_match_info,
        '找到的关联_按base_word': {},  # association analysis is not needed here
        '高相似度候选_按base_word': high_similarity_by_base
    }
    return [result]
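

# A minimal, hedged usage sketch for convert_api_input_to_pipeline_format. The
# target name, persona feature and candidate words below are invented for
# illustration; only the argument names come from the function above. The helper
# is never called at import time.
def _example_convert_api_input() -> None:
    """Illustrative only: shows the expected call and result shape."""
    records = convert_api_input_to_pipeline_format(
        original_target='示例目标',  # hypothetical target
        persona_features=[{'persona_feature_name': '示例特征'}],
        candidate_words=['候选词A', '候选词B']
    )
    # records[0]['top3匹配信息'] holds the simulated stage-1 matches, and
    # records[0]['高相似度候选_按base_word'] maps each feature name to the candidates.
    print(records)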


def convert_pipeline_output_to_api_response(
    pipeline_results: List[Dict[str, Any]],
    original_target: str,
    similarity_results: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Convert pipeline output into the API response format.

    Only search results with a comprehensive score P > 0 are returned.

    Args:
        pipeline_results: the pipeline output (evaluated_results.json format)
        original_target: original target name
        similarity_results: similarity analysis results (optional)

    Returns:
        Data in the API response format.
    """
    # Extract the note_id -> max_similarity mapping
    similarity_map = extract_similarity_map_from_results(similarity_results)
    search_results = []
    # Walk the pipeline results
    for feature_result in pipeline_results:
        feature_name = feature_result.get('原始特征名称', '')
        # Only process the matching target
        if feature_name != original_target:
            continue
        # Read the search results from 组合评估结果_分组
        grouped_results = feature_result.get('组合评估结果_分组', [])
        if not grouped_results:
            continue
        # Walk each base_word's search results
        for group in grouped_results:
            base_word = group.get('base_word', '')
            # Walk the base_word's top-10 search words
            for search_item in group.get('top10_searches', []):
                # Skip items without an evaluation result
                evaluation = search_item.get('evaluation_with_filter')
                if not evaluation:
                    continue
                # Read the comprehensive score P
                comprehensive_score = search_item.get('comprehensive_score', 0)
                comprehensive_score_detail = search_item.get('comprehensive_score_detail')
                # Only return results with P > 0
                if comprehensive_score <= 0:
                    continue
                # Extract matched note information
                matched_notes = []
                notes_evaluation = evaluation.get('notes_evaluation', [])
                search_result_data = search_item.get('search_result', {})
                notes_data = search_result_data.get('data', {}).get('data', [])
                # Walk the evaluations and keep fully matched notes (score >= 0.8)
                for note_eval in notes_evaluation:
                    evaluation_score = note_eval.get('综合得分', 0)
                    if evaluation_score >= 0.8:
                        note_index = note_eval.get('note_index', -1)
                        if 0 <= note_index < len(notes_data):
                            note = notes_data[note_index]
                            note_id = note.get('id', '')
                            # Highest similarity, taken from the similarity analysis results
                            max_similarity = similarity_map.get(note_id, 0)
                            # Compute the contribution
                            contribution = evaluation_score * max_similarity if max_similarity > 0 else evaluation_score
                            # Deconstructed-feature match details for this note
                            matched_features = extract_matched_features_from_results(
                                similarity_results,
                                note_id
                            )
                            matched_note = {
                                'note_id': note_id,
                                'note_title': note.get('note_card', {}).get('display_title') or '',
                                'evaluation_score': round(evaluation_score, 3),
                                'max_similarity': round(max_similarity, 3),
                                'contribution': round(contribution, 3),
                                # Evaluation details
                                'evaluation_reasoning': note_eval.get('评分说明') or '',
                                'key_matching_points': note_eval.get('关键匹配点') or [],
                                'query_relevance': note_eval.get('Query相关性') or '',
                                'query_relevance_explanation': note_eval.get('Query相关性说明') or '',
                                'matched_features': matched_features,
                                'note_data': note  # full search result payload for the note
                            }
                            matched_notes.append(matched_note)
                # Extract source_word and convert it into a list
                source_word_str = search_item.get('source_word', '')
                if source_word_str and isinstance(source_word_str, str):
                    # Split on whitespace and drop empty strings
                    source_words = [word.strip() for word in source_word_str.split() if word.strip()]
                else:
                    source_words = []
                # Build the search result entry
                search_result = {
                    'search_word': search_item.get('search_word', ''),
                    'source_words': source_words,  # source word combination as a list
                    'comprehensive_score': round(comprehensive_score, 3),
                    'comprehensive_score_detail': comprehensive_score_detail or {},
                    'matched_notes': matched_notes
                }
                search_results.append(search_result)
    return {
        'original_target': original_target,
        'search_results': search_results
    }
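

# A hedged sketch of the minimal evaluated_results.json-style input accepted by
# convert_pipeline_output_to_api_response. All IDs, titles and scores below are
# invented for illustration; only the field names mirror the .get() calls above.
# The helper is never called at import time.
def _example_convert_pipeline_output() -> None:
    """Illustrative only: converts a one-note pipeline result without similarity data."""
    pipeline_results = [{
        '原始特征名称': '示例目标',
        '组合评估结果_分组': [{
            'base_word': '示例特征',
            'top10_searches': [{
                'search_word': '示例 搜索词',
                'source_word': '示例 搜索词',
                'comprehensive_score': 0.42,
                'comprehensive_score_detail': {},
                'evaluation_with_filter': {
                    'notes_evaluation': [{'note_index': 0, '综合得分': 0.9}]
                },
                'search_result': {'data': {'data': [
                    {'id': 'note-1', 'note_card': {'display_title': '示例帖子'}}
                ]}}
            }]
        }]
    }]
    response = convert_pipeline_output_to_api_response(pipeline_results, '示例目标')
    # With similarity_results omitted, max_similarity is 0 and the note's
    # contribution falls back to its evaluation score (0.9 here).
    print(response['search_results'])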


def extract_similarity_map_from_results(
    similarity_results: Optional[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Extract a note_id -> max_similarity mapping from the similarity analysis results.

    Args:
        similarity_results: similarity analysis results

    Returns:
        Mapping of note_id -> max_similarity.
    """
    similarity_map = {}
    if not similarity_results:
        return similarity_map
    results_list = similarity_results.get('results', [])
    for result in results_list:
        note_id = result.get('note_id', '')
        statistics = result.get('similarity_statistics', {})
        max_similarity = statistics.get('max_similarity', 0)
        if note_id:
            similarity_map[note_id] = max_similarity
    return similarity_map
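

# A hedged sketch of the similarity_results shape assumed by
# extract_similarity_map_from_results above and extract_matched_features_from_results
# below (field names taken from their .get() calls; the values are made up):
#
#   {
#       'results': [{
#           'note_id': 'note-1',
#           'similarity_statistics': {'max_similarity': 0.87},
#           'deconstructed_features': [{
#               'feature_name': '示例特征',
#               'dimension': '示例维度',
#               'dimension_detail': '',
#               'weight': 0.5,
#               'similarity_score': 0.91
#           }]
#       }]
#   }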


def extract_matched_features_from_results(
    similarity_results: Optional[Dict[str, Any]],
    note_id: str
) -> List[Dict[str, Any]]:
    """
    Extract the deconstructed-feature match details for a given note from the
    similarity analysis results.

    Args:
        similarity_results: similarity analysis results
        note_id: note ID

    Returns:
        List of matched features; each entry contains the feature name,
        dimension, similarity, and so on.
    """
    if not similarity_results or not note_id:
        return []
    results_list = similarity_results.get('results', [])
    for result in results_list:
        if result.get('note_id', '') == note_id:
            # Extract the deconstructed feature list (field name: deconstructed_features)
            deconstructed_features = result.get('deconstructed_features', [])
            matched_features = []
            for feature in deconstructed_features:
                matched_feature = {
                    'feature_name': feature.get('feature_name', ''),
                    'dimension': feature.get('dimension', ''),
                    'dimension_detail': feature.get('dimension_detail', ''),
                    'weight': feature.get('weight', 0),
                    'similarity_score': round(feature.get('similarity_score', 0), 3)
                }
                matched_features.append(matched_feature)
            return matched_features
    return []
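

# A minimal manual smoke run (an assumption: this module is normally imported by
# the API layer, so the guard keeps the illustrative examples above from
# executing on import).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    _example_convert_api_input()
    _example_convert_pipeline_output()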