| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 数据格式转换模块
- 将API输入转换为pipeline格式,将pipeline输出转换为API响应格式
- """
- import logging
- from typing import Dict, List, Any, Optional
- logger = logging.getLogger(__name__)
def _build_candidate_item(candidate_word: str, original_target: str) -> Dict[str, Any]:
    """Build one fresh candidate record (stage-2 output shape) for ``candidate_word``.

    A new dict (with a new nested list) is created on every call so that
    records placed under different base words never share mutable state.
    """
    return {
        '候选词': candidate_word,
        '候选词类型': 'persona',  # marked as a persona candidate
        '相似度': 1.0,  # the API input carries no similarity; default to 1.0
        '特征类型': '',
        '特征分类': [],
        '人设特征层级': '',
        '来源层级': 'persona',
        '来源路径': '',
        '匹配说明': '',
        '来源原始特征': original_target,
    }


def convert_api_input_to_pipeline_format(
    original_target: str,
    persona_features: List[Dict[str, str]],
    candidate_words: List[str]
) -> List[Dict[str, Any]]:
    """
    Convert API input into the structure the pipeline expects.

    The result imitates the output of pipeline stages 1-2 (the
    candidate_words.json layout): one record holding the simulated top-3
    match information and, per base word, the list of candidate words.

    Args:
        original_target: Name of the original target feature.
        persona_features: Persona features; each element is read through its
            ``persona_feature_name`` key. Only the first three elements are
            considered, and elements with an empty name are skipped.
            ``None`` is treated as an empty list.
        candidate_words: Candidate words attached to every base word.
            ``None`` is treated as an empty list.

    Returns:
        A single-element list with the pipeline record, or an empty list
        when none of the considered persona features has a name.
    """
    # Simulated stage-1 output: top-3 match information.
    top3_match_info = []
    for idx, persona_feature in enumerate((persona_features or [])[:3]):
        persona_feature_name = persona_feature.get('persona_feature_name', '')
        if not persona_feature_name:
            continue

        top3_match_info.append({
            '人设特征名称': persona_feature_name,
            '人设特征层级': '',  # not available from the API input
            '特征类型': '',  # not available from the API input
            '特征分类': [],  # not available from the API input
            # Simulated similarity: decreases with the position in the input,
            # so the first feature gets the highest value.
            '相似度': 0.75 - idx * 0.05,
            '匹配说明': '',  # not available from the API input
            '是分类': False,  # not available from the API input
            '所属分类路径': '',  # not available from the API input
        })

    if not top3_match_info:
        logger.warning("没有有效的人设特征,无法构建匹配信息")
        return []

    words = candidate_words or []

    # Simulated stage-2 output: every base word gets the same candidates, but
    # as independent records. A shallow list copy would make all base words
    # share the same dict objects, so a mutation under one base word would
    # leak into the others.
    high_similarity_by_base = {
        match_info['人设特征名称']: [
            _build_candidate_item(word, original_target) for word in words
        ]
        for match_info in top3_match_info
    }

    result = {
        '原始特征名称': original_target,
        '来源层级': '',  # not available from the API input
        '权重': 1.0,  # default weight
        'top3匹配信息': top3_match_info,
        '找到的关联_按base_word': {},  # no association analysis is performed
        '高相似度候选_按base_word': high_similarity_by_base,
    }

    return [result]
def _collect_matched_notes(
    evaluation: Dict[str, Any],
    search_item: Dict[str, Any],
    similarity_map: Dict[str, float],
    min_note_score: float,
) -> List[Dict[str, Any]]:
    """Return API note records for evaluated notes scoring at least ``min_note_score``."""
    # ``or`` fallbacks guard against keys that are present but hold None
    # (e.g. JSON null), which ``dict.get(key, default)`` would pass through.
    notes_evaluation = evaluation.get('notes_evaluation') or []
    search_result_data = search_item.get('search_result') or {}
    notes_data = (search_result_data.get('data') or {}).get('data') or []

    matched_notes = []
    for note_eval in notes_evaluation:
        evaluation_score = note_eval.get('综合得分') or 0
        if not evaluation_score >= min_note_score:
            continue

        # ``note_index`` points into the raw search result list; ignore
        # evaluations whose index is missing or out of range.
        note_index = note_eval.get('note_index', -1)
        if not isinstance(note_index, int) or not 0 <= note_index < len(notes_data):
            continue

        note = notes_data[note_index]
        note_id = note.get('id', '')

        # Highest similarity for this note, taken from the similarity analysis.
        max_similarity = similarity_map.get(note_id) or 0

        # Without a positive similarity the evaluation score is used as-is.
        if max_similarity > 0:
            contribution = evaluation_score * max_similarity
        else:
            contribution = evaluation_score

        matched_notes.append({
            'note_id': note_id,
            'note_title': (note.get('note_card') or {}).get('display_title', ''),
            'evaluation_score': round(evaluation_score, 3),
            'max_similarity': round(max_similarity, 3),
            'contribution': round(contribution, 3),
            'note_data': note,  # the complete raw search result entry
        })

    return matched_notes


def convert_pipeline_output_to_api_response(
    pipeline_results: List[Dict[str, Any]],
    original_target: str,
    similarity_results: Optional[Dict[str, Any]] = None,
    *,
    min_note_score: float = 0.8
) -> Dict[str, Any]:
    """
    Convert pipeline output into the API response format.

    Only search items with a comprehensive score P > 0 are returned.

    Args:
        pipeline_results: Pipeline output (evaluated_results.json layout).
            Only records whose ``原始特征名称`` equals ``original_target``
            are used.
        original_target: Name of the original target feature.
        similarity_results: Optional similarity analysis results, used to
            look up each note's maximum similarity by note id.
        min_note_score: Minimum per-note ``综合得分`` for a note to be listed
            under ``matched_notes``. Defaults to 0.8.

    Returns:
        A dict with ``original_target`` and ``search_results``; each search
        result carries the search word, the rounded comprehensive score, its
        detail dict and the matched notes.
    """
    similarity_map = extract_similarity_map_from_results(similarity_results)

    search_results = []

    for feature_result in pipeline_results or []:
        # Only handle records that belong to the requested target.
        if feature_result.get('原始特征名称', '') != original_target:
            continue

        # Search results are stored per base word in the grouped evaluation.
        for group in feature_result.get('组合评估结果_分组') or []:
            for search_item in group.get('top10_searches') or []:
                # Skip search items that were never evaluated.
                evaluation = search_item.get('evaluation_with_filter')
                if not evaluation:
                    continue

                # A missing or null score is treated as 0 and filtered out.
                comprehensive_score = search_item.get('comprehensive_score') or 0
                if comprehensive_score <= 0:
                    continue

                search_results.append({
                    'search_word': search_item.get('search_word', ''),
                    'comprehensive_score': round(comprehensive_score, 3),
                    'comprehensive_score_detail':
                        search_item.get('comprehensive_score_detail') or {},
                    'matched_notes': _collect_matched_notes(
                        evaluation, search_item, similarity_map, min_note_score
                    ),
                })

    return {
        'original_target': original_target,
        'search_results': search_results,
    }
def extract_similarity_map_from_results(
    similarity_results: Optional[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Build a ``note_id -> max_similarity`` lookup from similarity analysis results.

    Args:
        similarity_results: Similarity analysis results; entries are read
            from its ``results`` list. May be ``None`` or empty.

    Returns:
        Mapping from note id to the ``max_similarity`` found under that
        entry's ``similarity_statistics`` (0 when absent). Entries without a
        note id are left out; a later entry overrides an earlier one with
        the same id.
    """
    # Nothing to extract from a missing or empty result set.
    if not similarity_results:
        return {}

    note_to_similarity = {}
    for entry in similarity_results.get('results', []):
        stats = entry.get('similarity_statistics', {})
        peak = stats.get('max_similarity', 0)
        key = entry.get('note_id', '')
        if not key:
            continue
        note_to_similarity[key] = peak

    return note_to_similarity
|