| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 数据格式转换模块
- 将API输入转换为pipeline格式,将pipeline输出转换为API响应格式
- """
- import logging
- from typing import Dict, List, Any, Optional
- logger = logging.getLogger(__name__)
def convert_api_input_to_pipeline_format(
    original_target: str,
    persona_features: List[Dict[str, str]],
    candidate_words: List[str]
) -> List[Dict[str, Any]]:
    """
    Convert API input into the structure the pipeline consumes.

    The result mimics the output of pipeline stages 1-2 (the
    candidate_words.json layout) using only the data the API provides;
    fields the API cannot supply are left empty.

    Args:
        original_target: Name of the original target feature.
        persona_features: Persona feature dicts; only the
            'persona_feature_name' key is read. At most the first three
            entries are considered, and entries with an empty name are
            skipped. ``None`` is treated as an empty list.
        candidate_words: Candidate words to attach to every base word.
            ``None`` is treated as an empty list.

    Returns:
        A single-element list holding the pipeline-format record, or an
        empty list when no usable persona feature name was supplied.
    """
    # Stage-1 stand-in: one match record per usable persona feature.
    top3_match_info = []
    for idx, persona_feature in enumerate((persona_features or [])[:3]):
        persona_feature_name = persona_feature.get('persona_feature_name', '')
        if not persona_feature_name:
            continue

        top3_match_info.append({
            '人设特征名称': persona_feature_name,
            '人设特征层级': '',  # not available from the API input
            '特征类型': '',  # not available from the API input
            '特征分类': [],  # not available from the API input
            # Simulated similarity: decreases by 0.05 per position in the
            # (at most three) inspected entries, so the first is highest.
            '相似度': 0.75 - idx * 0.05,
            '匹配说明': '',  # not available from the API input
            '是分类': False,  # not available from the API input
            '所属分类路径': ''  # not available from the API input
        })

    if not top3_match_info:
        logger.warning("没有有效的人设特征,无法构建匹配信息")
        return []

    # Materialize once so the words can be replayed for every base word.
    words = list(candidate_words or [])

    def build_candidates() -> List[Dict[str, Any]]:
        """Return a brand-new list of stage-2 style candidate records."""
        return [
            {
                '候选词': candidate_word,
                '候选词类型': 'persona',  # marks a persona candidate
                '相似度': 1.0,  # API input carries no similarity; default 1.0
                '特征类型': '',
                '特征分类': [],
                '人设特征层级': '',
                '来源层级': 'persona',
                '来源路径': '',
                '匹配说明': '',
                '来源原始特征': original_target
            }
            for candidate_word in words
        ]

    # Every base word gets the same candidates, but as independent objects:
    # a shallow list copy would leave the dicts (and their nested lists)
    # shared, so mutating one base word's candidate would alter the others.
    high_similarity_by_base = {
        match_info['人设特征名称']: build_candidates()
        for match_info in top3_match_info
    }

    # Final record in candidate_words.json layout.
    result = {
        '原始特征名称': original_target,
        '来源层级': '',  # not available from the API input
        '权重': 1.0,  # default weight
        'top3匹配信息': top3_match_info,
        '找到的关联_按base_word': {},  # association analysis is not needed
        '高相似度候选_按base_word': high_similarity_by_base
    }

    return [result]
# Minimum per-note evaluation score for a note to be reported as matched.
_MATCHED_NOTE_MIN_SCORE = 0.8


def _split_source_words(raw_source_word: Any) -> List[str]:
    """Split a whitespace-separated source-word string into a word list."""
    if not raw_source_word or not isinstance(raw_source_word, str):
        return []
    # split() without a separator already strips and drops empty fragments.
    return raw_source_word.split()


def _collect_matched_notes(
    search_item: Dict[str, Any],
    evaluation: Dict[str, Any],
    similarity_map: Dict[str, float],
    similarity_results: Optional[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Build the matched-note entries for one evaluated search item."""
    # `or` fallbacks treat explicit nulls the same as missing keys.
    search_result_data = search_item.get('search_result') or {}
    notes_data = (search_result_data.get('data') or {}).get('data') or []

    matched_notes = []
    for note_eval in evaluation.get('notes_evaluation') or []:
        evaluation_score = note_eval.get('综合得分') or 0
        if not evaluation_score >= _MATCHED_NOTE_MIN_SCORE:
            continue

        # The evaluation refers to its note by position in the search data.
        note_index = note_eval.get('note_index', -1)
        if not isinstance(note_index, int):
            continue
        if not 0 <= note_index < len(notes_data):
            continue

        note = notes_data[note_index]
        note_id = note.get('id', '')
        # Highest similarity recorded for this note, 0 when unknown.
        max_similarity = similarity_map.get(note_id) or 0
        # Without a positive similarity the evaluation score stands alone.
        if max_similarity > 0:
            contribution = evaluation_score * max_similarity
        else:
            contribution = evaluation_score

        matched_features = extract_matched_features_from_results(
            similarity_results,
            note_id
        )
        matched_notes.append({
            'note_id': note_id,
            'note_title': (note.get('note_card') or {}).get('display_title', ''),
            'evaluation_score': round(evaluation_score, 3),
            'max_similarity': round(max_similarity, 3),
            'contribution': round(contribution, 3),
            # Evaluation details
            'evaluation_reasoning': note_eval.get('评分说明', ''),
            'key_matching_points': note_eval.get('关键匹配点', []),
            'query_relevance': note_eval.get('Query相关性', ''),
            'query_relevance_explanation': note_eval.get('Query相关性说明', ''),
            'matched_features': matched_features,
            'note_data': note  # full search-result entry for the note
        })
    return matched_notes


def convert_pipeline_output_to_api_response(
    pipeline_results: List[Dict[str, Any]],
    original_target: str,
    similarity_results: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Convert pipeline output into the API response format.

    Only search items that carry an evaluation and whose comprehensive
    score is greater than 0 are returned. Within each returned item, only
    notes whose evaluation score is at least 0.8 are listed as matched.
    Keys that are present but hold ``None`` are treated like missing keys.

    Args:
        pipeline_results: Pipeline output records (evaluated_results.json
            layout); only records whose '原始特征名称' equals
            ``original_target`` are used.
        original_target: Name of the original target feature.
        similarity_results: Optional similarity-analysis results used to
            enrich matched notes with similarity and feature details.

    Returns:
        A dict with 'original_target' and the list 'search_results'.
    """
    similarity_map = extract_similarity_map_from_results(similarity_results)

    search_results = []
    for feature_result in pipeline_results or []:
        # Only the record for the requested target is relevant.
        if feature_result.get('原始特征名称', '') != original_target:
            continue

        for group in feature_result.get('组合评估结果_分组') or []:
            for search_item in group.get('top10_searches') or []:
                evaluation = search_item.get('evaluation_with_filter')
                if not evaluation:
                    continue

                comprehensive_score = search_item.get('comprehensive_score') or 0
                if comprehensive_score <= 0:
                    continue

                search_results.append({
                    'search_word': search_item.get('search_word', ''),
                    'source_words': _split_source_words(
                        search_item.get('source_word', '')
                    ),
                    'comprehensive_score': round(comprehensive_score, 3),
                    'comprehensive_score_detail':
                        search_item.get('comprehensive_score_detail') or {},
                    'matched_notes': _collect_matched_notes(
                        search_item,
                        evaluation,
                        similarity_map,
                        similarity_results
                    )
                })

    return {
        'original_target': original_target,
        'search_results': search_results
    }
def extract_similarity_map_from_results(
    similarity_results: Optional[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Build a note_id -> max_similarity lookup from similarity-analysis results.

    Args:
        similarity_results: Similarity-analysis results; entries are read
            from its 'results' list. May be None or empty.

    Returns:
        Mapping from each non-empty 'note_id' to the 'max_similarity' found
        under that entry's 'similarity_statistics' (0 when absent). When a
        note_id occurs more than once, the last entry wins.
    """
    if not similarity_results:
        return {}

    lookup = {}
    for entry in similarity_results.get('results', []):
        stats = entry.get('similarity_statistics', {})
        score = stats.get('max_similarity', 0)
        key = entry.get('note_id', '')
        if not key:
            # Entries without an id cannot be looked up later.
            continue
        lookup[key] = score
    return lookup
def extract_matched_features_from_results(
    similarity_results: Optional[Dict[str, Any]],
    note_id: str
) -> List[Dict[str, Any]]:
    """
    Return the deconstructed-feature details recorded for one note.

    Args:
        similarity_results: Similarity-analysis results; entries are read
            from its 'results' list. May be None or empty.
        note_id: ID of the note to look up.

    Returns:
        One dict per feature in the first entry whose 'note_id' matches,
        with keys 'feature_name', 'dimension', 'dimension_detail',
        'weight' and 'similarity_score' (rounded to 3 decimals). An empty
        list when the inputs are empty or no entry matches.
    """
    if not (similarity_results and note_id):
        return []

    entries = similarity_results.get('results', [])
    # Only the first entry for the note is used; later duplicates are ignored.
    hit = next(
        (entry for entry in entries if entry.get('note_id', '') == note_id),
        None,
    )
    if hit is None:
        return []

    # Features live under the 'deconstructed_features' key of the entry.
    return [
        {
            'feature_name': item.get('feature_name', ''),
            'dimension': item.get('dimension', ''),
            'dimension_detail': item.get('dimension_detail', ''),
            'weight': item.get('weight', 0),
            'similarity_score': round(item.get('similarity_score', 0), 3)
        }
        for item in hit.get('deconstructed_features', [])
    ]
|