#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据格式转换模块
将API输入转换为pipeline格式，将pipeline输出转换为API响应格式
"""

import logging
from typing import Dict, List, Any, Optional

logger = logging.getLogger(__name__)


def convert_api_input_to_pipeline_format(
    original_target: str,
    persona_features: List[Dict[str, str]],
    candidate_words: List[str]
) -> List[Dict[str, Any]]:
    """
    Convert API input into the structure the pipeline consumes.

    The result imitates the output of pipeline stages 1-2 (the
    candidate_words.json layout): one record for ``original_target`` holding
    up to three persona-feature match entries and, for each of those persona
    features, the list of candidate words.

    Args:
        original_target: Name of the original target feature.
        persona_features: Persona feature dicts; only the
            ``persona_feature_name`` key is read. Only the first three entries
            are considered, and entries with an empty name are skipped.
        candidate_words: Candidate words to attach to every persona feature.

    Returns:
        A single-element list containing the record, or an empty list when
        none of the first three persona features has a usable name.
    """
    # Build the "top3" match entries (simulated stage-1 output).
    top3_match_info = []
    for idx, persona_feature in enumerate(persona_features[:3]):
        persona_feature_name = persona_feature.get('persona_feature_name', '')
        if not persona_feature_name:
            continue

        top3_match_info.append({
            '人设特征名称': persona_feature_name,
            '人设特征层级': '',  # not provided by the API input
            '特征类型': '',  # not provided by the API input
            '特征分类': [],  # not provided by the API input
            # Simulated similarity: decreases with the entry's position in the
            # input, so a skipped entry leaves a gap in the sequence.
            '相似度': 0.75 - idx * 0.05,
            '匹配说明': '',  # not provided by the API input
            '是分类': False,  # not provided by the API input
            '所属分类路径': ''  # not provided by the API input
        })

    if not top3_match_info:
        logger.warning("没有有效的人设特征，无法构建匹配信息")
        return []

    # Materialise once so a one-shot iterable can still be reused for every
    # base word below.
    words = list(candidate_words)

    def _fresh_candidates() -> List[Dict[str, Any]]:
        # Build brand-new candidate records (simulated stage-2 output).
        return [
            {
                '候选词': word,
                '候选词类型': 'persona',  # marks a persona candidate
                '相似度': 1.0,  # API input carries no similarity; default 1.0
                '特征类型': '',
                '特征分类': [],
                '人设特征层级': '',
                '来源层级': 'persona',
                '来源路径': '',
                '匹配说明': '',
                '来源原始特征': original_target
            }
            for word in words
        ]

    # Every base word gets the same candidate words, but as independent
    # records: a shallow list copy would share the dicts (and their nested
    # lists), so an in-place change for one base word would leak into others.
    high_similarity_by_base = {
        match_info['人设特征名称']: _fresh_candidates()
        for match_info in top3_match_info
    }

    # Final record in candidate_words.json layout.
    result = {
        '原始特征名称': original_target,
        '来源层级': '',  # not provided by the API input
        '权重': 1.0,  # default weight
        'top3匹配信息': top3_match_info,
        '找到的关联_按base_word': {},  # association analysis is not needed
        '高相似度候选_按base_word': high_similarity_by_base
    }

    return [result]


def convert_pipeline_output_to_api_response(
    pipeline_results: List[Dict[str, Any]],
    original_target: str,
    similarity_results: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Convert pipeline output into the API response structure.

    Only search items that carry an evaluation and whose comprehensive score
    is greater than 0 are returned.

    Args:
        pipeline_results: Pipeline output records (evaluated_results.json
            layout).
        original_target: Name of the original target; records for any other
            feature name are ignored.
        similarity_results: Optional similarity-analysis results used to look
            up each note's max similarity and matched features.

    Returns:
        A dict with ``original_target`` and the list of ``search_results``.
    """
    # note_id -> max similarity lookup
    note_similarity = extract_similarity_map_from_results(similarity_results)

    collected = []

    for feature_entry in pipeline_results:
        # Skip records that belong to a different target.
        if feature_entry.get('原始特征名称', '') != original_target:
            continue

        # A missing or empty group list simply contributes nothing.
        groups = feature_entry.get('组合评估结果_分组', []) or []

        for group in groups:
            for item in group.get('top10_searches', []):
                # Items without an evaluation are not reported.
                evaluation = item.get('evaluation_with_filter')
                if not evaluation:
                    continue

                score = item.get('comprehensive_score', 0)
                score_detail = item.get('comprehensive_score_detail')

                # Only items with a positive comprehensive score are reported.
                if score <= 0:
                    continue

                note_evals = evaluation.get('notes_evaluation', [])
                raw_result = item.get('search_result', {})
                raw_notes = raw_result.get('data', {}).get('data', [])

                matched_notes = []
                for note_eval in note_evals:
                    # Keep only notes whose evaluation score is at least 0.8.
                    note_score = note_eval.get('综合得分', 0)
                    if not note_score >= 0.8:
                        continue

                    # The evaluation refers to its note by position.
                    position = note_eval.get('note_index', -1)
                    if not 0 <= position < len(raw_notes):
                        continue

                    note = raw_notes[position]
                    note_id = note.get('id', '')

                    # Max similarity from the similarity analysis (0 if absent).
                    best_similarity = note_similarity.get(note_id, 0)

                    # Contribution is weighted by similarity when one exists,
                    # otherwise it equals the evaluation score.
                    if best_similarity > 0:
                        contribution = note_score * best_similarity
                    else:
                        contribution = note_score

                    feature_details = extract_matched_features_from_results(
                        similarity_results,
                        note_id
                    )

                    matched_notes.append({
                        'note_id': note_id,
                        'note_title': note.get('note_card', {}).get('display_title', ''),
                        'evaluation_score': round(note_score, 3),
                        'max_similarity': round(best_similarity, 3),
                        'contribution': round(contribution, 3),

                        # evaluation details
                        'evaluation_reasoning': note_eval.get('评分说明', ''),
                        'key_matching_points': note_eval.get('关键匹配点', []),
                        'query_relevance': note_eval.get('Query相关性', ''),
                        'query_relevance_explanation': note_eval.get('Query相关性说明', ''),
                        'matched_features': feature_details,

                        'note_data': note  # the complete search-result entry
                    })

                # source_word is a whitespace-separated string; expose it as a
                # list. str.split() without arguments already drops empty
                # pieces and surrounding whitespace.
                raw_source = item.get('source_word', '')
                source_words = raw_source.split() if isinstance(raw_source, str) else []

                collected.append({
                    'search_word': item.get('search_word', ''),
                    'source_words': source_words,
                    'comprehensive_score': round(score, 3),
                    'comprehensive_score_detail': score_detail or {},
                    'matched_notes': matched_notes
                })

    return {
        'original_target': original_target,
        'search_results': collected
    }


def extract_similarity_map_from_results(
    similarity_results: Optional[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Build a note_id -> max_similarity lookup from similarity-analysis results.

    Args:
        similarity_results: Similarity-analysis results; its ``results`` list
            is read. May be None or empty.

    Returns:
        Mapping from note_id to the ``max_similarity`` found in that record's
        ``similarity_statistics`` (0 when absent). Records without a note_id
        are left out.
    """
    lookup = {}

    for record in (similarity_results or {}).get('results', []):
        key = record.get('note_id', '')
        best = record.get('similarity_statistics', {}).get('max_similarity', 0)

        # Records without an id cannot be looked up later, so skip them.
        if not key:
            continue
        lookup[key] = best

    return lookup


def extract_matched_features_from_results(
    similarity_results: Optional[Dict[str, Any]],
    note_id: str
) -> List[Dict[str, Any]]:
    """
    Collect the deconstructed-feature match details for one note.

    Args:
        similarity_results: Similarity-analysis results; its ``results`` list
            is searched. May be None or empty.
        note_id: Id of the note to look up.

    Returns:
        One dict per entry in the ``deconstructed_features`` of the first
        record whose note_id matches, carrying feature_name, dimension,
        dimension_detail, weight and similarity_score (rounded to 3 decimals).
        Empty list when the inputs are empty or no record matches.
    """
    if not (similarity_results and note_id):
        return []

    # Only the first record with a matching id is used.
    record = next(
        (
            candidate
            for candidate in similarity_results.get('results', [])
            if candidate.get('note_id', '') == note_id
        ),
        None,
    )
    if record is None:
        return []

    return [
        {
            'feature_name': item.get('feature_name', ''),
            'dimension': item.get('dimension', ''),
            'dimension_detail': item.get('dimension_detail', ''),
            'weight': item.get('weight', 0),
            'similarity_score': round(item.get('similarity_score', 0), 3)
        }
        for item in record.get('deconstructed_features', [])
    ]