#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Stage 6 评估结果统计分析
分析两层评估的过滤效果和匹配质量
"""
import json
from collections import defaultdict
from typing import Any, Dict, List
def load_stage6_results(file_path: str) -> List[Dict[str, Any]]:
    """Read the Stage 6 evaluation results from a UTF-8 JSON file."""
    with open(file_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)
def analyze_evaluation_results(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate Stage 6 evaluation results globally and per original feature.

    Args:
        data: Per-feature result dicts; grouped evaluations are read from the
            '组合评估结果_分组' key, each group carrying 'top10_searches' items
            whose 'evaluation_with_filter' holds the evaluation payload.

    Returns:
        Dict with three sections:
            'global_stats': totals + 'filter_rate' across all search words.
            'feature_stats': the same aggregates keyed by original feature,
                each with its own 'filter_rate' and list of search words.
            'search_word_details': one record per evaluated search word,
                sorted by high-quality match count (descending).
    """
    # Match-quality buckets as emitted by the upstream evaluator; these keys
    # must match the 'statistics' dict in the input data exactly.
    match_categories = ('完全匹配(8-10)', '相似匹配(6-7)', '弱相似(5-6)', '无匹配(≤4)')

    def _empty_distribution() -> Dict[str, int]:
        # Fresh zeroed counter per caller — never share one dict.
        return {key: 0 for key in match_categories}

    global_stats: Dict[str, Any] = {
        'total_search_words': 0,
        'total_notes_evaluated': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
    }

    # Per-original-feature aggregates, created lazily on first touch.
    feature_stats: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
        'search_words_count': 0,
        'total_notes': 0,
        'total_filtered': 0,
        'match_distribution': _empty_distribution(),
        'search_words': [],
    })

    search_word_details: List[Dict[str, Any]] = []

    for feature_result in data:
        original_feature = feature_result.get('原始特征名称', 'Unknown')
        for group in feature_result.get('组合评估结果_分组', []):
            base_word = group.get('base_word', '')
            for eval_item in group.get('top10_searches', []):
                evaluation = eval_item.get('evaluation_with_filter')
                if not evaluation:
                    # Skip search words that were never evaluated.
                    continue

                total_notes = evaluation.get('total_notes', 0)
                evaluated_notes = evaluation.get('evaluated_notes', 0)
                filtered_count = evaluation.get('filtered_count', 0)
                statistics = evaluation.get('statistics', {})

                # Global totals.
                global_stats['total_search_words'] += 1
                global_stats['total_notes_evaluated'] += total_notes
                global_stats['total_filtered'] += filtered_count
                for key in match_categories:
                    global_stats['match_distribution'][key] += statistics.get(key, 0)

                # Per-feature totals (hoist the lookup once per item).
                f_stats = feature_stats[original_feature]
                f_stats['search_words_count'] += 1
                f_stats['total_notes'] += total_notes
                f_stats['total_filtered'] += filtered_count
                for key in match_categories:
                    f_stats['match_distribution'][key] += statistics.get(key, 0)

                # Per-search-word detail record.
                search_word_info = {
                    'original_feature': original_feature,
                    'base_word': base_word,
                    'search_word': eval_item.get('search_word', ''),
                    'total_notes': total_notes,
                    'evaluated_notes': evaluated_notes,
                    'filtered_count': filtered_count,
                    'match_distribution': statistics,
                    'high_quality_count': statistics.get('完全匹配(8-10)', 0),
                    'similar_count': statistics.get('相似匹配(6-7)', 0),
                }
                search_word_details.append(search_word_info)
                f_stats['search_words'].append(search_word_info)

    # Filter rate = filtered / evaluated (0.0 when nothing was evaluated).
    evaluated_total = global_stats['total_notes_evaluated']
    global_stats['filter_rate'] = (
        global_stats['total_filtered'] / evaluated_total if evaluated_total > 0 else 0.0
    )
    for f_stats in feature_stats.values():
        notes = f_stats['total_notes']
        f_stats['filter_rate'] = f_stats['total_filtered'] / notes if notes > 0 else 0.0

    # Most promising search words first.
    search_word_details.sort(key=lambda item: item['high_quality_count'], reverse=True)

    return {
        'global_stats': global_stats,
        'feature_stats': dict(feature_stats),
        'search_word_details': search_word_details,
    }
def print_statistics(stats: Dict[str, Any]):
    """Pretty-print the aggregated Stage 6 statistics to stdout.

    Fix over the original: the percentage computations in the filtering
    section divided by ``total_notes_evaluated`` without a zero guard and
    raised ZeroDivisionError on an empty result set; percentages now fall
    back to 0.0. The per-feature loop variable also no longer shadows the
    ``stats`` parameter.

    Args:
        stats: Output of ``analyze_evaluation_results()``.
    """
    def _pct(part: float, whole: float) -> float:
        # Safe percentage: 0.0 instead of ZeroDivisionError on empty input.
        return part / whole * 100 if whole else 0.0

    global_stats = stats['global_stats']
    feature_stats = stats['feature_stats']
    search_word_details = stats['search_word_details']

    print("=" * 80)
    print("Stage 6 评估结果统计分析")
    print("=" * 80)

    # Global summary.
    print("\n【全局统计】")
    print(f" 总搜索词数: {global_stats['total_search_words']}")
    print(f" 总评估帖子数: {global_stats['total_notes_evaluated']}")
    print(f" 总过滤帖子数: {global_stats['total_filtered']} (过滤率: {global_stats['filter_rate']*100:.1f}%)")
    print(f"\n 匹配度分布:")
    for match_type, count in global_stats['match_distribution'].items():
        print(f" {match_type}: {count} 个帖子")

    # Per original feature breakdown.
    print("\n" + "=" * 80)
    print("【按原始特征统计】")
    print("=" * 80)
    for feature_name, f_stats in sorted(feature_stats.items()):
        print(f"\n特征: {feature_name}")
        print(f" 搜索词数: {f_stats['search_words_count']}")
        print(f" 总评估帖子: {f_stats['total_notes']}")
        print(f" 总过滤帖子: {f_stats['total_filtered']} (过滤率: {f_stats['filter_rate']*100:.1f}%)")
        print(f" 高质量匹配: {f_stats['match_distribution']['完全匹配(8-10)']} 个帖子")
        print(f" 相似匹配: {f_stats['match_distribution']['相似匹配(6-7)']} 个帖子")
        # Top search words within this feature by high-quality match count.
        best_searches = sorted(f_stats['search_words'], key=lambda x: x['high_quality_count'], reverse=True)[:3]
        if best_searches:
            print(f" Top 3 最佳搜索词:")
            for idx, sw in enumerate(best_searches, 1):
                print(f" {idx}. \"{sw['search_word']}\" - {sw['high_quality_count']}个完全匹配")

    # Top-10 search words across all features (input list is pre-sorted).
    print("\n" + "=" * 80)
    print("【Top 10 最佳搜索词(按完全匹配数排序)】")
    print("=" * 80)
    for idx, sw in enumerate(search_word_details[:10], 1):
        print(f"\n{idx}. \"{sw['search_word']}\"")
        print(f" 原始特征: {sw['original_feature']}")
        print(f" Base Word: {sw['base_word']}")
        print(f" 评估帖子: {sw['total_notes']}, 过滤: {sw['filtered_count']}")
        print(f" 完全匹配(8-10): {sw['high_quality_count']} 个")
        print(f" 相似匹配(6-7): {sw['similar_count']} 个")

    # Filtering effectiveness.
    print("\n" + "=" * 80)
    print("【过滤效果分析】")
    print("=" * 80)
    total_evaluated = global_stats['total_notes_evaluated']
    total_filtered = global_stats['total_filtered']
    total_remaining = total_evaluated - total_filtered
    total_high_quality = global_stats['match_distribution']['完全匹配(8-10)']
    total_similar = global_stats['match_distribution']['相似匹配(6-7)']
    total_weak = global_stats['match_distribution']['弱相似(5-6)']
    total_no_match = global_stats['match_distribution']['无匹配(≤4)']
    print(f" 评估帖子总数: {total_evaluated}")
    print(f" 第一层过滤(Query不相关): {total_filtered} ({_pct(total_filtered, total_evaluated):.1f}%)")
    print(f" 通过过滤的帖子: {total_remaining} ({_pct(total_remaining, total_evaluated):.1f}%)")
    print(f"\n 通过过滤后的质量分布:")
    if total_remaining > 0:
        print(f" 完全匹配(8-10): {total_high_quality} ({_pct(total_high_quality, total_remaining):.1f}%)")
        print(f" 相似匹配(6-7): {total_similar} ({_pct(total_similar, total_remaining):.1f}%)")
        print(f" 弱相似(5-6): {total_weak} ({_pct(total_weak, total_remaining):.1f}%)")
        print(f" 无匹配(≤4): {total_no_match} ({_pct(total_no_match, total_remaining):.1f}%)")
    print("\n" + "=" * 80)
def save_statistics(stats: Dict[str, Any], output_path: str):
    """Persist the computed statistics to *output_path* as pretty JSON."""
    payload = json.dumps(stats, ensure_ascii=False, indent=2)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(payload)
    print(f"\n统计结果已保存到: {output_path}")
def main():
    """Entry point: load, analyse, report, and persist Stage 6 statistics."""
    input_file = "output_v2/stage6_with_evaluations.json"
    output_file = "output_v2/stage6_statistics.json"

    print("正在加载数据...")
    data = load_stage6_results(input_file)

    print("正在分析评估结果...")
    stats = analyze_evaluation_results(data)

    # Report to stdout, then persist the same structure to disk.
    print_statistics(stats)
    save_statistics(stats, output_file)


if __name__ == '__main__':
    main()