#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze model comparison results and export them to Excel.

Two implementations are analyzed side by side:
1. cache/text_embedding - embedding-model implementation (text2vec)
2. cache/semantic_similarity - LLM implementation (GPT/Claude, etc.)

The script generates an Excel report comparing how the implementations differ.
"""
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

# Add the project root to sys.path so that lib.config can be imported.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from lib.config import get_cache_dir  # noqa: E402  (import after path setup)
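

# For reference, the cache files parsed below are assumed to look roughly
# like this (inferred from the extraction logic; illustrative, not a spec):
#
#   cache/text_embedding/*.json
#     {"input":  {"phrase_a": "...", "phrase_b": "...", "model_name": "..."},
#      "output": {"相似度": 0.87, "说明": "..."}}
#
#   cache/semantic_similarity/*.json
#     {"input":  {"phrase_a": "...", "phrase_b": "...", "model_name": "..."},
#      "output": {"parsed": {"相似度": 0.90, "说明": "..."}}}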


def extract_results_from_cache(
    cache_dir: str,
    model_type: str
) -> Dict[Tuple[str, str], List[Dict]]:
    """
    Extract results from a cache directory.

    Args:
        cache_dir: Path to the cache directory.
        model_type: Model type ("text_embedding" or "semantic_similarity").

    Returns:
        Mapping from (phrase_a, phrase_b) tuples to lists of result dicts
        (one entry per model that cached the pair).
    """
    cache_path = Path(cache_dir)
    if not cache_path.exists():
        print(f"Cache directory does not exist: {cache_dir}")
        return {}

    results = {}
    cache_files = list(cache_path.glob("*.json"))
    print(f"Scanning {model_type} cache: {len(cache_files)} files")

    for cache_file in cache_files:
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract the input and output payloads.
            input_data = data.get("input", {})
            output_data = data.get("output", {})
            phrase_a = input_data.get("phrase_a")
            phrase_b = input_data.get("phrase_b")

            if phrase_a and phrase_b:
                # Normalize the two cache formats. The Chinese keys are kept
                # as-is because they match the on-disk cache format
                # ("相似度" = similarity, "说明" = explanation, "模型" = model).
                if model_type == "text_embedding":
                    # text_embedding output is directly {"说明": "...", "相似度": 0.xx}
                    result = {
                        "相似度": output_data.get("相似度"),
                        "说明": output_data.get("说明", ""),
                        "模型": input_data.get("model_name", "unknown")
                    }
                elif model_type == "semantic_similarity":
                    # semantic_similarity output lives under output.parsed
                    parsed = output_data.get("parsed", output_data)
                    result = {
                        "相似度": parsed.get("相似度"),
                        "说明": parsed.get("说明", ""),
                        "模型": input_data.get("model_name", "LLM")
                    }
                else:
                    continue

                # Key by the phrase pair, preserving the original order of
                # phrase_a and phrase_b.
                pair_key = (phrase_a, phrase_b)

                # The same pair may have several cache entries (one per
                # model), so each key maps to a list.
                if pair_key not in results:
                    results[pair_key] = []
                results[pair_key].append({
                    "phrase_a": phrase_a,
                    "phrase_b": phrase_b,
                    "相似度": result["相似度"],
                    "说明": result["说明"],
                    "模型": result["模型"]
                })
        except (json.JSONDecodeError, IOError, KeyError) as e:
            print(f"  Failed to read cache file: {cache_file.name} - {e}")
            continue

    return results
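

# Shape of the mapping returned above (values illustrative; a pair gets
# several entries when more than one model has cached it):
#
#   {("phrase A", "phrase B"): [
#       {"phrase_a": "phrase A", "phrase_b": "phrase B",
#        "相似度": 0.85, "说明": "...", "模型": "vendor/model-name"}]}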


def merge_all_results(
    text_embedding_results: Dict[Tuple[str, str], List[Dict]],
    semantic_similarity_results: Dict[Tuple[str, str], List[Dict]]
) -> List[Dict]:
    """
    Merge the results from both implementations.

    Args:
        text_embedding_results: Results from the embedding models.
        semantic_similarity_results: Results from the LLM models.

    Returns:
        List of merged result rows, one per phrase pair.
    """
    # Collect every unique phrase pair across both result sets.
    all_pairs = set(text_embedding_results.keys()) | set(semantic_similarity_results.keys())

    merged = []
    for pair_key in all_pairs:
        phrase_a, phrase_b = pair_key
        row = {
            "短语A": phrase_a,
            "短语B": phrase_b,
        }

        # Add the embedding-model results.
        if pair_key in text_embedding_results:
            for result in text_embedding_results[pair_key]:
                # Use the last path segment as a short model name; guard
                # against a null model_name in the cache.
                model_name = (result["模型"] or "unknown").split('/')[-1]
                row[f"向量_{model_name}_相似度"] = result["相似度"]
                row[f"向量_{model_name}_说明"] = result["说明"]

        # Add the LLM results.
        if pair_key in semantic_similarity_results:
            for result in semantic_similarity_results[pair_key]:
                model_name = (result["模型"] or "LLM").split('/')[-1]
                row[f"LLM_{model_name}_相似度"] = result["相似度"]
                row[f"LLM_{model_name}_说明"] = result["说明"]

        merged.append(row)

    return merged
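

# Each merged row is a flat dict keyed by generated column names, e.g.
# (model names here are hypothetical):
#
#   {"短语A": "...", "短语B": "...",
#    "向量_text2vec-base_相似度": 0.85, "向量_text2vec-base_说明": "...",
#    "LLM_gpt-4o_相似度": 0.90, "LLM_gpt-4o_说明": "..."}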


def create_comparison_dataframe(merged_results: List[Dict]) -> pd.DataFrame:
    """
    Build the model comparison DataFrame.

    Args:
        merged_results: List of merged result rows.

    Returns:
        DataFrame comparing the similarity scores of all models.
    """
    # Build the DataFrame directly from the merged rows.
    df = pd.DataFrame(merged_results)

    # Add a sequence-number column.
    df.insert(0, "序号", range(1, len(df) + 1))

    # Keep only the similarity columns; drop the explanation columns.
    columns_to_keep = ["序号", "短语A", "短语B"]
    similarity_cols = [col for col in df.columns if col.endswith("_相似度")]
    columns_to_keep.extend(similarity_cols)
    df = df[[col for col in columns_to_keep if col in df.columns]]

    # Compute the spread between the models' similarity scores.
    for idx, row in df.iterrows():
        similarities = []
        for col in similarity_cols:
            sim = row[col]
            # Exclude non-numeric values and NaN (NaN is a float and would
            # otherwise corrupt max()/min()).
            if isinstance(sim, (int, float)) and pd.notna(sim):
                similarities.append(sim)
        if len(similarities) > 1:
            df.at[idx, "相似度_差异"] = max(similarities) - min(similarities)
        elif len(similarities) == 1:
            df.at[idx, "相似度_差异"] = 0
        else:
            df.at[idx, "相似度_差异"] = None

    # Move the spread column to the end.
    if "相似度_差异" in df.columns:
        cols = [col for col in df.columns if col != "相似度_差异"]
        cols.append("相似度_差异")
        df = df[cols]

    return df
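

# Resulting sheet layout (the set of similarity columns depends on which
# models appear in the caches):
#
#   序号 | 短语A | 短语B | 向量_*_相似度 ... | LLM_*_相似度 ... | 相似度_差异
#
# where 相似度_差异 = max - min over the available similarity scores in a row.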


def export_to_excel(
    comparison_df: pd.DataFrame,
    output_file: str
) -> None:
    """
    Export the comparison to an Excel file.

    Args:
        comparison_df: Comparison DataFrame.
        output_file: Path of the output Excel file.
    """
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        # Write the full comparison data.
        comparison_df.to_excel(writer, sheet_name='模型对比', index=False)

        # Auto-fit column widths (capped at 50 characters).
        ws = writer.sheets['模型对比']
        for col in ws.columns:
            max_length = 0
            column = col[0].column_letter
            for cell in col:
                try:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                except Exception:
                    pass
            ws.column_dimensions[column].width = min(max_length + 2, 50)

    print(f"Excel report exported to: {output_file}")


def main():
    """Entry point."""
    # Configuration (cache locations resolved via the config module).
    text_embedding_cache = get_cache_dir("text_embedding")
    semantic_similarity_cache = get_cache_dir("semantic_similarity")
    output_file = f"data/model_comparison_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

    print("=" * 60)
    print("Model comparison analysis (embedding models vs. LLMs)")
    print("=" * 60)

    # Step 1: extract results from the caches.
    print("\nStep 1: extracting results from cache...")
    print(f"  - embedding-model cache: {text_embedding_cache}")
    text_embedding_results = extract_results_from_cache(text_embedding_cache, "text_embedding")
    print(f"    extracted {len(text_embedding_results)} unique phrase pairs")

    print(f"  - LLM cache: {semantic_similarity_cache}")
    semantic_similarity_results = extract_results_from_cache(semantic_similarity_cache, "semantic_similarity")
    print(f"    extracted {len(semantic_similarity_results)} unique phrase pairs")

    if not text_embedding_results and not semantic_similarity_results:
        print("\nError: no cache data found")
        print("Run the following scripts first to populate the caches:")
        print("  - match_inspiration_features.py (populates the text_embedding cache)")
        print("  - test_all_models.py (populates the multi-model cache)")
        return

    # Step 2: merge the results.
    print("\nStep 2: merging results from all models...")
    merged_results = merge_all_results(text_embedding_results, semantic_similarity_results)
    print(f"Merged into {len(merged_results)} test cases")

    # Step 3: build the comparison DataFrame.
    print("\nStep 3: building the comparison DataFrame...")
    comparison_df = create_comparison_dataframe(merged_results)
    print(f"Comparison DataFrame built: {len(comparison_df)} rows")

    # Show which similarity columns are present.
    similarity_cols = [col for col in comparison_df.columns if col.endswith("_相似度")]
    print(f"Similarity data from {len(similarity_cols)} models:")
    for col in similarity_cols:
        print(f"  - {col}")

    # Step 4: export to Excel.
    print("\nStep 4: exporting to Excel...")
    export_to_excel(comparison_df, output_file)

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print(f"Report file: {output_file}")
    print("=" * 60)


if __name__ == "__main__":
    main()