|
@@ -0,0 +1,168 @@
|
|
|
|
|
+"""
|
|
|
|
|
+分析特定原始特征的搜索执行情况
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import json
|
|
|
|
|
+import sys
|
|
|
|
|
+from typing import Dict, Any, List
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _find_feature(records: List[Dict[str, Any]], feature_name: str):
    """Return the first record whose '原始特征名称' equals feature_name, else None."""
    return next(
        (item for item in records if item.get('原始特征名称') == feature_name),
        None,
    )


def _collect_search_words(feature_record: Dict[str, Any],
                          include_search_state: bool = False) -> List[Dict[str, Any]]:
    """Flatten every entry that has a search word out of a feature record.

    Walks '找到的关联' -> '特征列表' and builds one dict per entry with a
    non-empty 'search_word'. With include_search_state the dict also carries
    'has_result', 'status' and 'note_count'; otherwise it carries 'reasoning'.
    """
    words = []
    # `or []` / `or {}` also cover keys that are present with an explicit null.
    for association in feature_record.get('找到的关联') or []:
        assoc_name = association.get('分类名称', '')
        for feature in association.get('特征列表') or []:
            search_word = feature.get('search_word')
            if not search_word:
                continue
            llm_eval = feature.get('llm_evaluation') or {}
            entry = {
                'search_word': search_word,
                'association': assoc_name,
                'feature_name': feature.get('特征名称', ''),
                'llm_score': llm_eval.get('score'),
                'llm_rank': llm_eval.get('rank'),
            }
            if include_search_state:
                search_metadata = feature.get('search_metadata') or {}
                entry['has_result'] = feature.get('search_result') is not None
                entry['status'] = search_metadata.get('status', 'not_searched')
                entry['note_count'] = search_metadata.get('note_count') or 0
            else:
                entry['reasoning'] = llm_eval.get('reasoning', '')
            words.append(entry)
    return words


def _format_score(score) -> str:
    """Render a score with two decimals, or 'N/A' when it is missing."""
    return 'N/A' if score is None else f"{score:.2f}"


def _format_rank(rank):
    """Return the rank unchanged, or 'N/A' when it is missing."""
    return 'N/A' if rank is None else rank


def analyze_feature_searches(stage4_path: str, stage6_path: str, feature_name: str,
                             top_n: int = 10) -> None:
    """Print a report on how the search words of one original feature were executed.

    Loads the Stage4 and Stage6 JSON files, locates the record whose
    '原始特征名称' equals ``feature_name`` in each, and prints: counts of
    search words, the share of Stage6 words that have a search result, a
    table sorted by LLM rank, per-word details for searched and unsearched
    words, and a check of whether exactly the ``top_n`` best-ranked words
    were searched.

    Args:
        stage4_path: Path to the Stage4 JSON file (a list of feature records).
        stage6_path: Path to the Stage6 JSON file (a list of feature records).
        feature_name: Value of '原始特征名称' to look up.
        top_n: Size of the "Top-N" strategy to test for. Defaults to 10.

    Returns:
        None. The report is written to stdout; if the feature is missing
        from either file an error line is printed and the function returns.
    """
    # Load both stage files.
    with open(stage4_path, 'r', encoding='utf-8') as f:
        stage4_data = json.load(f)

    with open(stage6_path, 'r', encoding='utf-8') as f:
        stage6_data = json.load(f)

    # Locate the requested feature in each stage.
    stage4_feature = _find_feature(stage4_data, feature_name)
    stage6_feature = _find_feature(stage6_data, feature_name)

    if stage4_feature is None:
        print(f"❌ 在 Stage4 中未找到特征: {feature_name}")
        return

    if stage6_feature is None:
        print(f"❌ 在 Stage6 中未找到特征: {feature_name}")
        return

    print("=" * 80)
    print(f"原始特征: {feature_name}")
    print("=" * 80)

    stage4_search_words = _collect_search_words(stage4_feature)
    stage6_search_words = _collect_search_words(stage6_feature, include_search_state=True)

    # Summary statistics.
    total_stage4 = len(stage4_search_words)
    total_stage6 = len(stage6_search_words)
    searched = sum(1 for w in stage6_search_words if w['has_result'])
    not_searched = total_stage6 - searched
    # A feature without any search word would otherwise divide by zero.
    execution_rate = searched / total_stage6 * 100 if total_stage6 else 0.0

    print(f"\n📊 统计信息:")
    print(f" Stage4 生成的搜索词数: {total_stage4}")
    print(f" Stage6 保留的搜索词数: {total_stage6}")
    print(f" 已执行搜索: {searched} 个")
    print(f" 未执行搜索: {not_searched} 个")
    print(f" 搜索执行率: {execution_rate:.1f}%")

    # Sort by rank; words without a rank go last.
    stage6_sorted = sorted(
        stage6_search_words,
        key=lambda w: (w['llm_rank'] is None, w['llm_rank'] if w['llm_rank'] is not None else 0),
    )

    print(f"\n📋 详细搜索词列表 (按 LLM Rank 排序):")
    print(f"{'Rank':<6} {'评分':<6} {'搜索状态':<12} {'帖子数':<8} 搜索词")
    print("-" * 80)

    for word in stage6_sorted:
        rank = _format_rank(word['llm_rank'])
        score = _format_score(word['llm_score'])
        status = '✅ 已搜索' if word['has_result'] else '⏸️ 未搜索'
        note_count = word['note_count'] if word['has_result'] else '-'

        print(f"{rank:<6} {score:<6} {status:<12} {note_count:<8} {word['search_word']}")

    # Details of the words that were searched.
    searched_words = [w for w in stage6_sorted if w['has_result']]
    if searched_words:
        print(f"\n✅ 已执行搜索的 {len(searched_words)} 个搜索词:")
        for idx, word in enumerate(searched_words, 1):
            print(f"\n 【{idx}】 {word['search_word']}")
            print(f" 关联: {word['association']}")
            print(f" 特征: {word['feature_name']}")
            print(f" 评分: {_format_score(word['llm_score'])}, 排名: #{_format_rank(word['llm_rank'])}")
            print(f" 结果: {word['note_count']} 个帖子")

    # Details of the words that were not searched.
    not_searched_words = [w for w in stage6_sorted if not w['has_result']]
    if not_searched_words:
        print(f"\n⏸️ 未执行搜索的 {len(not_searched_words)} 个搜索词:")
        for idx, word in enumerate(not_searched_words, 1):
            print(f"\n 【{idx}】 {word['search_word']}")
            print(f" 关联: {word['association']}")
            print(f" 特征: {word['feature_name']}")
            print(f" 评分: {_format_score(word['llm_score'])}, 排名: #{_format_rank(word['llm_rank'])}")

    # Work out why only part of the words were searched.
    print(f"\n🔍 搜索策略分析:")
    if searched == top_n:
        print(f" 系统使用了 Top-{top_n} 策略")
        actual_ranks = sorted(
            w['llm_rank'] for w in searched_words if w['llm_rank'] is not None
        )
        print(f" 实际搜索的 Rank 范围: {actual_ranks}")

        # Compare against the ranks of the first top_n entries in rank order.
        expected_ranks = sorted(
            w['llm_rank'] for w in stage6_sorted[:top_n] if w['llm_rank'] is not None
        )
        if actual_ranks == expected_ranks:
            print(f" ✓ 严格按照 LLM Rank 取了 Top-{top_n}")
        else:
            print(f" ⚠️ 不是严格的 Top-{top_n} (期望: {expected_ranks})")
    elif searched > 0:
        print(f" 系统执行了 {searched} 个搜索")
    else:
        print(f" 该特征的搜索尚未执行")

    print("\n" + "=" * 80)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == '__main__':
    # Defaults, in CLI order: feature name, Stage4 path, Stage6 path.
    # Up to three positional arguments override them from left to right.
    fallback_values = [
        '墨镜',
        'output_v2/stage4_with_llm_scores.json',
        'output_v2/stage6_with_evaluations.json',
    ]
    supplied = sys.argv[1:4]
    feature_name, stage4_path, stage6_path = supplied + fallback_values[len(supplied):]

    analyze_feature_searches(stage4_path, stage6_path, feature_name)
|