#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
级联搜索结果可视化工具
展示候选词 → Top3人设特征 → 搜索词 → 搜索结果的完整流程
"""
import json
import os
import sys
from datetime import datetime
from typing import List, Dict, Any, Set
import webbrowser
def load_json(file_path: str) -> Any:
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def load_all_data(output_dir: str = "output_v2") -> Dict[str, Any]:
    """
    Load every JSON file the visualisation reads from ``output_dir``.

    Five files are required and are loaded unconditionally (a missing one
    propagates the error raised by ``load_json``). Two further files are
    optional; when present, their ``results`` entries are indexed by
    ``note_id``.

    Args:
        output_dir: Directory containing the JSON files.

    Returns:
        A dict with the keys ``filtered_features``, ``candidate_words``,
        ``search_queries``, ``search_results``, ``evaluated_results``,
        ``stage7_mapping`` and ``stage8_mapping``. The two mappings are
        empty dicts when their source file does not exist.
    """
    def _index_by_note_id(json_path: str) -> Dict[str, Any]:
        # Optional input: an absent file simply yields an empty index.
        if not os.path.exists(json_path):
            return {}
        index = {}
        for entry in load_json(json_path).get('results', []):
            key = entry.get('note_id')
            if key:  # entries without a usable note_id are skipped
                index[key] = entry
        return index

    print("正在加载数据文件...")

    required_files = (
        ('filtered_features', 'filtered_features.json'),
        ('candidate_words', 'candidate_words.json'),
        ('search_queries', 'search_queries_evaluated.json'),
        ('search_results', 'search_results.json'),
        ('evaluated_results', 'evaluated_results.json'),
    )
    data = {}
    for key, file_name in required_files:
        data[key] = load_json(os.path.join(output_dir, file_name))

    # Optional deep-analysis and similarity-analysis outputs.
    deep_path = os.path.join(output_dir, 'deep_analysis_results.json')
    similarity_path = os.path.join(output_dir, 'similarity_analysis_results.json')
    data['stage7_mapping'] = _index_by_note_id(deep_path)
    data['stage8_mapping'] = _index_by_note_id(similarity_path)

    feature_count = len(data['filtered_features'])
    print(f" ✓ 已加载 {feature_count} 个原始特征")
    candidate_count = len(data['candidate_words'])
    print(f" ✓ 已加载 {candidate_count} 个候选词数据")
    stage7_count = len(data['stage7_mapping'])
    print(f" ✓ 已加载解构数据: {stage7_count} 个帖子")
    stage8_count = len(data['stage8_mapping'])
    print(f" ✓ 已加载相似度数据: {stage8_count} 个帖子")
    return data
def extract_global_candidates(data: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Collect all candidate words, de-duplicate them by name and bucket them
    by similarity.

    Every entry of ``data['candidate_words']`` is scanned; its candidates are
    read from the ``高相似度候选_按base_word`` dict (lists of candidate dicts
    keyed by base word). Candidates without a name are ignored. When the same
    name occurs more than once, the occurrence with the highest similarity is
    kept.

    The similarity of a candidate is its ``相似度`` value, except that
    candidates whose ``候选词类型`` is ``'post'`` use ``点最高人设相似度`` when
    that value is available. A missing or ``null`` similarity is treated as
    0, so incomplete records land in ``unmatched`` instead of raising a
    ``TypeError`` during comparison.

    Args:
        data: Loaded data; only the ``candidate_words`` key is read.

    Returns:
        ``{'matched': [...], 'partial': [...], 'unmatched': [...]}`` where
        ``matched`` holds similarity >= 0.8, ``partial`` holds
        0.5 <= similarity < 0.8 and ``unmatched`` holds the rest. Each list is
        sorted by similarity in descending order.
    """
    print("\n提取全局候选词...")

    best_by_name: Dict[str, Dict[str, Any]] = {}

    for feature_data in data['candidate_words']:
        # `or {}` also covers an explicit JSON null for this key.
        candidates_by_base = feature_data.get('高相似度候选_按base_word') or {}
        for candidates in candidates_by_base.values():
            for cand in candidates or []:
                cand_name = cand.get('候选词', '')
                if not cand_name:
                    continue

                similarity = cand.get('相似度')
                # Post candidates prefer their dedicated score when present.
                if cand.get('候选词类型') == 'post':
                    post_similarity = cand.get('点最高人设相似度')
                    if post_similarity is not None:
                        similarity = post_similarity
                if similarity is None:
                    similarity = 0

                current = best_by_name.get(cand_name)
                if current is None or similarity > current['相似度']:
                    best_by_name[cand_name] = {
                        '名称': cand_name,
                        '类型': cand.get('候选词类型', 'unknown'),
                        '相似度': similarity,
                        '特征类型': cand.get('特征类型', ''),
                        '来源路径': cand.get('来源路径', ''),
                        '匹配说明': cand.get('匹配说明', ''),
                    }

    result: Dict[str, List[Dict[str, Any]]] = {
        'matched': [],    # >= 0.8
        'partial': [],    # 0.5 ~ 0.8
        'unmatched': [],  # < 0.5
    }
    for cand in best_by_name.values():
        similarity = cand['相似度']
        if similarity >= 0.8:
            result['matched'].append(cand)
        elif similarity >= 0.5:
            result['partial'].append(cand)
        else:
            result['unmatched'].append(cand)

    # Highest similarity first within each bucket.
    for bucket in result.values():
        bucket.sort(key=lambda item: item['相似度'], reverse=True)

    print(f" ✓ 已匹配: {len(result['matched'])} 个")
    print(f" ✓ 部分匹配: {len(result['partial'])} 个")
    print(f" ✓ 不匹配: {len(result['unmatched'])} 个")
    return result
def render_left_candidates_html(global_candidates: Dict[str, List[Dict[str, Any]]]) -> str:
    """
    Render the fixed left-hand candidate panel.

    The panel has three sections, emitted in the order matched, partial,
    unmatched. Each section starts with a header showing the number of
    candidates in it, followed by one entry per candidate (icon, name, type
    label and similarity formatted with two decimals).

    Args:
        global_candidates: Candidates grouped under the keys ``matched``,
            ``partial`` and ``unmatched``; each candidate provides ``名称``,
            ``类型`` and ``相似度``.

    Returns:
        The rendered panel as a string.
    """
    # (bucket key, header template) in display order.
    section_specs = (
        ('matched', '\n✅ 已匹配 ({count})\n与人设相似度 ≥ 0.8\n'),
        ('partial', '\n🟡 部分匹配 ({count})\n与人设特征相似度 0.5-0.8\n'),
        ('unmatched', '\n❌ 不匹配 ({count})\n与人设特征相似度 < 0.5\n'),
    )

    chunks = ['\n']
    for bucket_key, header_template in section_specs:
        bucket = global_candidates[bucket_key]
        chunks.append(header_template.format(count=len(bucket)))
        for entry in bucket:
            if entry['类型'] == 'post':
                icon, type_label = '📝', '帖子'
            else:
                icon, type_label = '👤', '人设'
            chunks.append(
                '\n{}\n{}\n{}\n{:.2f}\n'.format(
                    icon, entry['名称'], type_label, entry['相似度']
                )
            )
        chunks.append('\n')
    return ''.join(chunks)
def render_cascade_flow_html(data: Dict[str, Any]) -> str:
    """
    Render the middle cascade-flow panel.

    Only the first entry of ``data['evaluated_results']`` is rendered (via
    ``render_single_cascade`` with feature index 0); when the list is empty
    the panel contains just its opening and closing fragments.

    Args:
        data: Loaded data; ``evaluated_results`` is read here and the whole
            dict is forwarded to ``render_single_cascade``.

    Returns:
        The rendered panel as a string.
    """
    evaluated = data['evaluated_results']

    pieces = ['\n']
    if evaluated:
        # Default view: the cascade of the first feature.
        pieces.append(render_single_cascade(evaluated[0], 0, data))
    pieces.append('\n')
    return ''.join(pieces)
def render_single_cascade(feature_data: Dict[str, Any], feature_idx: int, data: Dict[str, Any]) -> str:
    """
    Render the cascade flow for a single feature.

    The output is built in three layers: the original feature, up to three
    top matches (rank, name, similarity with two decimals), and - only when
    the feature has at least one search-word group - the search words of the
    first group.

    Args:
        feature_data: One feature record; the keys ``原始特征名称``,
            ``top3匹配信息`` and ``组合评估结果_分组`` are read, each with an
            empty default.
        feature_idx: Index of the feature, forwarded to
            ``render_search_words_group``.
        data: The full loaded data. It is not read by the code shown here.

    Returns:
        The rendered cascade as a string.
    """
    html_parts = []
    original_feature = feature_data.get('原始特征名称', '')
    top3_matches = feature_data.get('top3匹配信息', [])
    groups = feature_data.get('组合评估结果_分组', [])
    # Layer 1: the original feature.
    # NOTE(review): `original_feature` is not referenced by this template as
    # it appears here - presumably the fragment is meant to show it; confirm.
    html_parts.append(f'''
''')
    # Layer 2: top-3 persona features.
    html_parts.append('''
↓
🎯 Top1各 相似度(x)
''')
    # Ranks are 1-based; at most three matches are shown.
    for idx, match in enumerate(top3_matches[:3], 1):
        base_word = match.get('人设特征名称', '')
        similarity = match.get('相似度', 0)
        is_top1 = (idx == 1)
        # NOTE(review): `card_class` is computed but not used by the template
        # below as it appears here - confirm against the intended markup.
        card_class = 'top3-card top1-card' if is_top1 else 'top3-card'
        html_parts.append(f'''
Top{idx}
{base_word}
相似度: {similarity:.2f}
''')
    html_parts.append('''
''')
    # Layer 3: search words (the first group, i.e. Top1, is shown by default).
    if groups:
        html_parts.append('''
↓
🔍 搜索词生成
''')
        # Show the first group (Top1) by default.
        html_parts.append(render_search_words_group(groups[0], feature_idx, 0))
        html_parts.append('''
''')
    return ''.join(html_parts)
def render_search_words_group(group: Dict[str, Any], feature_idx: int, group_idx: int) -> str:
    """
    Render one group of search words.

    The output starts with a line naming the group's base word, followed by
    one card per entry of ``top10_searches`` (each produced by
    ``render_search_word_card``).

    Args:
        group: Group record; ``base_word``, ``top10_searches`` and
            ``available_words`` are read, each with an empty default.
        feature_idx: Index of the owning feature, forwarded to each card.
        group_idx: Index of this group, forwarded to each card.

    Returns:
        The rendered group as a string.
    """
    center_word = group.get('base_word', '')
    search_items = group.get('top10_searches', [])
    word_pool = group.get('available_words', [])

    header = f'\n中心词: {center_word}\n'
    cards = [
        render_search_word_card(item, feature_idx, group_idx, position, word_pool)
        for position, item in enumerate(search_items)
    ]
    return ''.join([header, *cards, '\n'])
def render_search_word_card(search: Dict[str, Any], feature_idx: int, group_idx: int, sw_idx: int, available_words: List) -> str:
    """
    Render the card for a single search word.

    Args:
        search: Search-word record; ``search_word``, ``score``, ``reasoning``
            and ``search_result`` are read.
        feature_idx: Index of the owning feature.
        group_idx: Index of the owning group.
        sw_idx: Index of this search word within its group.
        available_words: Candidate words available to the group; items may
            be dicts (their ``候选词`` value is used) or plain values.

    Returns:
        The rendered card as a string.

    NOTE(review): none of the values computed below (nor the three index
    arguments) are referenced by the template as it appears here -
    presumably the card markup is meant to interpolate them; confirm.
    """
    search_word = search.get('search_word', '')
    score = search.get('score', 0)
    reasoning = search.get('reasoning', '')
    # A search counts as executed when a `search_result` value is present.
    has_result = search.get('search_result') is not None
    # Status indicators derived from whether the search has been executed.
    status_icon = '✅' if has_result else '⏸️'
    status_text = '已搜索' if has_result else '未搜索'
    status_class = 'searched' if has_result else 'not-searched'
    # Candidate words to display (at most the first 10).
    cand_names = [w.get('候选词', '') if isinstance(w, dict) else w for w in available_words[:10]]
    cand_display = ', '.join(cand_names) if cand_names else '无'
    html = f'''
'''
    return html
def generate_html(data: Dict[str, Any], global_candidates: Dict[str, List[Dict[str, Any]]]) -> str:
    """
    Build the complete page text.

    Args:
        data: Loaded data; ``evaluated_results``, ``stage7_mapping`` and
            ``stage8_mapping`` are serialised here, and the whole dict is
            passed to ``render_cascade_flow_html``.
        global_candidates: Bucketed candidates, passed to
            ``render_left_candidates_html``.

    Returns:
        The complete page as a string.
    """
    print("\n正在生成HTML...")
    # Serialise the data as JSON (non-ASCII characters are kept as-is).
    # NOTE(review): data_json / stage7_json / stage8_json are not referenced
    # by the template as it appears here - presumably they are meant to be
    # embedded for the page script; confirm.
    data_json = json.dumps(data['evaluated_results'], ensure_ascii=False)
    stage7_json = json.dumps(data['stage7_mapping'], ensure_ascii=False)
    stage8_json = json.dumps(data['stage8_mapping'], ensure_ascii=False)
    # Render the individual page sections.
    left_html = render_left_candidates_html(global_candidates)
    cascade_html = render_cascade_flow_html(data)
    # Assemble the full page.
    html_template = f'''
级联搜索结果可视化
{left_html}
{cascade_html}
'''
    print(" ✓ HTML生成完成")
    return html_template
def get_css_styles() -> str:
    """
    Return the page's CSS rules as a single string.

    The rules cover, in order: global reset and body, page header, the main
    flex layout, the left candidates panel, the middle cascade-flow panel
    (feature selector, top-3 cards, arrows, search-word cards), the right
    results panel, and the feature-selector modal.
    """
    return '''
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
background: #f5f7fa;
color: #333;
overflow-x: hidden;
}
/* 页面头部 */
.page-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
text-align: center;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.header-title {
font-size: 24px;
font-weight: bold;
margin-bottom: 5px;
}
.header-subtitle {
font-size: 14px;
opacity: 0.9;
}
/* 主布局 - 三栏 */
.main-layout {
display: flex;
gap: 20px;
padding: 20px;
height: calc(100vh - 100px);
}
/* 左侧候选词面板 - 固定 */
.left-candidates-panel {
width: 280px;
background: white;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
position: sticky;
top: 20px;
height: fit-content;
max-height: calc(100vh - 140px);
display: flex;
flex-direction: column;
}
.candidates-header {
padding: 15px;
border-bottom: 2px solid #e5e7eb;
}
.candidates-title {
font-size: 16px;
font-weight: 600;
color: #374151;
margin-bottom: 5px;
}
.candidates-hint {
font-size: 11px;
color: #ef4444;
font-weight: 500;
}
.candidates-content {
flex: 1;
overflow-y: auto;
padding: 10px;
}
.candidates-section {
margin-bottom: 15px;
}
.section-title {
font-size: 13px;
font-weight: 600;
margin-bottom: 5px;
display: flex;
align-items: center;
gap: 5px;
}
.section-count {
color: #6b7280;
font-size: 12px;
}
.section-hint {
font-size: 11px;
color: #6b7280;
margin-bottom: 8px;
}
.candidates-list {
display: flex;
flex-direction: column;
gap: 6px;
}
.candidate-item {
display: flex;
align-items: center;
gap: 8px;
padding: 8px;
border-radius: 6px;
border: 1px solid #e5e7eb;
transition: all 0.2s;
cursor: pointer;
}
.candidate-item:hover {
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
transform: translateY(-1px);
}
.candidate-item.matched {
background: #f0fdf4;
border-color: #86efac;
}
.candidate-item.partial {
background: #fffbeb;
border-color: #fcd34d;
}
.candidate-item.unmatched {
background: #fef2f2;
border-color: #fca5a5;
}
.candidate-icon {
font-size: 18px;
flex-shrink: 0;
}
.candidate-info {
flex: 1;
min-width: 0;
}
.candidate-name {
font-size: 12px;
font-weight: 500;
color: #374151;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.candidate-meta {
display: flex;
justify-content: space-between;
align-items: center;
margin-top: 2px;
}
.candidate-type {
font-size: 10px;
color: #6b7280;
}
.candidate-similarity {
font-size: 10px;
font-weight: 600;
color: #10b981;
}
/* 中间级联流程面板 */
.cascade-flow-panel {
flex: 1;
background: white;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow-y: auto;
padding: 20px;
}
.cascade-header {
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #e5e7eb;
}
.cascade-title {
font-size: 18px;
font-weight: 600;
color: #374151;
}
.cascade-content {
display: flex;
flex-direction: column;
gap: 15px;
}
.cascade-layer {
background: #f9fafb;
border-radius: 8px;
padding: 15px;
}
.layer-title {
font-size: 14px;
font-weight: 600;
color: #6b7280;
margin-bottom: 10px;
}
/* 层级1: 特征选择器 */
.selected-feature {
display: flex;
justify-content: space-between;
align-items: center;
padding: 12px;
background: white;
border-radius: 6px;
border: 2px solid #667eea;
}
.feature-name {
font-size: 15px;
font-weight: 600;
color: #374151;
}
.switch-feature-btn {
padding: 6px 12px;
background: #667eea;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 12px;
transition: all 0.2s;
}
.switch-feature-btn:hover {
background: #5568d3;
}
/* 层级2: Top3卡片 */
.top3-container {
display: flex;
gap: 10px;
}
.top3-card {
flex: 1;
padding: 12px;
background: white;
border-radius: 6px;
border: 2px solid #e5e7eb;
cursor: pointer;
transition: all 0.2s;
}
.top3-card:hover {
border-color: #667eea;
box-shadow: 0 2px 6px rgba(102, 126, 234, 0.2);
}
.top3-card.top1-card {
border-color: #10b981;
background: #f0fdf4;
}
.top3-card.top1-card:hover {
border-color: #059669;
}
.top3-card.selected {
border-color: #667eea;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
}
.top3-rank {
font-size: 11px;
font-weight: 600;
color: #6b7280;
margin-bottom: 4px;
}
.top3-name {
font-size: 14px;
font-weight: 600;
color: #374151;
margin-bottom: 4px;
}
.top3-similarity {
font-size: 12px;
color: #10b981;
}
/* 级联箭头 */
.cascade-arrow {
text-align: center;
font-size: 24px;
color: #667eea;
margin: 5px 0;
}
/* 层级3: 搜索词 */
.base-word-label {
font-size: 13px;
color: #6b7280;
margin-bottom: 12px;
}
.base-word-value {
font-weight: 600;
color: #10b981;
}
.search-word-card {
background: white;
border-radius: 8px;
border: 2px solid #e5e7eb;
padding: 15px;
margin-bottom: 12px;
cursor: pointer;
transition: all 0.2s;
}
.search-word-card:hover {
border-color: #667eea;
box-shadow: 0 2px 6px rgba(0,0,0,0.1);
}
.search-word-card.searched {
border-color: #10b981;
}
.search-word-card.selected {
border-color: #667eea;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
}
.sw-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.sw-status {
font-size: 12px;
font-weight: 600;
color: #10b981;
}
.sw-rank {
font-size: 11px;
color: #6b7280;
}
.sw-candidates-pool {
margin-bottom: 10px;
}
.sw-label {
font-size: 11px;
color: #6b7280;
margin-bottom: 4px;
}
.sw-candidates {
font-size: 12px;
color: #374151;
background: #f9fafb;
padding: 6px;
border-radius: 4px;
}
.sw-arrow-container {
text-align: center;
margin: 10px 0;
}
.sw-arrow {
display: inline-flex;
align-items: center;
gap: 8px;
}
.arrow-line {
font-size: 20px;
color: #667eea;
}
.arrow-score {
font-size: 12px;
font-weight: 600;
color: #667eea;
background: #ede9fe;
padding: 2px 8px;
border-radius: 4px;
}
.sw-result {
text-align: center;
margin-bottom: 10px;
}
.sw-query {
font-size: 16px;
font-weight: 600;
color: #374151;
background: #f0fdf4;
padding: 8px;
border-radius: 6px;
border: 1px solid #86efac;
}
.sw-reasoning {
background: #fffbeb;
padding: 10px;
border-radius: 6px;
border: 1px solid #fcd34d;
}
.reasoning-label {
font-size: 12px;
font-weight: 600;
color: #374151;
margin-bottom: 4px;
}
.reasoning-content {
font-size: 12px;
color: #6b7280;
line-height: 1.5;
}
/* 右侧结果面板 */
.right-results-panel {
width: 500px;
background: white;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow-y: auto;
display: flex;
flex-direction: column;
}
.results-header {
padding: 15px;
border-bottom: 2px solid #e5e7eb;
}
.results-title {
font-size: 16px;
font-weight: 600;
color: #374151;
margin-bottom: 5px;
}
.results-subtitle {
font-size: 12px;
color: #6b7280;
}
.results-content {
flex: 1;
padding: 15px;
}
.empty-results {
text-align: center;
padding: 60px 20px;
}
.empty-icon {
font-size: 48px;
margin-bottom: 15px;
}
.empty-text {
font-size: 14px;
color: #6b7280;
}
/* Modal */
.modal-overlay {
display: none;
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: rgba(0,0,0,0.5);
z-index: 1000;
align-items: center;
justify-content: center;
}
.modal-overlay.active {
display: flex;
}
.modal-window {
background: white;
border-radius: 12px;
box-shadow: 0 10px 40px rgba(0,0,0,0.2);
max-width: 600px;
width: 90%;
max-height: 80vh;
display: flex;
flex-direction: column;
}
.modal-header {
padding: 20px;
border-bottom: 1px solid #e5e7eb;
display: flex;
justify-content: space-between;
align-items: center;
}
.modal-title {
font-size: 18px;
font-weight: 600;
color: #374151;
}
.modal-close-btn {
background: none;
border: none;
font-size: 28px;
color: #6b7280;
cursor: pointer;
padding: 0;
width: 32px;
height: 32px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 4px;
}
.modal-close-btn:hover {
background: #f3f4f6;
}
.modal-body {
flex: 1;
overflow-y: auto;
padding: 20px;
}
.feature-list {
display: flex;
flex-direction: column;
gap: 10px;
}
.feature-list-item {
padding: 12px;
background: #f9fafb;
border-radius: 6px;
border: 2px solid #e5e7eb;
cursor: pointer;
transition: all 0.2s;
}
.feature-list-item:hover {
border-color: #667eea;
background: white;
}
.feature-list-item.active {
border-color: #10b981;
background: #f0fdf4;
}
'''
def get_javascript_code() -> str:
    """
    Return the page's JavaScript source as a single string.

    The script defines the handlers ``showFeatureSelector``,
    ``closeFeatureSelector``, ``renderFeatureList``, ``selectFeature``,
    ``updateCascadeView``, ``selectBaseWord``, ``selectSearchWord`` and
    ``renderSearchResults``, and calls ``renderFeatureList`` on
    ``DOMContentLoaded``.

    It reads and writes the variables ``allData``, ``currentFeatureIdx``,
    ``currentGroupIdx`` and ``currentSwIdx`` without declaring them, and
    looks up the element ids ``featureSelectorModal``, ``featureList``,
    ``cascadeContent``, ``resultsContent`` and ``resultsSubtitle``; the page
    embedding this script must provide all of these.

    NOTE(review): ``updateCascadeView`` reloads the page after
    ``selectFeature`` changes ``currentFeatureIdx``; whether the selection
    survives the reload depends on how the page initialises that variable -
    confirm.
    """
    return '''
// 初始化
document.addEventListener('DOMContentLoaded', function() {
console.log('页面加载完成');
renderFeatureList();
});
// 显示特征选择器
function showFeatureSelector() {
const modal = document.getElementById('featureSelectorModal');
modal.classList.add('active');
}
// 关闭特征选择器
function closeFeatureSelector() {
const modal = document.getElementById('featureSelectorModal');
modal.classList.remove('active');
}
// 渲染特征列表
function renderFeatureList() {
const listEl = document.getElementById('featureList');
let html = '';
allData.forEach((feature, idx) => {
const name = feature['原始特征名称'];
const isActive = idx === currentFeatureIdx;
const activeClass = isActive ? 'active' : '';
html += `
${name}
`;
});
listEl.innerHTML = html;
}
// 选择特征
function selectFeature(featureIdx) {
currentFeatureIdx = featureIdx;
currentGroupIdx = 0;
currentSwIdx = 0;
closeFeatureSelector();
updateCascadeView();
renderFeatureList();
}
// 更新级联视图
function updateCascadeView() {
const feature = allData[currentFeatureIdx];
const cascadeContent = document.getElementById('cascadeContent');
// 重新渲染级联流程(这里简化处理,实际应该用JavaScript动态更新)
location.reload(); // 简化版:重新加载页面
}
// 选择base_word
function selectBaseWord(featureIdx, matchIdx) {
currentFeatureIdx = featureIdx;
currentGroupIdx = matchIdx;
currentSwIdx = 0;
// 移除所有选中状态
document.querySelectorAll('.top3-card').forEach(card => {
card.classList.remove('selected');
});
// 添加选中状态
event.target.closest('.top3-card').classList.add('selected');
// 更新搜索词显示
const feature = allData[currentFeatureIdx];
const groups = feature['组合评估结果_分组'] || [];
if (groups[currentGroupIdx]) {
// TODO: 更新搜索词列表显示
console.log('切换到group:', currentGroupIdx);
}
}
// 选择搜索词
function selectSearchWord(featureIdx, groupIdx, swIdx) {
currentFeatureIdx = featureIdx;
currentGroupIdx = groupIdx;
currentSwIdx = swIdx;
// 移除所有搜索词的选中状态
document.querySelectorAll('.search-word-card').forEach(card => {
card.classList.remove('selected');
});
// 添加选中状态
event.target.closest('.search-word-card').classList.add('selected');
// 显示搜索结果
renderSearchResults(featureIdx, groupIdx, swIdx);
}
// 渲染搜索结果
function renderSearchResults(featureIdx, groupIdx, swIdx) {
const feature = allData[featureIdx];
const groups = feature['组合评估结果_分组'] || [];
const group = groups[groupIdx];
if (!group) return;
const searches = group['top10_searches'] || [];
const search = searches[swIdx];
if (!search) return;
const searchWord = search['search_word'] || '';
const searchResult = search['search_result'];
const resultsContent = document.getElementById('resultsContent');
const resultsSubtitle = document.getElementById('resultsSubtitle');
resultsSubtitle.textContent = `搜索词: ${searchWord}`;
if (!searchResult) {
resultsContent.innerHTML = `
`;
return;
}
const notes = searchResult.data?.data || [];
if (notes.length === 0) {
resultsContent.innerHTML = `
`;
return;
}
// 渲染帖子卡片(简化版)
let html = '';
notes.forEach((note, idx) => {
const card = note.note_card || {};
const title = card.display_title || '无标题';
const image = (card.image_list || [])[0] || '';
html += `
${image ? `

` : ''}
${title}
`;
});
html += '
';
resultsContent.innerHTML = html;
}
'''
def main():
    """
    Run the whole pipeline: load data, build the page, save it and open it.

    The page is written to ``visualization/cascade_search_results.html``
    (relative to the current working directory; the directory is created if
    needed) and then opened with the default web browser.
    """
    # Local import keeps this change self-contained within the block.
    from pathlib import Path

    banner = "=" * 60
    print(banner)
    print("级联搜索结果可视化工具")
    print(banner)

    # Load the input data.
    data = load_all_data()

    # Collect and bucket the global candidate words.
    global_candidates = extract_global_candidates(data)

    # Build the page.
    html_content = generate_html(data, global_candidates)

    # Save the page to disk.
    output_path = "visualization/cascade_search_results.html"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"\n✓ HTML文件已保存: {output_path}")

    # Open the page in the browser. Path.as_uri() yields a well-formed file
    # URI on every platform (drive letters, backslashes) and percent-encodes
    # characters such as spaces, which plain 'file://' + path does not.
    abs_path = os.path.abspath(output_path)
    print("正在打开浏览器...")
    webbrowser.open(Path(abs_path).as_uri())

    print("\n" + banner)
    print("✅ 可视化生成完成!")
    print(banner)


if __name__ == '__main__':
    main()