How解构结果可视化

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Render "how" deconstruction results as one tabbed HTML page (V2).

Highlights of this version:
- one tab per post;
- a compact post card that opens a detail modal;
- layered, collapsible match results.

Match quality is expressed through similarity thresholds only; the older
relation-type colouring is no longer used.

NOTE(review): the HTML tags/attributes, the CSS and the JavaScript in this
module were rebuilt around the text nodes and the id/class variables that the
code computes. Wrapper element and class names that are not derived from those
variables are placeholders -- confirm them against the original templates.
"""

import html as html_module
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, List, Optional
from urllib.parse import urlsplit

# Make the project root importable when the script is run directly.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Similarity thresholds shared by every status decision in this module.
SAME_THRESHOLD = 0.9
SIMILAR_THRESHOLD = 0.8

STATUS_SAME = "相同"
STATUS_SIMILAR = "相似"
STATUS_UNRELATED = "无关"

# Upper bound on historical post cards attached to one match item.
MAX_HISTORICAL_POSTS = 5

_STATUS_SLUG = {
    STATUS_SAME: "same",
    STATUS_SIMILAR: "similar",
    STATUS_UNRELATED: "unrelated",
}
_STATUS_COLOR = {
    STATUS_SAME: "#10b981",       # green
    STATUS_SIMILAR: "#f59e0b",    # orange
    STATUS_UNRELATED: "#9ca3af",  # grey
}
_STATUS_RANGE_LABEL = {
    STATUS_SAME: "相同 (≥0.9)",
    STATUS_SIMILAR: "相似 (0.8-0.9)",
    STATUS_UNRELATED: "无关 (<0.8)",
}
_CONCLUSION_STYLE = {
    "找到": ("found", "✓"),
    "部分找到": ("partial", "~"),
}
_PERSONA_TYPES = ("灵感点", "关键点", "目的点")

_PAGE_STYLE = """
body { margin: 0; font-family: -apple-system, "PingFang SC", "Microsoft YaHei", sans-serif;
       background: #f3f4f6; color: #111827; }
h1 { margin: 0; padding: 16px 24px; font-size: 20px; background: #fff; }
.tabs { display: flex; flex-wrap: wrap; gap: 4px; padding: 8px 24px; background: #fff;
        border-bottom: 1px solid #e5e7eb; }
.tab-button { border: 0; padding: 8px 14px; border-radius: 6px; background: #e5e7eb; cursor: pointer; }
.tab-button.active { background: #3b82f6; color: #fff; }
.tab-content { display: none; padding: 16px 24px; }
.tab-content.active { display: block; }
.post-layout { display: flex; gap: 16px; align-items: flex-start; }
.toc-container { flex: 0 0 260px; position: sticky; top: 8px; background: #fff; border-radius: 8px;
                 padding: 12px; font-size: 13px; }
.toc-item { display: block; padding: 3px 0; color: inherit; text-decoration: none; }
.toc-level-2 { padding-left: 12px; }
.toc-level-3 { padding-left: 24px; }
.post-main { flex: 1; min-width: 0; }
.post-detail-card, .inspiration-card, .step-section, .match-results-section {
    background: #fff; border-radius: 8px; padding: 12px; margin-bottom: 12px; }
.post-detail-card, .historical-post-card, .match-item { cursor: pointer; }
.post-thumb, .historical-thumb { max-width: 120px; max-height: 120px; border-radius: 6px; }
.feature-tag, .stat-item, .match-type-badge, .match-label, .feature-match-status {
    display: inline-block; margin: 2px 4px 2px 0; padding: 2px 8px; border-radius: 10px;
    font-size: 12px; background: #f3f4f6; }
.feature-same, .status-same, .toc-feature-same { color: #047857; }
.feature-similar, .status-similar, .toc-feature-similar { color: #b45309; }
.feature-unrelated, .status-unrelated, .toc-feature-unrelated { color: #6b7280; }
.conclusion-found, .insp-conclusion-found { color: #047857; }
.conclusion-partial, .insp-conclusion-partial { color: #b45309; }
.conclusion-not-found, .insp-conclusion-not-found { color: #b91c1c; }
.match-item { padding: 6px 8px; margin: 4px 0; border-left: 3px solid #9ca3af; background: #f9fafb; }
.match-label { color: #fff; }
.section-header { cursor: pointer; font-weight: 600; }
.collapsed > .section-body { display: none; }
.historical-post-card { border: 1px solid #e5e7eb; border-radius: 6px; padding: 8px; margin: 6px 0; }
.modal-overlay { display: none; position: fixed; top: 0; right: 0; bottom: 0; left: 0;
                 background: rgba(0, 0, 0, 0.5); }
.modal-overlay.open { display: block; }
.modal-box { max-width: 760px; max-height: 85vh; overflow: auto; margin: 5vh auto; padding: 20px;
             background: #fff; border-radius: 10px; }
.modal-image { max-width: 100%; margin: 6px 0; }
.modal-text { white-space: pre-wrap; font-family: inherit; }
"""

_PAGE_SCRIPT = """
function switchTab(index) {
    document.querySelectorAll('.tab-button').forEach(function (btn, i) {
        btn.classList.toggle('active', i === index);
    });
    document.querySelectorAll('.tab-content').forEach(function (pane, i) {
        pane.classList.toggle('active', i === index);
    });
}
function toggleCollapse(header) {
    header.parentElement.classList.toggle('collapsed');
}
function closeModal() {
    document.getElementById('modal-overlay').classList.remove('open');
}
function openModal(node) {
    var body = document.getElementById('modal-body');
    body.innerHTML = '';
    body.appendChild(node);
    document.getElementById('modal-overlay').classList.add('open');
}
function addText(parent, tag, text, className) {
    var el = document.createElement(tag);
    if (className) { el.className = className; }
    el.textContent = text;
    parent.appendChild(el);
    return el;
}
function showPostModal(card) {
    var post = JSON.parse(card.getAttribute('data-post'));
    var box = document.createElement('div');
    addText(box, 'h2', post.title);
    addText(box, 'div', '👤 ' + post.author + ' 📅 ' + post.publish_time, 'modal-meta');
    (post.images || []).forEach(function (url) {
        var img = document.createElement('img');
        img.src = url;
        img.className = 'modal-image';
        box.appendChild(img);
    });
    addText(box, 'pre', post.body_text, 'modal-text');
    if (post.link && post.link !== '#') {
        var link = addText(box, 'a', '查看原帖 →');
        link.href = post.link;
        link.target = '_blank';
        link.rel = 'noopener noreferrer';
    }
    openModal(box);
}
function showMatchModal(item) {
    var box = document.createElement('div');
    addText(box, 'h2', item.getAttribute('data-persona'));
    addText(box, 'div', '相似度: ' + item.getAttribute('data-similarity') + ' '
        + item.getAttribute('data-status'), 'modal-meta');
    var categories = item.getAttribute('data-persona-categories');
    if (categories) { addText(box, 'div', '[' + categories + ']', 'modal-meta'); }
    addText(box, 'p', item.getAttribute('data-explanation'), 'modal-text');
    var historical = item.getAttribute('data-historical');
    if (historical) {
        addText(box, 'h3', '历史帖子来源');
        var list = document.createElement('div');
        list.className = 'historical-posts';
        // Markup produced by this script; every dynamic value in it is already escaped.
        list.innerHTML = historical;
        box.appendChild(list);
    }
    openModal(box);
}
"""


# --------------------------------------------------------------------------- #
# Small shared helpers
# --------------------------------------------------------------------------- #

def _esc(value) -> str:
    """Return *value* as text that is safe in HTML content and attributes."""
    return html_module.escape(str(value), quote=True)


def _safe_url(url, fallback: str = "#") -> str:
    """Return *url* unless it carries a scheme other than http/https."""
    text = str(url or "").strip()
    try:
        scheme = urlsplit(text).scheme.lower()
    except ValueError:
        return fallback
    return text if scheme in ("", "http", "https") else fallback


def _truncate(text: str, limit: int) -> str:
    """Cut *text* to *limit* characters, appending "..." when it was longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _similarity_of(match: Dict) -> float:
    """Read the similarity of one match record, treating bad values as 0."""
    raw = (match.get("匹配结果") or {}).get("相似度", 0)
    try:
        return float(raw)
    except (TypeError, ValueError):
        return 0.0


def _status_for_similarity(similarity: float) -> str:
    """Map a similarity score to 相同 / 相似 / 无关 using the module thresholds."""
    if similarity >= SAME_THRESHOLD:
        return STATUS_SAME
    if similarity >= SIMILAR_THRESHOLD:
        return STATUS_SIMILAR
    return STATUS_UNRELATED


def _feature_status(match_results: Iterable[Dict]) -> str:
    """Status of a feature, decided by its best match (no matches -> 无关)."""
    best = max((_similarity_of(m) for m in match_results), default=0.0)
    return _status_for_similarity(best)


def _normalise_status(status) -> str:
    """Treat any unknown status value as 无关."""
    return status if status in _STATUS_SLUG else STATUS_UNRELATED


def _conclusion_for(statuses: Iterable[str]) -> str:
    """Summarise feature statuses as 找到 / 部分找到 / 都找不到.

    An empty collection yields "找到", exactly as the inline logic this helper
    replaces did (there is no unrelated feature to report).
    """
    seen = set(statuses)
    if STATUS_UNRELATED not in seen:
        return "找到"
    if STATUS_SAME in seen or STATUS_SIMILAR in seen:
        return "部分找到"
    return "都找不到"


def _conclusion_style(conclusion: str):
    """Return ``(css_slug, icon)`` for a conclusion label."""
    return _CONCLUSION_STYLE.get(conclusion, ("not-found", "✗"))


def _dom_prefix(post_idx: int, insp_idx: int, feature_idx: int, step_idx: int = 0) -> str:
    """Build the id prefix of a feature; step 0 keeps the historical format."""
    prefix = f"post-{post_idx}-insp-{insp_idx}-feat-{feature_idx}"
    return f"{prefix}-step-{step_idx}" if step_idx else prefix


def _post_fields(post_detail: Dict, default_link: str) -> Dict:
    """Collect the post fields shown on cards and sent to the detail modal."""
    images = [_safe_url(url, "") for url in (post_detail.get("images") or [])]
    return {
        "title": post_detail.get("title") or "无标题",
        "body_text": str(post_detail.get("body_text") or ""),
        "images": [url for url in images if url],
        "like_count": post_detail.get("like_count", 0),
        "comment_count": post_detail.get("comment_count", 0),
        "collect_count": post_detail.get("collect_count", 0),
        "author": post_detail.get("channel_account_name") or "",
        "publish_time": post_detail.get("publish_time") or "",
        "link": _safe_url(post_detail.get("link", default_link), default_link),
    }


def _payload_attr(fields: Dict) -> str:
    """Serialise *fields* as JSON that can sit inside an HTML attribute."""
    return _esc(json.dumps(fields, ensure_ascii=False))


def _load_json_file(path: Path, description: str):
    """Load a JSON file; print a warning and return None when it is unusable."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:  # missing/unreadable file or invalid JSON
        print(f"警告: 无法加载{description}: {e}")
        return None


# --------------------------------------------------------------------------- #
# Post cards
# --------------------------------------------------------------------------- #

def generate_historical_post_card_html(post_detail: Dict, inspiration_point: Dict) -> str:
    """Build the compact card for a historical post that produced a feature.

    Args:
        post_detail: post fields (title, body_text, images, counts, link, ...).
        inspiration_point: source record; "点的名称" and "点的描述" are shown.

    Returns:
        HTML for one card. The full post is embedded in ``data-post`` so the
        page script can open a detail modal.
    """
    post = _post_fields(post_detail, default_link="#")
    images = post["images"]
    point_name = str(inspiration_point.get("点的名称") or "")
    point_desc = str(inspiration_point.get("点的描述") or "")

    thumbnail_html = ""
    if images:
        thumbnail_html = (
            f'<img class="historical-thumb" src="{_esc(images[0])}" alt="" loading="lazy">'
        )
    image_count_html = ""
    if len(images) > 1:
        image_count_html = f'<span class="image-count">{len(images)}</span>'

    return f'''
<div class="historical-post-card" data-post="{_payload_attr(post)}" onclick="showPostModal(this)">
    <div class="historical-thumb-box">{thumbnail_html} {image_count_html}</div>
    <div class="historical-title">{_esc(post["title"])}</div>
    <div class="historical-point"><span class="point-badge">灵感点</span> {_esc(point_name)}</div>
    <div class="historical-point-desc">{_esc(_truncate(point_desc, 100))}</div>
    <div class="historical-meta">📅 {_esc(post["publish_time"])}</div>
    <div class="historical-stats">
        <span>❤ {_esc(post["like_count"])}</span>
        <span>⭐ {_esc(post["collect_count"])}</span>
        <a href="{_esc(post["link"])}" target="_blank" rel="noopener noreferrer"
           onclick="event.stopPropagation()">查看原帖 →</a>
    </div>
</div>
'''


def generate_post_detail_html(post_data: Dict, post_idx: int) -> str:
    """Build the compact, clickable card for the post under analysis.

    Args:
        post_data: one post record; its "帖子详情" entry holds the post fields.
        post_idx: position of the post on the page, used for the anchor id.

    Returns:
        HTML for the card, anchored as ``post-<post_idx>-detail``.
    """
    post = _post_fields(post_data.get("帖子详情") or {}, default_link="")
    post_id = post_data.get("帖子id", f"post-{post_idx}")
    images = post["images"]

    if images:
        # The first image doubles as the thumbnail.
        thumbnail_html = (
            f'<img class="post-thumb" src="{_esc(images[0])}" alt="" loading="lazy">'
        )
    else:
        thumbnail_html = '<div class="post-thumb-placeholder">📄</div>'
    image_count_html = ""
    if len(images) > 1:
        image_count_html = f'<span class="image-count">📷 {len(images)}</span>'

    body_preview = _truncate(post["body_text"], 80)
    body_html = _esc(body_preview) if body_preview else "暂无正文"

    return f'''
<div class="post-detail-card" id="post-{post_idx}-detail" data-post-id="{_esc(post_id)}"
     data-post="{_payload_attr(post)}" onclick="showPostModal(this)">
    <div class="post-thumb-box">{thumbnail_html} {image_count_html}</div>
    <div class="post-title">{_esc(post["title"])}</div>
    <div class="post-preview">{body_html}</div>
    <div class="post-meta">👤 {_esc(post["author"])} 📅 {_esc(post["publish_time"])}</div>
    <div class="post-stats">
        <span>👍 {_esc(post["like_count"])}</span>
        <span>💬 {_esc(post["comment_count"] or 0)}</span>
        <span>⭐ {_esc(post["collect_count"] or 0)}</span>
    </div>
</div>
'''


def generate_inspiration_detail_html(inspiration_point: Dict,
                                     feature_status_map: Optional[Dict[str, str]] = None) -> str:
    """Build the detail card of one inspiration point.

    Args:
        inspiration_point: record with "名称", "描述" and "特征列表"; a feature
            is either a plain string or a dict with "特征名称" / "权重".
        feature_status_map: feature name -> "相同" | "相似" | "无关"; features
            that are missing from the map count as "无关".

    Returns:
        HTML showing name, conclusion, description and the feature tags.
    """
    name = inspiration_point.get("名称") or ""
    desc = inspiration_point.get("描述") or ""
    features = inspiration_point.get("特征列表") or []
    status_by_feature = feature_status_map or {}

    statuses = []
    tags = []
    for feature in features:
        if isinstance(feature, dict):
            feature_name = feature.get("特征名称", "")
            weight = feature.get("权重", 1.0)
        else:
            feature_name, weight = feature, 1.0
        status = _normalise_status(status_by_feature.get(feature_name, STATUS_UNRELATED))
        statuses.append(status)
        tags.append(
            f'<span class="feature-tag feature-{_STATUS_SLUG[status]}">'
            f'<span class="feature-status-label">{status}</span> '
            f'{_esc(feature_name)} '
            f'<span class="feature-weight">({_esc(weight)})</span>'
            f'</span>'
        )

    conclusion = _conclusion_for(statuses)
    conclusion_slug, _icon = _conclusion_style(conclusion)

    return f'''
<div class="inspiration-card">
    <div class="inspiration-header">
        <span class="inspiration-badge">灵感点</span>
        <span class="inspiration-name">{_esc(name)}</span>
        <span class="insp-conclusion insp-conclusion-{conclusion_slug}">{conclusion}</span>
    </div>
    <div class="inspiration-label">描述:</div>
    <div class="inspiration-desc">{_esc(desc)}</div>
    <div class="inspiration-label">特征列表:</div>
    <div class="inspiration-features">{"".join(tags)}</div>
</div>
'''


# --------------------------------------------------------------------------- #
# Mapping files
# --------------------------------------------------------------------------- #

def load_feature_category_mapping() -> Dict:
    """Load the feature-name -> category mapping; return {} when unavailable."""
    script_dir = Path(__file__).parent
    mapping_file = script_dir.parent.parent / "data" / "data_1118" / "特征名称_分类映射.json"
    data = _load_json_file(mapping_file, "特征分类映射文件")
    return data if isinstance(data, dict) else {}


def load_feature_source_mapping() -> Dict:
    """Load the feature-name -> source-post mapping; return {} when unavailable.

    The file groups records by point type; the result is flattened to
    ``{feature name: [source records]}`` for direct lookup.
    """
    script_dir = Path(__file__).parent
    mapping_file = script_dir.parent.parent / "data" / "data_1118" / "特征名称_帖子来源.json"
    data = _load_json_file(mapping_file, "特征来源映射文件")
    if not isinstance(data, dict):
        return {}

    result = {}
    for feature_type in _PERSONA_TYPES:
        for item in data.get(feature_type) or []:
            if not isinstance(item, dict):
                continue
            feature_name = item.get("特征名称")
            if feature_name:
                result[feature_name] = item.get("特征来源") or []
    return result


# --------------------------------------------------------------------------- #
# Match results
# --------------------------------------------------------------------------- #

def _persona_category_path(category_mapping: Optional[Dict], persona_name: str) -> str:
    """Return the "major/middle/minor" category path of a persona feature."""
    if not category_mapping or not persona_name:
        return ""
    for persona_type in _PERSONA_TYPES:
        type_mapping = category_mapping.get(persona_type) or {}
        if persona_name in type_mapping:
            entry = type_mapping[persona_name]
            categories = entry.get("所属分类", []) if isinstance(entry, dict) else []
            # Reversed before joining, as in the lookup this helper replaces.
            return "/".join(str(c) for c in reversed(categories or []))
    return ""


def _historical_cards_html(source_mapping: Optional[Dict], persona_name: str) -> str:
    """Render up to MAX_HISTORICAL_POSTS source posts of a persona feature."""
    if not source_mapping or not persona_name:
        return ""
    cards = []
    for source_item in (source_mapping.get(persona_name) or [])[:MAX_HISTORICAL_POSTS]:
        post_detail = source_item.get("帖子详情", {})
        if post_detail:
            cards.append(generate_historical_post_card_html(post_detail, source_item))
    return "".join(cards)


def generate_single_match_html(match: Dict, match_idx: int, post_idx: int, insp_idx: int,
                               feature_idx: int, category_mapping: Optional[Dict] = None,
                               source_mapping: Optional[Dict] = None, step_idx: int = 0) -> str:
    """Build the compact, clickable row for one match.

    Args:
        match: one match record ("人设特征名称", "特征类型", "特征分类", "匹配结果").
        match_idx: index of the match within its feature.
        post_idx: index of the post.
        insp_idx: index of the inspiration point.
        feature_idx: index of the feature.
        category_mapping: feature category mapping (optional).
        source_mapping: feature source mapping (optional).
        step_idx: index of the how-step; 0 keeps the id format without a step.

    Returns:
        HTML for the row. Explanation, categories and historical post cards
        travel in ``data-*`` attributes for the modal.
    """
    persona_name = match.get("人设特征名称") or ""
    feature_type = match.get("特征类型") or ""
    feature_categories = match.get("特征分类") or []
    explanation = (match.get("匹配结果") or {}).get("说明") or ""

    similarity = _similarity_of(match)
    label = _status_for_similarity(similarity)
    color = _STATUS_COLOR[label]
    match_id = f"{_dom_prefix(post_idx, insp_idx, feature_idx, step_idx)}-match-{match_idx}"

    type_badge_html = ""
    if feature_type:
        type_badge_html = f'<span class="match-type-badge">{_esc(feature_type)}</span>'

    match_categories = " / ".join(str(c) for c in feature_categories)
    persona_categories = _persona_category_path(category_mapping, persona_name)
    historical_posts_html = _historical_cards_html(source_mapping, persona_name)

    return f'''
<div class="match-item" id="{match_id}" style="border-left-color: {color}"
     data-persona="{_esc(persona_name)}" data-similarity="{similarity:.2f}" data-status="{label}"
     data-categories="{_esc(match_categories)}" data-persona-categories="{_esc(persona_categories)}"
     data-explanation="{_esc(explanation)}" data-historical="{_esc(historical_posts_html)}"
     onclick="showMatchModal(this)">
    {type_badge_html} <span class="match-name">{_esc(persona_name)}</span>
    <span class="match-similarity">相似度: {similarity:.2f}</span>
    <span class="match-label" style="background: {color}">{label}</span>
</div>
'''


def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_idx: int,
                                post_idx: int, category_mapping: Optional[Dict] = None,
                                source_mapping: Optional[Dict] = None, step_idx: int = 0) -> str:
    """Build the collapsible match-result section of one feature.

    Only ``how_steps[0]`` is read; callers rendering several steps pass each
    step as a one-element list together with its ``step_idx``.

    Args:
        how_steps: list whose first element holds the "特征列表".
        feature_idx: index of the feature inside that step.
        insp_idx: index of the inspiration point.
        post_idx: index of the post.
        category_mapping: feature category mapping (optional).
        source_mapping: feature source mapping (optional).
        step_idx: index of the step, used to keep DOM ids unique.

    Returns:
        The section HTML, or "" when there is no step or no such feature.
    """
    if not how_steps:
        return ""
    features = how_steps[0].get("特征列表") or []
    if feature_idx >= len(features):
        return ""

    feature_data = features[feature_idx]
    feature_name = feature_data.get("特征名称", "")
    feature_weight = feature_data.get("权重", 1.0)
    match_results = feature_data.get("匹配结果") or []
    category_mapping = category_mapping or {}
    dom_prefix = _dom_prefix(post_idx, insp_idx, feature_idx, step_idx)

    ranked = sorted(match_results, key=_similarity_of, reverse=True)
    status = _feature_status(match_results)
    found_status_html = (
        f'<span class="feature-match-status status-{_STATUS_SLUG[status]}">{status}</span>'
    )

    # Similarity distribution; empty buckets are not shown.
    counts = Counter(_status_for_similarity(_similarity_of(m)) for m in match_results)
    stats_html = "".join(
        f'<span class="stat-item" style="color: {_STATUS_COLOR[bucket]}">'
        f'{_esc(_STATUS_RANGE_LABEL[bucket])}: {counts[bucket]}</span>'
        for bucket in (STATUS_SAME, STATUS_SIMILAR, STATUS_UNRELATED)
        if counts[bucket] > 0
    )

    # Matches are displayed per feature type; other types are counted above
    # but not listed, as before.
    grouped = {"标签": [], "分类": []}
    for rank, match in enumerate(ranked):
        bucket = grouped.get(match.get("特征类型", ""))
        if bucket is not None:
            bucket.append((rank, match))

    group_sections = []
    for key, slug, title in (("标签", "label", "匹配标签"), ("分类", "category", "匹配分类")):
        entries = grouped[key]
        if not entries:
            continue
        items_html = "".join(
            generate_single_match_html(match, rank, post_idx, insp_idx, feature_idx,
                                       category_mapping, source_mapping, step_idx)
            for rank, match in entries
        )
        group_sections.append(f'''
<div class="match-group" id="{dom_prefix}-group-{slug}">
    <div class="section-header match-group-header" onclick="toggleCollapse(this)">
        <span class="toggle-icon">▼</span>
        <span class="match-group-title">{title} ({len(entries)})</span>
    </div>
    <div class="section-body match-group-body">{items_html}</div>
</div>
''')
    matches_html = "".join(group_sections)

    return f'''
<div class="match-results-section" id="{dom_prefix}-section">
    <div class="section-header match-results-header" onclick="toggleCollapse(this)">
        <span class="toggle-icon">▼</span>
        <span class="match-results-title">匹配结果: {_esc(feature_name)} (权重: {_esc(feature_weight)})</span>
        {found_status_html}
    </div>
    <div class="section-body">
        <div class="match-stats">{stats_html}</div>
        <div class="match-groups">{matches_html}</div>
    </div>
</div>
'''


# --------------------------------------------------------------------------- #
# Page assembly
# --------------------------------------------------------------------------- #

def generate_toc_html(post_data: Dict, post_idx: int,
                      feature_status_map: Optional[Dict[str, str]] = None,
                      overall_conclusion: str = "") -> str:
    """Build the table-of-contents panel of one post.

    Args:
        post_data: one post record.
        post_idx: index of the post, used in anchor targets.
        feature_status_map: feature name -> "相同" | "相似" | "无关".
        overall_conclusion: overall verdict; omitted from the panel when empty.

    Returns:
        HTML listing the post card, each inspiration point and the features of
        its first how-step.
    """
    inspiration_list = (post_data.get("how解构结果") or {}).get("灵感点列表") or []
    status_by_feature = feature_status_map or {}

    toc_items = [
        f'<a class="toc-item toc-level-1" href="#post-{post_idx}-detail">'
        f'<span class="toc-badge">帖子详情</span><span class="toc-text">帖子信息</span></a>'
    ]
    for insp_idx, inspiration_point in enumerate(inspiration_list):
        name = inspiration_point.get("名称", f"灵感点 {insp_idx + 1}")
        toc_items.append(
            f'<a class="toc-item toc-level-2" href="#post-{post_idx}-insp-{insp_idx}">'
            f'<span class="toc-badge">灵感点</span> {_esc(_truncate(str(name), 18))}</a>'
        )

        how_steps = inspiration_point.get("how步骤列表") or []
        if not how_steps:
            continue
        for feat_idx, feature_data in enumerate(how_steps[0].get("特征列表") or []):
            feature_name = feature_data.get("特征名称", f"特征 {feat_idx + 1}")
            status = _normalise_status(status_by_feature.get(feature_name, STATUS_UNRELATED))
            toc_items.append(
                f'<a class="toc-item toc-level-3 toc-feature-{_STATUS_SLUG[status]}" '
                f'href="#post-{post_idx}-feat-{insp_idx}-{feat_idx}">'
                f'<span class="toc-badge">特征</span> {_esc(feature_name)} '
                f'<span class="toc-feature-status">{status}</span></a>'
            )

    conclusion_html = ""
    if overall_conclusion:
        slug, icon = _conclusion_style(overall_conclusion)
        conclusion_html = f'''
<div class="toc-conclusion conclusion-{slug}">
    <span class="conclusion-icon">{icon}</span> {_esc(overall_conclusion)}
</div>
'''

    return f'''
<div class="toc-container">
    <div class="toc-header">目录导航</div>
    {conclusion_html}
    <div class="toc-content">{"".join(toc_items)}</div>
</div>
'''


def generate_post_content_html(post_data: Dict, post_idx: int,
                               category_mapping: Optional[Dict] = None,
                               source_mapping: Optional[Dict] = None) -> str:
    """Build the full tab content of one post.

    The content consists of the table of contents, the post card, one detail
    card per inspiration point and one collapsible section per how-step.

    Args:
        post_data: one post record.
        post_idx: index of the post on the page.
        category_mapping: feature category mapping (optional).
        source_mapping: feature source mapping (optional).

    Returns:
        HTML for the tab body.
    """
    inspiration_list = (post_data.get("how解构结果") or {}).get("灵感点列表") or []

    # Status of every feature, decided by its best match. Only the first
    # how-step of each inspiration point feeds the map.
    feature_status_map = {}
    for inspiration_point in inspiration_list:
        how_steps = inspiration_point.get("how步骤列表") or []
        if not how_steps:
            continue
        for feature_data in how_steps[0].get("特征列表") or []:
            feature_name = feature_data.get("特征名称", "")
            feature_status_map[feature_name] = _feature_status(feature_data.get("匹配结果") or [])

    overall_conclusion = _conclusion_for(feature_status_map.values())
    toc_html = generate_toc_html(post_data, post_idx, feature_status_map, overall_conclusion)
    post_detail_html = generate_post_detail_html(post_data, post_idx)

    inspiration_sections = []
    for insp_idx, inspiration_point in enumerate(inspiration_list):
        inspiration_detail = generate_inspiration_detail_html(inspiration_point, feature_status_map)
        inspiration_sections.append(f'''
<div class="inspiration-section" id="post-{post_idx}-insp-{insp_idx}">
    {inspiration_detail}
</div>
''')
    inspirations_detail_html = "".join(inspiration_sections)

    step_sections = []
    for insp_idx, inspiration_point in enumerate(inspiration_list):
        inspiration_name = inspiration_point.get("名称", f"灵感点 {insp_idx + 1}")
        for step_idx, step in enumerate(inspiration_point.get("how步骤列表") or []):
            step_name = step.get("步骤名称", f"步骤 {step_idx + 1}")
            feature_blocks = []
            for feat_idx in range(len(step.get("特征列表") or [])):
                match_html = generate_match_results_html(
                    [step], feat_idx, insp_idx, post_idx,
                    category_mapping, source_mapping, step_idx,
                )
                anchor = f"post-{post_idx}-feat-{insp_idx}-{feat_idx}"
                if step_idx:
                    anchor = f"{anchor}-step-{step_idx}"
                feature_blocks.append(f'''
<div class="feature-match-wrapper" id="{anchor}">
    {match_html}
</div>
''')
            step_sections.append(f'''
<div class="step-section" id="post-{post_idx}-step-{insp_idx}-{step_idx}">
    <div class="section-header step-header" onclick="toggleCollapse(this)">
        <span class="toggle-icon">▼</span>
        <span class="step-name">{_esc(step_name)}</span>
        <span class="step-source">来自: {_esc(inspiration_name)}</span>
    </div>
    <div class="section-body step-body">{"".join(feature_blocks)}</div>
</div>
''')
    all_matches_html = "".join(step_sections)

    return f'''
<div class="post-layout">
    {toc_html}
    <div class="post-main">
        {post_detail_html}
        {inspirations_detail_html}
        {all_matches_html}
    </div>
</div>
'''


def generate_combined_html(posts_data: List[Dict], category_mapping: Optional[Dict] = None,
                           source_mapping: Optional[Dict] = None) -> str:
    """Build the single HTML document that holds every post as a tab.

    Args:
        posts_data: post records, one tab each; the first tab starts active.
        category_mapping: feature category mapping (optional).
        source_mapping: feature source mapping (optional).

    Returns:
        A complete HTML document as a string.
    """
    tab_buttons = []
    tab_panes = []
    for i, post in enumerate(posts_data):
        title = str((post.get("帖子详情") or {}).get("title") or "无标题")
        active_class = "active" if i == 0 else ""
        tab_buttons.append(
            f'<button class="tab-button {active_class}" title="{_esc(title)}" '
            f'onclick="switchTab({i})">{_esc(_truncate(title, 20))}</button>\n'
        )
        content = generate_post_content_html(post, i, category_mapping, source_mapping)
        tab_panes.append(f'''
<div class="tab-content {active_class}" id="tab-{i}">
    {content}
</div>
''')
    tabs_html = "".join(tab_buttons)
    contents_html = "".join(tab_panes)

    return f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>How解构结果可视化</title>
<style>{_PAGE_STYLE}</style>
</head>
<body>
<h1>How 解构结果可视化</h1>
<div class="tabs">
{tabs_html}
</div>
<div class="tab-contents">
{contents_html}
</div>
<div id="modal-overlay" class="modal-overlay" onclick="if (event.target === this) closeModal()">
    <div class="modal-box">
        <button class="modal-close" onclick="closeModal()">×</button>
        <div id="modal-body"></div>
    </div>
</div>
<script>{_PAGE_SCRIPT}</script>
</body>
</html>
'''


def main():
    """Read every ``*_how.json`` result and write the combined HTML report."""
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent.parent / "data" / "data_1118"
    input_dir = data_dir / "当前帖子_how解构结果"
    output_file = data_dir / "当前帖子_how解构结果_可视化.html"

    print(f"读取 how 解构结果: {input_dir}")

    print("加载特征分类映射...")
    category_mapping = load_feature_category_mapping()
    print(f"已加载 {sum(len(v) for v in category_mapping.values())} 个特征分类")

    print("加载特征来源映射...")
    source_mapping = load_feature_source_mapping()
    print(f"已加载 {len(source_mapping)} 个特征的来源信息")

    # glob() order is unspecified; sort so the tab order is stable between runs.
    json_files = sorted(input_dir.glob("*_how.json"))
    print(f"找到 {len(json_files)} 个文件\n")

    posts_data = []
    for i, file_path in enumerate(json_files, 1):
        print(f"读取文件 [{i}/{len(json_files)}]: {file_path.name}")
        with open(file_path, "r", encoding="utf-8") as f:
            posts_data.append(json.load(f))

    print("\n生成合并的 HTML...")
    html_content = generate_combined_html(posts_data, category_mapping, source_mapping)

    print(f"保存到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)

    print("\n完成! 可视化文件已保存")
    print(f"请在浏览器中打开: {output_file}")


if __name__ == "__main__":
    main()

{html_module.escape(name)}

历史帖子来源

匹配标签 ({len(match_groups["标签"])})

匹配分类 ({len(match_groups["分类"])})

匹配结果: {html_module.escape(feature_name)} (权重: {feature_weight})

{html_module.escape(step_name)}

How 解构结果可视化