#!/usr/bin/env python3
"""
Feature-extraction script for the "小葱" note-style posters.

For each of the seven feature dimensions listed in ``DIMENSIONS`` this script
writes a ``mapping.json`` that records which paragraph of which of the four
poster images a feature is to be taken from. It does not run any extraction
tool itself; the mapping files are placeholders to be complemented by the
actual feature files later (see the final message printed by ``main``).
"""
import json
import os
from datetime import datetime

# Input / output locations.
INPUT_DIR = "/Users/elksmmx/Desktop/agent 2.10/Agent_dragon/examples/find knowledge_xiaocong/input"
OUTPUT_DIR = "/Users/elksmmx/Desktop/agent 2.10/Agent_dragon/examples/find knowledge_xiaocong/output/features"

# One output sub-directory (and one mapping.json) is produced per dimension.
DIMENSIONS = [
    "background_texture",
    "core_english_word",
    "hand_drawn_decorations",
    "layout_structure",
    "color_highlight",
    "knowledge_chart",
    "paper_clip",
]


def _ensure_output_dirs(output_dir):
    """Create one sub-directory per feature dimension under *output_dir*."""
    for dim in DIMENSIONS:
        os.makedirs(os.path.join(output_dir, dim), exist_ok=True)


def load_production_table(img_id, input_dir=None):
    """Load and return the parsed production table (JSON) of one image.

    Args:
        img_id: Image number used in the file name
            ``小葱__img_{img_id}_制作表.json``.
        input_dir: Directory holding the file; defaults to ``INPUT_DIR``.

    Returns:
        Whatever ``json.load`` yields for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    base_dir = INPUT_DIR if input_dir is None else input_dir
    table_path = os.path.join(base_dir, f"小葱__img_{img_id}_制作表.json")
    with open(table_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_extraction_requirements(input_dir=None):
    """Return the raw text of ``小葱提取需求.json``.

    The content is deliberately returned unparsed (simplified handling).

    Args:
        input_dir: Directory holding the file; defaults to ``INPUT_DIR``.

    Raises:
        OSError: If the file cannot be opened.
    """
    base_dir = INPUT_DIR if input_dir is None else input_dir
    req_path = os.path.join(base_dir, "小葱提取需求.json")
    with open(req_path, 'r', encoding='utf-8') as f:
        return f.read()


def create_mapping_entry(dimension, img_id, paragraph_id, feature_type,
                         feature_name, output_file):
    """Build one mapping.json record.

    Args:
        dimension: Feature dimension name (one of ``DIMENSIONS``).
        img_id: Image number; stored as ``img_{img_id}.jpg``.
        paragraph_id: Identifier of the source paragraph.
        feature_type: Feature category label (substance / form).
        feature_name: Human-readable feature name.
        output_file: File name the extracted feature is expected under.

    Returns:
        A dict with the arguments above plus a fixed ``extraction_tool``
        label and the current local time as an ISO-8601 string.
    """
    return {
        "dimension": dimension,
        "image": f"img_{img_id}.jpg",
        "paragraph": paragraph_id,
        "feature_type": feature_type,  # substance / form
        "feature_name": feature_name,
        "output_file": output_file,
        "extraction_tool": "PaddleOCR + rembg + OpenCV (planned)",
        "extraction_date": datetime.now().isoformat(),
    }


def _record(mappings, dimension, img_id, paragraph_id, feature_type,
            feature_name, output_file):
    """Append one entry to ``mappings[dimension]`` and print a progress line."""
    mappings[dimension].append(
        create_mapping_entry(
            dimension=dimension,
            img_id=img_id,
            paragraph_id=paragraph_id,
            feature_type=feature_type,
            feature_name=feature_name,
            output_file=output_file,
        )
    )
    print(f" 图{img_id}: {paragraph_id} → {output_file}")


def main(input_dir=None, output_dir=None):
    """Generate ``<output_dir>/<dimension>/mapping.json`` for every dimension.

    Args:
        input_dir: Directory with the production tables and the requirements
            file; defaults to ``INPUT_DIR``.
        output_dir: Root directory for the per-dimension output; defaults to
            ``OUTPUT_DIR``.

    Raises:
        OSError: If an input file is missing or an output cannot be written.
        json.JSONDecodeError: If a production table is not valid JSON.
    """
    input_dir = INPUT_DIR if input_dir is None else input_dir
    output_dir = OUTPUT_DIR if output_dir is None else output_dir

    # Done here rather than at import time so that merely importing this
    # module never touches the filesystem.
    _ensure_output_dirs(output_dir)

    print("=" * 60)
    print("小葱笔记风格海报 - 特征提取脚本")
    print("=" * 60)

    # The loaded contents are not consumed by the mapping logic below; the
    # loads are kept so a missing or malformed input aborts the run before
    # any mapping.json is written.
    for i in range(1, 5):
        load_production_table(i, input_dir)
        print(f"✓ 加载图{i}制作表")

    load_extraction_requirements(input_dir)
    print("✓ 加载提取需求")

    mappings = {dim: [] for dim in DIMENSIONS}

    # NOTE(review): paragraph ids are spelled inconsistently across this
    # function ("段落 1.1" with a space vs. "段落2.1" without). The strings are
    # kept exactly as found; confirm the intended format against the
    # production tables.

    # Dimension 1: background texture
    print("\n--- 维度 1: 背景纹理 ---")
    for img_id in range(1, 5):
        paragraph_id = f"段落{img_id}.1" if img_id > 1 else "段落 1.1"
        _record(
            mappings,
            dimension="background_texture",
            img_id=img_id,
            paragraph_id=paragraph_id,
            feature_type="实质 + 形式",
            feature_name="背景纹理与基质",
            output_file=f"img_{img_id}_background.png",
        )

    # Dimension 2: core English word
    print("\n--- 维度 2: 核心英文单词 ---")
    for img_id in range(1, 5):
        paragraph_id = f"段落{img_id}.3.3" if img_id == 1 else f"段落{img_id}.2.2"
        _record(
            mappings,
            dimension="core_english_word",
            img_id=img_id,
            paragraph_id=paragraph_id,
            feature_type="实质",
            feature_name="核心英文单词 Embedding",
            output_file=f"img_{img_id}_core_word.png",
        )

    # Dimension 3: hand-drawn decorations
    print("\n--- 维度 3: 手绘装饰元素 ---")
    decoration_paragraphs = {
        1: ["段落 1.2.1.1", "段落 1.5.2"],  # stars, flowers
        # Decorations of images 2-4 are handled in the paper-clip dimension.
        2: [],
        3: [],
        4: [],
    }
    for img_id, paragraphs in decoration_paragraphs.items():
        for para_id in paragraphs:
            _record(
                mappings,
                dimension="hand_drawn_decorations",
                img_id=img_id,
                paragraph_id=para_id,
                feature_type="实质",
                feature_name="手绘装饰 (星星/花朵)",
                output_file=f"img_{img_id}_decorations.png",
            )

    # Dimension 4: layout structure and wayfinding
    print("\n--- 维度 4: 排版结构与导视 ---")
    for img_id in range(1, 5):
        # Every image uses paragraph "<n>.3" (the original conditional had
        # two identical branches).
        _record(
            mappings,
            dimension="layout_structure",
            img_id=img_id,
            paragraph_id=f"段落{img_id}.3",
            feature_type="形式",
            feature_name="排版结构与导视",
            output_file=f"img_{img_id}_layout.json",
        )

    # Dimension 5: semantic colour highlighting
    print("\n--- 维度 5: 语义化色彩高亮 ---")
    color_paragraphs = {
        1: "段落 1.3.4",  # pink smear
        2: "段落 2.3.2",  # blue highlight
        3: "段落 3.3.4",  # purple keywords
        4: "段落 4.3.2",  # purple keywords
    }
    for img_id, para_id in color_paragraphs.items():
        _record(
            mappings,
            dimension="color_highlight",
            img_id=img_id,
            paragraph_id=para_id,
            feature_type="形式",
            feature_name="语义化色彩高亮",
            output_file=f"img_{img_id}_colors.json",
        )

    # Dimension 6: knowledge-visualisation chart (image 2 only). Handled
    # inline because its progress line is formatted differently ("图 2").
    print("\n--- 维度 6: 知识可视化图表 ---")
    mappings["knowledge_chart"].append(
        create_mapping_entry(
            dimension="knowledge_chart",
            img_id=2,
            paragraph_id="段落 2.3.5",
            feature_type="实质",
            feature_name="Tokenization-Embedding 表格",
            output_file="img_2_table.png",
        )
    )
    print(" 图 2: 段落 2.3.5 → img_2_table.png")

    # Dimension 7: paper-clip decoration (images 2-4)
    print("\n--- 维度 7: 回形针装饰 ---")
    for img_id in range(2, 5):
        _record(
            mappings,
            dimension="paper_clip",
            img_id=img_id,
            paragraph_id=f"段落{img_id}.2.1",
            feature_type="实质",
            feature_name="回形针装饰",
            output_file=f"img_{img_id}_paperclip.png",
        )

    # Write one mapping.json per dimension.
    print("\n--- 写入 mapping.json ---")
    for dim in DIMENSIONS:
        mapping_path = os.path.join(output_dir, dim, "mapping.json")
        with open(mapping_path, 'w', encoding='utf-8') as f:
            json.dump(mappings[dim], f, ensure_ascii=False, indent=2)
        print(f" ✓ {dim}/mapping.json")

    print("\n" + "=" * 60)
    print("特征提取脚本执行完成")
    print("=" * 60)
    print(f"\n输出目录:{output_dir}")
    print("各维度 mapping.json 已生成,待工具执行后填充实际特征值文件")


if __name__ == "__main__":
    main()