| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202 |
- #!/usr/bin/env python3
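"""
Feature-extraction script for Xiaocong ("小葱") notes-style posters.

Purpose: for 4 images and 7 feature dimensions, record which paragraph of
each image supplies each feature and write one mapping.json per dimension.
The extraction tools themselves are not run here; the feature files named in
the mappings are expected to be produced by a later step.
"""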
- import json
- import os
- from datetime import datetime
# Input and output locations (absolute paths, fixed for one workstation).
_EXAMPLE_ROOT = "/Users/elksmmx/Desktop/agent 2.10/Agent_dragon/examples/find knowledge_xiaocong"
INPUT_DIR = _EXAMPLE_ROOT + "/input"
OUTPUT_DIR = _EXAMPLE_ROOT + "/output/features"

# Feature dimensions; each one owns a sub-directory of OUTPUT_DIR.
DIMENSIONS = [
    "background_texture",
    "core_english_word",
    "hand_drawn_decorations",
    "layout_structure",
    "color_highlight",
    "knowledge_chart",
    "paper_clip",
]

# Create the per-dimension output directories up front. This runs as soon as
# the module is loaded, not only when main() is called.
for dim in DIMENSIONS:
    os.makedirs(os.path.join(OUTPUT_DIR, dim), exist_ok=True)
# Load a production table
def load_production_table(img_id):
    """Read and parse the production-table JSON file for one image.

    Args:
        img_id: Image identifier interpolated into the file name
            ``小葱__img_{img_id}_制作表.json`` under ``INPUT_DIR``.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    filename = f"小葱__img_{img_id}_制作表.json"
    with open(os.path.join(INPUT_DIR, filename), encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed
# Load the extraction requirements
def load_extraction_requirements():
    """Return the text of ``小葱提取需求.json`` from ``INPUT_DIR``.

    The file is deliberately not parsed (simplified handling): the caller
    receives the raw file content as a string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(os.path.join(INPUT_DIR, "小葱提取需求.json"), encoding="utf-8") as source:
        return source.read()
# Builder for a single mapping.json record
def create_mapping_entry(dimension, img_id, paragraph_id, feature_type, feature_name, output_file):
    """Build one record for a dimension's mapping.json.

    Args:
        dimension: Name of the feature dimension the record belongs to.
        img_id: Image identifier; stored as ``img_{img_id}.jpg``.
        paragraph_id: Identifier of the source paragraph.
        feature_type: Feature category label (substance / form); callers in
            this file pass "实质", "形式" or "实质 + 形式".
        feature_name: Human-readable feature name.
        output_file: File name the extracted feature is expected to use.

    Returns:
        A dict with the keys ``dimension``, ``image``, ``paragraph``,
        ``feature_type``, ``feature_name``, ``output_file``,
        ``extraction_tool`` and ``extraction_date``, in that order.
        ``extraction_date`` is the current local time in ISO 8601 format.
    """
    entry = dict(
        dimension=dimension,
        image=f"img_{img_id}.jpg",
        paragraph=paragraph_id,
        feature_type=feature_type,
        feature_name=feature_name,
        output_file=output_file,
        extraction_tool="PaddleOCR + rembg + OpenCV (planned)",
    )
    # Stamped last so the key order matches the serialized layout.
    entry["extraction_date"] = datetime.now().isoformat()
    return entry
# Main execution logic
def _record(mappings, dimension, img_id, paragraph_id, feature_type,
            feature_name, output_file):
    """Append one mapping entry under *dimension* and echo it to stdout."""
    entry = create_mapping_entry(
        dimension=dimension,
        img_id=img_id,
        paragraph_id=paragraph_id,
        feature_type=feature_type,
        feature_name=feature_name,
        output_file=output_file,
    )
    mappings[dimension].append(entry)
    print(f" 图{img_id}: {paragraph_id} → {output_file}")


def main():
    """Build the per-dimension mapping tables and write them to disk.

    Steps:
      1. Load the production tables of images 1-4 and the extraction
         requirements. Their contents are not used yet; the loads make the
         run fail early when an input file is missing or malformed.
      2. Fill one list of mapping entries per name in ``DIMENSIONS``.
      3. Write each list to ``OUTPUT_DIR/<dimension>/mapping.json``.

    Progress is reported on stdout. Returns ``None``.

    Raises:
        OSError: If an input file cannot be read or an output file cannot
            be written.
        json.JSONDecodeError: If a production table is not valid JSON.

    NOTE(review): paragraph IDs are spelled inconsistently. Literal IDs
    carry a space after "段落" (e.g. "段落 1.1") while formatted ones do not
    (e.g. "段落2.1"). The strings are kept exactly as they were; confirm
    which spelling the production tables actually use.
    """
    banner = "=" * 60
    image_ids = range(1, 5)

    print(banner)
    print("小葱笔记风格海报 - 特征提取脚本")
    print(banner)

    # Load the production tables (results unused for now, see docstring).
    for i in image_ids:
        load_production_table(i)
        print(f"✓ 加载图{i}制作表")

    # Load the extraction requirements (result unused for now).
    load_extraction_requirements()
    print("✓ 加载提取需求")

    # One (initially empty) entry list per dimension.
    mappings = {dim: [] for dim in DIMENSIONS}

    # Dimension 1: background texture
    print("\n--- 维度 1: 背景纹理 ---")
    for img_id in image_ids:
        paragraph_id = "段落 1.1" if img_id == 1 else f"段落{img_id}.1"
        _record(mappings, "background_texture", img_id, paragraph_id,
                "实质 + 形式", "背景纹理与基质",
                f"img_{img_id}_background.png")

    # Dimension 2: core English word
    print("\n--- 维度 2: 核心英文单词 ---")
    for img_id in image_ids:
        # Image 1 uses sub-paragraph 3.3; the other images use 2.2.
        suffix = "3.3" if img_id == 1 else "2.2"
        _record(mappings, "core_english_word", img_id,
                f"段落{img_id}.{suffix}",
                "实质", "核心英文单词 Embedding",
                f"img_{img_id}_core_word.png")

    # Dimension 3: hand-drawn decorations
    print("\n--- 维度 3: 手绘装饰元素 ---")
    decoration_paragraphs = {
        1: ["段落 1.2.1.1", "段落 1.5.2"],  # stars, flowers
        2: [], 3: [], 4: [],  # images 2-4: handled in the paper-clip dimension
    }
    for img_id, paragraphs in decoration_paragraphs.items():
        for para_id in paragraphs:
            _record(mappings, "hand_drawn_decorations", img_id, para_id,
                    "实质", "手绘装饰 (星星/花朵)",
                    f"img_{img_id}_decorations.png")

    # Dimension 4: layout structure. Every image uses paragraph "<id>.3".
    print("\n--- 维度 4: 排版结构与导视 ---")
    for img_id in image_ids:
        _record(mappings, "layout_structure", img_id, f"段落{img_id}.3",
                "形式", "排版结构与导视",
                f"img_{img_id}_layout.json")

    # Dimension 5: semantic colour highlighting
    print("\n--- 维度 5: 语义化色彩高亮 ---")
    color_paragraphs = {
        1: "段落 1.3.4",  # pink smear
        2: "段落 2.3.2",  # blue highlight
        3: "段落 3.3.4",  # purple keywords
        4: "段落 4.3.2",  # purple keywords
    }
    for img_id, para_id in color_paragraphs.items():
        _record(mappings, "color_highlight", img_id, para_id,
                "形式", "语义化色彩高亮",
                f"img_{img_id}_colors.json")

    # Dimension 6: knowledge chart (image 2 only).
    # Not routed through _record(): this progress line spells "图 2" with a
    # space, which the helper's format would not reproduce.
    print("\n--- 维度 6: 知识可视化图表 ---")
    mappings["knowledge_chart"].append(
        create_mapping_entry(
            dimension="knowledge_chart",
            img_id=2,
            paragraph_id="段落 2.3.5",
            feature_type="实质",
            feature_name="Tokenization-Embedding 表格",
            output_file="img_2_table.png",
        )
    )
    print(" 图 2: 段落 2.3.5 → img_2_table.png")

    # Dimension 7: paper-clip decoration (images 2-4)
    print("\n--- 维度 7: 回形针装饰 ---")
    for img_id in range(2, 5):
        _record(mappings, "paper_clip", img_id, f"段落{img_id}.2.1",
                "实质", "回形针装饰",
                f"img_{img_id}_paperclip.png")

    # Write one mapping.json per dimension.
    print("\n--- 写入 mapping.json ---")
    for dim in DIMENSIONS:
        mapping_path = os.path.join(OUTPUT_DIR, dim, "mapping.json")
        with open(mapping_path, 'w', encoding='utf-8') as f:
            json.dump(mappings[dim], f, ensure_ascii=False, indent=2)
        print(f" ✓ {dim}/mapping.json")

    print("\n" + banner)
    print("特征提取脚本执行完成")
    print(banner)
    print(f"\n输出目录:{OUTPUT_DIR}")
    print("各维度 mapping.json 已生成,待工具执行后填充实际特征值文件")


if __name__ == "__main__":
    main()
|