#!/usr/bin/env python3
# howard / Agent
"""
使用VLM提取语义特征
维度4: 人物外观语义描述 (person_appearance)
维度5: 场景构图描述 (composition_layout)
维度6: 光影氛围描述 (lighting_atmosphere)
维度8: 画中画内容描述 (painting_content)
"""

import openai
import base64
import json
import os
import time

# Shared OpenAI-compatible client pointed at the OpenRouter endpoint.
# The API key is read from the OPEN_ROUTER_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get('OPEN_ROUTER_API_KEY'),
    base_url='https://openrouter.ai/api/v1'
)

def encode_image(image_path):
    """Read the file at *image_path* and return its bytes base64-encoded as text."""
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

def query_vlm(image_path, prompt, max_tokens=800, model='google/gemini-2.0-flash-001'):
    """Send one image plus a text prompt to the VLM and return its reply text.

    Args:
        image_path: Path of the image file to analyse.
        prompt: Instruction text sent alongside the image.
        max_tokens: Upper bound on the length of the model's reply.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        The ``content`` of the first choice's message, as returned by the
        client.

    Raises:
        OSError: If the image file cannot be read.
        Any exception raised by the client call is propagated unchanged.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import mimetypes

    # Label the data URL with the real image type instead of always claiming
    # JPEG; fall back to JPEG when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith('image/'):
        mime_type = 'image/jpeg'

    img_data = encode_image(image_path)

    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:{mime_type};base64,{img_data}'}},
                {'type': 'text', 'text': prompt}
            ]
        }]
    )
    return response.choices[0].message.content

def extract_json_from_response(text):
    """Extract the JSON object embedded in a model reply.

    The span from the first ``{`` to the last ``}`` is parsed, so surrounding
    prose or Markdown code fences are ignored.

    Args:
        text: The reply text. Non-string values (for example ``None``) are
            accepted and passed through untouched in the fallback result.

    Returns:
        The parsed JSON object as a dict, or ``{"raw_text": text}`` when no
        parseable object is found.
    """
    # A reply without text content would otherwise crash on .find().
    if not isinstance(text, str):
        return {"raw_text": text}

    start = text.find('{')
    end = text.rfind('}') + 1
    if start >= 0 and end > start:
        try:
            return json.loads(text[start:end])
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError. Only parse failures are
            # swallowed, so KeyboardInterrupt/SystemExit still propagate.
            pass
    return {"raw_text": text}

# ============================================================
# Dimension 4: semantic description of the person's appearance
# ============================================================
# Prompt text sent to the VLM. It asks for a JSON-only reply with the keys
# clothing, hair, accessories, skin, overall_impression and generation_prompt.
PERSON_APPEARANCE_PROMPT = """请分析图片中女性人物的外观特征，以JSON格式返回：
{
  "clothing": {
    "type": "服装类型（如：白色长裙/白色上衣等）",
    "color": "颜色",
    "style": "风格描述",
    "details": "细节描述（如：飘逸、垂坠感等）"
  },
  "hair": {
    "color": "发色",
    "length": "发长（长/中/短）",
    "style": "发型（直发/卷发/盘发等）",
    "details": "细节（如：散落肩上、发梢微卷等）"
  },
  "accessories": {
    "earrings": "耳饰描述（无则填null）",
    "necklace": "项链描述（无则填null）",
    "bracelet": "手镯描述（无则填null）",
    "other": "其他配饰"
  },
  "skin": "肤色描述",
  "overall_impression": "整体气质描述（50字以内）",
  "generation_prompt": "用于AI生成的英文提示词（描述人物外观，50词以内）"
}
只返回JSON，不要其他文字。"""

# ============================================================
# Dimension 5: scene composition description
# ============================================================
# Prompt text sent to the VLM. It asks for a JSON-only reply with the keys
# shot_type, camera_angle, camera_position, subject_position,
# composition_type, visual_flow, depth_layers, aspect_ratio and
# generation_prompt.
COMPOSITION_PROMPT = """请分析图片的构图特征，以JSON格式返回：
{
  "shot_type": "景别（特写/近景/中景/全景/远景）",
  "camera_angle": "拍摄角度（正面/侧面/背面/俯视/仰视）",
  "camera_position": "相机位置描述（如：人物右后方、侧前方等）",
  "subject_position": {
    "horizontal": "主体水平位置（左/中/右，及大致比例）",
    "vertical": "主体垂直位置（上/中/下，及大致比例）"
  },
  "composition_type": "构图类型（如：三分法/对角线/引导线等）",
  "visual_flow": "视线引导方向描述",
  "depth_layers": "景深层次（前景/中景/背景的内容）",
  "aspect_ratio": "画面比例",
  "generation_prompt": "用于AI生成的英文构图提示词（30词以内）"
}
只返回JSON，不要其他文字。"""

# ============================================================
# Dimension 6: lighting and atmosphere description
# ============================================================
# Prompt text sent to the VLM. It asks for a JSON-only reply with the keys
# light_type, light_direction, light_quality, bokeh, color_temperature,
# overall_atmosphere, mood and generation_prompt.
LIGHTING_PROMPT = """请分析图片的光影和氛围特征，以JSON格式返回：
{
  "light_type": "光照类型（自然光/逆光/侧光/散射光等）",
  "light_direction": "光线方向（从哪个方向照射）",
  "light_quality": "光线质感（柔和/硬朗/温暖/冷调等）",
  "bokeh": {
    "present": true或false,
    "intensity": "虚化程度（轻微/中等/强烈）",
    "description": "散景描述"
  },
  "color_temperature": "色温（暖/中性/冷）",
  "overall_atmosphere": "整体氛围（如：梦幻/清新/温暖/浪漫等）",
  "mood": "情绪感受（50字以内）",
  "generation_prompt": "用于AI生成的英文光影提示词（如：soft natural backlight, bokeh background, dreamy atmosphere，30词以内）"
}
只返回JSON，不要其他文字。"""

# ============================================================
# Dimension 8: picture-within-a-picture (canvas) content description
# ============================================================
# Prompt text sent to the VLM. It asks for a JSON-only reply with the keys
# canvas_visible, canvas_content, reality_art_relationship, narrative_value
# and generation_prompt (null when the canvas is not visible).
PAINTING_CONTENT_PROMPT = """请分析图片中画架上的画布内容（如果可见），以JSON格式返回：
{
  "canvas_visible": true或false,
  "canvas_content": {
    "subject": "画布上描绘的主题（如：人物/风景/空白等）",
    "style": "绘画风格（如：油画/写实/印象派等）",
    "colors": "主要颜色",
    "completion": "完成程度（空白/草稿/半完成/完成）",
    "description": "详细描述（50字以内）"
  },
  "reality_art_relationship": "现实场景与画作的关系（如：画中画/镜像/互文等）",
  "narrative_value": "叙事价值描述（这个画中画结构如何增强画面叙事）",
  "generation_prompt": "用于AI生成的英文提示词（描述画布内容，30词以内，如果画布不可见则填null）"
}
只返回JSON，不要其他文字。"""

def _write_json(path, payload):
    """Write *payload* to *path* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def main():
    """Extract the four semantic feature dimensions for img_1 .. img_9.

    For every ``input/img_<i>.jpg`` that exists, each dimension's prompt is
    sent to the VLM and the parsed reply is saved as
    ``<dimension dir>/img_<i>_<dimension>.json``. Afterwards a
    ``mapping.json`` listing the successfully written files is saved in every
    dimension directory. A failure for one image/dimension pair is printed
    and does not stop the run.
    """
    input_dir = "input"

    # Output directory per dimension
    dirs = {
        "person_appearance": "output/features/person_appearance",
        "composition_layout": "output/features/composition_layout",
        "lighting_atmosphere": "output/features/lighting_atmosphere",
        "painting_content": "output/features/painting_content",
    }

    # Create the output tree up front: open(..., 'w') does not create parent
    # directories, so without this every write fails on a fresh checkout.
    for out_dir in dirs.values():
        os.makedirs(out_dir, exist_ok=True)

    # Image -> paragraph (segment) label, per dimension
    segment_maps = {
        "person_appearance": {
            "img_1": "段落1.1", "img_2": "段落2.1", "img_3": "段落3.1",
            "img_4": "段落4.1", "img_5": "段落5.1", "img_6": "段落6.1",
            "img_7": "段落7.1", "img_8": "段落8.1", "img_9": "段落9.1"
        },
        "composition_layout": {
            "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
            "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
            "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
        },
        "lighting_atmosphere": {
            "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
            "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
            "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
        },
        "painting_content": {
            "img_1": "段落1.2.1", "img_2": "段落2.2.1", "img_3": "段落3.2.1",
            "img_4": "段落4.2.1", "img_5": "段落5.2", "img_6": "段落6.2.1",
            "img_7": None,  # img_7 has no easel
            "img_8": "段落8.2.1", "img_9": "段落9.2.1"
        }
    }

    # Highlight cluster associated with each dimension
    highlight_clusters = {
        "person_appearance": "cluster_1",
        "composition_layout": "cluster_6",
        "lighting_atmosphere": "cluster_4",
        "painting_content": "cluster_5"
    }

    # Human-readable feature label per dimension
    feature_names = {
        "person_appearance": "女性人物外观（服装/发型/配饰）",
        "composition_layout": "场景构图（景别/角度/视线引导）",
        "lighting_atmosphere": "光影氛围（逆光/散景/梦幻）",
        "painting_content": "画中画内容（画布上的油画）"
    }

    prompts = {
        "person_appearance": PERSON_APPEARANCE_PROMPT,
        "composition_layout": COMPOSITION_PROMPT,
        "lighting_atmosphere": LIGHTING_PROMPT,
        "painting_content": PAINTING_CONTENT_PROMPT,
    }

    # Mapping entries collected per dimension
    all_mappings = {dim: [] for dim in dirs}

    for i in range(1, 10):
        img_id = f"img_{i}"
        image_path = os.path.join(input_dir, f"{img_id}.jpg")

        if not os.path.exists(image_path):
            continue

        print(f"\n处理 {img_id}...")

        for dim, prompt in prompts.items():
            # Segment label for this image; None when there is none (img_7
            # has no easel). The image is still analysed in that case and
            # the mapping entry records the segment as "N/A".
            seg = segment_maps[dim].get(img_id)

            # flush so the progress text shows before the slow API call
            print(f"  提取 {dim}...", end=" ", flush=True)

            try:
                response = query_vlm(image_path, prompt)
                data = extract_json_from_response(response)

                # Save the per-image JSON
                json_path = os.path.join(dirs[dim], f"{img_id}_{dim}.json")
                _write_json(json_path, {
                    "image_id": img_id,
                    "dimension": dim,
                    "segment": seg,
                    "data": data
                })

                print("✓")

                # Record the file in this dimension's mapping
                mapping_entry = {
                    "file": f"{img_id}_{dim}.json",
                    "source_image": f"input/{img_id}.jpg",
                    "segment": seg or "N/A",
                    "category": "实质" if dim == "person_appearance" else "形式",
                    "feature": feature_names[dim],
                    "highlight_cluster": highlight_clusters[dim]
                }
                if dim == "person_appearance":
                    mapping_entry["element_id"] = "元素1"
                elif dim == "painting_content":
                    mapping_entry["element_id"] = "元素5"

                all_mappings[dim].append(mapping_entry)

            except Exception as e:
                # Per-item boundary: report the failure and keep going.
                print(f"✗ 错误: {e}")

            # Throttle after every attempt, failed ones included, so an
            # error does not lead to back-to-back requests.
            time.sleep(0.5)

    # Save mapping.json for every dimension
    dim_descriptions = {
        "person_appearance": "人物外观语义描述，使用VLM分析服装/发型/配饰/气质",
        "composition_layout": "场景构图描述，使用VLM分析景别/角度/视线引导/空间布局",
        "lighting_atmosphere": "光影氛围描述，使用VLM分析光照类型/散景/色温/整体氛围",
        "painting_content": "画中画内容描述，使用VLM分析画布上的油画内容及叙事关系"
    }

    for dim, mappings in all_mappings.items():
        mapping = {
            "dimension": dim,
            "description": dim_descriptions[dim],
            "tool": "Google Gemini 2.0 Flash (via OpenRouter)",
            "format": {
                "json": "JSON，包含结构化语义描述和生成提示词"
            },
            "mappings": mappings
        }

        _write_json(os.path.join(dirs[dim], "mapping.json"), mapping)

    print("\n✓ VLM语义特征提取完成")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()