#!/usr/bin/env python3
"""
Extract semantic features from images with a VLM.

Dimension 4: person appearance description (person_appearance)
Dimension 5: scene composition description (composition_layout)
Dimension 6: lighting and atmosphere description (lighting_atmosphere)
Dimension 8: picture-in-picture content description (painting_content)
"""

import base64
import json
import mimetypes
import os
import time

import openai

client = openai.OpenAI(
    api_key=os.environ.get('OPEN_ROUTER_API_KEY'),
    base_url='https://openrouter.ai/api/v1'
)

# Model identifier sent to the OpenRouter endpoint by default.
VLM_MODEL = 'google/gemini-2.0-flash-001'


def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    with open(image_path, 'rb') as f:
        return base64.b64encode(f.read()).decode()


def _guess_image_mime(image_path):
    """Return the image MIME type for *image_path*, defaulting to JPEG."""
    mime, _ = mimetypes.guess_type(image_path)
    if mime and mime.startswith('image/'):
        return mime
    # Unknown or non-image extension: keep the historical hard-coded type.
    return 'image/jpeg'


def query_vlm(image_path, prompt, max_tokens=800, model=VLM_MODEL):
    """Send one image plus a text prompt to the VLM and return its reply.

    Args:
        image_path: Path of the image file to analyse.
        prompt: Text instruction sent alongside the image.
        max_tokens: Upper bound on the length of the completion.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        The ``content`` field of the first choice in the response.
    """
    img_data = encode_image(image_path)
    mime = _guess_image_mime(image_path)
    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
            'role': 'user',
            'content': [
                {
                    'type': 'image_url',
                    'image_url': {'url': f'data:{mime};base64,{img_data}'},
                },
                {'type': 'text', 'text': prompt},
            ]
        }]
    )
    return response.choices[0].message.content


def extract_json_from_response(text):
    """Extract a JSON object from the model's reply.

    The span from the first ``{`` to the last ``}`` is parsed, which also
    copes with replies that wrap the JSON in extra text.

    Args:
        text: Raw reply string.

    Returns:
        The parsed object, or ``{"raw_text": text}`` when no brace-delimited
        span is found or the span is not valid JSON.
    """
    start = text.find('{')
    end = text.rfind('}') + 1
    if start >= 0 and end > start:
        try:
            return json.loads(text[start:end])
        except json.JSONDecodeError:
            # Not valid JSON: fall through and hand back the raw text.
            pass
    return {"raw_text": text}


# ============================================================
# Dimension 4: person appearance description
# ============================================================
PERSON_APPEARANCE_PROMPT = """请分析图片中女性人物的外观特征,以JSON格式返回:
{
    "clothing": {
        "type": "服装类型(如:白色长裙/白色上衣等)",
        "color": "颜色",
        "style": "风格描述",
        "details": "细节描述(如:飘逸、垂坠感等)"
    },
    "hair": {
        "color": "发色",
        "length": "发长(长/中/短)",
        "style": "发型(直发/卷发/盘发等)",
        "details": "细节(如:散落肩上、发梢微卷等)"
    },
    "accessories": {
        "earrings": "耳饰描述(无则填null)",
        "necklace": "项链描述(无则填null)",
        "bracelet": "手镯描述(无则填null)",
        "other": "其他配饰"
    },
    "skin": "肤色描述",
    "overall_impression": "整体气质描述(50字以内)",
    "generation_prompt": "用于AI生成的英文提示词(描述人物外观,50词以内)"
}

只返回JSON,不要其他文字。"""

# ============================================================
# Dimension 5: scene composition description
# ============================================================
COMPOSITION_PROMPT = """请分析图片的构图特征,以JSON格式返回:
{
    "shot_type": "景别(特写/近景/中景/全景/远景)",
    "camera_angle": "拍摄角度(正面/侧面/背面/俯视/仰视)",
    "camera_position": "相机位置描述(如:人物右后方、侧前方等)",
    "subject_position": {
        "horizontal": "主体水平位置(左/中/右,及大致比例)",
        "vertical": "主体垂直位置(上/中/下,及大致比例)"
    },
    "composition_type": "构图类型(如:三分法/对角线/引导线等)",
    "visual_flow": "视线引导方向描述",
    "depth_layers": "景深层次(前景/中景/背景的内容)",
    "aspect_ratio": "画面比例",
    "generation_prompt": "用于AI生成的英文构图提示词(30词以内)"
}

只返回JSON,不要其他文字。"""

# ============================================================
# Dimension 6: lighting and atmosphere description
# ============================================================
LIGHTING_PROMPT = """请分析图片的光影和氛围特征,以JSON格式返回:
{
    "light_type": "光照类型(自然光/逆光/侧光/散射光等)",
    "light_direction": "光线方向(从哪个方向照射)",
    "light_quality": "光线质感(柔和/硬朗/温暖/冷调等)",
    "bokeh": {
        "present": true或false,
        "intensity": "虚化程度(轻微/中等/强烈)",
        "description": "散景描述"
    },
    "color_temperature": "色温(暖/中性/冷)",
    "overall_atmosphere": "整体氛围(如:梦幻/清新/温暖/浪漫等)",
    "mood": "情绪感受(50字以内)",
    "generation_prompt": "用于AI生成的英文光影提示词(如:soft natural backlight, bokeh background, dreamy atmosphere,30词以内)"
}

只返回JSON,不要其他文字。"""

# ============================================================
# Dimension 8: picture-in-picture content description
# ============================================================
PAINTING_CONTENT_PROMPT = """请分析图片中画架上的画布内容(如果可见),以JSON格式返回:
{
    "canvas_visible": true或false,
    "canvas_content": {
        "subject": "画布上描绘的主题(如:人物/风景/空白等)",
        "style": "绘画风格(如:油画/写实/印象派等)",
        "colors": "主要颜色",
        "completion": "完成程度(空白/草稿/半完成/完成)",
        "description": "详细描述(50字以内)"
    },
    "reality_art_relationship": "现实场景与画作的关系(如:画中画/镜像/互文等)",
    "narrative_value": "叙事价值描述(这个画中画结构如何增强画面叙事)",
    "generation_prompt": "用于AI生成的英文提示词(描述画布内容,30词以内,如果画布不可见则填null)"
}

只返回JSON,不要其他文字。"""

# Directory scanned for img_1.jpg ... img_9.jpg.
INPUT_DIR = "input"

# Output directory for each dimension.
OUTPUT_DIRS = {
    "person_appearance": "output/features/person_appearance",
    "composition_layout": "output/features/composition_layout",
    "lighting_atmosphere": "output/features/lighting_atmosphere",
    "painting_content": "output/features/painting_content",
}

# Image id -> segment label, per dimension.
SEGMENT_MAPS = {
    "person_appearance": {
        "img_1": "段落1.1",
        "img_2": "段落2.1",
        "img_3": "段落3.1",
        "img_4": "段落4.1",
        "img_5": "段落5.1",
        "img_6": "段落6.1",
        "img_7": "段落7.1",
        "img_8": "段落8.1",
        "img_9": "段落9.1",
    },
    "composition_layout": {
        "img_1": "段落1",
        "img_2": "段落2",
        "img_3": "段落3",
        "img_4": "段落4",
        "img_5": "段落5",
        "img_6": "段落6",
        "img_7": "段落7",
        "img_8": "段落8",
        "img_9": "段落9",
    },
    "lighting_atmosphere": {
        "img_1": "段落1",
        "img_2": "段落2",
        "img_3": "段落3",
        "img_4": "段落4",
        "img_5": "段落5",
        "img_6": "段落6",
        "img_7": "段落7",
        "img_8": "段落8",
        "img_9": "段落9",
    },
    "painting_content": {
        "img_1": "段落1.2.1",
        "img_2": "段落2.2.1",
        "img_3": "段落3.2.1",
        "img_4": "段落4.2.1",
        "img_5": "段落5.2",
        "img_6": "段落6.2.1",
        "img_7": None,  # img_7 has no easel
        "img_8": "段落8.2.1",
        "img_9": "段落9.2.1",
    },
}

# Highlight cluster associated with each dimension.
HIGHLIGHT_CLUSTERS = {
    "person_appearance": "cluster_1",
    "composition_layout": "cluster_6",
    "lighting_atmosphere": "cluster_4",
    "painting_content": "cluster_5",
}

# Short feature label written into each mapping entry.
FEATURE_NAMES = {
    "person_appearance": "女性人物外观(服装/发型/配饰)",
    "composition_layout": "场景构图(景别/角度/视线引导)",
    "lighting_atmosphere": "光影氛围(逆光/散景/梦幻)",
    "painting_content": "画中画内容(画布上的油画)",
}

# Prompt sent to the VLM for each dimension.
PROMPTS = {
    "person_appearance": PERSON_APPEARANCE_PROMPT,
    "composition_layout": COMPOSITION_PROMPT,
    "lighting_atmosphere": LIGHTING_PROMPT,
    "painting_content": PAINTING_CONTENT_PROMPT,
}

# Description written into each dimension's mapping.json.
DIM_DESCRIPTIONS = {
    "person_appearance": "人物外观语义描述,使用VLM分析服装/发型/配饰/气质",
    "composition_layout": "场景构图描述,使用VLM分析景别/角度/视线引导/空间布局",
    "lighting_atmosphere": "光影氛围描述,使用VLM分析光照类型/散景/色温/整体氛围",
    "painting_content": "画中画内容描述,使用VLM分析画布上的油画内容及叙事关系",
}

# element_id is only attached for these two dimensions.
_ELEMENT_IDS = {
    "person_appearance": "元素1",
    "painting_content": "元素5",
}


def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _build_mapping_entry(img_id, dim, seg):
    """Build the mapping.json entry for one (image, dimension) result."""
    entry = {
        "file": f"{img_id}_{dim}.json",
        "source_image": f"input/{img_id}.jpg",
        "segment": seg or "N/A",
        "category": "实质" if dim == "person_appearance" else "形式",
        "feature": FEATURE_NAMES[dim],
        "highlight_cluster": HIGHLIGHT_CLUSTERS[dim],
    }
    if dim in _ELEMENT_IDS:
        entry["element_id"] = _ELEMENT_IDS[dim]
    return entry


def main():
    """Run every prompt against img_1..img_9 and write per-dimension output.

    For each existing ``input/img_N.jpg`` and each dimension, the VLM reply
    is parsed and stored as ``<img_id>_<dim>.json`` in that dimension's
    output directory. A ``mapping.json`` listing the successful results is
    written per dimension at the end.
    """
    # Create output directories up front so a missing directory neither
    # wastes an API call nor crashes the final mapping.json write.
    for out_dir in OUTPUT_DIRS.values():
        os.makedirs(out_dir, exist_ok=True)

    # Mapping entries collected per dimension.
    all_mappings = {dim: [] for dim in OUTPUT_DIRS}

    for i in range(1, 10):
        img_id = f"img_{i}"
        image_path = os.path.join(INPUT_DIR, f"{img_id}.jpg")

        if not os.path.exists(image_path):
            continue

        print(f"\n处理 {img_id}...")

        for dim, prompt in PROMPTS.items():
            # The segment may be None (e.g. img_7 has no easel). The image
            # is still analysed; the mapping entry then records "N/A".
            seg = SEGMENT_MAPS[dim].get(img_id)

            print(f" 提取 {dim}...", end=" ")

            try:
                response = query_vlm(image_path, prompt)
                data = extract_json_from_response(response)

                # Save the per-image result.
                json_path = os.path.join(
                    OUTPUT_DIRS[dim], f"{img_id}_{dim}.json"
                )
                _write_json(json_path, {
                    "image_id": img_id,
                    "dimension": dim,
                    "segment": seg,
                    "data": data,
                })

                print("✓")

                all_mappings[dim].append(
                    _build_mapping_entry(img_id, dim, seg)
                )
            except Exception as e:
                # Best effort: report the failure and continue with the
                # next dimension/image.
                print(f"✗ 错误: {e}")
            finally:
                # Pause after failures too, so a rate-limit error is not
                # followed immediately by another request.
                time.sleep(0.5)

    # Write mapping.json for each dimension.
    for dim, mappings in all_mappings.items():
        mapping = {
            "dimension": dim,
            "description": DIM_DESCRIPTIONS[dim],
            "tool": "Google Gemini 2.0 Flash (via OpenRouter)",
            "format": {
                "json": "JSON,包含结构化语义描述和生成提示词"
            },
            "mappings": mappings,
        }
        _write_json(os.path.join(OUTPUT_DIRS[dim], "mapping.json"), mapping)

    print("\n✓ VLM语义特征提取完成")


if __name__ == "__main__":
    main()