| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281 |
- #!/usr/bin/env python3
- """
- 使用VLM提取语义特征
- 维度4: 人物外观语义描述 (person_appearance)
- 维度5: 场景构图描述 (composition_layout)
- 维度6: 光影氛围描述 (lighting_atmosphere)
- 维度8: 画中画内容描述 (painting_content)
- """
- import openai
- import base64
- import json
- import os
- import time
# Module-level OpenAI-compatible client pointed at the OpenRouter endpoint.
# The API key is read from the OPEN_ROUTER_API_KEY environment variable;
# os.environ.get() yields None when the variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get('OPEN_ROUTER_API_KEY'),
    base_url='https://openrouter.ai/api/v1'
)
def encode_image(image_path):
    """Return the bytes of the file at *image_path* as a Base64 text string."""
    with open(image_path, mode='rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
def query_vlm(image_path, prompt, max_tokens=800,
              model='google/gemini-2.0-flash-001'):
    """Send one image plus a text prompt to the VLM and return its reply.

    The image is embedded in the request as a Base64 ``data:`` URL.

    Args:
        image_path: Path of the image file to analyse.
        prompt: Instruction text sent together with the image.
        max_tokens: Upper bound on generated tokens passed to the API.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        The ``content`` of the first choice's message.
        NOTE(review): this may be ``None`` if the API returns no text
        content -- confirm against the client library.

    Raises:
        OSError: If the image file cannot be read.
        Any exception raised by the API client is propagated unchanged.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import mimetypes

    # Declare the real media type instead of always claiming JPEG; fall back
    # to JPEG when the extension is unknown or not an image type.
    mime_type, _ = mimetypes.guess_type(os.fspath(image_path))
    if not mime_type or not mime_type.startswith('image/'):
        mime_type = 'image/jpeg'

    img_data = encode_image(image_path)

    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
            'role': 'user',
            'content': [
                {'type': 'image_url',
                 'image_url': {'url': f'data:{mime_type};base64,{img_data}'}},
                {'type': 'text', 'text': prompt}
            ]
        }]
    )
    return response.choices[0].message.content
def extract_json_from_response(text):
    """Extract the JSON object embedded in a model response.

    Decoding starts at the first ``{`` in *text* and stops at the end of that
    JSON value, so leading prose, Markdown code fences and trailing text
    (even text that itself contains ``}``) are ignored.

    Args:
        text: Raw response text. Non-string values such as ``None`` are
            accepted and reported back unchanged.

    Returns:
        The decoded JSON value, or ``{"raw_text": text}`` when *text* is not
        a string, contains no ``{``, or the JSON starting at the first ``{``
        cannot be decoded.
    """
    if not isinstance(text, str):
        return {"raw_text": text}

    start = text.find('{')
    if start < 0:
        return {"raw_text": text}

    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates anything that follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; keep the best-effort fallback
        # for malformed or truncated responses.
        return {"raw_text": text}
    return parsed
# ============================================================
# Dimension 4: semantic description of the person's appearance
# ============================================================
# Prompt asking the model for a JSON object with the keys clothing, hair,
# accessories, skin, overall_impression and generation_prompt.
PERSON_APPEARANCE_PROMPT = """请分析图片中女性人物的外观特征,以JSON格式返回:
{
"clothing": {
"type": "服装类型(如:白色长裙/白色上衣等)",
"color": "颜色",
"style": "风格描述",
"details": "细节描述(如:飘逸、垂坠感等)"
},
"hair": {
"color": "发色",
"length": "发长(长/中/短)",
"style": "发型(直发/卷发/盘发等)",
"details": "细节(如:散落肩上、发梢微卷等)"
},
"accessories": {
"earrings": "耳饰描述(无则填null)",
"necklace": "项链描述(无则填null)",
"bracelet": "手镯描述(无则填null)",
"other": "其他配饰"
},
"skin": "肤色描述",
"overall_impression": "整体气质描述(50字以内)",
"generation_prompt": "用于AI生成的英文提示词(描述人物外观,50词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 5: scene composition description
# ============================================================
# Prompt asking the model for a JSON object with the keys shot_type,
# camera_angle, camera_position, subject_position, composition_type,
# visual_flow, depth_layers, aspect_ratio and generation_prompt.
COMPOSITION_PROMPT = """请分析图片的构图特征,以JSON格式返回:
{
"shot_type": "景别(特写/近景/中景/全景/远景)",
"camera_angle": "拍摄角度(正面/侧面/背面/俯视/仰视)",
"camera_position": "相机位置描述(如:人物右后方、侧前方等)",
"subject_position": {
"horizontal": "主体水平位置(左/中/右,及大致比例)",
"vertical": "主体垂直位置(上/中/下,及大致比例)"
},
"composition_type": "构图类型(如:三分法/对角线/引导线等)",
"visual_flow": "视线引导方向描述",
"depth_layers": "景深层次(前景/中景/背景的内容)",
"aspect_ratio": "画面比例",
"generation_prompt": "用于AI生成的英文构图提示词(30词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 6: lighting and atmosphere description
# ============================================================
# Prompt asking the model for a JSON object with the keys light_type,
# light_direction, light_quality, bokeh, color_temperature,
# overall_atmosphere, mood and generation_prompt.
LIGHTING_PROMPT = """请分析图片的光影和氛围特征,以JSON格式返回:
{
"light_type": "光照类型(自然光/逆光/侧光/散射光等)",
"light_direction": "光线方向(从哪个方向照射)",
"light_quality": "光线质感(柔和/硬朗/温暖/冷调等)",
"bokeh": {
"present": true或false,
"intensity": "虚化程度(轻微/中等/强烈)",
"description": "散景描述"
},
"color_temperature": "色温(暖/中性/冷)",
"overall_atmosphere": "整体氛围(如:梦幻/清新/温暖/浪漫等)",
"mood": "情绪感受(50字以内)",
"generation_prompt": "用于AI生成的英文光影提示词(如:soft natural backlight, bokeh background, dreamy atmosphere,30词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 8: picture-in-picture (canvas) content description
# ============================================================
# Prompt asking the model for a JSON object with the keys canvas_visible,
# canvas_content, reality_art_relationship, narrative_value and
# generation_prompt.
PAINTING_CONTENT_PROMPT = """请分析图片中画架上的画布内容(如果可见),以JSON格式返回:
{
"canvas_visible": true或false,
"canvas_content": {
"subject": "画布上描绘的主题(如:人物/风景/空白等)",
"style": "绘画风格(如:油画/写实/印象派等)",
"colors": "主要颜色",
"completion": "完成程度(空白/草稿/半完成/完成)",
"description": "详细描述(50字以内)"
},
"reality_art_relationship": "现实场景与画作的关系(如:画中画/镜像/互文等)",
"narrative_value": "叙事价值描述(这个画中画结构如何增强画面叙事)",
"generation_prompt": "用于AI生成的英文提示词(描述画布内容,30词以内,如果画布不可见则填null)"
}
只返回JSON,不要其他文字。"""
def main():
    """Extract four semantic feature dimensions from the input images.

    For every ``input/img_<i>.jpg`` (i = 1..9) that exists, each dimension's
    prompt is sent to ``query_vlm`` and the parsed reply is written to
    ``<dimension dir>/<img_id>_<dimension>.json``. After all images have been
    processed, a ``mapping.json`` index of the successfully written files is
    saved in every dimension directory.

    A failure for a single image/dimension pair is printed and skipped so the
    remaining pairs are still processed.
    """
    input_dir = "input"

    # Output directory for each dimension.
    dirs = {
        "person_appearance": "output/features/person_appearance",
        "composition_layout": "output/features/composition_layout",
        "lighting_atmosphere": "output/features/lighting_atmosphere",
        "painting_content": "output/features/painting_content",
    }

    # Create the output directories up front: open(..., 'w') does not create
    # missing parent directories, so a fresh checkout would otherwise fail on
    # every write.
    for out_dir in dirs.values():
        os.makedirs(out_dir, exist_ok=True)

    # Segment label recorded for each image, per dimension.
    segment_maps = {
        "person_appearance": {
            "img_1": "段落1.1", "img_2": "段落2.1", "img_3": "段落3.1",
            "img_4": "段落4.1", "img_5": "段落5.1", "img_6": "段落6.1",
            "img_7": "段落7.1", "img_8": "段落8.1", "img_9": "段落9.1"
        },
        "composition_layout": {
            "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
            "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
            "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
        },
        "lighting_atmosphere": {
            "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
            "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
            "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
        },
        "painting_content": {
            "img_1": "段落1.2.1", "img_2": "段落2.2.1", "img_3": "段落3.2.1",
            "img_4": "段落4.2.1", "img_5": "段落5.2", "img_6": "段落6.2.1",
            "img_7": None,  # img_7 has no easel
            "img_8": "段落8.2.1", "img_9": "段落9.2.1"
        }
    }

    # Highlight cluster recorded for each dimension.
    highlight_clusters = {
        "person_appearance": "cluster_1",
        "composition_layout": "cluster_6",
        "lighting_atmosphere": "cluster_4",
        "painting_content": "cluster_5"
    }

    # Human-readable feature name recorded for each dimension.
    feature_names = {
        "person_appearance": "女性人物外观(服装/发型/配饰)",
        "composition_layout": "场景构图(景别/角度/视线引导)",
        "lighting_atmosphere": "光影氛围(逆光/散景/梦幻)",
        "painting_content": "画中画内容(画布上的油画)"
    }

    prompts = {
        "person_appearance": PERSON_APPEARANCE_PROMPT,
        "composition_layout": COMPOSITION_PROMPT,
        "lighting_atmosphere": LIGHTING_PROMPT,
        "painting_content": PAINTING_CONTENT_PROMPT,
    }

    # Mapping entries collected per dimension.
    all_mappings = {dim: [] for dim in dirs}

    for i in range(1, 10):
        img_id = f"img_{i}"
        image_path = os.path.join(input_dir, f"{img_id}.jpg")

        if not os.path.exists(image_path):
            continue

        print(f"\n处理 {img_id}...")

        for dim, prompt in prompts.items():
            # Segment label for this image; None when no segment applies
            # (e.g. img_7 has no easel). The image is still analysed and the
            # mapping entry records "N/A" in that case.
            seg = segment_maps[dim].get(img_id)

            print(f" 提取 {dim}...", end=" ")

            try:
                response = query_vlm(image_path, prompt)
                data = extract_json_from_response(response)

                # Save the per-image JSON result.
                json_path = os.path.join(dirs[dim], f"{img_id}_{dim}.json")
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump({
                        "image_id": img_id,
                        "dimension": dim,
                        "segment": seg,
                        "data": data
                    }, f, ensure_ascii=False, indent=2)
            except Exception as e:
                # Per-item boundary: report the failure and keep going.
                print(f"✗ 错误: {e}")
            else:
                print("✓")

                # Record the written file in this dimension's mapping.
                mapping_entry = {
                    "file": f"{img_id}_{dim}.json",
                    "source_image": f"input/{img_id}.jpg",
                    "segment": seg or "N/A",
                    "category": "实质" if dim == "person_appearance" else "形式",
                    "feature": feature_names[dim],
                    "highlight_cluster": highlight_clusters[dim]
                }
                if dim == "person_appearance":
                    mapping_entry["element_id"] = "元素1"
                elif dim == "painting_content":
                    mapping_entry["element_id"] = "元素5"

                all_mappings[dim].append(mapping_entry)

            # Throttle after every request, including failed ones, to avoid
            # hitting the API rate limit.
            time.sleep(0.5)

    # Description stored in each dimension's mapping.json.
    dim_descriptions = {
        "person_appearance": "人物外观语义描述,使用VLM分析服装/发型/配饰/气质",
        "composition_layout": "场景构图描述,使用VLM分析景别/角度/视线引导/空间布局",
        "lighting_atmosphere": "光影氛围描述,使用VLM分析光照类型/散景/色温/整体氛围",
        "painting_content": "画中画内容描述,使用VLM分析画布上的油画内容及叙事关系"
    }

    for dim, mappings in all_mappings.items():
        mapping = {
            "dimension": dim,
            "description": dim_descriptions[dim],
            "tool": "Google Gemini 2.0 Flash (via OpenRouter)",
            "format": {
                "json": "JSON,包含结构化语义描述和生成提示词"
            },
            "mappings": mappings
        }

        mapping_path = os.path.join(dirs[dim], "mapping.json")
        with open(mapping_path, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, ensure_ascii=False, indent=2)

    print("\n✓ VLM语义特征提取完成")
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|