extract_vlm_features.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. #!/usr/bin/env python3
  2. """
  3. 使用VLM提取语义特征
  4. 维度4: 人物外观语义描述 (person_appearance)
  5. 维度5: 场景构图描述 (composition_layout)
  6. 维度6: 光影氛围描述 (lighting_atmosphere)
  7. 维度8: 画中画内容描述 (painting_content)
  8. """
  9. import openai
  10. import base64
  11. import json
  12. import os
  13. import time
  14. client = openai.OpenAI(
  15. api_key=os.environ.get('OPEN_ROUTER_API_KEY'),
  16. base_url='https://openrouter.ai/api/v1'
  17. )
  18. def encode_image(image_path):
  19. with open(image_path, 'rb') as f:
  20. return base64.b64encode(f.read()).decode()
  21. def query_vlm(image_path, prompt, max_tokens=800):
  22. """调用VLM分析图片"""
  23. img_data = encode_image(image_path)
  24. response = client.chat.completions.create(
  25. model='google/gemini-2.0-flash-001',
  26. max_tokens=max_tokens,
  27. messages=[{
  28. 'role': 'user',
  29. 'content': [
  30. {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{img_data}'}},
  31. {'type': 'text', 'text': prompt}
  32. ]
  33. }]
  34. )
  35. return response.choices[0].message.content
  36. def extract_json_from_response(text):
  37. """从响应中提取JSON"""
  38. start = text.find('{')
  39. end = text.rfind('}') + 1
  40. if start >= 0 and end > start:
  41. try:
  42. return json.loads(text[start:end])
  43. except:
  44. pass
  45. return {"raw_text": text}
# ============================================================
# Dimension 4: semantic description of the person's appearance
# ============================================================
# Prompt text sent to the VLM; it asks for a JSON object describing clothing,
# hair, accessories, skin and overall impression, plus an English
# generation prompt.
PERSON_APPEARANCE_PROMPT = """请分析图片中女性人物的外观特征,以JSON格式返回:
{
"clothing": {
"type": "服装类型(如:白色长裙/白色上衣等)",
"color": "颜色",
"style": "风格描述",
"details": "细节描述(如:飘逸、垂坠感等)"
},
"hair": {
"color": "发色",
"length": "发长(长/中/短)",
"style": "发型(直发/卷发/盘发等)",
"details": "细节(如:散落肩上、发梢微卷等)"
},
"accessories": {
"earrings": "耳饰描述(无则填null)",
"necklace": "项链描述(无则填null)",
"bracelet": "手镯描述(无则填null)",
"other": "其他配饰"
},
"skin": "肤色描述",
"overall_impression": "整体气质描述(50字以内)",
"generation_prompt": "用于AI生成的英文提示词(描述人物外观,50词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 5: scene composition description
# ============================================================
# Prompt text sent to the VLM; it asks for a JSON object describing shot type,
# camera angle/position, subject placement, composition type, visual flow,
# depth layers and aspect ratio, plus an English generation prompt.
COMPOSITION_PROMPT = """请分析图片的构图特征,以JSON格式返回:
{
"shot_type": "景别(特写/近景/中景/全景/远景)",
"camera_angle": "拍摄角度(正面/侧面/背面/俯视/仰视)",
"camera_position": "相机位置描述(如:人物右后方、侧前方等)",
"subject_position": {
"horizontal": "主体水平位置(左/中/右,及大致比例)",
"vertical": "主体垂直位置(上/中/下,及大致比例)"
},
"composition_type": "构图类型(如:三分法/对角线/引导线等)",
"visual_flow": "视线引导方向描述",
"depth_layers": "景深层次(前景/中景/背景的内容)",
"aspect_ratio": "画面比例",
"generation_prompt": "用于AI生成的英文构图提示词(30词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 6: lighting and atmosphere description
# ============================================================
# Prompt text sent to the VLM; it asks for a JSON object describing light
# type, direction and quality, bokeh, colour temperature, atmosphere and mood,
# plus an English generation prompt.
LIGHTING_PROMPT = """请分析图片的光影和氛围特征,以JSON格式返回:
{
"light_type": "光照类型(自然光/逆光/侧光/散射光等)",
"light_direction": "光线方向(从哪个方向照射)",
"light_quality": "光线质感(柔和/硬朗/温暖/冷调等)",
"bokeh": {
"present": true或false,
"intensity": "虚化程度(轻微/中等/强烈)",
"description": "散景描述"
},
"color_temperature": "色温(暖/中性/冷)",
"overall_atmosphere": "整体氛围(如:梦幻/清新/温暖/浪漫等)",
"mood": "情绪感受(50字以内)",
"generation_prompt": "用于AI生成的英文光影提示词(如:soft natural backlight, bokeh background, dreamy atmosphere,30词以内)"
}
只返回JSON,不要其他文字。"""
# ============================================================
# Dimension 8: picture-in-picture (canvas) content description
# ============================================================
# Prompt text sent to the VLM; it asks for a JSON object describing whether a
# canvas on an easel is visible, what it shows, how it relates to the real
# scene and its narrative value, plus an English generation prompt.
PAINTING_CONTENT_PROMPT = """请分析图片中画架上的画布内容(如果可见),以JSON格式返回:
{
"canvas_visible": true或false,
"canvas_content": {
"subject": "画布上描绘的主题(如:人物/风景/空白等)",
"style": "绘画风格(如:油画/写实/印象派等)",
"colors": "主要颜色",
"completion": "完成程度(空白/草稿/半完成/完成)",
"description": "详细描述(50字以内)"
},
"reality_art_relationship": "现实场景与画作的关系(如:画中画/镜像/互文等)",
"narrative_value": "叙事价值描述(这个画中画结构如何增强画面叙事)",
"generation_prompt": "用于AI生成的英文提示词(描述画布内容,30词以内,如果画布不可见则填null)"
}
只返回JSON,不要其他文字。"""
  130. def main():
  131. input_dir = "input"
  132. # 各维度输出目录
  133. dirs = {
  134. "person_appearance": "output/features/person_appearance",
  135. "composition_layout": "output/features/composition_layout",
  136. "lighting_atmosphere": "output/features/lighting_atmosphere",
  137. "painting_content": "output/features/painting_content",
  138. }
  139. # 段落对应关系
  140. segment_maps = {
  141. "person_appearance": {
  142. "img_1": "段落1.1", "img_2": "段落2.1", "img_3": "段落3.1",
  143. "img_4": "段落4.1", "img_5": "段落5.1", "img_6": "段落6.1",
  144. "img_7": "段落7.1", "img_8": "段落8.1", "img_9": "段落9.1"
  145. },
  146. "composition_layout": {
  147. "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
  148. "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
  149. "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
  150. },
  151. "lighting_atmosphere": {
  152. "img_1": "段落1", "img_2": "段落2", "img_3": "段落3",
  153. "img_4": "段落4", "img_5": "段落5", "img_6": "段落6",
  154. "img_7": "段落7", "img_8": "段落8", "img_9": "段落9"
  155. },
  156. "painting_content": {
  157. "img_1": "段落1.2.1", "img_2": "段落2.2.1", "img_3": "段落3.2.1",
  158. "img_4": "段落4.2.1", "img_5": "段落5.2", "img_6": "段落6.2.1",
  159. "img_7": None, # img_7无画架
  160. "img_8": "段落8.2.1", "img_9": "段落9.2.1"
  161. }
  162. }
  163. # 高亮聚类对应
  164. highlight_clusters = {
  165. "person_appearance": "cluster_1",
  166. "composition_layout": "cluster_6",
  167. "lighting_atmosphere": "cluster_4",
  168. "painting_content": "cluster_5"
  169. }
  170. # 特征描述
  171. feature_names = {
  172. "person_appearance": "女性人物外观(服装/发型/配饰)",
  173. "composition_layout": "场景构图(景别/角度/视线引导)",
  174. "lighting_atmosphere": "光影氛围(逆光/散景/梦幻)",
  175. "painting_content": "画中画内容(画布上的油画)"
  176. }
  177. prompts = {
  178. "person_appearance": PERSON_APPEARANCE_PROMPT,
  179. "composition_layout": COMPOSITION_PROMPT,
  180. "lighting_atmosphere": LIGHTING_PROMPT,
  181. "painting_content": PAINTING_CONTENT_PROMPT,
  182. }
  183. # 存储所有维度的mappings
  184. all_mappings = {dim: [] for dim in dirs.keys()}
  185. for i in range(1, 10):
  186. img_id = f"img_{i}"
  187. image_path = os.path.join(input_dir, f"{img_id}.jpg")
  188. if not os.path.exists(image_path):
  189. continue
  190. print(f"\n处理 {img_id}...")
  191. for dim, prompt in prompts.items():
  192. # 检查是否需要处理(如img_7无画架)
  193. seg = segment_maps[dim].get(img_id)
  194. print(f" 提取 {dim}...", end=" ")
  195. try:
  196. response = query_vlm(image_path, prompt)
  197. data = extract_json_from_response(response)
  198. # 保存JSON
  199. json_path = os.path.join(dirs[dim], f"{img_id}_{dim}.json")
  200. with open(json_path, 'w', encoding='utf-8') as f:
  201. json.dump({
  202. "image_id": img_id,
  203. "dimension": dim,
  204. "segment": seg,
  205. "data": data
  206. }, f, ensure_ascii=False, indent=2)
  207. print(f"✓")
  208. # 添加到mappings
  209. mapping_entry = {
  210. "file": f"{img_id}_{dim}.json",
  211. "source_image": f"input/{img_id}.jpg",
  212. "segment": seg or "N/A",
  213. "category": "实质" if dim == "person_appearance" else "形式",
  214. "feature": feature_names[dim],
  215. "highlight_cluster": highlight_clusters[dim]
  216. }
  217. if dim == "person_appearance":
  218. mapping_entry["element_id"] = "元素1"
  219. elif dim == "painting_content":
  220. mapping_entry["element_id"] = "元素5"
  221. all_mappings[dim].append(mapping_entry)
  222. time.sleep(0.5) # 避免API限流
  223. except Exception as e:
  224. print(f"✗ 错误: {e}")
  225. # 保存各维度的mapping.json
  226. dim_descriptions = {
  227. "person_appearance": "人物外观语义描述,使用VLM分析服装/发型/配饰/气质",
  228. "composition_layout": "场景构图描述,使用VLM分析景别/角度/视线引导/空间布局",
  229. "lighting_atmosphere": "光影氛围描述,使用VLM分析光照类型/散景/色温/整体氛围",
  230. "painting_content": "画中画内容描述,使用VLM分析画布上的油画内容及叙事关系"
  231. }
  232. for dim, mappings in all_mappings.items():
  233. mapping = {
  234. "dimension": dim,
  235. "description": dim_descriptions[dim],
  236. "tool": "Google Gemini 2.0 Flash (via OpenRouter)",
  237. "format": {
  238. "json": "JSON,包含结构化语义描述和生成提示词"
  239. },
  240. "mappings": mappings
  241. }
  242. mapping_path = os.path.join(dirs[dim], "mapping.json")
  243. with open(mapping_path, 'w', encoding='utf-8') as f:
  244. json.dump(mapping, f, ensure_ascii=False, indent=2)
  245. print("\n✓ VLM语义特征提取完成")
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()