#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Image text recognition script.

Main feature: run OCR on images through the Gemini API.
"""

import os
import json
import time
import sys
from typing import Dict, Any, List, Optional
from dotenv import load_dotenv
import google.generativeai as genai
from PIL import Image
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

# Import project modules: the directory two levels above this file's own
# directory is appended to sys.path before importing the `llm` package.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from llm.openrouter import OpenRouterProcessor, OpenRouterModel

# Name of the Gemini model used for OCR.
GEMINI_MODEL_NAME = 'gemini-2.5-flash'
# Timeout, in seconds, for each image download request.
DOWNLOAD_TIMEOUT_SECONDS = 10
# Upper bound on the number of images processed concurrently.
MAX_CONCURRENT_REQUESTS = 5

# OCR prompt sent to the model together with each image.
prompt = """
#### 人设
你是一名图像文字理解专家,请对输入的文章图片进行精准的文字提取和结构化整理。

#### 任务要求如下:
1. 仅提取图片中可见的文字内容,不需要改写、总结或推理隐藏信息。
2. 如果图片包含结构(如表格、图表、标题、段落等),请按结构输出。
3. 所有提取的内容需保持原始顺序和排版上下文的逻辑。
4. 不需要进行OCR校正,只需要原样提取图中文字。
5. 舍弃图片中和标题不相关的文字
6. 对于结构不明确或自由排列的文字,按照从上到下、从左到右的顺序依次提取。

#### 输出格式
1. 仅输出提取的文字即可,不需要其他说明性的文字
"""


def _mask_secret(secret: Optional[str]) -> str:
    """Return a log-safe rendering of *secret* that hides its value.

    At most the last four characters are revealed, and only when the secret
    is longer than eight characters.
    """
    if not secret:
        return "<unset>"
    if len(secret) <= 8:
        return "****"
    return "****" + secret[-4:]


class ImageIdentifier:
    """Extract the text of remote images with a Gemini model."""

    def __init__(self):
        """Load environment variables; Gemini itself is configured lazily."""
        load_dotenv()

        # Configuration is deferred until the first real request.
        self._configured = False
        self.api_key = None
        self.model = None

    def _ensure_configured(self):
        """Configure the Gemini client on first use.

        Raises:
            ValueError: if the GEMINI_API_KEY environment variable is unset
                or empty.
        """
        if self._configured:
            return

        self.api_key = os.getenv('GEMINI_API_KEY')
        # Never write the raw key to stdout; log a masked form only.
        print(f"配置Gemini: {_mask_secret(self.api_key)}")
        if not self.api_key:
            raise ValueError("请在环境变量中设置 GEMINI_API_KEY")

        genai.configure(api_key=self.api_key)
        # The model is assigned before the flag is set, so a caller that
        # observes `_configured` as True also finds `self.model` populated.
        self.model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        self._configured = True

    def download_image(self, image_url: str) -> Optional[Image.Image]:
        """Download an image and open it as a PIL image.

        Args:
            image_url: URL of the image to fetch.

        Returns:
            The opened image, or None when the download or the decoding
            fails (the failure is printed, not raised).
        """
        try:
            response = requests.get(image_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        except Exception as e:
            # Best effort by design: one bad image must not abort the batch.
            print(f"下载图片失败 {image_url}: {e}")
            return None

    def extract_image_urls(self, formatted_content: Dict[str, Any]) -> List[str]:
        """Collect the image URLs listed in *formatted_content*.

        Args:
            formatted_content: mapping whose 'image_url_list' entry is a list
                of dicts; entries that are not dicts or lack an 'image_url'
                key are skipped.

        Returns:
            The URLs in their original order; empty when the list is
            missing, None or empty.
        """
        # `or []` also covers an explicit None stored under the key.
        image_entries = formatted_content.get('image_url_list') or []
        return [
            entry['image_url']
            for entry in image_entries
            if isinstance(entry, dict) and 'image_url' in entry
        ]

    def _analyze_single_image(self, url: str) -> Dict[str, Any]:
        """Download one image and ask Gemini for its text.

        Never raises: every failure is reported in the returned dict, which
        holds 'url', 'content' and 'success', plus 'error' on failure.
        """
        try:
            image = self.download_image(url)
            if image is None:
                return {"url": url, "content": "", "success": False, "error": "图片下载失败"}

            # Send the prompt and the image straight to Gemini.
            self._ensure_configured()
            response = self.model.generate_content([prompt, image])

            if response.text:
                return {"url": url, "content": response.text, "success": True}
            return {"url": url, "content": "", "success": False, "error": "识别失败或无内容返回"}
        except Exception as e:
            return {"url": url, "content": "", "success": False, "error": str(e)}

    def analyze_images_with_gemini(self, image_urls: List[str]) -> Dict[str, Any]:
        """Extract text from several images concurrently (text only).

        Args:
            image_urls: URLs of the images to process.

        Returns:
            {"images_comprehension": [...]} with one result dict per URL in
            input order. An "error" key is added, with an empty result list,
            when there is nothing to analyze or the fan-out itself fails.
        """
        try:
            if not image_urls:
                return {"images_comprehension": [], "error": "没有图片需要分析"}

            # `executor.map` yields results in input order, so no index
            # bookkeeping is needed; the worker itself never raises.
            with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
                results = list(executor.map(self._analyze_single_image, image_urls))

            return {"images_comprehension": results}

        except Exception as e:
            print(f"Gemini 并发调用失败: {e}")
            return {"images_comprehension": [], "error": f"Gemini API 调用失败: {str(e)}"}

    def process_images(self, formatted_content: Dict[str, Any]) -> Dict[str, Any]:
        """Run OCR on every image referenced by *formatted_content*.

        Args:
            formatted_content: mapping with an 'image_url_list' entry, as
                accepted by `extract_image_urls`.

        Returns:
            The dict produced by `analyze_images_with_gemini`, or an empty
            result with an "error" message when no image URL is found.
        """
        image_urls = self.extract_image_urls(formatted_content)

        if not image_urls:
            print("没有图片需要分析")
            return {"images_comprehension": [], "error": "没有图片需要分析"}

        result = self.analyze_images_with_gemini(image_urls)

        # An empty or missing result list means the whole batch failed.
        if not result.get("images_comprehension"):
            print("图片OCR识别失败")

        return result


def main():
    """Manual smoke test against sample image URLs."""
    # Sample payload.
    test_content = {
        "image_url_list": [
            {
                "image_type": 2,
                "image_url": "http://rescdn.yishihui.com/pipeline/image/ea4f33e9-9e36-4124-aaec-138ea9bcadd9.jpg"
            },
            {
                "image_type": 2,
                "image_url": "http://rescdn.yishihui.com/pipeline/image/ea4f33e9-9e36-4124-aaec-138ea9bcadd9.jpg"
            }
        ]
    }

    try:
        identifier = ImageIdentifier()
        result = identifier.process_images(test_content)
        print(f"识别结果: {json.dumps(result, ensure_ascii=False, indent=2)}")
    except Exception as e:
        print(f"初始化失败: {e}")


if __name__ == '__main__':
    main()