123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 图文识别脚本
- 主要功能:使用 Gemini API 进行图片OCR识别
- """
- import os
- import json
- import time
- import sys
- from typing import Dict, Any, List, Optional
- from dotenv import load_dotenv
- import google.generativeai as genai
- from PIL import Image
- import requests
- from io import BytesIO
- from concurrent.futures import ThreadPoolExecutor, as_completed
- # 导入自定义模块
- sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
- from llm.openrouter import OpenRouterProcessor, OpenRouterModel
- from utils.logging_config import get_logger
- # Create the module-level logger named 'ImageIdentifier' via the project's get_logger helper.
- logger = get_logger('ImageIdentifier')
- # OCR prompt passed to the model together with each image: extract only the visible text, keep the original structure and reading order, and output nothing but the extracted text.
- prompt = """
- #### 人设
- 你是一名图像文字理解专家,请对输入的文章图片进行精准的文字提取和结构化整理。
- #### 任务要求如下:
- 1. 仅提取图片中可见的文字内容,不需要改写、总结或推理隐藏信息。
- 2. 如果图片包含结构(如表格、图表、标题、段落等),请按结构输出。
- 3. 所有提取的内容需保持原始顺序和排版上下文的逻辑。
- 4. 不需要进行OCR校正,只需要原样提取图中文字。
- 5. 舍弃图片中和标题不相关的文字
- 6. 对于结构不明确或自由排列的文字,按照从上到下、从左到右的顺序依次提取。
- #### 输出格式
- 1. 仅输出提取的文字即可,不需要其他说明性的文字
- """
class ImageIdentifier:
    """Extract the visible text of images (OCR) with the Gemini API.

    The Gemini client is configured lazily, on the first recognition request,
    so constructing an instance never requires the API key to be present.
    """

    # Name of the environment variable that holds the Gemini API key.
    API_KEY_ENV = 'GEMINI_API_KEY_1'
    # Gemini model used for text extraction.
    MODEL_NAME = 'gemini-2.5-flash'
    # Timeout, in seconds, applied to every image download.
    DOWNLOAD_TIMEOUT = 10

    def __init__(self):
        """Load variables from a .env file and set up the unconfigured state."""
        load_dotenv()

        # Gemini is configured on first use; see _ensure_configured().
        self._configured = False
        self.api_key = None
        self.model = None

    def _ensure_configured(self):
        """Configure the Gemini client and model once; later calls are no-ops.

        Raises:
            ValueError: if the API key environment variable is unset or empty.
        """
        if self._configured:
            return

        self.api_key = os.getenv(self.API_KEY_ENV)
        if not self.api_key:
            # Name the variable that is actually read so the hint is actionable.
            raise ValueError(f"请在环境变量中设置 {self.API_KEY_ENV}")
        genai.configure(api_key=self.api_key)

        # Disable blocking for every harm category so that OCR output is not
        # filtered by the safety system.
        category = genai.types.HarmCategory
        block_none = genai.types.HarmBlockThreshold.BLOCK_NONE
        self.model = genai.GenerativeModel(
            self.MODEL_NAME,
            safety_settings={
                category.HARM_CATEGORY_DANGEROUS_CONTENT: block_none,
                category.HARM_CATEGORY_HARASSMENT: block_none,
                category.HARM_CATEGORY_HATE_SPEECH: block_none,
                category.HARM_CATEGORY_SEXUALLY_EXPLICIT: block_none,
            },
        )
        self._configured = True
        # Never log the API key itself; the model name is enough for tracing.
        logger.info(f"Gemini 已配置, 模型: {self.MODEL_NAME}")

    def download_image(self, image_url: str) -> Optional["Image.Image"]:
        """Download ``image_url`` and open it as a PIL image.

        Returns:
            The opened image, or ``None`` when the request or the decoding
            fails (the failure is logged, not raised).
        """
        try:
            response = requests.get(image_url, timeout=self.DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        except Exception as e:
            # Best effort: a single bad image must not abort the whole batch.
            logger.error(f"下载图片失败 {image_url}: {e}")
            return None

    def extract_image_urls(self, formatted_content: Dict[str, Any]) -> List[str]:
        """Collect the ``image_url`` values from ``formatted_content``.

        Entries of ``image_url_list`` that are not dicts, or that lack an
        ``image_url`` key, are skipped. A missing or ``None`` list yields an
        empty result.
        """
        # `or []` also covers the key being present with a None value.
        image_url_list = formatted_content.get('image_url_list') or []
        return [
            img_data['image_url']
            for img_data in image_url_list
            if isinstance(img_data, dict) and 'image_url' in img_data
        ]

    def _analyze_single_image(self, url: str) -> Dict[str, Any]:
        """Download one image and extract its text; never raises.

        Returns:
            A dict with ``url``, ``content`` and ``success``; failed entries
            additionally carry an ``error`` message and an empty ``content``.
        """
        def failure(message: str) -> Dict[str, Any]:
            return {"url": url, "content": "", "success": False, "error": message}

        try:
            image = self.download_image(url)
            if image is None:
                return failure("图片下载失败")

            self._ensure_configured()
            response = self.model.generate_content([prompt, image])

            # Reading `.text` is guarded separately so that a failure to read
            # the response text is reported distinctly from a request failure.
            try:
                text = response.text
            except Exception as text_error:
                logger.error(f"获取响应文本失败: {text_error}")
                return failure(f"获取响应文本失败: {str(text_error)}")

            if not text:
                return failure("识别失败或无内容返回")
            return {"url": url, "content": text, "success": True}
        except Exception as e:
            logger.error(f"图片识别失败 {url}: {e}")
            return failure(str(e))

    def analyze_images_with_gemini(self, image_urls: List[str]) -> Dict[str, Any]:
        """Extract text from each image with Gemini, one image at a time.

        Images are processed sequentially and the results keep the order of
        ``image_urls``.

        Returns:
            ``{"images_comprehension": [...]}`` with one entry per URL. When
            there is nothing to analyze, or an unexpected error occurs, the
            list is empty and an ``error`` key describes the problem.
        """
        if not image_urls:
            return {"images_comprehension": [], "error": "没有图片需要分析"}

        try:
            results = [self._analyze_single_image(url) for url in image_urls]
        except Exception as e:
            logger.error(f"Gemini 并发调用失败: {e}")
            return {"images_comprehension": [], "error": f"Gemini API 调用失败: {str(e)}"}
        return {"images_comprehension": results}

    def process_images(self, formatted_content: Dict[str, Any]) -> Dict[str, Any]:
        """Run OCR on every image referenced by ``formatted_content``.

        Returns:
            The result of :meth:`analyze_images_with_gemini`, or an empty
            result with an ``error`` key when no image URL is present.
        """
        image_urls = self.extract_image_urls(formatted_content)

        if not image_urls:
            logger.info("没有图片需要分析")
            return {"images_comprehension": [], "error": "没有图片需要分析"}

        result = self.analyze_images_with_gemini(image_urls)

        comprehension = result.get("images_comprehension")
        if comprehension:
            successful_count = sum(1 for img in comprehension if img.get('success', False))
            logger.info(f"图片OCR识别完成: 成功 {successful_count}/{len(comprehension)}")
        else:
            logger.error("图片OCR识别失败")

        return result
def main():
    """Run a manual smoke test: OCR a sample payload and print the JSON result."""
    # The sample payload lists the same image URL twice.
    sample_url = "http://rescdn.yishihui.com/pipeline/image/ea4f33e9-9e36-4124-aaec-138ea9bcadd9.jpg"
    sample_payload = {
        "image_url_list": [
            {"image_type": 2, "image_url": sample_url} for _ in range(2)
        ]
    }

    try:
        ocr_result = ImageIdentifier().process_images(sample_payload)
        rendered = json.dumps(ocr_result, ensure_ascii=False, indent=2)
        print(f"识别结果: {rendered}")
    except Exception as exc:
        # Covers failures from both construction and processing.
        print(f"初始化失败: {exc}")


if __name__ == "__main__":
    main()
|