|
|
@@ -10,8 +10,11 @@
|
|
|
"""
|
|
|
|
|
|
import asyncio
|
|
|
+import base64
|
|
|
import json
|
|
|
+import mimetypes
|
|
|
import os
|
|
|
+import traceback
|
|
|
from datetime import datetime
|
|
|
from typing import Optional
|
|
|
from pydantic import BaseModel, Field
|
|
|
@@ -26,6 +29,13 @@ API_TIMEOUT = 120
|
|
|
ENABLE_CACHE = True # 是否启用评估结果缓存
|
|
|
CACHE_DIR = ".evaluation_cache" # 缓存目录
|
|
|
|
|
|
+# 视频处理配置
|
|
|
+MAX_VIDEO_SIZE_MB = 60 # 最大视频大小限制(MB)
|
|
|
+VIDEO_DOWNLOAD_TIMEOUT = 60 # 视频下载超时(秒)
|
|
|
+TEMP_VIDEO_DIR = "/tmp/kg_agent_videos" # 临时视频存储目录(同时也是缓存目录)
|
|
|
+VIDEO_CHUNK_SIZE = 8192 # 下载分块大小(字节)
|
|
|
+MAX_VIDEO_DOWNLOAD_RETRIES = 2 # 下载重试次数
|
|
|
+
|
|
|
# ============================================================================
|
|
|
# 数据模型
|
|
|
# ============================================================================
|
|
|
@@ -987,6 +997,249 @@ PROMPT4_CATEGORY_MATCH = """# Prompt 2: 多模态内容品类匹配评估
|
|
|
# 辅助函数
|
|
|
# ============================================================================
|
|
|
|
|
|
+# 视频处理函数
|
|
|
+async def download_video(video_url: str, note_id: str) -> Optional[str]:
|
|
|
+ """
|
|
|
+ 异步下载视频到本地文件(支持缓存)
|
|
|
+
|
|
|
+ Args:
|
|
|
+ video_url: 视频URL
|
|
|
+ note_id: 帖子ID(用于文件命名)
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 本地文件路径,失败返回None
|
|
|
+ """
|
|
|
+ os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)
|
|
|
+ video_path = os.path.join(TEMP_VIDEO_DIR, f"{note_id}.mp4")
|
|
|
+
|
|
|
+ # 检查视频缓存(如果文件已存在,直接返回)
|
|
|
+ if os.path.exists(video_path):
|
|
|
+ file_size = os.path.getsize(video_path)
|
|
|
+ print(f" ♻️ 使用缓存的视频: {file_size / 1024 / 1024:.2f}MB")
|
|
|
+ return video_path
|
|
|
+
|
|
|
+ for attempt in range(MAX_VIDEO_DOWNLOAD_RETRIES + 1):
|
|
|
+ try:
|
|
|
+ loop = asyncio.get_event_loop()
|
|
|
+ response = await loop.run_in_executor(
|
|
|
+ None,
|
|
|
+ lambda: requests.get(
|
|
|
+ video_url,
|
|
|
+ stream=True,
|
|
|
+ timeout=VIDEO_DOWNLOAD_TIMEOUT
|
|
|
+ )
|
|
|
+ )
|
|
|
+
|
|
|
+ if response.status_code != 200:
|
|
|
+ raise Exception(f"HTTP {response.status_code}")
|
|
|
+
|
|
|
+ # 检查Content-Length header(如果存在)
|
|
|
+ content_length = response.headers.get('content-length')
|
|
|
+ if content_length:
|
|
|
+ size_mb = int(content_length) / 1024 / 1024
|
|
|
+ print(f" 📊 视频大小: {size_mb:.2f}MB")
|
|
|
+ if size_mb > MAX_VIDEO_SIZE_MB:
|
|
|
+ print(f" ⚠️ 视频超过{MAX_VIDEO_SIZE_MB}MB限制,跳过下载")
|
|
|
+ return None
|
|
|
+
|
|
|
+ # 流式下载,检查大小
|
|
|
+ current_size = 0
|
|
|
+ max_size = MAX_VIDEO_SIZE_MB * 1024 * 1024
|
|
|
+
|
|
|
+ with open(video_path, 'wb') as f:
|
|
|
+ for chunk in response.iter_content(chunk_size=VIDEO_CHUNK_SIZE):
|
|
|
+ if chunk:
|
|
|
+ current_size += len(chunk)
|
|
|
+ if current_size > max_size:
|
|
|
+ # 删除不完整的文件
|
|
|
+ if os.path.exists(video_path):
|
|
|
+ try:
|
|
|
+ os.remove(video_path)
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+ print(f" ⚠️ 视频超过{MAX_VIDEO_SIZE_MB}MB限制")
|
|
|
+ return None
|
|
|
+ f.write(chunk)
|
|
|
+
|
|
|
+ print(f" 📥 视频下载成功: {current_size / 1024 / 1024:.2f}MB")
|
|
|
+ return video_path
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ if attempt < MAX_VIDEO_DOWNLOAD_RETRIES:
|
|
|
+ wait_time = 2 * (attempt + 1)
|
|
|
+ print(f" ⚠️ 下载失败,{wait_time}秒后重试 ({attempt + 1}/{MAX_VIDEO_DOWNLOAD_RETRIES}) - {str(e)[:100]}")
|
|
|
+ await asyncio.sleep(wait_time)
|
|
|
+ else:
|
|
|
+ print(f" ❌ 视频下载失败: {str(e)[:100]}")
|
|
|
+ print(f" 📋 错误详情: {traceback.format_exc()[:300]}")
|
|
|
+ # 清理可能的不完整文件
|
|
|
+ if os.path.exists(video_path):
|
|
|
+ try:
|
|
|
+ os.remove(video_path)
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+ return None
|
|
|
+
|
|
|
+
|
|
|
+async def encode_video_to_base64(video_path: str) -> Optional[str]:
|
|
|
+ """
|
|
|
+ 将视频文件编码为base64字符串
|
|
|
+
|
|
|
+ Args:
|
|
|
+ video_path: 本地视频文件路径
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ base64编码字符串,失败返回None
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ # 检查文件是否存在
|
|
|
+ if not os.path.exists(video_path):
|
|
|
+ print(f" ❌ 视频文件不存在: {video_path}")
|
|
|
+ return None
|
|
|
+
|
|
|
+ # 检查文件大小(base64编码会增加约33%的大小)
|
|
|
+ file_size = os.path.getsize(video_path)
|
|
|
+ estimated_base64_size = file_size * 1.33 # base64编码后的大小估算
|
|
|
+ max_memory_size = MAX_VIDEO_SIZE_MB * 1024 * 1024 * 1.5 # 允许一些余量
|
|
|
+
|
|
|
+ if estimated_base64_size > max_memory_size:
|
|
|
+ print(f" ⚠️ 视频文件过大,无法编码到内存 ({file_size / 1024 / 1024:.2f}MB)")
|
|
|
+ return None
|
|
|
+
|
|
|
+ loop = asyncio.get_event_loop()
|
|
|
+
|
|
|
+ def _encode_video():
|
|
|
+ """同步编码函数"""
|
|
|
+ try:
|
|
|
+ with open(video_path, 'rb') as f:
|
|
|
+ video_data = f.read()
|
|
|
+ base64_str = base64.b64encode(video_data).decode('utf-8')
|
|
|
+ return base64_str
|
|
|
+ except MemoryError as e:
|
|
|
+ print(f" ❌ 内存不足,无法编码视频: {str(e)[:100]}")
|
|
|
+ raise
|
|
|
+ except Exception as e:
|
|
|
+ print(f" ❌ 读取/编码视频文件失败: {str(e)[:100]}")
|
|
|
+ raise
|
|
|
+
|
|
|
+ base64_str = await loop.run_in_executor(None, _encode_video)
|
|
|
+
|
|
|
+ if base64_str:
|
|
|
+ print(f" 🔐 视频编码完成: {len(base64_str) / 1024 / 1024:.2f}MB (base64)")
|
|
|
+ return base64_str
|
|
|
+ else:
|
|
|
+ return None
|
|
|
+
|
|
|
+ except MemoryError as e:
|
|
|
+ print(f" ❌ 内存不足,无法编码视频: {str(e)[:100]}")
|
|
|
+ print(f" 📋 错误详情: {traceback.format_exc()[:300]}")
|
|
|
+ return None
|
|
|
+ except Exception as e:
|
|
|
+ print(f" ❌ 视频编码失败: {str(e)[:100]}")
|
|
|
+ print(f" 📋 错误详情: {traceback.format_exc()[:300]}")
|
|
|
+ return None
|
|
|
+
|
|
|
+
|
|
|
+def get_video_mime_type(video_path: str) -> str:
|
|
|
+ """
|
|
|
+ 检测视频的MIME类型
|
|
|
+
|
|
|
+ Args:
|
|
|
+ video_path: 视频文件路径
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ MIME类型字符串 (默认 "video/mp4")
|
|
|
+ """
|
|
|
+ mime_type, _ = mimetypes.guess_type(video_path)
|
|
|
+ if mime_type and mime_type.startswith('video/'):
|
|
|
+ return mime_type
|
|
|
+ return "video/mp4"
|
|
|
+
|
|
|
+
|
|
|
+# 视频不再清理,保留作为缓存
|
|
|
+# async def cleanup_video(video_path: str):
|
|
|
+# """
|
|
|
+# 清理临时视频文件
|
|
|
+#
|
|
|
+# Args:
|
|
|
+# video_path: 要删除的视频路径
|
|
|
+# """
|
|
|
+# try:
|
|
|
+# if os.path.exists(video_path):
|
|
|
+# os.remove(video_path)
|
|
|
+# print(f" 🗑️ 清理临时文件: {os.path.basename(video_path)}")
|
|
|
+# except Exception as e:
|
|
|
+# print(f" ⚠️ 清理失败: {str(e)[:50]}")
|
|
|
+
|
|
|
+
|
|
|
+async def _prepare_media_content(post) -> tuple[list[str], Optional[str], str]:
|
|
|
+ """
|
|
|
+ 统一准备媒体内容(图片+视频)
|
|
|
+
|
|
|
+ Args:
|
|
|
+ post: Post对象
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ (image_urls, video_base64, video_mime_type)
|
|
|
+ """
|
|
|
+ # 提取图片(包括视频封面图)
|
|
|
+ image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
|
|
|
+
|
|
|
+ # 初始化视频相关变量
|
|
|
+ video_base64 = None
|
|
|
+ video_mime_type = "video/mp4"
|
|
|
+
|
|
|
+ # 处理视频
|
|
|
+ if post.type == "video" and post.video:
|
|
|
+ print(f" 🎬 检测到视频帖子 (ID: {post.note_id})")
|
|
|
+ print(f" 📍 视频URL: {post.video[:80]}...")
|
|
|
+ print(f" 🖼️ 封面图数量: {len(image_urls)}")
|
|
|
+ print(f" ⏳ 开始下载视频...")
|
|
|
+
|
|
|
+ video_path = None
|
|
|
+ try:
|
|
|
+ # 下载视频
|
|
|
+ video_path = await download_video(post.video, post.note_id)
|
|
|
+
|
|
|
+ if video_path and os.path.exists(video_path):
|
|
|
+ try:
|
|
|
+ print(f" 🔄 开始编码视频...")
|
|
|
+ # 编码视频
|
|
|
+ video_base64 = await encode_video_to_base64(video_path)
|
|
|
+ if video_base64:
|
|
|
+ video_mime_type = get_video_mime_type(video_path)
|
|
|
+ print(f" ✅ 视频处理成功!类型: {video_mime_type}")
|
|
|
+ print(f" 📦 将使用视频+封面图进行评估")
|
|
|
+ else:
|
|
|
+ print(f" ⚠️ 视频编码失败,降级使用封面图评估")
|
|
|
+ except MemoryError as e:
|
|
|
+ print(f" ⚠️ 内存不足,无法处理视频: {str(e)[:100]}")
|
|
|
+ print(f" 📦 降级使用封面图评估")
|
|
|
+ except Exception as e:
|
|
|
+ print(f" ⚠️ 视频编码异常: {str(e)[:100]}")
|
|
|
+ print(f" 📋 错误详情: {traceback.format_exc()[:300]}")
|
|
|
+ print(f" 📦 降级使用封面图评估")
|
|
|
+ # 视频不再清理,保留作为缓存
|
|
|
+ else:
|
|
|
+ print(f" ⚠️ 视频下载失败,降级使用封面图评估")
|
|
|
+ except Exception as e:
|
|
|
+ print(f" ⚠️ 视频处理流程异常: {str(e)[:100]}")
|
|
|
+ print(f" 📋 错误详情: {traceback.format_exc()[:300]}")
|
|
|
+ print(f" 📦 降级使用封面图评估")
|
|
|
+ # 视频不再清理,保留作为缓存
|
|
|
+ elif post.type == "video" and not post.video:
|
|
|
+ print(f" ⚠️ 视频类型帖子但video字段为空 (ID: {post.note_id})")
|
|
|
+
|
|
|
+ # 打印最终使用的媒体
|
|
|
+ if post.type == "video":
|
|
|
+ if video_base64:
|
|
|
+ print(f" 📊 最终媒体: {len(image_urls)}张图片 + 1个视频")
|
|
|
+ else:
|
|
|
+ print(f" 📊 最终媒体: {len(image_urls)}张图片 (视频处理失败)")
|
|
|
+
|
|
|
+ return image_urls, video_base64, video_mime_type
|
|
|
+
|
|
|
+
|
|
|
def _get_cache_key(note_id: str) -> str:
|
|
|
"""
|
|
|
生成缓存key
|
|
|
@@ -1102,6 +1355,8 @@ def _clean_json_response(content_text: str) -> str:
|
|
|
async def _call_openrouter_api(
|
|
|
prompt_text: str,
|
|
|
image_urls: list[str],
|
|
|
+ video_base64: Optional[str] = None,
|
|
|
+ video_mime_type: str = "video/mp4",
|
|
|
semaphore: Optional[asyncio.Semaphore] = None
|
|
|
) -> dict:
|
|
|
"""
|
|
|
@@ -1110,6 +1365,8 @@ async def _call_openrouter_api(
|
|
|
Args:
|
|
|
prompt_text: Prompt文本
|
|
|
image_urls: 图片URL列表
|
|
|
+ video_base64: 视频的base64编码字符串(可选)
|
|
|
+ video_mime_type: 视频MIME类型(默认video/mp4)
|
|
|
semaphore: 并发控制信号量
|
|
|
|
|
|
Returns:
|
|
|
@@ -1123,6 +1380,11 @@ async def _call_openrouter_api(
|
|
|
for url in image_urls:
|
|
|
content.append({"type": "image_url", "image_url": {"url": url}})
|
|
|
|
|
|
+ # 添加视频(如果存在)
|
|
|
+ if video_base64:
|
|
|
+ data_url = f"data:{video_mime_type};base64,{video_base64}"
|
|
|
+ content.append({"type": "video_url", "video_url": {"url": data_url}})
|
|
|
+
|
|
|
payload = {
|
|
|
"model": MODEL_NAME,
|
|
|
"messages": [{"role": "user", "content": content}],
|
|
|
@@ -1195,10 +1457,8 @@ async def evaluate_is_knowledge(
|
|
|
Returns:
|
|
|
KnowledgeEvaluation 或 None(失败时)
|
|
|
"""
|
|
|
- if post.type == "video":
|
|
|
- return None
|
|
|
-
|
|
|
- image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
|
|
|
+ # 准备媒体内容(图片+视频)
|
|
|
+ image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
|
|
|
|
|
|
try:
|
|
|
prompt_text = PROMPT1_IS_KNOWLEDGE.format(
|
|
|
@@ -1207,7 +1467,7 @@ async def evaluate_is_knowledge(
|
|
|
num_images=len(image_urls)
|
|
|
)
|
|
|
|
|
|
- data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
|
|
|
+ data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
|
|
|
|
|
|
return KnowledgeEvaluation(
|
|
|
is_knowledge=data.get("is_knowledge", False),
|
|
|
@@ -1239,10 +1499,8 @@ async def evaluate_is_content_knowledge(
|
|
|
Returns:
|
|
|
ContentKnowledgeEvaluation 或 None(失败时)
|
|
|
"""
|
|
|
- if post.type == "video":
|
|
|
- return None
|
|
|
-
|
|
|
- image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
|
|
|
+ # 准备媒体内容(图片+视频)
|
|
|
+ image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
|
|
|
|
|
|
try:
|
|
|
prompt_text = PROMPT2_IS_CONTENT_KNOWLEDGE.format(
|
|
|
@@ -1251,7 +1509,7 @@ async def evaluate_is_content_knowledge(
|
|
|
num_images=len(image_urls)
|
|
|
)
|
|
|
|
|
|
- data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
|
|
|
+ data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
|
|
|
|
|
|
# 判定是否是内容知识:得分 >= 55 分
|
|
|
final_score = data.get("final_score", 0)
|
|
|
@@ -1288,10 +1546,8 @@ async def evaluate_purpose_match(
|
|
|
Returns:
|
|
|
PurposeEvaluation 或 None(失败时)
|
|
|
"""
|
|
|
- if post.type == "video":
|
|
|
- return None
|
|
|
-
|
|
|
- image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
|
|
|
+ # 准备媒体内容(图片+视频)
|
|
|
+ image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
|
|
|
|
|
|
try:
|
|
|
prompt_text = PROMPT3_PURPOSE_MATCH.format(
|
|
|
@@ -1301,7 +1557,7 @@ async def evaluate_purpose_match(
|
|
|
num_images=len(image_urls)
|
|
|
)
|
|
|
|
|
|
- data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
|
|
|
+ data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
|
|
|
|
|
|
# Prompt3的输出在"目的动机评估"键下
|
|
|
purpose_data = data.get("目的动机评估", {})
|
|
|
@@ -1336,10 +1592,8 @@ async def evaluate_category_match(
|
|
|
Returns:
|
|
|
CategoryEvaluation 或 None(失败时)
|
|
|
"""
|
|
|
- if post.type == "video":
|
|
|
- return None
|
|
|
-
|
|
|
- image_urls = post.images[:MAX_IMAGES_PER_POST] if post.images else []
|
|
|
+ # 准备媒体内容(图片+视频)
|
|
|
+ image_urls, video_base64, video_mime_type = await _prepare_media_content(post)
|
|
|
|
|
|
try:
|
|
|
prompt_text = PROMPT4_CATEGORY_MATCH.format(
|
|
|
@@ -1349,7 +1603,7 @@ async def evaluate_category_match(
|
|
|
num_images=len(image_urls)
|
|
|
)
|
|
|
|
|
|
- data = await _call_openrouter_api(prompt_text, image_urls, semaphore)
|
|
|
+ data = await _call_openrouter_api(prompt_text, image_urls, video_base64, video_mime_type, semaphore)
|
|
|
|
|
|
# Prompt4的输出在"品类评估"键下
|
|
|
category_data = data.get("品类评估", {})
|
|
|
@@ -1416,10 +1670,6 @@ async def evaluate_post_v3(
|
|
|
(knowledge_eval, content_eval, purpose_eval, category_eval, final_score, match_level)
|
|
|
任一步骤失败,后续结果为None
|
|
|
"""
|
|
|
- if post.type == "video":
|
|
|
- print(f" ⊗ 跳过视频帖子: {post.note_id}")
|
|
|
- return (None, None, None, None, None, None)
|
|
|
-
|
|
|
# 检查缓存
|
|
|
if ENABLE_CACHE:
|
|
|
cached_result = _load_from_cache(post.note_id)
|