|
- # -*- coding: utf-8 -*-
- import socket
- import os
- import time
- import uuid
- import threading
- import pandas as pd
- import requests
- from requests.adapters import HTTPAdapter
- import google.generativeai as genai
- import orjson
- from google.generativeai.types import HarmBlockThreshold, HarmCategory
- from pandas import ExcelWriter
- from prompt.prompt import (
- VIDEO_TOPIC_ANALYSIS_PROMPT,
- VIDEO_TEXT_EXTRACTION_PROMPT,
- VIDEO_SEGMENT_ANALYSIS_PROMPT,
- HOOK_EXTRACTION_PROMPT
- )
# =================== Environment configuration ===================
# Export upload tuning knobs and proxy settings to the process environment.
# The chunk size is 5 * 1024 * 1024 bytes; the timeout value is "300".
# NOTE(review): presumably these GENAI_* variables are read by the
# google.generativeai upload path -- confirm against the SDK version in use.
os.environ.update(
    GENAI_UPLOAD_CHUNK_SIZE="5242880",
    GENAI_UPLOAD_TIMEOUT="300",
    HTTP_PROXY="http://127.0.0.1:7890",
    HTTPS_PROXY="http://127.0.0.1:7890",
)
# =================== Network configuration ===================
# Keep a handle on the stock resolver so the wrapper below can delegate to it.
_original_getaddrinfo = socket.getaddrinfo


def _new_getaddrinfo(*args, **kwargs):
    """Resolve like ``socket.getaddrinfo`` but keep only IPv4 results.

    Every entry whose address family is not ``socket.AF_INET`` is dropped,
    so the returned list may be empty when a host has no IPv4 address.
    """
    ipv4_only = []
    for entry in _original_getaddrinfo(*args, **kwargs):
        # entry[0] is the address family of the resolved record.
        if entry[0] == socket.AF_INET:
            ipv4_only.append(entry)
    return ipv4_only


# Install the IPv4-only resolver process-wide.
socket.getaddrinfo = _new_getaddrinfo
# =================== Constants ===================
# Directory where downloaded videos are cached before upload.
CACHE_DIR = './video_cache/'

# SECURITY NOTE(review): an API key is committed in source below. It should be
# rotated and supplied through the environment instead. Set GOOGLE_API_KEYS
# (comma-separated) to override it; the literal is kept only as a fallback so
# existing deployments keep working unchanged.
_LEGACY_API_KEY = "AIzaSyBGPYEc9F3FoDEqwlaVBxUHsNdkxmR_sl0"
API_KEYS = [
    key.strip()
    for key in os.environ.get("GOOGLE_API_KEYS", "").split(",")
    if key.strip()
] or [_LEGACY_API_KEY]

# Output workbook for the analysis report.
RESULT_EXCEL = '视频分析报告.xlsx'

# Proxy mapping handed to requests sessions and ad-hoc requests calls.
PROXY_CONFIG = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}

# =================== Initialisation ===================
# Make sure the cache directory exists before any download is attempted.
os.makedirs(CACHE_DIR, exist_ok=True)
# =================== Core class ===================
class GoogleVideoAnalyzer:
    """Download a video, upload it to Gemini and run prompt-driven analyses.

    The public entry points are :meth:`analyze` (full pipeline for one video
    URL) and :meth:`cancel_operation` (cooperative cancellation: sets an
    internal event that the download/upload/analysis loops poll).
    """

    # Keys every hook object returned by the model must contain.
    _HOOK_REQUIRED_FIELDS = (
        "需求排序序号", "需求详细query", "需求分类",
        "推测出该点需求的原因", "需求钩子话术", "需求钩子出现时间",
    )
    # Maximum allowed length (in characters) of the hook phrase.
    _HOOK_TEXT_MAX_LEN = 11

    def __init__(self):
        """Create the HTTP session and configure the Gemini client."""
        self.current_api_key = API_KEYS[0]
        self._stop_event = threading.Event()
        self.session = self._create_proxied_session()
        genai.configure(api_key=self.current_api_key, transport='rest')

    def _create_proxied_session(self):
        """Build a ``requests.Session`` that uses the proxy, retries and a default timeout.

        Returns:
            The configured session.
        """
        session = requests.Session()
        session.proxies = PROXY_CONFIG
        # NOTE(review): TLS certificate verification is disabled for every
        # request made through this session; confirm this is intentional.
        session.verify = False
        adapter = HTTPAdapter(max_retries=3, pool_connections=30, pool_maxsize=10)
        session.mount('https://', adapter)
        # The prefix must be 'http://' (the original 'http/' never matched any
        # URL, so plain-HTTP requests silently skipped the retry adapter).
        session.mount('http://', adapter)

        original_send = session.send

        def send_with_default_timeout(request, **kwargs):
            # Session.request always forwards a 'timeout' key (None when the
            # caller gave none), so setdefault() would never apply; test the
            # value instead of the key's presence.
            if kwargs.get('timeout') is None:
                kwargs['timeout'] = (10, 30)
            return original_send(request, **kwargs)

        session.send = send_with_default_timeout
        return session

    def _validate_video_file(self, path: str):
        """Check that *path* is an uploadable video file.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the file is empty, is not ``.mp4`` (case-insensitive),
                or is larger than 100 MiB.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"视频文件不存在: {path}")
        size = os.path.getsize(path)
        if size == 0:
            raise ValueError("空文件无法上传")
        if not path.lower().endswith('.mp4'):
            raise ValueError("仅支持MP4格式文件")
        if size > 100 * 1024 * 1024:  # 100 MiB limit
            raise ValueError("视频文件超过100MB限制")

    def _delete_remote_file(self, video):
        """Best-effort deletion of an uploaded file; failures are only logged."""
        try:
            genai.delete_file(name=video.name)
        except Exception as e:
            print(f"[清理] 云端文件删除失败: {e}")

    def _safe_upload(self, video_path: str):
        """Upload *video_path* and wait until the remote file becomes ``ACTIVE``.

        The upload is attempted up to three times. A remote file left behind
        by a failed attempt is deleted before the next one.

        Returns:
            The uploaded file handle once its state is ``ACTIVE``.

        Raises:
            KeyboardInterrupt: cancellation was requested while polling.
            Exception: all attempts failed (the last error is chained).
        """
        self._validate_video_file(video_path)
        max_retries = 3

        for attempt in range(1, max_retries + 1):
            # Reset per attempt so a stale handle is never deleted twice.
            video = None
            try:
                print(f'[上传] 开始上传 | 文件大小: {os.path.getsize(video_path)//1024}KB')
                video = genai.upload_file(path=video_path, mime_type='video/mp4')

                while True:
                    current_state = video.state.name
                    print(f"[状态] {current_state} | 进度: {getattr(video, 'progress', 0)}%")
                    if current_state == 'ACTIVE':
                        return video
                    if current_state == 'FAILED':
                        raise Exception("云端处理失败")
                    if self._stop_event.is_set():
                        raise KeyboardInterrupt("用户中断上传")
                    # Event.wait returns early when cancellation is requested,
                    # unlike an unconditional sleep.
                    self._stop_event.wait(10)
                    video = genai.get_file(name=video.name)

            except Exception as e:
                if video is not None:
                    self._delete_remote_file(video)
                if attempt >= max_retries:
                    raise Exception(f"上传失败(已重试{max_retries}次): {str(e)}") from e
                print(f"[重试] 上传失败,第{attempt}次重试...")
                time.sleep(5)

    def _download_once(self, video_url: str, file_path: str):
        """Perform one download attempt, resuming from an existing partial file.

        Raises:
            ValueError: the server did not report a usable ``Content-Length``.
            IOError: the byte count after the transfer differs from the
                reported size.
            KeyboardInterrupt: cancellation was requested mid-transfer.
        """
        # Session.head does not follow redirects unless asked to; follow them
        # so the size probe sees the same resource the GET will fetch.
        with self.session.head(video_url, timeout=10, allow_redirects=True) as head_resp:
            head_resp.raise_for_status()
            total_size = int(head_resp.headers.get('content-length', 0))
        if total_size == 0:
            raise ValueError("服务器未返回有效文件大小")

        downloaded = os.path.getsize(file_path) if os.path.exists(file_path) else 0
        if downloaded == total_size:
            # A previous attempt already wrote every byte.
            return
        if downloaded > total_size:
            # The partial file cannot be a prefix of the resource; start over.
            downloaded = 0
        headers = {'Range': f'bytes={downloaded}-'} if downloaded > 0 else {}

        with self.session.get(
            video_url,
            stream=True,
            timeout=30,
            headers=headers
        ) as response:
            response.raise_for_status()

            if downloaded > 0 and response.status_code != 206:
                # The server ignored the Range header and is sending the whole
                # body: restart from scratch instead of failing every retry.
                downloaded = 0
            mode = 'ab' if downloaded > 0 else 'wb'

            out_file = open(file_path, mode)
            try:
                for chunk in response.iter_content(chunk_size=8192):
                    if self._stop_event.is_set():
                        raise KeyboardInterrupt("用户中断下载")
                    if chunk:
                        out_file.write(chunk)
                        downloaded += len(chunk)
                        progress = downloaded / total_size * 100
                        print(f"\r[下载] 进度: {progress:.1f}% | {downloaded//1024}KB/{total_size//1024}KB",
                              end='', flush=True)
            finally:
                out_file.close()
                print("\n[下载] 文件句柄已安全关闭")

        # Mandatory integrity check against the advertised size.
        if downloaded != total_size:
            raise IOError(f"下载不完整({downloaded}/{total_size}字节)")

    def _download_video(self, video_url: str) -> str:
        """Download *video_url* into the cache directory with retry and resume.

        Up to three attempts are made; a partial file is kept between attempts
        so the next one can resume, and removed if the download ultimately
        fails or is cancelled.

        Returns:
            Path of the fully downloaded file.

        Raises:
            KeyboardInterrupt: cancellation was requested.
            Exception: all attempts failed (the last error is chained).
        """
        file_path = os.path.join(CACHE_DIR, f'{uuid.uuid4()}.mp4')
        max_retries = 3
        completed = False

        try:
            for attempt in range(1, max_retries + 1):
                try:
                    self._download_once(video_url, file_path)
                    completed = True
                    return file_path
                except Exception as e:
                    if attempt >= max_retries:
                        raise Exception(f"下载失败(重试{max_retries}次): {str(e)}") from e
                    print(f"\n[重试] 下载中断,第{attempt}次重试...")
                    time.sleep(5)
        finally:
            # Covers both exhausted retries and user cancellation, so no
            # orphaned partial file is left in the cache.
            if not completed and os.path.exists(file_path):
                os.remove(file_path)

    def _analyze_content(self, video, prompt):
        """Run one analysis prompt against the uploaded video.

        Returns:
            The model response parsed as a ``dict``.

        Raises:
            Exception: the response is not JSON, is not a JSON object, or the
                request failed (the underlying error is chained).
        """
        model = genai.GenerativeModel(
            model_name='gemini-2.0-flash',
            generation_config=genai.GenerationConfig(
                response_mime_type='application/json',
                temperature=0.3,
                max_output_tokens=20480
            ),
            safety_settings={
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            }
        )

        try:
            response = model.generate_content(
                contents=[video, prompt],
                request_options={'timeout': 300}
            )

            # NOTE(review): '_error' is a private attribute of the response
            # object; confirm it still exists in the SDK version in use.
            if hasattr(response, '_error') and response._error:
                raise Exception(f"生成错误: {response._error}")

            result = orjson.loads(response.text.strip())
            print(f"[视频分析] 响应: {result}")
            if not isinstance(result, dict):
                raise ValueError("响应格式错误:非字典结构")

            return result
        except orjson.JSONDecodeError as e:
            raise Exception("响应解析失败,非JSON格式") from e
        except Exception as e:
            raise Exception(f"分析失败: {str(e)}") from e

    @staticmethod
    def _parse_hook_json(raw_text):
        """Parse the hook response, falling back to a lenient quote cleanup.

        The raw text is tried first so that valid JSON containing apostrophes
        is not corrupted; only if that fails are single quotes replaced with
        double quotes and newlines stripped.

        Raises:
            ValueError: neither form is valid JSON.
        """
        try:
            return orjson.loads(raw_text)
        except orjson.JSONDecodeError:
            clean_text = raw_text.replace("'", "\"").replace("\n", "")
        print(f"[响应清洗后] 预览:{clean_text[:200]}...")
        try:
            return orjson.loads(clean_text)
        except orjson.JSONDecodeError as e:
            error_msg = f"JSON解析失败:{str(e)}\n原始响应:{clean_text[:500]}"
            raise ValueError(error_msg) from e

    @classmethod
    def _validate_hooks(cls, result):
        """Validate the parsed hook list and return it unchanged.

        Raises:
            ValueError: *result* is not a list, an element is not an object,
                a required field is missing, or a hook phrase is not a string
                or exceeds the length limit.
        """
        if not isinstance(result, list):
            raise ValueError("响应应为JSON数组")

        for idx, item in enumerate(result):
            if not isinstance(item, dict):
                raise ValueError(f"第{idx+1}个元素不是JSON对象")
            missing = {field for field in cls._HOOK_REQUIRED_FIELDS if field not in item}
            if missing:
                raise ValueError(f"第{idx+1}个对象缺失字段:{missing}")
            hook_text = item["需求钩子话术"]
            if not isinstance(hook_text, str):
                raise ValueError(f"第{idx+1}个话术不是字符串")
            if len(hook_text) > cls._HOOK_TEXT_MAX_LEN:
                raise ValueError(f"第{idx+1}个话术超长:'{item['需求钩子话术']}'")

        return result

    def _generate_hooks(self, video, hook_prompt, analysis_data):
        """Fill the hook prompt template with earlier results and query the model.

        Args:
            video: uploaded file handle passed to the model.
            hook_prompt: template with ``{summary}``, ``{detail}`` and
                ``{timeline}`` placeholders.
            analysis_data: mapping of analysis name to its result.

        Returns:
            The validated list of hook objects on success, otherwise a dict
            with an ``"error"`` key (this method does not raise for template,
            validation or request errors).
        """
        try:
            # 1. Template arguments, with a placeholder for empty results.
            format_args = {
                "summary": str(analysis_data.get("视频选题与要点理解", {}) or "无相关内容"),
                "detail": str(analysis_data.get("视频完整文本提取", {}) or "无相关内容"),
                "timeline": str(analysis_data.get("视频分段与时间点分析", {}) or "无相关内容")
            }
            # 2. Debug output.
            print("[DEBUG] 分析数据类型验证:")
            print(f"- 选题理解类型:{type(analysis_data.get('视频选题与要点理解'))}")
            print(f"- 文本提取类型:{type(analysis_data.get('视频完整文本提取'))}")
            print(f"- 分段分析类型:{type(analysis_data.get('视频分段与时间点分析'))}")
            # 3. Template substitution.
            formatted_prompt = hook_prompt.format(**format_args)
            print(f"[SUCCESS] 模板替换完成,新Prompt长度:{len(formatted_prompt)}")
            print(f"[DEBUG] 格式化Prompt预览(前500字符):\n{formatted_prompt[:500]}...")
            # 4. Model setup.
            model = genai.GenerativeModel(
                model_name='gemini-2.0-flash',
                generation_config=genai.GenerationConfig(
                    response_mime_type='application/json',
                    temperature=0.5,
                    max_output_tokens=4096
                )
            )

            # 5. Send the formatted prompt (not the raw template).
            response = model.generate_content(
                contents=[video, formatted_prompt],
                request_options={'timeout': 600}
            )
            raw_text = response.text
            print(f"[响应原始数据] 长度:{len(raw_text)}字符")
            # 6./7. Parse, then validate structure and field constraints.
            return self._validate_hooks(self._parse_hook_json(raw_text))
        except KeyError as e:
            print(f"!! 关键错误:模板变量 {e} 未定义,请检查Excel占位符")
            return {"error": f"模板变量 {e} 缺失"}
        except ValueError as e:
            print(f"!! 数据验证失败:{str(e)}")
            return {"error": str(e), "type": "DATA_VALIDATION"}
        except Exception as e:
            import traceback
            error_detail = (
                "\n=== 未捕获异常 ===\n"
                f"类型:{type(e)}\n"
                f"信息:{str(e)}\n"
                "追踪:\n"
                f"{traceback.format_exc()}\n"
            )
            print(error_detail)
            return {"error": "未知异常"}

    def cancel_operation(self):
        """Request cooperative cancellation of the running operation."""
        self._stop_event.set()
        print("[系统] 正在终止操作...")

    def analyze(self, video_url: str, prompts: list):
        """Run the full pipeline for one video.

        The first three entries of *prompts* are run as base analyses; a
        fourth entry, if present, is used as the hook-extraction template.
        Each prompt is a dict with ``"name"`` and ``"content"`` keys.

        Returns:
            ``{"基础分析": {...}, "钩子提取": ...}``. Per-prompt failures are
            recorded inside the result instead of being raised.

        Raises:
            KeyboardInterrupt: cancellation was requested.
            Exception: download or upload failed.
        """
        self._stop_event.clear()
        video_path = None
        video = None

        try:
            print(f"\n[下载] 开始下载 {video_url}")
            video_path = self._download_video(video_url)

            print("[上传] 启动云端处理")
            video = self._safe_upload(video_path)

            analysis_data = {}
            for prompt in prompts[:3]:
                # Honour a cancel request between prompts, consistent with
                # the download and upload loops.
                if self._stop_event.is_set():
                    raise KeyboardInterrupt("用户中断分析")
                print(f"[分析] 正在执行: {prompt['name']}")
                try:
                    result = self._analyze_content(video, prompt['content'])
                    analysis_data[prompt['name']] = result
                except Exception as e:
                    analysis_data[prompt['name']] = {
                        "error": str(e),
                        "error_type": type(e).__name__
                    }

            hook_result = {}
            if len(prompts) >= 4:
                hook_prompt = prompts[3]
                print(f"[钩子生成] 正在执行: {hook_prompt['name']}")
                try:
                    hook_result = self._generate_hooks(video, hook_prompt['content'], analysis_data)
                    print("钩子提取完成")
                except Exception as e:
                    print(e)
                    hook_result = {
                        "error": str(e),
                        "error_type": type(e).__name__
                    }

            return {
                "基础分析": analysis_data,
                "钩子提取": hook_result
            }

        finally:
            # Release the remote copy as well as the local cache file; the
            # uploaded handle is not returned to the caller.
            if video is not None:
                self._delete_remote_file(video)
            if video_path and os.path.exists(video_path):
                os.remove(video_path)
# =================== Data processing ===================
def load_prompts():
    """Assemble the prompt list from the constants imported from prompt.py.

    Returns:
        A list of four dicts, each with a ``"name"`` and a ``"content"`` key,
        in the order: topic analysis, text extraction, segment analysis,
        hook extraction.

    Raises:
        Exception: building the list failed for any reason.
    """
    try:
        print("\n[初始化] 从prompt.py加载Prompt")

        # (display name, template) pairs; the order matters to callers that
        # slice the first three entries and index the fourth.
        named_templates = (
            ("视频选题与要点理解", VIDEO_TOPIC_ANALYSIS_PROMPT),
            ("视频完整文本提取", VIDEO_TEXT_EXTRACTION_PROMPT),
            ("视频分段与时间点分析", VIDEO_SEGMENT_ANALYSIS_PROMPT),
            ("钩子提取", HOOK_EXTRACTION_PROMPT),
        )
        prompts = [
            {"name": label, "content": template}
            for label, template in named_templates
        ]

        print(f"[成功] 加载 {len(prompts)} 个Prompt")
        return prompts

    except Exception as e:
        raise Exception(f"加载Prompt失败: {str(e)}")
def process_video_data(input_excel='0517.xlsx', start_index=18):
    """Analyse every video listed in the input workbook and write the report.

    Args:
        input_excel: workbook to read; must contain the ``videoid`` and
            ``transcode_video_path`` columns (``播放次数`` and ``视频标题``
            are optional).
        start_index: number of leading rows to skip. The default of 18
            starts at the 19th video, as the original hard-coded slice did.

    The report is saved after every video so partial progress survives a
    crash, then rewritten once more with adjusted column widths. Errors that
    are not per-video are printed, not raised; ``KeyboardInterrupt``
    propagates to the caller.
    """
    import signal

    handler_installed = False
    previous_handler = None
    try:
        prompts = load_prompts()
        video_df = pd.read_excel(input_excel, engine='openpyxl').iloc[start_index:]
        analyzer = GoogleVideoAnalyzer()
        results = []
        total = len(video_df)

        # Ctrl-C requests cooperative cancellation instead of raising here.
        previous_handler = signal.signal(
            signal.SIGINT, lambda s, f: analyzer.cancel_operation()
        )
        handler_installed = True

        # Count positions explicitly: the DataFrame label keeps its
        # pre-slice value, so label+1 does not match len(video_df).
        for position, (_, row) in enumerate(video_df.iterrows(), start=1):
            video_id = row['videoid']
            relative_path = row['transcode_video_path'].replace('mp4/', 'mp4')
            video_url = f"http://visionularcdn.yishihui.com/{relative_path}"

            record = {
                "视频ID": video_id,
                "播放量": row.get('播放次数', 'N/A'),
                "视频标题": row.get('视频标题', 'N/A'),
                "视频地址": video_url,
                "状态": "成功"
            }

            try:
                print(f"\n{'='*30} 处理视频 {position}/{total} {'='*30}")
                analysis = analyzer.analyze(video_url, prompts)

                for prompt in prompts[:3]:
                    record[prompt['name']] = str(analysis["基础分析"].get(prompt['name'], {}))

                record["钩子提取"] = str(analysis.get("钩子提取", {}))

            except KeyboardInterrupt:
                # Do not report an interrupted video as successful.
                record["状态"] = "中断"
                raise

            except Exception as e:
                record.update({
                    "状态": "失败",
                    "错误类型": type(e).__name__,
                    "错误详情": str(e)
                })

            finally:
                # Checkpoint after every video so partial progress is kept.
                results.append(record)
                pd.DataFrame(results).to_excel(RESULT_EXCEL, index=False)

            # analyze() clears the stop flag on entry, so a cancel requested
            # while this video was being analysed must be honoured here or it
            # would be lost when the next video starts.
            if analyzer._stop_event.is_set():
                print("\n[中断] 已收到终止请求,停止处理后续视频")
                break

        with ExcelWriter(RESULT_EXCEL, engine='openpyxl') as writer:
            df_results = pd.DataFrame(results)
            df_results.to_excel(writer, index=False)

            # Widen each column to fit its longest cell, capped at 50.
            worksheet = writer.sheets['Sheet1']
            for col in worksheet.columns:
                max_len = max(len(str(cell.value)) for cell in col)
                worksheet.column_dimensions[col[0].column_letter].width = min(max_len + 2, 50)

        print(f"\n{'='*30}\n报告已生成: {os.path.abspath(RESULT_EXCEL)}")
    except Exception as e:
        print(f"\n{'!'*30} 系统级错误 {'!'*30}\n{str(e)}")
    finally:
        # Give Ctrl-C its previous meaning back once processing is over.
        # signal.signal returns None for handlers not installed from Python,
        # which cannot be passed back in.
        if handler_installed and previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
# =================== Entry point ===================
if __name__ == '__main__':
    print("=== 视频分析系统启动 ===")
    print(f"工作目录: {os.getcwd()}")

    # Fail fast when the proxy cannot reach the outside network.
    # NOTE(review): certificate verification is disabled for this probe.
    try:
        test_resp = requests.get("https://www.google.com",
                                 proxies=PROXY_CONFIG,
                                 timeout=10,
                                 verify=False)
        print(f"[网络] 连接测试成功 ({test_resp.status_code})")
    except Exception as e:
        print(f"[网络] 连接测试失败: {str(e)}")
        # SystemExit does not depend on the site-provided exit() helper,
        # which is absent under `python -S` and in some frozen builds.
        raise SystemExit(1)

    start_time = time.time()
    try:
        process_video_data()
    except KeyboardInterrupt:
        print("\n[中断] 用户主动终止程序")
    finally:
        print(f"总运行时间: {time.time()-start_time:.1f}秒")
|