@@ -1,451 +0,0 @@
-import os
-import sys
-import json
-import random
-import time
-import uuid
-import requests
-import cv2
-from typing import Dict, List, Optional
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type, RetryCallState
-
-# Add the project root to the module search path before importing
-# application-level packages.
-sys.path.append(os.getcwd())
-
-from application.common.log.logger_manager import LoggerManager
-from utils.extractors import safe_extract
-from application.items import VideoItem
-from application.pipeline import PiaoQuanPipeline
-from application.common.messageQueue import MQ
-from configs.config import base_url
-from application.functions.mysql_service import MysqlService
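-
-# ``safe_extract`` resolves "$."-prefixed, dot-separated paths against nested
-# dicts. A minimal sketch of the assumed behaviour (the real helper lives in
-# utils.extractors and may differ):
-#
-#     def safe_extract(data, path):          # e.g. path = "$.data.items"
-#         cur = data
-#         for key in path.lstrip("$.").split("."):
-#             if not isinstance(cur, dict) or key not in cur:
-#                 return None
-#             cur = cur[key]
-#         return cur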
-
-
-def before_send_log(retry_state: RetryCallState) -> None:
-    """Log each failed attempt before tenacity retries the request."""
-    attempt = retry_state.attempt_number
-    last_result = retry_state.outcome
-    # ``before`` fires ahead of every attempt, including the first one,
-    # where there is no previous outcome yet.
-    if last_result is None or not last_result.failed:
-        return
-    exc = last_result.exception()
-    # The decorated function is a bound method, so args[0] is the crawler
-    # instance and args[1] is the request URL.
-    instance = retry_state.args[0] if retry_state.args else None
-    url = retry_state.args[1] if len(retry_state.args) > 1 else "unknown"
-    logger = getattr(instance, "logger", None)
-    if logger:
-        logger.warning(f"Request failed, retrying ({attempt}/3): {url}, error: {exc}")
-
-
-class UniversalCrawler:
-    """Generic crawler whose per-platform scraping logic is driven by a YAML config."""
-
-    def __init__(self, platform_config: Dict, rule_dict: Dict, user_list: List, trace_id: str, env: str = "prod"):
-        """
-        Initialize the crawler.
-        :param platform_config: platform configuration dict
-        :param rule_dict: filtering-rule dict
-        :param user_list: list of users to publish under
-        :param trace_id: trace ID
-        :param env: runtime environment
-        """
-        self.platform = platform_config["platform"]
-        self.mode = platform_config["mode"]
-        self.rule_dict = rule_dict
-        self.user_list = user_list
-        self.trace_id = trace_id
-        self.env = env
-        self.config = platform_config
-        # Initialize loggers
-        self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
-        self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
-        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
-
-        self.download_cnt = 0
-        self.has_enough_videos = False
-        self.base_api = base_url
-        self.loop_times = platform_config.get("loop_times", 1)
-
-        # Promote frequently used config entries to attributes for readability
-        self.request_method = self.config["method"].upper()
-        self.request_url = self.config["url"]
-        self.request_headers = self.config.get("headers", {})
-        self.request_body = self.config.get("request_body", {})
-        self.response_data_path = self.config["response_parse"]["data_path"]
-        self.video_fields_map = self.config["response_parse"]["fields"]
-
-        # Download-limit configuration
-        self.download_min_limit = self.config.get("download_limit", {}).get("min", 200)
-
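-    # The constructor consumes a platform config typically loaded from YAML.
-    # An illustrative (not real) config mirroring the __main__ example below:
-    #
-    #   platform: benshanzhufu
-    #   mode: recommend
-    #   method: POST
-    #   url: https://api.example.com/video/list
-    #   headers:
-    #     Content-Type: application/json
-    #   request_body: {page: 1, size: 20}
-    #   response_parse:
-    #     data_path: $.data.items
-    #     fields:
-    #       title: $.title
-    #       video_url: $.videoUrl
-    #   download_limit: {min: 200}
-    #   loop_times: 3
-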
-    @retry(
-        stop=stop_after_attempt(3),  # retry at most 3 times
-        wait=wait_fixed(2),  # wait 2 seconds between attempts
-        retry=retry_if_exception_type((requests.RequestException, ValueError)),
-        before=before_send_log,  # log before each retried attempt
-    )
-    def _send_request(self, url: str, method: str = None, headers: Dict = None,
-                      payload: Dict = None, timeout: int = 30) -> Optional[Dict]:
-        """
-        Send an API request, retrying automatically up to 3 times on failure.
-        :param url: request URL
-        :param method: HTTP method; defaults to the configured method
-        :param headers: request headers; default to the configured headers
-        :param payload: request body; defaults to the configured body
-        :param timeout: timeout in seconds
-        :return: response JSON data; raises once the retries are exhausted
-        """
-        # Fall back to the configured defaults when arguments are omitted
-        method = method or self.request_method
-        headers = headers or self.request_headers
-        payload = payload or self.request_body
-
-        try:
-            self.logger.info(f"{self.trace_id}--sending request: {url}")
-            response = requests.request(
-                method=method,
-                url=url,
-                headers=headers,
-                json=payload,
-                timeout=timeout
-            )
-            response.raise_for_status()
-            resp = response.json()
-            if resp.get("code") == 0:
-                return resp
-            self.logger.warning(f"{self.trace_id}--API returned a non-zero code: {resp}")
-            raise ValueError(f"API response error: {resp}")
-        except requests.exceptions.Timeout:
-            self.logger.error(f"{self.trace_id}--request timed out: {url}")
-            raise
-        except requests.exceptions.RequestException as e:
-            self.logger.error(f"{self.trace_id}--request exception: {e}")
-            raise
-        except json.JSONDecodeError as e:
-            self.logger.error(f"{self.trace_id}--failed to decode JSON response: {e}")
-            raise
-        except Exception as e:
-            # Log details on unexpected failures
-            self.aliyun_log.logging(
-                code="3000",
-                message=f"Request failed: {url}",
-                data={"error": str(e)},
-                trace_id=self.trace_id
-            )
-            self.logger.error(f"{self.trace_id}--unexpected error: {e}")
-            raise
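-
-    # ``_send_request`` assumes the API wraps results in an envelope like
-    # {"code": 0, "data": {...}}; any non-zero ``code`` is treated as an error
-    # and retried. An illustrative successful response for the example config
-    # above (field names are assumptions, not a documented contract):
-    #
-    #   {"code": 0, "data": {"items": [{"title": "...", "videoUrl": "..."}]}}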
-
-    def get_video_duration(self, video_url: str, timeout: int = 20) -> float:
-        """
-        Get the duration (in seconds) of a remote video, with network error
-        handling and timeout control.
-        :param video_url: video URL
-        :param timeout: timeout in seconds
-        :return: duration in seconds, or 0 on failure
-        """
-        # Check that the URL is reachable before handing it to OpenCV
-        try:
-            response = requests.head(video_url, timeout=timeout)
-            response.raise_for_status()  # check the HTTP status code
-        except requests.exceptions.RequestException as e:
-            self.logger.error(f"{self.trace_id}--network error: video URL unreachable - {e}")
-            return 0
-
-        cap = None
-        try:
-            # Create the VideoCapture object
-            cap = cv2.VideoCapture(video_url)
-
-            # Shrink the buffer to reduce latency
-            cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)
-
-            # Try to open the video stream (up to 3 attempts)
-            max_attempts = 3
-            for attempt in range(max_attempts):
-                if cap.isOpened():
-                    break
-                self.logger.info(f"{self.trace_id}--trying to open video stream ({attempt + 1}/{max_attempts})...")
-                time.sleep(1)
-
-            if not cap.isOpened():
-                self.logger.error(f"{self.trace_id}--error: cannot open video stream {video_url}")
-                return 0
-
-            # Read the stream properties
-            fps = cap.get(cv2.CAP_PROP_FPS)
-            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
-
-            if fps <= 0 or frame_count <= 0:
-                # Some network streams do not expose a total frame count;
-                # probe a few frames to confirm the stream is readable.
-                self.logger.info(f"{self.trace_id}--frame count unavailable, probing the stream...")
-                probed_frames = 0
-                for _ in range(10):
-                    ret, _ = cap.read()
-                    if not ret:
-                        break
-                    probed_frames += 1
-
-                if probed_frames > 0:
-                    # The true duration cannot be derived from a short probe,
-                    # so fall back to an assumed default of 30 seconds
-                    # (adjust to fit the actual platform).
-                    estimated_duration = 30.0
-                    self.logger.info(f"{self.trace_id}--estimated duration: {estimated_duration}s (based on {probed_frames} readable frames)")
-                    return estimated_duration
-                else:
-                    self.logger.error(f"{self.trace_id}--error: unable to estimate video duration")
-                    return 0
-
-            # duration (s) = total frames / frames per second
-            duration = frame_count / fps
-            self.logger.info(f"{self.trace_id}--video duration: {duration}s")
-            return duration
-
-        except Exception as e:
-            self.logger.error(f"{self.trace_id}--exception while reading video duration: {e}")
-            return 0
-
-        finally:
-            if cap:
-                cap.release()  # always release the capture resource
-
-    def _extract_video_list(self, response: Dict) -> List[Dict]:
-        """Extract the video list from an API response."""
-        return safe_extract(response, self.response_data_path) or []
-
-    def _is_video_valid(self, video_data: Dict) -> bool:
-        """
-        Decide whether a video satisfies the configured rules.
-        :param video_data: video data
-        :return: True if valid, False otherwise
-        """
-        if not self.rule_dict:
-            return True
-
-        # Check the duration rule
-        rule_duration = self.rule_dict.get("duration")
-        if rule_duration:
-            extract_video_url_path = self.video_fields_map.get("video_url")
-            if not extract_video_url_path:
-                self.logger.warning(f"{self.trace_id}--missing video_url field mapping")
-                return False
-
-            video_url = safe_extract(video_data, extract_video_url_path)
-            if not video_url:
-                self.logger.warning(f"{self.trace_id}--could not extract the video URL")
-                return False
-
-            video_duration = self.get_video_duration(video_url)
-            min_duration = rule_duration.get("min", 0)
-            max_duration = rule_duration.get("max", float("inf"))
-
-            if not (min_duration <= video_duration <= max_duration):
-                self.logger.info(f"{self.trace_id}--duration {video_duration}s outside range [{min_duration}, {max_duration}]")
-                return False
-
-        # Check the daily video-count rule
-        rule_videos_cnt = self.rule_dict.get("videos_cnt")
-        if rule_videos_cnt:
-            # Query the database for the number of videos already crawled today
-            video_count = self._get_video_count_from_db()
-            min_count = rule_videos_cnt.get("min", 0)
-            if video_count >= min_count:
-                self.logger.info(f"{self.trace_id}--video count {video_count} has reached the minimum threshold {min_count}")
-                return False
-
-        return True
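-
-    # Worked example of the rules above: with
-    #   rule_dict = {"duration": {"min": 30, "max": 1200}, "videos_cnt": {"min": 500}}
-    # a 25-second video fails the duration check, and once today's stored
-    # count reaches 500 every further video is rejected.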
-
-    def _get_video_count_from_db(self) -> int:
-        """Fetch today's crawled-video count from the database."""
-        mysql = MysqlService(self.platform, self.mode, self.trace_id)
-        return mysql.get_today_videos()
-
-    def _process_video(self, video_data: Dict) -> bool:
-        """
-        Process a single video record.
-        :param video_data: video data
-        :return: True on success, False on failure
-        """
-        # First check whether the video satisfies the configured rules
-        if not self._is_video_valid(video_data):
-            self.logger.info(f"{self.trace_id}--video skipped: failed validation")
-            return False
-
-        # Create the video item
-        item = VideoItem()
-
-        # Fill the item from the configured field mapping
-        for field_name, path in self.video_fields_map.items():
-            if isinstance(path, str) and path.startswith("$."):
-                match = safe_extract(video_data, path)
-                item.add_video_info(field_name, match)
-            else:
-                # Constant values (int, str, ...) are used as-is
-                item.add_video_info(field_name, path)
-
-        # Add the fixed fields
-        item.add_video_info("platform", self.platform)
-        item.add_video_info("strategy", self.mode)
-        item.add_video_info("session", f"{self.platform}-{int(time.time())}")
-
-        # Attribute the video to a randomly chosen user
-        our_user = random.choice(self.user_list)
-        item.add_video_info("user_id", our_user["uid"])
-        item.add_video_info("user_name", our_user["nick_name"])
-
-        video_title = item.get("title", "unknown title")
-        self.logger.info(f"{self.trace_id}--processing video: {video_title}")
-
-        # Run the processing pipeline under a fresh per-item trace ID
-        trace_id = f"{self.platform}-{uuid.uuid4()}"
-        pipeline = PiaoQuanPipeline(
-            platform=self.platform,
-            mode=self.mode,
-            rule_dict=self.rule_dict,
-            env=self.env,
-            item=item.produce_item(),
-            trace_id=trace_id,
-        )
-
-        if pipeline.process_item():
-            self.download_cnt += 1
-            self.mq.send_msg(item.produce_item())
-            self.aliyun_log.logging(
-                code="1002",
-                message="successfully sent to ETL",
-                data=item.produce_item(),
-                trace_id=self.trace_id
-            )
-            self.logger.info(f"{self.trace_id}--video processed and sent to the message queue, total so far: {self.download_cnt}")
-
-            # Check the download limit
-            if self.download_cnt >= self.download_min_limit:
-                self.has_enough_videos = True
-                self.aliyun_log.logging(
-                    code="2000",
-                    message=f"download limit reached: {self.download_min_limit}",
-                    trace_id=self.trace_id
-                )
-                self.logger.info(f"{self.trace_id}--download limit reached, stopping further processing")
-            return True
-
-        self.logger.warning(f"{self.trace_id}--pipeline rejected the video")
-        return False
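-
-    # Data flow implemented above: config-mapped fields -> VideoItem ->
-    # PiaoQuanPipeline validation -> MQ topic ``topic_crawler_etl_{env}`` ->
-    # downstream ETL.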
-
-    def _fetch_video_list(self) -> Optional[List[Dict]]:
-        """
-        Fetch and parse the video list.
-        :return: list of videos, or None on failure
-        """
-        self.logger.info(f"{self.trace_id}--fetching video list from {self.request_url}")
-        # _send_request raises once its retries are exhausted, so catch here
-        # to honour the Optional return contract.
-        try:
-            response = self._send_request(
-                self.request_url,
-                self.request_method,
-                self.request_headers,
-                self.request_body
-            )
-        except Exception as e:
-            self.logger.error(f"{self.trace_id}--failed to fetch the video list: {e}")
-            return None
-
-        video_list = self._extract_video_list(response)
-        self.logger.info(f"{self.trace_id}--got {len(video_list)} videos")
-        return video_list
-
-    def _execute_post_actions(self):
-        """Run extra post-crawl actions (e.g. exposure reporting)."""
-        for action in self.config.get("post_actions", []):
-            if action.get("trigger") == "after_video_processed":
-                endpoint = action.get("endpoint")
-                payload = action.get("payload", {})
-                if endpoint:
-                    self.logger.info(f"{self.trace_id}--running post action: {endpoint}")
-                    self._send_request(endpoint, payload=payload)
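-
-    # Illustrative ``post_actions`` entry in the platform YAML (the endpoint
-    # and payload values are made up):
-    #
-    #   post_actions:
-    #     - trigger: after_video_processed
-    #       endpoint: https://api.example.com/report/exposure
-    #       payload: {source: crawler}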
-
-    def run(self):
-        """Execute the crawl task."""
-        self.aliyun_log.logging(
-            code="1003",
-            message="crawler started",
-            data=self.platform,
-            trace_id=self.trace_id
-        )
-        self.logger.info(f"{self.trace_id}--starting crawler for {self.platform}")
-
-        for loop in range(self.loop_times):
-            if self.has_enough_videos:
-                self.aliyun_log.logging(
-                    code="2000",
-                    message=f"[{self.platform}] daily crawl limit reached",
-                    data=self.platform,
-                    trace_id=self.trace_id
-                )
-                self.logger.info(f"{self.trace_id}--daily crawl limit reached, stopping the crawler")
-                break
-
-            self.logger.info(f"{self.trace_id}--starting loop {loop + 1}/{self.loop_times}")
-            video_list = self._fetch_video_list()
-
-            if not video_list:
-                self.logger.warning(f"{self.trace_id}--empty video list, skipping this loop")
-                continue
-
-            for video_data in video_list:
-                if self.has_enough_videos:
-                    self.logger.info(f"{self.trace_id}--daily crawl limit reached, stopping processing")
-                    break
-
-                self._process_video(video_data)
-
-            # Run extra actions (e.g. exposure reporting)
-            self._execute_post_actions()
-
-            # Wait between loops if configured
-            loop_interval = self.config.get("loop_interval", 0)
-            if loop_interval > 0:
-                self.logger.info(f"{self.trace_id}--waiting {loop_interval}s before the next loop")
-                time.sleep(loop_interval)
-
-        self.aliyun_log.logging(
-            code="0000",
-            message="crawler finished",
-            data=self.platform,
-            trace_id=self.trace_id
-        )
-        self.logger.info(f"{self.trace_id}--crawler for platform {self.platform} finished, processed {self.download_cnt} videos")
-
-
-if __name__ == '__main__':
-    cr = UniversalCrawler(
-        platform_config={
-            "platform": "benshanzhufu",
-            "mode": "recommend",
-            "method": "POST",
-            "url": "https://api.example.com/video/list",
-            "headers": {"Content-Type": "application/json"},
-            "request_body": {"page": 1, "size": 20},
-            "response_parse": {
-                "data_path": "$.data.items",
-                "fields": {
-                    "title": "$.title",
-                    "video_url": "$.videoUrl",
-                    "author": "$.author.name",
-                    "duration": "$.duration"
-                }
-            },
-            "download_limit": {"min": 200},
-            "loop_times": 3
-        },
-        rule_dict={
-            'videos_cnt': {'min': 500, 'max': 0},
-            'duration': {'min': 30, 'max': 1200}
-        },
-        user_list=[{"uid": 20631262, "link": "recommend_2060", "nick_name": "人老心不老"}],
-        trace_id=str(uuid.uuid4())
-    )
-
-    cr.run()