@@ -6,11 +6,11 @@ import time
 import uuid
 import yaml
 import requests
-
+import cv2
 from datetime import datetime
-from typing import Dict, Any, List, Optional
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
-from utils.extractors import safe_extract,extract_multiple
+from typing import Dict, Any, List, Optional, Union
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type, RetryCallState
+from utils.extractors import safe_extract, extract_multiple

 # Add the common module path
 sys.path.append(os.getcwd())
@@ -28,40 +28,80 @@ from application.common.log import Local
 from configs.config import base_url

+
+def before_send_log(retry_state: RetryCallState) -> None:
+    """Log a failed attempt before the request is retried."""
+    attempt = retry_state.attempt_number
+    last_result = retry_state.outcome
+    # outcome is still None before the very first attempt, so guard against it.
+    if last_result is None or not last_result.failed:
+        return
+    exc = last_result.exception()
+    # _send_request is a bound method: args[0] is the crawler instance and
+    # args[1] (when passed positionally) is the request URL.
+    instance = retry_state.args[0] if retry_state.args else None
+    url = retry_state.args[1] if len(retry_state.args) > 1 else "unknown"
+    logger = getattr(instance, "logger", None)
+    if logger:
+        logger.warning(f"Request failed, retrying ({attempt}/3): {url}, error: {exc}")
+
+
 class UniversalCrawler:
     """Generic crawler class; per-platform crawl logic is driven by YAML configuration."""

-    def __init__(self, platform: str, mode: str, rule_dict: Dict, user_list: List, env: str = "prod"):
+    def __init__(self, platform_config: Dict, rule_dict: Dict, user_list: List, trace_id: str, env: str = "prod"):
         """
         Initialize the crawler.
-        :param platform: platform name (matching the YAML file name)
+        :param platform_config: platform configuration dict
+        :param rule_dict: rule dict
+        :param user_list: user list
+        :param trace_id: trace ID
         :param env: runtime environment
         """
-        self.platform = platform
-        self.mode = mode
+        self.platform = platform_config["platform"]
+        self.mode = platform_config["mode"]
         self.rule_dict = rule_dict
         self.user_list = user_list
+        self.trace_id = trace_id
         self.env = env
-        self.config_path = "/Users/zhangliang/Documents/piaoquan/AutoScraperX/configs/spiders_config.yaml"
-        self.config = ConfigLoader().get_platform_config(self.platform)
-        self.aliyun_log = AliyunLogger(platform=platform, mode=self.config["mode"])
-        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
-        # self.mysql = MysqlHelper(mode=self.config["mode"], platform=platform)
+        self.config = platform_config
+        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
         self.logger = Local.init_logger(platform=self.platform, mode=self.mode, log_level="INFO", log_to_console=True)
-        self.download_cnt = 0
-        self.limit_flag = False
+        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
+
+        # download_cnt is still used by _process_video() and run(), so it must stay initialized
+        self.download_cnt = 0
+        self.has_enough_videos = False
         self.base_api = base_url
+        self.loop_times = platform_config.get('loop_times', 1)
+
+        # Promote frequently used config values to attributes for readability
+        self.request_method = self.config["method"].upper()
+        self.request_url = self.config["url"]
+        self.request_headers = self.config.get("headers", {})
+        self.request_body = self.config.get("request_body", {})
+        self.response_data_path = self.config["response_parse"]["data_path"]
+        self.video_fields_map = self.config["response_parse"]["fields"]
+
+        # Download limit: download_limit.min acts as the per-run stop threshold
+        self.download_min_limit = self.config.get("download_limit", {}).get("min", 200)

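+    # Note: tenacity runs the `before` callback ahead of every attempt, including
+    # the first, which is why before_send_log checks retry_state.outcome before
+    # logging; `before_sleep` would be the alternative hook for retry-only logging.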
     @retry(
-        stop=stop_after_attempt(3),  # retry at most 3 times
-        wait=wait_fixed(2),  # wait 2 seconds between attempts
-        retry=retry_if_exception_type((requests.RequestException, ValueError))
+        stop=stop_after_attempt(3),  # retry at most 3 times
+        wait=wait_fixed(2),  # wait 2 seconds between attempts
+        retry=retry_if_exception_type((requests.RequestException, ValueError)),
+        before=before_send_log,  # log prior failures before each attempt
     )
-    def _send_request(self, method: str, url: str, headers, payload, timeout = 30) -> Optional[
-        Dict]:
-        """Send an API request; retry automatically up to 3 times on failure."""
+    def _send_request(self, url: str, method: str = None, headers: Dict = None,
+                      payload: Dict = None, timeout: int = 30) -> Optional[Dict]:
+        """
+        Send an API request, automatically retrying up to 3 times on failure.
+        :param url: request URL
+        :param method: HTTP method; defaults to the configured method
+        :param headers: request headers; default to the configured headers
+        :param payload: request body; defaults to the configured body
+        :param timeout: timeout in seconds
+        :return: response JSON dict, or None
+        """
+        # Fall back to the configured defaults when arguments are omitted
+        method = method or self.request_method
+        headers = headers or self.request_headers
+        payload = payload or self.request_body

         try:
+            self.logger.info(f"{self.trace_id}--Sending request: {url}")
             response = requests.request(
                 method=method,
                 url=url,
@@ -71,33 +111,179 @@ class UniversalCrawler:
             )
             response.raise_for_status()
             resp = response.json()
-            if resp["code"] == 0:
-                return response.json()
-            raise ValueError(f"Non-zero API response: {resp}")
+            if resp.get("code") == 0:
+                return resp
+            self.logger.warning(f"{self.trace_id}--API returned a non-zero code: {resp}")
+            raise ValueError(f"API response error: {resp}")
+        except requests.exceptions.Timeout:
+            self.logger.error(f"{self.trace_id}--Request timed out: {url}")
+            raise
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"{self.trace_id}--Request exception: {e}")
+            raise
+        except json.JSONDecodeError as e:
+            self.logger.error(f"{self.trace_id}--Failed to parse JSON response: {e}")
+            raise
         except Exception as e:
-            # Only log on the final failure
+            # Log full details on the final failure
             self.aliyun_log.logging(
                 code="3000",
                 message=f"Request failed: {url}",
-                data={"error": str(e)}
+                data={"error": str(e)},
+                trace_id=self.trace_id
             )
-            return
+            self.logger.error(f"{self.trace_id}--Unexpected error: {e}")
+            raise
+
+    def get_video_duration(self, video_url: str, timeout: int = 20) -> float:
+        """
+        Get the duration (in seconds) of a remote video, with network error
+        handling and timeout control.
+        :param video_url: video URL
+        :param timeout: timeout in seconds
+        :return: duration in seconds, or 0 on failure
+        """
+        # Check that the URL is reachable
+        try:
+            response = requests.head(video_url, timeout=timeout)
+            response.raise_for_status()  # check the HTTP status code
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"{self.trace_id}--Network error: cannot reach video URL - {e}")
+            return 0
+
+        cap = None
+        try:
+            # Create the VideoCapture object
+            cap = cv2.VideoCapture(video_url)
+
+            # Keep the internal buffer small to reduce latency
+            cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)
+
+            # Try to open the video stream (up to 3 attempts)
+            max_attempts = 3
+            for attempt in range(max_attempts):
+                if cap.isOpened():
+                    break
+                self.logger.info(f"{self.trace_id}--Trying to open the video stream ({attempt + 1}/{max_attempts})...")
+                time.sleep(1)
+
+            if not cap.isOpened():
+                self.logger.error(f"{self.trace_id}--Error: cannot open video stream {video_url}")
+                return 0
+
+            # Read the video properties
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+
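+            # CAP_PROP_FPS / CAP_PROP_FRAME_COUNT are often 0 or unreliable for
+            # network streams (this depends on the container format and the
+            # OpenCV/FFmpeg build), hence the sampling fallback below.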
+            if fps <= 0 or frame_count <= 0:
+                # Some network videos do not expose a total frame count; sample a few frames instead
+                self.logger.info(f"{self.trace_id}--Cannot read total frame count, estimating...")
+                frame_count = 0
+                start_time = time.time()
+
+                # Read up to 10 frames to estimate the frame rate and duration
+                for _ in range(10):
+                    ret, _ = cap.read()
+                    if not ret:
+                        break
+                    frame_count += 1
+
+                elapsed_time = time.time() - start_time
+                if elapsed_time > 0:
+                    estimated_fps = frame_count / elapsed_time  # currently unused; kept for a future real estimate
+                    # Assume a 30-second duration (tune to the actual platform)
+                    estimated_duration = 30.0
+                    self.logger.info(f"{self.trace_id}--Estimated video duration: {estimated_duration}s (based on {frame_count} frames)")
+                    return estimated_duration
+                else:
+                    self.logger.error(f"{self.trace_id}--Error: cannot estimate video duration")
+                    return 0
+
+            duration = frame_count / fps
+            self.logger.info(f"{self.trace_id}--Video duration: {duration}s")
+            return duration
+
+        except Exception as e:
+            self.logger.error(f"{self.trace_id}--Exception while getting video duration: {e}")
+            return 0
+
+        finally:
+            if cap:
+                cap.release()  # make sure the capture is released
+
+    def _extract_video_list(self, response: Dict) -> List[Dict]:
+        """Extract the video list from an API response."""
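+        # For example, with data_path "$.data.items" a response shaped like
+        # {"code": 0, "data": {"items": [...]}} yields the inner list.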
+        return safe_extract(response, self.response_data_path) or []
+
+    def _is_video_valid(self, video_data: Dict) -> bool:
+        """
+        Check whether a video satisfies the configured rules.
+        :param video_data: video data
+        :return: True if valid, False otherwise
+        """
+        if not self.rule_dict:
+            return True
+
+        # Check the duration rule
+        rule_duration = self.rule_dict.get("duration")
+        if rule_duration:
+            extract_video_url_path = self.video_fields_map.get("video_url")
+            if not extract_video_url_path:
+                self.logger.warning(f"{self.trace_id}--Missing video URL field mapping")
+                return False
+
+            video_url = safe_extract(video_data, extract_video_url_path)
+            if not video_url:
+                self.logger.warning(f"{self.trace_id}--Cannot extract video URL")
+                return False
+
+            video_duration = self.get_video_duration(video_url)
+            min_duration = rule_duration.get("min", 0)
+            max_duration = rule_duration.get("max", float('inf'))
+
+            if not (min_duration <= video_duration <= max_duration):
+                self.logger.info(f"{self.trace_id}--Video duration {video_duration}s outside range [{min_duration}, {max_duration}]")
+                return False
+
+        # Check the video count rule
+        rule_videos_cnt = self.rule_dict.get("videos_cnt")
+        if rule_videos_cnt:
+            # This should query the database for the actual video count;
+            # sample code: flesh out per the real business logic
+            video_count = self._get_video_count_from_db()  # assumed helper returning the count
+            min_count = rule_videos_cnt.get("min", 0)
+            if video_count >= min_count:
+                self.logger.info(f"{self.trace_id}--Video count {video_count} reached the minimum {min_count}")
+                return False
+
+        return True
+
+    def _get_video_count_from_db(self) -> int:
+        """Fetch the video count from the database (stub; implement per the actual business logic)."""
+        # A real implementation should query the database here.
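+        # Hypothetical sketch, mirroring the MysqlHelper dependency that was
+        # removed from __init__ above:
+        #   return MysqlHelper(mode=self.mode, platform=self.platform).count_videos_today()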
+        return 0  # placeholder

     def _process_video(self, video_data: Dict) -> bool:
-        """Process a single video record."""
-        # Field mapping from the config
-        field_map = self.config["response_parse"]["fields"]
+        """
+        Process a single video record.
+        :param video_data: video data
+        :return: True on success, False on failure
+        """
+        # First check whether the video satisfies the configured rules
+        if not self._is_video_valid(video_data):
+            self.logger.info(f"{self.trace_id}--Video skipped: failed validation")
+            return False

         # Create the video item
         item = VideoItem()
-        for field_name, path in field_map.items():
-            if isinstance(path, str) and path.startswith("$."):
-                match = safe_extract(video_data,path)
+        # Fill the item from the configured field mapping
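+        # e.g. fields like {"title": "$.title", "duration": "$.duration"}: JSONPath
+        # strings are extracted from video_data, other values pass through as literals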
+        for field_name, path in self.video_fields_map.items():
+            if isinstance(path, str) and path.startswith("$."):
+                match = safe_extract(video_data, path)
                 item.add_video_info(field_name, match)
             else:
                 # Fixed values (int, str, ...) are used as-is
-                item.add_video_info(field_name,path)
+                item.add_video_info(field_name, path)

         # Add fixed fields
         item.add_video_info("platform", self.platform)
@@ -109,7 +295,8 @@ class UniversalCrawler:
             item.add_video_info("user_id", our_user["uid"])
             item.add_video_info("user_name", our_user["nick_name"])

-        print(item)
+        video_title = item.get("title", "unknown title")
+        self.logger.info(f"{self.trace_id}--Processing video: {video_title}")

         # Run the processing pipeline
         trace_id = f"{self.platform}-{uuid.uuid4()}"
@@ -128,78 +315,136 @@ class UniversalCrawler:
             self.aliyun_log.logging(
                 code="1002",
                 message="Sent to ETL successfully",
-                data=item.produce_item()
+                data=item.produce_item(),
+                trace_id=self.trace_id
             )
+            self.logger.info(f"{self.trace_id}--Video processed and sent to MQ, total processed: {self.download_cnt}")

             # Check the download limit
-            min_limit = self.config.get("download_limit", {}).get("min", 200)
-            if self.download_cnt >= min_limit:
-                self.limit_flag = True
+            if self.download_cnt >= self.download_min_limit:
+                self.has_enough_videos = True
                 self.aliyun_log.logging(
                     code="2000",
-                    message=f"Download limit reached: {min_limit}",
+                    message=f"Download limit reached: {self.download_min_limit}",
+                    trace_id=self.trace_id
                 )
+                self.logger.info(f"{self.trace_id}--Download limit reached, stopping further processing")
             return True
+
+        self.logger.warning(f"{self.trace_id}--Pipeline failed to process the video")
         return False

+    def _fetch_video_list(self) -> Optional[List[Dict]]:
+        """
+        Fetch and parse the video list.
+        :return: the video list, or None
+        """
+        self.logger.info(f"{self.trace_id}--Fetching video list from {self.request_url}")
+        response = self._send_request(
+            self.request_url,
+            self.request_method,
+            self.request_headers,
+            self.request_body
+        )

-    # --------------------- custom processing helpers ---------------------
-    def _func_current_timestamp(self, _) -> int:
-        """Return the current timestamp."""
-        return int(time.time())
+        if not response:
+            self.logger.error(f"{self.trace_id}--Failed to fetch the video list")
+            return None

-    def _func_formatted_time(self, _) -> str:
-        """Return the current time, formatted."""
-        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        video_list = self._extract_video_list(response)
+        self.logger.info(f"{self.trace_id}--Got {len(video_list)} videos")
+        return video_list

-    def _func_random_delay(self, _) -> None:
-        """Sleep for a random delay."""
-        min_delay = self.config.get("delay", {}).get("min", 3)
-        max_delay = self.config.get("delay", {}).get("max", 8)
-        time.sleep(random.randint(min_delay, max_delay))
-        return None
+    def _execute_post_actions(self):
+        """Run extra post-crawl actions (e.g. exposure reporting)."""
+        for action in self.config.get("post_actions", []):
+            if action.get("trigger") == "after_video_processed":
+                endpoint = action.get("endpoint")
+                payload = action.get("payload", {})
+                if endpoint:
+                    self.logger.info(f"{self.trace_id}--Running post action: {endpoint}")
+                    self._send_request(endpoint, payload=payload)
+
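+    # Example (hypothetical) post_actions entry in the platform YAML config:
+    #   post_actions:
+    #     - trigger: after_video_processed
+    #       endpoint: "https://api.example.com/report/exposure"
+    #       payload: {"source": "crawler"}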
     def run(self):
         """Run the crawl task."""
-        self.logger.info(f"Starting crawler {self.platform}")
-
-        while not self.limit_flag:
-            # Fetch the initial list data
-            initial_data = self._send_request(
-                self.config["method"].upper(),
-                self.config["url"],
-                self.config.get("headers", {}),
-                self.config.get("request_body", {})
-            )
-            print(initial_data)
-
-            if not initial_data:
-                return
-            video_objs = safe_extract(initial_data,self.config["response_parse"]["data_path"])
-            self.logger.info(f"Fetched video list: {json.dumps(video_objs)}")
-
-            next_cursor = None
-            # Process the video list
-            video_list = safe_extract(
-                initial_data,
-                self.config["response_parse"]["data_path"]
-            )
+        self.aliyun_log.logging(
+            code="1003",
+            message="Crawler run started",
+            data=self.platform,
+            trace_id=self.trace_id
+        )
+        self.logger.info(f"{self.trace_id}--Starting crawler for {self.platform}")
+
+        for loop in range(self.loop_times):
+            if self.has_enough_videos:
+                self.aliyun_log.logging(
+                    code="2000",
+                    message=f"[{self.platform}] reached the daily crawl limit",
+                    data=self.platform,
+                    trace_id=self.trace_id
+                )
+                self.logger.info(f"{self.trace_id}--Daily crawl limit reached, stopping the crawler")
+                break
+
+            self.logger.info(f"{self.trace_id}--Starting loop {loop + 1}/{self.loop_times}")
+            video_list = self._fetch_video_list()
+
+            if not video_list:
+                self.logger.warning(f"{self.trace_id}--Video list is empty, skipping this loop")
+                continue

             for video_data in video_list:
-                self.logger.info(f"Video object: {video_data}")
-                if self.limit_flag:
+                if self.has_enough_videos:
+                    self.logger.info(f"{self.trace_id}--Daily crawl limit reached, stopping processing")
                     break
+
                 self._process_video(video_data)

-            # Extra actions (e.g. exposure reporting)
-            for action in self.config.get("post_actions", []):
-                if action["trigger"] == "after_video_processed":
-                    self._send_request(action["endpoint"], action.get("payload", {}))
+            # Extra actions (e.g. exposure reporting)
+            self._execute_post_actions()
+
+            # Wait between loops
+            loop_interval = self.config.get("loop_interval", 0)
+            if loop_interval > 0:
+                self.logger.info(f"{self.trace_id}--Waiting {loop_interval}s before the next loop")
+                time.sleep(loop_interval)
+
+        self.aliyun_log.logging(
+            code="0000",
+            message="Crawler run finished",
+            data=self.platform,
+            trace_id=self.trace_id
+        )
+        self.logger.info(f"{self.trace_id}--Crawler for platform {self.platform} finished, processed {self.download_cnt} videos")


 if __name__ == '__main__':
-    cr = UniversalCrawler("benshanzhufu", "recommend",
-                          rule_dict={'videos_cnt': {'min': 500, 'max': 0}, 'duration': {'min': 30, 'max': 1200}},
-                          user_list=[{"uid": 20631262, "link": "recommend_2060", "nick_name": "人老心不老"}])
+    cr = UniversalCrawler(
+        platform_config={
+            "platform": "benshanzhufu",
+            "mode": "recommend",
+            "method": "POST",
+            "url": "https://api.example.com/video/list",
+            "headers": {"Content-Type": "application/json"},
+            "request_body": {"page": 1, "size": 20},
+            "response_parse": {
+                "data_path": "$.data.items",
+                "fields": {
+                    "title": "$.title",
+                    "video_url": "$.videoUrl",
+                    "author": "$.author.name",
+                    "duration": "$.duration"
+                }
+            },
+            "download_limit": {"min": 200},
+            "loop_times": 3
+        },
+        rule_dict={
+            'videos_cnt': {'min': 500, 'max': 0},
+            'duration': {'min': 30, 'max': 1200}
+        },
+        user_list=[{"uid": 20631262, "link": "recommend_2060", "nick_name": "人老心不老"}],
+        trace_id=str(uuid.uuid4())
+    )
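+
+    # In production this platform_config would normally come from the platform's
+    # entry in configs/spiders_config.yaml (cf. the ConfigLoader call removed
+    # from __init__), rather than being built inline.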

-    cr.run()
+    cr.run()