|
@@ -6,10 +6,10 @@ import aiohttp
|
|
|
|
|
|
from core.models.video_item import VideoItem
|
|
|
from core.utils.helpers import generate_titles
|
|
|
+from core.utils.request_preparer import RequestPreparer
|
|
|
from core.utils.spider_config import SpiderConfig
|
|
|
from core.utils.extractors import safe_extract, extract_fields
|
|
|
from core.utils.log.logger_manager import LoggerManager
|
|
|
-from core.utils.template_resolver import resolve_request_body_template
|
|
|
from services.async_mysql_service import AsyncMysqlService
|
|
|
from services.pipeline import PiaoQuanPipeline
|
|
|
from core.base.async_request_client import AsyncRequestClient
|
|
@@ -26,52 +26,67 @@ class BaseSpider:
|
|
|
"""
|
|
|
|
|
|
def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
|
|
|
- self.env = env
|
|
|
- self.user_list = user_list
|
|
|
self.rule_dict = rule_dict
|
|
|
+ self.user_list = user_list
|
|
|
+ self.env = env
|
|
|
self.class_name = self.__class__.__name__.lower()
|
|
|
|
|
|
- # 通过小写子类名读取配置
|
|
|
- self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
|
|
|
+ # --- 1. 初始化核心组件 ---
|
|
|
+ self._setup_configuration()
|
|
|
+ self._setup_logging()
|
|
|
+ self._setup_services()
|
|
|
+ self._setup_state()
|
|
|
|
|
|
+ # 初始化辅助方法
|
|
|
+ def _setup_configuration(self):
|
|
|
+ """加载并设置爬虫的核心配置。"""
|
|
|
+ self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
|
|
|
if not self.platform_config:
|
|
|
- raise ValueError(f"找不到对应配置: {self.class_name}")
|
|
|
+ raise ValueError(f"找不到爬虫配置: {self.class_name}")
|
|
|
|
|
|
self.platform = self.platform_config.platform
|
|
|
self.mode = self.platform_config.mode
|
|
|
- self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
|
|
|
-
|
|
|
- self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
|
-
|
|
|
- self.mq_producer = AsyncMQProducer(topic_name="topic_crawler_etl_prod_v2", platform=self.platform, mode=self.mode)
|
|
|
- self.method = self.platform_config.method.upper()
|
|
|
self.url = self.platform_config.url
|
|
|
+ self.method = self.platform_config.method.upper()
|
|
|
self.headers = self.platform_config.headers or {}
|
|
|
- self.request_body_template = self.platform_config.request_body or {}
|
|
|
|
|
|
- self.response_parse = self.platform_config.response_parse or {}
|
|
|
- self.next_cursor_path = self.response_parse.get("next_cursor")
|
|
|
- self.data_path = self.response_parse.get("data_path")
|
|
|
- self.field_map = self.response_parse.get("fields", {})
|
|
|
+ # 请求和解析相关的配置
|
|
|
+ self.request_body_template = self.platform_config.request_body or {}
|
|
|
+ self.response_parse_config = self.platform_config.response_parse or {}
|
|
|
+ self.data_path = self.response_parse_config.get("data_path")
|
|
|
+ # self.next_cursor_path = self.response_parse_config.get("next_cursor")
|
|
|
+ self.field_map = self.response_parse_config.get("fields", {})
|
|
|
|
|
|
+ # 爬取行为相关的配置
|
|
|
self.loop_times = self.platform_config.loop_times or 100
|
|
|
self.loop_interval = self.platform_config.loop_interval or 5
|
|
|
- self.logger.info(f"开始{self.platform}爬取,最大循环次数{self.loop_times},循环间隔{self.loop_interval}s")
|
|
|
- self.feishu_sheetid = self.platform_config.feishu_sheetid
|
|
|
-
|
|
|
- self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
|
|
|
- self.request_client = AsyncRequestClient(logger=self.logger, aliyun_log=self.aliyun_log)
|
|
|
-
|
|
|
self.timeout = self.platform_config.request_timeout or 30
|
|
|
self.max_retries = self.platform_config.max_retries or 3
|
|
|
+ self.feishu_sheetid = self.platform_config.feishu_sheetid
|
|
|
|
|
|
- # 当前分页游标,默认空字符串,支持动态替换request_body中任何字段(如cursor)
|
|
|
- self.dynamic_params = {key: "" for key in self.request_body_template.keys()}
|
|
|
- # 允许子类重写,支持多游标等复杂情况
|
|
|
- self.current_cursor = ""
|
|
|
+ def _setup_logging(self):
|
|
|
+ """初始化日志记录器。"""
|
|
|
+ self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
|
|
|
+ self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
|
+ self.logger.info(f"爬虫 '{self.platform}/{self.mode}' 初始化...")
|
|
|
+ self.logger.info(f"最大循环次数: {self.loop_times}, 循环间隔: {self.loop_interval}s")
|
|
|
+
|
|
|
+ def _setup_services(self):
|
|
|
+ """初始化外部服务客户端。"""
|
|
|
+ self.request_client = AsyncRequestClient(logger=self.logger, aliyun_log=self.aliyun_log)
|
|
|
+ self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
|
|
|
+ self.mq_producer = AsyncMQProducer(topic_name="topic_crawler_etl_prod_v2", platform=self.platform,
|
|
|
+ mode=self.mode)
|
|
|
+
|
|
|
+ def _setup_state(self):
|
|
|
+ """初始化爬虫的内部状态。"""
|
|
|
+ self.last_response_data = {}
|
|
|
+ self.request_preparer = RequestPreparer(
|
|
|
+ response_parse_config=self.response_parse_config,
|
|
|
+ logger=self.logger,
|
|
|
+ aliyun_log=self.aliyun_log
|
|
|
+ )
|
|
|
|
|
|
- self.download_cnt = 0
|
|
|
- self.limit_flag = False
|
|
|
|
|
|
|
|
|
async def run(self):
|
|
@@ -81,18 +96,19 @@ class BaseSpider:
|
|
|
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
|
|
|
for loop_index in range(self.loop_times):
|
|
|
if not await self.is_video_count_sufficient():
|
|
|
- self.logger.info(f"视频抓取数量已达上限,提前结束")
|
|
|
+ self.logger.info(f"视频抓取数量已达上限,停止爬取")
|
|
|
return
|
|
|
succ, fail = await self.run_single_loop(session)
|
|
|
total_success += succ
|
|
|
total_fail += fail
|
|
|
await self._wait_for_next_loop(loop_index + 1)
|
|
|
|
|
|
- self.logger.info(f"爬虫完成 成功:{total_success} 失败:{total_fail}")
|
|
|
+ self.logger.info(f"爬虫完成 成功:{total_success} 失败:{total_fail}")
|
|
|
|
|
|
async def run_single_loop(self, session) -> (int, int):
|
|
|
"""
|
|
|
- 单轮请求与处理
|
|
|
+ 执行单轮的请求、解析和处理。
|
|
|
+ 返回: (本轮成功处理的数量, 本轮失败处理的数量)
|
|
|
"""
|
|
|
success_count, fail_count = 0, 0
|
|
|
try:
|
|
@@ -110,8 +126,7 @@ class BaseSpider:
|
|
|
success_count += 1
|
|
|
else:
|
|
|
fail_count += 1
|
|
|
- if self.limit_flag:
|
|
|
- break
|
|
|
+ self.logger.info(f"接口返回<{len(videos)}>条视频,处理成功<{success_count}>条,处理失败:<{fail_count}>")
|
|
|
await self.after_run()
|
|
|
|
|
|
except Exception as e:
|
|
@@ -131,63 +146,42 @@ class BaseSpider:
|
|
|
请求接口,自动渲染动态参数,自动更新游标
|
|
|
支持单请求和多请求(分页)逻辑。
|
|
|
"""
|
|
|
- # 动态渲染请求体
|
|
|
- # resolved_body = self._render_request_body()
|
|
|
-
|
|
|
+ request_body = self.request_preparer.prepare(self.request_body_template,
|
|
|
+ self.last_response_data)
|
|
|
# 发送请求
|
|
|
response = await self.request_client.request(
|
|
|
session=session,
|
|
|
method=self.method,
|
|
|
url=self.url,
|
|
|
headers=self.headers,
|
|
|
- json= self.dynamic_params
|
|
|
+ json = request_body
|
|
|
)
|
|
|
|
|
|
if not response:
|
|
|
self.logger.error(f"响应为空")
|
|
|
- return []
|
|
|
-
|
|
|
- # 更新游标(支持动态参数更新)
|
|
|
- if self.next_cursor_path:
|
|
|
- next_cursor = safe_extract(response, self.next_cursor_path) or ""
|
|
|
- self._update_cursor(next_cursor)
|
|
|
+ return
|
|
|
|
|
|
+ self.last_response_data = response
|
|
|
# 解析数据列表
|
|
|
data_list = safe_extract(response, self.data_path)
|
|
|
if not data_list:
|
|
|
- self.logger.info(f"未获取到有效数据")
|
|
|
- return []
|
|
|
+ self.logger.info(f"接口返回视频列表为空{response}")
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="9021",
|
|
|
+ message="接口返回视频列表为空",
|
|
|
+ data= response
|
|
|
+ )
|
|
|
+ return
|
|
|
|
|
|
return data_list
|
|
|
|
|
|
- def _render_request_body(self) -> Dict:
|
|
|
- """
|
|
|
- 用当前动态参数渲染请求体模板,支持多参数动态替换
|
|
|
- """
|
|
|
- body = {}
|
|
|
- for k, v in self.request_body_template.items():
|
|
|
- if isinstance(v, str) and v.startswith("{{") and v.endswith("}}"):
|
|
|
- key = v.strip("{} ")
|
|
|
- body[k] = self.dynamic_params.get(key, "")
|
|
|
- else:
|
|
|
- body[k] = v
|
|
|
- return body
|
|
|
-
|
|
|
- def _update_cursor(self, cursor_value: str):
|
|
|
- """
|
|
|
- 更新分页游标并动态参数,方便下一次请求使用
|
|
|
- """
|
|
|
- self.current_cursor = cursor_value
|
|
|
- # 如果配置的游标字段在请求体中,更新动态参数
|
|
|
- if "cursor" in self.dynamic_params:
|
|
|
- self.dynamic_params["cursor"] = cursor_value
|
|
|
-
|
|
|
async def process_and_push_video(self, video: Dict[str, Any]) -> bool:
|
|
|
"""
|
|
|
数据处理完整流程(字段映射 -> 校验 -> 推送)
|
|
|
子类可重写 process_video 或 filter_data 来定制处理和校验逻辑
|
|
|
"""
|
|
|
try:
|
|
|
+ # 字段映射
|
|
|
video_obj = await self.process_video(video)
|
|
|
if not video_obj:
|
|
|
return False
|
|
@@ -197,15 +191,6 @@ class BaseSpider:
|
|
|
|
|
|
await self.integrated_video_handling(video_obj)
|
|
|
pushed = await self.push_to_etl(video_obj)
|
|
|
-
|
|
|
- # 达到下载上限,停止继续抓取
|
|
|
- if self.rule_dict.get("videos_cnt", {}).get("min") and \
|
|
|
- self.download_cnt >= self.rule_dict["videos_cnt"]["min"]:
|
|
|
- self.limit_flag = True
|
|
|
-
|
|
|
- if pushed:
|
|
|
- self.download_cnt += 1
|
|
|
-
|
|
|
return pushed
|
|
|
except Exception as e:
|
|
|
self.logger.exception(f"视频处理异常: {e}")
|
|
@@ -217,19 +202,19 @@ class BaseSpider:
|
|
|
子类可重写或扩展以定制字段映射、过滤等
|
|
|
"""
|
|
|
self.logger.debug(f"处理视频数据: {video.get('title', '无标题')}")
|
|
|
- publish_user = None
|
|
|
if self.user_list:
|
|
|
import random
|
|
|
publish_user = random.choice(self.user_list)
|
|
|
else:
|
|
|
- publish_user = {"uid": "default", "nick_name": "default_user"}
|
|
|
+ self.logger.error(f"未获取到用户列表数据{self.user_list}")
|
|
|
+ return
|
|
|
|
|
|
item_kwargs = extract_fields(video, self.field_map, logger=self.logger,aliyun_log=self.aliyun_log)
|
|
|
item_kwargs.update({
|
|
|
"user_id": publish_user.get("uid"),
|
|
|
"user_name": publish_user.get("nick_name"),
|
|
|
"platform": self.platform,
|
|
|
- "strategy": self.mode
|
|
|
+ "strategy": self.mode,
|
|
|
})
|
|
|
|
|
|
try:
|
|
@@ -286,10 +271,14 @@ class BaseSpider:
|
|
|
async with AsyncMysqlService(self.platform, self.mode) as mysql:
|
|
|
current_count = await mysql.get_today_videos()
|
|
|
if current_count >= max_count:
|
|
|
- self.logger.info(f"{self.platform} 今日视频数量已达上限: {current_count}")
|
|
|
self.aliyun_log.logging(code="1011", message="视频数量达到当日最大值", data=f"<今日视频数量>{current_count}")
|
|
|
return False
|
|
|
- return True
|
|
|
+ self.logger.info(f"{self.platform} 今日入库视频数: {current_count}/{max_count}")
|
|
|
+ self.aliyun_log.logging(code="1012",
|
|
|
+ message=f"目前入库量{current_count}",
|
|
|
+ data=f"{current_count}/{max_count}"
|
|
|
+ )
|
|
|
+ return True
|
|
|
|
|
|
async def _wait_for_next_loop(self, current_loop: int) -> None:
|
|
|
"""等待下次循环"""
|