|
@@ -25,24 +25,25 @@ class BaseSpider:
|
|
|
子类只需根据业务重写少量方法,如 process_video/process_item。
|
|
|
"""
|
|
|
|
|
|
- def __init__(self, rule_dict: Dict, user_list: List, trace_id: str, env: str = "prod"):
|
|
|
- self.trace_id = trace_id
|
|
|
+ def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
|
|
|
self.env = env
|
|
|
self.user_list = user_list
|
|
|
self.rule_dict = rule_dict
|
|
|
self.class_name = self.__class__.__name__.lower()
|
|
|
|
|
|
- # 读取配置
|
|
|
+ # 通过小写子类名读取配置
|
|
|
self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
|
|
|
+
|
|
|
if not self.platform_config:
|
|
|
raise ValueError(f"找不到对应配置: {self.class_name}")
|
|
|
|
|
|
self.platform = self.platform_config.platform
|
|
|
self.mode = self.platform_config.mode
|
|
|
self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
|
|
|
- self.aliyun_logr = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
|
- self.mq_producer = AsyncMQProducer(topic_name="topic_crawler_etl_prod_v2", platform=self.platform, mode=self.mode)
|
|
|
|
|
|
+ self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
|
+
|
|
|
+ self.mq_producer = AsyncMQProducer(topic_name="topic_crawler_etl_prod_v2", platform=self.platform, mode=self.mode)
|
|
|
self.method = self.platform_config.method.upper()
|
|
|
self.url = self.platform_config.url
|
|
|
self.headers = self.platform_config.headers or {}
|
|
@@ -55,13 +56,14 @@ class BaseSpider:
|
|
|
|
|
|
self.loop_times = self.platform_config.loop_times or 100
|
|
|
self.loop_interval = self.platform_config.loop_interval or 5
|
|
|
+ self.logger.info(f"开始{self.platform}爬取,最大循环次数{self.loop_times},循环间隔{self.loop_interval}s")
|
|
|
self.feishu_sheetid = self.platform_config.feishu_sheetid
|
|
|
|
|
|
self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
|
|
|
- self.request_client = AsyncRequestClient(logger=self.logger, aliyun_log=self.aliyun_logr)
|
|
|
+ self.request_client = AsyncRequestClient(logger=self.logger, aliyun_log=self.aliyun_log)
|
|
|
|
|
|
- self.timeout = 30
|
|
|
- self.max_retries = 3
|
|
|
+ self.timeout = self.platform_config.request_timeout or 30
|
|
|
+ self.max_retries = self.platform_config.max_retries or 3
|
|
|
|
|
|
# 当前分页游标,默认空字符串,支持动态替换request_body中任何字段(如cursor)
|
|
|
self.dynamic_params = {key: "" for key in self.request_body_template.keys()}
|
|
@@ -75,26 +77,18 @@ class BaseSpider:
|
|
|
async def run(self):
|
|
|
""" 爬虫主流程 """
|
|
|
await self.before_run()
|
|
|
-
|
|
|
total_success, total_fail = 0, 0
|
|
|
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
|
|
|
for loop_index in range(self.loop_times):
|
|
|
- if self.limit_flag:
|
|
|
- self.logger.info(f"{self.trace_id} 已达到抓取限制,停止爬虫")
|
|
|
- break
|
|
|
-
|
|
|
if not await self.is_video_count_sufficient():
|
|
|
- self.logger.info(f"{self.trace_id} 视频抓取数量已达上限,提前结束")
|
|
|
- break
|
|
|
-
|
|
|
+ self.logger.info(f"视频抓取数量已达上限,提前结束")
|
|
|
+ return
|
|
|
succ, fail = await self.run_single_loop(session)
|
|
|
total_success += succ
|
|
|
total_fail += fail
|
|
|
-
|
|
|
await self._wait_for_next_loop(loop_index + 1)
|
|
|
|
|
|
- await self.after_run()
|
|
|
- self.logger.info(f"{self.trace_id} 爬虫完成 成功:{total_success} 失败:{total_fail}")
|
|
|
+ self.logger.info(f"爬虫完成 成功:{total_success} 失败:{total_fail}")
|
|
|
|
|
|
async def run_single_loop(self, session) -> (int, int):
|
|
|
"""
|
|
@@ -102,9 +96,10 @@ class BaseSpider:
|
|
|
"""
|
|
|
success_count, fail_count = 0, 0
|
|
|
try:
|
|
|
+ # 爬取数据
|
|
|
videos = await self.crawl_data(session)
|
|
|
if not videos:
|
|
|
- self.logger.info(f"{self.trace_id} 无数据返回,停止本轮")
|
|
|
+ self.logger.info(f"无数据返回,停止本轮")
|
|
|
return success_count, fail_count
|
|
|
|
|
|
for video in videos:
|
|
@@ -117,9 +112,10 @@ class BaseSpider:
|
|
|
fail_count += 1
|
|
|
if self.limit_flag:
|
|
|
break
|
|
|
+ await self.after_run()
|
|
|
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(f"{self.trace_id} 运行异常: {e}")
|
|
|
+ self.logger.exception(f"运行异常: {e}")
|
|
|
|
|
|
return success_count, fail_count
|
|
|
|
|
@@ -148,7 +144,7 @@ class BaseSpider:
|
|
|
)
|
|
|
|
|
|
if not response:
|
|
|
- self.logger.error(f"{self.trace_id} 响应为空")
|
|
|
+ self.logger.error(f"响应为空")
|
|
|
return []
|
|
|
|
|
|
# 更新游标(支持动态参数更新)
|
|
@@ -159,7 +155,7 @@ class BaseSpider:
|
|
|
# 解析数据列表
|
|
|
data_list = safe_extract(response, self.data_path)
|
|
|
if not data_list:
|
|
|
- self.logger.info(f"{self.trace_id} 未获取到有效数据")
|
|
|
+ self.logger.info(f"未获取到有效数据")
|
|
|
return []
|
|
|
|
|
|
return data_list
|
|
@@ -212,7 +208,7 @@ class BaseSpider:
|
|
|
|
|
|
return pushed
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(f"{self.trace_id} 视频处理异常: {e}")
|
|
|
+ self.logger.exception(f"视频处理异常: {e}")
|
|
|
return False
|
|
|
|
|
|
async def process_video(self, video: Dict) -> Optional[Dict]:
|
|
@@ -220,7 +216,7 @@ class BaseSpider:
|
|
|
统一字段抽取及 VideoItem 初始化
|
|
|
子类可重写或扩展以定制字段映射、过滤等
|
|
|
"""
|
|
|
- self.logger.debug(f"{self.trace_id} 处理视频数据: {video.get('title', '无标题')}")
|
|
|
+ self.logger.debug(f"处理视频数据: {video.get('title', '无标题')}")
|
|
|
publish_user = None
|
|
|
if self.user_list:
|
|
|
import random
|
|
@@ -228,7 +224,7 @@ class BaseSpider:
|
|
|
else:
|
|
|
publish_user = {"uid": "default", "nick_name": "default_user"}
|
|
|
|
|
|
- item_kwargs = extract_fields(video, self.field_map, logger=self.logger, trace_id=self.trace_id, aliyun_log=self.aliyun_logr)
|
|
|
+ item_kwargs = extract_fields(video, self.field_map, logger=self.logger,aliyun_log=self.aliyun_log)
|
|
|
item_kwargs.update({
|
|
|
"user_id": publish_user.get("uid"),
|
|
|
"user_name": publish_user.get("nick_name"),
|
|
@@ -240,11 +236,11 @@ class BaseSpider:
|
|
|
item = VideoItem(**item_kwargs)
|
|
|
video_dict = await item.produce_item()
|
|
|
if not video_dict:
|
|
|
- self.logger.warning(f"{self.trace_id} VideoItem 校验失败")
|
|
|
+ self.logger.warning(f"VideoItem 校验失败")
|
|
|
return None
|
|
|
return video_dict
|
|
|
except Exception as e:
|
|
|
- self.logger.error(f"{self.trace_id} VideoItem 初始化失败: {e}")
|
|
|
+ self.logger.error(f"VideoItem 初始化失败: {e}")
|
|
|
return None
|
|
|
|
|
|
async def filter_data(self, video: Dict) -> bool:
|
|
@@ -274,10 +270,10 @@ class BaseSpider:
|
|
|
"""
|
|
|
try:
|
|
|
await self.mq_producer.send_msg(video)
|
|
|
- self.logger.info(f"{self.trace_id} 成功推送视频至ETL")
|
|
|
+ self.logger.info(f"成功推送视频至ETL")
|
|
|
return True
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(f"{self.trace_id} 推送ETL失败: {e}")
|
|
|
+ self.logger.exception(f"推送ETL失败: {e}")
|
|
|
return False
|
|
|
|
|
|
async def is_video_count_sufficient(self) -> bool:
|
|
@@ -290,15 +286,15 @@ class BaseSpider:
|
|
|
async with AsyncMysqlService(self.platform, self.mode) as mysql:
|
|
|
current_count = await mysql.get_today_videos()
|
|
|
if current_count >= max_count:
|
|
|
- self.logger.info(f"{self.trace_id} 今日视频已达上限: {current_count}")
|
|
|
- self.aliyun_logr.logging(code="1011", message="视频数量达到当日最大值", data=f"<今日视频数量>{current_count}")
|
|
|
+ self.logger.info(f"{self.platform} 今日视频数量已达上限: {current_count}")
|
|
|
+ self.aliyun_log.logging(code="1011", message="视频数量达到当日最大值", data=f"<今日视频数量>{current_count}")
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
async def _wait_for_next_loop(self, current_loop: int) -> None:
|
|
|
"""等待下次循环"""
|
|
|
if current_loop < self.loop_times and self.loop_interval > 0:
|
|
|
- self.logger.info(f"{self.trace_id} 等待 {self.loop_interval} 秒后进行下一次请求")
|
|
|
+ self.logger.info(f"等待 {self.loop_interval} 秒后进行下一次请求")
|
|
|
await asyncio.sleep(self.loop_interval)
|
|
|
|
|
|
async def before_run(self):
|