|
@@ -1,20 +1,20 @@
|
|
|
import asyncio
|
|
|
import random
|
|
|
+import time
|
|
|
import traceback
|
|
|
import uuid
|
|
|
-
|
|
|
-import aiohttp
|
|
|
from abc import ABC
|
|
|
from typing import List, Dict, Optional
|
|
|
-import time
|
|
|
-from core.utils.log.logger_manager import LoggerManager
|
|
|
-from services.pipeline import PiaoQuanPipeline
|
|
|
-from core.utils.extractors import safe_extract
|
|
|
-from core.utils.config_loader import ConfigLoader
|
|
|
-from services.async_mysql_service import AsyncMysqlService
|
|
|
-from core.models.video_item import VideoItem
|
|
|
|
|
|
+import aiohttp
|
|
|
|
|
|
+from core.models.video_item import VideoItem
|
|
|
+from core.utils.spider_config import SpiderConfig
|
|
|
+from core.utils.extractors import safe_extract, extract_fields
|
|
|
+from core.utils.log.logger_manager import LoggerManager
|
|
|
+from services.async_mysql_service import AsyncMysqlService
|
|
|
+from services.pipeline import PiaoQuanPipeline
|
|
|
+from core.base.async_request_client import AsyncRequestClient
|
|
|
|
|
|
class BaseSpider(ABC):
|
|
|
"""
|
|
@@ -32,13 +32,13 @@ class BaseSpider(ABC):
|
|
|
self.class_name = self.__class__.__name__ # 获取子类类名
|
|
|
|
|
|
# 根据类名自动获取配置
|
|
|
- self.platform_config = ConfigLoader.get_platform_config(platform=str(self.class_name.lower()))
|
|
|
+ self.platform_config = SpiderConfig.get_platform_config(classname=str(self.class_name.lower()))
|
|
|
if not self.platform_config:
|
|
|
raise ValueError(f"找不到对应配置: {self.class_name}")
|
|
|
|
|
|
# 初始化日志和MQ
|
|
|
- self.platform = self.platform_config.get("platform")
|
|
|
- self.mode = self.platform_config.get("mode")
|
|
|
+ self.platform = self.platform_config.platform
|
|
|
+ self.mode = self.platform_config.mode
|
|
|
self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
|
|
|
|
|
|
self.logger.info(f"{trace_id}--初始化爬虫类: {self.class_name}")
|
|
@@ -46,90 +46,40 @@ class BaseSpider(ABC):
|
|
|
self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
|
|
|
|
|
|
# 请求配置
|
|
|
- self.method = self.platform_config.get("method", "GET").upper()
|
|
|
- self.url = self.platform_config.get("url")
|
|
|
- self.headers = self.platform_config.get("headers", {})
|
|
|
- self.body = self.platform_config.get("request_body", {})
|
|
|
- self.field_map = self.platform_config.get("response_parse", {}).get("fields", {})
|
|
|
- self.data_path = self.platform_config.get("response_parse", {}).get("data_path")
|
|
|
- self.video_fields_map = self.platform_config.get("video_fields_map", {})
|
|
|
+ self.method = self.platform_config.method.upper()
|
|
|
+ self.url = self.platform_config.url
|
|
|
+ self.headers = self.platform_config.headers
|
|
|
+ self.body = self.platform_config.request_body
|
|
|
+
|
|
|
+
|
|
|
+        self.response = self.platform_config.response_parse
|
|
|
+ self.field_map = self.response.get("fields", {})
|
|
|
+ self.data_path = self.response.get("data_path")
|
|
|
+
|
|
|
|
|
|
# 流程控制配置
|
|
|
- self.loop_times = self.platform_config.get("loop_times", 1) # 循环次数
|
|
|
- self.loop_interval = self.platform_config.get("loop_interval", 0) # 循环间隔(秒)
|
|
|
+ self.loop_times = self.platform_config.loop_times # 循环次数
|
|
|
+ self.loop_interval = self.platform_config.loop_interval # 循环间隔(秒)
|
|
|
|
|
|
self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
|
|
|
+ self.request_client = AsyncRequestClient(logger=self.logger)
|
|
|
|
|
|
self.logger.info(
|
|
|
f"{self.trace_id}--配置: 循环{self.loop_times}次,间隔{self.loop_interval}秒")
|
|
|
|
|
|
self.session = None
|
|
|
|
|
|
- async def _send_async_request(self, session: aiohttp.ClientSession, method: str, url: str,
|
|
|
- **kwargs) -> aiohttp.ClientResponse:
|
|
|
- """
|
|
|
- 使用提供的 session 发送异步HTTP请求,支持重试机制
|
|
|
- """
|
|
|
- retries = 0
|
|
|
- self.logger.info(f"{self.trace_id}--请求准备: {method} {url}, 参数: {kwargs}")
|
|
|
-
|
|
|
- while retries < self.MAX_RETRIES:
|
|
|
- try:
|
|
|
- async with session.request(method, url, **kwargs) as response:
|
|
|
- response.raise_for_status()
|
|
|
- self.logger.info(f"{self.trace_id}--请求成功: {response.status}")
|
|
|
- return await response.json()
|
|
|
- except Exception as e:
|
|
|
- retries += 1
|
|
|
- remaining_attempts = self.MAX_RETRIES - retries
|
|
|
-
|
|
|
- if retries < self.MAX_RETRIES:
|
|
|
- self.logger.warning(
|
|
|
- f"{self.trace_id}--请求失败 (尝试 {retries}/{self.MAX_RETRIES}): {e}. "
|
|
|
- f"剩余尝试次数: {remaining_attempts}"
|
|
|
- )
|
|
|
- await asyncio.sleep(1)
|
|
|
- else:
|
|
|
- self.aliyun_logr.logging(
|
|
|
- code="5001",
|
|
|
- message="请求失败,已达到最大重试次数",
|
|
|
- data={
|
|
|
- "url": url,
|
|
|
- "method": method,
|
|
|
- "error": str(e),
|
|
|
- "headers": kwargs.get("headers", {}),
|
|
|
- "body": kwargs.get("json", {})
|
|
|
- },
|
|
|
- trace_id=self.trace_id
|
|
|
- )
|
|
|
- self.logger.error(f"{self.trace_id}--请求失败,已达到最大重试次数: {e}")
|
|
|
- raise
|
|
|
-
|
|
|
- async def crawl_data(self) -> Optional[List[Dict]]:
|
|
|
- """异步获取视频数据"""
|
|
|
- self.logger.info(f"{self.trace_id}--开始获取视频数据")
|
|
|
- try:
|
|
|
- response = await self._send_async_request(
|
|
|
- session=self.session,
|
|
|
- method=self.method,
|
|
|
- url=self.url,
|
|
|
- headers=self.headers,
|
|
|
- json=self.body
|
|
|
- )
|
|
|
- self.logger.debug(f"{self.trace_id}--响应结果: {response}")
|
|
|
-
|
|
|
- data = safe_extract(response, self.data_path)
|
|
|
-
|
|
|
- if not data:
|
|
|
- self.logger.warning(f"{self.trace_id}--未获取到数据,路径: {self.data_path}")
|
|
|
- return []
|
|
|
|
|
|
- self.logger.info(f"{self.trace_id}--成功获取{len(data)}条视频数据")
|
|
|
- return data
|
|
|
- except Exception as e:
|
|
|
- traceback.extract_stack()
|
|
|
- self.logger.exception(f"{self.trace_id}--获取视频数据失败: {e}")
|
|
|
- return []
|
|
|
+    async def crawl_data(self, session) -> Optional[List[Dict]]:
|
|
|
+ response = await self.request_client.request(
|
|
|
+ session=session,
|
|
|
+ method=self.method,
|
|
|
+ url=self.url,
|
|
|
+ headers=self.headers,
|
|
|
+ json=self.body
|
|
|
+ )
|
|
|
+ data = safe_extract(response, self.data_path)
|
|
|
+ return data if data else []
|
|
|
|
|
|
async def filter_data(self, video: Dict) -> bool:
|
|
|
"""校验视频是否符合规则"""
|
|
@@ -143,7 +93,6 @@ class BaseSpider(ABC):
|
|
|
)
|
|
|
return await pipeline.process_item()
|
|
|
|
|
|
-
|
|
|
async def is_video_count_sufficient(self) -> bool:
|
|
|
"""
|
|
|
校验视频是否达到当日最大量
|
|
@@ -167,41 +116,21 @@ class BaseSpider(ABC):
|
|
|
"""
|
|
|
self.logger.debug(f"{self.trace_id}--开始处理视频: {video.get('title', '无标题')}")
|
|
|
publish_user = random.choice(self.user_list)
|
|
|
- try:
|
|
|
- # 从 field_map 中动态构建 VideoItem 初始化参数
|
|
|
- item_kwargs = {}
|
|
|
- for field, path in self.field_map.items():
|
|
|
- if not isinstance(path, str) or not path.startswith("$"):
|
|
|
- item_kwargs[field] = path
|
|
|
- continue
|
|
|
-
|
|
|
- value = safe_extract(video, path)
|
|
|
- if value is None:
|
|
|
- self.logger.warning(f"{self.trace_id}--字段提取失败: {field} 路径: {path}")
|
|
|
- continue
|
|
|
- item_kwargs[field] = value
|
|
|
-
|
|
|
- item_kwargs["user_id"] = publish_user["uid"]
|
|
|
- item_kwargs["user_name"] = publish_user["nick_name"]
|
|
|
- # 手动注入 platform 与 strategy
|
|
|
- item_kwargs["platform"] = self.platform
|
|
|
- item_kwargs["strategy"] = self.mode
|
|
|
-
|
|
|
-
|
|
|
- try:
|
|
|
- item = VideoItem(**item_kwargs)
|
|
|
- except Exception as e:
|
|
|
- self.logger.warning(f"{self.trace_id}--VideoItem 初始化失败: {e}, 数据: {item_kwargs}")
|
|
|
- return None
|
|
|
+ item_kwargs = extract_fields(video, self.field_map, logger=self.logger, trace_id=self.trace_id)
|
|
|
+ item_kwargs["user_id"] = publish_user["uid"]
|
|
|
+ item_kwargs["user_name"] = publish_user["nick_name"]
|
|
|
+ item_kwargs["platform"] = self.platform
|
|
|
+ item_kwargs["strategy"] = self.mode
|
|
|
|
|
|
+ try:
|
|
|
+ item = VideoItem(**item_kwargs)
|
|
|
video_dict = await item.produce_item()
|
|
|
if not video_dict:
|
|
|
- self.logger.warning(f"{self.trace_id}--VideoItem 校验失败")
|
|
|
+ self.logger.warning(f"{self.trace_id} 校验失败")
|
|
|
return None
|
|
|
return video_dict
|
|
|
-
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(f"{self.trace_id}--视频处理异常: {e}")
|
|
|
+ self.logger.error(f"{self.trace_id} VideoItem 初始化失败: {e}")
|
|
|
return None
|
|
|
|
|
|
async def push_to_etl(self, item: Dict) -> bool:
|
|
@@ -226,7 +155,7 @@ class BaseSpider(ABC):
|
|
|
查询每天的爬虫爬取到的视频数量
|
|
|
:return:
|
|
|
"""
|
|
|
- video_count = self.db_service.get_today_videos()
|
|
|
+ video_count = await self.db_service.get_today_videos()
|
|
|
return video_count
|
|
|
|
|
|
async def integrated_video_handling(self):
|
|
@@ -235,6 +164,7 @@ class BaseSpider(ABC):
|
|
|
:return:
|
|
|
"""
|
|
|
pass
|
|
|
+
|
|
|
async def run(self):
|
|
|
"""
|
|
|
异步运行爬虫任务,严格按顺序执行
|
|
@@ -245,27 +175,16 @@ class BaseSpider(ABC):
|
|
|
5. 推送到ETL
|
|
|
"""
|
|
|
try:
|
|
|
- self.logger.info(f"{self.trace_id}--[{self.platform}] 开始执行爬虫任务")
|
|
|
- total_success = 0
|
|
|
- total_failed = 0
|
|
|
-
|
|
|
- async with aiohttp.ClientSession(
|
|
|
- timeout=aiohttp.ClientTimeout(total=self.TIMEOUT)
|
|
|
- ) as session: # 上下文管理
|
|
|
- self.session = session
|
|
|
-
|
|
|
+            total_success, total_failed = 0, 0
|
|
|
+ loop_start_time = time.time()
|
|
|
+ async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.TIMEOUT)) as session:
|
|
|
for loop_index in range(1, self.loop_times + 1):
|
|
|
- if not await self.is_video_count_sufficient():
|
|
|
- return
|
|
|
self.logger.info(f"{self.trace_id}--步骤1: 开始第 {loop_index}/{self.loop_times} 次循环请求")
|
|
|
- loop_start_time = time.time()
|
|
|
-
|
|
|
- video_list = await self.crawl_data()
|
|
|
+ video_list = await self.crawl_data(session)
|
|
|
if not video_list:
|
|
|
self.logger.warning(f"{self.trace_id}--未获取到视频数据,跳过当前循环")
|
|
|
await self._wait_for_next_loop(loop_index)
|
|
|
continue
|
|
|
-
|
|
|
success_count = 0
|
|
|
fail_count = 0
|
|
|
|
|
@@ -302,7 +221,7 @@ class BaseSpider(ABC):
|
|
|
message="爬虫执行指标汇总",
|
|
|
data={
|
|
|
"trace_id": self.trace_id,
|
|
|
- "platform": self.platform,
|
|
|
+ "classname": self.platform,
|
|
|
"success_count": total_success,
|
|
|
"fail_count": total_failed
|
|
|
},
|