|
@@ -19,23 +19,27 @@ from services.async_mq_producer import AsyncMQProducer
|
|
|
|
|
|
class BaseSpider(ABC):
|
|
class BaseSpider(ABC):
|
|
"""通用爬虫基类"""
|
|
"""通用爬虫基类"""
|
|
-
|
|
|
|
- def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
|
|
|
|
|
|
+
|
|
|
|
+ def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod",
|
|
|
|
+ request_client: AsyncRequestClient = None,
|
|
|
|
+ db_service: AsyncMysqlService = None,
|
|
|
|
+ mq_producer: AsyncMQProducer = None):
|
|
self.rule_dict = rule_dict
|
|
self.rule_dict = rule_dict
|
|
self.user_list = user_list
|
|
self.user_list = user_list
|
|
self.env = env
|
|
self.env = env
|
|
self.class_name = self.__class__.__name__.lower()
|
|
self.class_name = self.__class__.__name__.lower()
|
|
- print(self.class_name)
|
|
|
|
|
|
|
|
# 初始化核心组件
|
|
# 初始化核心组件
|
|
self._setup_configuration()
|
|
self._setup_configuration()
|
|
self._setup_logging()
|
|
self._setup_logging()
|
|
- self._setup_services()
|
|
|
|
|
|
+ self._setup_services(request_client, db_service, mq_producer)
|
|
self._setup_state()
|
|
self._setup_state()
|
|
|
|
|
|
# 通用状态
|
|
# 通用状态
|
|
self.total_success = 0
|
|
self.total_success = 0
|
|
self.total_fail = 0
|
|
self.total_fail = 0
|
|
|
|
+ self.video = None
|
|
|
|
+
|
|
|
|
|
|
def _setup_configuration(self):
|
|
def _setup_configuration(self):
|
|
self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
|
|
self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
|
|
@@ -51,6 +55,7 @@ class BaseSpider(ABC):
|
|
self.data_path = self.response_parse_config.get("data_path")
|
|
self.data_path = self.response_parse_config.get("data_path")
|
|
self.has_more = self.response_parse_config.get("has_more")
|
|
self.has_more = self.response_parse_config.get("has_more")
|
|
self.field_map = self.response_parse_config.get("fields", {})
|
|
self.field_map = self.response_parse_config.get("fields", {})
|
|
|
|
+ self.next_cursor = self.response_parse_config.get("next_cursor") or ""
|
|
self.loop_times = self.platform_config.loop_times or 100
|
|
self.loop_times = self.platform_config.loop_times or 100
|
|
self.loop_interval = self.platform_config.loop_interval or {"min": 2, "max": 5}
|
|
self.loop_interval = self.platform_config.loop_interval or {"min": 2, "max": 5}
|
|
self.timeout = self.platform_config.request_timeout or 30
|
|
self.timeout = self.platform_config.request_timeout or 30
|
|
@@ -62,11 +67,23 @@ class BaseSpider(ABC):
|
|
self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
|
|
self.logger.info(f"爬虫 '{self.platform}/{self.mode}' 初始化...")
|
|
self.logger.info(f"爬虫 '{self.platform}/{self.mode}' 初始化...")
|
|
|
|
|
|
- def _setup_services(self):
|
|
|
|
- self.request_client = AsyncRequestClient(logger=self.logger, aliyun_log=self.aliyun_log)
|
|
|
|
- self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
|
|
|
|
- self.mq_producer = AsyncMQProducer(topic_name="topic_crawler_etl_prod_v2", platform=self.platform, mode=self.mode)
|
|
|
|
-
|
|
|
|
|
|
+ def _setup_services(self, request_client: AsyncRequestClient = None,
|
|
|
|
+ db_service: AsyncMysqlService = None,
|
|
|
|
+ mq_producer: AsyncMQProducer = None):
|
|
|
|
+ """初始化服务组件"""
|
|
|
|
+ self.request_client = request_client or AsyncRequestClient(
|
|
|
|
+ logger=self.logger,
|
|
|
|
+ aliyun_log=self.aliyun_log
|
|
|
|
+ )
|
|
|
|
+ self.db_service = db_service or AsyncMysqlService(
|
|
|
|
+ platform=self.platform,
|
|
|
|
+ mode=self.mode
|
|
|
|
+ )
|
|
|
|
+ self.mq_producer = mq_producer or AsyncMQProducer(
|
|
|
|
+ topic_name="topic_crawler_etl_prod_v2",
|
|
|
|
+ platform=self.platform,
|
|
|
|
+ mode=self.mode
|
|
|
|
+ )
|
|
def _setup_state(self):
|
|
def _setup_state(self):
|
|
self.last_response_data = {}
|
|
self.last_response_data = {}
|
|
self.request_preparer = RequestPreparer(
|
|
self.request_preparer = RequestPreparer(
|
|
@@ -99,9 +116,9 @@ class BaseSpider(ABC):
|
|
return item
|
|
return item
|
|
|
|
|
|
# 通用数据处理流程
|
|
# 通用数据处理流程
|
|
- async def process_raw_data(self, raw_data: List[Dict]):
|
|
|
|
|
|
+ async def process_data(self, video_data: List[Dict]):
|
|
"""处理原始数据列表(清洗→过滤→推送)"""
|
|
"""处理原始数据列表(清洗→过滤→推送)"""
|
|
- for item in raw_data:
|
|
|
|
|
|
+ for item in video_data:
|
|
try:
|
|
try:
|
|
# 补充详情(完全由子类实现)
|
|
# 补充详情(完全由子类实现)
|
|
detail_data = await self.fetch_detail(item)
|
|
detail_data = await self.fetch_detail(item)
|
|
@@ -117,6 +134,7 @@ class BaseSpider(ABC):
|
|
|
|
|
|
async def process_and_push_video(self, video: Dict[str, Any]) -> bool:
|
|
async def process_and_push_video(self, video: Dict[str, Any]) -> bool:
|
|
try:
|
|
try:
|
|
|
|
+ self.video = video
|
|
video_obj = await self.process_video(video)
|
|
video_obj = await self.process_video(video)
|
|
if not video_obj:
|
|
if not video_obj:
|
|
return False
|
|
return False
|
|
@@ -128,17 +146,28 @@ class BaseSpider(ABC):
|
|
self.logger.exception(f"视频处理异常: {e}")
|
|
self.logger.exception(f"视频处理异常: {e}")
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
+ async def publish_video_user(self) -> Dict[str, Any]:
|
|
|
|
+ """获取随机发布用户"""
|
|
|
|
+ if self.user_list:
|
|
|
|
+ return random.choice(self.user_list)
|
|
|
|
+ else:
|
|
|
|
+ self.logger.error("未获取到用户列表数据")
|
|
|
|
+ return None
|
|
|
|
+
|
|
|
|
+
|
|
async def process_video(self, video: Dict) -> Optional[Dict]:
|
|
async def process_video(self, video: Dict) -> Optional[Dict]:
|
|
"""
|
|
"""
|
|
字段映射
|
|
字段映射
|
|
统一字段抽取及 VideoItem 初始化
|
|
统一字段抽取及 VideoItem 初始化
|
|
"""
|
|
"""
|
|
self.logger.info(f"处理视频数据: {video}")
|
|
self.logger.info(f"处理视频数据: {video}")
|
|
- if self.user_list:
|
|
|
|
- publish_user = random.choice(self.user_list)
|
|
|
|
- else:
|
|
|
|
- self.logger.error(f"未获取到用户列表数据{self.user_list}")
|
|
|
|
- return
|
|
|
|
|
|
+ publish_user = await self.publish_video_user()
|
|
|
|
+
|
|
|
|
+ # 检查是否成功获取到发布用户
|
|
|
|
+ if not publish_user:
|
|
|
|
+ self.logger.error("无法获取发布用户信息")
|
|
|
|
+ return None
|
|
|
|
+
|
|
item_kwargs = extract_fields(video, self.field_map, logger=self.logger, aliyun_log=self.aliyun_log)
|
|
item_kwargs = extract_fields(video, self.field_map, logger=self.logger, aliyun_log=self.aliyun_log)
|
|
item_kwargs.update({
|
|
item_kwargs.update({
|
|
"user_id": publish_user.get("uid"),
|
|
"user_id": publish_user.get("uid"),
|
|
@@ -150,7 +179,7 @@ class BaseSpider(ABC):
|
|
item = VideoItem(**item_kwargs)
|
|
item = VideoItem(**item_kwargs)
|
|
video_dict = await item.produce_item()
|
|
video_dict = await item.produce_item()
|
|
if not video_dict:
|
|
if not video_dict:
|
|
- self.logger.warning(f"VideoItem 校验失败")
|
|
|
|
|
|
+ self.logger.warning("VideoItem 校验失败")
|
|
return None
|
|
return None
|
|
return video_dict
|
|
return video_dict
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -176,7 +205,8 @@ class BaseSpider(ABC):
|
|
"""
|
|
"""
|
|
钩子函数:可在此实现自动生成标题或其他业务逻辑
|
|
钩子函数:可在此实现自动生成标题或其他业务逻辑
|
|
"""
|
|
"""
|
|
- await generate_titles(self.feishu_sheetid, video)
|
|
|
|
|
|
+ # 视频标题处理生成
|
|
|
|
+ await generate_titles(self.feishu_sheetid, video,self.logger,self.aliyun_log)
|
|
|
|
|
|
async def push_to_etl(self, video: Dict) -> bool:
|
|
async def push_to_etl(self, video: Dict) -> bool:
|
|
try:
|
|
try:
|
|
@@ -214,7 +244,11 @@ class BaseSpider(ABC):
|
|
return True
|
|
return True
|
|
|
|
|
|
async def wait(self):
|
|
async def wait(self):
|
|
- wait_time = random.randint(self.loop_interval["min"], self.loop_interval["max"])
|
|
|
|
|
|
+ """等待随机时间间隔"""
|
|
|
|
+ # 确保loop_interval包含min和max键
|
|
|
|
+ min_time = self.loop_interval.get("min", 1)
|
|
|
|
+ max_time = self.loop_interval.get("max", 5)
|
|
|
|
+ wait_time = random.randint(min_time, max_time)
|
|
self.logger.info(f"等待 {wait_time} 秒后继续")
|
|
self.logger.info(f"等待 {wait_time} 秒后继续")
|
|
await asyncio.sleep(wait_time)
|
|
await asyncio.sleep(wait_time)
|
|
|
|
|