12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- from spiders.basespider import BaseSpider
- from typing import Optional, List, Dict
- import aiohttp
- from core.utils.extractors import safe_extract
class RecommendSpider(BaseSpider):
    """Recommendation-mode spider: pages through a recommendation feed API."""

    async def core_loop(self):
        """Main loop: repeatedly hit the recommendation endpoint, one page per round."""
        timeout_cfg = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout_cfg) as session:
            for _round in range(self.loop_times):
                # Stop entirely once today's ingestion quota is reached.
                self.logger.info(f"检测{self.platform}当日入库视频量")
                if not await self.is_video_count_sufficient():
                    return

                # Fetch one page of recommendations.
                self.logger.info(f"开始获取{self.platform}推荐列表数据")
                items = await self.crawl_data(session)
                if items:
                    await self.process_data(items)
                else:
                    self.logger.info("视频列表为空,开始下次请求")

                # Throttle before the next round.
                await self.wait()

    async def crawl_data(self, session) -> Optional[List[Dict]]:
        """Call the recommendation API and return the extracted video list, or None."""
        payload = self.request_preparer.prepare(
            self.request_body_template, self.last_response_data
        )
        response = await self.request_client.request(
            session=session,
            method=self.method,
            url=self.url,
            headers=self.headers,
            json=payload,
        )
        # Keep the raw response so the next request body can be derived from it.
        self.last_response_data = response

        if not response:
            self.logger.warning("接口响应为空")
            return None

        videos = safe_extract(response, self.data_path)
        if videos:
            return videos

        self.logger.info(f"接口返回视频列表为空: {response}")
        self.aliyun_log.logging(code="9021", message="接口返回视频列表为空", data=response)
        return None
|