# recommendspider.py
  1. from spiders.basespider import BaseSpider
  2. from typing import Optional, List, Dict
  3. import aiohttp
  4. from core.utils.extractors import safe_extract
  5. class RecommendSpider(BaseSpider):
  6. """推荐模式爬虫:从推荐接口分页爬取"""
  7. async def core_loop(self):
  8. """核心循环:分页请求推荐接口"""
  9. async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
  10. for loop_index in range(self.loop_times):
  11. # 检查数量限制
  12. self.logger.info(f"检测{self.platform}当日入库视频量")
  13. if not await self.is_video_count_sufficient():
  14. return
  15. # 获取推荐列表数据
  16. self.logger.info(f"开始获取{self.platform}推荐列表数据")
  17. raw_data = await self.crawl_data(session)
  18. if not raw_data:
  19. self.logger.info("视频列表为空,开始下次请求")
  20. await self.wait()
  21. continue
  22. # 处理数据
  23. await self.process_data(raw_data)
  24. # 等待下一轮
  25. await self.wait()
  26. async def crawl_data(self, session) -> Optional[List[Dict]]:
  27. """请求推荐接口(适配推荐模式)"""
  28. request_body = self.request_preparer.prepare(self.request_body_template, self.last_response_data)
  29. response = await self.request_client.request(
  30. session=session,
  31. method=self.method,
  32. url=self.url,
  33. headers=self.headers,
  34. json=request_body
  35. )
  36. self.last_response_data = response
  37. # 解析推荐列表
  38. if not response:
  39. self.logger.warning("接口响应为空")
  40. return None
  41. data_list = safe_extract(response, self.data_path)
  42. if not data_list:
  43. self.logger.info(f"接口返回视频列表为空: {response}")
  44. self.aliyun_log.logging(code="9021", message="接口返回视频列表为空", data=response)
  45. return None
  46. return data_list