|
@@ -2,6 +2,8 @@
|
|
|
from datetime import datetime, timedelta
|
|
|
from typing import List, Dict, Optional
|
|
|
|
|
|
+from config import settings
|
|
|
+from core.base.async_redis_client import RedisManager
|
|
|
from core.utils.helpers import is_near_next_day
|
|
|
from spiders.basespider import BaseSpider
|
|
|
from core.utils.extractors import safe_extract
|
|
@@ -13,6 +15,7 @@ class AuthorSpider(BaseSpider):
|
|
|
def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
|
|
|
super().__init__(rule_dict, user_list, env)
|
|
|
# 账号模式特有状态
|
|
|
+
|
|
|
self.user_list_from_db = [] # 数据库用户列表
|
|
|
self.current_user_index = 0 # 当前用户索引
|
|
|
self.current_cursor = "" # 当前分页游标(初始为空)
|
|
@@ -20,6 +23,7 @@ class AuthorSpider(BaseSpider):
|
|
|
async def before_run(self):
|
|
|
"""运行前:获取用户列表 """
|
|
|
await super().before_run()
|
|
|
+ # await RedisManager.init(redis_url=settings.redis_url)
|
|
|
self.user_list_from_db = await self.fetch_user_list()
|
|
|
if not self.user_list_from_db:
|
|
|
self.logger.warning("用户列表为空,终止账号模式")
|
|
@@ -53,8 +57,8 @@ class AuthorSpider(BaseSpider):
|
|
|
if self.platform == "xiaoniangao":
|
|
|
self.user_list = [user] # 特殊逻辑
|
|
|
pass_video = await self.process_data(raw_data)
|
|
|
- # 根据是否有通过的数据和下一页游标判断是否继续当前用户
|
|
|
- if pass_video != 0 and self.current_cursor:
|
|
|
+ # 根据是否有更多数据和下一页游标判断是否继续当前用户
|
|
|
+ if raw_data and self.current_cursor:
|
|
|
self.logger.info(
|
|
|
f"用户 {crawler_user_uid} 获取到 {pass_video} 个通过视频,继续扫描第{self.current_cursor}页")
|
|
|
else:
|
|
@@ -74,7 +78,7 @@ class AuthorSpider(BaseSpider):
|
|
|
virtual_data = {
|
|
|
# "uid": "173309188", # 测试
|
|
|
"uid": str(user.get("link")),
|
|
|
- "cursor": self.current_cursor
|
|
|
+ "next_cursor": self.current_cursor
|
|
|
}
|
|
|
|
|
|
return self.request_preparer.prepare(
|
|
@@ -92,8 +96,10 @@ class AuthorSpider(BaseSpider):
|
|
|
return None
|
|
|
|
|
|
# 游标处理逻辑
|
|
|
- if safe_extract(response, self.next_cursor):
|
|
|
- self.current_cursor = safe_extract(response, self.next_cursor)
|
|
|
+ next_cursor_value = safe_extract(response, self.next_cursor)
|
|
|
+ if next_cursor_value is not None:
|
|
|
+ self.current_cursor = str(next_cursor_value)
|
|
|
+ self.logger.debug(f"更新游标: {self.current_cursor}")
|
|
|
|
|
|
data_list = safe_extract(response, self.data_path)
|
|
|
if not data_list:
|