zhangliang 1 týždeň pred
rodič
commit
e45d8d3e6a

+ 29 - 14
core/utils/request_preparer.py

@@ -35,22 +35,28 @@ class RequestPreparer:
             if isinstance(value, str) and "{{" in value and "}}" in value:
                 # 提取变量名(支持后续扩展默认值)
                 var_name = value.strip("{}").split("|")[0]
-                jsonpath_expr = self.response_parse_config.get(var_name)
                 
-                if jsonpath_expr:
-                    extracted_value = safe_extract(response_data, jsonpath_expr, default="")
-                    prepared_body[key] = extracted_value
-                    
-                    # 记录提取信息(仅在有日志记录器时)
-                    if extracted_value == "" and self.logger:
-                        self.logger.debug(f"变量 {var_name} 提取结果为空,使用默认值")
+                # 首先尝试直接从 response_data 中获取(处理扁平字典情况)
+                if var_name in response_data:
+                    prepared_body[key] = response_data[var_name]
                 else:
-                    # response_parse_config 中未配置路径,默认空字符串
-                    prepared_body[key] = ""
+                    # 否则使用 JSONPath 提取(处理嵌套对象情况)
+                    jsonpath_expr = self.response_parse_config.get(var_name)
                     
-                    # 记录警告信息
-                    if self.logger:
-                        self.logger.warning(f"未在response_parse_config中找到变量 {var_name} 的路径配置")
+                    if jsonpath_expr:
+                        extracted_value = safe_extract(response_data, jsonpath_expr, default="")
+                        prepared_body[key] = extracted_value
+                        
+                        # 记录提取信息(仅在有日志记录器时)
+                        if extracted_value == "" and self.logger:
+                            self.logger.debug(f"变量 {var_name} 提取结果为空,使用默认值")
+                    else:
+                        # response_parse_config 中未配置路径,默认空字符串
+                        prepared_body[key] = ""
+                        
+                        # 记录警告信息
+                        if self.logger:
+                            self.logger.warning(f"未在response_parse_config中找到变量 {var_name} 的路径配置")
             else:
                 prepared_body[key] = value
         return prepared_body
@@ -86,4 +92,13 @@ if __name__ == "__main__":
     # 测试首次请求情况
     prepared_body_first = preparer.prepare(request_body_config, {})
     print("首次请求时请求体:", prepared_body_first)
-    # 输出: {'cursor': '', 'category': 'recommend', 'flag': ''}
+    # 输出: {'cursor': '', 'category': 'recommend', 'flag': ''}
+    
+    # 测试扁平字典情况
+    flat_data = {
+        "next_cursor": "flat_abc123",
+        "flag": "flat_on"
+    }
+    prepared_body_flat = preparer.prepare(request_body_config, flat_data)
+    print("扁平字典请求体:", prepared_body_flat)
+    # 输出: {'cursor': 'flat_abc123', 'category': 'recommend', 'flag': 'flat_on'}

+ 11 - 5
spiders/authorspider.py

@@ -2,6 +2,8 @@
 from datetime import datetime, timedelta
 from typing import List, Dict, Optional
 
+from config import settings
+from core.base.async_redis_client import RedisManager
 from core.utils.helpers import is_near_next_day
 from spiders.basespider import BaseSpider
 from core.utils.extractors import safe_extract
@@ -13,6 +15,7 @@ class AuthorSpider(BaseSpider):
     def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
         super().__init__(rule_dict, user_list, env)
         # 账号模式特有状态
+
         self.user_list_from_db = []  # 数据库用户列表
         self.current_user_index = 0  # 当前用户索引
         self.current_cursor = ""  # 当前分页游标(初始为空)
@@ -20,6 +23,7 @@ class AuthorSpider(BaseSpider):
     async def before_run(self):
         """运行前:获取用户列表 """
         await super().before_run()
+        # await RedisManager.init(redis_url=settings.redis_url)
         self.user_list_from_db = await self.fetch_user_list()
         if not self.user_list_from_db:
             self.logger.warning("用户列表为空,终止账号模式")
@@ -53,8 +57,8 @@ class AuthorSpider(BaseSpider):
             if self.platform == "xiaoniangao":
                 self.user_list = [user]  # 特殊逻辑
             pass_video = await self.process_data(raw_data)
-            # 根据是否有通过的数据和下一页游标判断是否继续当前用户
-            if pass_video != 0 and self.current_cursor:
+            # 根据是否有更多数据和下一页游标判断是否继续当前用户
+            if raw_data and self.current_cursor:
                 self.logger.info(
                     f"用户 {crawler_user_uid} 获取到 {pass_video} 个通过视频,继续扫描第{self.current_cursor}页")
             else:
@@ -74,7 +78,7 @@ class AuthorSpider(BaseSpider):
         virtual_data = {
             # "uid": "173309188", # 测试
             "uid": str(user.get("link")),
-            "cursor": self.current_cursor
+            "next_cursor": self.current_cursor
         }
 
         return self.request_preparer.prepare(
@@ -92,8 +96,10 @@ class AuthorSpider(BaseSpider):
             return None
 
         # 游标处理逻辑
-        if safe_extract(response, self.next_cursor):
-            self.current_cursor = safe_extract(response, self.next_cursor)
+        next_cursor_value = safe_extract(response, self.next_cursor)
+        if next_cursor_value is not None:
+            self.current_cursor = str(next_cursor_value)
+            self.logger.debug(f"更新游标: {self.current_cursor}")
 
         data_list = safe_extract(response, self.data_path)
         if not data_list:

+ 1 - 1
spiders/basespider.py

@@ -172,7 +172,7 @@ class BaseSpider(ABC):
             except Exception as e:
                 self.logger.error(f"处理单条数据失败: {e}")
                 self.stats['fail'] += 1
-        self.logger.info(f"批次处理完成: 成功 {success_count}/{data_length}")
+        self.logger.info(f"批次处理完成: 成功抓取 {success_count}/{data_length}")
         return success_count