2 mesiacov pred · e45d8d3e6a
--- a/core/utils/request_preparer.py
+++ b/core/utils/request_preparer.py
@@ -35,22 +35,28 @@ class RequestPreparer:
 
				             if isinstance(value, str) and "{{" in value and "}}" in value:
			
 
				                 # 提取变量名（支持后续扩展默认值）
			
 
				                 var_name = value.strip("{}").split("|")[0]
			
 
				-                jsonpath_expr = self.response_parse_config.get(var_name)
			
 
				                 
			
 
				-                if jsonpath_expr:
			
 
				-                    extracted_value = safe_extract(response_data, jsonpath_expr, default="")
			
 
				-                    prepared_body[key] = extracted_value
			
 
				-                    
			
 
				-                    # 记录提取信息（仅在有日志记录器时）
			
 
				-                    if extracted_value == "" and self.logger:
			
 
				-                        self.logger.debug(f"变量 {var_name} 提取结果为空，使用默认值")
			
 
				+                # 首先尝试直接从 response_data 中获取（处理扁平字典情况）
			
 
				+                if var_name in response_data:
			
 
				+                    prepared_body[key] = response_data[var_name]
			
 
				                 else:
			
 
				-                    # response_parse_config 中未配置路径，默认空字符串
			
 
				-                    prepared_body[key] = ""
			
 
				+                    # 否则使用 JSONPath 提取（处理嵌套对象情况）
			
 
				+                    jsonpath_expr = self.response_parse_config.get(var_name)
			
 
				                     
			
 
				-                    # 记录警告信息
			
 
				-                    if self.logger:
			
 
				-                        self.logger.warning(f"未在response_parse_config中找到变量 {var_name} 的路径配置")
			
 
				+                    if jsonpath_expr:
			
 
				+                        extracted_value = safe_extract(response_data, jsonpath_expr, default="")
			
 
				+                        prepared_body[key] = extracted_value
			
 
				+                        
			
 
				+                        # 记录提取信息（仅在有日志记录器时）
			
 
				+                        if extracted_value == "" and self.logger:
			
 
				+                            self.logger.debug(f"变量 {var_name} 提取结果为空，使用默认值")
			
 
				+                    else:
			
 
				+                        # response_parse_config 中未配置路径，默认空字符串
			
 
				+                        prepared_body[key] = ""
			
 
				+                        
			
 
				+                        # 记录警告信息
			
 
				+                        if self.logger:
			
 
				+                            self.logger.warning(f"未在response_parse_config中找到变量 {var_name} 的路径配置")
			
 
				             else:
			
 
				                 prepared_body[key] = value
			
 
				         return prepared_body
			
@@ -86,4 +92,13 @@ if __name__ == "__main__":
 
				     # 测试首次请求情况
			
 
				     prepared_body_first = preparer.prepare(request_body_config, {})
			
 
				     print("首次请求时请求体:", prepared_body_first)
			
 
				-    # 输出: {'cursor': '', 'category': 'recommend', 'flag': ''}
			
 
				+    # 输出: {'cursor': '', 'category': 'recommend', 'flag': ''}
			
 
				+    
			
 
				+    # 测试扁平字典情况
			
 
				+    flat_data = {
			
 
				+        "next_cursor": "flat_abc123",
			
 
				+        "flag": "flat_on"
			
 
				+    }
			
 
				+    prepared_body_flat = preparer.prepare(request_body_config, flat_data)
			
 
				+    print("扁平字典请求体:", prepared_body_flat)
			
 
				+    # 输出: {'cursor': 'flat_abc123', 'category': 'recommend', 'flag': 'flat_on'}
			
--- a/spiders/authorspider.py
+++ b/spiders/authorspider.py
@@ -2,6 +2,8 @@
 
				 from datetime import datetime, timedelta
			
 
				 from typing import List, Dict, Optional
			
 
				 
			
 
				+from config import settings
			
 
				+from core.base.async_redis_client import RedisManager
			
 
				 from core.utils.helpers import is_near_next_day
			
 
				 from spiders.basespider import BaseSpider
			
 
				 from core.utils.extractors import safe_extract
			
@@ -13,6 +15,7 @@ class AuthorSpider(BaseSpider):
 
				     def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
			
 
				         super().__init__(rule_dict, user_list, env)
			
 
				         # 账号模式特有状态
			
 
				+
			
 
				         self.user_list_from_db = []  # 数据库用户列表
			
 
				         self.current_user_index = 0  # 当前用户索引
			
 
				         self.current_cursor = ""  # 当前分页游标（初始为空）
			
@@ -20,6 +23,7 @@ class AuthorSpider(BaseSpider):
 
				     async def before_run(self):
			
 
				         """运行前：获取用户列表 """
			
 
				         await super().before_run()
			
 
				+        # await RedisManager.init(redis_url=settings.redis_url)
			
 
				         self.user_list_from_db = await self.fetch_user_list()
			
 
				         if not self.user_list_from_db:
			
 
				             self.logger.warning("用户列表为空，终止账号模式")
			
@@ -53,8 +57,8 @@ class AuthorSpider(BaseSpider):
 
				             if self.platform == "xiaoniangao":
			
 
				                 self.user_list = [user]  # 特殊逻辑
			
 
				             pass_video = await self.process_data(raw_data)
			
 
				-            # 根据是否有通过的数据和下一页游标判断是否继续当前用户
			
 
				-            if pass_video != 0 and self.current_cursor:
			
 
				+            # 根据是否有更多数据和下一页游标判断是否继续当前用户
			
 
				+            if raw_data and self.current_cursor:
			
 
				                 self.logger.info(
			
 
				                     f"用户 {crawler_user_uid} 获取到 {pass_video} 个通过视频,继续扫描第{self.current_cursor}页")
			
 
				             else:
			
@@ -74,7 +78,7 @@ class AuthorSpider(BaseSpider):
 
				         virtual_data = {
			
 
				             # "uid": "173309188", # 测试
			
 
				             "uid": str(user.get("link")),
			
 
				-            "cursor": self.current_cursor
			
 
				+            "next_cursor": self.current_cursor
			
 
				         }
			
 
				 
			
 
				         return self.request_preparer.prepare(
			
@@ -92,8 +96,10 @@ class AuthorSpider(BaseSpider):
 
				             return None
			
 
				 
			
 
				         # 游标处理逻辑
			
 
				-        if safe_extract(response, self.next_cursor):
			
 
				-            self.current_cursor = safe_extract(response, self.next_cursor)
			
 
				+        next_cursor_value = safe_extract(response, self.next_cursor)
			
 
				+        if next_cursor_value is not None:
			
 
				+            self.current_cursor = str(next_cursor_value)
			
 
				+            self.logger.debug(f"更新游标: {self.current_cursor}")
			
 
				 
			
 
				         data_list = safe_extract(response, self.data_path)
			
 
				         if not data_list:
			
--- a/spiders/basespider.py
+++ b/spiders/basespider.py
@@ -172,7 +172,7 @@ class BaseSpider(ABC):
 
				             except Exception as e:
			
 
				                 self.logger.error(f"处理单条数据失败: {e}")
			
 
				                 self.stats['fail'] += 1
			
 
				-        self.logger.info(f"批次处理完成: 成功 {success_count}/{data_length}")
			
 
				+        self.logger.info(f"批次处理完成: 成功抓取 {success_count}/{data_length}")
			
 
				         return success_count