|
@@ -13,11 +13,15 @@ from core.utils.helpers import generate_titles
|
|
|
from core.utils.spider_config import SpiderConfig
|
|
|
from core.utils.extractors import safe_extract, extract_fields
|
|
|
from core.utils.log.logger_manager import LoggerManager
|
|
|
+from core.utils.template_resolver import resolve_request_body_template
|
|
|
from services.async_mysql_service import AsyncMysqlService
|
|
|
from services.pipeline import PiaoQuanPipeline
|
|
|
from core.base.async_request_client import AsyncRequestClient
|
|
|
from services.async_mq_producer import AsyncMQProducer
|
|
|
|
|
|
+
|
|
|
+import re
|
|
|
+PLACEHOLDER_PATTERN = re.compile(r"\{\{\s*([a-zA-Z0-9_.]+)(\|[^}]+)?\s*\}\}")  # Matches "{{ name }}" / "{{ name|suffix }}" placeholders: group 1 is the name (letters, digits, "_" and "."), optional group 2 is the "|..." suffix.
|
|
|
class BaseSpider(ABC):
|
|
|
"""
|
|
|
通用爬虫基类:支持严格顺序执行流程
|
|
@@ -56,6 +60,7 @@ class BaseSpider(ABC):
|
|
|
self.response =self.platform_config.response_parse
|
|
|
self.field_map = self.response.get("fields", {})
|
|
|
self.data_path = self.response.get("data_path")
|
|
|
+ self.next_cursor_path = self.response.get("next_cursor")
|
|
|
|
|
|
|
|
|
# 流程控制配置
|
|
@@ -71,15 +76,39 @@ class BaseSpider(ABC):
|
|
|
self.session = None
|
|
|
self.feishu_sheetid = self.platform_config.feishu_sheetid
|
|
|
|
|
|
+ # 动态检测分页变量名
|
|
|
+ self.cursor_variable_name = None
|
|
|
+ body_str = str(self.body)
|
|
|
+ match = PLACEHOLDER_PATTERN.search(body_str)
|
|
|
+ if match:
|
|
|
+ self.cursor_variable_name = match.group(1)
|
|
|
+ self.logger.info(f"{self.trace_id}--检测到分页变量名: {self.cursor_variable_name}")
|
|
|
+ else:
|
|
|
+ self.logger.info(f"{self.trace_id}--未检测到分页变量名,默认不分页")
|
|
|
+
|
|
|
+ self.current_cursor = None
|
|
|
+
|
|
|
+
|
|
|
async def crawl_data(self, session, dynamic_variables=None) -> Optional[List[Dict]]:
    """
    Request one page from the platform API and return the extracted records.

    The request body template (``self.body``) is resolved against the
    template variables before the request is sent. When a pagination
    placeholder name is stored in ``self.cursor_variable_name``, the cursor
    saved by the previous call (``self.current_cursor``) is injected under
    that name; an empty string is injected when no cursor is stored.

    :param session: session object forwarded to ``self.request_client.request``.
    :param dynamic_variables: optional mapping of template variable names to
        values. It is copied before use and never modified.
    :return: the value found at ``self.data_path`` in the response, or ``[]``
        when that value is falsy.

    Side effect: when ``self.next_cursor_path`` is configured,
    ``self.current_cursor`` is overwritten with the value extracted from the
    response (``None`` when the extracted value is falsy).
    """
    # Work on a private copy: the cursor is written into this mapping, and
    # writing into the caller's dict would leak pagination state back to the
    # caller (and only when the dict happened to be non-empty).
    variables = dict(dynamic_variables) if dynamic_variables else {}
    if self.cursor_variable_name:
        variables[self.cursor_variable_name] = self.current_cursor or ""

    resolved_body = resolve_request_body_template(self.body, variables)

    response = await self.request_client.request(
        session=session,
        method=self.method,
        url=self.url,
        headers=self.headers,
        json=resolved_body,
    )

    if self.next_cursor_path:
        # Remember the cursor for the next call; falsy values are stored as None.
        extracted_cursor = safe_extract(response, self.next_cursor_path)
        self.current_cursor = extracted_cursor if extracted_cursor else None
        self.logger.info(f"{self.trace_id}--解析到下一页 {self.cursor_variable_name}: {self.current_cursor}")

    data = safe_extract(response, self.data_path)
    return data if data else []
|
|
|
|
|
@@ -200,6 +229,7 @@ class BaseSpider(ABC):
|
|
|
success_count = 0
|
|
|
fail_count = 0
|
|
|
|
|
|
+
|
|
|
for video in video_list:
|
|
|
# 提取视频字段映射关系
|
|
|
video_obj = await self.process_video(video)
|