zhangliang 2 weeks ago
parent
commit
f692fdb1f7
42 changed files with 1269 additions and 1440 deletions
+ 0 - 213  CONFIGURATION.md
+ 3 - 3  README.md
+ 23 - 2  config/__init__.py
+ 54 - 51  config/base.py
+ 1 - 1  config/spiders_config.yaml
+ 172 - 64  core/base/async_request_client.py
+ 0 - 0  core/di/__init__.py
+ 40 - 0  core/di/container.py
+ 86 - 0  core/models/message_parser.py
+ 32 - 23  core/models/spiders_config_models.py
+ 26 - 34  core/models/video_item.py
+ 6 - 0  core/utils/codes.py
+ 0 - 219  core/utils/config_documentation.py
+ 0 - 184  core/utils/config_health_check.py
+ 0 - 129  core/utils/config_manager.py
+ 0 - 3  core/utils/feishu_data_async.py
+ 38 - 7  core/utils/helpers.py
+ 3 - 2  core/utils/log/log_codes.py
+ 87 - 0  core/utils/log_codes.py
+ 7 - 32  core/utils/spider_config.py
+ 140 - 0  core/video_processor.py
+ 2 - 1  main.py
+ 40 - 40  requirements.txt
+ 2 - 5  services/pipeline.py
+ 108 - 0  spiders/advanced_factory.py
+ 0 - 0  spiders/author/__init__.py
+ 0 - 0  spiders/author/xiaoniangao_author.py
+ 82 - 73  spiders/authorspider.py
+ 198 - 189  spiders/basespider.py
+ 0 - 0  spiders/recommend/__init__.py
+ 5 - 1  spiders/recommend/benshanzhufu_recommend.py
+ 4 - 1  spiders/recommend/yuannifuqimanman_recommend.py
+ 49 - 53  spiders/recommendspider.py
+ 4 - 4  spiders/spider_registry.py
+ 0 - 0  test/__init__.py
+ 57 - 0  test/test_pydantic_upgrade.py
+ 0 - 8  tests/test1.py
+ 0 - 33  tests/test_async_redis_service.py
+ 0 - 10  tests/test_benshanzhufu_recommend.py
+ 0 - 4  tests/test_config.py
+ 0 - 21  tests/test_feishu.py
+ 0 - 30  tests/test_video_item.py

+ 0 - 213
CONFIGURATION.md

@@ -1,213 +0,0 @@
-# AutoScraperX Configuration Guide
-
-This document describes the configuration options of the AutoScraperX project in detail.
-
----
-
-# Environment Configuration
-
-Environment settings are provided via a `.env` file. All configurable options:
-
-| Option | Description | Required | Default |
-|--------|------|----------|--------|
-| ENV | Runtime environment (options: prod, dev) | No | prod |
-| DB_HOST | Database host address | Yes |  |
-| DB_PORT | Database port | No | 3306 |
-| DB_USER | Database user name | Yes |  |
-| DB_PASSWORD | Database password | Yes |  |
-| DB_NAME | Database name | Yes |  |
-| DB_CHARSET | Database character set | Yes |  |
-| ROCKETMQ_ENDPOINT | RocketMQ endpoint | Yes |  |
-| ROCKETMQ_ACCESS_KEY_ID | RocketMQ access key ID | Yes |  |
-| ROCKETMQ_ACCESS_KEY_SECRET | RocketMQ access key secret | Yes |  |
-| FEISHU_APPID | Feishu app ID | Yes |  |
-| FEISHU_APPSECRET | Feishu app secret | Yes |  |
-| ALIYUN_ACCESS_KEY_ID | Aliyun access key ID | Yes |  |
-| ALIYUN_ACCESS_KEY_SECRET | Aliyun access key secret | Yes |  |
-| REDIS_HOST | Redis host address | Yes |  |
-| REDIS_PORT | Redis port | No | 6379 |
-| REDIS_PASSWORD | Redis password | Yes |  |
-
----
-
-# Spider Configuration
-
-Spiders are configured via the `config/spiders_config.yaml` file.
-
-## Example Configuration
-
-```yaml
-default:
-  base_url: http://8.217.192.46:8889
-  request_timeout: 30
-  max_retries: 3
-  headers:
-    {"Content-Type": "application/json"}
-
-benshanzhufurecommend:
-  platform: benshanzhufu
-  mode: recommend
-  path: /crawler/ben_shan_zhu_fu/recommend
-  method: post
-  request_body:
-    cursor: "{{next_cursor}}"
-  loop_times: 50
-  loop_interval:
-    min: 30
-    max: 60
-  feishu_sheetid: "aTSJH4"
-  response_parse:
-    data: "$.data"
-    next_cursor: "$.data.next_cursor"
-    data_path: "$.data.data"
-    fields:
-      video_id: "$.nid"
-      video_title: "$.title"
-      play_cnt: 0
-      publish_time_stamp: "$.update_time"
-      out_user_id: "$.nid"
-      cover_url: "$.video_cover"
-      like_cnt: 0
-      video_url: "$.video_url"
-      out_video_id: "$.nid"
-
-
-yuannifuqimanmanrecommend:
-  platform: yuannifuqimanman
-  mode: recommend
-  path: /crawler/yuan_ni_fu_qi_man_man/recommend
-  method: post
-  request_body:
-    cursor: "{{next_cursor}}"
-  loop_times: 100
-  loop_interval:
-    min: 30
-    max: 60
-  feishu_sheetid: "golXy9"
-  response_parse:
-    data: "$.data"
-    next_cursor: "$.data.next_cursor"
-    data_path: "$.data.data"
-    fields:
-      video_id: "$.nid"
-      video_title: "$.title"
-      out_user_id: "$.nid"
-      cover_url: "$.video_cover"
-      video_url: "$.video_url"
-      out_video_id: "$.nid"
-
-xiaoniangaoauthor:
-  platform: xiaoniangao
-  mode: author
-  path: /crawler/xiao_nian_gao_plus/blogger
-  method: post
-  request_body:
-      cursor: "{{next_cursor}}"
-      account_id: "{{uid}}" # uid from the database
-  loop_times: 100
-  loop_interval:
-    min: 5
-    max: 10
-  feishu_sheetid: "K0gA9Y"
-  response_parse:
-    uid: "$.uid" # uid from the database
-    next_cursor: "$.cursor"
-    data: "$.data"
-    has_more: "$.data.has_more"
-    data_path: "$.data.data"
-    fields:
-      video_title: "$.title"
-      duration: "$.du"
-      play_cnt: "$.play_pv"
-      like_cnt: "$.favor.total"
-      comment_cnt: "$.comment_count"
-      share_cnt: "$.share"
-      width: "$.w"
-      height: "$.h"
-      avatar_url: "$.user.hurl"
-      cover_url: "$.url"
-      video_url: "$.v_url"
-      out_user_id: "$.user.mid"
-      out_video_id: "$.vid"
-      publish_time_stamp: "$.t"
-
-xiaoniangaorecommend:
-  platform: xiaoniangao
-  mode: recommend
-  path: crawler/xiao_nian_gao_plus/recommend
-  method: post
-  request_body:
-
-  loop_times: 100
-  loop_interval:
-    min: 5
-    max: 20
-  feishu_sheetid: "D1nVxQ"
-  response_parse:
-    uid: "$.uid" # uid from the database
-    next_cursor: "$.cursor"
-    data: "$.data"
-    has_more: "$.data.has_more"
-    data_path: "$.data.data"
-    fields:
-      video_title: "$.title"
-      duration: "$.du"
-      play_cnt: "$.play_pv"
-      like_cnt: "$.favor.total"
-      comment_cnt: "$.comment_count"
-      share_cnt: "$.share"
-      width: "$.w"
-      height: "$.h"
-      avatar_url: "$.user.hurl"
-      cover_url: "$.url"
-      video_url: "$.v_url"
-      out_user_id: "$.user.mid"
-      out_video_id: "$.vid"
-      publish_time_stamp: "$.t"
-
-
-
-```
-
-## Field Reference
-
-### Global fields
-
-| Field | Description |
-|------|------|
-| base_url | Base URL used to build the full request URL |
-| request_timeout | Request timeout (seconds) |
-| max_retries | Maximum number of retries |
-| headers | Request headers |
-
-### Per-platform fields
-
-| Field | Description |
-|------|------|
-| platform | Platform name |
-| mode | Crawl mode (e.g. recommend, author) |
-| path | API path |
-| url | Full request URL |
-| method | HTTP request method |
-| request_body | Request body parameters |
-| loop_times | Number of loop iterations |
-| loop_interval | Loop interval (min/max) |
-| response_parse | Response parsing configuration |
-| feishu_sheetid | Feishu sheet ID |
-
-### Response parsing fields
-
-| Field | Description |
-|------|------|
-| data_path | Path to the data list |
-| next_cursor | Path to the next-page cursor |
-| has_more | Path to the has-more flag |
-| fields | Field mapping configuration |
-
----
-
-## Current Configuration Status
-
-- Number of platform configs: 4
-- Runtime environment: prod
-- Config file path: /Users/zhangliang/Documents/piaoquan/AutoScraperX/config/spiders_config.yaml

+ 3 - 3
README.md

@@ -9,10 +9,10 @@
 ```bash
 ├── config/                # configuration files
 │   ├── __init__.py        # config initialization
-│   ├── base.py            # environment settings
+│   ├── recommend.py            # environment settings
 │   └── spiders_config.yaml# per-platform spider config
 ├── core/                  # core framework modules
-│   ├── base/              # base components (async client, etc.)
+│   ├── recommend/              # base components (async client, etc.)
 │   ├── models/            # data models
 │   ├── utils/             # utilities
 │   │   ├── config_manager.py      # unified configuration manager
@@ -54,7 +54,7 @@
 - **Pipeline**: validates data, deduplicates, and pushes it to the ETL MQ
 - **MQ system**: Aliyun MQ; supports multiple topics per platform, with manual ACK after consumption
 - **Configuration system**:
-  - environment config: managed via the `.env` file and `config/base.py`
+  - environment config: managed via the `.env` file and `config/recommend.py`
   - spider config: managed via `config/spiders_config.yaml`
 
 ---

+ 23 - 2
config/__init__.py

@@ -1,3 +1,24 @@
-from .prod import settings  # use the production settings
+# config/__init__.py
+"""统一配置入口"""
+from .base import settings
+from core.utils.spider_config import SpiderConfig
+from core.models.spiders_config_models import PlatformConfig
 
-__all__ = ['settings']
+
+# Provide a unified configuration access interface
+class ConfigManager:
+    @staticmethod
+    def get_spider_config(spider_name: str) -> PlatformConfig:
+        return SpiderConfig.get_platform_config(spider_name)
+
+    @staticmethod
+    def get_all_spiders() -> list:
+        return SpiderConfig.list_all_platforms()
+
+    @staticmethod
+    def get_app_settings():
+        return settings
+
+
+# Global configuration manager
+config_manager = ConfigManager()
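
A minimal usage sketch of the new unified entry point (hypothetical driver code; the platform key `benshanzhufurecommend` comes from `spiders_config.yaml`):

```python
# Hypothetical usage of the unified config entry point.
from config import config_manager

# Fetch one platform's validated PlatformConfig.
cfg = config_manager.get_spider_config("benshanzhufurecommend")
print(cfg.platform, cfg.mode, cfg.loop_times)

# Enumerate configured platforms and read app-level settings.
print(config_manager.get_all_spiders())
print(config_manager.get_app_settings().ENV)
```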

+ 54 - 51
config/base.py

@@ -1,103 +1,106 @@
+# base.py (updated)
 import os
-
 from dotenv import load_dotenv
 from core.utils.path_utils import project_root, log_dir
-from pydantic import BaseSettings, Field, AnyUrl, validator
-
-# load the .env file before the Settings class is defined
+from pydantic import Field, AnyUrl, field_validator, ConfigDict
+from pydantic_settings import BaseSettings  # note: BaseSettings now lives in pydantic_settings
 
-load_dotenv(os.path.join(project_root,".env"))
+load_dotenv(os.path.join(project_root, ".env"))
 
 
 class Settings(BaseSettings):
+    # Pydantic 2.x uses ConfigDict in place of the inner Config class
+    model_config = ConfigDict(
+        env_file=".env",
+        env_file_encoding='utf-8',
+        case_sensitive=False,
+        env_prefix="",  # 明确指定环境变量前缀
+    )
+
     # environment flag
-    ENV: str = "prod"  # prod/dev
+    ENV: str = "prod"
     ENABLE_ALIYUN_LOG: bool = True
 
     # logging configuration
     LOG_DIR: str = log_dir
     LOG_LEVEL: str = "INFO"
 
-    # Aliyun database (RDS) settings
-    DB_HOST: str = Field(..., env="DB_HOST")
-    DB_PORT: int = Field(3306, env="DB_PORT")
-    DB_USER: str = Field(..., env="DB_USER")
-    DB_PASSWORD: str = Field(..., env="DB_PASSWORD")
-    DB_NAME: str = Field(..., env="DB_NAME")
-    DB_CHARSET: str = Field(..., env="DB_CHARSET")
+    # Aliyun database settings
+    DB_HOST: str = Field(..., validation_alias="DB_HOST")
+    DB_PORT: int = Field(3306, validation_alias="DB_PORT")
+    DB_USER: str = Field(..., validation_alias="DB_USER")
+    DB_PASSWORD: str = Field(..., validation_alias="DB_PASSWORD")
+    DB_NAME: str = Field(..., validation_alias="DB_NAME")
+    DB_CHARSET: str = Field(..., validation_alias="DB_CHARSET")
     DB_POOL_SIZE: int = 20
     DB_POOL_RECYCLE: int = 3600
 
     # Aliyun RocketMQ settings
-    ROCKETMQ_ENDPOINT: AnyUrl = Field(..., env="ROCKETMQ_ENDPOINT")
-    ROCKETMQ_ACCESS_KEY_ID: str = Field(..., env="ROCKETMQ_ACCESS_KEY_ID")
-    ROCKETMQ_ACCESS_KEY_SECRET: str = Field(..., env="ROCKETMQ_ACCESS_KEY_SECRET")
-    ROCKETMQ_INSTANCE_ID: str = Field(..., env="ROCKETMQ_INSTANCE_ID")
-    ROCKETMQ_WAIT_SECONDS: int = 30 # at most 30s
+    ROCKETMQ_ENDPOINT: str = Field(..., validation_alias="ROCKETMQ_ENDPOINT")
+    ROCKETMQ_ACCESS_KEY_ID: str = Field(..., validation_alias="ROCKETMQ_ACCESS_KEY_ID")
+    ROCKETMQ_ACCESS_KEY_SECRET: str = Field(..., validation_alias="ROCKETMQ_ACCESS_KEY_SECRET")
+    ROCKETMQ_INSTANCE_ID: str = Field(..., validation_alias="ROCKETMQ_INSTANCE_ID")
+    ROCKETMQ_WAIT_SECONDS: int = 30
     ROCKETMQ_BATCH: int = 1
 
     # Feishu settings
-    FEISHU_APPID: str = Field(..., env="FEISHU_APPID")
-    FEISHU_APPSECRET: str = Field(..., env="FEISHU_APPSECRET")
+    FEISHU_APPID: str = Field(..., validation_alias="FEISHU_APPID")
+    FEISHU_APPSECRET: str = Field(..., validation_alias="FEISHU_APPSECRET")
 
     # connection pool settings
     CONNECTION_TIMEOUT: int = 10
     REQUEST_TIMEOUT: int = 30
 
     # Aliyun logging
-    ALIYUN_ACCESS_KEY_ID: str = Field(..., env="ALIYUN_ACCESS_KEY_ID")
-    ALIYUN_ACCESS_KEY_SECRET: str = Field(..., env="ALIYUN_ACCESS_KEY_SECRET")
+    ALIYUN_ACCESS_KEY_ID: str = Field(..., validation_alias="ALIYUN_ACCESS_KEY_ID")
+    ALIYUN_ACCESS_KEY_SECRET: str = Field(..., validation_alias="ALIYUN_ACCESS_KEY_SECRET")
 
     # redis
-    REDIS_HOST: str = Field(..., env="REDIS_HOST")
-    REDIS_PORT: int = Field(..., env="REDIS_PORT")
-    REDIS_PASSWORD: str = Field(..., env="REDIS_PASSWORD")
-    REDIS_DB: int = Field(0, env="REDIS_DB")
-    REDIS_MAX_CONNECTIONS: int = Field(20, env="REDIS_MAX_CONNECTIONS")
+    REDIS_HOST: str = Field(..., validation_alias="REDIS_HOST")
+    REDIS_PORT: int = Field(..., validation_alias="REDIS_PORT")
+    REDIS_PASSWORD: str = Field(..., validation_alias="REDIS_PASSWORD")
+    REDIS_DB: int = Field(0, validation_alias="REDIS_DB")
+    REDIS_MAX_CONNECTIONS: int = Field(20, validation_alias="REDIS_MAX_CONNECTIONS")
+
     @property
     def redis_url(self) -> str:
-        """生成Redis连接URL"""
         return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOST}:{self.REDIS_PORT}/{self.REDIS_DB}"
 
-    @validator('DB_PORT', 'REDIS_PORT')
-    def validate_port(cls, v):
+    # Pydantic 2.x validator syntax
+    @field_validator('DB_PORT', 'REDIS_PORT')
+    @classmethod
+    def validate_port(cls, v: int) -> int:
         if not 1 <= v <= 65535:
             raise ValueError('Port must be between 1 and 65535')
         return v
 
-    @validator('DB_POOL_SIZE', 'DB_POOL_RECYCLE', 'REDIS_MAX_CONNECTIONS')
-    def validate_positive_int(cls, v, field):
+    @field_validator('DB_POOL_SIZE', 'DB_POOL_RECYCLE', 'REDIS_MAX_CONNECTIONS')
+    @classmethod
+    def validate_positive_int(cls, v: int) -> int:
         if v <= 0:
-            raise ValueError(f'{field.name} must be positive')
+            raise ValueError('Value must be positive')
         return v
 
-    @validator('ROCKETMQ_WAIT_SECONDS')
-    def validate_rocketmq_wait_seconds(cls, v):
+    @field_validator('ROCKETMQ_WAIT_SECONDS')
+    @classmethod
+    def validate_rocketmq_wait_seconds(cls, v: int) -> int:
         if not 1 <= v <= 30:
             raise ValueError('ROCKETMQ_WAIT_SECONDS must be between 1 and 30')
         return v
 
-    @validator('ROCKETMQ_BATCH')
-    def validate_rocketmq_batch(cls, v):
+    @field_validator('ROCKETMQ_BATCH')
+    @classmethod
+    def validate_rocketmq_batch(cls, v: int) -> int:
         if not 1 <= v <= 16:
             raise ValueError('ROCKETMQ_BATCH must be between 1 and 16')
         return v
 
-    @validator('CONNECTION_TIMEOUT', 'REQUEST_TIMEOUT')
-    def validate_timeouts(cls, v, field):
+    @field_validator('CONNECTION_TIMEOUT', 'REQUEST_TIMEOUT')
+    @classmethod
+    def validate_timeouts(cls, v: int) -> int:
         if v <= 0:
-            raise ValueError(f'{field.name} must be positive')
+            raise ValueError('Timeout must be positive')
         return v
 
-    class Config:
-        env_file = ".env"
-        env_file_encoding = 'utf-8'
-        case_sensitive = False
-
-    # @property
-    # def database_url(self) -> str:
-    #     """生成安全的数据库连接字符串"""
-    #     return f"mysql+asyncmy://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}?charset={self.DB_CHARSET}"
-
 
-settings = Settings()
+settings = Settings()
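
A quick smoke test of the upgraded settings object might look like the sketch below (assumes a populated `.env`; the `exclude` set is illustrative):

```python
# Hypothetical smoke test for the Pydantic 2.x Settings.
from config.base import settings

assert 1 <= settings.DB_PORT <= 65535   # enforced by validate_port
print(settings.redis_url)               # redis://:<password>@<host>:<port>/<db>

# Pydantic 2.x: model_dump() replaces dict(); secrets excluded by hand here.
print(settings.model_dump(exclude={"DB_PASSWORD", "REDIS_PASSWORD"}))
```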

+ 1 - 1
config/spiders_config.yaml

@@ -12,7 +12,7 @@ benshanzhufurecommend:
   method: post
   request_body:
     cursor: "{{next_cursor}}"
-  loop_times: 50
+  loop_times: 7
   loop_interval:
     min: 30
     max: 60

+ 172 - 64
core/base/async_request_client.py

@@ -1,5 +1,6 @@
 import asyncio
-from typing import Optional, Dict
+import json
+from typing import Optional, Dict, Any
 import aiohttp
 
 from core.utils.log.logger_manager import LoggerManager
@@ -7,78 +8,185 @@ from core.utils.log.logger_manager import LoggerManager
 
 class AsyncRequestClient:
     """
-    retries failed requests, with local logging
-    retries when the response code != 0, with local logging
-    reports to Aliyun logging once the retry limit is reached
+    Async HTTP request client with retry support and detailed logging.
+
+    Features:
+    - automatic retries on failure (network errors and business-logic errors)
+    - exponential-backoff retry policy
+    - dual logging: local logs plus Aliyun logs
+    - configurable retry parameters and timeout
     """
-    def __init__(self, logger: Optional[LoggerManager.get_logger()] = None,
-                 aliyun_log: Optional[LoggerManager.get_aliyun_logger()] = None,
-                 max_retries=3, timeout=30):
+
+    class BusinessLogicError(Exception):
+        """业务逻辑错误异常"""
+        pass
+
+    def __init__(
+            self,
+            logger: Optional[Any] = None,
+            aliyun_log: Optional[Any] = None,
+            max_retries: int = 3,
+            base_delay: float = 1.0,
+            timeout: int = 30
+    ):
+        """
+        初始化请求客户端
+
+        Args:
+            logger: 本地日志记录器实例
+            aliyun_log: 阿里云日志记录器实例
+            max_retries: 最大重试次数
+            base_delay: 基础等待时间(秒),用于指数退避计算
+            timeout: 请求超时时间(秒)
+        """
         self.logger = logger
         self.aliyun_log = aliyun_log
         self.max_retries = max_retries
+        self.base_delay = base_delay
         self.timeout = timeout
 
-    async def request(self, session: aiohttp.ClientSession, method: str, url: str, **kwargs) -> Optional[Dict]:
+    async def request(
+            self,
+            session: aiohttp.ClientSession,
+            method: str,
+            url: str,
+            **kwargs
+    ) -> Optional[Dict]:
+        """
+        发送HTTP请求,支持重试机制
+
+        Args:
+            session: aiohttp会话对象
+            method: HTTP方法(GET, POST等)
+            url: 请求URL
+            **kwargs: 其他请求参数
+
+        Returns:
+            Optional[Dict]: 响应数据(JSON格式),失败时返回None
+        """
         retries = 0
-        resp = None  # 初始化resp变量
+        last_error = None
 
-        while retries < self.max_retries:
+        # 记录请求开始
+        self._log_request_start(method, url, kwargs)
+
+        while retries <= self.max_retries:
             try:
-                if self.logger:
-                    self.logger.info(f"request {method} {url}, params {kwargs}")
-                if self.aliyun_log:
-                    self.aliyun_log.logging(
-                        code="1012",
-                        message="initializing request",
-                        data={"url": url,
-                              "method": method,
-                              "requestBody": kwargs}
-                    )
-                async with session.request(method, url, **kwargs) as response:
+                # compute the wait time (exponential backoff)
+                if retries > 0:
+                    delay = self.base_delay * (2 ** (retries - 1))
+                    self._log_retry_delay(retries, delay)
+                    await asyncio.sleep(delay)
+
+                # send the request
+                async with session.request(
+                        method, url,
+                        timeout=aiohttp.ClientTimeout(total=self.timeout),
+                        **kwargs
+                ) as response:
+                    # check the HTTP status
                     response.raise_for_status()
-                    resp = await response.json()
-                    if resp.get('code') != 0:
-                        retries += 1
-                        if self.logger:
-                            self.logger.info(f"{url} response {resp}, retry {retries}/{self.max_retries}")
-                        if retries >= self.max_retries:
-                            error_msg = f"response code != 0 and retry limit {self.max_retries} reached"
-                            if self.logger:
-                                self.logger.error(error_msg)
-                            if self.aliyun_log:
-                                self.aliyun_log.logging(
-                                    code="9006",
-                                    message=error_msg,
-                                    data={
-                                        "url": url,
-                                        "method": method,
-                                        "requestBody": kwargs,
-                                        "response": resp
-                                    }
-                                )
-                        await asyncio.sleep(5)
-                        continue
-                    self.logger.info(f"{url} 响应: {resp}")
-                    return resp
+
+                    # parse the response
+                    resp_data = await response.json()
+
+                    # check the business status code
+                    if resp_data.get('code') != 0:
+                        raise self.BusinessLogicError(
+                            f"business-logic error: code={resp_data.get('code')}"
+                        )
+
+                    # log the successful response
+                    self._log_success_response(url, resp_data)
+                    return resp_data
+
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                last_error = e
+                retries += 1
+                self._log_network_error(e, retries, url)
+
+            except self.BusinessLogicError as e:
+                last_error = e
+                retries += 1
+                self._log_business_error(e, retries, url)
+
             except Exception as e:
+                last_error = e
                 retries += 1
-                if retries >= self.max_retries:
-                    if self.logger:
-                        self.logger.error(f"请求失败达到最大重试次数: {e}")
-                    if self.aliyun_log:
-                        self.aliyun_log.logging(
-                            code="9006",
-                            message="请求异常达到最大重试次数",
-                            data={
-                                "url": url,
-                                "method": method,
-                                "requestBody": kwargs,
-                                "response": str(resp) if resp else str(e),
-                                "error_type": type(e).__name__
-                            }
-                        )
-                    return
-                if self.logger:
-                    self.logger.warning(f"请求失败 {e}, 重试 {retries}/{self.max_retries}")
-                await asyncio.sleep(5)
+                self._log_unexpected_error(e, retries, url)
+
+        # all retries exhausted
+        self._log_final_failure(method, url, kwargs, last_error)
+        return None
+
+    def _log_request_start(self, method: str, url: str, kwargs: Dict):
+        """记录请求开始"""
+        if self.logger:
+            # 简化日志,避免记录过大请求体
+            # log_data = {k: v for k, v in kwargs.items() if k != 'json' and k != 'data'}
+            # if 'json' in kwargs:
+            #     log_data['json'] = '[...]'  # 简化JSON内容
+            self.logger.info(f"请求 {method} {url}, 参数: {kwargs}")
+
+        if self.aliyun_log:
+            self.aliyun_log.logging(
+                code="1012",
+                message="初始化请求",
+                data={
+                    "url": url,
+                    "method": method,
+                    "requestBody": {k: v for k, v in kwargs.items() if k != 'json'}
+                }
+            )
+
+    def _log_retry_delay(self, retries: int, delay: float):
+        """记录重试等待"""
+        if self.logger:
+            self.logger.info(f"第 {retries} 次重试,等待 {delay:.2f} 秒")
+
+    def _log_network_error(self, error: Exception, retries: int, url: str):
+        """记录网络错误"""
+        if self.logger:
+            self.logger.warning(
+                f"请求 {url} 网络错误({retries}/{self.max_retries}): {error}"
+            )
+
+    def _log_business_error(self, error: Exception, retries: int, url: str):
+        """记录业务逻辑错误"""
+        if self.logger:
+            self.logger.warning(
+                f"请求 {url} 业务逻辑错误({retries}/{self.max_retries}): {error}"
+            )
+
+    def _log_unexpected_error(self, error: Exception, retries: int, url: str):
+        """记录未预期错误"""
+        if self.logger:
+            self.logger.error(
+                f"请求 {url} 未预期错误({retries}/{self.max_retries}): {error}"
+            )
+
+    def _log_success_response(self, url: str, response: Dict):
+        """记录成功响应"""
+        if self.logger:
+            # 只记录关键信息,避免日志过大
+            self.logger.info(f"请求 {url} 成功: code={response.get('code')}")
+
+    def _log_final_failure(self, method: str, url: str, kwargs: Dict, error: Exception):
+        """记录最终失败"""
+        error_msg = f"请求 {method} {url} 最终失败: {error}"
+
+        if self.logger:
+            self.logger.error(error_msg)
+
+        if self.aliyun_log:
+            self.aliyun_log.logging(
+                code="9006",
+                message=error_msg,
+                data={
+                    "url": url,
+                    "method": method,
+                    "requestBody": {k: v for k, v in kwargs.items() if k != 'json'},
+                    "error_type": type(error).__name__,
+                    "error_message": str(error)
+                }
+            )
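
A hedged driver for the rewritten client — the endpoint and payload are invented. With `max_retries=3`, the loop makes up to four attempts, sleeping 1s, 2s, and 4s between retries:

```python
# Hypothetical usage of AsyncRequestClient (endpoint and payload are made up).
import asyncio
import aiohttp
from core.base.async_request_client import AsyncRequestClient

async def main():
    client = AsyncRequestClient(max_retries=3, base_delay=1.0, timeout=30)
    async with aiohttp.ClientSession() as session:
        resp = await client.request(
            session, "POST", "http://example.com/crawler/demo/recommend",
            json={"cursor": ""},
        )
        print(resp)  # parsed JSON dict on success, None once retries are exhausted

asyncio.run(main())
```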

+ 0 - 0
tests/__init__.py → core/di/__init__.py


+ 40 - 0
core/di/container.py

@@ -0,0 +1,40 @@
+# core/di/container.py
+from typing import Dict, Any, Type
+from dependency_injector import containers, providers
+from config.base import settings
+from services.async_mysql_service import AsyncMysqlService
+from services.async_mq_producer import AsyncMQProducer
+from services.async_mq_consumer import AsyncRocketMQConsumer
+
+
+class ServiceContainer(containers.DeclarativeContainer):
+    """服务容器:统一管理所有服务的生命周期"""
+
+    # configuration
+    config = providers.Configuration()
+
+    # database service factory
+    db_service = providers.Factory(
+        AsyncMysqlService,
+        platform=config.platform,
+        mode=config.mode
+    )
+
+    # MQ producer factory
+    mq_producer = providers.Singleton(
+        AsyncMQProducer,
+        topic_name="topic_crawler_etl_prod_v2",
+        platform=config.platform,
+        mode=config.mode
+    )
+
+    # MQ consumer factory
+    mq_consumer = providers.Factory(
+        AsyncRocketMQConsumer,
+        topic_name=config.topic_name,
+        group_id=config.group_id
+    )
+
+
+# global container instance
+container = ServiceContainer()
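
A sketch of wiring the container at startup; all configuration values below are placeholders (`from_dict` is the standard dependency-injector way to feed a `providers.Configuration`):

```python
# Hypothetical container wiring; every config value here is a placeholder.
from core.di.container import container

container.config.from_dict({
    "platform": "benshanzhufu",
    "mode": "recommend",
    "topic_name": "topic_crawler_etl_prod_v2",
    "group_id": "GID_demo",
})

db = container.db_service()         # Factory: a new AsyncMysqlService per call
producer = container.mq_producer()  # Singleton: the same producer every call
```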

+ 86 - 0
core/models/message_parser.py

@@ -0,0 +1,86 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Union, List
+import json
+
+
+class RuleField(BaseModel):
+    min: Union[int, float] = 0
+    max: Union[int, float] = 0
+
+
+class RuleModel(BaseModel):
+    period: Optional[RuleField] = None
+    duration: Optional[RuleField] = None
+    play_cnt: Optional[RuleField] = None
+    like_cnt: Optional[RuleField] = None
+    comment_cnt: Optional[RuleField] = None
+    share_cnt: Optional[RuleField] = None
+    videos_cnt: Optional[RuleField] = None
+    width: Optional[RuleField] = None
+    height: Optional[RuleField] = None
+
+
+class MessageModel(BaseModel):
+    id: int
+    rule: str
+    parsed_rule: List[RuleModel] = Field(default_factory=list)
+
+    def model_post_init(self, __context) -> None:
+        # parse the rule string
+        try:
+            if self.rule:
+                rule_list = json.loads(self.rule)
+                if not isinstance(rule_list, list):
+                    rule_list = [rule_list]
+                self.parsed_rule = []
+                for item in rule_list:
+                    try:
+                        self.parsed_rule.append(RuleModel(**item))
+                    except Exception as e:
+                        print(f"failed to parse rule item: {item}, error: {e}")
+        except json.JSONDecodeError as e:
+            print(f"JSON parse error: {e}")
+        except Exception as e:
+            print(f"other error: {e}")
+
+        super().model_post_init(__context)
+
+
+if __name__ == "__main__":
+    message_data = {
+        "createTime": 1747042241127,
+        "id": 147,
+        "interval": 86400,
+        "machine": "",
+        "mode": "related",
+        "operator": "张良",
+        "rule": '[{"videos_cnt":{"min":200,"max":0}},{"duration":{"min":30,"max":1200}}]',
+        "source": "zhongqingkandianrelated",
+        "spiderName": "run_zqkd_related",
+        "startTime": 1758211500000,
+        "status": 0,
+        "taskName": "中青看点相关推荐",
+        "updateTime": 1749178503914
+    }
+
+    try:
+        message = MessageModel(**message_data)
+        print(f"ID: {message.id}")
+        print(f"原始规则字符串: {message.rule}")
+        print("解析后的规则:")
+
+
+        for idx, rule in enumerate(message.parsed_rule, 1):
+            print(f"规则 {idx}:")
+            rule_dict = rule.model_dump()
+            for field_name, field_value in rule_dict.items():
+                if field_value is not None:
+                    print(f"  {field_name}: min={field_value['min']}, max={field_value['max']}")
+
+        if not message.parsed_rule:
+            print("  没有有效的规则")
+
+    except Exception as e:
+        print(f"解析失败: {e}")
+        import traceback
+        traceback.print_exc()

+ 32 - 23
core/models/spiders_config_models.py

@@ -1,25 +1,29 @@
-from pydantic import BaseModel, AnyUrl, validator
+# spiders_config_models.py (updated)
+from pydantic import BaseModel, AnyUrl, field_validator, ConfigDict
 from typing import Dict, Any, Optional, Union
 
 
 class BaseConfig(BaseModel):
-    base_url: Optional[AnyUrl]
+    model_config = ConfigDict(extra='forbid')  # forbid extra fields
+
+    base_url: Optional[AnyUrl] = None
     request_timeout: int = 30
     max_retries: int = 3
     headers: Dict[str, Any] = {}
 
-    @validator('request_timeout', 'max_retries')
-    def validate_positive_int(cls, v, field):
+    @field_validator('request_timeout', 'max_retries')
+    @classmethod
+    def validate_positive_int(cls, v: int) -> int:
         if v <= 0:
-            raise ValueError(f'{field.name} must be positive')
+            raise ValueError('Value must be positive')
         return v
 
 
 class PlatformConfig(BaseConfig):
     platform: str
     mode: str
-    path: Optional[str]
-    url: AnyUrl
+    path: Optional[str] = None
+    url: str
     method: str
     request_body: Dict[str, Any] = {}
     loop_times: int = 1
@@ -28,21 +32,24 @@ class PlatformConfig(BaseConfig):
     retry_times: int = 0
     feishu_sheetid: Optional[str] = None
 
-    @validator('method')
-    def validate_method(cls, v):
+    @field_validator('method')
+    @classmethod
+    def validate_method(cls, v: str) -> str:
         allowed_methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH']
         if v.upper() not in allowed_methods:
             raise ValueError(f'Method must be one of {", ".join(allowed_methods)}')
         return v.upper()
 
-    @validator('loop_times')
-    def validate_loop_times(cls, v):
+    @field_validator('loop_times')
+    @classmethod
+    def validate_loop_times(cls, v: int) -> int:
         if v <= 0:
             raise ValueError('loop_times must be positive')
         return v
 
-    @validator('loop_interval')
-    def validate_loop_interval(cls, v):
+    @field_validator('loop_interval')
+    @classmethod
+    def validate_loop_interval(cls, v: Dict[str, int]) -> Dict[str, int]:
         if 'min' not in v or 'max' not in v:
             raise ValueError('loop_interval must contain both min and max keys')
         if v['min'] < 0 or v['max'] < 0:
@@ -51,33 +58,35 @@ class PlatformConfig(BaseConfig):
             raise ValueError('min value cannot be greater than max value')
         return v
 
-    @validator('response_parse')
-    def validate_response_parse(cls, v):
+    @field_validator('response_parse')
+    @classmethod
+    def validate_response_parse(cls, v: Dict[str, Any]) -> Dict[str, Any]:
         if 'data_path' not in v:
             raise ValueError('response_parse must contain data_path')
         return v
 
-    @validator('retry_times')
-    def validate_retry_times(cls, v):
+    @field_validator('retry_times')
+    @classmethod
+    def validate_retry_times(cls, v: int) -> int:
         if v < 0:
             raise ValueError('retry_times must be non-negative')
         return v
 
-    @validator('request_body')
-    def validate_request_body(cls, v):
-        # ensure request_body values are primitives or dicts/lists
+    @field_validator('request_body')
+    @classmethod
+    def validate_request_body(cls, v: Dict[str, Any]) -> Dict[str, Any]:
         if not isinstance(v, dict):
             raise ValueError('request_body must be a dictionary')
-        
+
         def is_valid_type(value):
             if isinstance(value, (str, int, float, bool, type(None))):
                 return True
             elif isinstance(value, (list, tuple)):
                 return all(is_valid_type(item) for item in value)
             elif isinstance(value, dict):
-                return all(isinstance(k, str) and is_valid_type(v) for k, v in value.items())
+                return all(isinstance(k, str) and is_valid_type(v_val) for k, v_val in value.items())
             return False
-            
+
         for key, value in v.items():
             if not is_valid_type(value):
                 raise ValueError(f'Invalid type for request_body["{key}"]: {type(value)}')
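
A round trip through the upgraded validators, as a sketch with invented values:

```python
# Hypothetical PlatformConfig instantiation exercising the field validators.
from core.models.spiders_config_models import PlatformConfig

cfg = PlatformConfig(
    platform="benshanzhufu",
    mode="recommend",
    url="http://example.com/crawler/ben_shan_zhu_fu/recommend",
    method="post",                                # normalized to "POST"
    loop_times=7,
    loop_interval={"min": 30, "max": 60},         # min <= max enforced
    response_parse={"data_path": "$.data.data"},  # data_path is required
)
print(cfg.method)  # "POST"
```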

+ 26 - 34
core/models/video_item.py

@@ -1,31 +1,27 @@
+# video_item.py (updated)
 import time
 import uuid
 from typing import Optional, Union
-
-from pydantic import BaseModel, Field, validator
-
+from pydantic import BaseModel, Field, field_validator, ConfigDict
 from services.clean_title import clean_title
 
 
 class VideoItem(BaseModel):
-    """
-    视频数据结构,支持字段校验和预处理逻辑
-    - 字段初始化后可通过 `prepare()` 异步方法补全和清洗数据
-    - 使用 `produce_item()` 返回最终有效数据 dict
-    """
-
-    video_id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
-    user_id: str
+    # Pydantic 2.x configuration
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    video_id: Optional[Union[int, str]] = Field(default_factory=lambda: str(uuid.uuid4()))
+    user_id: Union[int, str]
     user_name: str
-    out_video_id: str
-    out_user_id: Optional[str]
+    out_video_id: Union[int, str]
+    out_user_id: Optional[Union[int, str]]
     video_url: str
     cover_url: str
     platform: str
     strategy: str
-    session: Optional[str]
+    session: Optional[str] = None
 
-    video_title: Optional[str]
+    video_title: Optional[str] = None
     publish_time_stamp: Optional[Union[int, str]] = None
     update_time_stamp: Optional[Union[int, str]] = None
 
@@ -40,30 +36,29 @@ class VideoItem(BaseModel):
     publish_time_str: Optional[str] = None
     publish_time: Optional[str] = None
 
-    # validators ensuring numeric fields are non-negative
-    @validator('duration', 'play_cnt', 'like_cnt', 'comment_cnt', 'share_cnt', 'width', 'height')
-    def validate_non_negative(cls, v, field):
+
+    # Pydantic 2.x validator syntax
+    @field_validator('duration', 'play_cnt', 'like_cnt', 'comment_cnt', 'share_cnt', 'width', 'height')
+    @classmethod
+    def validate_non_negative(cls, v: int) -> int:
         if v < 0:
-            raise ValueError(f'{field.name} must be non-negative')
+            raise ValueError('Value must be non-negative')
         return v
 
-    @validator('video_url', 'cover_url')
-    def validate_url(cls, v, field):
+    @field_validator('video_url', 'cover_url')
+    @classmethod
+    def validate_url(cls, v: str) -> str:
         if v and not (v.startswith('http://') or v.startswith('https://')):
-            raise ValueError(f'{field.name} must be a valid URL')
+            raise ValueError('URL must start with http:// or https://')
         return v
 
+
     async def prepare(self):
-        """
-        异步预处理:清洗标题、补全发布时间和更新时间
-        """
-        # 标题清洗
+        """异步预处理(保持不变)"""
         if self.video_title:
             self.video_title = await clean_title(self.video_title)
 
-        # publish-time handling
         if self.publish_time_stamp:
-            # ensure publish_time_stamp is an integer
             if isinstance(self.publish_time_stamp, str):
                 try:
                     if len(self.publish_time_stamp) == 13:
@@ -82,11 +77,9 @@ class VideoItem(BaseModel):
         )
         self.publish_time = self.publish_time_str
 
-        # update timestamp defaults to the current time
         if not self.update_time_stamp:
             self.update_time_stamp = int(time.time())
         else:
-            # ensure update_time_stamp is an integer
             if isinstance(self.update_time_stamp, str):
                 try:
                     if len(self.update_time_stamp) == 13:
@@ -102,9 +95,7 @@ class VideoItem(BaseModel):
             self.session = str(f"{self.platform}_{int(time.time())}")
 
     async def produce_item(self) -> Optional[dict]:
-        """
-        异步生成最终数据字典,校验必要字段是否存在,返回 None 则不合格
-        """
+        """异步生成最终数据字典"""
         await self.prepare()
 
         must_fields = [
@@ -115,4 +106,5 @@ class VideoItem(BaseModel):
             if not getattr(self, f, None):
                 return None
 
-        return self.dict()
+        # Pydantic 2.x: use model_dump() instead of dict()
+        return self.model_dump()
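
An end-to-end sketch of the updated item flow (all field values invented; `produce_item()` returns `None` when a required field is missing):

```python
# Hypothetical VideoItem round trip under Pydantic 2.x.
import asyncio
from core.models.video_item import VideoItem

async def demo():
    item = VideoItem(
        user_id=123, user_name="demo", out_video_id="nid_1", out_user_id="nid_1",
        video_url="https://example.com/v.mp4", cover_url="https://example.com/c.jpg",
        platform="benshanzhufu", strategy="recommend",
        video_title="demo title", publish_time_stamp="1747042241",
    )
    data = await item.produce_item()  # model_dump() result, or None
    print(data and data["video_title"])

asyncio.run(demo())
```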

+ 6 - 0
core/utils/codes.py

@@ -27,6 +27,11 @@ CODES = {
     "1011": "视频数量达到当日最大值",
     "1012": "实时入库量",
 
+    # 爬虫过程(2xxx)
+    ""
+    "2000": "请求到的数据长度",
+
+
     # 配置错误类 (4xxx)
     "4000": "爬虫配置缺失",
     "4001": "ETL配置缺失",
@@ -55,6 +60,7 @@ CODES = {
     "9019": "任务参数缺失",
     "9020": "过滤条件不匹配",
     "9021": "接口返回数据为空",
+    "9022": "爬虫执行失败",
 
     # 系统致命错误类 (99xx)
     "9900": "数据库连接失败",

+ 0 - 219
core/utils/config_documentation.py

@@ -1,219 +0,0 @@
-"""
-配置文档生成工具
-自动生成配置文件说明文档
-"""
-import yaml
-from core.utils.config_manager import get_config_manager
-from core.utils.path_utils import spiders_config_path
-
-
-class ConfigDocumentation:
-    """
-    配置文档生成工具
-    """
-    
-    def __init__(self):
-        self.config_manager = get_config_manager()
-
-    def generate_env_config_docs(self) -> str:
-        """
-        生成环境配置文档
-        """
-        docs = "# 环境配置说明\n\n"
-        docs += "环境配置通过 `.env` 文件进行配置,以下为所有可配置项:\n\n"
-        
-        env_settings_info = {
-            "ENV": {
-                "description": "运行环境",
-                "default": "prod",
-                "options": ["prod", "dev"]
-            },
-            "DB_HOST": {
-                "description": "数据库主机地址",
-                "required": True
-            },
-            "DB_PORT": {
-                "description": "数据库端口",
-                "default": 3306
-            },
-            "DB_USER": {
-                "description": "数据库用户名",
-                "required": True
-            },
-            "DB_PASSWORD": {
-                "description": "数据库密码",
-                "required": True
-            },
-            "DB_NAME": {
-                "description": "数据库名称",
-                "required": True
-            },
-            "DB_CHARSET": {
-                "description": "数据库字符集",
-                "required": True
-            },
-            "ROCKETMQ_ENDPOINT": {
-                "description": "RocketMQ接入点",
-                "required": True
-            },
-            "ROCKETMQ_ACCESS_KEY_ID": {
-                "description": "RocketMQ访问密钥ID",
-                "required": True
-            },
-            "ROCKETMQ_ACCESS_KEY_SECRET": {
-                "description": "RocketMQ访问密钥",
-                "required": True
-            },
-            "FEISHU_APPID": {
-                "description": "飞书应用ID",
-                "required": True
-            },
-            "FEISHU_APPSECRET": {
-                "description": "飞书应用密钥",
-                "required": True
-            },
-            "ALIYUN_ACCESS_KEY_ID": {
-                "description": "阿里云访问密钥ID",
-                "required": True
-            },
-            "ALIYUN_ACCESS_KEY_SECRET": {
-                "description": "阿里云访问密钥",
-                "required": True
-            },
-            "REDIS_HOST": {
-                "description": "Redis主机地址",
-                "required": True
-            },
-            "REDIS_PORT": {
-                "description": "Redis端口",
-                "default": 6379
-            },
-            "REDIS_PASSWORD": {
-                "description": "Redis密码",
-                "required": True
-            }
-        }
-        
-        docs += "| 配置项 | 描述 | 是否必填 | 默认值 |\n"
-        docs += "|--------|------|----------|--------|\n"
-        
-        for key, info in env_settings_info.items():
-            description = info.get("description", "")
-            required = "是" if info.get("required", False) else "否"
-            default = str(info.get("default", "")) if info.get("default") is not None else ""
-            options = ", ".join(info.get("options", []))
-            if options:
-                description += f" (可选值: {options})"
-                
-            docs += f"| {key} | {description} | {required} | {default} |\n"
-            
-        return docs
-
-    def generate_spider_config_docs(self) -> str:
-        """
-        生成爬虫配置文档
-        """
-        docs = "# 爬虫配置说明\n\n"
-        docs += "爬虫配置通过 `config/spiders_config.yaml` 文件进行配置。\n\n"
-        
-        # 添加配置示例
-        docs += "## 配置示例\n\n```yaml\n"
-        with open(spiders_config_path, 'r', encoding='utf-8') as f:
-            docs += f.read()
-        docs += "```\n\n"
-        
-        # 添加字段说明
-        docs += "## 字段说明\n\n"
-        
-        global_config_fields = {
-            "base_url": "base URL used to build the full request URL",
-            "request_timeout": "request timeout (seconds)",
-            "max_retries": "maximum number of retries",
-            "headers": "request headers"
-        }
-        
-        platform_config_fields = {
-            "platform": "platform name",
-            "mode": "crawl mode (e.g. recommend, author)",
-            "path": "API path",
-            "url": "full request URL",
-            "method": "HTTP request method",
-            "request_body": "request body parameters",
-            "loop_times": "number of loop iterations",
-            "loop_interval": "loop interval (min/max)",
-            "response_parse": "response parsing configuration",
-            "feishu_sheetid": "Feishu sheet ID"
-        }
-        
-        response_parse_fields = {
-            "data_path": "path to the data list",
-            "next_cursor": "path to the next-page cursor",
-            "has_more": "path to the has-more flag",
-            "fields": "field mapping configuration"
-        }
-        
-        docs += "### 全局配置字段\n\n"
-        docs += "| 字段 | 描述 |\n"
-        docs += "|------|------|\n"
-        for field, description in global_config_fields.items():
-            docs += f"| {field} | {description} |\n"
-            
-        docs += "\n### 平台配置字段\n\n"
-        docs += "| 字段 | 描述 |\n"
-        docs += "|------|------|\n"
-        for field, description in platform_config_fields.items():
-            docs += f"| {field} | {description} |\n"
-            
-        docs += "\n### 响应解析字段\n\n"
-        docs += "| 字段 | 描述 |\n"
-        docs += "|------|------|\n"
-        for field, description in response_parse_fields.items():
-            docs += f"| {field} | {description} |\n"
-            
-        return docs
-
-    def generate_complete_docs(self) -> str:
-        """
-        生成完整配置文档
-        """
-        docs = "# AutoScraperX 配置说明\n\n"
-        docs += "本文档详细说明了AutoScraperX项目的配置项。\n\n"
-        docs += "---\n\n"
-        docs += self.generate_env_config_docs()
-        docs += "\n---\n\n"
-        docs += self.generate_spider_config_docs()
-        docs += "\n---\n\n"
-        docs += "## 当前配置状态\n\n"
-        
-        try:
-            stats = self.config_manager.get_config_stats()
-            docs += f"- 平台配置数量: {stats['total_platforms']}\n"
-            docs += f"- 运行环境: {stats['env']}\n"
-            docs += f"- 配置文件路径: {stats['config_file']}\n"
-        except Exception as e:
-            docs += f"配置状态获取失败: {e}\n"
-            
-        return docs
-
-    def save_docs(self, filepath: str = "CONFIGURATION.md"):
-        """
-        保存文档到文件
-        """
-        docs = self.generate_complete_docs()
-        with open(filepath, 'w', encoding='utf-8') as f:
-            f.write(docs)
-        return filepath
-
-
-def generate_config_docs():
-    """
-    生成配置文档
-    """
-    doc_generator = ConfigDocumentation()
-    filepath = doc_generator.save_docs()
-    print(f"配置文档已保存到: {filepath}")
-    return filepath
-
-
-if __name__ == "__main__":
-    generate_config_docs()

+ 0 - 184
core/utils/config_health_check.py

@@ -1,184 +0,0 @@
-"""
-配置健康检查工具
-用于验证配置文件的完整性和正确性
-"""
-import sys
-from typing import List, Dict, Any
-from core.utils.config_manager import get_config_manager
-from core.utils.spider_config import SpiderConfig
-from config import settings
-
-
-class ConfigHealthCheck:
-    """
-    配置健康检查工具
-    """
-    
-    def __init__(self):
-        self.config_manager = get_config_manager()
-        self.errors = []
-        self.warnings = []
-
-    def check_env_config(self) -> bool:
-        """
-        检查环境配置
-        """
-        try:
-            # 检查必要配置是否存在
-            required_settings = [
-                'DB_HOST', 'DB_USER', 'DB_PASSWORD', 'DB_NAME',
-                'ROCKETMQ_ENDPOINT', 'ROCKETMQ_ACCESS_KEY_ID', 'ROCKETMQ_ACCESS_KEY_SECRET',
-                'FEISHU_APPID', 'FEISHU_APPSECRET',
-                'ALIYUN_ACCESS_KEY_ID', 'ALIYUN_ACCESS_KEY_SECRET',
-                'REDIS_HOST', 'REDIS_PASSWORD'
-            ]
-            
-            for setting in required_settings:
-                if not getattr(settings, setting, None):
-                    self.errors.append(f"环境配置缺失: {setting}")
-            
-            # 检查URL格式
-            url_settings = ['ROCKETMQ_ENDPOINT']
-            for setting in url_settings:
-                url = getattr(settings, setting, None)
-                if url and not isinstance(url, str):
-                    self.errors.append(f"URL配置格式错误: {setting}")
-            
-            return len(self.errors) == 0
-            
-        except Exception as e:
-            self.errors.append(f"环境配置检查异常: {str(e)}")
-            return False
-
-    def check_spider_configs(self) -> bool:
-        """
-        检查所有爬虫配置
-        """
-        try:
-            platforms = self.config_manager.list_platforms()
-            if not platforms:
-                self.warnings.append("未找到任何平台配置")
-                return True
-                
-            valid_count = 0
-            for platform in platforms:
-                try:
-                    config = self.config_manager.get_platform_config(platform)
-                    # validate config fields
-                    if not config.platform:
-                        self.errors.append(f"platform {platform} is missing the platform field")
-                    if not config.mode:
-                        self.errors.append(f"platform {platform} is missing the mode field")
-                    if not config.url:
-                        self.errors.append(f"platform {platform} is missing the url field")
-                    valid_count += 1
-                except Exception as e:
-                    self.errors.append(f"config validation failed for platform {platform}: {str(e)}")
-            
-            return len(self.errors) == 0
-            
-        except Exception as e:
-            self.errors.append(f"爬虫配置检查异常: {str(e)}")
-            return False
-
-    def check_file_permissions(self) -> bool:
-        """
-        检查配置文件权限
-        """
-        import os
-        from core.utils.path_utils import spiders_config_path
-        
-        try:
-            # check that the spider config file exists
-            if not os.path.exists(spiders_config_path):
-                self.errors.append(f"spider config file not found: {spiders_config_path}")
-                return False
-                
-            # check that the file is readable
-            if not os.access(spiders_config_path, os.R_OK):
-                self.errors.append(f"spider config file not readable: {spiders_config_path}")
-                
-            return len(self.errors) == 0
-            
-        except Exception as e:
-            self.errors.append(f"文件权限检查异常: {str(e)}")
-            return False
-
-    def run_all_checks(self) -> Dict[str, Any]:
-        """
-        运行所有检查
-        """
-        self.errors.clear()
-        self.warnings.clear()
-        
-        env_ok = self.check_env_config()
-        spider_ok = self.check_spider_configs()
-        file_ok = self.check_file_permissions()
-        
-        overall_ok = env_ok and spider_ok and file_ok
-        
-        return {
-            "success": overall_ok,
-            "errors": self.errors.copy(),
-            "warnings": self.warnings.copy(),
-            "details": {
-                "env_config": env_ok,
-                "spider_configs": spider_ok,
-                "file_permissions": file_ok
-            }
-        }
-
-    def print_report(self):
-        """
-        打印健康检查报告
-        """
-        result = self.run_all_checks()
-        
-        print("=" * 50)
-        print("配置健康检查报告")
-        print("=" * 50)
-        
-        if result["success"]:
-            print("✓ 所有配置检查通过")
-        else:
-            print("✗ 配置存在问题")
-            
-        print(f"\n详细信息:")
-        print(f"  环境配置: {'✓' if result['details']['env_config'] else '✗'}")
-        print(f"  爬虫配置: {'✓' if result['details']['spider_configs'] else '✗'}")
-        print(f"  文件权限: {'✓' if result['details']['file_permissions'] else '✗'}")
-        
-        if result["warnings"]:
-            print(f"\n警告:")
-            for warning in result["warnings"]:
-                print(f"  - {warning}")
-                
-        if result["errors"]:
-            print(f"\n错误:")
-            for error in result["errors"]:
-                print(f"  - {error}")
-        
-        print("\n统计信息:")
-        try:
-            stats = self.config_manager.get_config_stats()
-            print(f"  平台数量: {stats['total_platforms']}")
-            print(f"  运行环境: {stats['env']}")
-        except Exception as e:
-            print(f"  统计信息获取失败: {e}")
-            
-        print("=" * 50)
-        
-        return result
-
-
-def run_health_check():
-    """
-    运行配置健康检查
-    """
-    checker = ConfigHealthCheck()
-    return checker.print_report()
-
-
-if __name__ == "__main__":
-    result = run_health_check()
-    sys.exit(0 if result["success"] else 1)

+ 0 - 129
core/utils/config_manager.py

@@ -1,129 +0,0 @@
-"""
-配置管理服务
-统一管理环境配置和爬虫配置
-"""
-import json
-from typing import Dict, Any, Optional
-from core.utils.spider_config import SpiderConfig
-from core.models.spiders_config_models import PlatformConfig
-
-
-class ConfigManager:
-    """
-    统一配置管理器
-    提供对环境配置和爬虫配置的统一访问接口
-    """
-    
-    def __init__(self):
-        # import settings lazily to avoid a circular import
-        from config import settings
-        self._env_settings = settings
-        self._spider_config = SpiderConfig
-
-    @property
-    def env_settings(self):
-        """
-        获取环境配置
-        """
-        return self._env_settings
-
-    def get_platform_config(self, platform_name: str) -> PlatformConfig:
-        """
-        获取平台爬虫配置
-        """
-        return self._spider_config.get_platform_config(platform_name)
-
-    def list_platforms(self) -> list:
-        """
-        获取所有平台列表
-        """
-        return self._spider_config.list_all_platforms()
-
-    def reload_spider_configs(self):
-        """
-        重新加载爬虫配置
-        """
-        self._spider_config.reload_config()
-
-    def get_config_stats(self) -> Dict[str, Any]:
-        """
-        获取配置统计信息
-        """
-        stats = self._spider_config.get_config_stats()
-        stats["env"] = self._env_settings.ENV
-        return stats
-
-    def validate_platform_config(self, platform_name: str) -> bool:
-        """
-        验证平台配置是否有效
-        """
-        try:
-            self.get_platform_config(platform_name)
-            return True
-        except Exception:
-            return False
-
-    def export_configs(self) -> Dict[str, Any]:
-        """
-        导出所有配置信息(用于调试和监控)
-        """
-        return {
-            "env_settings": {
-                "env": self._env_settings.ENV,
-                "log_level": self._env_settings.LOG_LEVEL,
-                "db_host": self._env_settings.DB_HOST,
-                "rocketmq_endpoint": str(self._env_settings.ROCKETMQ_ENDPOINT),
-                # excludes sensitive values such as passwords and keys
-            },
-            "spider_configs": self.list_platforms(),
-            "stats": self.get_config_stats()
-        }
-
-    def get_platform_configs_summary(self) -> Dict[str, Dict[str, Any]]:
-        """
-        获取所有平台配置摘要信息
-        """
-        platforms = self.list_platforms()
-        summary = {}
-        
-        for platform in platforms:
-            try:
-                config = self.get_platform_config(platform)
-                summary[platform] = {
-                    "platform": config.platform,
-                    "mode": config.mode,
-                    "method": config.method,
-                    "url": str(config.url),
-                    "loop_times": config.loop_times,
-                }
-            except Exception as e:
-                summary[platform] = {
-                    "error": str(e)
-                }
-                
-        return summary
-
-    async def reload_configs_runtime(self):
-        """
-        运行时重新加载配置(支持不重启服务的情况下重新加载配置)
-        这个方法可以在接收到特定信号或API调用时被调用
-        """
-        try:
-            # reload the spider configuration
-            self.reload_spider_configs()
-            return True
-        except Exception as e:
-            # log the error
-            print(f"runtime config reload failed: {e}")
-            return False
-
-
-# global configuration manager instance
-config_manager = ConfigManager()
-
-
-def get_config_manager() -> ConfigManager:
-    """
-    获取配置管理器实例
-    """
-    return config_manager

+ 0 - 3
core/utils/feishu_data_async.py

@@ -138,10 +138,7 @@ class FeishuDataAsync:
                     },
 
             }
-        print(payload)
-
         async with self.session.post(url, headers=headers, json=payload,ssl=False) as response:
-            print(response)
             if response.status != 200:
                 error_text = await response.text()
             if response.status != 200:

+ 38 - 7
core/utils/helpers.py

@@ -2,7 +2,7 @@ import asyncio
 
 from typing import List, Dict
 
-from datetime import datetime
+from datetime import datetime, timedelta
 from core.utils.feishu_data_async import FeishuDataAsync
 from core.utils.gpt4o_mini_help import GPT4oMini
 
@@ -17,13 +17,28 @@ async def get_title_filter_word() -> List[str]:
     sheet_id = "BS9uyu"
     async with FeishuDataAsync() as feishu:
         feishu_data = await feishu.get_values(spreadsheet_token=spreadsheet_token, sheet_id=sheet_id)
-        return feishu_data[1]
+        for row in feishu_data[1:]:
+            title_rule = row[0]
+            if title_rule:
+                return title_rule.split(",")
+            else:
+                return None
+        return None
 
 async def generate_titles(sheet_id: str,video_obj: Dict,logger,aliyun_log):
     title_filter_word = await get_title_filter_word()
     title = video_obj.get("video_title")
     if not title:
         return video_obj
+    # ensure title and every element of title_filter_word are strings
+    if title is None:
+        title = ""
+    else:
+        title = str(title)
+
+    # drop None entries from title_filter_word and coerce the rest to strings
+    title_filter_word = [str(keyword) for keyword in title_filter_word if keyword is not None]
+    
     contains_keyword = any(keyword in title for keyword in title_filter_word)
     logger.info(f"【{title}】标题包含过滤关键词:{contains_keyword}")
     if contains_keyword:
@@ -33,8 +48,8 @@ async def generate_titles(sheet_id: str,video_obj: Dict,logger,aliyun_log):
         current_time = datetime.now()
         formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
         values = [
-                video_obj["v_url"],
-                video_obj["url"],
+                video_obj["video_url"],
+                video_obj["cover_url"],
                 title,
                 new_title,
                 formatted_time,
@@ -47,9 +62,25 @@ async def insert_safe_data(sheet_id: str, values: List):
     async with FeishuDataAsync() as feishu:
         await feishu.insert_values(spreadsheet_token=spreadsheet_token, sheet_id=sheet_id,ranges="A2:Z2",values=values)
 
+async def is_near_next_day(threshold_minutes: int = 3) -> bool:
+    """Return True when the current time is within `threshold_minutes` of midnight."""
+    now = datetime.now()
+    tomorrow = now.date() + timedelta(days=1)
+    midnight = datetime.combine(tomorrow, datetime.min.time())
+    time_left = midnight - now
+
+    if time_left.total_seconds() < threshold_minutes * 60:
+        return True
+
+    return False
+
+
 
 
 
-if __name__ == '__main__':
-     asyncio.run(insert_safe_data("K0gA9Y", ["1","2","3","4","5"]))
-    
+# if __name__ == '__main__':
+#      # asyncio.run(insert_safe_data("K0gA9Y", ["1","2","3","4","5"]))
+#       print(asyncio.run(get_title_filter_word()))

+ 3 - 2
core/utils/log/log_codes.py

@@ -14,8 +14,8 @@ codes.py
 CODES = {
     # success (1xxx)
     "1000": "task received successfully",
-    "1001": "data collected successfully",
-    "1002": "video processed successfully",
+    "1001": "fetched one record",
+    "1002": "pushed to ETL successfully",
     "1003": "spider execution metrics summary",
     "1004": "no data fetched",
     "1005": "video filtered successfully",
@@ -74,6 +74,7 @@ CODES = {
     "9023": "请求返回code非0",
     "9024": "字段提取失败",
     "9025": "接口返回为空",
+    "9026": "处理数据过程异常",
 
     # 系统致命错误 (99xx)
     "9900": "数据库连接失败",

+ 87 - 0
core/utils/log_codes.py

@@ -0,0 +1,87 @@
+# core/utils/log_codes.py
+"""
+阿里云日志码统一管理
+采用分层分类的方式管理所有日志码
+"""
+
+from enum import Enum
+from typing import Dict, Any
+
+
+class LogCategory(Enum):
+    """日志分类"""
+    SYSTEM = "系统"  # 系统相关日志
+    TASK = "任务"  # 任务处理日志
+    DATA = "数据"  # 数据处理日志
+    ERROR = "错误"  # 错误日志
+    MONITOR = "监控"  # 监控日志
+
+
+class LogCodeManager:
+    """日志码管理器"""
+
+    # system codes (1000-1999)
+    SYSTEM_START = ("1000", "system started", LogCategory.SYSTEM)
+    SYSTEM_SHUTDOWN = ("1001", "system shut down", LogCategory.SYSTEM)
+    SYSTEM_HEALTHY = ("1002", "system health check", LogCategory.SYSTEM)
+
+    # task codes (2000-2999)
+    TASK_RECEIVED = ("2000", "task received", LogCategory.TASK)
+    TASK_START = ("2001", "task started", LogCategory.TASK)
+    TASK_SUCCESS = ("2010", "task succeeded", LogCategory.TASK)
+    TASK_FAILED = ("2011", "task failed", LogCategory.TASK)
+
+    # data-processing codes (3000-3999)
+    DATA_PROCESS_START = ("3000", "data processing started", LogCategory.DATA)
+    DATA_PROCESS_SUCCESS = ("3001", "data processed successfully", LogCategory.DATA)
+    DATA_VALIDATION_FAILED = ("3002", "data validation failed", LogCategory.DATA)
+    DATA_TRANSFORM_SUCCESS = ("3003", "data transformed successfully", LogCategory.DATA)
+
+    # error codes (9000-9999)
+    ERROR_GENERAL = ("9000", "general error", LogCategory.ERROR)
+    ERROR_NETWORK = ("9001", "network error", LogCategory.ERROR)
+    ERROR_DATABASE = ("9002", "database error", LogCategory.ERROR)
+    ERROR_CONFIG = ("9003", "configuration error", LogCategory.ERROR)
+
+    # monitoring codes (1500-1599)
+    MONITOR_PROCESS_START = ("1500", "process started", LogCategory.MONITOR)
+    MONITOR_PROCESS_RESTART = ("1501", "process restarted", LogCategory.MONITOR)
+    MONITOR_PROCESS_STOP = ("1502", "process stopped", LogCategory.MONITOR)
+
+    @classmethod
+    def get_code_info(cls, code: str) -> Optional[Dict[str, Any]]:
+        """根据code获取日志信息"""
+        for attr_name in dir(cls):
+            if not attr_name.startswith('_') and attr_name.isupper():
+                code_info = getattr(cls, attr_name)
+                if isinstance(code_info, tuple) and code_info[0] == code:
+                    return {
+                        "code": code_info[0],
+                        "message": code_info[1],
+                        "category": code_info[2].value
+                    }
+        return None
+
+    @classmethod
+    def get_codes_by_category(cls, category: LogCategory) -> List[Dict[str, Any]]:
+        """根据分类获取所有日志码"""
+        codes = []
+        for attr_name in dir(cls):
+            if not attr_name.startswith('_') and attr_name.isupper():
+                code_info = getattr(cls, attr_name)
+                if isinstance(code_info, tuple) and code_info[2] == category:
+                    codes.append({
+                        "name": attr_name,
+                        "code": code_info[0],
+                        "message": code_info[1]
+                    })
+        return codes
+
+    @classmethod
+    def validate_code(cls, code: str) -> bool:
+        """验证日志码是否存在"""
+        return cls.get_code_info(code) is not None
+
+
+# 全局实例
+log_codes = LogCodeManager()

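Assuming the new module is importable as `core.utils.log_codes`, the lookup helpers behave as below (illustrative usage, not part of the commit):

```python
from core.utils.log_codes import LogCategory, LogCodeManager

# Reverse lookup by numeric code string.
print(LogCodeManager.get_code_info("2010"))
# {'code': '2010', 'message': '任务执行成功', 'category': '任务'}

# Enumerate every code registered under one category.
for entry in LogCodeManager.get_codes_by_category(LogCategory.MONITOR):
    print(entry["code"], entry["name"], entry["message"])

# Guard against unknown codes before logging.
assert LogCodeManager.validate_code("9001")
assert not LogCodeManager.validate_code("4242")
```
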
+ 7 - 32
core/utils/spider_config.py

@@ -1,4 +1,4 @@
-# spider_config.py
+# core/utils/spider_config.py
 import os
 from urllib.parse import urljoin
 import yaml
@@ -12,23 +12,14 @@ class SpiderConfig:
 
     @classmethod
     def _load_yaml(cls):
-        """
-        加载spiders_config.yaml
-        :return:
-        """
         if not os.path.exists(cls._config_path):
             raise FileNotFoundError(f"[配置错误] 找不到配置文件: {cls._config_path}")
-            
-        # 检查文件是否修改过
+
         with open(cls._config_path, "r", encoding="utf-8") as f:
             cls._config = yaml.safe_load(f)
 
     @classmethod
     def get_platform_config(cls, classname: str) -> PlatformConfig:
-        """
-        获取平台配置,并拼接完整 URL
-        支持类方法调用 + 单次加载配置
-        """
         if cls._config is None:
             cls._load_yaml()
 
@@ -38,52 +29,36 @@ class SpiderConfig:
         platform_config = cls._config.get(classname, {})
         base_config = cls._config.get("default", {})
 
-        # 合并配置:平台配置覆盖默认配置
+        # 合并配置
         merged = {**base_config, **platform_config}
 
-        # 自动拼接完整 url(优先用完整 url)
+        # 自动拼接完整 url
         if "url" not in merged and "base_url" in merged and "path" in merged:
             merged["url"] = urljoin(merged["base_url"], merged["path"])
 
-        # 使用 pydantic 进行验证
         try:
+            # Pydantic 2.x 直接实例化
             return PlatformConfig(**merged)
         except Exception as e:
             raise ValueError(f"[配置错误] 平台 {classname} 的配置验证失败: {e}")
 
     @classmethod
     def reload_config(cls):
-        """
-        强制重新加载配置文件
-        """
         cls._config = None
         cls._load_yaml()
 
     @classmethod
     def list_all_platforms(cls):
-        """
-        获取所有平台配置名称列表
-        """
         if cls._config is None:
             cls._load_yaml()
-        platforms = [key for key in cls._config.keys() if key != "default"]
-        return platforms
+        return [key for key in cls._config.keys() if key != "default"]
 
     @classmethod
     def get_config_stats(cls):
-        """
-        获取配置统计信息
-        """
         if cls._config is None:
             cls._load_yaml()
         return {
             "total_platforms": len(cls.list_all_platforms()),
             "last_modified": os.path.getmtime(cls._config_path) if os.path.exists(cls._config_path) else 0,
             "config_file": cls._config_path
-        }
-
-
-# 示例使用
-if __name__ == '__main__':
-    config = SpiderConfig.get_platform_config("yuannifuqimanmanrecommend")
-    print(config)
+        }

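The merge-then-join behaviour kept here is plain dict spreading plus `urllib.parse.urljoin`; a standalone sketch with made-up values:

```python
from urllib.parse import urljoin

default = {"base_url": "http://api.example.com", "request_timeout": 30}
platform = {"platform": "demo", "path": "/crawler/demo/recommend"}

merged = {**default, **platform}  # platform keys override defaults on conflict
if "url" not in merged and "base_url" in merged and "path" in merged:
    merged["url"] = urljoin(merged["base_url"], merged["path"])

print(merged["url"])  # http://api.example.com/crawler/demo/recommend
```
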
+ 140 - 0
core/video_processor.py

@@ -0,0 +1,140 @@
+# core/video_processor.py
+import time
+import traceback
+from typing import Dict, Optional
+from core.models.video_item import VideoItem
+from core.utils.extractors import extract_fields
+from core.utils.helpers import generate_titles
+from core.utils.log.logger_manager import LoggerManager
+from services.pipeline import PiaoQuanPipeline
+import uuid
+
+
+class VideoProcessor:
+    """
+    视频处理器:
+    1、字段统一
+    2、数据过滤
+    3、视频处理(标题生成)
+    """
+
+    def __init__(self, platform: str, mode: str, field_map: Dict,
+                 feishu_sheetid: str = None, logger=None, aliyun_log=None):
+        self.platform = platform
+        self.mode = mode
+        self.field_map = field_map
+        self.feishu_sheetid = feishu_sheetid
+        self.logger = logger or LoggerManager.get_logger(platform=platform, mode=mode)
+        self.aliyun_log = aliyun_log or LoggerManager.get_aliyun_logger(platform=platform, mode=mode)
+
+    async def process_single_video(self, raw_data: Dict, user_info: Dict,
+                                   rule_dict: Dict, env: str = "prod") -> Optional[Dict]:
+        """
+        完整的单条视频处理流程
+        """
+        try:
+            # 1. 提取字段并创建视频对象
+            video_obj = await self._create_video_item(raw_data, user_info)
+
+            if not video_obj:
+                return None
+
+            # 2. 数据过滤
+            if not await self._filter_video(video_obj, rule_dict, env):
+                return None
+
+            # 3. 集成视频处理(标题生成等)
+            processed_video = await self._integrated_handling(video_obj)
+            if not processed_video:
+                return None
+
+            return processed_video
+
+        except Exception as e:
+            self.logger.error(f"视频处理失败: {e}")
+            return None
+
+    async def _create_video_item(self, raw_data: Dict, user_info: Dict) -> Optional[Dict]:
+        """创建视频数据对象"""
+        try:
+            # 提取字段
+            item_kwargs = extract_fields(raw_data, self.field_map,
+                                         logger=self.logger,
+                                         aliyun_log=self.aliyun_log)
+
+            if not item_kwargs:
+                self.logger.error("字段提取失败")
+                return None
+
+            # 确保必需的用户信息存在
+            user_id = user_info.get("uid")
+            user_name = user_info.get("nick_name")
+            
+            if not user_id:
+                self.logger.error("用户ID为空")
+                return None
+
+            # 添加平台相关信息
+            item_kwargs.update({
+                "user_id": str(user_id),
+                "user_name": user_name,
+                "platform": self.platform,
+                "strategy": self.mode,
+                "session": f"{self.platform}_{int(time.time())}"  # 确保session字段存在
+            })
+
+            # 创建视频项
+            video_item = VideoItem(**item_kwargs)
+            return await video_item.produce_item()
+
+        except Exception as e:
+            self.logger.error(f"创建视频项失败: {e}")
+            return None
+
+    async def _filter_video(self, video: Dict, rule_dict: Dict, env: str) -> bool:
+        """过滤视频数据"""
+        try:
+            pipeline = PiaoQuanPipeline(
+                platform=self.platform,
+                mode=self.mode,
+                rule_dict=rule_dict,
+                env=env,
+                item=video,
+                trace_id=f"{self.platform}_{uuid.uuid1()}"
+            )
+            return await pipeline.process_item()
+        except Exception as e:
+            self.logger.error(f"过滤视频失败: {e}")
+            return False
+
+    async def _integrated_handling(self, video: Dict) -> Optional[Dict]:
+        """
+        集成视频处理 - 标题生成等业务逻辑
+        返回处理后的视频数据,如果处理失败返回None
+        """
+        try:
+            if self.feishu_sheetid:
+                # 调用标题生成逻辑
+                processed_video = await generate_titles(
+                    self.feishu_sheetid,
+                    video,
+                    self.logger,
+                    self.aliyun_log
+                )
+
+                if processed_video:
+                    self.logger.info(f"视频标题处理完成: {video}")
+                    return processed_video
+                else:
+                    self.logger.warning("标题生成失败,使用原始视频数据")
+                    return video
+            else:
+                self.logger.debug("未配置飞书sheetid,跳过标题生成")
+                return video
+
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.logger.error(f"集成视频处理失败: {e} \n {tb}")
+            # 处理失败时返回原始数据,不中断流程
+            return video

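Wiring the processor into a spider reduces to one awaited call per raw record; a hedged sketch (the JSONPath entries in `field_map`, the sheet id, and the user dict are illustrative, and the loggers fall back to `LoggerManager` defaults):

```python
from core.video_processor import VideoProcessor

processor = VideoProcessor(
    platform="benshanzhufu",
    mode="recommend",
    field_map={"video_title": "$.title", "video_url": "$.url"},  # illustrative paths
    feishu_sheetid="sheet123",                                   # hypothetical sheet id
)

async def handle(raw: dict) -> bool:
    video = await processor.process_single_video(
        raw_data=raw,
        user_info={"uid": "10001", "nick_name": "demo"},
        rule_dict={"videos_cnt": {"min": 500}},
    )
    return video is not None  # None means extraction, filtering or handling failed
```
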
+ 2 - 1
main.py

@@ -36,6 +36,7 @@ def main():
 
     max_processes = cpu_count()
     num_processes = min(len(topic_list), max_processes)
+    # topic均分给进程
     topic_groups = split_topics(topic_list, num_processes)
     logger.info(f"[主进程] CPU 核心数: {max_processes},启动进程数: {num_processes}")
 
@@ -44,7 +45,7 @@ def main():
     for group_id, topic_group in enumerate(topic_groups):
         start_worker_process(group_id, topic_group, process_map)
 
-    # 重启状态跟踪 - 新增重启限制
+    # 重启状态跟踪
     restart_count = {group_id: 0 for group_id in range(len(topic_groups))}
     last_restart_time = {group_id: 0 for group_id in range(len(topic_groups))}
 

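`split_topics` distributes topics round-robin across the worker processes; the deleted tests/test1.py exercised the same stride-slicing expression. A standalone sketch (whether main.py uses exactly this form is an assumption):

```python
def split_topics(topics: list, num_groups: int) -> list:
    # Group i takes every num_groups-th topic starting at offset i.
    return [topics[i::num_groups] for i in range(num_groups)]

print(split_topics(list(range(1, 16)), 4))
# [[1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], [4, 8, 12]]
```
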
+ 40 - 40
requirements.txt

@@ -1,73 +1,73 @@
 aiohappyeyeballs==2.6.1
-aiohttp==3.12.13
-aiosignal==1.3.2
-aliyun-log-python-sdk==0.9.24
-aliyun-python-sdk-core==2.13.36
+aiohttp==3.12.15
+aiosignal==1.4.0
+aliyun-log-python-sdk==0.9.32
+aliyun-python-sdk-core==2.16.0
 annotated-types==0.7.0
 async-timeout==5.0.1
 asyncmy==0.2.10
 attrs==25.3.0
-build==1.2.2.post1
-certifi==2025.6.15
-cffi==1.17.1
-charset-normalizer==3.4.2
-click==8.2.1
-coverage==7.9.1
-cryptography==45.0.4
-dateparser==1.2.1
+build==1.3.0
+certifi==2025.8.3
+cffi==2.0.0
+charset-normalizer==3.4.3
+click==8.3.0
+coverage==7.10.7
+cryptography==46.0.1
+dateparser==1.2.2
 dotenv==0.9.9
-elastic-transport==8.17.1
-elasticsearch==9.0.2
+elastic-transport==9.1.0
+elasticsearch==9.1.1
 frozenlist==1.7.0
 googleapis-common-protos==1.70.0
-grpcio==1.73.1
-grpcio-tools==1.71.2
+grpcio==1.75.0
+grpcio-tools==1.75.0
 idna==3.10
 importlib_metadata==8.7.0
 iniconfig==2.1.0
-jmespath==0.10.0
+jmespath==1.0.1
 jsonpath-ng==1.7.0
 loguru==0.7.3
 lz4==4.4.4
 mq-http-sdk==1.0.3
-multidict==6.4.4
-opentelemetry-api==1.34.1
-opentelemetry-exporter-otlp==1.34.1
-opentelemetry-exporter-otlp-proto-common==1.34.1
-opentelemetry-exporter-otlp-proto-grpc==1.34.1
-opentelemetry-exporter-otlp-proto-http==1.34.1
-opentelemetry-proto==1.34.1
-opentelemetry-sdk==1.34.1
+multidict==6.6.4
+opentelemetry-api==1.37.0
+opentelemetry-exporter-otlp==1.37.0
+opentelemetry-exporter-otlp-proto-common==1.37.0
+opentelemetry-exporter-otlp-proto-grpc==1.37.0
+opentelemetry-exporter-otlp-proto-http==1.37.0
+opentelemetry-proto==1.37.0
+opentelemetry-sdk==1.37.0
 opentelemetry-semantic-conventions==0.55b1
 packaging==25.0
-pip-tools==7.4.1
+pip-tools==7.5.0
 pluggy==1.6.0
 ply==3.11
 propcache==0.3.2
-protobuf==5.27.0
-pycparser==2.22
-pydantic==1.10.22
+protobuf==6.32.1
+pycparser==2.23
+pydantic==2.11.9
 pydantic-env-settings==0.1.1
-pydantic_core==2.33.2
+pydantic_core==2.39.0
 Pygments==2.19.2
-PyMySQL==1.1.1
+PyMySQL==1.1.2
 pyproject_hooks==1.2.0
-pytest==8.4.1
-pytest-asyncio==1.0.0
+pytest==8.4.2
+pytest-asyncio==1.2.0
 python-dateutil==2.9.0.post0
-python-dotenv==1.1.0
+python-dotenv==1.1.1
 pytz==2025.2
 PyYAML==6.0.2
-redis==6.2.0
-regex==2024.11.6
-requests==2.32.4
+redis==6.4.0
+regex==2025.9.18
+requests==2.32.5
 rocketmq-client-python==2.0.0
 rocketmq-python-client==5.0.6
 six==1.17.0
 tenacity==9.1.2
 typing-inspection==0.4.1
-typing_extensions==4.14.0
+typing_extensions==4.15.0
 tzlocal==5.3.1
-urllib3==2.4.0
+urllib3==2.5.0
 yarl==1.20.1
-zipp==3.23.0
+zipp==3.23.0

+ 2 - 5
services/pipeline.py

@@ -66,11 +66,8 @@ class PiaoQuanPipeline:
 
         max_d = self.rule_dict.get("period", {}).get("max", 1000)
         min_d = self.rule_dict.get("period", {}).get("min", 1000)
-        days = max(max_d, min_d)
+        days = int(max(max_d, min_d))
 
-        # feishu_days = await self.feishu_time_list()
-        # if feishu_days:
-        #     days = int(feishu_days)
 
         now_ts = int(time.time())
 
@@ -182,7 +179,7 @@ class PiaoQuanPipeline:
                     if not flag:
                         self.logger.info(f"校验结束: 下载规则不符合{flag}")
                         self.aliyun_log.logging(
-                            code="2004",
+                            code="2005",
                             trace_id=self.trace_id,
                             data=self.item,
                             message="{}: {} <= {} <= {}, {}".format(

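The hardened period rule takes the wider of the two bounds and coerces it to int before the timestamp math; a standalone sketch of the surrounding window check (the final comparison is an assumption, since the hunk ends before it):

```python
import time

rule_dict = {"period": {"max": 30, "min": 7}}

max_d = rule_dict.get("period", {}).get("max", 1000)
min_d = rule_dict.get("period", {}).get("min", 1000)
days = int(max(max_d, min_d))    # one coercion keeps the cutoff math integral

now_ts = int(time.time())
cutoff_ts = now_ts - days * 86400
publish_ts = now_ts - 5 * 86400  # a video published five days ago
print(publish_ts >= cutoff_ts)   # True: inside the 30-day window
```
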
+ 108 - 0
spiders/advanced_factory.py

@@ -0,0 +1,108 @@
+# spiders/advanced_factory.py
+from typing import Dict, List, Type, Optional
+from core.utils.spider_config import SpiderConfig
+from core.di.container import container
+from spiders.basespider import BaseSpider
+from spiders.spider_registry import SPIDER_CLASS_MAP, get_spider_class
+
+
+class AdvancedSpiderFactory:
+    """高级爬虫工厂,支持自动配置映射和依赖注入"""
+
+    @classmethod
+    def create_spider(cls, spider_key: str, rule_dict: Dict, user_list: List,
+                      env: str = "prod", **kwargs) -> BaseSpider:
+        """
+        智能创建爬虫实例
+        """
+        # 1. 获取配置
+        config = SpiderConfig.get_platform_config(spider_key)
+
+        # 2. 确定爬虫类
+        spider_class = cls._resolve_spider_class(spider_key, config)
+
+        # 3. 准备依赖服务
+        dependencies = cls._prepare_dependencies(config, **kwargs)
+
+        # 4. 创建实例
+        return spider_class(
+            rule_dict=rule_dict,
+            user_list=user_list,
+            env=env,
+            **dependencies
+        )
+
+    @classmethod
+    def create_spider_by_topic(cls, topic: str, task_id: str, env: str = "prod") -> Optional[BaseSpider]:
+        """
+        根据MQ topic和任务ID创建爬虫(完整流程)
+        """
+        # 1. 映射topic到爬虫类型
+        spider_key = cls._map_topic_to_spider_key(topic)
+        if not spider_key:
+            return None
+
+        # 2. 从数据库获取任务配置
+        rule_dict, user_list = cls._get_task_config(task_id)
+        if not rule_dict:
+            return None
+
+        # 3. 创建爬虫实例
+        return cls.create_spider(spider_key, rule_dict, user_list, env)
+
+    @classmethod
+    def _resolve_spider_class(cls, spider_key: str, config) -> Type[BaseSpider]:
+        """解析爬虫类(支持自动发现)"""
+        # 优先使用注册表
+        if spider_key in SPIDER_CLASS_MAP:
+            return SPIDER_CLASS_MAP[spider_key]
+
+        # 根据命名约定自动推断
+        if spider_key.endswith('recommend'):
+            from spiders.recommendspider import RecommendSpider
+            return RecommendSpider
+        elif spider_key.endswith('author'):
+            from spiders.authorspider import AuthorSpider
+            return AuthorSpider
+
+        # 默认使用基类
+        return BaseSpider
+
+    @classmethod
+    def _prepare_dependencies(cls, config, **kwargs):
+        """准备依赖服务"""
+        dependencies = {}
+
+        # 数据库服务
+        if 'db_service' not in kwargs:
+            dependencies['db_service'] = container.db_service(
+                platform=config.platform,
+                mode=config.mode
+            )
+
+        # MQ 生产者
+        if 'mq_producer' not in kwargs:
+            dependencies['mq_producer'] = container.mq_producer()
+
+        return {**dependencies, **kwargs}
+
+    @classmethod
+    def _map_topic_to_spider_key(cls, topic: str) -> Optional[str]:
+        """topic到爬虫配置的映射"""
+        mapping = {
+            "bszf_recommend_prod": "benshanzhufurecommend",
+            "ynfqmm_recommend_prod": "yuannifuqimanmanrecommend",
+            "xng_author_prod": "xiaoniangaoauthor"
+        }
+        return mapping.get(topic)
+
+    @classmethod
+    def _get_task_config(cls, task_id: str):
+        """从数据库获取任务配置"""
+        # 这里需要实现数据库查询逻辑
+        # 返回 (rule_dict, user_list)
+        return {}, []
+
+
+# 全局工厂实例
+spider_factory = AdvancedSpiderFactory()

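With the mapping above, dispatch is a two-step lookup; a hedged usage sketch (`create_spider_by_topic` returns None until `_get_task_config` is actually implemented, and the rule/user values are illustrative):

```python
from spiders.advanced_factory import spider_factory

# Direct creation when rule_dict and user_list are already known:
spider = spider_factory.create_spider(
    "benshanzhufurecommend",
    rule_dict={"videos_cnt": {"min": 500}},
    user_list=[{"uid": "10001", "nick_name": "demo"}],
)

# Topic-driven creation; None for unmapped topics or while the task stub is empty:
maybe_spider = spider_factory.create_spider_by_topic("bszf_recommend_prod", task_id="1")
```
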
+ 0 - 0
spiders/author/__init__.py


+ 0 - 0
spiders/xiaoniangao_author.py → spiders/author/xiaoniangao_author.py


+ 82 - 73
spiders/authorspider.py

@@ -1,75 +1,91 @@
+# spiders/authorspider.py
 from datetime import datetime, timedelta
-from spiders.basespider import BaseSpider
-from typing import Optional, List, Dict
-import aiohttp
+from typing import List, Dict, Optional
 
+from core.utils.helpers import is_near_next_day
+from spiders.basespider import BaseSpider
 from core.utils.extractors import safe_extract
 
 
 class AuthorSpider(BaseSpider):
-    """账号模式爬虫:从用户列表爬取"""
+    """作者模式爬虫 """
 
     def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod"):
         super().__init__(rule_dict, user_list, env)
         # 账号模式特有状态
         self.user_list_from_db = []  # 数据库用户列表
         self.current_user_index = 0  # 当前用户索引
-        self.current_cursor = "" # 当前分页游标(初始为空)
-        self.next_cursor_last = ""
-
+        self.current_cursor = ""  # 当前分页游标(初始为空)
 
     async def before_run(self):
-        """运行前:获取用户列表"""
+        """运行前:获取用户列表 """
+        await super().before_run()
         self.user_list_from_db = await self.fetch_user_list()
         if not self.user_list_from_db:
             self.logger.warning("用户列表为空,终止账号模式")
         self.logger.info(f"{self.platform}获取用户列表完成,共 {len(self.user_list_from_db)} 个用户")
 
+    async def execute(self):
+        """执行核心逻辑 - 使用 make_request 方法"""
+        if not self.user_list_from_db:
+            self.logger.warning("用户列表为空,跳过执行")
+            return
+
+        if not await self.is_video_count_sufficient():
+            self.logger.info("视频数量已达到上限,跳过执行")
+            return
+
+        while await self.is_video_count_sufficient():
+
+            # 检查时间条件
+            if await is_near_next_day():
+                self.logger.info(f"距离第二天不足3分钟,停止执行")
+                return
+
+            user = self.user_list_from_db[self.current_user_index]
+            crawler_user_uid = user.get("link")
+            self.logger.info(
+                f"处理用户 uid={crawler_user_uid}(第{self.current_user_index + 1}个),"
+                f"当前cursor: {self.current_cursor or '0'}"
+            )
+
+            # 构建请求体
+            request_body = self._build_request_body(user)
 
-    async def core_loop(self):
-        """核心循环:处理每个用户的视频"""
-        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
-            while self.current_user_index < len(self.user_list_from_db):
-                if self.is_less_than_3_minutes():
-                    return
-                # 检查数量限制
-                if not await self.is_video_count_sufficient():
-                    return
-                # 当前用户
-                user = self.user_list_from_db[self.current_user_index]
-                crawler_user_uid = user.get("link")  # 数据库中的link字段
-                self.logger.info(
-                    f"处理用户 uid={crawler_user_uid}(第{self.current_user_index + 1}个),"
-                    f"当前cursor: {self.current_cursor or '0'}"
-                )
-
-                # 构建请求体:注入uid和cursor
-                request_body = self._build_request_body(user)
-
-                # 获取当前用户视频
-                raw_data = await self.crawl_user_videos(session, request_body, crawler_user_uid)
-                if not raw_data:
-                    # 切换到下一个用户
-                    self.current_user_index += 1
-                    continue
-                # 处理数据
-                if self.platform == "xiaoniangao":
-                    self.user_list = [user]
-                await self.process_data(raw_data)
-                if self.current_user_index == len(self.user_list_from_db)-1:
-                    self.current_cursor =  self.next_cursor_last
-                    self.current_user_index = 0
-                    continue
+            # 获取当前用户视频
+            raw_data = await self.crawl_user_videos(request_body, crawler_user_uid)
+
+            if not raw_data:
+                # 当前用户无数据,切换到下一个用户
+                self.current_user_index += 1
+                self.current_cursor = ""  # 重置游标
+                if self.current_user_index >= len(self.user_list_from_db):
+                    self.current_user_index = 0  # 防止下一轮取数越界
+                await self.wait_between_iterations()
+                continue
+
+
+            # 处理数据
+            if self.platform == "xiaoniangao":
+                self.user_list = [user]  # 特殊逻辑
+
+            pass_video = await self.process_data(raw_data)
+            # 无通过数据且无下一页游标时,切换到下一个用户;否则继续扫描当前用户
+            if pass_video == 0 and not self.current_cursor:
+                self.current_user_index += 1
+                self.current_cursor = ""
+            else:
+                self.logger.info(
+                    f"用户 {crawler_user_uid} 获取到 {pass_video} 个通过视频,继续扫描游标 {self.current_cursor}")
+
 
+            # 检查是否所有用户处理完毕
+            if self.current_user_index >= len(self.user_list_from_db):
+                self.current_user_index = 0  # 重置索引
+                self.current_cursor = ""
+
+            await self.wait_between_iterations()
 
     def _build_request_body(self, user: Dict) -> Dict:
-        """构建请求体:将用户link和当前cursor注入"""
-        # 准备"虚拟数据",键名对应你的配置路径($.uid 和 $.cursor)
+        """构建请求体"""
         virtual_data = {
-            "uid": str(user.get("link")),  # 对应配置中的 $.uid
-            "cursor": self.current_cursor  # 对应配置中的 $.cursor
+            "uid": str(user.get("link")),
+            "cursor": self.current_cursor
         }
 
         return self.request_preparer.prepare(
@@ -77,37 +93,30 @@ class AuthorSpider(BaseSpider):
             response_data=virtual_data
         )
 
-    async def fetch_user_list(self) -> List[Dict]:
-        """获取待爬取的用户列表(从数据库)"""
-        return []
-
-    async def crawl_user_videos(self, session, request_body: Dict, user_uid: str) -> Optional[List[Dict]]:
-        """请求用户视频接口"""
-        response = await self.request_client.request(
-            session=session,
-            method=self.method,
-            url=self.url,
-            headers=self.headers,
-            json=request_body
-        )
-        # has_more = safe_extract(response,self.has_more)
-        # 解析用户视频列表
-        data_list = safe_extract(response, self.data_path)
+    async def crawl_user_videos(self, request_body: Dict, user_uid: str) -> Optional[List[Dict]]:
+        """请求用户视频接口 - 使用 make_request 方法"""
+        # 使用基类的 make_request 方法发送请求
+        response = await self.make_request(request_body)
+
+        if not response:
+            self.logger.info(f"用户 {user_uid} 请求失败")
+            return None
+
+        # 游标处理逻辑
         if safe_extract(response, self.next_cursor):
-           self.next_cursor_last = safe_extract(response, self.next_cursor)
+            self.current_cursor = safe_extract(response, self.next_cursor)
+
+        data_list = safe_extract(response, self.data_path)
         if not data_list:
             self.logger.info(f"用户 {user_uid} 无更多视频数据")
             return None
+
         return data_list
 
+    async def fetch_user_list(self) -> List[Dict]:
+        """获取待爬取的用户列表(从数据库)- 子类实现"""
+        return self.user_list  # 默认返回传入的列表
+
     async def fetch_detail(self, item: Dict) -> Dict:
         """账号模式:补充视频详情(子类自行实现)"""
-        return item  # 默认返回原数据
-
-
-    def is_less_than_3_minutes(self):
-        now = datetime.now()
-        tomorrow = now.date() + timedelta(days=1)
-        midnight = datetime.combine(tomorrow, datetime.min.time())
-        time_left = midnight - now
-        return time_left.total_seconds() < 3 * 60
+        return item  # 默认返回原数据

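The `virtual_data` trick works because `RequestPreparer.prepare` resolves `{{placeholder}}` tokens against the dict it is given; the deleted tests/test1.py showed the token-stripping step. A hypothetical stand-in for a flat, one-level template:

```python
# Hypothetical flat-template resolver; the real RequestPreparer may differ.
def prepare(template: dict, response_data: dict) -> dict:
    body = {}
    for key, value in template.items():
        if isinstance(value, str) and value.startswith("{{") and value.endswith("}}"):
            name = value.strip("{}")               # "{{cursor}}" -> "cursor"
            body[key] = response_data.get(name, "")
        else:
            body[key] = value
    return body

template = {"uid": "{{uid}}", "cursor": "{{cursor}}", "page_size": 20}
print(prepare(template, {"uid": "8848", "cursor": "3"}))
# {'uid': '8848', 'cursor': '3', 'page_size': 20}
```
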
+ 198 - 189
spiders/basespider.py

@@ -1,261 +1,270 @@
+# spiders/basespider.py
 import asyncio
 import random
 import traceback
-import uuid
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Any, Optional
 from abc import ABC, abstractmethod
 
-from core.models.video_item import VideoItem
-from core.utils.helpers import generate_titles
+import aiohttp
+
 from core.utils.request_preparer import RequestPreparer
 from core.utils.spider_config import SpiderConfig
-from core.utils.extractors import safe_extract, extract_fields
 from core.utils.log.logger_manager import LoggerManager
+from core.video_processor import VideoProcessor
 from services.async_mysql_service import AsyncMysqlService
-from services.pipeline import PiaoQuanPipeline
-from core.base.async_request_client import AsyncRequestClient
 from services.async_mq_producer import AsyncMQProducer
+from core.base.async_request_client import AsyncRequestClient
 
 
 class BaseSpider(ABC):
-    """通用爬虫基类"""
-    
+    """爬虫基类 - 简化版本,不包含循环逻辑"""
+
     def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod",
                  request_client: AsyncRequestClient = None,
                  db_service: AsyncMysqlService = None,
                  mq_producer: AsyncMQProducer = None):
+        # 基础属性
         self.rule_dict = rule_dict
         self.user_list = user_list
         self.env = env
-        self.class_name = self.__class__.__name__.lower()
-
-        # 初始化核心组件
-        self._setup_configuration()
-        self._setup_logging()
-        self._setup_services(request_client, db_service, mq_producer)
-        self._setup_state()
-
-        # 通用状态
-        self.total_success = 0
-        self.total_fail = 0
-        self.video = None
-
-
-    def _setup_configuration(self):
-        self.platform_config = SpiderConfig.get_platform_config(classname=self.class_name)
-        if not self.platform_config:
-            raise ValueError(f"找不到爬虫配置: {self.class_name}")
-        self.platform = self.platform_config.platform
-        self.mode = self.platform_config.mode
-        self.url = self.platform_config.url
-        self.method = self.platform_config.method.upper()
-        self.headers = self.platform_config.headers or {}
-        self.request_body_template = self.platform_config.request_body or {}
-        self.response_parse_config = self.platform_config.response_parse or {}
-        self.data_path = self.response_parse_config.get("data_path")
-        self.has_more = self.response_parse_config.get("has_more")
-        self.field_map = self.response_parse_config.get("fields", {})
-        self.next_cursor = self.response_parse_config.get("next_cursor") or ""
-        self.loop_times = self.platform_config.loop_times or 100
-        self.loop_interval = self.platform_config.loop_interval or {"min": 2, "max": 5}
-        self.timeout = self.platform_config.request_timeout or 30
-        self.max_retries = self.platform_config.max_retries or 3
-        self.feishu_sheetid = self.platform_config.feishu_sheetid
-
-    def _setup_logging(self):
-        self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
-        self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
-        self.logger.info(f"爬虫 '{self.platform}/{self.mode}' 初始化...")
-
-    def _setup_services(self, request_client: AsyncRequestClient = None,
-                        db_service: AsyncMysqlService = None,
-                        mq_producer: AsyncMQProducer = None):
-        """初始化服务组件"""
-        self.request_client = request_client or AsyncRequestClient(
+
+        # 服务依赖
+        self.request_client = request_client
+        self.db_service = db_service
+        self.mq_producer = mq_producer
+
+        # 通过类名获取配置
+        class_name = self.__class__.__name__.lower()
+        self.config = SpiderConfig.get_platform_config(class_name)
+        self._setup_from_config()
+
+        # 日志服务
+        self.logger = LoggerManager.get_logger(
+            platform=self.config.platform,
+            mode=self.config.mode
+        )
+        self.aliyun_log = LoggerManager.get_aliyun_logger(
+            platform=self.config.platform,
+            mode=self.config.mode
+        )
+
+        # 请求准备器
+        self.request_preparer = RequestPreparer(
+            response_parse_config=self.config.response_parse,
             logger=self.logger,
             aliyun_log=self.aliyun_log
         )
-        self.db_service = db_service or AsyncMysqlService(
-            platform=self.platform,
-            mode=self.mode
-        )
-        self.mq_producer = mq_producer or AsyncMQProducer(
-            topic_name="topic_crawler_etl_prod_v2",
+
+        # 状态跟踪
+        self.stats = {
+            'success': 0,
+            'fail': 0,
+            'start_time': None,
+            'end_time': None
+        }
+
+        # 如果没有传入服务,则创建默认实例
+        self._setup_default_services()
+
+        # 初始化视频处理器
+        self.video_processor = VideoProcessor(
             platform=self.platform,
-            mode=self.mode
-        )
-    def _setup_state(self):
-        self.last_response_data = {}
-        self.request_preparer = RequestPreparer(
-            response_parse_config=self.response_parse_config,
+            mode=self.mode,
+            field_map=self.field_map,
+            feishu_sheetid=self.feishu_sheetid,
             logger=self.logger,
             aliyun_log=self.aliyun_log
         )
 
-    # 核心入口(统一流程)
+    def _setup_default_services(self):
+        """设置默认服务实例"""
+        if not self.request_client:
+            self.request_client = AsyncRequestClient(
+                logger=self.logger,
+                aliyun_log=self.aliyun_log
+            )
+
+        if not self.db_service:
+            self.db_service = AsyncMysqlService(
+                platform=self.config.platform,
+                mode=self.config.mode
+            )
+
+        if not self.mq_producer:
+            self.mq_producer = AsyncMQProducer(
+                topic_name="topic_crawler_etl_prod_v2",
+                platform=self.config.platform,
+                mode=self.config.mode
+            )
+
+    def _setup_from_config(self):
+        """从配置中设置属性"""
+        self.platform = self.config.platform
+        self.mode = self.config.mode
+        self.url = self.config.url
+        self.method = self.config.method.upper()
+        self.headers = self.config.headers or {}
+        self.request_body_template = self.config.request_body or {}
+        self.loop_times = self.config.loop_times or 100
+        self.timeout = self.config.request_timeout or 30
+        self.feishu_sheetid = self.config.feishu_sheetid
+
+        # 响应解析配置
+        response_parse = self.config.response_parse or {}
+        self.data_path = response_parse.get("data_path")
+        self.has_more = response_parse.get("has_more")
+        self.next_cursor = response_parse.get("next_cursor")
+        self.field_map = response_parse.get("fields", {})
+
     async def run(self):
-        """主流程:初始化→核心循环→收尾"""
+        """主运行流程 - 子类实现完整循环逻辑"""
+        self.stats['start_time'] = asyncio.get_event_loop().time()
         self.logger.info(f"开始运行爬虫: {self.platform}/{self.mode}")
-        await self.before_run()
+
         try:
-            await self.core_loop()  # 子类实现具体模式逻辑
+            await self.before_run()
+            await self.execute()  # 子类实现完整的执行逻辑
         except Exception as e:
-            tb = traceback.format_exc()
-            self.logger.exception(f"运行异常: {e},堆栈信息:{tb}")
+            self.logger.error(f"爬虫执行异常: {e}\n{traceback.format_exc()}")
         finally:
             await self.after_run()
-            self.logger.info(f"总统计:成功{self.total_success},失败{self.total_fail}")
+            self._log_final_stats()
 
     @abstractmethod
-    async def core_loop(self):
-        """子类必须实现:模式特有核心循环(推荐/账号)"""
+    async def execute(self):
+        """执行核心逻辑 - 子类必须实现完整循环逻辑"""
         pass
 
-    async def fetch_detail(self, item: Dict) -> Dict:
-        """子类选择实现:补充详情(完全由子类控制)"""
-        return item
-
-    # 通用数据处理流程
-    async def process_data(self, video_data: List[Dict]):
-        """处理原始数据列表(清洗→过滤→推送)"""
-        for item in video_data:
+    async def process_data(self, data: List[Dict]) -> int:
+        """处理数据"""
+        success_count = 0
+        for item in data:
+            self.aliyun_log.logging(
+                code="1001",
+                message=f"获取到一条数据",
+                data=item
+            )
             try:
-                # 补充详情(完全由子类实现)
-                detail_data = await self.fetch_detail(item)
-                # 处理并推送
-                result = await self.process_and_push_video(detail_data)
-                if result:
-                    self.total_success += 1
+                if await self.process_single_item(item):
+                    success_count += 1
+                    self.stats['success'] += 1
                 else:
-                    self.total_fail += 1
+                    self.stats['fail'] += 1
             except Exception as e:
-                self.logger.exception(f"处理单条数据失败: {e}")
-                self.total_fail += 1
+                self.logger.error(f"处理单条数据失败: {e}")
+                self.stats['fail'] += 1
+        self.logger.info(f"批次处理完成: 成功 {success_count}/{len(data)}")
+        return success_count
 
-    async def process_and_push_video(self, video: Dict[str, Any]) -> bool:
-        try:
-            self.video = video
-            video_obj = await self.process_video(video)
-            if not video_obj:
-                return False
-            if not await self.filter_data(video_obj):
-                return False
-            # video_obj = await self.integrated_video_handling(video_obj)
-            return await self.push_to_etl(video_obj)
-        except Exception as e:
-            self.logger.exception(f"视频处理异常: {e}")
-            return False
-
-    async def publish_video_user(self) -> Dict[str, Any]:
-        """获取随机发布用户"""
-        if self.user_list:
-            return random.choice(self.user_list)
-        else:
-            self.logger.error("未获取到用户列表数据")
-            return None
 
 
-    async def process_video(self, video: Dict) -> Optional[Dict]:
-        """
-        字段映射
-        统一字段抽取及 VideoItem 初始化
-        """
-        self.logger.info(f"处理视频数据: {video}")
-        publish_user = await self.publish_video_user()
-        
-        # 检查是否成功获取到发布用户
-        if not publish_user:
-            self.logger.error("无法获取发布用户信息")
-            return None
-            
-        item_kwargs = extract_fields(video, self.field_map, logger=self.logger, aliyun_log=self.aliyun_log)
-        item_kwargs.update({
-            "user_id": publish_user.get("uid"),
-            "user_name": publish_user.get("nick_name"),
-            "platform": self.platform,
-            "strategy": self.mode,
-        })
-        try:
-            item = VideoItem(**item_kwargs)
-            video_dict = await item.produce_item()
-            if not video_dict:
-                self.logger.warning("VideoItem 校验失败")
-                return None
-            return video_dict
-        except Exception as e:
-            self.logger.error(f"VideoItem 初始化失败: {e}")
-            return None
+    async def process_single_item(self, item: Dict) -> bool:
+        """处理单条数据"""
+        # 1. 补充详情
+        detail_item = await self.fetch_detail(item)
 
-    async def filter_data(self, video: Dict) -> bool:
-        """
-           数据校验过滤,默认使用 PiaoQuanPipeline
-           子类可重写此方法实现自定义过滤
-        """
-        pipeline = PiaoQuanPipeline(
-            platform=self.platform,
-            mode=self.mode,
+        # 2. 使用视频处理器进行完整处理
+        video_obj = await self.video_processor.process_single_video(
+            raw_data=detail_item,
+            user_info=await self.select_publish_user(),
             rule_dict=self.rule_dict,
-            env=self.env,
-            item=video,
-            trace_id=self.platform + str(uuid.uuid1())
+            env=self.env
         )
-        return await pipeline.process_item()
 
-    async def integrated_video_handling(self, video: Dict) -> None:
-        """
-          钩子函数:可在此实现自动生成标题或其他业务逻辑
-        """
-        # 视频标题处理生成
-        return await generate_titles(self.feishu_sheetid, video,self.logger,self.aliyun_log)
+        if not video_obj:
+            return False
 
-    async def push_to_etl(self, video: Dict) -> bool:
+        # 3. 推送数据
+        return await self.push_video(video_obj)
+
+    async def select_publish_user(self) -> Dict:
+        """选择发布用户"""
+        if self.user_list:
+            return random.choice(self.user_list)
+        return {}
+
+    async def fetch_detail(self, item: Dict) -> Dict:
+        """补充详情 - 子类可重写"""
+        return item
+
+    async def push_video(self, video: Dict) -> bool:
+        """推送视频数据"""
         try:
             await self.mq_producer.send_msg(video)
-            self.aliyun_log.logging(code="1009",
-                                    message="推送ETL成功",
-                                    data=video)
-            self.logger.info(f"成功推送视频至ETL: {video}")
+            self.aliyun_log.logging(
+                code="1002",
+                message="推送ETL成功",
+                data=video
+            )
             return True
         except Exception as e:
-            self.logger.exception(f"推送ETL失败: {e}")
+            self.logger.error(f"推送视频失败: {e}")
             return False
 
     async def is_video_count_sufficient(self) -> bool:
         """
-        校验当日视频是否达到最大爬取量
-        True未达到
-        False达到最大量
-        :return:True/False
+        检查当日入库视频数是否达到上限
+        未达到上限返回 True,已达到返回 False
         """
         max_count = self.rule_dict.get("videos_cnt", {}).get("min", 0)
         if max_count <= 0:
-            self.logger.info(f"{self.platform} 未限制视频入库量,跳过检测")
             return True
+
         current_count = await self.db_service.get_today_videos()
         if current_count >= max_count:
-            self.logger.info(f"{self.platform} 视频数量达到当日最大值: {current_count}/{max_count}")
-            self.aliyun_log.logging(code="1011", message="视频数量达到当日最大值", data=f"{current_count}")
+            self.logger.info(f"视频数量达到当日最大值: {current_count}/{max_count}")
+            self.aliyun_log.logging(
+                code="1011",
+                message="视频数量达到最大值",
+                data={
+                    "current_count": current_count,
+                    "max_count": max_count
+                }
+            )
             return False
-        self.logger.info(f"{self.platform} 今日入库视频数: {current_count}/{max_count}")
-        self.aliyun_log.logging(code="1012",
-                                message=f"目前入库量{current_count}",
-                                data=f"{current_count}/{max_count}"
-                                )
+
         return True
 
-    async def wait(self):
-        """等待随机时间间隔"""
-        # 确保loop_interval包含min和max键
-        min_time = self.loop_interval.get("min", 1)
-        max_time = self.loop_interval.get("max", 5)
-        wait_time = random.randint(min_time, max_time)
-        self.logger.info(f"等待 {wait_time} 秒后继续")
+    async def wait_between_iterations(self, wait_time: int = None):
+        """等待间隔"""
+        if wait_time is None:
+            interval_config = getattr(self.config, 'loop_interval', None) or {}
+            min_time = interval_config.get('min', 1)
+            max_time = interval_config.get('max', 5)
+            wait_time = random.randint(min_time, max_time)
+
+        self.logger.info(f"等待 {wait_time} 秒")
         await asyncio.sleep(wait_time)
 
+    async def make_request(self, request_body: Dict) -> Optional[Dict]:
+        """发送请求"""
+        async with aiohttp.ClientSession(
+                timeout=aiohttp.ClientTimeout(total=self.timeout)
+        ) as session:
+            return await self.request_client.request(
+                session=session,
+                method=self.method,
+                url=self.url,
+                headers=self.headers,
+                json=request_body
+            )
+
     async def before_run(self):
-        """运行前钩子(子类可重写)"""
+        """运行前准备 - 子类可重写"""
         pass
 
     async def after_run(self):
-        """运行后钩子(子类可重写)"""
-        pass
+        """运行后清理 - 子类可重写"""
+        pass
+
+    def _log_final_stats(self):
+        """记录最终统计"""
+        self.stats['end_time'] = asyncio.get_event_loop().time()
+        duration = 0
+        if self.stats['start_time'] is not None:
+            duration = self.stats['end_time'] - self.stats['start_time']
+
+        self.logger.info(
+            f"爬虫执行完成: 成功 {self.stats['success']}, "
+            f"失败 {self.stats['fail']}, 耗时 {duration:.2f}秒"
+        )

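The contract after this refactor is a classic template method: `run()` owns setup, teardown and stats, while each subclass owns its loop in `execute()`. A stdlib-only sketch of the shape a concrete spider now takes (method names mirror the base class; the rest is illustrative):

```python
import asyncio
from abc import ABC, abstractmethod

class MiniSpider(ABC):
    async def run(self):
        await self.before_run()
        try:
            await self.execute()  # the subclass owns the whole crawl loop
        finally:
            await self.after_run()

    async def before_run(self): ...
    async def after_run(self): ...

    @abstractmethod
    async def execute(self): ...

class OnePageSpider(MiniSpider):
    async def execute(self):
        print("fetch -> process_data -> push_video, then decide whether to loop")

asyncio.run(OnePageSpider().run())
```
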
+ 0 - 0
spiders/recommend/__init__.py


+ 5 - 1
spiders/benshanzhufu_recommend.py → spiders/recommend/benshanzhufu_recommend.py

@@ -4,8 +4,12 @@ from spiders.recommendspider import RecommendSpider
 
 
 class BenshanzhufuRecommend(RecommendSpider):
-    pass
+    """本山祝福推荐爬虫 - 保持原有的服务注入方式"""
 
+    async def custom_start_checks(self) -> bool:
+        """自定义启动检查"""
+        self.logger.info("本山祝福爬虫启动检查通过")
+        return True
 
 async def main():
     rule_dict = {"videos_cnt":{"min":500}}

+ 4 - 1
spiders/yuannifuqimanman_recommend.py → spiders/recommend/yuannifuqimanman_recommend.py

@@ -5,7 +5,10 @@ from spiders.recommendspider import RecommendSpider
 
 
 class YuannifuqimanmanRecommend(RecommendSpider):
-    pass
+    async def custom_start_checks(self) -> bool:
+        """自定义启动检查"""
+        self.logger.info("愿你福气满满爬虫启动检查通过")
+        return True
 
 
 async def main():

+ 49 - 53
spiders/recommendspider.py

@@ -1,58 +1,54 @@
+# spiders/recommendspider.py
+from typing import List, Dict, Optional
 from spiders.basespider import BaseSpider
-from typing import Optional, List, Dict
-import aiohttp
-
 from core.utils.extractors import safe_extract
 
 
 class RecommendSpider(BaseSpider):
-    """推荐模式爬虫:从推荐接口分页爬取"""
-
-    async def core_loop(self):
-        """核心循环:分页请求推荐接口"""
-        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
-            for loop_index in range(self.loop_times):
-                # 检查数量限制
-                self.logger.info(f"检测{self.platform}当日入库视频量")
-                if not await self.is_video_count_sufficient():
-                    return
-                    
-                # 获取推荐列表数据
-                self.logger.info(f"开始获取{self.platform}推荐列表数据")
-                raw_data = await self.crawl_data(session)
-                if not raw_data:
-                    self.logger.info("视频列表为空,开始下次请求")
-                    await self.wait()
-                    continue
-                    
-                # 处理数据
-                await self.process_data(raw_data)
-                
-                # 等待下一轮
-                await self.wait()
-
-    async def crawl_data(self, session) -> Optional[List[Dict]]:
-        """请求推荐接口(适配推荐模式)"""
-        request_body = self.request_preparer.prepare(self.request_body_template, self.last_response_data)
-        response = await self.request_client.request(
-            session=session,
-            method=self.method,
-            url=self.url,
-            headers=self.headers,
-            json=request_body
-        )
-
-        self.last_response_data = response
-        
-        # 解析推荐列表
-        if not response:
-            self.logger.warning("接口响应为空")
-            return None
-            
-        data_list = safe_extract(response, self.data_path)
-        if not data_list:
-            self.logger.info(f"接口返回视频列表为空: {response}")
-            self.aliyun_log.logging(code="9021", message="接口返回视频列表为空", data=response)
-            return None
-            
-        return data_list
+    """推荐模式爬虫 - 重新封装版本"""
+
+    def __init__(self, rule_dict: Dict, user_list: List, env: str = "prod",
+                 request_client=None, db_service=None, mq_producer=None):
+        super().__init__(rule_dict, user_list, env, request_client, db_service, mq_producer)
+        self.last_response = None
+
+    async def execute(self):
+        """执行核心逻辑 - 使用 make_request 方法"""
+        if not await self.is_video_count_sufficient():
+            self.logger.info("视频数量已达到上限,跳过执行")
+            return
+
+        iteration = 0
+        while iteration < self.loop_times and await self.is_video_count_sufficient():
+            self.logger.info(f"执行第 {iteration + 1} 轮")
+
+            # 准备请求体
+            request_body = self.request_preparer.prepare(
+                self.request_body_template,
+                self.last_response or {}
+            )
+
+            # 发送请求 - 使用 make_request 方法
+            response = await self.make_request(request_body)
+
+            if not response:
+                self.logger.info("未获取到响应数据")
+                iteration += 1
+                await self.wait_between_iterations()
+                continue
+
+            self.last_response = response
+
+            # 提取数据
+            data_list = safe_extract(response, self.data_path)
+            if not data_list:
+                self.logger.info("未获取到数据")
+                iteration += 1
+                await self.wait_between_iterations()
+                continue
+
+            # 处理数据
+            await self.process_data(data_list)
+
+            iteration += 1
+            await self.wait_between_iterations()

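End to end, the recommend loop threads each response back into the next request through the cursor; a compressed standalone sketch of that state machine (fake two-page feed; the real code resolves the cursor via RequestPreparer):

```python
import asyncio

# Fake feed keyed by cursor; unknown cursors fall through to the empty default.
PAGES = {
    "": {"next_cursor": "c1", "data": [1, 2]},
    "c1": {"next_cursor": "end", "data": [3]},
}

async def fetch(cursor: str) -> dict:
    return PAGES.get(cursor, {"next_cursor": "", "data": []})

async def recommend_loop(loop_times: int = 5) -> list:
    last_response, seen = {}, []
    for _ in range(loop_times):
        cursor = last_response.get("next_cursor", "")
        response = await fetch(cursor)
        if not response["data"]:
            break  # mirrors the "未获取到数据" early exit above
        seen.extend(response["data"])
        last_response = response
    return seen

print(asyncio.run(recommend_loop()))  # [1, 2, 3]
```
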
+ 4 - 4
spiders/spider_registry.py

@@ -1,11 +1,11 @@
-# spider_registry.py完整注释版
+# spiders/spider_registry.py
 """爬虫注册表模块:维护topic到爬虫类的映射关系"""
 
 from core.utils.log.logger_manager import LoggerManager
 from spiders.basespider import BaseSpider
-from spiders.benshanzhufu_recommend import BenshanzhufuRecommend
-from spiders.xiaoniangao_author import XiaoniangaoAuthor
-from spiders.yuannifuqimanman_recommend import YuannifuqimanmanRecommend
+from spiders.recommend.benshanzhufu_recommend import BenshanzhufuRecommend
+from spiders.author.xiaoniangao_author import XiaoniangaoAuthor
+from spiders.recommend.yuannifuqimanman_recommend import YuannifuqimanmanRecommend
 
 logger = LoggerManager.get_logger()
 aliyun_log = LoggerManager.get_aliyun_logger()

+ 0 - 0
test/__init__.py


+ 57 - 0
test/test_pydantic_upgrade.py

@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# test/test_pydantic_upgrade.py
+"""Pydantic 2.x 升级验证脚本"""
+
+from config.base import settings
+from core.models.video_item import VideoItem
+from core.utils.spider_config import SpiderConfig
+
+
+def test_settings():
+    """测试配置加载"""
+    print("=== 测试 Settings ===")
+    print(f"环境: {settings.ENV}")
+    print(f"数据库主机: {settings.DB_HOST}")
+    print(f"Redis URL: {settings.redis_url}")
+    print("✓ Settings 加载成功")
+
+
+def test_video_item():
+    """测试视频项模型"""
+    print("\n=== 测试 VideoItem ===")
+    video_data = {
+        "user_id": "test_user",
+        "user_name": "测试用户",
+        "out_video_id": "12345",
+        "video_url": "https://example.com/video.mp4",
+        "cover_url": "https://example.com/cover.jpg",
+        "platform": "test",
+        "strategy": "recommend",
+        "out_user_id":"123",
+        "session":"2134",
+        "video_title":"123"
+    }
+
+    item = VideoItem(**video_data)
+    print(f"创建视频项: {item.video_id}")
+    print("✓ VideoItem 创建成功")
+
+
+def test_spider_config():
+    """测试爬虫配置"""
+    print("\n=== 测试 SpiderConfig ===")
+    try:
+        config = SpiderConfig.get_platform_config("benshanzhufurecommend")
+        print(f"平台: {config.platform}")
+        print(f"模式: {config.mode}")
+        print(f"URL: {config.url}")
+        print("✓ SpiderConfig 加载成功")
+    except Exception as e:
+        print(f"✗ SpiderConfig 加载失败: {e}")
+
+
+if __name__ == "__main__":
+    test_settings()
+    test_video_item()
+    test_spider_config()
+    print("\n=== 升级验证完成 ===")

+ 0 - 8
tests/test1.py

@@ -1,8 +0,0 @@
-# topics = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
-# num_groups = 4
-#
-# print([topics[i::num_groups] for i in range(num_groups)])
-v = "{{next_cursor}}"
-if isinstance(v, str) and v.startswith("{{") and v.endswith("}}"):
-    key = v.strip("{}")
-    print(key)

+ 0 - 33
tests/test_async_redis_service.py

@@ -1,33 +0,0 @@
-import pytest
-import asyncio
-
-from config import settings
-from services.async_redis_service import AsyncRedisService
-from core.base.async_redis_client import RedisManager
-
-@pytest.mark.asyncio
-async def test_async_redis_service_mark_and_get_status():
-    await RedisManager.init(redis_url=settings.redis_url)
-    service = AsyncRedisService(prefix="crawler:task", ttl=20)
-    test_message_id = "test_123456"
-
-    # 确保干净
-    pool = RedisManager.get_pool()
-    await pool.delete(service._build_key(test_message_id))
-
-    # 初始应获取 None
-    status = await service.get_status(test_message_id)
-    assert status is None
-
-    # 标记为执行中
-    await service.mark_processing(test_message_id)
-    status = await service.get_status(test_message_id)
-    assert status == "0"
-
-    # 标记为完成
-    await service.mark_done(test_message_id)
-    status = await service.get_status(test_message_id)
-    assert status == "1"
-
-    # 清理
-    # await pool.delete(service._build_key(test_message_id))

+ 0 - 10
tests/test_benshanzhufu_recommend.py

@@ -1,10 +0,0 @@
-import unittest
-
-
-class MyTestCase(unittest.TestCase):
-    def test_something(self):
-        self.assertEqual(True, False)  # add assertion here
-
-
-if __name__ == '__main__':
-    unittest.main()

+ 0 - 4
tests/test_config.py

@@ -1,4 +0,0 @@
-from config import settings
-
-print("=== 配置验证 ===")
-print("DB连接:", settings.REDIS_HOST)

+ 0 - 21
tests/test_feishu.py

@@ -1,21 +0,0 @@
-import asyncio
-import random
-import string
-from datetime import datetime
-
-# 导入封装的异步工具类
-from core.utils.feishu_data_async import FeishuDataAsync  # 假设已保存为该文件名
-
-async def test_feishu():
-    async with FeishuDataAsync() as feishu:
-        spreadsheet_token = "ISFnspGcjhyxO6tj0fycbQARnUg"
-        sheet_id = "a44975"
-        valus =await feishu.get_values(spreadsheet_token=spreadsheet_token,sheet_id=sheet_id)
-
-        print(valus)
-        await feishu.insert_values(spreadsheet_token=spreadsheet_token, sheet_id=sheet_id, ranges="A2:Z2", values=["2", "23333", "3", "4"])
-
-
-
-if __name__ == '__main__':
-    asyncio.run(test_feishu())

+ 0 - 30
tests/test_video_item.py

@@ -1,30 +0,0 @@
-import asyncio
-import time
-from pprint import pprint
-
-from core.models.video_item import VideoItem  # 你的 Pydantic 模型路径
-
-
-async def main():
-    fake_video_data = {
-        "user_id": "uid456",
-        "user_name": "测试用户",
-        "out_video_id": "out789",
-        "out_user_id": "out_user",
-        "video_url": "http://example.com/video.mp4",
-        "cover_url": "http://example.com/cover.jpg",
-        "video_title": "   测试 视频 标题!!!",
-        "publish_time_stamp": int(time.time()) - 86400,  # 昨天
-        "strategy": "recommend",
-        "classname": "test_platform"
-    }
-
-    item = VideoItem(**fake_video_data)
-    result = await item.produce_item()
-
-    print("✅ 校验通过的最终结构:")
-    pprint(result)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())