
Restructuring

zhangliang 1 week ago
parent
commit
4a5edf4157
92 changed files with 1387 additions and 2310 deletions
  1. + 4 - 0      .env
  2. + 1 - 1      .gitignore
  3. + 3 - 3      README.md
  4. + 0 - 2      application/config/__init__.py
  5. + 0 - 0      application/config/aliyun_config.py
  6. + 0 - 6      application/config/common/__init__.py
  7. + 0 - 76     application/config/common/ffmpeg/ffmpeg_utils.py
  8. + 0 - 57     application/config/common/log/local_log.py
  9. + 0 - 21     application/config/common/log/logger_manager.py
  10. + 0 - 3     application/config/common/messageQueue/__init__.py
  11. + 0 - 18    application/config/common/messageQueue/ack_message.py
  12. + 0 - 25    application/config/common/messageQueue/consumer.py
  13. + 0 - 51    application/config/common/messageQueue/mq.py
  14. + 0 - 1     application/config/common/mysql/__init__.py
  15. + 0 - 68    application/config/common/mysql/mysql_helper.py
  16. + 0 - 55    application/config/common/mysql/sql.py
  17. + 0 - 2     application/config/common/proxies/__init__.py
  18. + 0 - 23    application/config/common/proxies/fast_proxy.py
  19. + 0 - 55    application/config/common/redis/pyredis.py
  20. + 0 - 54    application/config/common/redis/redis_helper.py
  21. + 0 - 54    application/config/common/redis/xng_redis.py
  22. + 0 - 7     application/config/config.py
  23. + 0 - 30    application/config/ipconfig.py
  24. + 0 - 36    application/config/mysql_config.py
  25. + 0 - 134   application/etl/download.py
  26. + 0 - 3     application/functions/__init__.py
  27. + 0 - 69    application/functions/async_mysql_service.py
  28. + 0 - 9     application/functions/get_redirect_url.py
  29. + 0 - 147   application/functions/mysql_service.py
  30. + 0 - 46    application/functions/read_mysql_config.py
  31. + 0 - 240   application/functions/zqkd_db_redis.py
  32. + 0 - 1     application/items/__init__.py
  33. + 0 - 94    application/items/item.py
  34. + 0 - 2     application/pipeline/__init__.py
  35. + 0 - 272   application/pipeline/pipeline.py
  36. + 0 - 112   application/pipeline/pipeline_dev.py
  37. + 0 - 241   application/spiders/base_spider.py
  38. + 0 - 6     application/spiders/benshanzhufu_recommend.py
  39. + 0 - 21    application/spiders/spider_registry.py
  40. + 3 - 0     config/__init__.py
  41. + 64 - 0    config/base.py
  42. + 16 - 0    config/prod.py
  43. + 98 - 0    config/settings.py
  44. + 1 - 1     config/spiders_config.yaml
  45. + 0 - 0     config/topic_map.yaml
  46. + 0 - 42    configs/codes.py
  47. + 0 - 1     configs/config.py
  48. + 0 - 36    configs/messages.py
  49. + 0 - 0     core/__init__.py
  50. + 0 - 0     core/base/__init__.py
  51. + 0 - 0     core/base/async_mysql_client.py
  52. + 1 - 1     core/base/async_rocketmq_consumer.py
  53. + 0 - 0     core/models/__init__.py
  54. + 79 - 0    core/models/video_item.py
  55. + 0 - 0     core/utils/__init__.py
  56. + 46 - 0    core/utils/config_loader.py
  57. + 1 - 1     core/utils/env_loader.py
  58. + 0 - 0     core/utils/extractors.py
  59. + 0 - 0     core/utils/feishu/__init__.py
  60. + 2 - 2     core/utils/feishu/feishu.py
  61. + 1 - 1     core/utils/feishu/feishu_data.py
  62. + 0 - 0     core/utils/feishu/feishu_insert.py
  63. + 2 - 2     core/utils/feishu/feishu_utils.py
  64. + 0 - 0     core/utils/gpt/__init__.py
  65. + 0 - 0     core/utils/gpt/gpt4o_mini_help.py
  66. + 0 - 0     core/utils/log/__init__.py
  67. + 4 - 5     core/utils/log/aliyun_log.py
  68. + 50 - 0    core/utils/log/local_log.py
  69. + 31 - 0    core/utils/log/logger_manager.py
  70. + 2 - 4     core/utils/path_utils.py
  71. + 0 - 0     core/utils/trace_utils.py
  72. + 15 - 110  main.py
  73. + 0 - 0     scheduler/__init__.py
  74. + 75 - 0    scheduler/async_consumer.py
  75. + 24 - 0    scheduler/process_manager.py
  76. + 0 - 0     services/__init__.py
  77. + 206 - 0   services/async_mysql_service.py
  78. + 5 - 2     services/clean_title.py
  79. + 201 - 0   services/pipeline.py
  80. + 12 - 16   services/rocketmq_consumer.py
  81. + 0 - 0     spiders/__init__.py
  82. + 323 - 0   spiders/base_spider.py
  83. + 19 - 0    spiders/benshanzhufu_recommend.py
  84. + 47 - 0    spiders/spider_registry.py
  85. + 3 - 3     spiders/universal_crawler.py
  86. + 0 - 0     tests/__init__.py
  87. + 2 - 1     tests/test1.py
  88. + 10 - 0    tests/test_benshanzhufu_recommend.py
  89. + 7 - 0     tests/test_config.py
  90. + 29 - 0    tests/test_video_item.py
  91. + 0 - 0     utils/__init__.py
  92. + 0 - 37    utils/config_loader.py

+ 4 - 0
.env.prod → .env

@@ -20,3 +20,7 @@ ROCKETMQ_ACCESS_KEY_SECRET="nEbq3xWNQd1qLpdy2u71qFweHkZjSG"
 ROCKETMQ_INSTANCE_ID="MQ_INST_1894469520484605_BXhXuzkZ"
 ROCKETMQ_WAIT_SECONDS=10
 ROCKETMQ_BATCH=1
+
+# Aliyun log reporting
+ALIYUN_ACCESS_KEY_ID="LTAIWYUujJAm7CbH"
+ALIYUN_ACCESS_KEY_SECRET="RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
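
For reference, a hedged sketch of how the newly added variables might be read at runtime. The use of python-dotenv here is an assumption (core/utils/env_loader.py is touched in this commit, but its API is not shown in this diff):

```python
# Assumption: .env is loaded via python-dotenv; the variable names come from the diff above.
import os
from dotenv import load_dotenv

load_dotenv(".env")
aliyun_key_id = os.getenv("ALIYUN_ACCESS_KEY_ID")
aliyun_key_secret = os.getenv("ALIYUN_ACCESS_KEY_SECRET")
```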

+ 1 - 1
.gitignore

@@ -35,7 +35,7 @@ var/
 pip-log.txt
 pip-delete-this-directory.txt
 
-# Unit test / coverage reports
+# Unit tests / coverage reports
 htmlcov/
 .tox/
 .coverage

+ 3 - 3
README.md

@@ -9,11 +9,11 @@
 
 ```bash
 AutoScraperX/
-├── main.py                           # Project entry: listens for MQ messages and dispatches UniversalCrawler
+├── main1.py                           # Project entry: listens for MQ messages and dispatches UniversalCrawler
 ├── spiders/
│   ├── universal_crawler.py         # Universal crawler main class: reads config and runs the crawl logic
│   └── rabbitmq_consumer.py         # Multi-threaded consumer that pulls MQ messages and runs tasks
-├── configs/
+├── config/
│   ├── spiders_config.yaml         # Per-platform crawler rule config (incl. JsonPath)
│   └── topic_map.yaml              # Topic-to-platform-name mapping
 ├── application/
@@ -61,7 +61,7 @@ AutoScraperX/
### 1. Start the project
 
 ```bash
-python main.py
+python main1.py
 ```
 
> The program automatically listens on all topics; after consuming a message it creates and runs the corresponding crawler task.

+ 0 - 2
application/config/__init__.py

@@ -1,2 +0,0 @@
-from .ipconfig import ip_config
-from .mysql_config import env_dict

+ 0 - 0
application/config/aliyun_config.py


+ 0 - 6
application/config/common/__init__.py

@@ -1,6 +0,0 @@
-from .feishu import Feishu, FeishuInsert
-from .log import *
-from .messageQueue import *
-from .mysql import *
-from .proxies import *
-from .redis import redis_helper

+ 0 - 76
application/config/common/ffmpeg/ffmpeg_utils.py

@@ -1,76 +0,0 @@
-import requests
-import json
-
-class Ffmpeg:
-
-    def get_oss_link(self, oss_key):
-        url = "http://61.48.133.26:5555/api/v1/oss/get_object_link"
-
-        payload = json.dumps({
-            "oss_object_key": oss_key
-        })
-        headers = {
-            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
-            'Content-Type': 'application/json'
-        }
-
-        response = requests.request("POST", url, headers=headers, data=payload)
-        response = response.json()
-        data = response['data']
-        return data
-
-    def merge_m3u8(self,url_link):
-        url = "http://101.37.24.17:5555/api/v1/ffmpeg/merge_m3u8"
-
-        data = {
-            "url": url_link,
-            "referer": ""
-        }
-        headers = {
-            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
-            'Content-Type': 'application/json'
-        }
-
-        response = requests.request("POST", url, headers=headers, json=data, stream=True)
-        for item in response.content.split(b'\r\n\r\n'):
-            try:
-                item = json.loads(item[6:].decode())
-                if item['event'] == 'message':
-                    continue
-                elif item['event'] == 'ffmpeg code':
-                    code = int(item['data'])
-                    if code != 0:  # ffmpeg processing failed
-                        return
-                elif item['event'] == 'result':
-                    oss_object_key = item['data']['oss_object_key']
-                    if oss_object_key:
-                        oss_url = self.get_oss_link(oss_object_key)
-                        return oss_url
-            except json.decoder.JSONDecodeError:
-                continue
-
-    def webp2_jpg(self,webp2_url):
-        url = "http://101.37.24.17:5555/api/v1/ffmpeg/webp2jpg"
-
-        payload = json.dumps({
-            "url": webp2_url,
-            "referer": ""
-        })
-        headers = {
-            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
-            'Content-Type': 'application/json'
-        }
-
-        response = requests.request("POST", url, headers=headers, data=payload)
-        response = response.json()
-        oss_object_key = response['data']['oss_object_key']
-        if oss_object_key:
-            oss_url = self.get_oss_link(oss_object_key)
-            return oss_url
-        else:
-            return None
-
-
-if __name__ == '__main__':
-    ffmpeg = Ffmpeg()
-    print(ffmpeg.get_oss_link("jq_oss/video/20250103135417425230.mp4"))

+ 0 - 57
application/config/common/log/local_log.py

@@ -1,57 +0,0 @@
-import os.path
-import sys
-from datetime import date, timedelta, datetime
-from loguru import logger
-from pathlib import Path
-from utils.path_utils import log_dir
-
-class Local:
-    # date constants
-    now = datetime.now()
-    today = date.today()
-    yesterday = (today - timedelta(days=1)).strftime("%Y-%m-%d")
-    tomorrow = (today + timedelta(days=1)).strftime("%Y-%m-%d")
-
-    @staticmethod
-    def init_logger(platform: str, mode: str, log_level: str = "INFO", log_to_console: bool = False,
-                    rotation: str = "00:00", retention: str = "10 days"):
-        """
-        Initialize the logger
-        :param platform: platform name, used to separate log directories
-        :param mode: runtime environment (e.g. prod/test/dev)
-        :param log_level: log level (e.g. INFO, DEBUG)
-        :param log_to_console: whether to also log to the console
-        :param rotation: log file rotation policy (default: daily at 00:00)
-        :param retention: log retention period (default: 10 days)
-        """
-
-        # create the log directory
-        log_path = Path(f"{log_dir}/{platform}")
-        log_path.mkdir(parents=True, exist_ok=True)
-
-        # build the log file name
-        log_filename = f"{platform}-{mode}-{Local.today.strftime('%Y-%m-%d')}.log"
-        log_file_path = os.path.join(log_dir,log_filename)
-
-        # remove the default handler
-        logger.remove()
-
-        # add the file log handler
-        logger.add(
-            str(log_file_path),
-            level=log_level.upper(),
-            rotation=rotation,
-            retention=retention,
-            encoding="utf-8",
-            enqueue=True
-        )
-
-        # optional: also log to the console
-        if log_to_console:
-            logger.add(
-                sink=sys.stdout,
-                level=log_level.upper(),
-                format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | {message}"
-            )
-
-        return logger
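
For context, a minimal usage sketch of the removed helper; the signature comes from the diff above, and the platform/mode values are illustrative:

```python
# Hypothetical usage of the deleted Local.init_logger.
from application.config.common.log import Local

logger = Local.init_logger(platform="xng", mode="prod", log_to_console=True)
logger.info("crawler started")  # written to the per-platform log file and to stdout
```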

+ 0 - 21
application/config/common/log/logger_manager.py

@@ -1,21 +0,0 @@
-from application.config.common.log import Local, AliyunLogger
-
-class LoggerManager:
-    _local_loggers = {}
-    _aliyun_loggers = {}
-
-    @staticmethod
-    def get_logger(platform, mode, log_to_console=True):
-        key = f"{platform}_{mode}"
-        if key not in LoggerManager._local_loggers:
-            LoggerManager._local_loggers[key] = Local.init_logger(
-                platform=platform, mode=mode, log_to_console=log_to_console
-            )
-        return LoggerManager._local_loggers[key]
-
-    @staticmethod
-    def get_aliyun_logger(platform, mode):
-        key = f"{platform}_{mode}"
-        if key not in LoggerManager._aliyun_loggers:
-            LoggerManager._aliyun_loggers[key] = AliyunLogger(platform=platform, mode=mode)
-        return LoggerManager._aliyun_loggers[key]

+ 0 - 3
application/config/common/messageQueue/__init__.py

@@ -1,3 +0,0 @@
-from .mq import MQ
-from .ack_message import ack_message
-from .consumer import get_consumer

+ 0 - 18
application/config/common/messageQueue/ack_message.py

@@ -1,18 +0,0 @@
-from application.config.common import Local
-
-
-def ack_message(mode, platform, recv_msgs, consumer, trace_id=None):
-    """
-    Acknowledge the message after successful consumption
-    """
-    try:
-        receipt_handle_list = [recv_msgs.receipt_handle]
-        consumer.ack_message(receipt_handle_list)
-        Local.init_logger(platform, mode).info(
-            f"[trace_id={trace_id}] Ack {len(receipt_handle_list)} Message Succeed."
-        )
-
-    except MQExceptionBase as err:
-        Local.init_logger(platform, mode).error(
-            f"[trace_id={trace_id}] Ack Message Fail! Exception:{err}"
-        )

+ 0 - 25
application/config/common/messageQueue/consumer.py

@@ -1,25 +0,0 @@
-from mq_http_sdk.mq_client import *
-
-
-def get_consumer(topic_name, group_id):
-    # Initialize the client.
-    mq_client = MQClient(
-        # HTTP endpoint; see the endpoint section of the instance details page in the RocketMQ console.
-        "http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
-        # AccessKey ID, the Alibaba Cloud credential ID.
-        "LTAI4G7puhXtLyHzHQpD6H7A",
-        # AccessKey Secret, the Alibaba Cloud credential secret.
-        "nEbq3xWNQd1qLpdy2u71qFweHkZjSG",
-    )
-    # The topic the messages belong to, created in the RocketMQ console.
-    # topic_name = "${TOPIC}"
-    topic_name = str(topic_name)
-    # The Group ID created in the RocketMQ console.
-    # group_id = "${GROUP_ID}"
-    group_id = str(group_id)
-    # The instance ID the topic belongs to. Required if the instance has a namespace;
-    # pass an empty string if it has none. The namespace is shown on the instance details page.
-    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
-
-    consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
-    return consumer
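
A hedged sketch of a consume/ack loop around the removed get_consumer(), following the calls already used elsewhere in this diff (ack_message in ack_message.py; consume_message per the mq_http_sdk samples). Topic and group values are illustrative:

```python
# Sketch only: poll once, print, then ack. An empty poll raises "MessageNotExist".
from mq_http_sdk.mq_exception import MQExceptionBase
from application.config.common.messageQueue import get_consumer

consumer = get_consumer("example_topic", "GID_example")
try:
    msgs = consumer.consume_message(1, 10)  # batch=1, long-poll up to 10s
    for msg in msgs:
        print(msg.message_id, msg.message_body)
    consumer.ack_message([m.receipt_handle for m in msgs])
except MQExceptionBase as e:
    if getattr(e, "type", "") != "MessageNotExist":  # empty poll is normal
        raise
```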

+ 0 - 51
application/config/common/messageQueue/mq.py

@@ -1,51 +0,0 @@
-import json
-from mq_http_sdk.mq_exception import MQExceptionBase
-from mq_http_sdk.mq_producer import TopicMessage
-from mq_http_sdk.mq_client import MQClient
-import traceback
-from application.config.common.log import Local
-from application.config.common.log import AliyunLogger
-
-
-class MQ(object):
-    """
-    MQ Class
-    """
-    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
-
-    def __init__(self, topic_name) -> None:
-        self.mq_client = MQClient("http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
-                                  "LTAI4G7puhXtLyHzHQpD6H7A",
-                                  "nEbq3xWNQd1qLpdy2u71qFweHkZjSG")
-        topic_name = topic_name+"_v2"
-        self.producer = self.mq_client.get_producer(self.instance_id, topic_name)
-
-    def send_msg(self, video_dict, max_retries = 3):
-        """
-        Send to MQ (and record in Redis)
-        :param video_dict:
-        """
-        strategy = video_dict["strategy"]
-        platform = video_dict["platform"]
-        self.aliyun_log = AliyunLogger(mode=strategy, platform=platform)
-        for retry in range(max_retries):
-            try:
-                msg = TopicMessage(json.dumps(video_dict))
-                message_key = "{}-{}-{}".format(platform, strategy, video_dict['out_video_id'])
-                msg.set_message_key(message_key)
-                re_msg = self.producer.publish_message(msg)
-                Local.init_logger(platform,strategy).info("Publish Message Succeed. MessageID:%s, BodyMD5:%s\n" %
-                                                      (re_msg.message_id, re_msg.message_body_md5))
-                return
-            except MQExceptionBase as e:
-                tb = traceback.format_exc()
-                # log only if the final retry fails
-                if retry == max_retries - 1:
-                    Local.init_logger(platform, strategy).error(
-                        f"Publish Message Fail after {max_retries} attempts. Exception: {e}\n{tb}"
-                    )
-                    self.aliyun_log.logging(
-                        code="5005",
-                        message=f"Publish Message Fail after {max_retries} attempts. Exception: {e}",
-                        data= tb
-                    )
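
A hedged sketch of publishing through the removed MQ wrapper. The field values are illustrative, but "platform", "strategy" and "out_video_id" are the keys send_msg() actually reads above; note the producer binds to "<topic_name>_v2":

```python
# Sketch only; valid MQ credentials are required to actually publish.
from application.config.common.messageQueue import MQ

mq = MQ(topic_name="benshanzhufu_recommend")
mq.send_msg({
    "platform": "benshanzhufu",
    "strategy": "recommend",
    "out_video_id": "123456",
    "video_url": "https://example.com/v.mp4",
})
```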

+ 0 - 1
application/config/common/mysql/__init__.py

@@ -1 +0,0 @@
-from .mysql_helper import MysqlHelper

+ 0 - 68
application/config/common/mysql/mysql_helper.py

@@ -1,68 +0,0 @@
-"""
-Database connection and operations
-"""
-import pymysql
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from application.config.common.log import Local
-from application.config.mysql_config import env_dict
-
-
-class MysqlHelper(object):
-    """
-    MySQL helper; env defaults to "prod"
-    """
-    def __init__(self, env="prod", mode='', platform='', action=''):
-        mysql_config = env_dict[env]
-        self.connection = pymysql.connect(
-            host=mysql_config['host'],  # database host (internal address)
-            port=mysql_config['port'],  # port
-            user=mysql_config['user'],  # MySQL user name
-            passwd=mysql_config['passwd'],  # MySQL password
-            db=mysql_config['db'],  # database name
-            charset=mysql_config['charset']  # use utf8 when the stored text is utf8-encoded
-        )
-        self.mode = mode
-        self.platform = platform
-        self.action = action
-
-    def select(self, sql):
-        """
-        Select
-        :param sql:
-        :return:
-        """
-        cursor = self.connection.cursor()
-        cursor.execute(sql)
-        data = cursor.fetchall()
-        return data
-
-    def select_params(self, sql, params=None):
-        cursor = self.connection.cursor()
-        cursor.execute(sql, params or ())  # supports parameterized queries
-        data = cursor.fetchall()
-        return data
-
-    def update(self, sql):
-        """
-        Execute a write statement (insert/update)
-        :param sql:
-        :return:
-        """
-        cursor = self.connection.cursor()
-        try:
-            res = cursor.execute(sql)
-            self.connection.commit()
-            return res
-        except Exception as e:
-            Local.logger(self.mode, self.platform).error(f"update_values异常,进行回滚操作:{e}\n")
-            self.connection.rollback()
-
-    def close(self):
-        """
-        Close the connection
-        """
-        self.connection.close()
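
A minimal usage sketch of the removed helper (table and value are illustrative; select_params is the parameterized variant defined above):

```python
# Hypothetical usage of the deleted MysqlHelper.
from application.config.common.mysql import MysqlHelper

db = MysqlHelper(env="prod", mode="recommend", platform="xng")
rows = db.select_params("SELECT uid FROM xng_uid WHERE uid = %s", ("10001",))
db.close()
```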

+ 0 - 55
application/config/common/mysql/sql.py

@@ -1,55 +0,0 @@
-
-
-from datetime import datetime
-
-from application.config.common.mysql import MysqlHelper
-
-class Sql:
-    """
-    Update user name + avatar
-    """
-    def update_name_url(self, mid, avatar_url, user_name):
-        sql = f""" update xng_uid set avatar_url = "{avatar_url}", user_name="{user_name}" where uid = "{mid}"; """
-        db = MysqlHelper()
-        repeat_video = db.update(sql=sql)
-        if repeat_video:
-            return True
-        return False
-
-    """
-    Insert user name, avatar, and user id
-    """
-
-    def insert_name_url(self, uid, avatar_url, user_name):
-        current_time = datetime.now()
-        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
-        insert_sql = f"""INSERT INTO xng_uid (uid, avatar_url, user_name, data_time) values ('{uid}' ,'{avatar_url}','{user_name}', '{formatted_time}')"""
-        db = MysqlHelper()
-        repeat_video = db.update(sql=insert_sql)
-        if repeat_video:
-            return True
-        return False
-
-    """
-    Check whether the user id exists
-    """
-
-    def select_id(self, uid):
-        sql = f""" select uid from xng_uid where uid = "{uid}"; """
-        db = MysqlHelper()
-        repeat_video = db.select(sql=sql)
-        if repeat_video:
-            return True
-        return False
-
-    """
-    Check whether the user id was already added before
-    """
-
-    def select_id_status(self, uid):
-        sql = f""" select uid from crawler_user_v3 where link = "{uid}"; """
-        db = MysqlHelper()
-        repeat_video = db.select(sql=sql)
-        if repeat_video:
-            return False
-        return True
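
The removed queries interpolate values with f-strings; for comparison, a hedged sketch of the same uid lookup written against the parameterized select_params from the MysqlHelper diff above (not the repo's actual method):

```python
# Sketch: parameterized equivalent of the deleted select_id().
from application.config.common.mysql import MysqlHelper

def select_id_safe(uid):
    db = MysqlHelper()
    rows = db.select_params("SELECT uid FROM xng_uid WHERE uid = %s", (uid,))
    return bool(rows)
```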

+ 0 - 2
application/config/common/proxies/__init__.py

@@ -1,2 +0,0 @@
-from .fast_proxy import tunnel_proxies
-from .fast_proxy import haiwai_tunnel_proxies

+ 0 - 23
application/config/common/proxies/fast_proxy.py

@@ -1,23 +0,0 @@
-def tunnel_proxies():
-    # tunnel host:port
-    tunnel = "q796.kdltps.com:15818"
-    # username/password auth
-    username = "t17772369458618"
-    password = "5zqcjkmy"
-    proxies = {
-        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
-        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
-    }
-    return proxies
-
-
-def haiwai_tunnel_proxies():
-    tunnel = "c101.kdlfps.com:18866"
-    # username/password auth
-    username = "f2801246645"
-    password = "q0i0ohnl"
-    proxies = {
-        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
-        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
-    }
-    return proxies

+ 0 - 55
application/config/common/redis/pyredis.py

@@ -1,55 +0,0 @@
-"""
-Redis client Python
-@author luojunhui
-"""
-import redis
-
-
-class RedisClient(object):
-    """
-    Redis client by python
-    Todo: keep the business unaffected if the Redis service goes down.
-    Idea: before each Redis call, check that the connection is alive; on failure skip Redis so nothing global breaks.
-    """
-
-    def __init__(self):
-        self.pool = None
-        # self.host = 'r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com'
-        self.host="r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com",  # 外网地址
-        self.port = 6379
-        self.db = 2
-        self.password = 'Wqsd@2019'
-
-    def connect(self):
-        """
-        connect to redis server
-        :return: bool
-        """
-        try:
-            self.pool = redis.Redis(host=self.host, port=self.port, db=self.db, password=self.password)
-            return True
-        except Exception as e:
-            print("connect to redis fail, the reason is {}".format(e))
-            return False
-
-    def select(self, key):
-        """
-        read info from redis
-        :return:
-        """
-        return self.pool.get(key)
-
-    def insert(self, key, value, expire_time):
-        """
-        insert info from redis
-        :return:
-        """
-        self.pool.set(key, value, expire_time)
-
-    def delete(self, key):
-        """
-        delete key
-        :param key:
-        :return:
-        """
-        self.pool.delete(key)

+ 0 - 54
application/config/common/redis/redis_helper.py

@@ -1,54 +0,0 @@
-import redis
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-
-class RedisHelper:
-    @classmethod
-    def connect_redis(cls, env):
-        if env == 'hk':
-            redis_pool = redis.ConnectionPool(
-                # host='r-bp154bpw97gptefiqk.redis.rds.aliyuncs.com',  # internal address
-                # host='r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com',  # test address
-                host='r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com',  # public address
-                port=6379,
-                db=2,
-                password='Wqsd@2019'
-            )
-            redis_conn = redis.Redis(connection_pool=redis_pool)
-        elif env == 'prod':
-            redis_pool = redis.ConnectionPool(
-                host='r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com',  # internal address
-                # host='r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com',  # public address
-                port=6379,
-                db=2,
-                password='Wqsd@2019'
-            )
-            redis_conn = redis.Redis(connection_pool=redis_pool)
-        else:
-            redis_pool = redis.ConnectionPool(
-                # host='r-bp154bpw97gptefiqk.redis.rds.aliyuncs.com',  # internal address
-                host='r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com',  # public address
-                port=6379,
-                db=2,
-                password='Qingqu2019'
-            )
-            redis_conn = redis.Redis(connection_pool=redis_pool)
-        return redis_conn
-
-    @classmethod
-    def redis_push(cls, env, task_key, data):
-        redis_conn = cls.connect_redis(env)
-        # print("开始写入数据")
-        redis_conn.lpush(task_key, data)
-        # print("数据写入完成")
-
-    @classmethod
-    def redis_pop(cls, env, task_key):
-        redis_conn = cls.connect_redis(env)
-        if redis_conn.llen(task_key) == 0:
-            return None
-        else:
-            return redis_conn.rpop(task_key)
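
A hypothetical push/pop round trip with the removed RedisHelper (key and payload are illustrative; redis returns bytes):

```python
from application.config.common.redis.redis_helper import RedisHelper

RedisHelper.redis_push("prod", "task:demo", "payload")
print(RedisHelper.redis_pop("prod", "task:demo"))  # b"payload"
```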

+ 0 - 54
application/config/common/redis/xng_redis.py

@@ -1,54 +0,0 @@
-import json
-
-import redis
-
-
-
-class XNGSyncRedisHelper:
-    _pool: redis.ConnectionPool = None
-    _instance = None
-
-    def __init__(self):
-        if not self._instance:
-            self._pool = self._get_pool()
-            self._instance = self
-
-    def _get_pool(self) -> redis.ConnectionPool:
-        if self._pool is None:
-            self._pool = redis.ConnectionPool(
-                # host="r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com",  # 外网地址
-                host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",  # 内网地址
-                port=6379,
-                db=0,
-                password="Wqsd@2019",
-                # password="Qingqu2019",
-
-            )
-        return self._pool
-
-    def get_client(self) -> redis.Redis:
-        pool = self._get_pool()
-        client = redis.Redis(connection_pool=pool)
-        return client
-
-    def close(self):
-        if self._pool:
-            self._pool.disconnect(inuse_connections=True)
-
-
-
-
-def xng_get_video_data():
-    """获取一条id"""
-    task = f"task:xng_video_id"
-    helper = XNGSyncRedisHelper()
-    client = helper.get_client()
-    ret = client.rpop(task)
-    return ret
-
-def xng_in_video_data(ret):
-    """写入"""
-    task = f"task:xng_video_id"
-    helper = XNGSyncRedisHelper()
-    client = helper.get_client()
-    client.rpush(task, ret)
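
A hypothetical round trip through the task:xng_video_id list used above (assumes no concurrent consumers):

```python
from application.config.common.redis.xng_redis import xng_in_video_data, xng_get_video_data

xng_in_video_data("video_123")   # rpush onto the list
print(xng_get_video_data())      # b"video_123" (rpop returns bytes)
```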

+ 0 - 7
application/config/config.py

@@ -1,7 +0,0 @@
-# API configuration
-crawler_api_domain = 'http://8.217.192.46:8889'
-zhufuquanzi_view_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/detail_exposure'
-zhufuquanzi_history_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/detail_history'
-xiaoniangao_view_api = crawler_api_domain + '/crawler/xiao_nian_gao_plus/detail_exposure'
-xiaoniangao_history_api = crawler_api_domain + '/crawler/xiao_nian_gao_plus/detail_history'
-zhufuquanzi_log_upload_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/log_upload'

+ 0 - 30
application/config/ipconfig.py

@@ -1,30 +0,0 @@
-"""
-ipconfig
-Each container and phone must be on the same LAN so that Appium inside the container can reach the phone
-"""
-
-
-def ip_config():
-    ip_dict = {
-        "machine_01": "",
-        "machine_02": "",
-        "machine_03": "",
-        "machine_04": "",
-        "machine_05": "",
-        "machine_06": "",
-        "machine_07": "",
-        "machine_08": "",
-        "machine_09": "",
-        "machine_10": "",
-        "machine_11": "",
-        "machine_12": "",
-        "machine_13": "",
-        "machine_14": "",
-        "machine_15": "",
-        "machine_16": "",
-        "machine_17": "",
-        "machine_18": "",
-        "machine_19": "",
-        "machine_20": ""
-    }
-    return ip_dict

+ 0 - 36
application/config/mysql_config.py

@@ -1,36 +0,0 @@
-"""
-MySQL configuration
-"""
-
-
-# Hong Kong server; not filled in yet
-mysql_hk = {
-    "", ""
-}
-
-# prod environment server address
-mysql_prod = {
-    "host": "rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # database host, internal address
-    # host="rm-bp1159bu17li9hi94ro.mysql.rds.aliyuncs.com",# database host, external address
-    "port": 3306,  # port
-    "user":"crawler",  # MySQL user name
-    "passwd": "crawler123456@",  # MySQL password
-    "db": "piaoquan-crawler",  # database name
-    "charset": "utf8mb4"  # use utf8 when the stored text is utf8-encoded
-}
-# test (dev) environment MySQL server address
-mysql_dev = {
-    "host": "rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",  # database host, internal address
-    # host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # database host, external address
-    "port": 3306,  # port
-    "user":"crawler",  # MySQL user name
-    "passwd": "crawler123456@",  # MySQL password
-    "db": "piaoquan-crawler",  # database name
-    "charset": "utf8mb4"  # use utf8 when the stored text is utf8-encoded
-}
-
-env_dict = {
-    "hk": mysql_hk,
-    "prod": mysql_prod,
-    "dev": mysql_dev
-}

+ 0 - 134
application/etl/download.py

@@ -1,134 +0,0 @@
-"""
-Download videos
-"""
-import os
-import json
-import time
-import asyncio
-from hashlib import md5
-import datetime
-
-import httpx
-import requests
-
-
-class VideoDownloader(object):
-    """
-    Video downloading
-    """
-
-    def __init__(self, video_obj):
-        self.platform = video_obj['platform']
-        self.video_id = video_obj['video_id']
-        self.video_url = video_obj['video_url']
-        self.cover_url = video_obj['cover_url']
-        self.proxy = {
-            "http://": "http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/",
-            "https://": "http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/"
-        }
-        self.max_retry = 5
-
-    def generate_video_path(self):
-        """
-        Generate a unique video path from the video info
-        :return:
-        """
-        index = "{}-{}".format(self.platform, self.video_id)
-        index = md5(index.encode()).hexdigest()
-        temp_dir = "/Users/luojunhui/cyber/automatic_crawler"
-        file_name = "{}.mp4".format(index)
-        date_info = datetime.datetime.today().strftime("%Y%m%d")
-        video_path = os.path.join(temp_dir, date_info, file_name)
-        if os.path.exists(video_path):
-            return
-        else:
-            os.makedirs(os.path.dirname(video_path), exist_ok=True)
-        return video_path
-
-    async def download_video(self):
-        """
-        download video from the web
-        :return:
-        """
-        if self.platform == "fuqiwang":
-            download_path = self.generate_video_path()
-            if download_path:
-                headers = {
-                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.6(0x13080610) XWEB/1156',
-                    'Accept-Encoding': 'identity;q=1, *;q=0',
-                    'Accept': '*/*',
-                    'Sec-Fetch-Site': 'cross-site',
-                    'Sec-Fetch-Mode': 'no-cors',
-                    'Sec-Fetch-Dest': 'video',
-                    'Referer': 'https://servicewechat.com/wxa1431c6e7acdd32d/2/page-frame.html',
-                    'Accept-Language': 'en-US,en;q=0.9',
-                    'Range': 'bytes=0-',
-                }
-                async with httpx.AsyncClient(http2=True, proxies=self.proxy, headers=headers) as client:
-                    try:
-                        response = await client.get(self.video_url, headers=headers)
-                        if response.status_code == 206:
-                            with open(download_path, "wb") as f:
-                                f.write(response.content)
-                        else:
-                            for _ in range(self.max_retry):
-                                response = await client.get(self.video_url, headers=headers, follow_redirects=True)
-                                if response.status_code == 206:
-                                    with open(download_path, "wb") as f:
-                                        f.write(response.content)
-                                    break
-                    except httpx.HTTPError as e:
-                        print(f"An error occurred while downloading: {e}")
-            else:
-                print("视频已经存在")
-
-    def get_by_request(self):
-        """
-        req
-        :return:
-        """
-        download_path = self.generate_video_path()
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.6(0x13080610) XWEB/1156',
-            'Accept-Encoding': 'identity;q=1, *;q=0',
-            'Accept': '*/*',
-            'Sec-Fetch-Site': 'cross-site',
-            'Sec-Fetch-Mode': 'no-cors',
-            'Sec-Fetch-Dest': 'video',
-            'Referer': 'https://servicewechat.com/wxa1431c6e7acdd32d/2/page-frame.html',
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Range': 'bytes=0-',
-        }
-        r = requests.get(
-            url=self.video_url,
-            headers=headers,
-            proxies=self.proxy
-        )
-        print(r.status_code)
-        with open("test.mp4", "wb") as f:
-            f.write(r.content)
-
-
-async def main(video_obj):
-    """
-    Async entry point
-    :param video_obj:
-    :return:
-    """
-    downloader = VideoDownloader(video_obj)
-    await downloader.download_video()
-
-
-if __name__ == '__main__':
-    video_o = {
-        "update_time": 1709784300,
-        "platform": "fuqiwang",
-        "video_id": 142599,
-        "title": "🔴3·8妇女节,最美的祝福,送给全天下的女神!",
-        "type": 1,
-        "video_type": 2,
-        "cover_url": "https://znl-video-bos.cdn.bcebos.com/c6f12b49992ef638342065439f55b444/65e93632/picture/20240306/b8b0c1cc262c2394f111650c9f82e35a_thumb.jpg",
-        "video_url": "https://znl-video-bos.cdn.bcebos.com/e368801a814c548e443835086d37caaf/65e93632/video/20240306/820ee1498e3ed2a59d37aed54d39ae95_1.mp4",
-    }
-    VideoDownloader(video_obj=video_o).get_by_request()
-    # asyncio.run(main(video_obj=video_o))

+ 0 - 3
application/functions/__init__.py

@@ -1,3 +0,0 @@
-from .get_redirect_url import get_redirect_url
-from .clean_title import clean_title
-from .read_mysql_config import get_config_from_mysql

+ 0 - 69
application/functions/async_mysql_service.py

@@ -1,69 +0,0 @@
-# application/functions/async_mysql_service.py
-import asyncio
-import json
-import os
-from typing import List, Optional, Dict, Any
-from application.base.async_mysql_client import AsyncMySQLClient
-from utils.env_loader import load_env
-
-
-class AsyncMysqlService:
-    """
-    Business-logic wrapper built on AsyncMySQLClient for async database access
-
-    Features:
-    - Encapsulates business-related SQL operations
-    - Initializes its config automatically from environment variables
-    - Decoupled from the crawler and task-processing logic
-    """
-
-    def __init__(self):
-        """
-        Read the config from environment variables and build the underlying pooled client
-        """
-        db_config = {
-            "host": os.getenv("DB_HOST"),
-            "port": int(os.getenv("DB_PORT")),
-            "user": os.getenv("DB_USER"),
-            "password": os.getenv("DB_PASSWORD"),
-            "db": os.getenv("DB_NAME"),
-            "charset": os.getenv("DB_CHARSET")
-        }
-        self.client = AsyncMySQLClient(**db_config)
-
-    async def init(self):
-        """连接池初始化,在服务启动时调用一次"""
-        await self.client.init_pool()
-
-    async def get_user_list(self,id) -> List[Dict[str, Any]]:
-        sql = "SELECT uid, link, nick_name from crawler_user_v3 where task_id = %s"
-        return await self.client.fetch_all(sql, [id])
-
-    async def get_rule_dict(self, rule_id: int) -> Optional[Dict[str, Any]]:
-        sql = "SELECT rule FROM crawler_task_v3 WHERE id = %s"
-        row = await self.client.fetch_one(sql, [rule_id])
-        if not row or "rule" not in row:
-            return None
-
-        try:
-            # merge the list[dict] into a single dict
-            return {k: v for item in json.loads(row["rule"]) for k, v in item.items()}
-        except json.JSONDecodeError as e:
-            print(f"[get_rule_dict] JSON 解析失败: {e}")
-            return None
-
-
-async def main():
-    mysql_service = AsyncMysqlService()
-    await mysql_service.init()
-    users = await mysql_service.get_user_list(18)
-    rules = await mysql_service.get_rule_dict(18)
-    print(users)
-    await mysql_service.client.close()
-
-
-if __name__ == '__main__':
-    asyncio.run(main())
-
-
-
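
A worked example of the rule merge inside get_rule_dict(); the JSON shape (a list of single-key dicts) is assumed from how the comprehension consumes it:

```python
import json

raw = '[{"duration": {"min": 30, "max": 0}}, {"play_cnt": {"min": 1000, "max": 0}}]'
merged = {k: v for item in json.loads(raw) for k, v in item.items()}
print(merged)  # {'duration': {'min': 30, 'max': 0}, 'play_cnt': {'min': 1000, 'max': 0}}
```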

+ 0 - 9
application/functions/get_redirect_url.py

@@ -1,9 +0,0 @@
-import requests
-
-
-def get_redirect_url(url):
-    res = requests.get(url, allow_redirects=False)
-    if res.status_code == 302 or res.status_code == 301:
-        return res.headers['Location']
-    else:
-        return url

+ 0 - 147
application/functions/mysql_service.py

@@ -1,147 +0,0 @@
-import json
-import traceback
-
-from application.config.common import MysqlHelper, AliyunLogger,Local
-
-
-class MysqlService:
-    def __init__(self,platform, mode, task_id):
-        self.env = "prod"
-        self.task_id = task_id
-        self.mode = mode
-        self.platform = platform
-        self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
-        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
-        self.logger = Local.init_logger(platform=self.platform, mode=self.mode, log_level="INFO", log_to_console=True)
-        Local.init_logger(self.platform,self.mode)
-
-
-    def get_rule_dict(self):
-        """
-        :return: the task's rule dict (task_rule)
-        """
-        rule_dict = {}
-        task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
-        data = self.MySQL.select(task_rule_sql)
-        if data:
-            rule_list = json.loads(data[0][0])
-            for item in rule_list:
-                for key in item:
-                    rule_dict[key] = item[key]
-        self.aliyun_log.logging(
-            code=1000,
-            message="抓取规则",
-            data=rule_dict
-        )
-        return rule_dict
-
-
-    def get_user_list(self):
-        """
-        :return: the user list
-        """
-        task_user_list_sql = f"SELECT uid, link, nick_name from crawler_user_v3 where task_id = {self.task_id};"
-        uid_list = self.MySQL.select(task_user_list_sql)
-        user_list = [{"uid": i[0], "link": i[1], "nick_name": i[2]} for i in uid_list] if uid_list else []
-        self.aliyun_log.logging(
-            code=1000,
-            message="用户列表",
-            data=user_list
-        )
-        return user_list
-
-    def check_user_id(self, uid):
-        """
-        Check whether the given user ID exists in the zqkd_user table.
-
-        :param uid: the user ID to check
-        :return: True if the user ID exists, otherwise False
-        """
-        try:
-            query_sql = f""" SELECT uid FROM zqkd_user WHERE uid = "{uid}"; """
-            result = self.mysql.select(sql=query_sql)
-            return bool(result)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"检查用户ID失败: {e}\n{tb}")
-            return False
-
-    def update_user(self, uid, user_name, avatar_url):
-        """
-        Update the user name and avatar URL of the given user.
-
-        :param uid: the user ID to update
-        :param user_name: the new user name
-        :param avatar_url: the new avatar URL
-        :return: the update result (usually the affected row count) on success; None or an exception on failure
-        """
-        try:
-            update_sql = f""" UPDATE zqkd_user SET avatar_url = "{avatar_url}", user_name = "{user_name}" WHERE uid = "{uid}"; """
-            return self.MySQL.update(sql=update_sql)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.logger.error(f"更新用户信息失败: {e}\n{tb}")
-            return None
-
-    def insert_user(self, uid, user_name, avatar_url):
-        """
-        Insert or update user info in the zqkd_user table
-
-        :param uid: user ID (numeric)
-        :param user_name: user name
-        :param avatar_url: avatar URL
-        :return: affected row count on success, None on failure
-        """
-        try:
-            # SQL built by direct string concatenation (not recommended: SQL injection risk)
-            insert_sql = f"""
-                  INSERT INTO zqkd_user (uid, avatar_url, user_name) 
-                  VALUES ({uid}, '{avatar_url.replace("'", "''")}', '{user_name.replace("'", "''")}') 
-                  ON DUPLICATE KEY UPDATE 
-                  user_name = '{user_name.replace("'", "''")}', 
-                  avatar_url = '{avatar_url.replace("'", "''")}'
-              """
-            return self.MySQL.update(sql=insert_sql)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.logger.error(f"插入用户信息失败: {e}\n{tb}")
-            return None
-
-    def get_today_videos(self):
-        try:
-            # manually escape single quotes (only partially mitigates the risk)
-
-            sql = """
-                          SELECT count(*) as cnt
-                          FROM crawler_video 
-                          WHERE create_time >= CURDATE() 
-                            AND create_time < CURDATE() + INTERVAL 1 DAY 
-                            AND platform = %s 
-                            AND strategy = %s
-                      """
-            result = self.MySQL.select_params(sql, (self.platform, self.mode))
-            if result and len(result) > 0:
-                return result[0][0]  # return the count from the first row, first column
-            return 0  # return 0 when there is no result
-        except Exception as e:
-            self.logger.error(f"查询失败: {e}")
-            return 0
-
-    def select_user(self, last_scanned_id=0):
-        """
-        Query user rows based on last_scanned_id
-        :param last_scanned_id: the last scanned ID; 0 means start from the beginning
-        :return: the result list
-        """
-        try:
-            # build the query (filtered by last_scanned_id)
-            query = "SELECT id, uid FROM zqkd_user"
-            if last_scanned_id > 0:
-                query += f" WHERE id > {last_scanned_id}"
-            query += " ORDER BY id ASC"
-
-            return self.MySQL.select(query)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.logger.error(f"查询用户列表失败: {e}\n{tb}")
-            return []

+ 0 - 46
application/functions/read_mysql_config.py

@@ -1,46 +0,0 @@
-import json
-
-from application.config.common import MysqlHelper
-
-
-def get_config_from_mysql(log_type, source, text):
-    """
-    :param log_type: mode
-    :param source: platform
-    :param text:
-    :return:
-    """
-    select_sql = f"""select config from crawler_config where source="{source}" """
-    MySQL = MysqlHelper(mode=log_type, platform=select_sql)
-    configs = MySQL.select(select_sql)
-    title_list = []
-    filter_list = []
-    emoji_list = []
-    search_word_list = []
-    for config in configs:
-        config_dict = json.loads(config[0])
-        for k, v in config_dict.items():
-            if k == "title":
-                title_list_config = v.split(",")
-                for title in title_list_config:
-                    title_list.append(title)
-            if k == "filter":
-                filter_list_config = v.split(",")
-                for filter_word in filter_list_config:
-                    filter_list.append(filter_word)
-            if k == "emoji":
-                emoji_list_config = v.split(",")
-                for emoji in emoji_list_config:
-                    emoji_list.append(emoji)
-            if k == "search_word":
-                search_word_list_config = v.split(",")
-                for search_word in search_word_list_config:
-                    search_word_list.append(search_word)
-    if text == "title":
-        return title_list
-    elif text == "filter":
-        return filter_list
-    elif text == "emoji":
-        return emoji_list
-    elif text == "search_word":
-        return search_word_list

+ 0 - 240
application/functions/zqkd_db_redis.py

@@ -1,240 +0,0 @@
-import os
-import sys
-import threading
-import traceback
-from datetime import timedelta
-
-import redis
-
-from application.config.common import Local
-
-sys.path.append(os.getcwd())
-
-from application.config.common import MysqlHelper
-
-
-class DatabaseOperations:
-    def __init__(self, mode, platform):
-        self.mysql = MysqlHelper(mode=mode, platform=platform)
-        self.LocalLog = Local.logger(platform, mode)
-        self.mode = mode
-        self.platform = platform
-
-    def check_user_id(self, uid):
-        """
-        Check whether the given user ID exists in the zqkd_user table.
-
-        :param uid: the user ID to check
-        :return: True if the user ID exists, otherwise False
-        """
-        try:
-            query_sql = f""" SELECT uid FROM zqkd_user WHERE uid = "{uid}"; """
-            result = self.mysql.select(sql=query_sql)
-            return bool(result)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"检查用户ID失败: {e}\n{tb}")
-            return False
-
-    def update_user(self, uid, user_name, avatar_url):
-        """
-        Update the user name and avatar URL of the given user.
-
-        :param uid: the user ID to update
-        :param user_name: the new user name
-        :param avatar_url: the new avatar URL
-        :return: the update result (usually the affected row count) on success; None or an exception on failure
-        """
-        try:
-            update_sql = f""" UPDATE zqkd_user SET avatar_url = "{avatar_url}", user_name = "{user_name}" WHERE uid = "{uid}"; """
-            return self.mysql.update(sql=update_sql)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"更新用户信息失败: {e}\n{tb}")
-            return None
-
-    def insert_user(self, uid, user_name, avatar_url):
-        """
-        Insert or update user info in the zqkd_user table
-
-        :param uid: user ID (numeric)
-        :param user_name: user name
-        :param avatar_url: avatar URL
-        :return: affected row count on success, None on failure
-        """
-        try:
-            # SQL built by direct string concatenation (not recommended: SQL injection risk)
-            insert_sql = f"""
-                INSERT INTO zqkd_user (uid, avatar_url, user_name) 
-                VALUES ({uid}, '{avatar_url.replace("'", "''")}', '{user_name.replace("'", "''")}') 
-                ON DUPLICATE KEY UPDATE 
-                user_name = '{user_name.replace("'", "''")}', 
-                avatar_url = '{avatar_url.replace("'", "''")}'
-            """
-            return self.mysql.update(sql=insert_sql)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"插入用户信息失败: {e}\n{tb}")
-            return None
-    def get_today_videos(self):
-        try:
-            # manually escape single quotes (only partially mitigates the risk)
-
-            sql = """
-                        SELECT count(*) as cnt
-                        FROM crawler_video 
-                        WHERE create_time >= CURDATE() 
-                          AND create_time < CURDATE() + INTERVAL 1 DAY 
-                          AND platform = %s 
-                          AND strategy = %s
-                    """
-            result = self.mysql.select_params(sql, (self.platform,self.mode))
-            if result and len(result) > 0:
-                return result[0][0]  # return the count from the first row, first column
-            return 0  # return 0 when there is no result
-        except Exception as e:
-            self.LocalLog.error(f"查询失败: {e}")
-            return 0
-    def select_user(self, last_scanned_id=0):
-        """
-        Query user rows based on last_scanned_id
-        :param last_scanned_id: the last scanned ID; 0 means start from the beginning
-        :return: the result list
-        """
-        try:
-            # build the query (filtered by last_scanned_id)
-            query = "SELECT id, uid FROM zqkd_user"
-            if last_scanned_id > 0:
-                query += f" WHERE id > {last_scanned_id}"
-            query += " ORDER BY id ASC"
-
-            return self.mysql.select(query)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"查询用户列表失败: {e}\n{tb}")
-            return []
-
-
-class RedisOperations:
-    _pool: redis.ConnectionPool = None
-    _instance = None
-    _lock = threading.Lock()  # for thread-safe singleton creation
-
-    @classmethod
-    def get_instance(cls, mode="", platform=""):
-        """线程安全的单例获取方法"""
-        if not cls._instance:
-            with cls._lock:
-                if not cls._instance:
-                    cls._instance = cls(mode, platform)
-        return cls._instance
-
-    def __init__(self, mode, platform):
-        # private constructor; use get_instance() to obtain the instance
-        self.mode = mode
-        self.platform = platform
-        self.LocalLog = Local.logger(self.platform, self.mode)
-        if RedisOperations._instance is not None:
-            raise Exception("请使用 get_instance() 获取实例")
-
-        self._pool = self._get_pool()
-        self.client = redis.Redis(connection_pool=self._pool, decode_responses=True)  # reuse one shared client
-
-    def _get_pool(self) -> redis.ConnectionPool:
-        if self._pool is None:
-            try:
-                self._pool = redis.ConnectionPool(
-                    host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",
-                    port=6379,
-                    db=0,
-                    password="Wqsd@2019",
-                    max_connections=50,  # raised max connection count
-                    socket_timeout=10,
-                    retry_on_timeout=True
-                )
-            except Exception as e:
-                tb = traceback.format_exc()
-                self.LocalLog.error(f"创建Redis连接池失败: {e}\n{tb}")
-                raise
-        return self._pool
-
-    def close(self):
-        """关闭连接池"""
-        try:
-            if self._pool:
-                self._pool.disconnect(inuse_connections=True)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"关闭Redis连接池失败: {e}\n{tb}")
-
-    def get_recommend_video(self, task="task:zqkd_video_id"):
-        """从Redis的指定列表中弹出并返回最左边的视频ID"""
-        try:
-            value_bytes = self.client.rpop(task)
-            value_str = value_bytes.decode('utf-8')
-            return value_str
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"获取推荐视频ID失败: {e}\n{tb}")
-            return None
-
-    def check_video_id_exists(self, videoID):
-        """检查指定的视频ID是否已经存在于Redis中"""
-        try:
-            key = f"crawler:zqkd:{videoID}"
-            return self.client.exists(key)
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"检查视频ID是否存在失败: {e}\n{tb}")
-            return False
-
-    def save_video_id(self, videoID):
-        """将视频ID存储到Redis中,并为其设置3天的过期时间"""
-        try:
-            key = f"crawler:zqkd:{videoID}"
-            expiration_time = int(timedelta(days=3).total_seconds())
-            self.client.setex(key, expiration_time, "1")
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"保存视频ID失败: {e}\n{tb}")
-
-    def save_recommend_video(self, videoID):
-        """将推荐视频ID添加到Redis的指定列表中,并为该列表设置2天的过期时间"""
-        try:
-            task = "task:zqkd_video_id"
-            pipe = self.client.pipeline()  # run multiple commands through a pipeline
-            pipe.rpush(task, videoID)
-            pipe.expire(task, int(timedelta(days=2).total_seconds()))
-            pipe.execute()
-
-            # verify the write succeeded
-            list_length = self.client.llen(task)
-            self.LocalLog.info(f"Recommended video ID saved; list length: {list_length}")
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"保存推荐视频ID失败: {e}\n{tb}")
-
-    def get_last_scanned_id(self):
-        """获取上次扫描的ID"""
-        try:
-            return self.client.get("zqkd_last_scanned_id").decode('utf-8')
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"获取上次扫描的ID失败: {e}\n{tb}")
-            return None
-
-    def set_last_scanned_id(self, last_scanned_id):
-        """设置上次扫描的ID"""
-        try:
-            result = self.client.set("zqkd_last_scanned_id", last_scanned_id)
-            if result:
-                self.LocalLog.info(f"成功设置上次扫描的ID: {last_scanned_id}")
-        except Exception as e:
-            tb = traceback.format_exc()
-            self.LocalLog.error(f"设置上次扫描的ID失败: {e}\n{tb}")
-            return False
-
-
-if __name__ == '__main__':
-    db = DatabaseOperations("author", "zhongqingkandianauthor")
-    print(db.get_today_videos())

+ 0 - 1
application/items/__init__.py

@@ -1 +0,0 @@
-from .item import VideoItem

+ 0 - 94
application/items/item.py

@@ -1,94 +0,0 @@
-import time
-from application.functions import clean_title
-
-
-class VideoItem(object):
-    """
-    function: when a video is scanned in, normalize its basic info so the video_dict sent to the pipeline and ETL is correct
-    __init__: initialize an empty JSON-like dict used to store the video info
-    add_video_info: store one piece of video info on the item
-    check_item: validate and post-process each field of the item
-    """
-
-    def __init__(self):
-        self.item = {}
-
-    def add_video_info(self, key, value):
-        self.item[key] = value
-
-    def check_item(self):
-        """
-        Validate the fields on the item.
-        Fields fall into 3 groups:
-        1. Must contain data: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
-        2. Default to 0 when absent: ["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
-        3. Post-processed fields: video_title, publish_time
-        """
-        if self.item.get("video_title"):
-            self.item["video_title"] = clean_title(self.item["video_title"])
-        else:
-            return False
-        if self.item.get("publish_time_stamp"):
-            publish_time_str = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
-            )
-            self.add_video_info("publish_time_str", publish_time_str)
-        else:
-            publish_time_stamp = int(time.time())
-            publish_time_str = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
-            )
-            self.add_video_info("publish_time_stamp", publish_time_stamp)
-            self.add_video_info("publish_time_str", publish_time_str)
-        self.add_video_info("publish_time", publish_time_str)
-        if not self.item.get("update_time_stamp"):
-            self.add_video_info("update_time_stamp", int(time.time()))
-
-        # default to 0 when absent
-        config_keys = [
-            "duration",
-            "play_cnt",
-            "like_cnt",
-            "comment_cnt",
-            "share_cnt",
-            "width",
-            "height",
-        ]
-        for config_key in config_keys:
-            if self.item.get(config_key):
-                continue
-            else:
-                self.add_video_info(config_key, 0)
-
-        # required fields; validation fails if any is missing
-        must_keys = [
-            "video_id",
-            "user_id",
-            "user_name",
-            "out_video_id",
-            "session",
-            "video_url",
-            "cover_url",
-            "platform",
-            "strategy",
-        ]
-        """
-        video_id and out_video_id are both the off-platform video id
-        usr_id: on-platform user id
-        out_user_id: off-platform user id
-        user_name: off-platform user name
-        """
-        for m_key in must_keys:
-            if self.item.get(m_key):
-                continue
-            else:
-                # print(m_key)
-                return False
-        return True
-
-    def produce_item(self):
-        flag = self.check_item()
-        if flag:
-            return self.item
-        else:
-            return False
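
A hedged sketch of the removed VideoItem flow; the field values are illustrative, and the required keys are taken from must_keys above:

```python
from application.items import VideoItem

item = VideoItem()
fields = {
    "video_id": "v1", "user_id": "u1", "user_name": "demo_user",
    "out_video_id": "v1", "session": "s-0",
    "video_url": "https://example.com/v.mp4",
    "cover_url": "https://example.com/c.jpg",
    "platform": "demo", "strategy": "recommend", "video_title": "hello",
}
for key, value in fields.items():
    item.add_video_info(key, value)

video_dict = item.produce_item()  # dict on success, False if a required field is missing
```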

+ 0 - 2
application/pipeline/__init__.py

@@ -1,2 +0,0 @@
-from .pipeline_dev import PiaoQuanPipelineTest
-from .pipeline import PiaoQuanPipeline

+ 0 - 272
application/pipeline/pipeline.py

@@ -1,272 +0,0 @@
-import re
-import sys
-import os
-import time
-
-from application.config.common.feishu.feishu_utils import FeishuUtils
-
-sys.path.append(os.getcwd())
-from datetime import datetime
-
-from application.config.common import MysqlHelper, AliyunLogger
-from application.config.common import RedisClient
-
-
-class PiaoQuanPipeline(object):
-    """
-    Crawler pipeline: crawl-rule checks
-    """
-
-    def __init__(self, platform, mode, rule_dict, env, item, trace_id, account=None):
-        self.platform = platform
-        self.mode = mode
-        self.item = item
-        self.rule_dict = rule_dict
-        self.env = env
-        self.trace_id = trace_id
-        self.mysql = MysqlHelper(env=env, mode=mode, platform=platform)
-        self.aliyun_log = AliyunLogger(platform=platform, mode=mode, env=env)
-        self.account = account
-        self.red = RedisClient()
-
-    def feishu_time_list(self):
-        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "RuLK77")
-        for row in summary[1:]:
-            channel = row[0]
-            day_count = row[1]
-            if channel:
-                if channel == self.platform:
-                    return day_count
-            else:
-                return None
-        return None
-
-    def publish_time_flag(self):
-        """
-        Check whether the publish time has expired
-        :return: True or False
-        """
-        # check the publish time
-        publish_time_stamp = self.item["publish_time_stamp"]
-        update_time_stamp = self.item["update_time_stamp"]
-        max_d = self.rule_dict.get("period", {}).get("max", 1000)
-        min_d = self.rule_dict.get("period", {}).get("min", 1000)
-        days = max_d if max_d > min_d else min_d
-        days_time = self.feishu_time_list()
-        if days_time:
-            days = int(days_time)
-        if self.platform == "gongzhonghao":
-            if (
-                    int(time.time()) - publish_time_stamp
-                    > 3600 * 24 * days
-            ) and (
-                    int(time.time()) - update_time_stamp
-                    > 3600 * 24 * days
-            ):
-                self.aliyun_log.logging(
-                    code="2004",
-                    trace_id=self.trace_id,
-                    data=self.item,
-                    message="发布时间超过{}天".format(days),
-                )
-                return False
-        else:
-            if days == 0:
-                publish_time_stamp = int(time.time())  # 示例时间戳
-                is_today = datetime.fromtimestamp(publish_time_stamp).date() == datetime.today().date()
-                if not is_today:
-                    return False
-
-            elif (
-                    int(time.time()) - publish_time_stamp
-                    > 3600 * 24 * days
-            ):
-                self.aliyun_log.logging(
-                    code="2004",
-                    trace_id=self.trace_id,
-                    data=self.item,
-                    message="发布时间超过{}天".format(days),
-                )
-                return False
-        return True
-
-    def title_flag(self):
-        """
-        视频标题是否满足需求
-        :return:
-        """
-        title = self.item["video_title"]
-        cleaned_title = re.sub(r"[^\w]", " ", title)
-        # 敏感词
-        # 获取敏感词列表
-        sensitive_words = []
-        if any(word in cleaned_title for word in sensitive_words):
-            self.aliyun_log.logging(
-                code="2003",
-                trace_id=self.trace_id,
-                message="标题中包含敏感词",
-                data=self.item,
-                account=self.account
-            )
-            return False
-        return True
-
-    def download_rule_flag(self):
-        """
-        视频基础下载规则
-        :return:
-        """
-        for key in self.item:
-            if self.rule_dict.get(key):
-                max_value = (
-                    int(self.rule_dict[key]["max"])
-                    if int(self.rule_dict[key]["max"]) > 0
-                    else 999999999999999
-                )
-                if key == "peroid":  # peroid是抓取周期天数
-                    continue
-                else:
-                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
-                    if not flag:
-                        self.aliyun_log.logging(
-                            code="2004",
-                            trace_id=self.trace_id,
-                            data=self.item,
-                            message="{}: {} <= {} <= {}, {}".format(
-                                key,
-                                self.rule_dict[key]["min"],
-                                self.item[key],
-                                max_value,
-                                flag,
-                            ),
-                            account=self.account
-                        )
-                        return flag
-            else:
-                continue
-        return True
-
-    def feishu_list(self):
-        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "letS93")
-        for row in summary[1:]:
-            channel = row[0]
-            day_count = row[1]
-            if channel:
-                if channel == self.platform:
-                    return day_count
-            else:
-                return None
-        return None
-
-    # 按照某个具体平台来去重
-    def repeat_video(self):
-        """
-        视频是否重复
-        :return:
-        """
-        out_id = self.item["out_video_id"]
-        day_count = self.feishu_list()
-        if day_count:
-            sql_2 = f"""select create_time from crawler_video where platform = "{self.platform}" and  out_video_id="{out_id}" AND create_time >= DATE_SUB(NOW(), INTERVAL {int(day_count)} DAY);"""
-            repeat_video = self.mysql.select(sql=sql_2)
-            if repeat_video:
-                self.aliyun_log.logging(
-                    code="2002",
-                    trace_id=self.trace_id,
-                    message="重复的视频",
-                    data=self.item,
-                    account=self.account
-                )
-                return False
-            else:
-                return True
-
-        if self.platform == "zhufuniannianshunxinjixiang" or  self.platform == "weiquanshipin" or  self.platform == "piaoquangushi" or  self.platform == "lepaoledong" or  self.platform == "zhufukuaizhuan" or self.platform == "linglingkuailezhufu" or self.platform == "lepaoledongdijie":
-            return True
-        if self.platform == "jierizhufuhuakaifugui" or self.platform == "yuannifuqimanman" or self.platform == "haoyunzhufuduo" or self.platform == "quzhuan" or self.platform == "zhufudewenhou" or self.platform == "jierizhufuxingfujixiang" or self.platform == "haoyoushipin" or self.platform == "xinshiquan" or self.platform == "laonianshenghuokuaile" or self.platform == "laonianquan":
-            return True
-        if self.platform == "zhuwanwufusunew" and self.mode == "recommend":
-            return True
-        if self.platform == "jixiangxingfu" and self.mode == "recommend":
-            return True
-        if self.platform == "yuannifuqichangzai" and self.mode == "recommend":
-            return True
-        if self.platform == "benshanzhufu" and self.mode == "recommend":
-            return True
-        if self.platform == "zuihaodesongni" and self.mode == "recommend":
-            return True
-        if self.platform == "tiantianjufuqi" and self.mode == "recommend":
-            return True
-        # 判断加上标题去重
-        if self.mode == "recommend" and self.platform == "zhufuhaoyunbaofu":
-            title = self.item["video_title"]
-            sql = f""" select 1 from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}" and video_title="{title}"; """
-        else:
-            sql = f""" select 1 from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
-        repeat_video = self.mysql.select(sql=sql)
-        if repeat_video:
-            # 喜事多多平台 4 天去重一次
-            if self.platform == "xishiduoduo":
-                sql_2 = f"""select create_time from crawler_video where out_video_id="{out_id}";"""
-                video_time = self.mysql.select(sql=sql_2)[0][0].timestamp()
-                if int(time.time()) - video_time >= 86400 * 4:
-                    return True
-            # 小年糕推荐流和祝福圈子推荐流 3 天去重一次
-            elif self.platform == "xiaoniangaotuijianliu" or self.platform == "zhufuquanzituijianliu":
-                sql_2 = f"""select create_time from crawler_video where out_video_id="{out_id}";"""
-                video_time = self.mysql.select(sql=sql_2)[0][0].timestamp()
-                if int(time.time()) - video_time >= 86400 * 3:
-                    return True
-            self.aliyun_log.logging(
-                code="2002",
-                trace_id=self.trace_id,
-                message="重复的视频",
-                data=self.item,
-                account=self.account
-            )
-            return False
-        return True
-
-    # def mq_exists(self):
-    #     """
-    #     检测 mq 是否已经发送过了
-    #     :return:
-    #     """
-    #     if self.red.connect():
-    #         index_txt = "{}-{}".format(self.platform, self.item['video_id'])
-    #         index_md5 = hashlib.md5(index_txt.encode()).hexdigest()
-    #         if self.red.select(index_md5):
-    #             self.aliyun_log.logging(
-    #                 code="2007",
-    #                 trace_id=self.trace_id,
-    #                 message="该视频 mq 已经发送"
-    #             )
-    #             return False
-    #         else:
-    #             self.red.insert(index_md5, int(time.time()), 43200)
-    #             return True
-    #     else:
-    #         return True
-
-    def process_item(self):
-        """
-        全规则判断,符合规则的数据则return True
-        :return:
-        """
-        # 判断该 mq 是否已经发了
-        # if not self.mq_exists():
-        #     return False
-        if not self.publish_time_flag():
-            # 记录相关日志
-            return False
-        if not self.title_flag():
-            # 记录相关日志
-            return False
-        if not self.repeat_video():
-            # 记录相关日志
-            return False
-        if not self.download_rule_flag():
-            # 记录相关日志
-            return False
-        return True
-

+ 0 - 112
application/pipeline/pipeline_dev.py

@@ -1,112 +0,0 @@
-import re
-import time
-
-
-class PiaoQuanPipelineTest:
-    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
-        self.platform = platform
-        self.mode = mode
-        self.item = item
-        self.rule_dict = rule_dict
-        self.env = env
-        self.trace_id = trace_id
-
-    # 视频的发布时间限制, 属于是规则过滤
-    def publish_time_flag(self):
-        # 判断发布时间
-        publish_time_stamp = self.item["publish_time_stamp"]
-        update_time_stamp = self.item["update_time_stamp"]
-        if self.platform == "gongzhonghao":
-            if (
-                    int(time.time()) - publish_time_stamp
-                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ) and (
-                    int(time.time()) - update_time_stamp
-                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ):
-                message = "发布时间超过{}天".format(
-                    int(self.rule_dict.get("period", {}).get("max", 1000))
-                )
-                print(message)
-                return False
-        else:
-            if (
-                    int(time.time()) - publish_time_stamp
-                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ):
-                message = "发布时间超过{}天".format(
-                    int(self.rule_dict.get("period", {}).get("max", 1000))
-                )
-                print(message)
-                return False
-        return True
-
-    # 视频标题是否满足需求
-    def title_flag(self):
-        title = self.item["video_title"]
-        cleaned_title = re.sub(r"[^\w]", " ", title)
-        # 敏感词
-        # 获取敏感词列表
-        sensitive_words = []
-        if any(word in cleaned_title for word in sensitive_words):
-            message = "标题中包含敏感词"
-            print(message)
-            return False
-        return True
-
-    # 视频基础下载规则
-    def download_rule_flag(self):
-        for key in self.item:
-            if self.rule_dict.get(key):
-                max_value = (
-                    int(self.rule_dict[key]["max"])
-                    if int(self.rule_dict[key]["max"]) > 0
-                    else 999999999999999
-                )
-                if key == "peroid":  # peroid是抓取周期天数
-                    continue
-                else:
-                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
-                    if not flag:
-                        message = "{}: {} <= {} <= {}, {}".format(
-                            key,
-                            self.rule_dict[key]["min"],
-                            self.item[key],
-                            max_value,
-                            flag,
-                        )
-                        print(message)
-                        return flag
-            else:
-                continue
-        return True
-
-    # 按照某个具体平台来去重
-    def repeat_video(self):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        # out_id = self.item["out_video_id"]
-        # sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
-        # repeat_video = MysqlHelper.get_values(
-        #     log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
-        # )
-        # if repeat_video:
-        #     message = "重复的视频"
-        #     print(message)
-        #     return False
-        return True
-
-    def process_item(self):
-        if not self.publish_time_flag():
-            # 记录相关日志
-            return False
-        if not self.title_flag():
-            # 记录相关日志
-            return False
-        if not self.repeat_video():
-            # 记录相关日志
-            return False
-        if not self.download_rule_flag():
-            # 记录相关日志
-            return False
-        return True
-

+ 0 - 241
application/spiders/base_spider.py

@@ -1,241 +0,0 @@
-import asyncio
-import aiohttp
-from abc import ABC
-from typing import List, Dict, Optional
-import time
-from application.config.common.log.logger_manager import LoggerManager
-from utils.extractors import safe_extract
-from application.config.common import MQ
-from utils.config_loader import ConfigLoader  # 新增导入
-
-
-class BaseSpider(ABC):
-    """
-    通用爬虫基类:支持严格顺序执行流程
-    """
-
-    MAX_RETRIES = 3  # 单个请求最大重试次数
-    TIMEOUT = 30  # 请求超时时间(秒)
-
-    def __init__(self, rule_dict: Dict, user_list: List, trace_id: str, env: str = "prod"):
-        self.trace_id = trace_id
-        self.env = env
-        self.user_list = user_list
-        self.rule_dict = rule_dict
-        self.class_name = self.__class__.__name__  # 获取子类类名
-
-        # 根据类名自动获取配置
-        self.platform_config = ConfigLoader.get_config_by_class_name(self.class_name)
-        if not self.platform_config:
-            raise ValueError(f"找不到对应配置: {self.class_name}")
-
-        # 初始化日志和MQ
-        self.platform = self.platform_config.get("platform")
-        self.mode = self.platform_config.get("mode")
-        self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
-        self.logger.info(f"{trace_id}--初始化爬虫类: {self.class_name}")
-        self.aliyun_logr = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
-        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
-
-        # 请求配置
-        self.method = self.platform_config.get("method", "GET").upper()
-        self.url = self.platform_config.get("url")
-        self.headers = self.platform_config.get("headers", {})
-        self.body = self.platform_config.get("request_body", {})
-        self.field_map = self.platform_config.get("response_parse", {}).get("fields", {})
-        self.data_path = self.platform_config.get("response_parse", {}).get("data_path")
-        self.video_fields_map = self.platform_config.get("video_fields_map", {})
-
-        # 流程控制配置
-        self.loop_times = self.platform_config.get("loop_times", 1)  # 循环次数
-        self.loop_interval = self.platform_config.get("loop_interval", 0)  # 循环间隔(秒)
-
-        self.logger.info(
-            f"{self.trace_id}--配置: 循环{self.loop_times}次,间隔{self.loop_interval}秒")
-
-    async def _send_async_request(self, method: str, url: str, **kwargs) -> aiohttp.ClientResponse:
-        """
-        发送异步HTTP请求,支持重试机制
-        """
-        retries = 0
-        timeout = aiohttp.ClientTimeout(total=self.TIMEOUT)
-
-        while retries < self.MAX_RETRIES:
-            try:
-                async with aiohttp.ClientSession(timeout=timeout) as session:
-                    async with session.request(method, url, **kwargs) as response:
-                        response.raise_for_status()
-                        return response
-            except Exception as e:
-                retries += 1
-                remaining_attempts = self.MAX_RETRIES - retries
-
-                if retries < self.MAX_RETRIES:
-                    self.logger.warning(
-                        f"{self.trace_id}--请求失败 (尝试 {retries}/{self.MAX_RETRIES}): {e}. "
-                        f"剩余尝试次数: {remaining_attempts}"
-                    )
-                    await asyncio.sleep(1)  # 异步等待
-                else:
-                    self.aliyun_logr.logging(
-                        code="5001",
-                        message="请求失败,已达到最大重试次数",
-                        data={
-                            "url": url,
-                            "method": method,
-                            "error": str(e),
-                            "headers": kwargs.get("headers", {}),
-                            "body": kwargs.get("json", {})
-                        },
-                        trace_id=self.trace_id
-                    )
-                    self.logger.error(f"{self.trace_id}--请求失败,已达到最大重试次数: {e}")
-                    raise
-
-    async def crawl_data(self) -> Optional[List[Dict]]:
-        """异步获取视频数据"""
-        self.logger.info(f"{self.trace_id}--开始获取视频数据")
-        try:
-            response = await self._send_async_request(
-                method=self.method,
-                url=self.url,
-                headers=self.headers,
-                json=self.body
-            )
-            result = await response.json()
-            data = safe_extract(result, self.data_path)
-
-            if not data:
-                self.logger.warning(f"{self.trace_id}--未获取到数据,路径: {self.data_path}")
-                return []
-
-            self.logger.info(f"{self.trace_id}--成功获取{len(data)}条视频数据")
-            return data
-        except Exception as e:
-            self.logger.error(f"{self.trace_id}--获取视频数据失败: {e}")
-            return []
-
-    def filter_data(self, video: Dict) -> bool:
-        """校验视频是否符合规则"""
-        if not self.rule_dict:
-            return True
-
-        rule_duration = self.rule_dict.get("duration")
-        if rule_duration:
-            video_url = safe_extract(video, self.video_fields_map.get("video_url"))
-            duration = self.get_video_duration(video_url)
-            if not (rule_duration['min'] <= duration <= rule_duration['max']):
-                return False
-
-        rule_videos_cnt = self.rule_dict.get("videos_cnt")
-        if rule_videos_cnt:
-            video_count = MysqlService(self.platform, self.mode, self.trace_id).get_today_videos()
-            if video_count >= rule_videos_cnt.get("min", 0):
-                return False
-
-        return True
-
-    def process_video(self, video: Dict) -> Optional[Dict]:
-        """处理单条视频数据"""
-        self.logger.debug(f"{self.trace_id}--开始处理视频: {video.get('title', '无标题')}")
-        try:
-            item = {}
-            for field, path in self.field_map.items():
-                value = safe_extract(video, path)
-                if value is None:
-                    self.logger.warning(f"{self.trace_id}--字段提取失败: {field}")
-                    continue
-                item[field] = value
-
-            if not item:
-                self.logger.warning(f"{self.trace_id}--视频处理结果为空")
-                return None
-
-            item.update({
-                "platform": self.platform,
-                "strategy": self.mode,
-                "session": f"{self.platform}-{int(time.time())}"
-            })
-
-            self.logger.debug(f"{self.trace_id}--视频处理成功")
-            return item
-        except Exception as e:
-            self.logger.error(f"{self.trace_id}--视频处理异常: {e}")
-            return None
-
-    def push_to_etl(self, item: Dict) -> bool:
-        """推送数据到ETL(同步)"""
-        self.logger.info(f"{self.trace_id}--开始推送数据到ETL: {item.get('title', '无标题')}")
-        try:
-            self.mq.send_msg(item)
-            self.aliyun_logr.logging(
-                code="1002",
-                message="成功发送至ETL",
-                data=item,
-                trace_id=self.trace_id
-            )
-            self.logger.info(f"{self.trace_id}--数据推送成功")
-            return True
-        except Exception as e:
-            self.logger.error(f"{self.trace_id}--数据推送失败: {e}")
-            return False
-
-    async def run(self):
-        """
-        异步运行爬虫任务,严格按顺序执行
-        1. 爬取
-        2. 过滤
-        3. 处理每条数据
-        4. 推送到ETL
-        """
-        self.logger.info(f"{self.trace_id}--[{self.platform}] 开始执行爬虫任务")
-        for loop_index in range(1, self.loop_times + 1):
-
-            self.logger.info(f"{self.trace_id}--步骤1: 开始第 {loop_index}/{self.loop_times} 次循环请求")
-            loop_start_time = time.time()
-
-            # 步骤1: 获取视频数据(失败则跳过当前循环)
-            video_list = await self.crawl_data()
-            if not video_list:
-                self.logger.warning(f"{self.trace_id}--未获取到视频数据,跳过当前循环")
-                await self._wait_for_next_loop(loop_index)
-                continue
-
-            # 步骤2: 处理每条视频并推送到ETL
-            success_count = 0
-            fail_count = 0
-
-            for video in video_list:
-                # 步骤2.1: 校验视频(失败则跳过)
-                if not self.filter_data(video):
-                    self.logger.debug(f"{self.trace_id}--视频不符合规则,已跳过")
-                    continue
-
-                # 步骤2.2: 处理视频(失败则记录并继续)
-                item = self.process_video(video)
-                if not item:
-                    self.logger.warning(f"{self.trace_id}--视频处理失败,已跳过")
-                    fail_count += 1
-                    continue
-
-                # 步骤2.3: 推送到ETL(失败则记录并继续)
-                if self.push_to_etl(item):
-                    success_count += 1
-                else:
-                    fail_count += 1
-
-            loop_duration = time.time() - loop_start_time
-            self.logger.info(f"{self.trace_id}--第 {loop_index}/{self.loop_times} 次循环完成. "
-                             f"成功: {success_count}, 失败: {fail_count}, 耗时: {loop_duration:.2f}秒")
-
-            # 等待下一次循环
-            await self._wait_for_next_loop(loop_index)
-
-        self.logger.info(f"{self.trace_id}--[{self.platform}] 爬虫任务全部完成")
-        return True
-
-    async def _wait_for_next_loop(self, current_loop: int) -> None:
-        """等待下一次循环请求"""
-        if current_loop < self.loop_times and self.loop_interval > 0:
-            self.logger.info(f"{self.trace_id}--等待 {self.loop_interval} 秒后进行下一次请求")
-            await asyncio.sleep(self.loop_interval)

+ 0 - 6
application/spiders/benshanzhufu_recommend.py

@@ -1,6 +0,0 @@
-from application.spiders.base_spider import BaseSpider
-
-
-class BenshanzhufuRecommend(BaseSpider):
-    def __init__(self, rule_dict, user_list, trace_id):
-        super().__init__(rule_dict, user_list, trace_id)

+ 0 - 21
application/spiders/spider_registry.py

@@ -1,21 +0,0 @@
-from application.spiders.benshanzhufu_recommend import BenshanzhufuRecommend
-
-
-SPIDER_CLASS_MAP = {
-    "bszf_recommend_prod": BenshanzhufuRecommend
-}
-
-
-
-
-
-
-
-
-def get_spider_class(topic: str):
-    """
-    根据 topic 获取对应爬虫类
-    :param topic: MQ 消息的 topic 名称
-    :return: 爬虫类(继承自 BaseSpider)
-    """
-    return SPIDER_CLASS_MAP.get(topic, "未找到对应配置")

+ 3 - 0
config/__init__.py

@@ -0,0 +1,3 @@
+from .prod import settings  # use the production settings by default
+
+__all__ = ['settings']

+ 63 - 0
config/base.py

@@ -0,0 +1,63 @@
+import os
+from pathlib import Path
+from typing import Optional
+from pydantic import BaseSettings, Field, AnyUrl, validator
+from dotenv import load_dotenv
+
+# Load the .env file before the Settings class is evaluated
+load_dotenv(os.path.join(os.path.dirname(__file__), '../.env'))
+
+class Settings(BaseSettings):
+
+    # Environment flag
+    ENV: str = "prod"  # "prod" or "dev"
+    ENABLE_ALIYUN_LOG: bool = True
+
+    # Paths
+    PROJECT_ROOT: Path = Path(__file__).parent.parent.resolve()
+    LOG_DIR: Path = PROJECT_ROOT / "logs"
+
+    # Logging
+    LOG_LEVEL: str = "INFO"
+
+    # Aliyun database settings (RDS)
+    DB_HOST: str = Field(..., env="DB_HOST")
+    DB_PORT: int = Field(3306, env="DB_PORT")
+    DB_USER: str = Field(..., env="DB_USER")
+    DB_PASSWORD: str = Field(..., env="DB_PASSWORD")
+    DB_NAME: str = Field(..., env="DB_NAME")
+    DB_CHARSET: str = Field(..., env="DB_CHARSET")
+    DB_POOL_SIZE: int = 20
+    DB_POOL_RECYCLE: int = 3600
+
+    # Aliyun RocketMQ settings
+    ROCKETMQ_ENDPOINT: AnyUrl = Field(..., env="ROCKETMQ_ENDPOINT")
+    ROCKETMQ_ACCESS_KEY_ID: str = Field(..., env="ROCKETMQ_ACCESS_KEY_ID")
+    ROCKETMQ_ACCESS_KEY_SECRET: str = Field(..., env="ROCKETMQ_ACCESS_KEY_SECRET")
+    ROCKETMQ_INSTANCE_ID: str = Field(..., env="ROCKETMQ_INSTANCE_ID")
+    ROCKETMQ_WAIT_SECONDS: int = 10
+    ROCKETMQ_BATCH: int = 1
+
+    # Connection and request timeouts
+    CONNECTION_TIMEOUT: int = 10
+    REQUEST_TIMEOUT: int = 30
+
+    # Aliyun Log Service credentials
+    ALIYUN_ACCESS_KEY_ID: str = Field(..., env="ALIYUN_ACCESS_KEY_ID")
+    ALIYUN_ACCESS_KEY_SECRET: str = Field(..., env="ALIYUN_ACCESS_KEY_SECRET")
+
+    class Config:
+        env_file = ".env"
+        env_file_encoding = 'utf-8'
+        case_sensitive = False
+
+    # @property
+    # def database_url(self) -> str:
+    #     """Build the database connection string"""
+    #     return f"mysql+asyncmy://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}?charset={self.DB_CHARSET}"
+
+
+settings = Settings()
+
+# Create the log directory automatically
+os.makedirs(settings.LOG_DIR, exist_ok=True)
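
Every `Field(...)` above is required, so `Settings()` raises a `ValidationError` at import time unless the .env file or the process environment supplies all of those keys. A minimal usage sketch, assuming pydantic v1's `BaseSettings` as imported above (the host value is purely illustrative):

# Sketch only: real process env vars win over values loaded from .env by load_dotenv.
import os
os.environ["DB_HOST"] = "rm-demo.mysql.rds.aliyuncs.com"  # hypothetical override

from config.base import settings  # Settings() is instantiated at import time
print(settings.DB_HOST)  # -> "rm-demo.mysql.rds.aliyuncs.com"
print(settings.DB_PORT)  # -> 3306 unless DB_PORT is set elsewhere
print(settings.LOG_DIR)  # -> <project root>/logs, created by the os.makedirs above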

+ 16 - 0
config/prod.py

@@ -0,0 +1,16 @@
+from .base import Settings
+
+
+class ProdSettings(Settings):
+    """生产环境专用配置"""
+
+    LOG_LEVEL: str = "INFO"
+
+    # Disable debug features in production
+    DEBUG: bool = False
+
+    class Config(Settings.Config):
+        env_prefix = "PROD_"
+
+
+settings = ProdSettings()
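
One subtlety worth noting: in pydantic v1, `env_prefix` only applies to fields that do not set an explicit `env=` on their `Field`. So `ProdSettings` reads `PROD_LOG_LEVEL`, `PROD_DEBUG` and so on, while the database and MQ fields keep their unprefixed names from `base.py`. A hedged sketch:

# Sketch: env_prefix affects only fields without an explicit env= (pydantic v1 behavior).
import os
os.environ["PROD_LOG_LEVEL"] = "WARNING"  # picked up: LOG_LEVEL has no explicit env=
os.environ["LOG_LEVEL"] = "DEBUG"         # ignored by ProdSettings
# DB_HOST, ROCKETMQ_* etc. are still read unprefixed, via Field(..., env="...").

from config.prod import settings
print(settings.LOG_LEVEL)  # -> "WARNING"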

+ 107 - 0
config/settings.py

@@ -0,0 +1,107 @@
+import importlib
+
+import yaml
+from pydantic import BaseSettings, Field, validator, BaseModel
+from typing import Dict, Any, Optional, List, Callable
+
+
+class DatabaseConfig(BaseSettings):
+    """数据库配置"""
+    host: str = Field(..., env="DB_HOST")
+    port: int = Field(..., env="DB_PORT")
+    user: str = Field(..., env="DB_USER")
+    password: str = Field(..., env="DB_PASSWORD")
+    db: str = Field(..., env="DB_NAME")
+    charset: str = Field("utf8mb4", env="DB_CHARSET")
+
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+
+
+class RocketMQConfig(BaseSettings):
+    """阿里云 RocketMQ 配置"""
+    endpoint: str = Field(..., env="ROCKETMQ_ENDPOINT")
+    access_key_id: str = Field(..., env="ROCKETMQ_ACCESS_KEY_ID")
+    access_key_secret: str = Field(..., env="ROCKETMQ_ACCESS_KEY_SECRET")
+    instance_id: str = Field(..., env="ROCKETMQ_INSTANCE_ID")
+    wait_seconds: int = Field(10, env="ROCKETMQ_WAIT_SECONDS")
+    batch: int = Field(1, env="ROCKETMQ_BATCH")
+
+    class Config:
+        env_file = ".env"
+
+
+class GlobalConfig(BaseSettings):
+    """全局配置"""
+    env: str = Field("prod", env="ENV")
+    base_url: str = Field("https://api.production.com", env="BASE_URL")
+    request_timeout: int = Field(30, env="REQUEST_TIMEOUT")
+    log_level: str = Field("INFO", env="LOG_LEVEL")
+    enable_aliyun_log: bool = Field(True, env="ENABLE_ALIYUN_LOG")
+
+    class Config:
+        env_file = ".env"
+
+
+class ResponseParse(BaseModel):
+    """数据解析配置"""
+    next_cursor: str = Field(..., description="下一页游标路径")
+    data_path: str = Field(..., description="数据主体路径")
+    fields: Dict[str, str] = Field(..., description="字段映射规则")
+
+
+class PostAction(BaseModel):
+    """Post-crawl action descriptor (minimal placeholder definition)"""
+    name: str
+    params: Dict[str, Any] = {}
+
+
+class PlatformConfig(BaseModel):
+    """Per-platform settings"""
+    platform: str
+    mode: str
+    path: str
+    method: str = "POST"
+    request_body: Dict[str, Any] = {}
+    loop_times: int = 1
+    loop_interval: int = 0
+    response_parse: ResponseParse
+    etl_hook: Optional[Callable] = None  # configured in YAML as a dotted-path string
+    post_actions: Optional[List[PostAction]] = None
+
+    @validator("etl_hook", pre=True)
+    def resolve_etl_hook(cls, v):
+        """动态加载钩子函数"""
+        if not v:
+            return None
+        module_name, func_name = v.rsplit(".", 1)
+        return getattr(importlib.import_module(module_name), func_name)
+
+
+class SpiderConfig(BaseModel):
+    """全局配置容器"""
+    default: dict = Field(...)  # 全局默认配置
+    platforms: Dict[str, PlatformConfig] = {}
+
+    @classmethod
+    def load(cls):
+        """从 YAML 加载配置"""
+        with open("config/config.yaml") as f:
+            raw_config = yaml.safe_load(f)
+        return cls(
+            default=raw_config["default"],
+            platforms=raw_config["platforms"]
+        )
+
+
+class AppSettings(BaseSettings):
+    """Environment-backed settings container (a distinct name so it does not shadow SpiderConfig above)"""
+    default: GlobalConfig
+    database: DatabaseConfig
+    mq: RocketMQConfig
+
+    class Config:
+        env_file = ".env"
+        env_prefix = "SPIDER_"  # 环境变量前缀
+        case_sensitive = False  # 环境变量不区分大小写

+ 1 - 1
configs/spiders_config.yaml → config/spiders_config.yaml

@@ -4,7 +4,7 @@ default:
   headers:
     {"Content-Type": "application/json"}
 
-bszf_recommend_prod:
+benshanzhufurecommend:
   platform: benshanzhufu
   mode: recommend
   path: /crawler/ben_shan_zhu_fu/recommend

+ 0 - 0
configs/topic_map.yaml → config/topic_map.yaml


+ 0 - 42
configs/codes.py

@@ -1,42 +0,0 @@
-
-# === 配置相关 ===
-CONFIG_LOAD_SUCCESS = "1000"
-CONFIG_MISSING_FIELD = "1001"
-CONFIG_FORMAT_ERROR = "1002"
-
-# === 启动流程 ===
-CRAWLER_START = "1100"
-CRAWLER_STOP_BY_LIMIT = "1101"
-
-# === 请求发送 ===
-REQUEST_SEND_SUCCESS = "1200"
-REQUEST_SEND_FAIL = "1201"
-REQUEST_TIMEOUT = "1202"
-REQUEST_RESPONSE_INVALID = "1203"
-
-# === 数据解析 ===
-DATA_PARSE_SUCCESS = "1300"
-DATA_PARSE_FAIL = "1301"
-DATA_FIELD_MISSING = "1302"
-
-# === 视频处理 ===
-VIDEO_PROCESS_SUCCESS = "1400"
-VIDEO_DURATION_INVALID = "1401"
-VIDEO_FIELD_ERROR = "1402"
-VIDEO_USER_CNT_INVALID = "1403"
-
-# === 管道处理 ===
-PIPELINE_SUCCESS = "1500"
-PIPELINE_FAIL = "1501"
-PIPELINE_PUSH_MQ = "1502"
-
-# === 下载限制判断 ===
-LIMIT_REACHED = "1600"
-LIMIT_RULE_CHECK_FAIL = "1601"
-
-# === 后置操作 ===
-POST_ACTION_TRIGGERED = "1700"
-POST_ACTION_FAILED = "1701"
-
-# === 兜底异常 ===
-SYSTEM_UNEXPECTED_ERROR = "9999"

+ 0 - 1
configs/config.py

@@ -1 +0,0 @@
-base_url="http://8.217.192.46:8889"

+ 0 - 36
configs/messages.py

@@ -1,36 +0,0 @@
-# messages.py
-
-MESSAGES = {
-    "1000": "配置加载成功",
-    "1001": "配置缺失必要字段",
-    "1002": "配置格式错误",
-
-    "1100": "开始执行爬虫任务",
-    "1101": "任务终止:达到下载上限",
-
-    "1200": "请求发送成功",
-    "1201": "请求发送失败",
-    "1202": "请求超时",
-    "1203": "接口响应格式异常或返回非0",
-
-    "1300": "数据解析成功",
-    "1301": "数据解析失败",
-    "1302": "关键字段缺失",
-
-    "1400": "视频处理成功",
-    "1401": "视频时长不符合要求",
-    "1402": "视频字段提取失败",
-    "1403": "用户视频数量不满足要求",
-
-    "1500": "数据管道处理成功",
-    "1501": "数据管道处理失败",
-    "1502": "已推送至消息队列",
-
-    "1600": "达到下载数量限制",
-    "1601": "下载规则校验失败",
-
-    "1700": "已触发后置操作",
-    "1701": "后置操作执行失败",
-
-    "9999": "系统内部未知错误"
-}

+ 0 - 0
application/__init__.py → core/__init__.py


+ 0 - 0
application/base/__init__.py → core/base/__init__.py


+ 0 - 0
application/base/async_mysql_client.py → core/base/async_mysql_client.py


+ 1 - 1
application/base/async_rocketmq_consumer.py → core/base/async_rocketmq_consumer.py

@@ -3,7 +3,7 @@ import json
 from typing import List, Optional
 from mq_http_sdk.mq_client import MQClient
 from mq_http_sdk.mq_exception import MQExceptionBase
-from mq_http_sdk.consumer import Message
+from mq_http_sdk.mq_consumer import Message
 
 
 class AsyncRocketMQConsumer:

+ 0 - 0
application/config/common/ffmpeg/__init__.py → core/models/__init__.py


+ 79 - 0
core/models/video_item.py

@@ -0,0 +1,79 @@
+import time
+import uuid
+
+from pydantic import BaseModel, Field
+from typing import Optional
+from services.clean_title import clean_title
+
+
+class VideoItem(BaseModel):
+    """
+    Video data model with field validation and preprocessing
+    - after init, the async `prepare()` method fills in and cleans the fields
+    - `produce_item()` returns the final validated dict
+    """
+
+    video_id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
+    user_id: str
+    user_name: str
+    out_video_id: str
+    out_user_id: Optional[str]
+    video_url: str
+    cover_url: str
+    video_title: str
+    platform: str
+    strategy: str
+    session: Optional[str]
+
+    publish_time_stamp: Optional[int] = None
+    update_time_stamp: Optional[int] = None
+
+    duration: int = 0
+    play_cnt: int = 0
+    like_cnt: int = 0
+    comment_cnt: int = 0
+    share_cnt: int = 0
+    width: int = 0
+    height: int = 0
+
+    publish_time_str: Optional[str] = None
+    publish_time: Optional[str] = None
+
+    async def prepare(self):
+        """
+        Async preprocessing: clean the title, fill in publish and update times
+        """
+        # Clean the title
+        self.video_title = await clean_title(self.video_title)
+
+        # Publish time handling
+        if not self.publish_time_stamp:
+            self.publish_time_stamp = int(time.time())
+
+        self.publish_time_str = time.strftime(
+            "%Y-%m-%d %H:%M:%S", time.localtime(self.publish_time_stamp)
+        )
+        self.publish_time = self.publish_time_str
+
+        # Update timestamp defaults to now
+        if not self.update_time_stamp:
+            self.update_time_stamp = int(time.time())
+
+        if not self.session:
+            self.session = str(f"{self.platform}_{int(time.time())}")
+
+    async def produce_item(self) -> Optional[dict]:
+        """
+        Build the final data dict after prepare(); returns False when a required field is missing
+        """
+        await self.prepare()
+
+        must_fields = [
+            "video_id", "user_id", "user_name", "out_video_id", "session",
+            "video_url", "cover_url", "platform", "strategy"
+        ]
+        for f in must_fields:
+            if not getattr(self, f, None):
+                return False
+
+        return self.dict()
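
A usage sketch for the new model (field values are made up); `produce_item()` returns the validated dict, or `False` when a required field is missing:

# Sketch only: illustrative values, not from the commit.
import asyncio
from core.models.video_item import VideoItem

async def demo():
    item = VideoItem(
        user_id="10001", user_name="demo-user", out_video_id="v-100",
        video_url="https://example.com/v.mp4", cover_url="https://example.com/c.jpg",
        video_title="Hello  World\n", platform="benshanzhufu", strategy="recommend",
    )
    data = await item.produce_item()
    print(data["video_title"])  # -> "HelloWorld" after clean_title strips whitespace
    print(data["session"])      # -> "benshanzhufu_<timestamp>", filled by prepare()

asyncio.run(demo())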

+ 0 - 0
application/config/common/redis/__init__.py → core/utils/__init__.py


+ 46 - 0
core/utils/config_loader.py

@@ -0,0 +1,46 @@
+import yaml
+import os
+from urllib.parse import urljoin
+from core.utils.path_utils import config_spiders_path
+
+
+class ConfigLoader:
+    _config = None
+    _config_path = config_spiders_path
+
+    @classmethod
+    def _load_yaml(cls):
+        if not os.path.exists(cls._config_path):
+            raise FileNotFoundError(f"[配置错误] 找不到配置文件: {cls._config_path}")
+        with open(cls._config_path, "r", encoding="utf-8") as f:
+            cls._config = yaml.safe_load(f)
+
+    @classmethod
+    def get_platform_config(cls, platform: str) -> dict:
+        """
+        Return the platform config with the full URL joined in
+        (classmethod access; the YAML is loaded once and cached)
+        """
+        if cls._config is None:
+            cls._load_yaml()
+
+        if platform not in cls._config:
+            raise ValueError(f"[配置错误] 未找到平台配置: {platform}")
+
+        platform_config = cls._config.get(platform, {})
+        base_config = cls._config.get("default", {})
+
+        # Merge configs: platform entries override the defaults
+        merged = {**base_config, **platform_config}
+
+        # Join the full url automatically (an explicit url takes precedence)
+        if "url" not in merged and "base_url" in merged and "path" in merged:
+            merged["url"] = urljoin(merged["base_url"], merged["path"])
+
+        return merged
+
+
+# Example usage
+if __name__ == '__main__':
+    config = ConfigLoader.get_platform_config("benshanzhufurecommend")
+    print(config)
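
With the YAML key renamed above, lookups are keyed by the lower-cased spider class name; the merged dict layers the platform entry over `default:` and joins `base_url` + `path` into `url`. Roughly:

# Sketch: expected shape of the merged config (values depend on spiders_config.yaml).
from core.utils.config_loader import ConfigLoader

cfg = ConfigLoader.get_platform_config("benshanzhufurecommend")
print(cfg["platform"], cfg["mode"])  # -> benshanzhufu recommend
print(cfg["url"])  # -> urljoin(default base_url, "/crawler/ben_shan_zhu_fu/recommend")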

+ 1 - 1
utils/env_loader.py → core/utils/env_loader.py

@@ -3,7 +3,7 @@ from dotenv import load_dotenv
 
 def load_env(env: str = None):
     """
-    Load the .env file matching the given environment name; defaults to .env.prod
+    Load the .env file matching the given environment name; defaults to .env
     """
     if env is None:
         env = os.getenv("ENV", "prod")  # 默认prod

+ 0 - 0
utils/extractors.py → core/utils/extractors.py


+ 0 - 0
application/config/common/feishu/__init__.py → core/utils/feishu/__init__.py


+ 2 - 2
application/config/common/feishu/feishu.py → core/utils/feishu/feishu.py

@@ -12,7 +12,7 @@ import urllib3
 
 sys.path.append(os.getcwd())
 
-from application.config.common.log import Local
+from core.utils.log import Local
 proxies = {"http": None, "https": None}
 
 
@@ -681,7 +681,7 @@ class Feishu:
             data = json.dumps({
                 "msg_type": "interactive",
                 "card": {
-                    "configs": {
+                    "config": {
                         "wide_screen_mode": True,
                         "enable_forward": True
                     },

+ 1 - 1
application/config/common/feishu/feishu_data.py → core/utils/feishu/feishu_data.py

@@ -1,4 +1,4 @@
-from application.config.common.feishu.feishu_utils import FeishuUtils
+from core.utils.feishu.feishu_utils import FeishuUtils
 
 
 class FsData:

+ 0 - 0
application/config/common/feishu/feishu_insert.py → core/utils/feishu/feishu_insert.py


+ 2 - 2
application/config/common/feishu/feishu_utils.py → core/utils/feishu/feishu_utils.py

@@ -328,7 +328,7 @@ class FeishuUtils:
             data = json.dumps({
                 "msg_type": "interactive",
                 "card": {
-                    "configs": {
+                    "config": {
                         "wide_screen_mode": True,
                         "enable_forward": True
                     },
@@ -372,7 +372,7 @@ class FeishuUtils:
             data = json.dumps({
                 "msg_type": "interactive",
                 "card": {
-                    "configs": {
+                    "config": {
                         "wide_screen_mode": True,
                         "enable_forward": True
                     },
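
Both "configs" → "config" renames here (and the matching one in feishu.py above) align the payload with the Feishu interactive-card schema, where the top-level `config` object is what carries `wide_screen_mode` and `enable_forward`; under the misspelled key those options were silently ignored.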

+ 0 - 0
application/config/common/gpt/__init__.py → core/utils/gpt/__init__.py


+ 0 - 0
application/config/common/gpt/gpt4o_mini_help.py → core/utils/gpt/gpt4o_mini_help.py


+ 0 - 0
application/config/common/log/__init__.py → core/utils/log/__init__.py


+ 4 - 5
application/config/common/log/aliyun_log.py → core/utils/log/aliyun_log.py

@@ -1,12 +1,11 @@
-# -*- coding: utf-8 -*-
-# @Author: 罗俊辉
-# @Time: 2023/12/18
+
 """
 公共方法,包含:生成log
 """
 import json
 from aliyun.log import LogClient, PutLogsRequest, LogItem
 import time
+from config import settings
 
 proxies = {"http": None, "https": None}
 
@@ -32,8 +31,8 @@ class AliyunLogger(object):
         # 设置阿里云日志服务的访问信息
         if data is None:
             data = {}
-        accessKeyId = "LTAIWYUujJAm7CbH"
-        accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+        accessKeyId = settings.ALIYUN_ACCESS_KEY_ID
+        accessKey = settings.ALIYUN_ACCESS_KEY_SECRET
         if self.env == "dev":
             project = "crawler-log-dev"
             logstore = "crawler-log-dev"

+ 50 - 0
core/utils/log/local_log.py

@@ -0,0 +1,50 @@
+import sys
+from datetime import date
+from loguru import logger
+from pathlib import Path
+from config import settings
+
+
+class Local:
+    """
+    Local file logger built on loguru
+    """
+
+    @staticmethod
+    def init_logger(platform: str, mode: str, log_level: str = settings.LOG_LEVEL,
+                    log_to_console: bool = False, rotation: str = "00:00",
+                    retention: str = "10 days"):
+        """
+        Initialize a logger for the given platform and mode
+        """
+        # Create the log directory
+        log_path = Path(f"{settings.LOG_DIR}/{platform}")
+        log_path.mkdir(parents=True, exist_ok=True)
+
+        # Current date, evaluated when the logger is initialized
+        current_date = date.today().strftime("%Y-%m-%d")
+        log_filename = f"{platform}-{mode}-{current_date}.log"
+        log_file_path = log_path / log_filename
+
+        # Clear existing handlers (loguru's logger is global, so earlier handlers are dropped too)
+        logger.remove()
+
+        # Add the file handler
+        logger.add(
+            str(log_file_path),  # loguru expects a str path here
+            level=log_level.upper(),
+            rotation=rotation,  # e.g. rotate daily at "00:00"
+            retention=retention,
+            encoding="utf-8",
+            enqueue=True
+        )
+
+        # Optional: also log to the console
+        if log_to_console:
+            logger.add(
+                sys.stdout,
+                level=log_level.upper(),
+                format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | {message}"
+            )
+
+        return logger

+ 31 - 0
core/utils/log/logger_manager.py

@@ -0,0 +1,31 @@
+from core.utils.log.local_log import Local
+from core.utils.log.aliyun_log import AliyunLogger
+
+class LoggerManager:
+    """
+    Logger manager: caches one local and one Aliyun logger per (platform, mode)
+    """
+    _local_loggers = {}
+    _aliyun_loggers = {}
+
+    @staticmethod
+    def get_logger(platform: str = "system", mode: str = "crawler", log_to_console=True):
+        key = f"{platform}_{mode}"
+        if key not in LoggerManager._local_loggers:
+            LoggerManager._local_loggers[key] = Local.init_logger(
+                platform=platform,
+                mode=mode,
+                log_to_console=log_to_console
+            )
+        return LoggerManager._local_loggers[key]
+
+    @staticmethod
+    def get_aliyun_logger(platform: str = "system", mode: str = "crawler", env: str = "prod"):
+        key = f"{platform}_{mode}"
+        if key not in LoggerManager._aliyun_loggers:
+            LoggerManager._aliyun_loggers[key] = AliyunLogger(
+                platform=platform,
+                mode=mode,
+                env=env
+            )
+        return LoggerManager._aliyun_loggers[key]
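
A usage sketch. One caveat: loguru's `logger` is process-global and `Local.init_logger` calls `logger.remove()`, so initializing a second (platform, mode) pair redirects output for earlier pairs as well; the cache only avoids re-adding handlers for the same key.

# Sketch: cached access; repeated calls with the same key return the same logger.
from core.utils.log.logger_manager import LoggerManager

log = LoggerManager.get_logger(platform="benshanzhufu", mode="recommend")
log.info("hello")  # -> logs/benshanzhufu/benshanzhufu-recommend-<date>.log (and, by default, stdout)
assert log is LoggerManager.get_logger(platform="benshanzhufu", mode="recommend")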

+ 2 - 4
utils/path_utils.py → core/utils/path_utils.py

@@ -13,11 +13,11 @@ def get_project_path() -> str:
 project_root = get_project_path()
 
 # 配置路径
-config_dir = os.path.join(project_root, "configs")
+config_dir = os.path.join(project_root, "config")
 config_spiders_path = os.path.join(config_dir, "spiders_config.yaml")
 
 # 日志路径
-log_dir = os.path.join(project_root, "log_store")
+log_dir = os.path.join(project_root, "logs")
 
 
 # 数据库配置路径(可选)
@@ -28,6 +28,4 @@ __all__ = [
     "config_dir",
     "config_spiders_path",
     "log_dir",
-    "model_dir",
-    "tmp_dir",
 ]

+ 0 - 0
utils/trace_utils.py → core/utils/trace_utils.py


+ 15 - 110
main.py

@@ -1,113 +1,15 @@
-import json
 import time
-import traceback
 from multiprocessing import Process, cpu_count
-from typing import List, Dict
-import asyncio
+from typing import Dict
 
-from application.config.common.log.logger_manager import LoggerManager
-from utils.trace_utils import generate_trace_id
-from application.config.common import get_consumer, ack_message
-from application.functions.async_mysql_service import AsyncMysqlService
-from application.spiders.spider_registry import get_spider_class, SPIDER_CLASS_MAP
-from application.functions.rocketmq_consumer import AsyncRocketMQConsumer
-
-# ------------------------------- Topic 协程处理核心 -------------------------------
-
-# 每个进程共享的 mysql service 实例(全局变量)
-mysql_service: AsyncMysqlService = None
-
-
-async def async_handle_topic(topic: str):
-    """
-    单个 topic 的消费逻辑,运行在协程中:
-    - 从 MQ 中消费消息;
-    - 根据消息内容执行对应爬虫;
-    - 使用异步数据库服务查询配置;
-    - 记录日志、确认消息。
-    """
-    logger = LoggerManager.get_logger(topic, "worker")
-    aliyun_logger = LoggerManager.get_aliyun_logger(topic, "worker")
-
-    # 每个 topic 创建独立的 consumer 实例
-    consumer = AsyncRocketMQConsumer(topic_name=topic, group_id=topic)
-
-    async def handle_single_message(message):
-        trace_id = generate_trace_id()
-        try:
-            payload = json.loads(message.message_body)
-            platform = payload["platform"]
-            mode = payload["mode"]
-            task_id = payload["id"]
-
-            user_list = await mysql_service.get_user_list(task_id)
-            rule_dict = await mysql_service.get_rule_dict(task_id)
-
-            CrawlerClass = get_spider_class(topic)
-            crawler = CrawlerClass(
-                rule_dict=rule_dict,
-                user_list=user_list,
-                trace_id=trace_id
-            )
-            await crawler.run()
-
-            # ack 由 run 成功后执行
-            await consumer.ack_message(message.receipt_handle)
-            aliyun_logger.logging(code="1000", message="任务成功完成并确认消息", trace_id=trace_id)
-
-        except Exception as e:
-            aliyun_logger.logging(
-                code="9001",
-                message=f"处理消息失败: {e}\n{traceback.format_exc()}",
-                trace_id=trace_id,
-                data=message.message_body,
-            )
-
-    # 消费循环启动
-    await consumer.run_forever(handle_single_message)
-
-
-async def run_all_topics(topics: List[str]):
-    """
-    启动当前进程中所有 topic 的协程监听任务。
-    初始化全局 AsyncMysqlService 实例。
-    """
-    global mysql_service
-    mysql_service = AsyncMysqlService()
-    await mysql_service.init()  # 初始化连接池
-
-    tasks = [asyncio.create_task(async_handle_topic(topic)) for topic in topics]
-    await asyncio.gather(*tasks)
-
-
-def handle_topic_group(topics: List[str]):
-    """
-    子进程入口函数:
-    启动异步事件循环处理该组 topics。
-    """
-    asyncio.run(run_all_topics(topics))
-
-
-# ------------------------------- 主调度部分 -------------------------------
-
-def split_topics(topics: List[str], num_groups: int) -> List[List[str]]:
-    """
-    将所有 topic 平均划分为 num_groups 组,用于分配给子进程。
-    """
-    return [topics[i::num_groups] for i in range(num_groups)]
-
-
-def start_worker_process(group_id: int, topic_group: List[str], process_map: Dict[int, Process]):
-    """
-    启动一个子进程处理一组 topic。
-    """
-    p = Process(target=handle_topic_group, args=(topic_group,), name=f"Worker-{group_id}")
-    p.start()
-    process_map[group_id] = p
-    print(f"[主进程] 启动进程 PID={p.pid} 处理 topics={topic_group}")
+from scheduler.process_manager import split_topics, start_worker_process
+from spiders.spider_registry import SPIDER_CLASS_MAP
+from core.utils.log.logger_manager import LoggerManager
 
 
 def main():
+    logger = LoggerManager.get_logger()
+    aliyun_log = LoggerManager.get_aliyun_logger()
     """
     主调度入口:
     - 获取全部爬虫 topic;
@@ -116,11 +18,14 @@ def main():
     - 监控子进程状态,自动恢复。
     """
     topic_list = list(SPIDER_CLASS_MAP.keys())
-    print(f"[主进程] 监听 Topics: {topic_list}")
+    logger.info(f"[主进程] 监听 Topics: {topic_list}")
 
-    num_cpus = cpu_count()
-    topic_groups = split_topics(topic_list, num_cpus)
-    print(f"[主进程] CPU 核心数: {num_cpus},将启动进程数: {len(topic_groups)}")
+    max_processes = cpu_count()
+    num_processes = min(len(topic_list), max_processes)  # avoid spawning more processes than topics
+    # Distribute all topics evenly (round-robin) across the subprocesses,
+    # e.g. 15 topics over 4 workers -> [[1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], [4, 8, 12]]
+    topic_groups = split_topics(topic_list, num_processes)
+    logger.info(f"[主进程] CPU 核心数: {max_processes},启动进程数: {num_processes}")
 
     process_map: Dict[int, Process] = {}
 
@@ -133,11 +38,11 @@ def main():
             time.sleep(5)
             for group_id, p in list(process_map.items()):
                 if not p.is_alive():
-                    print(f"[监控] 进程 {p.name} PID={p.pid} 已崩溃,正在重启...")
+                    logger.warning(f"[监控] 进程 {p.name} PID={p.pid} 已崩溃,正在重启...")
                     time.sleep(2)
                     start_worker_process(group_id, topic_groups[group_id], process_map)
     except KeyboardInterrupt:
-        print("[主进程] 接收到退出信号,终止所有子进程...")
+        logger.warning("[主进程] 接收到退出信号,终止所有子进程...")
         for p in process_map.values():
             p.terminate()
         for p in process_map.values():

+ 0 - 0
application/etl/__init__.py → scheduler/__init__.py


+ 75 - 0
scheduler/async_consumer.py

@@ -0,0 +1,75 @@
+import json
+import traceback
+from typing import List
+import asyncio
+
+from core.utils.log.logger_manager import LoggerManager
+from core.utils.trace_utils import generate_trace_id
+from services.async_mysql_service import AsyncMysqlService
+from spiders.spider_registry import get_spider_class
+
+async def async_handle_topic(topic: str):
+    """
+    Consumption logic for a single topic, run in a coroutine:
+    - consume messages from MQ;
+    - run the matching spider based on the message content;
+    - query task config via the async database service;
+    - log results and acknowledge the message.
+    """
+    logger = LoggerManager.get_logger()
+    aliyun_logger = LoggerManager.get_aliyun_logger()
+
+    # Each topic gets its own consumer instance
+    from services.rocketmq_consumer import AsyncRocketMQConsumer
+
+    consumer = AsyncRocketMQConsumer(topic_name=topic, group_id=topic)
+
+    async def handle_single_message(message):
+        trace_id = generate_trace_id()
+        try:
+            payload = json.loads(message.message_body)
+            task_id = payload["id"]
+
+            async with AsyncMysqlService("system", "crawler") as mysql:
+                user_list = await mysql.get_user_list(task_id)
+                rule_dict = await mysql.get_rule_dict(task_id)
+
+            CrawlerClass = get_spider_class(topic)
+            crawler = CrawlerClass(
+                rule_dict=rule_dict,
+                user_list=user_list,
+                trace_id=trace_id
+            )
+            await crawler.run()
+
+            # ack only after run() completes successfully
+            await consumer.ack_message(message.receipt_handle)
+
+        except Exception as e:
+            aliyun_logger.logging(
+                code="9001",
+                message=f"处理消息失败: {str(e)}",
+                trace_id=trace_id,
+                data={
+                    "error_type": type(e).__name__,
+                    "stack_trace": traceback.format_exc(),
+                    "message_body": message.message_body
+                }
+            )
+    # Start the consume loop
+    await consumer.run_forever(handle_single_message)
+
+
+async def run_all_topics(topics: List[str]):
+    """
+    Start a listener coroutine for every topic owned by this process.
+    Each message opens its own AsyncMysqlService context on demand.
+    """
+
+    tasks = [asyncio.create_task(async_handle_topic(topic)) for topic in topics]
+    await asyncio.gather(*tasks)
+
+
+def handle_topic_group(topics: List[str]):
+    """子进程入口函数:启动异步事件循环处理该组 topics。"""
+    asyncio.run(run_all_topics(topics))
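
For reference, a sketch of the message body `handle_single_message` expects. Only `id` is read here; the previous handler in main.py also read `platform` and `mode`, which producers may still include (values are illustrative):

payload = {
    "id": 18,                    # crawler_task_v3 primary key -> user list + rule dict
    "platform": "benshanzhufu",  # no longer read by this consumer
    "mode": "recommend",
}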

+ 24 - 0
scheduler/process_manager.py

@@ -0,0 +1,24 @@
+import multiprocessing
+from typing import List, Dict
+from core.utils.log.logger_manager import LoggerManager
+logger = LoggerManager.get_logger()
+aliyun_log = LoggerManager.get_aliyun_logger()
+
+
+def split_topics(topics: List[str], num_groups: int) -> List[List[str]]:
+    """将所有 topic 平均划分为 num_groups 组,用于分配给子进程。"""
+    return [topics[i::num_groups] for i in range(num_groups)]
+
+
+def start_worker_process(group_id: int, topic_group: List[str], process_map: Dict[int, multiprocessing.Process]):
+    """启动一个子进程处理一组 topic。"""
+    from scheduler.async_consumer import handle_topic_group
+
+    p = multiprocessing.Process(
+        target=handle_topic_group,
+        args=(topic_group,),
+        name=f"Worker-{group_id}"
+    )
+    p.start()
+    process_map[group_id] = p
+    logger.info(f"[主进程] 启动进程 PID={p.pid} 处理 topics={topic_group}")
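`split_topics` deals topics out via slicing (`topics[i::num_groups]`), so group sizes differ by at most one:

# Sketch: round-robin split.
topics = ["t1", "t2", "t3", "t4", "t5"]
print(split_topics(topics, 2))  # -> [['t1', 't3', 't5'], ['t2', 't4']]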

+ 0 - 0
application/spiders/__init__.py → services/__init__.py


+ 206 - 0
services/async_mysql_service.py

@@ -0,0 +1,206 @@
+import asyncio
+import json
+import os
+import logging
+from typing import List, Optional, Dict, Any, Tuple
+from core.base.async_mysql_client import AsyncMySQLClient
+from core.utils.log.logger_manager import LoggerManager
+from config import settings
+logger = logging.getLogger(__name__)
+
+
+class AsyncMysqlService:
+    """
+    Async business-database access layer (singleton, async with support)
+
+    Features:
+    - singleton per config: the same (platform, mode) shares one connection pool
+    - async with support that manages the pool lifecycle automatically
+    - wraps the business-specific SQL operations
+    - thorough error handling and logging
+    """
+
+    # Singleton instances per config, keyed by the (platform, mode) tuple
+    _instances: Dict[Tuple[str, str], "AsyncMysqlService"] = {}
+
+    def __new__(cls, platform: Optional[str] = None, mode: Optional[str] = None):
+        """基于配置的单例模式,相同platform和mode共享同一个实例"""
+        # 处理None值,设置默认值为"system"
+        platform = platform or "system"
+        mode = mode or "system"
+        key = (platform, mode)
+
+        if key not in cls._instances:
+            instance = super().__new__(cls)
+            instance._platform = platform
+            instance._mode = mode
+            instance._client = None
+            instance._pool_initialized = False
+            cls._instances[key] = instance
+        return cls._instances[key]
+
+    def __init__(self, platform: Optional[str] = None, mode: Optional[str] = None):
+        """初始化数据库配置(仅在创建新实例时执行)"""
+        # 避免重复初始化
+        if self._client is not None:
+            return
+
+        # Treat None as the default value "system"
+        platform = platform or "system"
+        mode = mode or "system"
+        self._platform = platform
+        self._mode = mode
+
+        # Load config from the environment-backed settings
+        db_config = {
+            "host": settings.DB_HOST,
+            "port": settings.DB_PORT,
+            "user": settings.DB_USER,
+            "password": settings.DB_PASSWORD,
+            "db": settings.DB_NAME,
+            "charset": settings.DB_CHARSET
+        }
+
+        self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
+        self.aliyun_logr = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
+
+        # Create the database client
+        self._client = AsyncMySQLClient(
+            host=db_config["host"],
+            port=db_config["port"],
+            user=db_config["user"],
+            password=db_config["password"],
+            db=db_config["db"],
+            charset=db_config["charset"],
+            minsize=1,
+            maxsize=10
+        )
+        self.logger.info(f"创建数据库服务实例: platform={platform}, mode={mode}")
+
+    # Async context-manager support
+    async def __aenter__(self):
+        """支持async with上下文,初始化连接池"""
+        if not self._pool_initialized:
+            try:
+                await self._client.init_pool()
+                self._pool_initialized = True
+                self.logger.info(f"连接池初始化成功: platform={self._platform}, mode={self._mode}")
+            except Exception as e:
+                self.logger.error(f"连接池初始化失败: {str(e)}")
+                raise
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """支持async with上下文,关闭连接池"""
+        if self._pool_initialized:
+            try:
+                await self._client.close()
+                self._pool_initialized = False
+                self.logger.info(f"连接池已关闭: platform={self._platform}, mode={self._mode}")
+            except Exception as e:
+                self.logger.warning(f"连接池关闭失败: {str(e)}")
+
+    @property
+    def platform(self) -> str:
+        """获取服务关联的平台"""
+        return self._platform
+
+    @property
+    def mode(self) -> str:
+        """获取服务运行模式"""
+        return self._mode
+
+    async def fetch_all(self, sql: str, params: Optional[List[Any]] = None) -> List[Dict[str, Any]]:
+        """执行查询并返回多行结果"""
+        try:
+            return await self._client.fetch_all(sql, params or [])
+        except Exception as e:
+            self.logger.error(f"查询失败 [SQL: {sql}]: {str(e)}")
+            raise
+
+    async def fetch_one(self, sql: str, params: Optional[List[Any]] = None) -> Optional[Dict[str, Any]]:
+        """执行查询并返回单行结果"""
+        try:
+            return await self._client.fetch_one(sql, params or [])
+        except Exception as e:
+            self.logger.error(f"查询失败 [SQL: {sql}]: {str(e)}")
+            raise
+
+    async def execute(self, sql: str, params: Optional[List[Any]] = None) -> int:
+        """执行单条写操作(insert/update/delete)"""
+        try:
+            return await self._client.execute(sql, params or [])
+        except Exception as e:
+            self.logger.error(f"写操作失败 [SQL: {sql}]: {str(e)}")
+            raise
+
+    async def executemany(self, sql: str, params_list: List[List[Any]]) -> int:
+        """批量执行写操作"""
+        try:
+            return await self._client.executemany(sql, params_list)
+        except Exception as e:
+            self.logger.error(f"批量写操作失败 [SQL: {sql}]: {str(e)}")
+            raise
+
+    # Business-level queries
+    async def get_user_list(self, task_id: int) -> List[Dict[str, Any]]:
+        sql = "SELECT uid, link, nick_name FROM crawler_user_v3 WHERE task_id = %s"
+        return await self.fetch_all(sql, [task_id])
+
+    async def get_rule_dict(self, rule_id: int) -> Optional[Dict[str, Any]]:
+        sql = "SELECT rule FROM crawler_task_v3 WHERE id = %s"
+        row = await self.fetch_one(sql, [rule_id])
+        if not row or "rule" not in row:
+            self.logger.warning(f"未找到规则: rule_id={rule_id}")
+            return None
+        try:
+            rule_data = json.loads(row["rule"])
+            return {k: v for item in rule_data for k, v in item.items()}
+        except json.JSONDecodeError as e:
+            self.logger.error(f"规则解析失败 [rule_id={rule_id}]: {str(e)}")
+            return None
+
+    async def get_today_videos(self) -> int:
+        sql = """
+            SELECT COUNT(*) as cnt
+            FROM crawler_video 
+            WHERE DATE(create_time) = CURDATE()
+              AND platform = %s 
+              AND strategy = %s
+        """
+        self.logger.info(f"查询今日视频数量: platform={self.platform}, strategy={self.mode}")
+        result = await self.fetch_one(sql, [self.platform, self.mode])
+        return result["cnt"] if result else 0
+
+
+# Global convenience accessor (None arguments allowed)
+async def get_db_service(platform: Optional[str] = None, mode: Optional[str] = None) -> AsyncMysqlService:
+    """Convenience helper that returns an initialized service; platform/mode may be None"""
+    service = AsyncMysqlService(platform, mode)
+    await service.__aenter__()
+    return service
+
+
+# Example usage
+async def demo_usage():
+    # Option 1: platform and mode are None, falling back to "system"
+    async with AsyncMysqlService() as default_service:
+        users = await default_service.get_user_list(8)
+        print(f"系统配置用户数: {users}")
+
+    # Option 2: pass None explicitly
+    async with AsyncMysqlService(None, None) as system_service:
+        rule = await system_service.get_rule_dict(18)
+        print(f"自定义配置规则: {rule}")
+
+    # Option 3: use the convenience helper
+    service = await get_db_service("benshanzhufu", "recommend")
+    try:
+        count = await service.get_today_videos()
+        print(f"默认配置今日视频数: {count}")
+
+    finally:
+        await service.__aexit__(None, None, None)
+
+if __name__ == '__main__':
+    asyncio.run(demo_usage())
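
The flattening in `get_rule_dict` implies the `rule` column stores a JSON array of single-key objects (an assumption based on the comprehension, not stated in the commit); the lookup turns it into one flat dict:

# Sketch, assuming the stored format implied by the comprehension above.
import json

rule_column = '[{"period": {"min": 0, "max": 3}}, {"duration": {"min": 30, "max": 1800}}]'
rule_data = json.loads(rule_column)
rule_dict = {k: v for item in rule_data for k, v in item.items()}
print(rule_dict)  # -> {'period': {'min': 0, 'max': 3}, 'duration': {'min': 30, 'max': 1800}}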

+ 5 - 2
application/functions/clean_title.py → services/clean_title.py

@@ -1,4 +1,7 @@
-def clean_title(strings):
+import asyncio
+
+
+async def clean_title(strings):
     return (
         strings.strip()
         .replace("\n", "")
@@ -19,4 +22,4 @@ def clean_title(strings):
         .replace(" ", "")
         .replace('"', "")
         .replace("'", "")
-    )
+    )

+ 201 - 0
services/pipeline.py

@@ -0,0 +1,201 @@
+import re
+import sys
+import os
+import time
+from datetime import datetime
+
+from core.utils.feishu.feishu_utils import FeishuUtils
+from services.async_mysql_service import AsyncMysqlService
+from core.utils.log.logger_manager import LoggerManager
+
+sys.path.append(os.getcwd())
+
+
+class PiaoQuanPipeline:
+    """
+    Async crawler pipeline: runs the video rule validations.
+    """
+
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id, account=None):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+        self.account = account
+
+        # Initialize the async MySQL service
+        self.mysql = AsyncMysqlService(platform=platform, mode=mode)
+        self.aliyun_log = LoggerManager.get_aliyun_logger(platform=platform, mode=mode)
+
+    async def feishu_time_list(self):
+        """从飞书读取天数配置"""
+        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "RuLK77")
+        for row in summary[1:]:
+            if row[0] == self.platform:
+                return row[1]
+        return None
+
+    async def publish_time_flag(self) -> bool:
+        """
+        Check whether the publish time is past the allowed window.
+        :return: True or False
+        """
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        max_d = self.rule_dict.get("period", {}).get("max", 1000)
+        min_d = self.rule_dict.get("period", {}).get("min", 1000)
+        days = max(max_d, min_d)
+        days_time = await self.feishu_time_list()
+        if days_time:
+            days = int(days_time)
+
+        now_ts = int(time.time())
+
+        if self.platform == "gongzhonghao":
+            if now_ts - publish_time_stamp > 86400 * days and now_ts - update_time_stamp > 86400 * days:
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message=f"发布时间超过{days}天"
+                )
+                return False
+        else:
+            if days == 0:
+                is_today = datetime.fromtimestamp(publish_time_stamp).date() == datetime.today().date()
+                if not is_today:
+                    return False
+            elif now_ts - publish_time_stamp > 86400 * days:
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message=f"发布时间超过{days}天"
+                )
+                return False
+        return True
+
+    def title_flag(self) -> bool:
+        """
+        Check whether the title contains sensitive words.
+        """
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        sensitive_words = []  # populate with sensitive words as needed
+        if any(word in cleaned_title for word in sensitive_words):
+            self.aliyun_log.logging(
+                code="2003",
+                trace_id=self.trace_id,
+                message="标题中包含敏感词",
+                data=self.item,
+                account=self.account
+            )
+            return False
+        return True
+
+    def download_rule_flag(self) -> bool:
+        """
+        Check the item against each numeric download rule.
+        """
+        for key in self.item:
+            if key == "period":  # the period rule is handled by publish_time_flag
+                continue
+            if self.rule_dict.get(key):
+                max_value = int(self.rule_dict[key].get("max", 999999999))
+                val = int(self.item.get(key, 0))
+                if not int(self.rule_dict[key]["min"]) <= val <= max_value:
+                    self.aliyun_log.logging(
+                        code="2004",
+                        trace_id=self.trace_id,
+                        message=f"{key}: 不符合规则",
+                        data=self.item,
+                        account=self.account
+                    )
+                    return False
+        return True
+
+    async def feishu_list(self):
+        """从飞书拉取天数配置,用于去重判断"""
+        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "letS93")
+        for row in summary[1:]:
+            if row[0] == self.platform:
+                return row[1]
+        return None
+
+    async def repeat_video(self) -> bool:
+        """
+        Check whether the video is a duplicate (honoring the Feishu-configured dedup window).
+        """
+        out_id = self.item["out_video_id"]
+        title = self.item["video_title"]
+        day_count = await self.feishu_list()
+
+        async with self.mysql as db:
+            if day_count:
+                sql = """
+                SELECT create_time
+                FROM crawler_video
+                WHERE platform = %s AND out_video_id = %s
+                  AND create_time >= DATE_SUB(NOW(), INTERVAL %s DAY)
+                """
+                rows = await db.fetch_all(sql, [self.platform, out_id, int(day_count)])
+                if rows:
+                    self.aliyun_log.logging(
+                        code="2002",
+                        trace_id=self.trace_id,
+                        message="重复的视频",
+                        data=self.item,
+                        account=self.account
+                    )
+                    return False
+
+            # Some platforms bypass the dedup check entirely
+            bypass = {
+                ("zhufuniannianshunxinjixiang", "recommend"),
+                ("benshanzhufu", "recommend"),
+                ("tiantianjufuqi", "recommend"),
+            }
+            if (self.platform, self.mode) in bypass:
+                return True
+
+            # Title-aware dedup (example: this platform also matches on title)
+            if self.platform == "zhufuhaoyunbaofu" and self.mode == "recommend":
+                sql = """
+                    SELECT 1 FROM crawler_video
+                    WHERE platform = %s AND out_video_id = %s AND video_title = %s
+                """
+                result = await db.fetch_one(sql, [self.platform, out_id, title])
+            else:
+                sql = """
+                    SELECT 1 FROM crawler_video
+                    WHERE platform = %s AND out_video_id = %s
+                """
+                result = await db.fetch_one(sql, [self.platform, out_id])
+
+            if result:
+                self.aliyun_log.logging(
+                    code="2002",
+                    trace_id=self.trace_id,
+                    message="重复的视频",
+                    data=self.item,
+                    account=self.account
+                )
+                return False
+
+        return True
+
+    async def process_item(self) -> bool:
+        """
+        Run all rule checks in sequence.
+        """
+        if not await self.publish_time_flag():
+            return False
+        if not self.title_flag():
+            return False
+        if not await self.repeat_video():
+            return False
+        if not self.download_rule_flag():
+            return False
+        return True
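Reviewer note: a minimal driving sketch for the new pipeline; the IDs, rule values, and item fields below are invented, and a live MySQL/Feishu connection is needed for the dedup and day-window checks:

```python
import asyncio
import time
import uuid

from services.pipeline import PiaoQuanPipeline

async def validate_one():
    item = {
        "video_title": "示例标题",
        "out_video_id": "out123",
        "publish_time_stamp": int(time.time()) - 3600,
        "update_time_stamp": int(time.time()),
    }
    pipeline = PiaoQuanPipeline(
        platform="benshanzhufu",
        mode="recommend",
        rule_dict={"period": {"min": 30, "max": 30}},
        env="prod",
        item=item,
        trace_id="benshanzhufu" + str(uuid.uuid1()),
    )
    print(await pipeline.process_item())

asyncio.run(validate_one())
```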

+ 12 - 16
application/functions/rocketmq_consumer.py → services/rocketmq_consumer.py

@@ -1,15 +1,10 @@
 import asyncio
-import json
-import os
 from typing import List, Optional, Callable
 from mq_http_sdk.mq_client import MQClient
 from mq_http_sdk.mq_exception import MQExceptionBase
-from mq_http_sdk.consumer import Message
+from mq_http_sdk.mq_consumer import Message
 
-from utils.env_loader import load_env, get_env, get_int_env  # 如果你有统一封装
-
-# 确保环境加载
-load_env()
+from config import settings
 
 
 class AsyncRocketMQConsumer:
@@ -28,22 +23,23 @@ class AsyncRocketMQConsumer:
         batch: Optional[int] = None,
     ):
-        # 从环境变量读取配置
+        # Read configuration from centralized settings
-        self.endpoint = get_env("ROCKETMQ_ENDPOINT")
-        self.access_key_id = get_env("ROCKETMQ_AK")
-        self.access_key_secret = get_env("ROCKETMQ_SK")
-        self.instance_id = get_env("ROCKETMQ_INSTANCE_ID")
-        self.wait_seconds = wait_seconds or get_int_env("ROCKETMQ_WAIT_SECONDS", 10)
-        self.batch = batch or get_int_env("ROCKETMQ_BATCH", 1)
-
+        self.endpoint = settings.ROCKETMQ_ENDPOINT
+        self.access_key_id = settings.ROCKETMQ_ACCESS_KEY_ID
+        self.access_key_secret = settings.ROCKETMQ_ACCESS_KEY_SECRET
+        self.instance_id = settings.ROCKETMQ_INSTANCE_ID
+        self.wait_seconds = wait_seconds or settings.ROCKETMQ_WAIT_SECONDS
+        self.batch = batch or settings.ROCKETMQ_BATCH
+        self.topic_name = topic_name
+        self.group_id = group_id
         # Initialize the MQ client and consumer
         self.client = MQClient(self.endpoint, self.access_key_id, self.access_key_secret)
-        self.consumer = self.client.get_consumer(self.instance_id, topic_name, group_id)
+        self.consumer = self.client.get_consumer(self.instance_id, self.topic_name, self.group_id)
 
     async def receive_messages(self) -> List[Message]:
         """异步封装消息拉取"""
         try:
             return await asyncio.to_thread(
-                self.consumer.receive_message,
+                self.consumer.consume_message,
                 self.batch,
                 self.wait_seconds,
             )
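Reviewer note: a sketch of driving this consumer, with the topic and group ID invented and the constructor signature taken from the hunk above. In the aliyun HTTP SDK `consume_message` raises on an empty poll, so real code would catch `MQExceptionBase`; acking goes through the underlying SDK consumer with the messages' receipt handles:

```python
import asyncio

from services.rocketmq_consumer import AsyncRocketMQConsumer

async def poll_once():
    consumer = AsyncRocketMQConsumer(topic_name="bszf_recommend_prod", group_id="GID_bszf_prod")
    messages = await consumer.receive_messages()
    for msg in messages:
        print(msg.message_id, msg.message_body)
    # Ack off the event loop as well, since the SDK call blocks
    await asyncio.to_thread(
        consumer.consumer.ack_message,
        [msg.receipt_handle for msg in messages],
    )

asyncio.run(poll_once())
```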

+ 0 - 0
configs/__init__.py → spiders/__init__.py


+ 323 - 0
spiders/base_spider.py

@@ -0,0 +1,323 @@
+import asyncio
+import random
+import uuid
+
+import aiohttp
+from abc import ABC
+from typing import List, Dict, Optional
+import time
+from core.utils.log.logger_manager import LoggerManager
+from services.pipeline import PiaoQuanPipeline
+from core.utils.extractors import safe_extract
+from core.utils.config_loader import ConfigLoader
+from services.async_mysql_service import AsyncMysqlService
+from core.models.video_item import VideoItem
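+# NOTE: MQ, the ETL producer instantiated in __init__ below, is never imported
+# in this file as committed; its post-refactor module path is not visible in
+# this diff (universal_crawler.py still imports it via
+# `from application.config.common import MQ`). A hypothetical fix:
+# from services.mq import MQ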
+
+
+class BaseSpider(ABC):
+    """
+    Generic spider base class: runs the crawl flow in strict order.
+    """
+
+    MAX_RETRIES = 3  # max retries per request
+    TIMEOUT = 30  # request timeout (seconds)
+
+    def __init__(self, rule_dict: Dict, user_list: List, trace_id: str, env: str = "prod"):
+        self.trace_id = trace_id
+        self.env = env
+        self.user_list = user_list
+        self.rule_dict = rule_dict
+        self.class_name = self.__class__.__name__  # subclass name
+
+        # Resolve the platform config from the class name
+        self.platform_config = ConfigLoader.get_platform_config(platform=str(self.class_name.lower()))
+        if not self.platform_config:
+            raise ValueError(f"No config found for: {self.class_name}")
+
+        # Initialize logging and MQ
+        self.platform = self.platform_config.get("platform")
+        self.mode = self.platform_config.get("mode")
+        self.logger = LoggerManager.get_logger(platform=self.platform, mode=self.mode)
+
+        self.logger.info(f"{trace_id}--initializing spider class: {self.class_name}")
+        self.aliyun_log = LoggerManager.get_aliyun_logger(platform=self.platform, mode=self.mode)
+        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
+
+        # Request configuration
+        self.method = self.platform_config.get("method", "GET").upper()
+        self.url = self.platform_config.get("url")
+        self.headers = self.platform_config.get("headers", {})
+        self.body = self.platform_config.get("request_body", {})
+        self.field_map = self.platform_config.get("response_parse", {}).get("fields", {})
+        self.data_path = self.platform_config.get("response_parse", {}).get("data_path")
+        self.video_fields_map = self.platform_config.get("video_fields_map", {})
+
+        # Flow control configuration
+        self.loop_times = self.platform_config.get("loop_times", 1)  # number of loops
+        self.loop_interval = self.platform_config.get("loop_interval", 0)  # interval between loops (seconds)
+
+        self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
+
+        self.logger.info(
+            f"{self.trace_id}--config: {self.loop_times} loops, {self.loop_interval}s interval")
+
+        self.session = None
+
+    async def _send_async_request(self, session: aiohttp.ClientSession, method: str, url: str,
+                                  **kwargs) -> Dict:
+        """
+        Send an async HTTP request with the given session, retrying on failure.
+        """
+        retries = 0
+        self.logger.info(f"{self.trace_id}--preparing request: {method} {url}, kwargs: {kwargs}")
+
+        while retries < self.MAX_RETRIES:
+            try:
+                async with session.request(method, url, **kwargs) as response:
+                    response.raise_for_status()
+                    self.logger.info(f"{self.trace_id}--请求成功: {response.status}")
+                    return await response.json()
+            except Exception as e:
+                retries += 1
+                remaining_attempts = self.MAX_RETRIES - retries
+
+                if retries < self.MAX_RETRIES:
+                    self.logger.warning(
+                        f"{self.trace_id}--request failed (attempt {retries}/{self.MAX_RETRIES}): {e}. "
+                        f"attempts left: {remaining_attempts}"
+                    )
+                    await asyncio.sleep(1)
+                else:
+                    self.aliyun_log.logging(
+                        code="5001",
+                        message="request failed, max retries reached",
+                        data={
+                            "url": url,
+                            "method": method,
+                            "error": str(e),
+                            "headers": kwargs.get("headers", {}),
+                            "body": kwargs.get("json", {})
+                        },
+                        trace_id=self.trace_id
+                    )
+                    self.logger.error(f"{self.trace_id}--request failed, max retries reached: {e}")
+                    raise
+
+    async def crawl_data(self) -> Optional[List[Dict]]:
+        """Fetch video data asynchronously."""
+        self.logger.info(f"{self.trace_id}--fetching video data")
+        try:
+            response = await self._send_async_request(
+                session=self.session,
+                method=self.method,
+                url=self.url,
+                headers=self.headers,
+                json=self.body
+            )
+            self.logger.debug(f"{self.trace_id}--response: {response}")
+
+            data = safe_extract(response, self.data_path)
+
+            if not data:
+                self.logger.warning(f"{self.trace_id}--no data at path: {self.data_path}")
+                return []
+
+            self.logger.info(f"{self.trace_id}--fetched {len(data)} videos")
+            return data
+        except Exception as e:
+            self.logger.exception(f"{self.trace_id}--failed to fetch video data: {e}")
+            return []
+
+    async def filter_data(self, video: Dict) -> bool:
+        """校验视频是否符合规则"""
+        pipeline = PiaoQuanPipeline(
+            platform=self.platform,
+            mode=self.mode,
+            rule_dict=self.rule_dict,
+            env=self.env,
+            item=video,
+            trace_id=self.platform + str(uuid.uuid1())
+        )
+        return await pipeline.process_item()
+
+    async def is_video_count_sufficient(self) -> bool:
+        """
+        Check whether today's crawl has hit the daily cap.
+        :return: True if more videos may be crawled, False otherwise
+        """
+        rule_videos_cnt = self.rule_dict.get("videos_cnt")
+        if not rule_videos_cnt:
+            return True
+        async with AsyncMysqlService(self.platform, self.mode) as mysql:
+            video_count = await mysql.get_today_videos()
+        if video_count >= rule_videos_cnt.get("min", 200):
+            self.logger.info(f"{self.trace_id}--daily cap reached: {video_count} videos today")
+            return False
+        return True
+
+    async def process_video(self, video: Dict) -> Optional[Dict]:
+        """
+        Process a single raw video: apply the field mapping and validate required fields.
+        :param video: raw item from the platform response
+        :return: validated video dict, or None on failure
+        """
+        self.logger.debug(f"{self.trace_id}--processing video: {video.get('title', 'untitled')}")
+        publish_user = random.choice(self.user_list)
+        try:
+            # Build the VideoItem kwargs dynamically from field_map
+            item_kwargs = {}
+            for field, path in self.field_map.items():
+                if not isinstance(path, str) or not path.startswith("$"):
+                    item_kwargs[field] = path
+                    continue
+
+                value = safe_extract(video, path)
+                if value is None:
+                    self.logger.warning(f"{self.trace_id}--字段提取失败: {field} 路径: {path}")
+                    continue
+                item_kwargs[field] = value
+
+            item_kwargs["user_id"] = publish_user["uid"]
+            item_kwargs["user_name"] = publish_user["nick_name"]
+            # 手动注入 platform 与 strategy
+            item_kwargs["platform"] = self.platform
+            item_kwargs["strategy"] = self.mode
+
+
+            try:
+                item = VideoItem(**item_kwargs)
+            except Exception as e:
+                self.logger.warning(f"{self.trace_id}--VideoItem 初始化失败: {e}, 数据: {item_kwargs}")
+                return None
+
+            video_dict = await item.produce_item()
+            if not video_dict:
+                self.logger.warning(f"{self.trace_id}--VideoItem 校验失败")
+                return None
+            return video_dict
+
+        except Exception as e:
+            self.logger.exception(f"{self.trace_id}--视频处理异常: {e}")
+            return None
+
+    async def push_to_etl(self, item: Dict) -> bool:
+        """Push an item to ETL (the underlying MQ send is synchronous)."""
+        self.logger.info(f"{self.trace_id}--pushing to ETL: {item.get('video_id', item.get('title', 'untitled'))}")
+        try:
+            self.mq.send_msg(item)
+            self.aliyun_log.logging(
+                code="1009",
+                message="sent to ETL",
+                data=item,
+                trace_id=self.trace_id
+            )
+            self.logger.info(f"{self.trace_id}--push succeeded")
+            return True
+        except Exception as e:
+            self.logger.exception(f"{self.trace_id}--push failed: {e}, item: {item}")
+            return False
+
+    async def get_today_videos(self):
+        """
+        Count the videos this spider has crawled today.
+        :return: today's video count
+        """
+        return await self.db_service.get_today_videos()
+
+    async def integrated_video_handling(self):
+        """
+        Hook for platform-specific post-processing; no-op by default.
+        :return:
+        """
+        pass
+
+    async def run(self):
+        """
+        异步运行爬虫任务,严格按顺序执行
+        1. 爬取
+        2. 处理每条数据,字段校验
+        3. 过滤(重复,平台规则,标题,发布时间)
+        4. 标题处理
+        5. 推送到ETL
+        """
+        try:
+            self.logger.info(f"{self.trace_id}--[{self.platform}] 开始执行爬虫任务")
+            total_success = 0
+            total_failed = 0
+
+            async with aiohttp.ClientSession(
+                    timeout=aiohttp.ClientTimeout(total=self.TIMEOUT)
+            ) as session:  # context-managed session
+                self.session = session
+
+                for loop_index in range(1, self.loop_times + 1):
+                    if not await self.is_video_count_sufficient():
+                        return
+                    self.logger.info(f"{self.trace_id}--step 1: starting loop {loop_index}/{self.loop_times}")
+                    loop_start_time = time.time()
+
+                    video_list = await self.crawl_data()
+                    if not video_list:
+                        self.logger.warning(f"{self.trace_id}--no video data, skipping this loop")
+                        await self._wait_for_next_loop(loop_index)
+                        continue
+
+                    success_count = 0
+                    fail_count = 0
+
+                    for video in video_list:
+                        video_obj = await self.process_video(video)
+                        if not video_obj:
+                            self.logger.warning(f"{self.trace_id}--视频处理失败,已跳过")
+                            fail_count += 1
+                            continue
+
+                        if not await self.filter_data(video_obj):
+                            self.logger.debug(f"{self.trace_id}--视频不符合规则,已跳过")
+                            continue
+
+                        await self.integrated_video_handling()
+
+                        if await self.push_to_etl(video_obj):
+                            success_count += 1
+                        else:
+                            fail_count += 1
+
+                    total_success += success_count
+                    total_failed += fail_count
+
+                    loop_duration = time.time() - loop_start_time
+                    self.logger.info(f"{self.trace_id}--loop {loop_index}/{self.loop_times} done. "
+                                     f"success: {success_count}, failed: {fail_count}, took: {loop_duration:.2f}s")
+
+                    await self._wait_for_next_loop(loop_index)
+
+                # Global metrics log
+                self.aliyun_log.logging(
+                    code="1003",
+                    message="crawl metrics summary",
+                    data={
+                        "trace_id": self.trace_id,
+                        "platform": self.platform,
+                        "success_count": total_success,
+                        "fail_count": total_failed
+                    },
+                    trace_id=self.trace_id
+                )
+
+                self.logger.info(
+                    f"{self.trace_id}--[{self.platform}] crawl task finished, total success: {total_success}, total failed: {total_failed}")
+                return True
+        except Exception as e:
+            self.logger.error(f"爬虫致命错误: {e}")
+            raise
+
+    async def _wait_for_next_loop(self, current_loop: int) -> None:
+        """Wait before the next loop iteration."""
+        if current_loop < self.loop_times and self.loop_interval > 0:
+            self.logger.info(f"{self.trace_id}--waiting {self.loop_interval}s before the next request")
+            await asyncio.sleep(self.loop_interval)
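Reviewer note: for readers new to the YAML-driven parsing, a sketch of how `data_path` and `field_map` feed `safe_extract` in `crawl_data`/`process_video`. The payload, paths, and field names are invented, and `safe_extract` is assumed to take a JSONPath-style string as in core/utils/extractors.py:

```python
from core.utils.extractors import safe_extract

response = {"data": {"list": [{"title": "示例", "play_url": "http://example.com/v.mp4"}]}}

data_path = "$.data.list"  # hypothetical config values
field_map = {"video_title": "$.title", "video_url": "$.play_url"}

for raw in safe_extract(response, data_path):
    video = {field: safe_extract(raw, path) for field, path in field_map.items()}
    print(video)
```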

+ 19 - 0
spiders/benshanzhufu_recommend.py

@@ -0,0 +1,19 @@
+import asyncio
+from spiders.base_spider import BaseSpider
+
+
+class BenshanzhufuRecommend(BaseSpider):
+    def __init__(self, rule_dict, user_list, trace_id):
+        super().__init__(rule_dict, user_list, trace_id)
+
+
+async def main():
+    rule_dict = {}
+    user_list = [{'uid': 20631262, 'link': 'recommend_2060', 'nick_name': '人老心不老'}, {'uid': 20631263, 'link': 'recommend_2061', 'nick_name': '荷花朵朵'}, {'uid': 20631264, 'link': 'recommend_2062', 'nick_name': '战友情'}, {'uid': 20631265, 'link': 'recommend_2063', 'nick_name': '闲人老李'}, {'uid': 20631266, 'link': 'recommend_2064', 'nick_name': '盛世白莲'}, {'uid': 20631267, 'link': 'recommend_2065', 'nick_name': '星星点灯'}, {'uid': 20631268, 'link': 'recommend_2066', 'nick_name': '老同学'}, {'uid': 20631269, 'link': 'recommend_2067', 'nick_name': '赤子之心'}, {'uid': 20631271, 'link': 'recommend_2068', 'nick_name': '缘分'}, {'uid': 20631272, 'link': 'recommend_2069', 'nick_name': '欢度余生'}]
+    trace_id = "1321"
+    bszf = BenshanzhufuRecommend(rule_dict, user_list, trace_id)
+    await bszf.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())  # 异步入口

+ 47 - 0
spiders/spider_registry.py

@@ -0,0 +1,47 @@
+# spider_registry.py (fully annotated version)
+"""Spider registry module: maintains the topic-to-spider-class mapping."""
+
+from spiders.benshanzhufu_recommend import BenshanzhufuRecommend
+from spiders.base_spider import BaseSpider
+from core.utils.log.logger_manager import LoggerManager
+logger = LoggerManager.get_logger()
+aliyun_log = LoggerManager.get_aliyun_logger()
+
+# Spider class map: topic name -> spider class
+# Keys are MQ topic names; values are spider classes derived from BaseSpider
+SPIDER_CLASS_MAP = {
+    "bszf_recommend_prod": BenshanzhufuRecommend,
+    # Register new spiders here
+}
+
+
+def get_spider_class(topic: str):
+    """
+    根据MQ主题获取对应的爬虫类
+
+    Args:
+        topic: MQ消息的主题名称,需与SPIDER_CLASS_MAP中的键一致
+
+    Returns:
+        对应的爬虫类,继承自BaseSpider
+
+    Raises:
+        ValueError: 当topic未注册时抛出
+        TypeError: 当注册的类不是BaseSpider子类时抛出
+    """
+    spider_class = SPIDER_CLASS_MAP.get(topic)
+    if not spider_class:
+        available = ', '.join(SPIDER_CLASS_MAP.keys())
+        logger.error(f"未注册的topic: {topic},可用topic: {available}")
+        raise ValueError(f"未知topic: {topic}")
+
+    if not isinstance(spider_class, type) or not issubclass(spider_class, BaseSpider):
+        logger.error(f"非法爬虫类: {spider_class},必须继承自BaseSpider")
+        raise TypeError(f"Invalid spider class: {spider_class}")
+
+    return spider_class
+
+
+def list_registered_topics():
+    """获取所有已注册的topic列表"""
+    return list(SPIDER_CLASS_MAP.keys())
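Reviewer note: a sketch of how a consumer entrypoint might dispatch on topic using this registry (the entrypoint itself is not part of this diff):

```python
from spiders.spider_registry import get_spider_class

async def handle_topic(topic: str, rule_dict: dict, user_list: list, trace_id: str):
    spider_cls = get_spider_class(topic)  # raises ValueError for unknown topics
    spider = spider_cls(rule_dict, user_list, trace_id)
    await spider.run()
```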

+ 3 - 3
application/spiders/universal_crawler.py → spiders/universal_crawler.py

@@ -6,13 +6,13 @@ from typing import Dict, List, Optional
 from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type, RetryCallState
 
 from application.config.common import MQ
-from configs.config import base_url
+from config.config import base_url
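+# NOTE: config/config.py does not appear in this commit's new layout; elsewhere
+# the new config package is consumed as `from config import settings` (see
+# services/rocketmq_consumer.py and tests/test_config.py), so `base_url`
+# probably needs to come from there as well.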
 from application.functions import MysqlService
 from application.items import VideoItem
 from application.pipeline import PiaoQuanPipeline
-from utils.extractors import safe_extract
+from core.utils.extractors import safe_extract
 
-from application.spiders.base_spider import BaseSpider  # 抽象基类导入
+from spiders.base_spider import BaseSpider  # 抽象基类导入
 
 def before_send_log(retry_state: RetryCallState) -> None:
     attempt = retry_state.attempt_number

+ 0 - 0
test/__init__.py → tests/__init__.py


+ 2 - 1
test/test1.py → tests/test1.py

@@ -2,5 +2,6 @@ import asyncio
 import time
 
 topics = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+num_groups = 4
 
-print(topics[1::8])
+print([topics[i::num_groups] for i in range(num_groups)])
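Reviewer note: `topics[i::num_groups]` partitions the topic list round-robin across consumer groups; with the 15 topics above and `num_groups = 4`, the script prints:

```python
[[1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], [4, 8, 12]]
```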

+ 10 - 0
tests/test_benshanzhufu_recommend.py

@@ -0,0 +1,10 @@
+import unittest
+
+
+class MyTestCase(unittest.TestCase):
+    def test_something(self):
+        self.assertEqual(True, False)  # IDE placeholder; always fails until replaced with a real assertion
+
+
+if __name__ == '__main__':
+    unittest.main()

+ 7 - 0
tests/test_config.py

@@ -0,0 +1,7 @@
+from config import settings
+from services.async_mysql_service import AsyncMysqlService
+from services.rocketmq_consumer import AsyncRocketMQConsumer
+
+print("=== 配置验证 ===")
+print("DB连接:", settings.database_url)
+

+ 29 - 0
tests/test_video_item.py

@@ -0,0 +1,29 @@
+import asyncio
+import time
+from core.models.video_item import VideoItem  # path to the Pydantic model
+from pprint import pprint
+
+
+async def main():
+    fake_video_data = {
+        "user_id": "uid456",
+        "user_name": "测试用户",
+        "out_video_id": "out789",
+        "out_user_id": "out_user",
+        "video_url": "http://example.com/video.mp4",
+        "cover_url": "http://example.com/cover.jpg",
+        "video_title": "   测试 视频 标题!!!",
+        "publish_time_stamp": int(time.time()) - 86400,  # 昨天
+        "strategy": "recommend",
+        "platform": "test_platform"
+    }
+
+    item = VideoItem(**fake_video_data)
+    result = await item.produce_item()
+
+    print("✅ 校验通过的最终结构:")
+    pprint(result)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

+ 0 - 0
utils/__init__.py


+ 0 - 37
utils/config_loader.py

@@ -1,37 +0,0 @@
-import yaml
-import os
-from urllib.parse import urljoin
-from utils.path_utils import config_spiders_path
-
-
-class ConfigLoader:
-    def __init__(self, config_path=config_spiders_path):
-        if not os.path.exists(config_path):
-            raise FileNotFoundError(f"[配置错误] 找不到配置文件: {config_path}")
-        self.config_path = config_path
-        self.config = self._load_yaml()
-
-    def _load_yaml(self):
-        with open(self.config_path, "r", encoding="utf-8") as f:
-            return yaml.safe_load(f)
-
-    def get_platform_config(self, platform: str) -> dict:
-        """Get the platform config and join the full URL"""
-        if platform not in self.config:
-            raise ValueError(f"[config error] platform config not found: {platform}")
-
-        platform_config = self.config.get(platform, {})
-        base_config = self.config.get("default", {})
-
-        # Merge configs: the platform config overrides the defaults
-        merged = {**base_config, **platform_config}
-
-        # Join the full url automatically (an explicit full url takes precedence)
-        if "url" not in merged and "base_url" in merged and "path" in merged:
-            merged["url"] = urljoin(merged["base_url"], merged["path"])
-
-        return merged
-
-if __name__ == '__main__':
-    config = ConfigLoader().get_platform_config("benshanzhufu")
-    print(config)