
中青看点 (zhongqingkandian)

zhangliang committed 1 month ago
parent commit 275f340ef0

+ 29 - 0
config/spiders_config.yaml

@@ -127,4 +127,33 @@ xiaoniangaoauthor:
 #      publish_time_stamp: "$.t"
 
 
+zhongqingkandianrecommend:
+  platform: zhongqingkandian
+  mode: recommend
+  path: /crawler/zhong_qing_kan_dian/recommend
+  method: post
+  request_body:
+    cursor: "{{next_cursor}}"
+  loop_times: 10000
+  loop_interval:
+    min: 30
+    max: 60
+  feishu_sheetid: "v8S6nL"
+  response_parse:
+    data: "$.data"
+    next_cursor: "$.data.next_cursor"
+    data_path: "$.data.data"
+    fields:
+      video_id: "$.channel_content_id"
+      video_title: "$.title"
+      play_cnt: "$.read_num"
+      publish_time_stamp: "$.publish_timestamp"
+      out_user_id: "$.channel_account_id"
+      cover_url: "$.image_url_list[0]['image_url']"
+      collection_cnt: "$.collect_num"
+      share_cnt: "$.share_num"
+      comment_cnt: "$.cmt_num"
+      video_url: "$.video_url"
+      out_video_id: "$.channel_content_id"
+
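Note on the config above: `response_parse` maps JSONPath expressions onto item fields — `data_path` selects the item list, `next_cursor` feeds the `{{next_cursor}}` placeholder in `request_body`, and each entry under `fields` is evaluated against a single item. A minimal sketch of that extraction, assuming the jsonpath-ng library and an illustrative response shape (neither is part of this commit):

# Hedged sketch (not part of the commit): applying the response_parse mapping with jsonpath-ng.
from jsonpath_ng import parse

sample_response = {
    "data": {
        "next_cursor": "abc123",
        "data": [
            {
                "channel_content_id": "v001",
                "title": "sample video",
                "read_num": 1200,
                "image_url_list": [{"image_url": "https://example.com/cover.jpg"}],
                "video_url": "https://example.com/v001.mp4",
            }
        ],
    }
}

field_paths = {
    "video_id": "$.channel_content_id",
    "video_title": "$.title",
    "play_cnt": "$.read_num",
    "cover_url": "$.image_url_list[0].image_url",
    "video_url": "$.video_url",
}

next_cursor = parse("$.data.next_cursor").find(sample_response)[0].value
for item in (m.value for m in parse("$.data.data[*]").find(sample_response)):
    row = {
        name: next((m.value for m in parse(path).find(item)), None)
        for name, path in field_paths.items()
    }
    print(next_cursor, row)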
 

+ 3 - 1
core/models/__init__.py

@@ -6,11 +6,13 @@
 from .video_item import VideoItem
 from .spiders_config_models import BaseConfig, PlatformConfig
 from .rule_models import RuleModel, RuleField
+from .crawler_account_info import CrawlerAccountInfo
 
 __all__ = [
     "VideoItem",
     "BaseConfig", 
     "PlatformConfig",
     "RuleModel",
-    "RuleField"
+    "RuleField",
+    "CrawlerAccountInfo",
 ]

+ 3 - 3
core/utils/helpers.py

@@ -65,7 +65,7 @@ async def insert_safe_data(sheet_id: str, values: List):
 async def is_near_next_day(threshold_minutes: int = 3) -> bool:
         """
         Time check.
-
+        Returns False when crawling should stop, True when it may continue.
         """
         now = datetime.now()
         tomorrow = now.date() + timedelta(days=1)
@@ -73,9 +73,9 @@ async def is_near_next_day(threshold_minutes: int = 3) -> bool:
         time_left = midnight - now
 
         if time_left.total_seconds() < threshold_minutes * 60:
-            return True
+            return False
 
-        return False
+        return True
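After this change the helper reads as a "safe to continue" check: True while more than threshold_minutes remain before midnight, False once the cutoff is reached. A minimal synchronous illustration of the new convention (the standalone function and sample times below are illustrative, not part of the commit):

# Hedged sketch (not part of the commit): the flipped return convention of is_near_next_day,
# shown with an injectable "now" for clarity.
from datetime import datetime, timedelta


def near_next_day_check(now: datetime, threshold_minutes: int = 3) -> bool:
    midnight = datetime.combine(now.date() + timedelta(days=1), datetime.min.time())
    # True  -> more than threshold_minutes left before midnight, keep crawling
    # False -> within threshold_minutes of midnight, stop
    return (midnight - now).total_seconds() >= threshold_minutes * 60


print(near_next_day_check(datetime(2025, 7, 2, 12, 0)))   # True  -> continue
print(near_next_day_check(datetime(2025, 7, 2, 23, 58)))  # False -> stop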
 
 
 

+ 4 - 0
main.py

@@ -2,11 +2,15 @@ import time
 import signal  # New: import the signal module
 from multiprocessing import Process, cpu_count
 from typing import Dict
+import warnings
 
 from core.utils.log.logger_manager import LoggerManager
 from scheduler.process_manager import split_topics, start_worker_process
 from spiders.spider_registry import SPIDER_CLASS_MAP
 
+# Suppress urllib3 InsecureRequestWarning warnings
+warnings.filterwarnings('ignore', message='Unverified HTTPS request')
+
 # Global shutdown flag
 shutdown_flag = False
 

+ 18 - 11
scripts/office/wx_getDomainInfo.py

@@ -1,8 +1,13 @@
 import asyncio
 from readline import insert_text
-
+import urllib
 import requests
 import json
+import warnings
+
+# Suppress urllib3 InsecureRequestWarning warnings
+warnings.filterwarnings('ignore', message='Unverified HTTPS request')
+
 
 from core.utils.feishu_data_async import FeishuDataAsync
 
@@ -25,8 +30,11 @@ class WechatDomainFetcher:
         }
 
         try:
-            response = requests.get(url, params=params)
-            print(response)
+            response = requests.get(url, params=params, verify=False)
+            resp = response.json()
+            if resp.get("errcode", 0) != 0:
+                print(f"Failed to get WeChat access token: {resp}")
+                return None
             return response.json().get("access_token")
         except Exception as e:
             print(f"获取微信令牌失败: {str(e)}")
@@ -35,13 +43,13 @@ class WechatDomainFetcher:
     def get_domain_info(self, appid, secret):
         """获取小程序域名信息"""
         access_token = self.get_access_token(appid, secret)
-        if not access_token:
-            return None
 
         url = f"https://api.weixin.qq.com/wxa/getwxadevinfo?access_token={access_token}"
 
         try:
-            response = requests.get(url)
+            response = requests.get(url, verify=False)
+            if not response.json():
+                print(response.json())
             return response.json()
         except Exception as e:
             print(f"获取域名信息失败: {str(e)}")
@@ -54,19 +62,18 @@ async def main():
     domain_fetcher = WechatDomainFetcher()
 
     for _,app_name, appid, secret in config[1:]:
+        print(f"正在获取小程序 {app_name} 的域名信息...")
         if not appid or not secret:
+            print(f"小程序 {app_name} 缺少 appid 或 secret")
             continue
-        domain_info = domain_fetcher.get_domain_info(appid, secret)
-        if not domain_info:
-            continue
+        domain_info = domain_fetcher.get_domain_info(appid.strip(), secret.strip())
         bizdomain = domain_info.get('bizdomain')
         if bizdomain:
             insert_data = list(map(lambda x: [app_name, x], bizdomain))
-            print(insert_data)
             async with FeishuDataAsync() as feishu_data:
                     await feishu_data.insert_values("TxA2wpGZHiuLl2kMMokcaU9Mnlb", "d3a349", "A2:B", insert_data)
         else:
-            print(f"小程序 {app_name} 无法获取域名信息")
+            print(f"小程序 {app_name} 无法获取域名信息{domain_info}")
 
 
 if __name__ == '__main__':

+ 63 - 0
services/async_mysql_service.py

@@ -5,6 +5,7 @@ from typing import List, Optional, Dict, Any, Tuple
 
 from config import settings
 from core.base.async_mysql_client import AsyncMySQLClient
+from core.models.crawler_account_info import CrawlerAccountInfo
 from core.utils.log.logger_manager import LoggerManager
 
 class AsyncMysqlService:
@@ -166,6 +167,68 @@ class AsyncMysqlService:
         result = await self.fetch_all(sql)
         return result if result else 0
 
+    async def get_crawler_account_list(self, platform: str, mode: str) -> List[CrawlerAccountInfo]:
+        """
+        获取指定平台和模式的爬虫账户列表
+        
+        Args:
+            platform: 平台名称
+            mode: 平台模式
+            
+        Returns:
+            List[CrawlerAccountInfo]: 爬虫账户信息列表
+        """
+        sql = """
+            SELECT id, platform, platform_mode, priority, created_at, updated_at, last_crawled_at
+            FROM crawler_account_info 
+            WHERE platform = %s AND platform_mode = %s
+            ORDER BY priority DESC, id ASC
+        """
+        rows = await self.fetch_all(sql, [platform, mode])
+        return [CrawlerAccountInfo(**row) for row in rows]
+
+    async def update_crawler_account_last_crawled(self, account_id: int) -> bool:
+        """
+        更新爬虫账户的最后爬取时间
+        
+        Args:
+            account_id: 账户ID
+            
+        Returns:
+            bool: 是否更新成功
+        """
+        sql = "UPDATE crawler_account_info SET last_crawled_at = CURRENT_TIMESTAMP WHERE id = %s"
+        affected_rows = await self.execute(sql, [account_id])
+        return affected_rows > 0
+
+    async def insert_crawler_account(self, account: CrawlerAccountInfo) -> Optional[int]:
+        """
+        插入新的爬虫账户
+        
+        Args:
+            account: 爬虫账户信息
+            
+        Returns:
+            Optional[int]: 新插入记录的ID,失败返回None
+        """
+        sql = """
+            INSERT INTO crawler_account_info 
+            (platform, platform_mode, priority) 
+            VALUES (%s, %s, %s)
+        """
+        try:
+            # Note: only these fields are inserted; the remaining columns use their defaults
+            affected_rows = await self.execute(
+                sql, 
+                [account.platform, account.platform_mode, account.priority]
+            )
+            if affected_rows > 0:
+                # Fetch the ID of the newly inserted row
+                result = await self.fetch_one("SELECT LAST_INSERT_ID() as id")
+                return result["id"] if result else None
+        except Exception as e:
+            self.logger.error(f"Failed to insert crawler account: {str(e)}")
+        return None
 
 # Global convenience accessor (None arguments supported)
 async def get_db_service(platform: Optional[str] = None, mode: Optional[str] = None) -> AsyncMysqlService:
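The three new methods cover listing, touching, and inserting rows in crawler_account_info. A hedged usage sketch, assuming get_db_service returns a ready service and that CrawlerAccountInfo can be constructed from platform, platform_mode, and priority alone (both assumptions, not shown in this diff):

# Hedged usage sketch (not part of the commit) for the new account helpers.
import asyncio

from core.models.crawler_account_info import CrawlerAccountInfo
from services.async_mysql_service import get_db_service


async def demo():
    db = await get_db_service("zhongqingkandian", "recommend")

    accounts = await db.get_crawler_account_list("zhongqingkandian", "recommend")
    if not accounts:
        # Seed one account when the table is empty; the priority value is illustrative.
        new_id = await db.insert_crawler_account(
            CrawlerAccountInfo(platform="zhongqingkandian", platform_mode="recommend", priority=10)
        )
        print(f"seeded account id={new_id}")
        return

    # Highest priority comes first (ORDER BY priority DESC), so touch the head of the list.
    await db.update_crawler_account_last_crawled(accounts[0].id)


if __name__ == "__main__":
    asyncio.run(demo())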

+ 1 - 1
spiders/authorspider.py

@@ -36,7 +36,7 @@ class AuthorSpider(BaseSpider):
         while await self.is_video_count_sufficient():
 
             # 检查时间条件
-            if await is_near_next_day():
+            if not await is_near_next_day():
                 self.logger.info(f"距离第二天不足3分钟,停止执行")
                 return
 

+ 9 - 1
spiders/basespider.py

@@ -286,4 +286,12 @@ class BaseSpider(ABC):
         self.logger.info(
             f"爬虫执行完成: 成功 {self.stats['success']}, "
             f"失败 {self.stats['fail']}, 耗时 {duration:.2f}秒"
-        )
+        )
+
+    async def save_account_info(self):
+        sql = """
+            INSERT INTO crawler_account_info (platform, mode, trace_id, account_info)
+            VALUES (%s, %s, %s, %s)
+            ON DUPLICATE KEY UPDATE account_info = %s
+        """
+        await self.db_service.execute(sql, [self.platform, self.mode, self.trace_id, self.account_info, self.account_info])
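save_account_info is an upsert: the ON DUPLICATE KEY UPDATE branch only fires if crawler_account_info has a unique key on the conflicting columns, which this diff does not show. A minimal sketch of how a subclass might call it; the unique-key columns and the account_info payload are assumptions, not part of the commit:

# Hedged sketch (not part of the commit): calling the new upsert from a subclass.
# Assumes a unique key exists on the conflict columns (e.g. platform/mode/trace_id)
# and that account_info is stored as a JSON string; neither is stated in the diff.
import json

from spiders.basespider import BaseSpider


class DemoSpider(BaseSpider):
    async def persist_account(self):
        # Illustrative payload only; real account data would come from the crawl session.
        self.account_info = json.dumps({"cookie": "<redacted>"}, ensure_ascii=False)
        await self.save_account_info()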

+ 23 - 0
spiders/recommend/zhongqingkandian_recommend.py

@@ -0,0 +1,23 @@
+import asyncio
+
+from spiders.basespider import BaseSpider
+from spiders.recommendspider import RecommendSpider
+
+
+class ZhongqingkandianRecommend(RecommendSpider):
+    async def custom_start_checks(self) -> bool:
+        """自定义启动检查"""
+        self.logger.info("愿你福气满满爬虫启动检查通过")
+        return True
+
+
+async def main():
+    rule_dict = {'videos_cnt': {'min': 1500, 'max': 0}, 'duration': {'min': 30, 'max': 1200}}
+    user_list = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
+    trace_id = "yuannifuqimanman_202507021200"
+    spider = ZhongqingkandianRecommend(rule_dict, user_list, trace_id)
+    await spider.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())  # async entry point

+ 3 - 1
spiders/recommendspider.py

@@ -1,5 +1,7 @@
 # spiders/recommendspider.py
 from typing import List, Dict, Optional
+
+from core.utils.helpers import is_near_next_day
 from spiders.basespider import BaseSpider
 from core.utils.extractors import safe_extract
 
@@ -15,7 +17,7 @@ class RecommendSpider(BaseSpider):
     async def execute(self):
         """执行核心逻辑 - 使用 make_request 方法"""
         iteration = 0
-        while iteration < self.loop_times and await self.is_video_count_sufficient():
+        while iteration < self.loop_times and await self.is_video_count_sufficient() and await is_near_next_day():
             self.logger.info(f"执行第 {iteration + 1} 轮")
 
             # Prepare the request body

+ 6 - 0
spiders/spider_registry.py

@@ -6,16 +6,22 @@ from spiders.basespider import BaseSpider
 from spiders.recommend.benshanzhufu_recommend import BenshanzhufuRecommend
 from spiders.author.xiaoniangao_author import XiaoniangaoAuthor
 from spiders.recommend.yuannifuqimanman_recommend import YuannifuqimanmanRecommend
+from spiders.recommend.zhongqingkandian_recommend import ZhongqingkandianRecommend
 
 logger = LoggerManager.get_logger()
 aliyun_log = LoggerManager.get_aliyun_logger()
 
 # Spider class map: topic name -> spider class
 # Format: keys are MQ topic names, values are spider classes inheriting from BaseSpider
+
+
+
 SPIDER_CLASS_MAP = {
     "bszf_recommend_prod": BenshanzhufuRecommend,
     "ynfqmm_recommend_prod": YuannifuqimanmanRecommend,
+    "zqkd_recommend_prod": ZhongqingkandianRecommend,
     "xng_author_prod": XiaoniangaoAuthor,
+
     # Add mappings for new spiders here
 }
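Worker processes resolve an MQ topic to its spider class through this map. A hedged sketch of that lookup; the constructor arguments mirror the demo main() in zhongqingkandian_recommend.py and are illustrative, not the production scheduler:

# Hedged sketch (not part of the commit): resolving a topic to a spider class and running it.
import asyncio

from spiders.spider_registry import SPIDER_CLASS_MAP


async def run_topic(topic: str):
    spider_cls = SPIDER_CLASS_MAP.get(topic)
    if spider_cls is None:
        raise ValueError(f"unknown topic: {topic}")
    # rule_dict / user_list / trace_id values are illustrative only.
    rule_dict = {"videos_cnt": {"min": 1500, "max": 0}, "duration": {"min": 30, "max": 1200}}
    user_list = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
    spider = spider_cls(rule_dict, user_list, "zqkd_demo_trace")
    await spider.run()


if __name__ == "__main__":
    asyncio.run(run_topic("zqkd_recommend_prod"))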