
中青看点 (zhongqingkandian)

zhangliang committed 1 month ago
parent commit 275f340ef0

+ 29 - 0
config/spiders_config.yaml

@@ -127,4 +127,33 @@ xiaoniangaoauthor:
 #      publish_time_stamp: "$.t"
 
 
+zhongqingkandianrecommend:
+  platform: zhongqingkandian
+  mode: recommend
+  path: /crawler/zhong_qing_kan_dian/recommend
+  method: post
+  request_body:
+    cursor: "{{next_cursor}}"
+  loop_times: 10000
+  loop_interval:
+    min: 30
+    max: 60
+  feishu_sheetid: "v8S6nL"
+  response_parse:
+    data: "$.data"
+    next_cursor: "$.data.next_cursor"
+    data_path: "$.data.data"
+    fields:
+      video_id: "$.channel_content_id"
+      video_title: "$.title"
+      play_cnt: "$.read_num"
+      publish_time_stamp: "$.publish_timestamp"
+      out_user_id: "$.channel_account_id"
+      cover_url: "$.image_url_list[0]['image_url']"
+      collection_cnt: "$.collect_num"
+      share_cnt: "$.share_num"
+      comment_cnt: "$.cmt_num"
+      video_url: "$.video_url"
+      out_video_id: "$.channel_content_id"
+
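Note on the config above: `response_parse` maps JSONPath expressions onto item fields — `data_path` selects the item list, `next_cursor` feeds the `{{next_cursor}}` placeholder in `request_body`, and each entry under `fields` is evaluated against a single item. A minimal sketch of that extraction, assuming the jsonpath-ng library and an illustrative response shape (neither is part of this commit):

# Hedged sketch (not part of the commit): applying the response_parse mapping with jsonpath-ng.
from jsonpath_ng import parse

sample_response = {
    "data": {
        "next_cursor": "abc123",
        "data": [
            {
                "channel_content_id": "v001",
                "title": "sample video",
                "read_num": 1200,
                "image_url_list": [{"image_url": "https://example.com/cover.jpg"}],
                "video_url": "https://example.com/v001.mp4",
            }
        ],
    }
}

field_paths = {
    "video_id": "$.channel_content_id",
    "video_title": "$.title",
    "play_cnt": "$.read_num",
    "cover_url": "$.image_url_list[0].image_url",
    "video_url": "$.video_url",
}

next_cursor = parse("$.data.next_cursor").find(sample_response)[0].value
for item in (m.value for m in parse("$.data.data[*]").find(sample_response)):
    row = {
        name: next((m.value for m in parse(path).find(item)), None)
        for name, path in field_paths.items()
    }
    print(next_cursor, row)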
 

+ 3 - 1
core/models/__init__.py

@@ -6,11 +6,13 @@
 from .video_item import VideoItem
 from .spiders_config_models import BaseConfig, PlatformConfig
 from .rule_models import RuleModel, RuleField
+from .crawler_account_info import CrawlerAccountInfo
 
 __all__ = [
     "VideoItem",
     "BaseConfig", 
     "PlatformConfig",
     "RuleModel",
-    "RuleField"
+    "RuleField",
+    "CrawlerAccountInfo",
 ]

+ 3 - 3
core/utils/helpers.py

@@ -65,7 +65,7 @@ async def insert_safe_data(sheet_id: str, values: List):
 async def is_near_next_day(threshold_minutes: int = 3) -> bool:
         """
         Time check.
-
+        Returns False when crawling should stop, True when it may continue.
         """
         now = datetime.now()
         tomorrow = now.date() + timedelta(days=1)
@@ -73,9 +73,9 @@ async def is_near_next_day(threshold_minutes: int = 3) -> bool:
         time_left = midnight - now
 
         if time_left.total_seconds() < threshold_minutes * 60:
-            return True
+            return False
 
-        return False
+        return True
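After this change the helper reads as a "safe to continue" check: True while more than threshold_minutes remain before midnight, False once the cutoff is reached. A minimal synchronous illustration of the new convention (the standalone function and sample times below are illustrative, not part of the commit):

# Hedged sketch (not part of the commit): the flipped return convention of is_near_next_day,
# shown with an injectable "now" for clarity.
from datetime import datetime, timedelta


def near_next_day_check(now: datetime, threshold_minutes: int = 3) -> bool:
    midnight = datetime.combine(now.date() + timedelta(days=1), datetime.min.time())
    # True  -> more than threshold_minutes left before midnight, keep crawling
    # False -> within threshold_minutes of midnight, stop
    return (midnight - now).total_seconds() >= threshold_minutes * 60


print(near_next_day_check(datetime(2025, 7, 2, 12, 0)))   # True  -> continue
print(near_next_day_check(datetime(2025, 7, 2, 23, 58)))  # False -> stop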
 
 
 

+ 4 - 0
main.py

@@ -2,11 +2,15 @@ import time
 import signal  # New: import the signal module
 from multiprocessing import Process, cpu_count
 from typing import Dict
+import warnings
 
 from core.utils.log.logger_manager import LoggerManager
 from scheduler.process_manager import split_topics, start_worker_process
 from spiders.spider_registry import SPIDER_CLASS_MAP
 
+# Suppress urllib3 InsecureRequestWarning warnings
+warnings.filterwarnings('ignore', message='Unverified HTTPS request')
+
 # Global shutdown flag
 shutdown_flag = False
 

+ 18 - 11
scripts/office/wx_getDomainInfo.py

@@ -1,8 +1,13 @@
 import asyncio
 from readline import insert_text
-
+import urllib
 import requests
 import json
+import warnings
+
+# Suppress urllib3 InsecureRequestWarning warnings
+warnings.filterwarnings('ignore', message='Unverified HTTPS request')
+
 
 from core.utils.feishu_data_async import FeishuDataAsync
 
@@ -25,8 +30,11 @@ class WechatDomainFetcher:
         }
 
         try:
-            response = requests.get(url, params=params)
-            print(response)
+            response = requests.get(url, params=params, verify=False)
+            resp = response.json()
+            if resp.get("errcode", 0) != 0:
+                print(f"Failed to get WeChat access token: {resp}")
+                return None
             return response.json().get("access_token")
         except Exception as e:
             print(f"获取微信令牌失败: {str(e)}")
@@ -35,13 +43,13 @@ class WechatDomainFetcher:
     def get_domain_info(self, appid, secret):
         """获取小程序域名信息"""
         access_token = self.get_access_token(appid, secret)
-        if not access_token:
-            return None
 
         url = f"https://api.weixin.qq.com/wxa/getwxadevinfo?access_token={access_token}"
 
         try:
-            response = requests.get(url)
+            response = requests.get(url, verify=False)
+            if not response.json():
+                print(response.json())
             return response.json()
         except Exception as e:
             print(f"获取域名信息失败: {str(e)}")
@@ -54,19 +62,18 @@ async def main():
     domain_fetcher = WechatDomainFetcher()
 
     for _,app_name, appid, secret in config[1:]:
+        print(f"正在获取小程序 {app_name} 的域名信息...")
         if not appid or not secret:
+            print(f"小程序 {app_name} 缺少 appid 或 secret")
             continue
-        domain_info = domain_fetcher.get_domain_info(appid, secret)
-        if not domain_info:
-            continue
+        domain_info = domain_fetcher.get_domain_info(appid.strip(), secret.strip())
         bizdomain = domain_info.get('bizdomain')
         if bizdomain:
             insert_data = list(map(lambda x: [app_name, x], bizdomain))
-            print(insert_data)
             async with FeishuDataAsync() as feishu_data:
                     await feishu_data.insert_values("TxA2wpGZHiuLl2kMMokcaU9Mnlb", "d3a349", "A2:B", insert_data)
         else:
-            print(f"小程序 {app_name} 无法获取域名信息")
+            print(f"小程序 {app_name} 无法获取域名信息{domain_info}")
 
 
 if __name__ == '__main__':

+ 63 - 0
services/async_mysql_service.py

@@ -5,6 +5,7 @@ from typing import List, Optional, Dict, Any, Tuple
 
 from config import settings
 from core.base.async_mysql_client import AsyncMySQLClient
+from core.models.crawler_account_info import CrawlerAccountInfo
 from core.utils.log.logger_manager import LoggerManager
 
 class AsyncMysqlService:
@@ -166,6 +167,68 @@ class AsyncMysqlService:
         result = await self.fetch_all(sql)
         return result if result else 0
 
+    async def get_crawler_account_list(self, platform: str, mode: str) -> List[CrawlerAccountInfo]:
+        """
+        获取指定平台和模式的爬虫账户列表
+        
+        Args:
+            platform: 平台名称
+            mode: 平台模式
+            
+        Returns:
+            List[CrawlerAccountInfo]: 爬虫账户信息列表
+        """
+        sql = """
+            SELECT id, platform, platform_mode, priority, created_at, updated_at, last_crawled_at
+            FROM crawler_account_info 
+            WHERE platform = %s AND platform_mode = %s
+            ORDER BY priority DESC, id ASC
+        """
+        rows = await self.fetch_all(sql, [platform, mode])
+        return [CrawlerAccountInfo(**row) for row in rows]
+
+    async def update_crawler_account_last_crawled(self, account_id: int) -> bool:
+        """
+        更新爬虫账户的最后爬取时间
+        
+        Args:
+            account_id: 账户ID
+            
+        Returns:
+            bool: 是否更新成功
+        """
+        sql = "UPDATE crawler_account_info SET last_crawled_at = CURRENT_TIMESTAMP WHERE id = %s"
+        affected_rows = await self.execute(sql, [account_id])
+        return affected_rows > 0
+
+    async def insert_crawler_account(self, account: CrawlerAccountInfo) -> Optional[int]:
+        """
+        插入新的爬虫账户
+        
+        Args:
+            account: 爬虫账户信息
+            
+        Returns:
+            Optional[int]: 新插入记录的ID,失败返回None
+        """
+        sql = """
+            INSERT INTO crawler_account_info 
+            (platform, platform_mode, priority) 
+            VALUES (%s, %s, %s)
+        """
+        try:
+            # Note: only these fields are inserted; the remaining columns use their defaults
+            affected_rows = await self.execute(
+                sql, 
+                [account.platform, account.platform_mode, account.priority]
+            )
+            if affected_rows > 0:
+                # Fetch the ID of the newly inserted row
+                result = await self.fetch_one("SELECT LAST_INSERT_ID() as id")
+                return result["id"] if result else None
+        except Exception as e:
+            self.logger.error(f"Failed to insert crawler account: {str(e)}")
+        return None
 
 # Global convenience accessor (None arguments supported)
 async def get_db_service(platform: Optional[str] = None, mode: Optional[str] = None) -> AsyncMysqlService:
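The three new methods cover listing, touching, and inserting rows in crawler_account_info. A hedged usage sketch, assuming get_db_service returns a ready service and that CrawlerAccountInfo can be constructed from platform, platform_mode, and priority alone (both assumptions, not shown in this diff):

# Hedged usage sketch (not part of the commit) for the new account helpers.
import asyncio

from core.models.crawler_account_info import CrawlerAccountInfo
from services.async_mysql_service import get_db_service


async def demo():
    db = await get_db_service("zhongqingkandian", "recommend")

    accounts = await db.get_crawler_account_list("zhongqingkandian", "recommend")
    if not accounts:
        # Seed one account when the table is empty; the priority value is illustrative.
        new_id = await db.insert_crawler_account(
            CrawlerAccountInfo(platform="zhongqingkandian", platform_mode="recommend", priority=10)
        )
        print(f"seeded account id={new_id}")
        return

    # Highest priority comes first (ORDER BY priority DESC), so touch the head of the list.
    await db.update_crawler_account_last_crawled(accounts[0].id)


if __name__ == "__main__":
    asyncio.run(demo())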

+ 1 - 1
spiders/authorspider.py

@@ -36,7 +36,7 @@ class AuthorSpider(BaseSpider):
         while await self.is_video_count_sufficient():
 
             # 检查时间条件
-            if await is_near_next_day():
+            if not await is_near_next_day():
                 self.logger.info(f"距离第二天不足3分钟,停止执行")
                 return
 

+ 9 - 1
spiders/basespider.py

@@ -286,4 +286,12 @@ class BaseSpider(ABC):
         self.logger.info(
             f"爬虫执行完成: 成功 {self.stats['success']}, "
             f"失败 {self.stats['fail']}, 耗时 {duration:.2f}秒"
-        )
+        )
+
+    async def save_account_info(self):
+        sql = """
+            INSERT INTO crawler_account_info (platform, mode, trace_id, account_info)
+            VALUES (%s, %s, %s, %s)
+            ON DUPLICATE KEY UPDATE account_info = %s
+        """
+        await self.db_service.execute(sql, [self.platform, self.mode, self.trace_id, self.account_info, self.account_info])
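save_account_info is an upsert: the ON DUPLICATE KEY UPDATE branch only fires if crawler_account_info has a unique key on the conflicting columns, which this diff does not show. A minimal sketch of how a subclass might call it; the unique-key columns and the account_info payload are assumptions, not part of the commit:

# Hedged sketch (not part of the commit): calling the new upsert from a subclass.
# Assumes a unique key exists on the conflict columns (e.g. platform/mode/trace_id)
# and that account_info is stored as a JSON string; neither is stated in the diff.
import json

from spiders.basespider import BaseSpider


class DemoSpider(BaseSpider):
    async def persist_account(self):
        # Illustrative payload only; real account data would come from the crawl session.
        self.account_info = json.dumps({"cookie": "<redacted>"}, ensure_ascii=False)
        await self.save_account_info()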

+ 23 - 0
spiders/recommend/zhongqingkandian_recommend.py

@@ -0,0 +1,23 @@
+import asyncio
+
+from spiders.basespider import BaseSpider
+from spiders.recommendspider import RecommendSpider
+
+
+class ZhongqingkandianRecommend(RecommendSpider):
+    async def custom_start_checks(self) -> bool:
+        """自定义启动检查"""
+        self.logger.info("愿你福气满满爬虫启动检查通过")
+        return True
+
+
+async def main():
+    rule_dict = {'videos_cnt': {'min': 1500, 'max': 0}, 'duration': {'min': 30, 'max': 1200}}
+    user_list = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
+    trace_id = "yuannifuqimanman_202507021200"
+    spider = ZhongqingkandianRecommend(rule_dict, user_list, trace_id)
+    await spider.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())  # async entry point

+ 3 - 1
spiders/recommendspider.py

@@ -1,5 +1,7 @@
 # spiders/recommendspider.py
 from typing import List, Dict, Optional
+
+from core.utils.helpers import is_near_next_day
 from spiders.basespider import BaseSpider
 from core.utils.extractors import safe_extract
 
@@ -15,7 +17,7 @@ class RecommendSpider(BaseSpider):
     async def execute(self):
         """执行核心逻辑 - 使用 make_request 方法"""
         iteration = 0
-        while iteration < self.loop_times and await self.is_video_count_sufficient():
+        while iteration < self.loop_times and await self.is_video_count_sufficient() and await is_near_next_day():
             self.logger.info(f"执行第 {iteration + 1} 轮")
 
             # Prepare the request body

+ 6 - 0
spiders/spider_registry.py

@@ -6,16 +6,22 @@ from spiders.basespider import BaseSpider
 from spiders.recommend.benshanzhufu_recommend import BenshanzhufuRecommend
 from spiders.author.xiaoniangao_author import XiaoniangaoAuthor
 from spiders.recommend.yuannifuqimanman_recommend import YuannifuqimanmanRecommend
+from spiders.recommend.zhongqingkandian_recommend import ZhongqingkandianRecommend
 
 logger = LoggerManager.get_logger()
 aliyun_log = LoggerManager.get_aliyun_logger()
 
 # Spider class map: topic name -> spider class
 # Format: keys are MQ topic names, values are spider classes inheriting from BaseSpider
+
+
+
 SPIDER_CLASS_MAP = {
     "bszf_recommend_prod": BenshanzhufuRecommend,
     "ynfqmm_recommend_prod": YuannifuqimanmanRecommend,
+    "zqkd_recommend_prod": ZhongqingkandianRecommend,
     "xng_author_prod": XiaoniangaoAuthor,
+
     # Add mappings for new spiders here
 }
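Worker processes resolve an MQ topic to its spider class through this map. A hedged sketch of that lookup; the constructor arguments mirror the demo main() in zhongqingkandian_recommend.py and are illustrative, not the production scheduler:

# Hedged sketch (not part of the commit): resolving a topic to a spider class and running it.
import asyncio

from spiders.spider_registry import SPIDER_CLASS_MAP


async def run_topic(topic: str):
    spider_cls = SPIDER_CLASS_MAP.get(topic)
    if spider_cls is None:
        raise ValueError(f"unknown topic: {topic}")
    # rule_dict / user_list / trace_id values are illustrative only.
    rule_dict = {"videos_cnt": {"min": 1500, "max": 0}, "duration": {"min": 30, "max": 1200}}
    user_list = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
    spider = spider_cls(rule_dict, user_list, "zqkd_demo_trace")
    await spider.run()


if __name__ == "__main__":
    asyncio.run(run_topic("zqkd_recommend_prod"))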