Преглед изворног кода

Merge branch 'feature/luojunhui/20251231-add-cooperate-accounts-crawler' of Server/LongArticleTaskServer into master

luojunhui пре 2 месеци
родитељ
комит
5abbc4c3a4

+ 1 - 1
applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_const.py

@@ -17,7 +17,7 @@ class ArticlePoolColdStartConst:
     TITLE_LENGTH_LIMIT = 12
     TITLE_LENGTH_MAX = 40
 
-    DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "account_association"]
+    DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "cooperate_account"]
 
     CATEGORY_CONFIG_MAP = {
         "知识科普": {

+ 1 - 1
applications/tasks/cold_start_tasks/article_pool/article_pool_filter_strategy.py

@@ -52,7 +52,7 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
         # 第4层通过相关性分数过滤
         filter_df = filter_df[filter_df["score"] > self.SIMILARITY_SCORE_THRESHOLD]
         length_level4 = filter_df.shape[0]
-        daily_article_num = self.CATEGORY_CONFIG_MAP.get(category, 120).get("num", 120)
+        daily_article_num = self.CATEGORY_CONFIG_MAP.get(category, {}).get("num", 120)
         match strategy:
             case "strategy_v1":
                 await feishu_robot.bot(

+ 1 - 1
applications/tasks/cold_start_tasks/article_pool_cold_start.py

@@ -122,7 +122,7 @@ class ArticlePoolColdStart(ArticlePoolColdStartStrategy, ArticlePoolFilterStrate
         # create_crawler_plan
         crawler_plan_response = await auto_create_crawler_task(
             plan_id=None,
-            plan_name=f"冷启动-{strategy}-{category}-{datetime.date.today().__str__()}-{len(url_list)}",
+            plan_name=f"冷启动-{strategy}-{crawl_method}-{category}-{datetime.date.today().__str__()}-{len(url_list)}",
             plan_tag="品类冷启动",
             platform=platform,
             url_list=url_list,

+ 11 - 2
applications/tasks/crawler_tasks/crawler_gzh.py

@@ -238,7 +238,15 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
         """crawler single account"""
         current_cursor = None
         gh_id = account["gh_id"]
-        latest_timestamp = account["latest_update_time"].timestamp()
+        # latest_timestamp = account["latest_update_time"].timestamp()
+        latest_update_time = account["latest_update_time"]
+        if latest_update_time:
+            latest_timestamp = latest_update_time.timestamp()
+        else:
+            latest_timestamp = self.DEFAULT_TIMESTAMP
+
+        print("最新更新时间:", timestamp_to_str(latest_timestamp))
+
         while True:
             # fetch response from weixin
             response = await get_article_list_from_account(
@@ -246,6 +254,7 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
             )
             msg_list = response.get("data", {}).get("data")
             if not msg_list:
+                print("No msg, Please check your data")
                 break
 
             # process current page
@@ -256,7 +265,7 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
             last_time_stamp_in_this_msg = last_article_in_this_page["AppMsg"][
                 "BaseInfo"
             ]["UpdateTime"]
-            if last_time_stamp_in_this_msg > latest_timestamp:
+            if last_time_stamp_in_this_msg <= latest_timestamp:
                 await self.update_account_latest_timestamp(gh_id)
                 break
 

+ 2 - 2
applications/tasks/monitor_tasks/gzh_article_monitor.py

@@ -17,7 +17,7 @@ class MonitorConst:
     INIT_STATUS = 0
 
     # 监测周期
-    MONITOR_CYCLE = 3 * 24 * 3600
+    MONITOR_CYCLE = 2 * 24 * 3600
 
     # article code
     ARTICLE_ILLEGAL_CODE = 25012
@@ -324,7 +324,7 @@ class InnerGzhArticlesMonitor(MonitorConst):
                     await delete_illegal_gzh_articles(gh_id, title)
 
         except Exception as e:
-            print(f"crawler failed: {account_name}, error: {e}")
+            print(f"crawler failed: {account_name}-{url}, error: {e}")
 
     async def deal(self):
         article_list = await self.fetch_article_list_to_check()