Bläddra i källkod

2025-12-30-新增冷启动策略
使用外部阅读优先替换相关性优先

luojunhui 2 månader sedan
förälder
incheckning
00aec71f27
1 ändrade filer med 11 tillägg och 2 borttagningar
  1. 11 2
      applications/tasks/crawler_tasks/crawler_gzh.py

+ 11 - 2
applications/tasks/crawler_tasks/crawler_gzh.py

@@ -238,7 +238,15 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
         """crawler single account"""
         current_cursor = None
         gh_id = account["gh_id"]
-        latest_timestamp = account["latest_update_time"].timestamp()
+        # latest_timestamp = account["latest_update_time"].timestamp()
+        latest_update_time = account["latest_update_time"]
+        if latest_update_time:
+            latest_timestamp = latest_update_time.timestamp()
+        else:
+            latest_timestamp = self.DEFAULT_TIMESTAMP
+
+        print("最新更新时间:", timestamp_to_str(latest_timestamp))
+
         while True:
             # fetch response from weixin
             response = await get_article_list_from_account(
@@ -246,6 +254,7 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
             )
             msg_list = response.get("data", {}).get("data")
             if not msg_list:
+                print("No msg, Please check your data")
                 break
 
             # process current page
@@ -256,7 +265,7 @@ class CrawlerGzhAccountArticles(CrawlerGzhBaseStrategy):
             last_time_stamp_in_this_msg = last_article_in_this_page["AppMsg"][
                 "BaseInfo"
             ]["UpdateTime"]
-            if last_time_stamp_in_this_msg > latest_timestamp:
+            if last_time_stamp_in_this_msg <= latest_timestamp:
                 await self.update_account_latest_timestamp(gh_id)
                 break