Przeglądaj źródła

outside article monitor

luojunhui 5 miesięcy temu
rodzic
commit
907656830f
1 zmienionych plików z 42 dodań i 25 usunięć
  1. 42 25
      tasks/monitor_tasks/outside_gzh_articles_monitor.py

+ 42 - 25
tasks/monitor_tasks/outside_gzh_articles_monitor.py

@@ -10,7 +10,20 @@ from cold_start.crawler.wechat import get_article_list_from_account
 from config import long_articles_config, denet_config
 
 
-class OutsideGzhArticlesManager:
+class Const:  # Shared constants; OutsideGzhArticlesManager inherits this class and reads them via self.
+    # Article violation status values for the illegal_status column
+    ILLEGAL_STATUS = 1
+    INIT_STATUS = 0
+
+    # Monitoring window in seconds (5 days)
+    MONITOR_CYCLE = 5 * 60 * 60 * 24  # NOTE(review): replaces a former 7-day threshold in fetch_article_list_to_check, and whether_published_in_a_week implies 7 days - confirm 5 is intended
+
+    # Response codes compared against article_detail["code"]
+    ILLEGAL_CODE = 25012
+    SUCCESS_CODE = 0
+
+
+class OutsideGzhArticlesManager(Const):
 
     def __init__(self):
         self.long_articles_client = DatabaseConnector(long_articles_config)
@@ -19,7 +32,9 @@ class OutsideGzhArticlesManager:
         self.denet_client.connect()
         self.feishu_bot_api = FeishuBotApi()
 
-    def update_article_illegal_status(self, article_id: int, illegal_reason: str) -> None:
+    def update_article_illegal_status(
+        self, article_id: int, illegal_reason: str
+    ) -> None:
         update_query = f"""
             update outside_gzh_account_monitor
             set illegal_status = %s, illegal_reason = %s
@@ -27,7 +42,7 @@ class OutsideGzhArticlesManager:
         """
         self.long_articles_client.save(
             query=update_query,
-            params=(1, illegal_reason, article_id, 0)
+            params=(self.ILLEGAL_STATUS, illegal_reason, article_id, self.INIT_STATUS),
         )
 
     def whether_published_in_a_week(self, gh_id: str) -> bool:
@@ -40,13 +55,15 @@ class OutsideGzhArticlesManager:
             order by publish_timestamp desc
             limit 1;
         """
-        fetch_response = self.long_articles_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        fetch_response = self.long_articles_client.fetch(
+            query=fetch_query, cursor_type=DictCursor
+        )
         if fetch_response:
-             publish_timestamp = fetch_response[0]['publish_timestamp']
-             if publish_timestamp is None:
-                 return False
-             else:
-                 return int(time.time()) - publish_timestamp <= 5 * 24 * 3600
+            publish_timestamp = fetch_response[0]["publish_timestamp"]
+            if publish_timestamp is None:
+                return False
+            else:
+                return int(time.time()) - publish_timestamp <= self.MONITOR_CYCLE
         else:
             return False
 
@@ -78,7 +95,9 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
         try:
             msg_list = fetch_response.get("data", {}).get("data", [])
             if msg_list:
-                for msg in tqdm(msg_list, desc=f"insert account {account['account_name']}"):
+                for msg in tqdm(
+                    msg_list, desc=f"insert account {account['account_name']}"
+                ):
                     self.save_each_msg_to_db(msg, account)
 
             else:
@@ -101,22 +120,22 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
             link = article["ContentUrl"]
             article_detail = get_article_detail(link)
             response_code = article_detail["code"]
-            if response_code == 25012:
+            if response_code == self.ILLEGAL_CODE:
                 illegal_reason = article_detail.get("msg")
                 # bot and return
                 self.feishu_bot_api.bot(
                     title="文章违规告警",
                     detail={
                         "account_name": article["account_name"],
-                        "title": article['title'],
+                        "title": article["title"],
                         "reason": illegal_reason,
                         "publish_timestamp": create_timestamp,
-                        "account_source": article["account_source"]
+                        "account_source": article["account_source"],
                     },
-                    env="outside_gzh_monitor"
+                    env="outside_gzh_monitor",
                 )
 
-            elif response_code == 0:
+            elif response_code == self.SUCCESS_CODE:
                 insert_query = f"""
                     insert ignore into outside_gzh_account_monitor
                     (account_name, gh_id, account_source, account_type, app_msg_id, publish_type, position, title, link, 
@@ -150,20 +169,18 @@ class OutsideGzhArticlesCollector(OutsideGzhArticlesManager):
             try:
                 self.fetch_each_account(account)
             except Exception as e:
-               print(
-                   f"crawler failed: {account['account_name']}, error: {e}"
-               )
+                print(f"crawler failed: {account['account_name']}, error: {e}")
 
 
 class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
 
     def fetch_article_list_to_check(self):
-        publish_timestamp_threshold = int(time.time()) - 7 * 24 * 3600
+        publish_timestamp_threshold = int(time.time()) - self.MONITOR_CYCLE
         fetch_query = f"""
             select id, account_name, gh_id, account_source, account_type, 
                 title, link, from_unixtime(publish_timestamp) as publish_date
             from outside_gzh_account_monitor
-            where illegal_status = 0 and publish_timestamp > {publish_timestamp_threshold};
+            where illegal_status = {self.INIT_STATUS} and publish_timestamp > {publish_timestamp_threshold};
         """
         return self.long_articles_client.fetch(
             query=fetch_query, cursor_type=DictCursor
@@ -176,18 +193,18 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
         link = article["link"]
         article_detail = get_article_detail(link)
         response_code = article_detail["code"]
-        if response_code == 25012:
+        if response_code == self.ILLEGAL_CODE:
             illegal_reason = article_detail.get("msg")
             self.feishu_bot_api.bot(
                 title="文章违规告警",
                 detail={
                     "account_name": article["account_name"],
-                    "title": article['title'],
+                    "title": article["title"],
                     "reason": illegal_reason,
                     "publish_date": article["publish_date"],
-                    "account_source": article["account_source"]
+                    "account_source": article["account_source"],
                 },
-                env="outside_gzh_monitor"
+                env="outside_gzh_monitor",
             )
             article_id = article["id"]
             self.update_article_illegal_status(article_id, illegal_reason)
@@ -205,4 +222,4 @@ class OutsideGzhArticlesMonitor(OutsideGzhArticlesManager):
                     f"link: {article['link']}\n"
                     f"title: {article['title']}\n"
                     f"error: {e}\n"
-                )
+                )