Ver código fonte

Update weixinCategoryCrawler: update sensitivity status in batches

StrayWarrior 9 meses atrás
pai
commit
2f7ea41909
1 arquivos alterados com 33 adições e 16 exclusões
  1. 33 16
      coldStartTasks/crawler/weixinCategoryCrawler.py

+ 33 - 16
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -55,6 +55,7 @@ class weixinCategory(object):
         将数据更新到数据库
         :return:
         """
+        success_records = []
         for article_obj in article_list:
             detail_article_list = article_obj["AppMsg"]["DetailInfo"]
             for obj in detail_article_list:
@@ -62,6 +63,7 @@ class weixinCategory(object):
                     show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
                     show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
                     show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
+                    unique_idx = self.function.generateGzhId(obj["ContentUrl"])
                     insert_sql = f"""
                         insert into crawler_meta_article
                         (
@@ -87,12 +89,28 @@ class weixinCategory(object):
                             obj["send_time"],
                             int(time.time()),
                             DEFAULT_ARTICLE_STATUS,
-                            self.function.generateGzhId(obj["ContentUrl"]),
+                            unique_idx,
                             obj.get("llm_sensitivity", -1)
                         ),
                     )
+                    success_records.append({
+                        'unique_index': unique_idx, 'title': obj['Title']
+                    })
                 except Exception as e:
                     print(e)
+        return success_records
+
+    def update_article_sensitive_status(self, category, unique_index, status):
+        """
+        更新文章敏感状态
+        :return:
+        """
+        update_sql = f"""
+            update crawler_meta_article
+            set llm_sensitivity = %s
+            where category = %s and unique_index = %s;
+        """
+        self.db_client_lam.update(sql=update_sql, params=(status, category, unique_index))
 
     def update_latest_account_timestamp(self, gh_id):
         """
@@ -124,25 +142,13 @@ class weixinCategory(object):
         msg_list = response.get("data", {}).get("data")
         if msg_list:
             last_article_in_this_msg = msg_list[-1]
-
-            article_titles = []
-            for msg in msg_list:
-                for article in msg['AppMsg']['DetailInfo']:
-                    article_titles.append(article['Title'])
-            sensitive_results = llm_sensitivity.check_titles(article_titles, True)
-            for msg in msg_list:
-                for article in msg['AppMsg']['DetailInfo']:
-                    sensitive_hit = sensitive_results.get(article['Title'], None)
-                    if sensitive_hit:
-                        article['llm_sensitivity'] = sensitive_hit['hit_rule']
-
-            self.insert_data_into_db(
+            success_records = self.insert_data_into_db(
                 gh_id=gh_id, category=category, article_list=msg_list
             )
             last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
             if latest_time_stamp < last_time_stamp_in_this_msg:
                 next_cursor = response["data"]["next_cursor"]
-                return self.update_each_account(
+                return success_records + self.update_each_account(
                     gh_id=gh_id,
                     latest_time_stamp=latest_time_stamp,
                     category=category,
@@ -152,8 +158,10 @@ class weixinCategory(object):
                 # 更新最近抓取时间
                 self.update_latest_account_timestamp(gh_id=gh_id)
                 print("账号时间更新成功")
+                return success_records
         else:
             print("No more data")
+            return []
 
     def deal(self, category_list):
         """
@@ -171,11 +179,20 @@ class weixinCategory(object):
                         timestamp = int(account['latest_timestamp'].timestamp())
                     except Exception as e:
                         timestamp = DEFAULT_TIMESTAMP
-                    self.update_each_account(
+                    success_records = self.update_each_account(
                         gh_id=gh_id,
                         category=category,
                         latest_time_stamp=timestamp
                     )
+                    success_titles = [x['title'] for x in success_records]
+                    if success_titles:
+                        sensitive_results = llm_sensitivity.check_titles(success_titles)
+                        for record, sensitive_result in zip(success_records, sensitive_results):
+                            self.update_article_sensitive_status(
+                                category=category,
+                                unique_index=record['unique_index'],
+                                status=sensitive_result['hit_rule']
+                            )
                     print("success")
                 except Exception as e:
                     print("fail because of {}".format(e))