Ver Fonte

添加标题敏感度

luojunhui há 8 meses atrás
pai
commit
43d2e77900

+ 29 - 7
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 抓取全局品类文章
 """
-
+import json
 import time
 
 from tqdm import tqdm
@@ -10,6 +10,7 @@ from pymysql.cursors import DictCursor
 
 from applications import WeixinSpider, Functions, llm_sensitivity, log
 from coldStartTasks.filter import article_crawler_duplicate_filter
+from config import apolloConfig
 
 # 常量
 ACCOUNT_GOOD_STATUS = 1
@@ -24,6 +25,24 @@ DEFAULT_LIKE_COUNT = 0
 DEFAULT_ARTICLE_STATUS = 1
 DEFAULT_TIMESTAMP = 1717171200
 
+# Title sensitivity flags: values written to the title_sensitivity column of crawler_meta_article
+TITLE_SENSITIVE = 1
+TITLE_NOT_SENSITIVE = 0
+
+config = apolloConfig()  # runs at import time
+sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))  # config value is parsed as JSON; iterated as a list of words by whether_title_sensitive
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    Check whether a title contains any configured sensitive word.
+
+    :param title: title text to scan
+    :return: True if at least one entry of ``sensitive_word_list`` occurs
+             as a substring of ``title``, otherwise False
+    """
+    # any() stops at the first matching word, like the explicit early return.
+    return any(keyword in title for keyword in sensitive_word_list)
+
 
 class weixinCategory(object):
     """
@@ -77,6 +96,7 @@ class weixinCategory(object):
         """
         将数据更新到数据库
         :return:
+
         """
         success_records = []
         for article_obj in article_list:
@@ -85,7 +105,7 @@ class weixinCategory(object):
                 try:
                     # 判断文章是否存在相同的标题
                     if article_crawler_duplicate_filter(
-                        new_article_title=obj["Title"], db_client=self.db_client_lam
+                            new_article_title=obj["Title"], db_client=self.db_client_lam
                     ):
                         log(
                             function="weixinCategory",
@@ -94,6 +114,9 @@ class weixinCategory(object):
                             data={"title": obj["Title"]}
                         )
                         continue
+
+                    # 判断标题是否包含敏感词
+                    title_sensitivity = TITLE_SENSITIVE if whether_title_sensitive(obj["Title"]) else TITLE_NOT_SENSITIVE
                     show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
                     show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
                     show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
@@ -102,10 +125,10 @@ class weixinCategory(object):
                         insert into crawler_meta_article
                         (
                          platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
-                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity
+                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
                         )
                         VALUES 
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                     """
                     self.db_client_lam.update(
                         sql=insert_sql,
@@ -124,7 +147,8 @@ class weixinCategory(object):
                             int(time.time()),
                             DEFAULT_ARTICLE_STATUS,
                             unique_idx,
-                            obj.get("llm_sensitivity", -1)
+                            obj.get("llm_sensitivity", -1),
+                            title_sensitivity
                         ),
                     )
                     success_records.append({
@@ -275,5 +299,3 @@ class weixinCategory(object):
                 )
             except Exception as e:
                 print(e)
-
-

+ 3 - 1
coldStartTasks/publish/publishCategoryArticles.py

@@ -15,6 +15,7 @@ from config import apolloConfig
 apollo = apolloConfig()
 DAILY_CRAWLER_MAX_NUM = 1000
 SIMILARITY_MIN_SCORE = 0.4
+TITLE_NOT_SENSITIVE = 0
 
 
 class CategoryColdStartTask(object):
@@ -81,6 +82,7 @@ class CategoryColdStartTask(object):
         """
         从长文 meta 库中获取冷启文章
         :return:
+
         """
         sql = f"""
         SELECT 
@@ -88,7 +90,7 @@ class CategoryColdStartTask(object):
         FROM
             crawler_meta_article
         WHERE 
-            category = "{category}" and platform = "{article_source}"
+            category = "{category}" and platform = "{article_source}" and title_sensitivity = {TITLE_NOT_SENSITIVE}
         ORDER BY score DESC;
         """
         article_list = self.db_client.select(sql)