Browse Source

抓取增加标题判断,判断节日,短标题,这两类视频不会进入审核队列

luojunhui 6 months ago
parent
commit
25e85a5e0a

+ 10 - 0
applications/const.py

@@ -128,6 +128,16 @@ class WeixinVideoCrawlerConst:
     # 每天发送的审核视频数量
     MAX_VIDEO_NUM = 500
 
+    # Title status codes (the crawler stores one of these in the bad_status column)
+    TITLE_DEFAULT_STATUS = 0  # title matched neither the festival nor the short-title check
+    TITLE_EXIT_STATUS = 1  # NOTE(review): not referenced in the visible code; name may be a typo for "EXIST" — confirm
+    TITLE_FESTIVAL_STATUS = 2  # title contains a configured festival name
+    TITLE_SHORT_STATUS = 3  # title is shorter than TITLE_MIN_LENGTH
+
+    # Minimum title length; titles with fewer characters get TITLE_SHORT_STATUS
+    TITLE_MIN_LENGTH = 15
+
+
 
 
 

+ 32 - 3
coldStartTasks/crawler/weixin_video_crawler.py

@@ -10,6 +10,7 @@ from typing import List, Dict
 from tqdm import tqdm
 from pymysql.cursors import DictCursor
 
+from config import apolloConfig
 from applications import bot
 from applications import log
 from applications import Functions
@@ -18,8 +19,9 @@ from applications import longArticlesMySQL
 from applications.const import WeixinVideoCrawlerConst
 
 spider = WeixinSpider()
-const = WeixinVideoCrawlerConst()
 functions = Functions()
+config = apolloConfig(env="prod")
+const = WeixinVideoCrawlerConst()
 
 
 class WeixinVideoCrawler(object):
@@ -29,6 +31,31 @@ class WeixinVideoCrawler(object):
 
     def __init__(self):
         self.db_client = longArticlesMySQL()
+        self.festival_list = json.loads(config.getConfigValue("festival"))
+
+    def is_festival(self, title: str) -> bool:
+        """
+        Tell whether the title mentions any festival from self.festival_list.
+        :param title: article title to scan
+        :return: True if at least one festival entry is a substring of title, otherwise False
+        """
+        for festival in self.festival_list:
+            if festival in title:
+                return True
+        return False
+
+    def get_title_status(self, title: str) -> int:
+        """
+        Map a title to one of the TITLE_*_STATUS constants.
+        :param title: article title to classify
+        :return: TITLE_FESTIVAL_STATUS, TITLE_SHORT_STATUS or TITLE_DEFAULT_STATUS (checked in that order)
+        """
+        if self.is_festival(title):  # festival check wins even when the title is also short
+            return const.TITLE_FESTIVAL_STATUS
+        elif len(title) < const.TITLE_MIN_LENGTH:
+            return const.TITLE_SHORT_STATUS
+        else:
+            return const.TITLE_DEFAULT_STATUS
 
     def update_account_latest_crawler_timestamp(self, gh_id: str) -> int:
         """
@@ -142,11 +169,12 @@ class WeixinVideoCrawler(object):
                             show_stat = functions.show_desc_to_sta(show_desc)
                             read_cnt = show_stat.get("show_view_count", 0)
                             like_cnt = show_stat.get("show_like_count", 0)
+                            title_status = self.get_title_status(title)
                             insert_sql = f"""
                                 INSERT INTO publish_single_video_source
-                                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5)
+                                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_index, article_publish_type, article_url, cover_url, video_oss_path, bad_status, publish_timestamp, crawler_timestamp, url_unique_md5)
                                 values
-                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                             """
                             try:
                                 self.db_client.update(
@@ -163,6 +191,7 @@ class WeixinVideoCrawler(object):
                                         article_url,
                                         cover_url,
                                         oss_path,
+                                        title_status,
                                         create_time,
                                         int(time.time()),
                                         url_unique

+ 1 - 1
coldStartTasks/publish/publish_video_to_pq_for_audit.py

@@ -36,7 +36,7 @@ class PublishVideosForAudit(object):
         sql = f"""
             SELECT id, article_title, video_oss_path 
             FROM publish_single_video_source 
-            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS}
+            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
             LIMIT {rest_count};
             """
         response = self.db.select(sql, cursor_type=DictCursor)