Kaynağa Gözat

标题相似度

luojunhui 3 ay önce
ebeveyn
işleme
9742548ecd

+ 62 - 16
coldStartTasks/filter/title_similarity_task.py

@@ -54,7 +54,7 @@ class ColdStartTitleSimilarityTask(object):
         title_list = [i[0] for i in mysql_response]
         return title_list
 
-    def get_title_from_meta_base(self, limit):
+    def get_article_title_from_meta_base(self, limit):
         """
         获取meta_base表中文章标题列表
         status: 1 表示文章初始化状态
@@ -70,17 +70,56 @@ class ColdStartTitleSimilarityTask(object):
         mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return mysql_response
 
-    def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
+    def get_video_title_from_meta_table(self, limit):
         """
-        批量更新crawler_meta_article
+        获取publish_single_video_source表中视频标题列表
+        audit_status = 0 表示视频初始化状态
+        """
+        if limit:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null 
+                    and bad_status = 0
+                limit {limit};
+            """
+        else:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null
+                    and bad_status = 0;
+            """
+        mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return mysql_response
+
+    def update_meta_database_batch(self, meta_source: str, update_data_list: list[tuple]) -> int:
         """
-        sql = """
-            update crawler_meta_article
-            set score = case article_id
-                {}
-            end
-            where article_id in %s and score is null;
+        按meta_source批量更新score: video -> publish_single_video_source, article -> crawler_meta_article
         """
+        match meta_source:
+            case "video":
+                sql = """
+                    update publish_single_video_source
+                    set score = case id
+                        {}
+                    end
+                    where id in %s and score is null;
+                """
+            case "article":
+                sql = """
+                    update crawler_meta_article
+                    set score = case article_id
+                        {}
+                    end
+                    where article_id in %s and score is null;
+                """
+            case _:
+                print("source_type is not valid")
+                return 0
+
         case_statement = []
         article_id_list = []
         params = []
@@ -95,22 +134,29 @@ class ColdStartTitleSimilarityTask(object):
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows
 
-    def run(self, limit=None):
+    def run(self, meta_source, limit=None):
         """
         执行任务
         """
-        target_article_list = self.get_title_from_meta_base(limit=limit)
-        if not target_article_list:
+        match meta_source:
+            case "article":
+                target_list = self.get_article_title_from_meta_base(limit=limit)
+            case "video":
+                target_list = self.get_video_title_from_meta_table(limit=limit)
+            case _:
+                print("meta_source is not valid")
+                return
+
+        if not target_list:
             print("No more articles to process.")
             return
 
         base_title_list = self.get_level_up_title_list()
-
-        batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
+        batch_task_list = chunks(target_list, ARTICLE_BATCH)
 
         for batch_task in batch_task_list:
             try:
-                batch_target_title_list = [i['title'] for i in batch_task]
+                batch_target_title_list = [i['title'][:30] for i in batch_task]
                 similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
 
                 update_data_list = []
@@ -119,7 +165,7 @@ class ColdStartTitleSimilarityTask(object):
                     percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
                     update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
 
-                affected_rows = self.update_meta_article_batch(update_data_list)
+                affected_rows = self.update_meta_database_batch(meta_source=meta_source, update_data_list=update_data_list)
 
                 print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
             except Exception as e:

+ 2 - 2
title_similarity_score_task.py

@@ -5,7 +5,7 @@ from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarity
 
 
 if __name__ == '__main__':
-    batch_size = 3000
+    batch_size = 1000
     task = ColdStartTitleSimilarityTask()
     task.init_database()
-    task.run(limit=batch_size)
+    task.run(meta_source="article")