Browse Source

Merge branch 'feature/20241104-improve-old-task' of Server/title_with_video into master

fengzhoutian 5 months ago
parent
commit
63a035a392
2 changed files with 24 additions and 9 deletions
  1. 22 8
      tasks/task1.py
  2. 2 1
      tasks/task3.py

+ 22 - 8
tasks/task1.py

@@ -8,6 +8,8 @@ from applications.schedule import search_videos
 from applications.functions.log import logging
 from static.config import spider_coroutines
 
+# Temporary workaround for task deadlock: remember the last row id scanned so the next query resumes from it
+g_values = {'row_offset': 0}
 
 class MatchTask1(object):
     """
@@ -25,19 +27,30 @@ class MatchTask1(object):
         获取任务
         :return:
         """
+        select_limit = spider_coroutines * 100
         select_sql1 = f"""
-            SELECT DISTINCT (content_id)       
+            SELECT content_id, id
             FROM {db_article} 
             WHERE content_status = 0 and process_times <= 3
-            ORDER BY request_time_stamp
-            ASC
-            LIMIT {spider_coroutines};
+            AND id >= {g_values['row_offset']}
+            ORDER BY id
+            LIMIT {select_limit};
         """
         content_ids = await self.mysql_client.async_select(select_sql1)
-        cil = []
+        unique_content_ids = set()
         for content_id in content_ids:
-            cil.append(content_id[0])
-        content_ids_tuple = str(cil).replace("[", "(").replace("]", ")")
+            unique_content_ids.add(content_id[0])
+        if not unique_content_ids:
+            return []
+        g_values['row_offset'] = content_ids[-1][1]
+        print(f"update row offset to: {g_values['row_offset']}")
+        unique_content_ids = list(unique_content_ids)[0:spider_coroutines]
+        logging(
+            code=9001,
+            function="task1.get_task",
+            info=f"unique content ids in batch: {len(unique_content_ids)}"
+        )
+        content_ids_tuple = str(unique_content_ids).replace("[", "(").replace("]", ")")
         if len(content_ids_tuple) > 3:
             select_sql = f"""
                 SELECT trace_id, content_id, gh_id, article_title, article_text, content_status, process_times
@@ -60,8 +73,9 @@ class MatchTask1(object):
             ]
             logging(
                 code="9001",
+                function="task1.get_task",
                 info="本次任务获取到 {} 条视频".format(len(task_obj_list)),
-                data=task_obj_list
+                data=[x['content_id'] for x in task_obj_list]
             )
             return task_obj_list
         else:

+ 2 - 1
tasks/task3.py

@@ -58,8 +58,9 @@ class MatchTask3(object):
         ]
         logging(
             code="9001",
+            function="task3.get_task",
             info="本次任务获取到 {} 条视频".format(len(task_obj_list)),
-            data=task_obj_list
+            data=[x['content_id'] for x in task_obj_list]
         )
         return task_obj_list