Browse Source

Merge branch 'feature/20241104-improve-old-task' of Server/title_with_video into master

fengzhoutian 5 months ago
parent
commit
90244a1cbf
1 changed file with 21 additions and 4 deletions
  1. 21 4
      tasks/task1.py

+ 21 - 4
tasks/task1.py

@@ -9,7 +9,7 @@ from applications.functions.log import logging
 from static.config import spider_coroutines
 
 # Temporary solution for task dead-lock
-g_values = {'row_offset': 0}
+g_values = {'row_offset': 0, 'skip_num': 0}
 
 class MatchTask1(object):
     """
@@ -41,16 +41,33 @@ class MatchTask1(object):
         for content_id in content_ids:
             unique_content_ids.add(content_id[0])
         if not unique_content_ids:
+            if g_values['skip_num'] > 0:
+                logging(code=9001, function="task1.get_task", info="reset row offset to 0")
+                g_values['row_offset'] = 0
+                g_values['skip_num'] = 0
             return []
         g_values['row_offset'] = content_ids[-1][1]
         print(f"update row offset to: {g_values['row_offset']}")
-        unique_content_ids = list(unique_content_ids)[0:spider_coroutines]
         logging(
             code=9001,
             function="task1.get_task",
-            info=f"unique content ids in batch: {len(unique_content_ids)}"
+            info=f"unique content ids: {len(unique_content_ids)}"
         )
-        content_ids_tuple = str(unique_content_ids).replace("[", "(").replace("]", ")")
+        content_ids_to_process = []
+        for content_id in unique_content_ids:
+            history_videos = await self.get_history_videos(content_id)
+            if not history_videos:
+                content_ids_to_process.append(content_id)
+        if spider_coroutines > len(content_ids_to_process):
+            logging(code=9001, function="task1.get_task", info="some content is skipped, process it later")
+            g_values['skip_num'] = 1
+        content_ids_to_process = content_ids_to_process[0:spider_coroutines]
+        logging(
+            code=9001,
+            function="task1.get_task",
+            info=f"content ids to process: {len(content_ids_to_process)}"
+        )
+        content_ids_tuple = str(content_ids_to_process).replace("[", "(").replace("]", ")")
         if len(content_ids_tuple) > 3:
             select_sql = f"""
                 SELECT trace_id, content_id, gh_id, article_title, article_text, content_status, process_times