浏览代码

Update task1: use full concurrency

StrayWarrior 5 月之前
父节点
当前提交
e3b844d8df
共有 1 个文件被更改，包括 13 次插入和 2 次删除
  1. 13 2
      tasks/task1.py

+ 13 - 2
tasks/task1.py

@@ -48,9 +48,20 @@ class MatchTask1(object):
         logging(
             code=9001,
             function="task1.get_task",
-            info=f"unique content ids in batch: {len(unique_content_ids)}"
+            info=f"unique content ids: {len(unique_content_ids)}"
         )
-        content_ids_tuple = str(unique_content_ids).replace("[", "(").replace("]", ")")
+        content_ids_to_process = []
+        for content_id in unique_content_ids:
+            history_videos = await self.get_history_videos(content_id)
+            if not history_videos:
+                content_ids_to_process.append(content_id)
+        content_ids_to_process = content_ids_to_process[0:spider_coroutines]
+        logging(
+            code=9001,
+            function="task1.get_task",
+            info=f"content ids to process: {len(content_ids_to_process)}"
+        )
+        content_ids_tuple = str(content_ids_to_process).replace("[", "(").replace("]", ")")
         if len(content_ids_tuple) > 3:
             select_sql = f"""
                 SELECT trace_id, content_id, gh_id, article_title, article_text, content_status, process_times