浏览代码

Update task1: do not use SELECT DISTINCT; deduplicate content_id values in Python instead

StrayWarrior 5 月之前
父节点
当前提交
5b570d87e7
共有 1 个文件被更改，包括 7 次插入和 5 次删除
  1. 7 5
      tasks/task1.py

+ 7 - 5
tasks/task1.py

@@ -25,19 +25,21 @@ class MatchTask1(object):
         获取任务
         :return:
         """
+        select_limit = spider_coroutines * 5
         select_sql1 = f"""
-            SELECT DISTINCT (content_id)       
+            SELECT content_id
             FROM {db_article} 
             WHERE content_status = 0 and process_times <= 3
             ORDER BY request_time_stamp
             ASC
-            LIMIT {spider_coroutines};
+            LIMIT {select_limit};
         """
         content_ids = await self.mysql_client.async_select(select_sql1)
-        cil = []
+        unique_content_ids = set()
         for content_id in content_ids:
-            cil.append(content_id[0])
-        content_ids_tuple = str(cil).replace("[", "(").replace("]", ")")
+            unique_content_ids.add(content_id[0])
+        unique_content_ids = list(unique_content_ids)[0:spider_coroutines]
+        content_ids_tuple = str(unique_content_ids).replace("[", "(").replace("]", ")")
         if len(content_ids_tuple) > 3:
             select_sql = f"""
                 SELECT trace_id, content_id, gh_id, article_title, article_text, content_status, process_times