|
@@ -62,12 +62,20 @@ class NewContentIdTask(object):
|
|
|
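# Fetch tasks with content_status = TASK_INIT_STATUS and process_times <= TASK_MAX_PROCESS_TIMES whose content_id has no successfully downloaded video yet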
|
|
|
select_sql = f"""
|
|
|
SELECT
|
|
|
- trace_id, content_id, flow_pool_level, gh_id, process_times, publish_flag
|
|
|
+ t1.trace_id, t1.content_id, t1.flow_pool_level, t1.gh_id, t1.process_times, t1.publish_flag
|
|
|
FROM
|
|
|
- {self.article_match_video_table}
|
|
|
+ {self.article_match_video_table} t1
|
|
|
+ LEFT JOIN (
|
|
|
+ SELECT content_id, count(1) as cnt
|
|
|
+ FROM {self.article_crawler_video_table}
|
|
|
+ WHERE download_status = {NewContentIdTaskConst.VIDEO_DOWNLOAD_SUCCESS_STATUS}
|
|
|
+ GROUP BY content_id
|
|
|
+ ) t2
|
|
|
+ ON t1.content_id = t2.content_id
|
|
|
WHERE
|
|
|
- content_status = {NewContentIdTaskConst.TASK_INIT_STATUS}
|
|
|
- and process_times <= {NewContentIdTaskConst.TASK_MAX_PROCESS_TIMES}
|
|
|
+ t1.content_status = {NewContentIdTaskConst.TASK_INIT_STATUS}
|
|
|
+ AND t1.process_times <= {NewContentIdTaskConst.TASK_MAX_PROCESS_TIMES}
|
|
|
+ AND t2.cnt IS NULL
|
|
|
ORDER BY flow_pool_level, request_timestamp
|
|
|
LIMIT {self.spider_coroutines};
|
|
|
"""
|