|
@@ -9,7 +9,7 @@ from applications.functions.log import logging
|
|
|
from static.config import spider_coroutines
|
|
|
|
|
|
# Temporary solution for task dead-lock
|
|
|
-g_values = {'row_offset': 0}
|
|
|
+g_values = {'row_offset': 0, 'skip_num': 0}
|
|
|
|
|
|
class MatchTask1(object):
|
|
|
"""
|
|
@@ -41,16 +41,33 @@ class MatchTask1(object):
|
|
|
for content_id in content_ids:
|
|
|
unique_content_ids.add(content_id[0])
|
|
|
if not unique_content_ids:
|
|
|
+ if g_values['skip_num'] > 0:
|
|
|
+ logging(code=9001, function="task1.get_task", info="reset row offset to 0")
|
|
|
+ g_values['row_offset'] = 0
|
|
|
+ g_values['skip_num'] = 0
|
|
|
return []
|
|
|
g_values['row_offset'] = content_ids[-1][1]
|
|
|
print(f"update row offset to: {g_values['row_offset']}")
|
|
|
- unique_content_ids = list(unique_content_ids)[0:spider_coroutines]
|
|
|
logging(
|
|
|
code=9001,
|
|
|
function="task1.get_task",
|
|
|
- info=f"unique content ids in batch: {len(unique_content_ids)}"
|
|
|
+ info=f"unique content ids: {len(unique_content_ids)}"
|
|
|
)
|
|
|
- content_ids_tuple = str(unique_content_ids).replace("[", "(").replace("]", ")")
|
|
|
+ content_ids_to_process = []
|
|
|
+ for content_id in unique_content_ids:
|
|
|
+ history_videos = await self.get_history_videos(content_id)
|
|
|
+ if not history_videos:
|
|
|
+ content_ids_to_process.append(content_id)
|
|
|
+ if spider_coroutines > len(content_ids_to_process):
|
|
|
+ logging(code=9001, function="task1.get_task", info="some content is skipped, process it later")
|
|
|
+ g_values['skip_num'] = 1
|
|
|
+ content_ids_to_process = content_ids_to_process[0:spider_coroutines]
|
|
|
+ logging(
|
|
|
+ code=9001,
|
|
|
+ function="task1.get_task",
|
|
|
+ info=f"content ids to process: {len(content_ids_to_process)}"
|
|
|
+ )
|
|
|
+ content_ids_tuple = str(content_ids_to_process).replace("[", "(").replace("]", ")")
|
|
|
if len(content_ids_tuple) > 3:
|
|
|
select_sql = f"""
|
|
|
SELECT trace_id, content_id, gh_id, article_title, article_text, content_status, process_times
|