|
@@ -1,7 +1,6 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
-import traceback
|
|
|
from typing import Dict, List
|
|
|
|
|
|
from aiomysql.cursors import DictCursor
|
|
@@ -9,12 +8,6 @@ from aiomysql.cursors import DictCursor
|
|
|
from applications.log import logging
|
|
|
from applications.config import Config
|
|
|
from applications.const import rematch_task_const
|
|
|
-from applications.etl_function import download_cover
|
|
|
-from applications.etl_function import download_video
|
|
|
-from applications.etl_function import generate_video_path
|
|
|
-from applications.etl_function import upload_to_oss
|
|
|
-from applications.spider import search_videos_from_web
|
|
|
-from .utils import get_kimi_result
|
|
|
|
|
|
|
|
|
class ReMatchTask(object):
|
|
@@ -28,232 +21,37 @@ class ReMatchTask(object):
|
|
|
self.article_match_video_table = self.config.article_match_video_table
|
|
|
self.article_text_table = self.config.article_text_table
|
|
|
self.article_crawler_video_table = self.config.article_crawler_video_table
|
|
|
+ self.rematch_coroutines = int(self.config.get_config_value(key='rematchCoroutines'))
|
|
|
|
|
|
- async def get_tasks(self) -> List[Dict]:
|
|
|
+ async def get_tasks(self, business_type) -> List[Dict]:
|
|
|
"""
|
|
|
get task needs to rematch
|
|
|
"""
|
|
|
- select_sql = f"""
|
|
|
- SELECT DISTINCT content_id
|
|
|
- FROM article_re_match_record
|
|
|
- WHERE status = {rematch_task_const.REMATCH_INIT_STATUS};
|
|
|
- """
|
|
|
+ if business_type == 'process':
|
|
|
+ select_sql = f"""
|
|
|
+ SELECT trace_id, content_id
|
|
|
+ FROM article_re_match_record
|
|
|
+ WHERE status = {rematch_task_const.REMATCH_INIT_STATUS}
|
|
|
+ LIMIT {self.rematch_coroutines};
|
|
|
+ """
|
|
|
+ elif business_type == 'check':
|
|
|
+ select_sql = f"""
|
|
|
+ SELECT trace_id, content_id
|
|
|
+ FROM article_re_match_record
|
|
|
+ WHERE status = {rematch_task_const.REMATCH_PROCESSING_STATUS}
|
|
|
+ LIMIT {self.rematch_coroutines};
|
|
|
+ """
|
|
|
+ else:
|
|
|
+ return []
|
|
|
result = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
|
|
|
+ task_dict = {task['content_id']: task for task in result}
|
|
|
+ task_list = list(task_dict.values())
|
|
|
logging(
|
|
|
code="rematch_1001",
|
|
|
function="get_tasks",
|
|
|
- info="获取content_id数量: {}".format(len(result))
|
|
|
+ info="获取content_id数量: {}".format(len(task_list))
|
|
|
)
|
|
|
- return result
|
|
|
-
|
|
|
- async def async_download_video_list(self, to_download_videos: List[Dict], illegal_platform_id_list: List[str]) -> int:
|
|
|
- """
|
|
|
- 异步下载视频
|
|
|
- """
|
|
|
- success_count = 0
|
|
|
- for video in to_download_videos:
|
|
|
- # check whether video is illegal
|
|
|
- out_key = "{}_{}".format(video['platform'], video['out_video_id'])
|
|
|
- if out_key in illegal_platform_id_list:
|
|
|
- continue
|
|
|
- # start download
|
|
|
- try:
|
|
|
- local_video_path, local_cover_path = generate_video_path(video['platform'], video['out_video_id'])
|
|
|
- # download videos
|
|
|
- file_path = await download_video(
|
|
|
- file_path=local_video_path,
|
|
|
- platform=video['platform'],
|
|
|
- video_url=video['video_url']
|
|
|
- )
|
|
|
- if not file_path:
|
|
|
- # 说明视频下载失败,无需上传该视频, 将该条记录设置为失败状态
|
|
|
- update_sql = f"""
|
|
|
- UPDATE {self.article_crawler_video_table}
|
|
|
- SET download_status = %s
|
|
|
- WHERE id = %s;
|
|
|
- """
|
|
|
-
|
|
|
- await self.db_client.async_insert(
|
|
|
- sql=update_sql,
|
|
|
- params=(rematch_task_const.VIDEO_DOWNLOAD_FAIL_STATUS, video['id'])
|
|
|
- )
|
|
|
- else:
|
|
|
- # download cover
|
|
|
- cover_path = await download_cover(
|
|
|
- file_path=local_cover_path,
|
|
|
- platform=video['platform'],
|
|
|
- cover_url=video['cover_url']
|
|
|
- )
|
|
|
- # upload video to oss
|
|
|
- oss_video = await upload_to_oss(
|
|
|
- local_video_path=file_path,
|
|
|
- download_type="video"
|
|
|
- )
|
|
|
- # upload cover to oss
|
|
|
- if cover_path:
|
|
|
- oss_cover = await upload_to_oss(
|
|
|
- local_video_path=cover_path,
|
|
|
- download_type="image"
|
|
|
- )
|
|
|
- else:
|
|
|
- oss_cover = None
|
|
|
-
|
|
|
- # change status to success
|
|
|
- update_sql = f"""
|
|
|
- UPDATE {self.article_crawler_video_table}
|
|
|
- SET video_oss_path = %s, cover_oss_path = %s, download_status = %s
|
|
|
- WHERE id = %s;
|
|
|
- """
|
|
|
- await self.db_client.async_insert(
|
|
|
- sql=update_sql,
|
|
|
- params=(
|
|
|
- oss_video,
|
|
|
- oss_cover,
|
|
|
- rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS,
|
|
|
- video['id']
|
|
|
- )
|
|
|
- )
|
|
|
- success_count += 1
|
|
|
- # 如果下载的视频数已经大于3, 则直接退出循环,修改状态为ETL成功状态
|
|
|
- if success_count > rematch_task_const.MIN_MATCH_VIDEO_NUM:
|
|
|
- return success_count
|
|
|
- except Exception as e:
|
|
|
- update_sql = f"""
|
|
|
- UPDATE {self.article_crawler_video_table}
|
|
|
- SET download_status = %s
|
|
|
- WHERE id = %s;
|
|
|
- """
|
|
|
- await self.db_client.async_insert(
|
|
|
- sql=update_sql,
|
|
|
- params=(rematch_task_const.VIDEO_DOWNLOAD_FAIL_STATUS, video['id'])
|
|
|
- )
|
|
|
-
|
|
|
- return success_count
|
|
|
-
|
|
|
- async def download_upload_task(self, content_id: str, illegal_platform_id_list: List[str], legal_not_downloaded_videos=None) -> int:
|
|
|
- """
|
|
|
- 下载任务
|
|
|
- """
|
|
|
- success_count = 0
|
|
|
- if legal_not_downloaded_videos:
|
|
|
- # 下载legal_not_download_videos, 记录下载成功数量
|
|
|
- new_download_count = await self.async_download_video_list(
|
|
|
- to_download_videos=legal_not_downloaded_videos,
|
|
|
- illegal_platform_id_list=illegal_platform_id_list
|
|
|
- )
|
|
|
- success_count += new_download_count
|
|
|
- return success_count
|
|
|
-
|
|
|
- select_sql = f"""
|
|
|
- SELECT id, out_video_id, platform, video_title, video_url, cover_url
|
|
|
- FROM {self.article_crawler_video_table}
|
|
|
- WHERE content_id = '{content_id}' AND download_status = {rematch_task_const.VIDEO_DOWNLOAD_INIT_STATUS};
|
|
|
- """
|
|
|
- to_download_videos = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
|
|
|
- new_download_count = await self.async_download_video_list(
|
|
|
- to_download_videos=to_download_videos,
|
|
|
- illegal_platform_id_list=illegal_platform_id_list
|
|
|
- )
|
|
|
- success_count += new_download_count
|
|
|
- return success_count
|
|
|
-
|
|
|
- async def spider_task(self, content_id: str) -> int:
|
|
|
- """
|
|
|
- 爬虫抓取任务
|
|
|
- """
|
|
|
- kimi_result = await get_kimi_result(
|
|
|
- content_id=content_id,
|
|
|
- article_text_table=self.article_text_table,
|
|
|
- db_client=self.db_client
|
|
|
- )
|
|
|
- # 该搜索任务是content_id粒度,因此将trace_id 传为content_id
|
|
|
- search_videos_count = await search_videos_from_web(
|
|
|
- info={
|
|
|
- "ori_title": kimi_result['ori_title'],
|
|
|
- "kimi_summary": kimi_result['kimi_summary'],
|
|
|
- "kimi_keys": kimi_result['kimi_keys'],
|
|
|
- "trace_id": content_id,
|
|
|
- "gh_id": rematch_task_const.TASK_DEFAULT_GH_ID,
|
|
|
- "content_id": content_id,
|
|
|
- "crawler_video_table": self.article_crawler_video_table
|
|
|
- },
|
|
|
- gh_id_map={},
|
|
|
- db_client=self.db_client
|
|
|
- )
|
|
|
- return search_videos_count
|
|
|
-
|
|
|
- async def update_each_content_id(self, content_id: str) -> None:
|
|
|
- """
|
|
|
- 更新每一个content_id的视频信息
|
|
|
- """
|
|
|
- select_sql = f"""
|
|
|
- SELECT
|
|
|
- id, platform, out_video_id, video_oss_path, download_status, is_illegal, video_url, cover_url
|
|
|
- FROM
|
|
|
- {self.article_crawler_video_table}
|
|
|
- WHERE content_id = '{content_id}';
|
|
|
- """
|
|
|
- record_list = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
|
|
|
- # 获取违规视频,违规视频out_id && platform
|
|
|
- illegal_videos = [record for record in record_list if record["is_illegal"] == rematch_task_const.VIDEO_UNSAFE]
|
|
|
- illegal_platform_id_list = [
|
|
|
- "{}_{}".format(record['platform'], record['out_video_id']) for record in illegal_videos
|
|
|
- ]
|
|
|
-
|
|
|
- # 处理非违规字段
|
|
|
- legal_downloaded_videos = []
|
|
|
- legal_not_downloaded_videos = []
|
|
|
- for record in record_list:
|
|
|
- if record['is_illegal'] == rematch_task_const.VIDEO_UNSAFE:
|
|
|
- continue
|
|
|
- else:
|
|
|
- out_key = "{}_{}".format(record['platform'], record['out_video_id'])
|
|
|
- if out_key in illegal_platform_id_list:
|
|
|
- continue
|
|
|
- if record['download_status'] == rematch_task_const.VIDEO_DOWNLOAD_INIT_STATUS:
|
|
|
- legal_not_downloaded_videos.append(record)
|
|
|
- elif record['download_status'] == rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS:
|
|
|
- legal_downloaded_videos.append(record)
|
|
|
- else:
|
|
|
- continue
|
|
|
-
|
|
|
- if len(legal_downloaded_videos) >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
|
|
|
- logging(
|
|
|
- code="rematch_1002",
|
|
|
- info="存在{}条以上待合规已下载内容,无需处理该content_id".format(rematch_task_const.MIN_MATCH_VIDEO_NUM),
|
|
|
- data={
|
|
|
- "content_id": content_id
|
|
|
- }
|
|
|
- )
|
|
|
- return
|
|
|
- elif len(legal_not_downloaded_videos + legal_downloaded_videos) >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
|
|
|
- logging(
|
|
|
- code="rematch_1003",
|
|
|
- info="存在{}条以上待合规内容".format(rematch_task_const.MIN_MATCH_VIDEO_NUM),
|
|
|
- data={
|
|
|
- "content_id": content_id,
|
|
|
- "待下载数量": len(legal_not_downloaded_videos),
|
|
|
- "已下载数量": len(legal_downloaded_videos)
|
|
|
- }
|
|
|
- )
|
|
|
- await self.download_upload_task(
|
|
|
- content_id=content_id,
|
|
|
- illegal_platform_id_list=illegal_platform_id_list,
|
|
|
- legal_not_downloaded_videos=legal_not_downloaded_videos
|
|
|
- )
|
|
|
- else:
|
|
|
- logging(
|
|
|
- code="rematch_1004",
|
|
|
- info="重新执行爬虫任务",
|
|
|
- data={
|
|
|
- "content_id": content_id,
|
|
|
- }
|
|
|
- )
|
|
|
- await self.spider_task(content_id=content_id)
|
|
|
- await self.download_upload_task(
|
|
|
- content_id=content_id,
|
|
|
- illegal_platform_id_list=illegal_platform_id_list
|
|
|
- )
|
|
|
+ return task_list
|
|
|
|
|
|
async def check_whether_get_enough_videos(self, content_id: str) -> bool:
|
|
|
"""
|
|
@@ -273,7 +71,7 @@ class ReMatchTask(object):
|
|
|
else:
|
|
|
return False
|
|
|
|
|
|
- async def update_content_id_status(self, content_id: str) -> int:
|
|
|
+ async def update_content_id_status_to_success(self, content_id: str) -> int:
|
|
|
"""
|
|
|
更新content_id的status为成功
|
|
|
"""
|
|
@@ -288,31 +86,140 @@ class ReMatchTask(object):
|
|
|
)
|
|
|
return affected_rows
|
|
|
|
|
|
- async def deal(self):
|
|
|
+ async def get_task_lock(self, trace_id: str) -> int:
|
|
|
"""
|
|
|
- do job here
|
|
|
+ 将任务上锁,防止被其他进程抢占
|
|
|
+ """
|
|
|
+ update_sql = f"""
|
|
|
+ UPDATE article_re_match_record
|
|
|
+ SET status = %s
|
|
|
+ WHERE trace_id = %s and status = %s;
|
|
|
+ """
|
|
|
+ affected_rows = await self.db_client.async_insert(
|
|
|
+ sql=update_sql,
|
|
|
+ params=(
|
|
|
+ rematch_task_const.REMATCH_PROCESSING_STATUS,
|
|
|
+ trace_id,
|
|
|
+ rematch_task_const.REMATCH_INIT_STATUS
|
|
|
+ )
|
|
|
+ )
|
|
|
+ return affected_rows
|
|
|
+
|
|
|
+ async def whether_same_content_id_processing(self, content_id: str) -> bool:
|
|
|
+ """
|
|
|
+ 是否相同的content_id处理中 or 处理完成
|
|
|
+ """
|
|
|
+ select_sql = f"""
|
|
|
+ SELECT DISTINCT status
|
|
|
+ FROM article_re_match_record
|
|
|
+ WHERE content_id = '{content_id}';
|
|
|
+ """
|
|
|
+ response = await self.db_client.async_select(select_sql)
|
|
|
+ status_list = list(i[0] for i in response)
|
|
|
+ for status in status_list:
|
|
|
+ if status in [rematch_task_const.REMATCH_PROCESSING_STATUS, rematch_task_const.REMATCH_SUCCESS_STATUS]:
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
+ async def process_content_id(self, content_id: str) -> int:
|
|
|
+ """
|
|
|
+ 处理content_id
|
|
|
+ """
|
|
|
+ update_sql = f"""
|
|
|
+ UPDATE {self.article_match_video_table}
|
|
|
+ SET content_status = %s, success_status = %s, process_times = %s
|
|
|
+ WHERE content_id = %s;
|
|
|
+ """
|
|
|
+ affected_rows = await self.db_client.async_insert(
|
|
|
+ sql=update_sql,
|
|
|
+ params=(
|
|
|
+ rematch_task_const.TASK_INIT_STATUS,
|
|
|
+ rematch_task_const.AIGC_DONT_GET_RESULT_STATUS,
|
|
|
+ rematch_task_const.TASK_INIT_PROCESS_TIMES,
|
|
|
+ content_id
|
|
|
+ )
|
|
|
+ )
|
|
|
+ return affected_rows
|
|
|
+
|
|
|
+ async def process_task(self, task_list: List[Dict]) -> None:
|
|
|
+ """
|
|
|
+ 处理
|
|
|
"""
|
|
|
- task_list = await self.get_tasks()
|
|
|
for task in task_list:
|
|
|
content_id = task['content_id']
|
|
|
- try:
|
|
|
- await self.update_each_content_id(content_id=content_id)
|
|
|
-
|
|
|
- if await self.check_whether_get_enough_videos(content_id=content_id):
|
|
|
- # 修改状态为1
|
|
|
- await self.update_content_id_status(content_id=content_id)
|
|
|
- else:
|
|
|
+ trace_id = task['trace_id']
|
|
|
+ processing_flag = await self.whether_same_content_id_processing(content_id=content_id)
|
|
|
+ if processing_flag:
|
|
|
+ continue
|
|
|
+ else:
|
|
|
+ affected_row = await self.get_task_lock(trace_id)
|
|
|
+ if not affected_row:
|
|
|
continue
|
|
|
- except Exception as e:
|
|
|
- error_stack = traceback.format_exc()
|
|
|
- logging(
|
|
|
- code="rematch_1010",
|
|
|
- function="update_each_content_id",
|
|
|
- data={
|
|
|
- "error_stack": error_stack,
|
|
|
- "error": str(e),
|
|
|
- "content_id": content_id
|
|
|
- }
|
|
|
- )
|
|
|
+ affected_rows = await self.process_content_id(content_id)
|
|
|
+ if affected_rows:
|
|
|
+ logging(
|
|
|
+ code="rematch_1002",
|
|
|
+ function="deal",
|
|
|
+ info="回滚content_id成功",
|
|
|
+ data={
|
|
|
+ "content_id": content_id,
|
|
|
+ "trace_id": trace_id,
|
|
|
+ "affected_rows": affected_rows
|
|
|
+ }
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ logging(
|
|
|
+ code="rematch_1003",
|
|
|
+ function="deal",
|
|
|
+ info="回滚content_id失败",
|
|
|
+ data={
|
|
|
+ "content_id": content_id,
|
|
|
+ "trace_id": trace_id
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ async def check_task(self, task_list: List[Dict]) -> None:
|
|
|
+ """
|
|
|
+ 校验任务是否完成
|
|
|
+ """
|
|
|
+ for task in task_list:
|
|
|
+ content_id = task['content_id']
|
|
|
+ enough_video_flag = await self.check_whether_get_enough_videos(content_id=content_id)
|
|
|
+ if enough_video_flag:
|
|
|
+ affected_rows = await self.update_content_id_status_to_success(content_id=content_id)
|
|
|
+ if affected_rows:
|
|
|
+ logging(
|
|
|
+ code="rematch_1004",
|
|
|
+ function="check_task",
|
|
|
+ info="修改状态成功",
|
|
|
+ data={
|
|
|
+ "content_id": content_id,
|
|
|
+ "affected_rows": affected_rows
|
|
|
+ }
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ continue
|
|
|
|
|
|
+ async def deal(self):
|
|
|
+ """
|
|
|
+ do job here
|
|
|
+ """
|
|
|
+ # 处理任务
|
|
|
+ task_list = await self.get_tasks(business_type="process")
|
|
|
+ if task_list:
|
|
|
+ await self.process_task(task_list)
|
|
|
+ else:
|
|
|
+ logging(
|
|
|
+ code="rematch_5001",
|
|
|
+ info="do not get article to process"
|
|
|
+ )
|
|
|
|
|
|
+ # 校验任务
|
|
|
+ task_list = await self.get_tasks(business_type="check")
|
|
|
+ if task_list:
|
|
|
+ await self.check_task(task_list)
|
|
|
+ else:
|
|
|
+ logging(
|
|
|
+ code="rematch_5002",
|
|
|
+ info="do not get article to check"
|
|
|
+ )
|