|  | @@ -0,0 +1,225 @@
 | 
											
												
													
														|  | 
 |  | +"""
 | 
											
												
													
														|  | 
 |  | +@author: luojunhui
 | 
											
												
													
														|  | 
 |  | +"""
 | 
											
												
													
														|  | 
 |  | +from typing import Dict, List
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +from aiomysql.cursors import DictCursor
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +from applications.log import logging
 | 
											
												
													
														|  | 
 |  | +from applications.config import Config
 | 
											
												
													
														|  | 
 |  | +from applications.const import rematch_task_const
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +class ReMatchTask(object):
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +    重新匹配任务
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def __init__(self, db_client):
 | 
											
												
													
														|  | 
 |  | +        self.db_client = db_client
 | 
											
												
													
														|  | 
 |  | +        self.config = Config()
 | 
											
												
													
														|  | 
 |  | +        self.article_match_video_table = self.config.article_match_video_table
 | 
											
												
													
														|  | 
 |  | +        self.article_text_table = self.config.article_text_table
 | 
											
												
													
														|  | 
 |  | +        self.article_crawler_video_table = self.config.article_crawler_video_table
 | 
											
												
													
														|  | 
 |  | +        self.rematch_coroutines = int(self.config.get_config_value(key='rematchCoroutines'))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def get_tasks(self, business_type) -> List[Dict]:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        get task needs to rematch
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        if business_type == 'process':
 | 
											
												
													
														|  | 
 |  | +            select_sql = f"""
 | 
											
												
													
														|  | 
 |  | +                SELECT trace_id, content_id
 | 
											
												
													
														|  | 
 |  | +                FROM article_re_match_record
 | 
											
												
													
														|  | 
 |  | +                WHERE status = {rematch_task_const.REMATCH_INIT_STATUS}
 | 
											
												
													
														|  | 
 |  | +                LIMIT {self.rematch_coroutines};
 | 
											
												
													
														|  | 
 |  | +            """
 | 
											
												
													
														|  | 
 |  | +        elif business_type == 'check':
 | 
											
												
													
														|  | 
 |  | +            select_sql = f"""
 | 
											
												
													
														|  | 
 |  | +                SELECT trace_id, content_id
 | 
											
												
													
														|  | 
 |  | +                FROM article_re_match_record
 | 
											
												
													
														|  | 
 |  | +                WHERE status = {rematch_task_const.REMATCH_PROCESSING_STATUS}
 | 
											
												
													
														|  | 
 |  | +                LIMIT {self.rematch_coroutines};
 | 
											
												
													
														|  | 
 |  | +            """
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            return []
 | 
											
												
													
														|  | 
 |  | +        result = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
 | 
											
												
													
														|  | 
 |  | +        task_dict = {task['content_id']: task for task in result}
 | 
											
												
													
														|  | 
 |  | +        task_list = list(task_dict.values())
 | 
											
												
													
														|  | 
 |  | +        logging(
 | 
											
												
													
														|  | 
 |  | +            code="rematch_1001",
 | 
											
												
													
														|  | 
 |  | +            function="get_tasks",
 | 
											
												
													
														|  | 
 |  | +            info="获取content_id数量: {}".format(len(task_list))
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        return task_list
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def check_whether_get_enough_videos(self, content_id: str) -> bool:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        check 该content_id是否存在足够的视频
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        select_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            SELECT count(1)
 | 
											
												
													
														|  | 
 |  | +            FROM {self.article_crawler_video_table}
 | 
											
												
													
														|  | 
 |  | +            WHERE content_id = '{content_id}' 
 | 
											
												
													
														|  | 
 |  | +                AND download_status = {rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS} 
 | 
											
												
													
														|  | 
 |  | +                AND is_illegal = {rematch_task_const.VIDEO_SAFE};
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        count_tuple = await self.db_client.async_select(select_sql)
 | 
											
												
													
														|  | 
 |  | +        count = count_tuple[0][0]
 | 
											
												
													
														|  | 
 |  | +        if count >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
 | 
											
												
													
														|  | 
 |  | +            return True
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            return False
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def update_content_id_status_to_success(self, content_id: str) -> int:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        更新content_id的status为成功
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        update_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            UPDATE article_re_match_record
 | 
											
												
													
														|  | 
 |  | +            SET status = %s
 | 
											
												
													
														|  | 
 |  | +            WHERE content_id = '{content_id}';
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        affected_rows = await self.db_client.async_insert(
 | 
											
												
													
														|  | 
 |  | +            update_sql,
 | 
											
												
													
														|  | 
 |  | +            params=(rematch_task_const.REMATCH_SUCCESS_STATUS,)
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        return affected_rows
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def get_task_lock(self, trace_id: str) -> int:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        将任务上锁,防止被其他进程抢占
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        update_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            UPDATE article_re_match_record
 | 
											
												
													
														|  | 
 |  | +            SET status = %s
 | 
											
												
													
														|  | 
 |  | +            WHERE trace_id = %s and status = %s;
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        affected_rows = await self.db_client.async_insert(
 | 
											
												
													
														|  | 
 |  | +            sql=update_sql,
 | 
											
												
													
														|  | 
 |  | +            params=(
 | 
											
												
													
														|  | 
 |  | +                rematch_task_const.REMATCH_PROCESSING_STATUS,
 | 
											
												
													
														|  | 
 |  | +                trace_id,
 | 
											
												
													
														|  | 
 |  | +                rematch_task_const.REMATCH_INIT_STATUS
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        return affected_rows
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def whether_same_content_id_processing(self, content_id: str) -> bool:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        是否相同的content_id处理中 or 处理完成
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        select_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            SELECT DISTINCT status
 | 
											
												
													
														|  | 
 |  | +            FROM article_re_match_record
 | 
											
												
													
														|  | 
 |  | +            WHERE content_id = '{content_id}';
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        response = await self.db_client.async_select(select_sql)
 | 
											
												
													
														|  | 
 |  | +        status_list = list(i[0] for i in response)
 | 
											
												
													
														|  | 
 |  | +        for status in status_list:
 | 
											
												
													
														|  | 
 |  | +            if status in [rematch_task_const.REMATCH_PROCESSING_STATUS, rematch_task_const.REMATCH_SUCCESS_STATUS]:
 | 
											
												
													
														|  | 
 |  | +                return True
 | 
											
												
													
														|  | 
 |  | +        return False
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def process_content_id(self, content_id: str) -> int:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        处理content_id
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        update_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            UPDATE {self.article_match_video_table}
 | 
											
												
													
														|  | 
 |  | +            SET content_status = %s, success_status = %s, process_times = %s
 | 
											
												
													
														|  | 
 |  | +            WHERE content_id = %s;
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        affected_rows = await self.db_client.async_insert(
 | 
											
												
													
														|  | 
 |  | +            sql=update_sql,
 | 
											
												
													
														|  | 
 |  | +            params=(
 | 
											
												
													
														|  | 
 |  | +                rematch_task_const.TASK_INIT_STATUS,
 | 
											
												
													
														|  | 
 |  | +                rematch_task_const.AIGC_DONT_GET_RESULT_STATUS,
 | 
											
												
													
														|  | 
 |  | +                rematch_task_const.TASK_INIT_PROCESS_TIMES,
 | 
											
												
													
														|  | 
 |  | +                content_id
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        return affected_rows
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def process_task(self, task_list: List[Dict]) -> None:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        处理
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        for task in task_list:
 | 
											
												
													
														|  | 
 |  | +            content_id = task['content_id']
 | 
											
												
													
														|  | 
 |  | +            trace_id = task['trace_id']
 | 
											
												
													
														|  | 
 |  | +            processing_flag = await self.whether_same_content_id_processing(content_id=content_id)
 | 
											
												
													
														|  | 
 |  | +            if processing_flag:
 | 
											
												
													
														|  | 
 |  | +                continue
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                affected_row = await self.get_task_lock(trace_id)
 | 
											
												
													
														|  | 
 |  | +                if not affected_row:
 | 
											
												
													
														|  | 
 |  | +                    continue
 | 
											
												
													
														|  | 
 |  | +                affected_rows = await self.process_content_id(content_id)
 | 
											
												
													
														|  | 
 |  | +                if affected_rows:
 | 
											
												
													
														|  | 
 |  | +                    logging(
 | 
											
												
													
														|  | 
 |  | +                        code="rematch_1002",
 | 
											
												
													
														|  | 
 |  | +                        function="deal",
 | 
											
												
													
														|  | 
 |  | +                        info="回滚content_id成功",
 | 
											
												
													
														|  | 
 |  | +                        data={
 | 
											
												
													
														|  | 
 |  | +                            "content_id": content_id,
 | 
											
												
													
														|  | 
 |  | +                            "trace_id": trace_id,
 | 
											
												
													
														|  | 
 |  | +                            "affected_rows": affected_rows
 | 
											
												
													
														|  | 
 |  | +                        }
 | 
											
												
													
														|  | 
 |  | +                    )
 | 
											
												
													
														|  | 
 |  | +                else:
 | 
											
												
													
														|  | 
 |  | +                    logging(
 | 
											
												
													
														|  | 
 |  | +                        code="rematch_1003",
 | 
											
												
													
														|  | 
 |  | +                        function="deal",
 | 
											
												
													
														|  | 
 |  | +                        info="回滚content_id失败",
 | 
											
												
													
														|  | 
 |  | +                        data={
 | 
											
												
													
														|  | 
 |  | +                            "content_id": content_id,
 | 
											
												
													
														|  | 
 |  | +                            "trace_id": trace_id
 | 
											
												
													
														|  | 
 |  | +                        }
 | 
											
												
													
														|  | 
 |  | +                    )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def check_task(self, task_list: List[Dict]) -> None:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        校验任务是否完成
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        for task in task_list:
 | 
											
												
													
														|  | 
 |  | +            content_id = task['content_id']
 | 
											
												
													
														|  | 
 |  | +            enough_video_flag = await self.check_whether_get_enough_videos(content_id=content_id)
 | 
											
												
													
														|  | 
 |  | +            if enough_video_flag:
 | 
											
												
													
														|  | 
 |  | +                affected_rows = await self.update_content_id_status_to_success(content_id=content_id)
 | 
											
												
													
														|  | 
 |  | +                if affected_rows:
 | 
											
												
													
														|  | 
 |  | +                    logging(
 | 
											
												
													
														|  | 
 |  | +                        code="rematch_1004",
 | 
											
												
													
														|  | 
 |  | +                        function="check_task",
 | 
											
												
													
														|  | 
 |  | +                        info="修改状态成功",
 | 
											
												
													
														|  | 
 |  | +                        data={
 | 
											
												
													
														|  | 
 |  | +                            "content_id": content_id,
 | 
											
												
													
														|  | 
 |  | +                            "affected_rows": affected_rows
 | 
											
												
													
														|  | 
 |  | +                        }
 | 
											
												
													
														|  | 
 |  | +                    )
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                continue
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    async def deal(self):
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        do job here
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        # 处理任务
 | 
											
												
													
														|  | 
 |  | +        task_list = await self.get_tasks(business_type="process")
 | 
											
												
													
														|  | 
 |  | +        if task_list:
 | 
											
												
													
														|  | 
 |  | +            await self.process_task(task_list)
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            logging(
 | 
											
												
													
														|  | 
 |  | +                code="rematch_5001",
 | 
											
												
													
														|  | 
 |  | +                info="do not get article to process"
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +        # 校验任务
 | 
											
												
													
														|  | 
 |  | +        task_list = await self.get_tasks(business_type="check")
 | 
											
												
													
														|  | 
 |  | +        if task_list:
 | 
											
												
													
														|  | 
 |  | +            await self.check_task(task_list)
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            logging(
 | 
											
												
													
														|  | 
 |  | +                code="rematch_5002",
 | 
											
												
													
														|  | 
 |  | +                info="do not get article to check"
 | 
											
												
													
														|  | 
 |  | +            )
 |