1 yıl önce · 28c226ea2d
--- a/RematchTask.py
+++ b/RematchTask.py
@@ -0,0 +1,22 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+import time
														
 
															+import datetime
														
 
															+import asyncio
														
 
															+from applications.db import AsyncMySQLClient
														
 
															+from tasks.rm_match_task import ReMatchTask
														
 
															+
														
 
															+
														
 
															+async def main_job():
														
 
															+    """
														
 
															+    main job
														
 
															+    :return:
														
 
															+    """
														
 
															+    async with AsyncMySQLClient() as long_articles_pool:
														
 
															+        new_content_id_task = ReMatchTask(long_articles_pool)
														
 
															+        await new_content_id_task.deal()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    asyncio.run(main_job())
														
--- a/applications/const/__init__.py
+++ b/applications/const/__init__.py
@@ -2,9 +2,12 @@
 
															 @author: luojunhui
														
 
															 """
														
 
															 from .server_const import ServerConst
														
 
															-from .task_const import HistoryContentIdTaskConst, NewContentIdTaskConst
														
 
															+from .task_const import HistoryContentIdTaskConst
														
 
															+from .task_const import NewContentIdTaskConst
														
 
															+from .task_const import RematchTaskConst
														
 
															 server_const = ServerConst()
														
 
															 new_content_id_task_const = NewContentIdTaskConst()
														
 
															-history_content_id_task_const = HistoryContentIdTaskConst()
														
 
															+history_content_id_task_const = HistoryContentIdTaskConst()
														
 
															+rematch_task_const = RematchTaskConst()
														
--- a/applications/const/task_const.py
+++ b/applications/const/task_const.py
@@ -73,4 +73,17 @@ class NewContentIdTaskConst(HistoryContentIdTaskConst):
 
															     KIMI_FAIL_STATUS = 2
														
 
															     # 视频下载失败状态
														
 
															+    VIDEO_DOWNLOAD_INIT_STATUS = 0
														
 
															     VIDEO_DOWNLOAD_FAIL_STATUS = 3
														
 
															+
														
 
															+
														
 
															+class RematchTaskConst(NewContentIdTaskConst):
														
 
															+    """
														
 
															+    Rematch Task const
														
 
															+    """
														
 
															+    TASK_DEFAULT_GH_ID = 'DEFAULT_ID'
														
 
															+
														
 
															+    # 待重新匹配状态
														
 
															+    REMATCH_INIT_STATUS = 0
														
 
															+    REMATCH_SUCCESS_STATUS = 1
														
 
															+    REMATCH_FAIL_STATUS = 2
														
--- a/applications/db/__init__.py
+++ b/applications/db/__init__.py
@@ -3,6 +3,8 @@
 
															 """
														
 
															 import aiomysql
														
 
															+from aiomysql.cursors import Cursor
														
 
															+
														
 
															 from applications.config import denet_config, long_articles_config
														
@@ -47,17 +49,18 @@ class AsyncMySQLClient(object):
 
															         self.mysql_pool.close()
														
 
															         await self.mysql_pool.wait_closed()
														
 
															-    async def async_select(self, sql):
														
 
															+    async def async_select(self, sql, cursor_type=Cursor):
														
 
															         """
														
 
															         select method
														
 
															         :param sql:
														
 
															+        :param cursor_type:
														
 
															         :return:
														
 
															         """
														
 
															         async with self.mysql_pool.acquire() as conn:
														
 
															-            async with conn.cursor() as cursor:
														
 
															+            async with conn.cursor(cursor_type) as cursor:
														
 
															                 await cursor.execute(sql)
														
 
															                 result = await cursor.fetchall()
														
 
															-                return result
														
 
															+            return result
														
 
															     async def async_insert(self, sql, params):
														
 
															         """
														
--- a/tasks/rm_match_task.py
+++ b/tasks/rm_match_task.py
@@ -0,0 +1,318 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+import traceback
														
 
															+from typing import Dict, List
														
 
															+
														
 
															+from aiomysql.cursors import DictCursor
														
 
															+
														
 
															+from applications.log import logging
														
 
															+from applications.config import Config
														
 
															+from applications.const import rematch_task_const
														
 
															+from applications.etl_function import download_cover
														
 
															+from applications.etl_function import download_video
														
 
															+from applications.etl_function import generate_video_path
														
 
															+from applications.etl_function import upload_to_oss
														
 
															+from applications.spider import search_videos_from_web
														
 
															+from .utils import get_kimi_result
														
 
															+
														
 
															+
														
 
															+class ReMatchTask(object):
														
 
															+    """
														
 
															+    重新匹配任务
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, db_client):
														
 
															+        self.db_client = db_client
														
 
															+        self.config = Config()
														
 
															+        self.article_match_video_table = self.config.article_match_video_table
														
 
															+        self.article_text_table = self.config.article_text_table
														
 
															+        self.article_crawler_video_table = self.config.article_crawler_video_table
														
 
															+
														
 
															+    async def get_tasks(self) -> List[Dict]:
														
 
															+        """
														
 
															+        get task needs to rematch
														
 
															+        """
														
 
															+        select_sql = f"""
														
 
															+            SELECT DISTINCT content_id 
														
 
															+            FROM article_re_match_record
														
 
															+            WHERE status = {rematch_task_const.REMATCH_INIT_STATUS};
														
 
															+        """
														
 
															+        result = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
														
 
															+        logging(
														
 
															+            code="rematch_1001",
														
 
															+            function="get_tasks",
														
 
															+            info="获取content_id数量： {}".format(len(result))
														
 
															+        )
														
 
															+        return result
														
 
															+
														
 
															+    async def async_download_video_list(self, to_download_videos: List[Dict], illegal_platform_id_list: List[str]) -> int:
														
 
															+        """
														
 
															+        异步下载视频
														
 
															+        """
														
 
															+        success_count = 0
														
 
															+        for video in to_download_videos:
														
 
															+            # check whether video is illegal
														
 
															+            out_key = "{}_{}".format(video['platform'], video['out_video_id'])
														
 
															+            if out_key in illegal_platform_id_list:
														
 
															+                continue
														
 
															+            # start download
														
 
															+            try:
														
 
															+                local_video_path, local_cover_path = generate_video_path(video['platform'], video['out_video_id'])
														
 
															+                # download videos
														
 
															+                file_path = await download_video(
														
 
															+                    file_path=local_video_path,
														
 
															+                    platform=video['platform'],
														
 
															+                    video_url=video['video_url']
														
 
															+                )
														
 
															+                if not file_path:
														
 
															+                    # 说明视频下载失败，无需上传该视频, 将该条记录设置为失败状态
														
 
															+                    update_sql = f"""
														
 
															+                            UPDATE {self.article_crawler_video_table}
														
 
															+                            SET download_status = %s
														
 
															+                            WHERE id = %s;
														
 
															+                    """
														
 
															+
														
 
															+                    await self.db_client.async_insert(
														
 
															+                        sql=update_sql,
														
 
															+                        params=(rematch_task_const.VIDEO_DOWNLOAD_FAIL_STATUS, video['id'])
														
 
															+                    )
														
 
															+                else:
														
 
															+                    # download cover
														
 
															+                    cover_path = await download_cover(
														
 
															+                        file_path=local_cover_path,
														
 
															+                        platform=video['platform'],
														
 
															+                        cover_url=video['cover_url']
														
 
															+                    )
														
 
															+                    # upload video to oss
														
 
															+                    oss_video = await upload_to_oss(
														
 
															+                        local_video_path=file_path,
														
 
															+                        download_type="video"
														
 
															+                    )
														
 
															+                    # upload cover to oss
														
 
															+                    if cover_path:
														
 
															+                        oss_cover = await upload_to_oss(
														
 
															+                            local_video_path=cover_path,
														
 
															+                            download_type="image"
														
 
															+                        )
														
 
															+                    else:
														
 
															+                        oss_cover = None
														
 
															+
														
 
															+                    # change status to success
														
 
															+                    update_sql = f"""
														
 
															+                        UPDATE {self.article_crawler_video_table}
														
 
															+                        SET video_oss_path = %s, cover_oss_path = %s, download_status = %s
														
 
															+                        WHERE id = %s;
														
 
															+                    """
														
 
															+                    await self.db_client.async_insert(
														
 
															+                        sql=update_sql,
														
 
															+                        params=(
														
 
															+                            oss_video,
														
 
															+                            oss_cover,
														
 
															+                            rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS,
														
 
															+                            video['id']
														
 
															+                        )
														
 
															+                    )
														
 
															+                    success_count += 1
														
 
															+                # 如果下载的视频数已经大于3， 则直接退出循环，修改状态为ETL成功状态
														
 
															+                if success_count > rematch_task_const.MIN_MATCH_VIDEO_NUM:
														
 
															+                    return success_count
														
 
															+            except Exception as e:
														
 
															+                update_sql = f"""
														
 
															+                            UPDATE {self.article_crawler_video_table}
														
 
															+                            SET download_status = %s
														
 
															+                            WHERE id = %s;
														
 
															+                            """
														
 
															+                await self.db_client.async_insert(
														
 
															+                    sql=update_sql,
														
 
															+                    params=(rematch_task_const.VIDEO_DOWNLOAD_FAIL_STATUS, video['id'])
														
 
															+                )
														
 
															+
														
 
															+        return success_count
														
 
															+
														
 
															+    async def download_upload_task(self, content_id: str, illegal_platform_id_list: List[str], legal_not_downloaded_videos=None) -> int:
														
 
															+        """
														
 
															+        下载任务
														
 
															+        """
														
 
															+        success_count = 0
														
 
															+        if legal_not_downloaded_videos:
														
 
															+            # 下载legal_not_download_videos, 记录下载成功数量
														
 
															+            new_download_count = await self.async_download_video_list(
														
 
															+                to_download_videos=legal_not_downloaded_videos,
														
 
															+                illegal_platform_id_list=illegal_platform_id_list
														
 
															+            )
														
 
															+            success_count += new_download_count
														
 
															+            return success_count
														
 
															+
														
 
															+        select_sql = f"""
														
 
															+            SELECT id, out_video_id, platform, video_title, video_url, cover_url
														
 
															+            FROM {self.article_crawler_video_table} 
														
 
															+            WHERE content_id = {content_id} AND download_status = {rematch_task_const.VIDEO_DOWNLOAD_INIT_STATUS};
														
 
															+        """
														
 
															+        to_download_videos = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
														
 
															+        new_download_count = await self.async_download_video_list(
														
 
															+            to_download_videos=to_download_videos,
														
 
															+            illegal_platform_id_list=illegal_platform_id_list
														
 
															+        )
														
 
															+        success_count += new_download_count
														
 
															+        return success_count
														
 
															+
														
 
															+    async def spider_task(self, content_id: str) -> int:
														
 
															+        """
														
 
															+        爬虫抓取任务
														
 
															+        """
														
 
															+        kimi_result = await get_kimi_result(
														
 
															+            content_id=content_id,
														
 
															+            article_text_table=self.article_text_table,
														
 
															+            db_client=self.db_client
														
 
															+        )
														
 
															+        # 该搜索任务是content_id粒度，因此将trace_id 传为content_id
														
 
															+        search_videos_count = await search_videos_from_web(
														
 
															+            info={
														
 
															+                "ori_title": kimi_result['ori_title'],
														
 
															+                "kimi_summary": kimi_result['kimi_summary'],
														
 
															+                "kimi_keys": kimi_result['kimi_keys'],
														
 
															+                "trace_id": content_id,
														
 
															+                "gh_id": rematch_task_const.TASK_DEFAULT_GH_ID,
														
 
															+                "content_id": content_id,
														
 
															+                "crawler_video_table": self.article_crawler_video_table
														
 
															+            },
														
 
															+            gh_id_map={},
														
 
															+            db_client=self.db_client
														
 
															+        )
														
 
															+        return search_videos_count
														
 
															+
														
 
															+    async def update_each_content_id(self, content_id: str) -> None:
														
 
															+        """
														
 
															+        更新每一个content_id的视频信息
														
 
															+        """
														
 
															+        select_sql = f"""
														
 
															+            SELECT
														
 
															+                id, platform, out_video_id, video_oss_path, download_status, is_illegal, video_url, cover_url
														
 
															+            FROM
														
 
															+                {self.article_crawler_video_table}
														
 
															+            WHERE content_id = '{content_id}';
														
 
															+        """
														
 
															+        record_list = await self.db_client.async_select(select_sql, cursor_type=DictCursor)
														
 
															+        # 获取违规视频，违规视频out_id && platform
														
 
															+        illegal_videos = [record for record in record_list if record["is_illegal"] == rematch_task_const.VIDEO_UNSAFE]
														
 
															+        illegal_platform_id_list = [
														
 
															+            "{}_{}".format(record['platform'], record['out_video_id']) for record in illegal_videos
														
 
															+        ]
														
 
															+
														
 
															+        # 处理非违规字段
														
 
															+        legal_downloaded_videos = []
														
 
															+        legal_not_downloaded_videos = []
														
 
															+        for record in record_list:
														
 
															+            if record['is_illegal'] == rematch_task_const.VIDEO_UNSAFE:
														
 
															+                continue
														
 
															+            else:
														
 
															+                out_key = "{}_{}".format(record['platform'], record['out_video_id'])
														
 
															+                if out_key in illegal_platform_id_list:
														
 
															+                    continue
														
 
															+                if record['download_status'] == rematch_task_const.VIDEO_DOWNLOAD_INIT_STATUS:
														
 
															+                    legal_not_downloaded_videos.append(record)
														
 
															+                elif record['download_status'] == rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS:
														
 
															+                    legal_downloaded_videos.append(record)
														
 
															+                else:
														
 
															+                    continue
														
 
															+
														
 
															+        if len(legal_downloaded_videos) >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
														
 
															+            logging(
														
 
															+                code="rematch_1002",
														
 
															+                info="存在{}条以上待合规已下载内容，无需处理该content_id".format(rematch_task_const.MIN_MATCH_VIDEO_NUM),
														
 
															+                data={
														
 
															+                    "content_id": content_id
														
 
															+                }
														
 
															+            )
														
 
															+            return
														
 
															+        elif len(legal_not_downloaded_videos + legal_downloaded_videos) >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
														
 
															+            logging(
														
 
															+                code="rematch_1003",
														
 
															+                info="存在{}条以上待合规内容".format(rematch_task_const.MIN_MATCH_VIDEO_NUM),
														
 
															+                data={
														
 
															+                    "content_id": content_id,
														
 
															+                    "待下载数量": len(legal_not_downloaded_videos),
														
 
															+                    "已下载数量": len(legal_downloaded_videos)
														
 
															+                }
														
 
															+            )
														
 
															+            await self.download_upload_task(
														
 
															+                content_id=content_id,
														
 
															+                illegal_platform_id_list=illegal_platform_id_list,
														
 
															+                legal_not_downloaded_videos=legal_not_downloaded_videos
														
 
															+            )
														
 
															+        else:
														
 
															+            logging(
														
 
															+                code="rematch_1004",
														
 
															+                info="重新执行爬虫任务",
														
 
															+                data={
														
 
															+                    "content_id": content_id,
														
 
															+                }
														
 
															+            )
														
 
															+            await self.spider_task(content_id=content_id)
														
 
															+            await self.download_upload_task(
														
 
															+                content_id=content_id,
														
 
															+                illegal_platform_id_list=illegal_platform_id_list
														
 
															+            )
														
 
															+
														
 
															+    async def check_whether_get_enough_videos(self, content_id: str) -> bool:
														
 
															+        """
														
 
															+        check 该content_id是否存在足够的视频
														
 
															+        """
														
 
															+        select_sql = f"""
														
 
															+            SELECT count(1)
														
 
															+            FROM {self.article_crawler_video_table}
														
 
															+            WHERE content_id = '{content_id}' 
														
 
															+                AND download_status = {rematch_task_const.VIDEO_DOWNLOAD_SUCCESS_STATUS} 
														
 
															+                AND is_illegal = {rematch_task_const.VIDEO_SAFE};
														
 
															+        """
														
 
															+        count_tuple = await self.db_client.async_select(select_sql)
														
 
															+        count = count_tuple[0][0]
														
 
															+        if count >= rematch_task_const.MIN_MATCH_VIDEO_NUM:
														
 
															+            return True
														
 
															+        else:
														
 
															+            return False
														
 
															+
														
 
															+    async def update_content_id_status(self, content_id: str) -> int:
														
 
															+        """
														
 
															+        更新content_id的status为成功
														
 
															+        """
														
 
															+        update_sql = f"""
														
 
															+            UPDATE article_re_match_record
														
 
															+            SET status = %s
														
 
															+            WHERE content_id = '{content_id}';
														
 
															+        """
														
 
															+        affected_rows = await self.db_client.async_insert(
														
 
															+            update_sql,
														
 
															+            params=(rematch_task_const.REMATCH_SUCCESS_STATUS,)
														
 
															+        )
														
 
															+        return affected_rows
														
 
															+
														
 
															+    async def deal(self):
														
 
															+        """
														
 
															+        do job here
														
 
															+        """
														
 
															+        task_list = await self.get_tasks()
														
 
															+        for task in task_list:
														
 
															+            content_id = task['content_id']
														
 
															+            try:
														
 
															+                await self.update_each_content_id(content_id=content_id)
														
 
															+
														
 
															+                if await self.check_whether_get_enough_videos(content_id=content_id):
														
 
															+                    # 修改状态为1
														
 
															+                    await self.update_content_id_status(content_id=content_id)
														
 
															+                else:
														
 
															+                    continue
														
 
															+            except Exception as e:
														
 
															+                error_stack = traceback.format_exc()
														
 
															+                logging(
														
 
															+                    code="rematch_1010",
														
 
															+                    function="update_each_content_id",
														
 
															+                    data={
														
 
															+                        "error_stack": error_stack,
														
 
															+                        "error": str(e),
														
 
															+                        "content_id": content_id
														
 
															+                    }
														
 
															+                )
														
 
															+
														
 
															+