@@ -0,0 +1,159 @@
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import time
|
|
|
+import asyncio
|
|
|
+import datetime
|
|
|
+
|
|
|
+from applications.db import AsyncMySQLClient
|
|
|
+from applications.spider import search_videos_from_web
|
|
|
+from tasks.utils.kimi_task import get_kimi_result
|
|
|
+from tasks.utils.etl_task import async_download_videos
|
|
|
+
|
|
|
+
|
|
|
+class ContentRematch:
+    """
+    Re-match videos for articles that lack enough downloaded candidates.
+    """
+
+    def __init__(self, db_client):
+        self.db_client = db_client
+
+    async def get_content_list(self):
+        """
+        Fetch the content_ids that still need matching.
+        """
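+        # An article qualifies for rematch when it has fewer than 3 videos
+        # with download_status = 2 (assumed to mean "downloaded successfully").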
+ select_sql = f"""
|
|
|
+ SELECT content_id, count(1)
|
|
|
+ FROM long_articles_crawler_videos
|
|
|
+ WHERE download_status = 2
|
|
|
+ GROUP BY content_id
|
|
|
+ HAVING count(1) < 3
|
|
|
+ LIMIT 10;
|
|
|
+ """
|
|
|
+ content_list = await self.db_client.async_select(select_sql)
|
|
|
+ return content_list
|
|
|
+
|
|
|
+ async def get_illegal_out_ids(self, content_id: str) -> list:
|
|
|
+ """
|
|
|
+ 获取违规的外站视频id
|
|
|
+ """
|
|
|
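+        # NOTE (assumption): content_id is an internal, trusted id, so the
+        # f-string interpolation below is kept; if async_select accepts bind
+        # parameters, passing content_id that way would be safer.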
+ select_sql = f"""
|
|
|
+ SELECT platform, out_video_id
|
|
|
+ FROM long_articles_crawler_videos
|
|
|
+ WHERE content_id = '{content_id}' and is_illegal = 1;
|
|
|
+ """
|
|
|
+ response = await self.db_client.async_select(select_sql)
|
|
|
+ if response:
|
|
|
+ result = ["{}_{}".format(line[0], line[1]) for line in response]
|
|
|
+ return result
|
|
|
+ else:
|
|
|
+ return []
|
|
|
+
|
|
|
+    async def spider_task(self, params, kimi_result):
+        """
+        Spider task: search the web for candidate videos.
+        :return: True when enough candidates were found.
+        """
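+        # gh_id is a fixed placeholder and gh_id_map is empty because a
+        # rematch run is apparently not tied to a real account.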
+        content_id = params['content_id']
+        gh_id = "test_gh_id"
+        trace_id = "rematch_{}".format(content_id)
+
+        try:
+            search_videos_count = await search_videos_from_web(
+                info={
+                    "ori_title": kimi_result['ori_title'],
+                    "kimi_summary": kimi_result['kimi_summary'],
+                    "kimi_keys": kimi_result['kimi_keys'],
+                    "trace_id": trace_id,
+                    "gh_id": gh_id,
+                    "content_id": content_id,
+                    "crawler_video_table": "long_articles_crawler_videos",
+                },
+                gh_id_map={},
+                db_client=self.db_client
+            )
+            # At least 3 candidates are required, matching the rematch query.
+            return search_videos_count >= 3
+        except Exception as e:
+            # Log instead of silently swallowing the failure.
+            print("spider_task failed for {}: {}".format(content_id, e))
+            return False
+
+    async def etl_task(self, content_id, trace_id):
+        """
+        ETL task: download the crawled videos for this content_id.
+        """
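+        # illegal_videos carries "{platform}_{out_video_id}" keys built in
+        # get_illegal_out_ids, presumably so the downloader can skip videos
+        # that were already flagged.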
+        # download videos
+        illegal_videos = await self.get_illegal_out_ids(content_id)
+        downloaded_count = await async_download_videos(
+            trace_id=trace_id,
+            content_id=content_id,
+            article_crawler_video_table="long_articles_crawler_videos",
+            db_client=self.db_client,
+            illegal_videos=illegal_videos
+        )
+
+        # At least 3 downloaded videos count as success.
+        return downloaded_count >= 3
+
+    async def process_each_content(self, content):
+        """
+        Process a single content_id.
+        """
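+        # Pipeline: Kimi result -> web spider -> download (ETL), bailing out
+        # early whenever a stage fails to produce enough material.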
+        content_id = content[0]
+
+        kimi_result = await get_kimi_result(
+            content_id=content_id,
+            db_client=self.db_client,
+            article_text_table="long_articles_text",
+        )
+
+        if not kimi_result:
+            return
+
+        spider_flag = await self.spider_task(
+            params={
+                "content_id": content_id,
+            },
+            kimi_result=kimi_result
+        )
+
+        if not spider_flag:
+            return
+        etl_flag = await self.etl_task(
+            content_id=content_id,
+            trace_id="rematch_{}".format(content_id)
+        )
+        if not etl_flag:
+            return
+        return True
+
+    async def deal(self):
+        """
+        Fetch the pending content_ids and process them.
+        """
+        content_list = await self.get_content_list()
+        # NOTE: as written, only the first entry is processed per run.
+        for content in content_list[:1]:
+            print(content)
+            await self.process_each_content(content)
+
+
+async def main_job():
+    """
+    Main job: run one rematch pass against the long_articles pool.
+    """
+    async with AsyncMySQLClient() as long_articles_pool:
+        _task = ContentRematch(long_articles_pool)
+        await _task.deal()
+
+
+if __name__ == '__main__':
+    while True:
+        asyncio.run(main_job())
+        now_str = str(datetime.datetime.now())
+        print("{} request finished, sleeping 60s".format(now_str))
+        time.sleep(60)