123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- from __future__ import annotations
- import json
- import time
- import traceback
- from pymysql.cursors import DictCursor
- from tqdm import tqdm
- from applications import log
- from applications.api import fetch_piaoquan_video_list_detail
- from applications.const.crawler_video_const import CrawlerPiaoQuanVideosConst
- from applications.db import DatabaseConnector
- from applications.pipeline import scrape_video_entities_process
- from applications.utils import Item
- from applications.utils import str_to_md5
- from applications.utils import insert_into_single_video_source_table
- from config import long_articles_config
# Shared constants used throughout this module: the top-video table name,
# the row status codes, the platform tag and the "no source account" marker.
const = CrawlerPiaoQuanVideosConst()
# Maps each fine-grained category label to the coarse category that is stored
# on the crawled video item.  Declared as (coarse, fine-grained labels) groups
# and flattened so that every fine-grained label becomes a key.
category_map = {
    fine_label: coarse_label
    for coarse_label, fine_labels in (
        ("知识科普", ("知识科普", "生活技巧科普", "老年相关法律科普")),
        ("军事历史", ("中国战争史", "中国历史影像")),
        ("家长里短", ("正能量剧情",)),
        ("社会法治", ("人财诈骗", "贪污腐败")),
        ("奇闻趣事", ("罕见画面", "惊奇事件", "动物萌宠")),
        ("名人八卦", ("老明星",)),
        ("健康养生", ("健康知识", "饮食健康")),
        ("情感故事", ("人生忠告", "老年生活")),
        ("政治新闻", ("国际军事", "他国政策", "国际时政")),
        ("历史人物", ("历史名人",)),
    )
    for fine_label in fine_labels
}
- class CrawlerPiaoQuanVideos:
- def __init__(self):
- self.db_client = DatabaseConnector(long_articles_config)
- self.db_client.connect()
- def get_piaoquan_top_video_list(self) -> list[dict]:
- fetch_query = f"""
- select id, video_id, title
- from {const.PIAOQUAN_TOP_VIDEO_TABLE}
- where status = {const.INIT_STATUS};
- """
- task_list = self.db_client.fetch(fetch_query, cursor_type=DictCursor)
- return task_list
- def update_piaoquan_top_video_status(
- self, pool_id: int, ori_status: int, new_status: int
- ) -> int:
- update_query = f"""
- update {const.PIAOQUAN_TOP_VIDEO_TABLE}
- set status = %s
- where id = %s and status = %s;
- """
- return self.db_client.save(update_query, (pool_id, ori_status, new_status))
- def crawler_each_video(self, video_data: dict) -> None:
- """
- crawler each video data
- """
- # lock video id
- lock_acquired = self.update_piaoquan_top_video_status(
- pool_id=video_data["id"],
- ori_status=const.INIT_STATUS,
- new_status=const.PROCESSING_STATUS,
- )
- if not lock_acquired:
- return
- # get video detail from piaoquan
- response_from_piaoquan = fetch_piaoquan_video_list_detail(
- [video_data["video_id"]]
- )
- video_detail = response_from_piaoquan["data"][0]
- video_item = Item()
- unique_id = f"{const.PLATFORM}-{video_data['video_id']}"
- # add info into item
- video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
- video_item.add("url_unique_md5", video_data["video_id"])
- video_item.add("article_title", video_data["title"])
- video_item.add("out_account_id", video_detail["uid"])
- video_item.add("out_account_name", video_data["user"]["nickName"])
- video_item.add(
- "publish_timestamp", int(video_detail["gmtCreateTimestamp"] / 1000)
- )
- video_item.add("platform", const.PLATFORM)
- video_item.add(
- "article_url",
- f"https://admin.piaoquantv.com/cms/post-detail/{video_data['video_id']}/detail",
- )
- video_item.add("source_account", const.NO_SOURCE_ACCOUNT)
- video_item.add("crawler_timestamp", int(time.time()))
- video_item.add("oss_path", video_detail["ossVideoPath"])
- video_item.add("audit_status", video_detail["auditStatus"])
- video_item.add("category", category_map.get(video_data["category"]))
- # check item before insert
- video_item.check(source="video")
- try:
- item_with_oss_path = scrape_video_entities_process(
- video_item=video_item.item, db_client=self.db_client
- )
- if item_with_oss_path:
- insert_into_single_video_source_table(
- db_client=self.db_client, video_item=item_with_oss_path
- )
- except Exception as e:
- detail = {
- "video_item": video_item.item,
- "error": str(e),
- "traceback": traceback.format_exc(),
- }
- log(
- task="crawler_piaoquan_videos",
- function="crawler_each_video",
- message="crawler_piaoquan_videos failed",
- status="failed",
- data=detail,
- )
|