crawler_channel_account_videos.py

  1. """
  2. @author: luojunhui
  3. @tool: pycharm && deepseek
  4. """
  5. import json
  6. import time
  7. import traceback
  8. from applications import log
  9. from applications.db import DatabaseConnector
  10. from applications.utils import download_sph_video
  11. from applications.utils import str_to_md5
  12. from applications.utils import upload_to_oss
  13. from config import long_articles_config
  14. from coldStartTasks.crawler.channels import get_channel_account_videos
  15. NO_SOURCE_ACCOUNT = 0


class CrawlerChannelAccountVideos:
    """
    Crawl the videos published by channel accounts (platform: sph).
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        self.success_crawler_video_count = 0

    def whether_video_exists(self, title: str) -> bool:
        """
        Check whether a video has already been crawled, deduplicating by title.
        """
        sql = """
            select id from publish_single_video_source
            where article_title = %s;
        """
        duplicate_id = self.db_client.fetch(query=sql, params=(title,))
        if duplicate_id:
            print(f"{title} video exists")
            return True
        return False

    def get_channel_account_list(self):
        """
        Get the channel account list from the database (not implemented yet).
        """
        return

    def crawler_each_account(self, channel_account_id: str, channel_account_name: str):
        """
        Crawl the videos of a single channel account.
        """
        response = get_channel_account_videos(channel_account_id)
        if response['ret'] == 200:
            response_data = response['data']
            # pagination cursor and flag for fetching the next page (not used yet)
            last_buffer = response_data['lastBuffer']
            continue_flag = response_data['continueFlag']
            video_list = response_data['object']
            for video in video_list:
                video_id = str(video['id'])
                account_name = video['nickname']
                object_desc = video['objectDesc']
                publish_timestamp = video['createtime']
                title = object_desc['description']
                if self.whether_video_exists(title):
                    continue
                media = object_desc['media'][0]
                url = media['Url']
                decode_key = media['decodeKey']
                url_token = media['urlToken']
                download_url = url + url_token
                try:
                    # download the encrypted video, decrypt it locally, then upload to OSS
                    decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
                    oss_path = upload_to_oss(decrypt_path)
                    insert_sql = """
                        insert into publish_single_video_source
                        (content_trace_id, article_title, out_account_id, out_account_name, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, platform, source_account)
                        values
                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                    """
                    try:
                        self.db_client.save(
                            query=insert_sql,
                            params=(
                                "video{}".format(str_to_md5(video_id)),
                                title,
                                channel_account_id,
                                account_name,
                                oss_path,
                                publish_timestamp,
                                int(time.time()),
                                video_id,
                                "sph",
                                NO_SOURCE_ACCOUNT,
                            ),
                        )
                        self.success_crawler_video_count += 1
                    except Exception as e:
                        log(
                            task="crawler_channel_account_videos",
                            function="save_each_video",
                            message="save video failed",
                            data={
                                "error": str(e),
                                "traceback": traceback.format_exc(),
                                "video_id": video_id,
                                "oss_path": oss_path,
                            },
                        )
                except Exception as e:
                    print("download video error:", e)
        else:
            print(f"crawler channel account {channel_account_name} videos failed")
            return
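

# Minimal usage sketch: assumes get_channel_account_list() will be implemented
# to return (account_id, account_name) pairs; with the current stub returning
# None, the `or []` keeps the loop a no-op.
if __name__ == "__main__":
    task = CrawlerChannelAccountVideos()
    for account_id, account_name in task.get_channel_account_list() or []:
        task.crawler_each_account(
            channel_account_id=account_id,
            channel_account_name=account_name,
        )
    print(f"crawled {task.success_crawler_video_count} videos in total")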