- """
- @author: luojunhui
- @tool: pycharm && deepseek
- """
import json
import time
import traceback

from applications import log
from applications.db import DatabaseConnector
from applications.utils import download_sph_video
from applications.utils import str_to_md5
from applications.utils import upload_to_oss
from config import long_articles_config
from coldStartTasks.crawler.channels import get_channel_account_videos

NO_SOURCE_ACCOUNT = 0
class CrawlerChannelAccountVideos:
    """
    Crawl videos posted by WeChat Channels ("sph") accounts, download and
    decrypt each video, upload it to OSS, and record it in the
    `publish_single_video_source` table.
    """

    def __init__(self):
        # Connect once at construction; the connector is reused for all inserts.
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        # Number of videos successfully downloaded, uploaded and saved.
        self.success_crawler_video_count = 0

    def get_channel_account_list(self):
        """
        Get the channel account list from the database.

        NOTE(review): placeholder — not implemented yet, always returns None.
        """
        return

    def crawler_each_account(self, channel_account_id: str, channel_account_name: str):
        """
        Crawl the videos of a single channel account and persist each one.

        :param channel_account_id: platform id of the channel account
        :param channel_account_name: display name, used only for logging
        """
        response = get_channel_account_videos(channel_account_id)
        if response['ret'] != 200:
            # Fix: report API failures through the structured log instead of
            # printing to stdout, consistent with the save-failure path.
            log(
                task="crawler_channel_account_videos",
                function="crawler_each_account",
                message=f"crawler channel account {channel_account_name} videos failed",
                data={"response": response},
            )
            return

        response_data = response['data']
        # Pagination cursor and continuation flag returned by the API.
        # TODO: currently unused — use them to fetch subsequent pages.
        last_buffer = response_data['lastBuffer']
        continue_flag = response_data['continueFlag']
        video_list = response_data['object']

        # NOTE(review): only the first video of each account is processed
        # ([:1]); presumably intentional while piloting — confirm before
        # widening the slice.
        for video in video_list[:1]:
            video_id = str(video['id'])
            account_name = video['nickname']
            object_desc = video['objectDesc']
            publish_timestamp = video['createtime']
            title = object_desc['description']
            media = object_desc['media'][0]
            url = media['Url']
            decode_key = media['decodeKey']
            url_token = media['urlToken']
            download_url = url + url_token

            try:
                # Download the encrypted stream, decrypt it locally, then
                # push the decrypted file to OSS.
                decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
                oss_path = upload_to_oss(decrypt_path)
            except Exception as e:
                # Fix: download errors were printed to stdout; log them with
                # a traceback like every other failure path.
                log(
                    task="crawler_channel_account_videos",
                    function="crawler_each_account",
                    message="download video failed",
                    data={
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                        "video_id": video_id,
                    },
                )
                continue

            insert_sql = """
                insert into publish_single_video_source
                (content_trace_id, article_title, out_account_id, out_account_name, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, platform, source_account)
                values
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
            """
            try:
                self.db_client.save(
                    query=insert_sql,
                    params=(
                        "video{}".format(str_to_md5(video_id)),
                        title,
                        channel_account_id,
                        account_name,
                        oss_path,
                        publish_timestamp,
                        int(time.time()),
                        # NOTE(review): the column is named url_unique_md5 but
                        # the raw video_id is stored — confirm intended.
                        video_id,
                        "sph",
                        NO_SOURCE_ACCOUNT,
                    ),
                )
                self.success_crawler_video_count += 1
            except Exception as e:
                # Fix: task was mislabeled "baidu_video_crawler" (copy-paste
                # from the Baidu crawler); label it with this task's name.
                log(
                    task="crawler_channel_account_videos",
                    function="save_each_video",
                    message="save video failed",
                    data={
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                        "video_id": video_id,
                        "oss_path": oss_path,
                    },
                )
        return
|