|  | @@ -2,13 +2,17 @@
 | 
	
		
			
				|  |  |  @author: luojunhui
 | 
	
		
			
				|  |  |  @tool: pycharm && deepseek
 | 
	
		
			
				|  |  |  """
 | 
	
		
			
				|  |  | -import json
 | 
	
		
			
				|  |  | -import time
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +import os
 | 
	
		
			
				|  |  |  import traceback
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +from tqdm import tqdm
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  from applications import log
 | 
	
		
			
				|  |  |  from applications.db import DatabaseConnector
 | 
	
		
			
				|  |  |  from applications.utils import download_sph_video
 | 
	
		
			
				|  |  | +from applications.utils import insert_into_single_video_source_table
 | 
	
		
			
				|  |  | +from applications.utils import Item
 | 
	
		
			
				|  |  |  from applications.utils import str_to_md5
 | 
	
		
			
				|  |  |  from applications.utils import upload_to_oss
 | 
	
		
			
				|  |  |  from config import long_articles_config
 | 
	
	
		
			
				|  | @@ -21,6 +25,7 @@ class CrawlerChannelAccountVideos:
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  |      crawler channel account videos
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      def __init__(self):
 | 
	
		
			
				|  |  |          self.db_client = DatabaseConnector(db_config=long_articles_config)
 | 
	
		
			
				|  |  |          self.db_client.connect()
 | 
	
	
		
			
				|  | @@ -48,73 +53,63 @@ class CrawlerChannelAccountVideos:
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          return
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +    def crawler_each_video(self, video: dict):
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        download each video
 | 
	
		
			
				|  |  | +        save video and decrypt video
 | 
	
		
			
				|  |  | +        upload video to oss
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        object_desc = video["objectDesc"]
 | 
	
		
			
				|  |  | +        title = object_desc["description"]
 | 
	
		
			
				|  |  | +        if self.whether_video_exists(title):
 | 
	
		
			
				|  |  | +            return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        video_item = Item()
 | 
	
		
			
				|  |  | +        video_item.add("content_trace_id", "video{}".format(str_to_md5(video["id"])))
 | 
	
		
			
				|  |  | +        video_item.add("url_unique_md5", video["id"])
 | 
	
		
			
				|  |  | +        video_item.add("article_title", title)
 | 
	
		
			
				|  |  | +        video_item.add("out_account_id", video["username"])
 | 
	
		
			
				|  |  | +        video_item.add("out_account_name", video["nickname"])
 | 
	
		
			
				|  |  | +        video_item.add("publish_timestamp", video["createtime"])
 | 
	
		
			
				|  |  | +        media = object_desc["media"][0]
 | 
	
		
			
				|  |  | +        url = media["Url"]
 | 
	
		
			
				|  |  | +        decode_key = media["decodeKey"]
 | 
	
		
			
				|  |  | +        url_token = media["urlToken"]
 | 
	
		
			
				|  |  | +        download_url = url + url_token
 | 
	
		
			
				|  |  | +        try:
 | 
	
		
			
				|  |  | +            decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
 | 
	
		
			
				|  |  | +            oss_path = upload_to_oss(decrypt_path)
 | 
	
		
			
				|  |  | +            video_item.add("video_oss_path", oss_path)
 | 
	
		
			
				|  |  | +            video_item.add("source_account", NO_SOURCE_ACCOUNT)
 | 
	
		
			
				|  |  | +            video_item.check(source="video")
 | 
	
		
			
				|  |  | +            insert_into_single_video_source_table(self.db_client, video_item.item)
 | 
	
		
			
				|  |  | +            os.remove(decrypt_path)
 | 
	
		
			
				|  |  | +        except Exception as e:
 | 
	
		
			
				|  |  | +            log(
 | 
	
		
			
				|  |  | +                task="crawler_channel_account_videos",
 | 
	
		
			
				|  |  | +                function="crawler_each_video",
 | 
	
		
			
				|  |  | +                message="download video failed",
 | 
	
		
			
				|  |  | +                data={
 | 
	
		
			
				|  |  | +                    "error": str(e),
 | 
	
		
			
				|  |  | +                    "traceback": traceback.format_exc(),
 | 
	
		
			
				|  |  | +                    "video_id": video["id"],
 | 
	
		
			
				|  |  | +                },
 | 
	
		
			
				|  |  | +            )
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      def crawler_each_account(self, channel_account_id: str, channel_account_name: str):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          get channel account videos
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          response = get_channel_account_videos(channel_account_id)
 | 
	
		
			
				|  |  | -        if response['ret'] == 200:
 | 
	
		
			
				|  |  | -            response_data = response['data']
 | 
	
		
			
				|  |  | -            last_buffer = response_data['lastBuffer']
 | 
	
		
			
				|  |  | -            continue_flag = response_data['continueFlag']
 | 
	
		
			
				|  |  | -            video_list = response_data['object']
 | 
	
		
			
				|  |  | -            for video in video_list:
 | 
	
		
			
				|  |  | -                video_id = str(video['id'])
 | 
	
		
			
				|  |  | -                account_name = video['nickname']
 | 
	
		
			
				|  |  | -                object_desc = video['objectDesc']
 | 
	
		
			
				|  |  | -                publish_timestamp = video['createtime']
 | 
	
		
			
				|  |  | -                title = object_desc['description']
 | 
	
		
			
				|  |  | -                if self.whether_video_exists(title):
 | 
	
		
			
				|  |  | -                    continue
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -                media = object_desc['media'][0]
 | 
	
		
			
				|  |  | -                url = media['Url']
 | 
	
		
			
				|  |  | -                decode_key = media['decodeKey']
 | 
	
		
			
				|  |  | -                url_token = media['urlToken']
 | 
	
		
			
				|  |  | -                download_url = url + url_token
 | 
	
		
			
				|  |  | -                try:
 | 
	
		
			
				|  |  | -                    decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
 | 
	
		
			
				|  |  | -                    oss_path = upload_to_oss(decrypt_path)
 | 
	
		
			
				|  |  | -                    insert_sql = f"""
 | 
	
		
			
				|  |  | -                        insert into publish_single_video_source
 | 
	
		
			
				|  |  | -                        (content_trace_id, article_title, out_account_id, out_account_name, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, platform, source_account)
 | 
	
		
			
				|  |  | -                        values
 | 
	
		
			
				|  |  | -                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
 | 
	
		
			
				|  |  | -                    """
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -                    try:
 | 
	
		
			
				|  |  | -                        self.db_client.save(
 | 
	
		
			
				|  |  | -                            query=insert_sql,
 | 
	
		
			
				|  |  | -                            params=(
 | 
	
		
			
				|  |  | -                                "video{}".format(str_to_md5(video_id)),
 | 
	
		
			
				|  |  | -                                title,
 | 
	
		
			
				|  |  | -                                channel_account_id,
 | 
	
		
			
				|  |  | -                                account_name,
 | 
	
		
			
				|  |  | -                                oss_path,
 | 
	
		
			
				|  |  | -                                publish_timestamp,
 | 
	
		
			
				|  |  | -                                int(time.time()),
 | 
	
		
			
				|  |  | -                                video_id,
 | 
	
		
			
				|  |  | -                                "sph",
 | 
	
		
			
				|  |  | -                                NO_SOURCE_ACCOUNT
 | 
	
		
			
				|  |  | -                            ),
 | 
	
		
			
				|  |  | -                        )
 | 
	
		
			
				|  |  | -                        self.success_crawler_video_count += 1
 | 
	
		
			
				|  |  | -                    except Exception as e:
 | 
	
		
			
				|  |  | -                        log(
 | 
	
		
			
				|  |  | -                            task="baidu_video_crawler",
 | 
	
		
			
				|  |  | -                            function="save_each_video",
 | 
	
		
			
				|  |  | -                            message="save video failed",
 | 
	
		
			
				|  |  | -                            data={
 | 
	
		
			
				|  |  | -                                "error": str(e),
 | 
	
		
			
				|  |  | -                                "traceback": traceback.format_exc(),
 | 
	
		
			
				|  |  | -                                "video_id": video_id,
 | 
	
		
			
				|  |  | -                                "oss_path": oss_path,
 | 
	
		
			
				|  |  | -                            },
 | 
	
		
			
				|  |  | -                        )
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -                except Exception as e:
 | 
	
		
			
				|  |  | -                    print("download video error:", e)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | +        if response["ret"] == 200:
 | 
	
		
			
				|  |  | +            response_data = response["data"]
 | 
	
		
			
				|  |  | +            last_buffer = response_data["lastBuffer"]
 | 
	
		
			
				|  |  | +            continue_flag = response_data["continueFlag"]
 | 
	
		
			
				|  |  | +            video_list = response_data["object"]
 | 
	
		
			
				|  |  | +            crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
 | 
	
		
			
				|  |  | +            for video in crawl_video_list_bar:
 | 
	
		
			
				|  |  | +                crawl_video_list_bar.set_postfix({"video_id": video["id"]})
 | 
	
		
			
				|  |  | +                self.crawler_each_video(video)
 | 
	
		
			
				|  |  |          else:
 | 
	
		
			
				|  |  |              print(f"crawler channel account {channel_account_name} videos failed")
 | 
	
		
			
				|  |  | -            return
 | 
	
		
			
				|  |  | +            return
 |