пре 1 година · cd85375a03
--- a/applications/exception/spider_error.py
+++ b/applications/exception/spider_error.py
@@ -9,7 +9,7 @@ from applications import log
 
															 class SpiderError(Exception):
														
 
															     """数据库查询异常"""
														
 
															-    def __init__(self, error=None, spider=None, url=None):
														
 
															+    def __init__(self, platform=None, error=None, spider=None, url=None):
														
 
															         """
														
 
															         :param error: 异常对象，可选，用于提供更详细的错误信息。
														
 
															         :param spider: 爬虫任务
														
@@ -22,7 +22,8 @@ class SpiderError(Exception):
 
															         }
														
 
															         log(
														
 
															             task="spider_task",
														
 
															-            function="log_spider_error",
														
 
															+            function="{}".format(platform),
														
 
															+            message="{} 抓取失败".format(spider),
														
 
															             data=error_obj
														
 
															         )
														
 
															         super().__init__(json.dumps(error_obj, ensure_ascii=False, indent=4))
														
--- a/applications/functions.py
+++ b/applications/functions.py
@@ -296,3 +296,28 @@ class Functions(object):
 
															         params = parse_qs(urlparse(url).query)
														
 
															         info = params.get(key, [])
														
 
															         return info[0] if info else None
														
 
															+
														
 
															+    @classmethod
														
 
															+    def download_baidu_videos(cls, video_url, save_path):
														
 
															+        """
														
 
															+        :param video_url: baidu video url
														
 
															+        :param save_path: save path
														
 
															+        """
														
 
															+        if os.path.exists(save_path):
														
 
															+            return save_path
														
 
															+
														
 
															+        response = requests.get(
														
 
															+            video_url,
														
 
															+            headers={
														
 
															+                'User-Agent': FakeUserAgent().chrome,
														
 
															+                "Accept": "*/*",
														
 
															+                "Accept-Language": "zh-CN,zh;q=0.9"
														
 
															+            }
														
 
															+        )
														
 
															+        with open(save_path, 'wb') as f:
														
 
															+            f.write(response.content)
														
 
															+        TEN_KB = 1024 * 10
														
 
															+        if os.path.getsize(save_path) > TEN_KB:
														
 
															+            return save_path
														
 
															+        else:
														
 
															+            return None
														
--- a/coldStartTasks/crawler/baidu/__init__.py
+++ b/coldStartTasks/crawler/baidu/__init__.py
@@ -0,0 +1 @@
 
															+from .video_crawler import BaiduVideoCrawler
														
--- a/coldStartTasks/crawler/baidu/account_crawler.py
+++ b/coldStartTasks/crawler/baidu/account_crawler.py
@@ -0,0 +1,4 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+
														
--- a/coldStartTasks/crawler/baidu/baidu_spider.py
+++ b/coldStartTasks/crawler/baidu/baidu_spider.py
@@ -0,0 +1,123 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+import base64
														
 
															+import uuid
														
 
															+
														
 
															+import requests
														
 
															+
														
 
															+from fake_useragent import FakeUserAgent
														
 
															+
														
 
															+from applications.exception import SpiderError
														
 
															+from applications import Functions
														
 
															+
														
 
functions = Functions()  # shared helper instance; the crawlers below call functions.proxy() for request proxies
														
 
															+
														
 
															+
														
 
															+def baidu_account_video_crawler(account_id, cursor=None):
														
 
															+    """
														
 
															+    baidu account video crawler
														
 
															+    :param account_id: 百度账号id
														
 
															+    :param cursor: 游标, 默认为None，表示从最新的开始爬取
														
 
															+    success requests:
														
 
															+    {
														
 
															+        "errno": 0,
														
 
															+        "errmsg": "成功",
														
 
															+        "data": {
														
 
															+            "response_count": 10,
														
 
															+            "has_more": 1,
														
 
															+            "ctime" : timestamp_ms plus one integer,
														
 
															+            "results": [
														
 
															+                {
														
 
															+                "tplName": "video",
														
 
															+                "type": "video",
														
 
															+                "content": {
														
 
															+                    "vid": "6472901034127874496",
														
 
															+                    "publish_time": "昨天",
														
 
															+                    "title": "8年前妈妈囤黄金当彩礼，金价飙升后，我们全家乐开了花",
														
 
															+                    "cover_src": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
														
 
															+                    "cover_src_pc": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
														
 
															+                    "thumbnails": "https://gimg0.baidu.com/gimg/src=h&refer=http%3A%2F%2Fwww.baidu.com&app=0&size=f339,225&n=0&g=0n&q=80?sec=0&t=f01af5f96ffb6d0d1904b33cbc2e136b",
														
 
															+                    "duration": "03:15",
														
 
															+                    "poster": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
														
 
															+                    "playcnt": "1054",
														
 
															+                    "playcntText": "1054次播放"
														
 
															+                }
														
 
															+            }...
														
 
															+            ]
														
 
															+        }
														
 
															+    }
														
 
															+    """
														
 
															+    cookie_str = uuid.uuid4().__str__().replace('-', '').upper()
														
 
															+    url = "https://haokan.baidu.com/web/author/listall?"
														
 
															+    params = {
														
 
															+        'app_id': account_id,
														
 
															+        'ctime': cursor,
														
 
															+        'rn': 10,
														
 
															+        'searchAfter': '',
														
 
															+        '_api': 1
														
 
															+    }
														
 
															+    headers = {
														
 
															+        'Accept': '*/*',
														
 
															+        'Accept-Language': 'zh,zh-CN;q=0.9',
														
 
															+        'Connection': 'keep-alive',
														
 
															+        'Referer': 'https://haokan.baidu.com/author/{}'.format(account_id),
														
 
															+        'User-Agent': FakeUserAgent().chrome,
														
 
															+        'x-requested-with': 'xmlhttprequest',
														
 
															+        'Cookie': 'BAIDUID={}:FG=1; BAIDUID_BFESS={}:FG=1'.format(cookie_str, cookie_str)
														
 
															+    }
														
 
															+    try:
														
 
															+        response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
														
 
															+        response_json = response.json()
														
 
															+        if response_json['errmsg'] == '成功':
														
 
															+            response_data = response_json['data']
														
 
															+            return response_data
														
 
															+        else:
														
 
															+            raise SpiderError(
														
 
															+                platform="baidu",
														
 
															+                spider="account_video_crawler",
														
 
															+                error=response_json['errmsg'],
														
 
															+                url=url
														
 
															+            )
														
 
															+
														
 
															+    except Exception as e:
														
 
															+        raise SpiderError(
														
 
															+            platform="baidu",
														
 
															+            spider="account_video_crawler",
														
 
															+            error=str(e),
														
 
															+            url=url
														
 
															+        )
														
 
															+
														
 
															+
														
 
															+def baidu_single_video_crawler(video_id):
														
 
															+    """
														
 
															+    baidu video crawler
														
 
															+    :param video_id: 视频id
														
 
															+    """
														
 
															+    url = "https://haokan.baidu.com/v"
														
 
															+    params = {
														
 
															+        'vid': video_id,
														
 
															+        '_format': 'json'
														
 
															+    }
														
 
															+    base_64_string = base64.b64encode(str(uuid.uuid4()).encode()).decode()
														
 
															+    headers = {
														
 
															+        'Accept': '*/*',
														
 
															+        'cookie': "BIDUPSID={}".format(base_64_string),
														
 
															+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
														
 
															+        'Cache-Control': 'no-cache',
														
 
															+        'Connection': 'keep-alive',
														
 
															+        'Content-Type': 'application/x-www-form-urlencoded',
														
 
															+        'Referer': 'https://haokan.baidu.com',
														
 
															+        'User-Agent': FakeUserAgent().chrome,
														
 
															+    }
														
 
															+    try:
														
 
															+        response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
														
 
															+        response_json = response.json()
														
 
															+        return response_json['data']['apiData']['curVideoMeta']
														
 
															+    except Exception as e:
														
 
															+        raise SpiderError(
														
 
															+            platform="baidu",
														
 
															+            spider="single_video_crawler",
														
 
															+            error=str(e),
														
 
															+            url=url
														
 
															+        )
														
--- a/coldStartTasks/crawler/baidu/video_crawler.py
+++ b/coldStartTasks/crawler/baidu/video_crawler.py
@@ -0,0 +1,200 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+@description: video crawler
														
 
															+"""
														
 
															+import json
														
 
															+import time
														
 
															+
														
 
															+from pymysql.cursors import DictCursor
														
 
															+from tqdm import tqdm
														
 
															+
														
 
															+from applications import Functions
														
 
															+from applications.db import DatabaseConnector
														
 
															+from applications.exception import SpiderError
														
 
															+from config import long_articles_config
														
 
															+from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
														
 
															+from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
														
 
															+
														
 
															+empty_list = []
														
 
															+functions = Functions()
														
 
															+DEFAULT_CURSOR = 17040384000000  # 最早时间为2024-01-01 00:00:00
														
 
															+
														
 
															+
														
 
															+class BaiduVideoCrawler(object):
														
 
															+    """
														
 
															+    baidu video crawler
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        self.db = None
														
 
															+
														
 
															+    def connect_db(self):
														
 
															+        """
														
 
															+        connect db
														
 
															+        """
														
 
															+        self.db = DatabaseConnector(db_config=long_articles_config)
														
 
															+        self.db.connect()
														
 
															+
														
 
															+    def get_account_list(self):
														
 
															+        """
														
 
															+        get account list
														
 
															+        status = 1 表示正常抓取的账号
														
 
															+        """
														
 
															+        sql = f"""
														
 
															+            select account_id, account_name, latest_crawler_timestamp as max_cursor 
														
 
															+            from baidu_account_for_videos
														
 
															+            where status = 1;
														
 
															+        """
														
 
															+        account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
														
 
															+        return account_list
														
 
															+
														
 
															+    def whether_video_exists(self, video_id, title):
														
 
															+        """
														
 
															+        whether video exists, use video_id && title
														
 
															+        """
														
 
															+        # first check video_id
														
 
															+        sql_1 = f"""
														
 
															+            select id from publish_single_video_source
														
 
															+            where url_unique_md5 = '{video_id}';
														
 
															+        """
														
 
															+        count_1 = self.db.fetch(query=sql_1)
														
 
															+        if count_1:
														
 
															+            print(video_id + " video exists")
														
 
															+            return True
														
 
															+
														
 
															+        # check title
														
 
															+        sql_2 = f"""
														
 
															+            select id from publish_single_video_source
														
 
															+            where article_title = '{title}';
														
 
															+        """
														
 
															+        count_2 = self.db.fetch(query=sql_2)
														
 
															+        if count_2:
														
 
															+            print(title + " video exists")
														
 
															+            return True
														
 
															+
														
 
															+        return False
														
 
															+
														
 
															+    def save_each_video(self, video, account_id, account_name):
														
 
															+        """
														
 
															+        download and save each video
														
 
															+        """
														
 
															+        # print(json.dumps(video, ensure_ascii=False, indent=4))
														
 
															+        video_id = video['id']
														
 
															+        title = video['title']
														
 
															+
														
 
															+        # judge whether video exists
														
 
															+        if self.whether_video_exists(video_id, title):
														
 
															+            return
														
 
															+
														
 
															+        read_cnt = video.get('playcnt', 0)
														
 
															+        like_cnt = video.get('like_num', 0)
														
 
															+        publish_timestamp = video['publish_time']
														
 
															+        # duration = video['duration']
														
 
															+        cover_url = video['poster']
														
 
															+        video_url = video['playurl']
														
 
															+        # sensitive_flag = video.get('sensitive_flag')
														
 
															+        video_more_info = video.get('contentcms_intervene_data')
														
 
															+        if video_more_info:
														
 
															+            video_category_list = video_more_info.get('category_v2')
														
 
															+            if video_category_list:
														
 
															+                video_category = video_category_list[0]
														
 
															+            else:
														
 
															+                video_category = None
														
 
															+        else:
														
 
															+            video_category = None
														
 
															+        manual_tags = video.get('manual_tags')
														
 
															+
														
 
															+        video_path = 'static/{}.mp4'.format(video_id)
														
 
															+        download_path = functions.download_baidu_videos(video_url, video_path)
														
 
															+        if download_path:
														
 
															+            oss_path = functions.upload_to_oss(local_video_path=download_path)
														
 
															+            insert_sql = f"""
														
 
															+                INSERT INTO publish_single_video_source
														
 
															+                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
														
 
															+                values
														
 
															+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
														
 
															+            """
														
 
															+            try:
														
 
															+                self.db.save(
														
 
															+                    query=insert_sql,
														
 
															+                    params=(
														
 
															+                        "video{}".format(functions.str_to_md5(video_id)),
														
 
															+                        title,
														
 
															+                        account_id,
														
 
															+                        account_name,
														
 
															+                        read_cnt,
														
 
															+                        like_cnt,
														
 
															+                        video_url,
														
 
															+                        cover_url,
														
 
															+                        oss_path,
														
 
															+                        publish_timestamp,
														
 
															+                        int(time.time()),
														
 
															+                        video_id,
														
 
															+                        video_category,
														
 
															+                        json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None,
														
 
															+                        "baidu",
														
 
															+                        0
														
 
															+                    )
														
 
															+                )
														
 
															+            except Exception as e:
														
 
															+                print(e)
														
 
															+        else:
														
 
															+            print(f"download video failed, video_id: {video_id}")
														
 
															+
														
 
															+    def save_video_list(self, account_id, account_name, video_list):
														
 
															+        """
														
 
															+        save video list
														
 
															+        """
														
 
															+        # print(json.dumps(video_list, ensure_ascii=False, indent=4))
														
 
															+        for video_obj in tqdm(video_list, desc="save video list"):
														
 
															+            if video_obj['type'] == 'video':
														
 
															+                video_id = video_obj['content']['vid']
														
 
															+                try:
														
 
															+                    video_detail = baidu_single_video_crawler(video_id)
														
 
															+                    self.save_each_video(video_detail, account_id=account_id, account_name=account_name)
														
 
															+                except SpiderError as e:
														
 
															+                    print(e)
														
 
															+                    continue
														
 
															+            else:
														
 
															+                continue
														
 
															+
														
 
															+    def crawler_each_account(self, account, cursor=None):
														
 
															+        """
														
 
															+        crawler each account
														
 
															+        response_strategy
														
 
															+        """
														
 
															+        account_id = account['account_id']
														
 
															+        max_cursor = account['max_cursor']
														
 
															+        if not max_cursor:
														
 
															+            max_cursor = DEFAULT_CURSOR
														
 
															+        account_name = account['account_name']
														
 
															+        try:
														
 
															+            response_json = baidu_account_video_crawler(account_id, cursor=cursor)
														
 
															+
														
 
															+            video_list = response_json.get("results", empty_list)
														
 
															+            if video_list:
														
 
															+                self.save_video_list(
														
 
															+                    account_id=account_id,
														
 
															+                    account_name=account_name,
														
 
															+                    video_list=video_list
														
 
															+                )
														
 
															+            # check next page
														
 
															+            has_next_page = response_json.get("has_more", False)
														
 
															+            if has_next_page:
														
 
															+                next_cursor = response_json.get("ctime", DEFAULT_CURSOR)
														
 
															+                if next_cursor < max_cursor:
														
 
															+                    print("No more videos after 2024-01-01")
														
 
															+                    return
														
 
															+                else:
														
 
															+                    return self.crawler_each_account(account, next_cursor)
														
 
															+        except SpiderError as e:
														
 
															+            print(e)
														
 
															+            return
														
 
															+
														
 
															+    def deal(self):
														
 
															+        """
														
 
															+        deal
														
 
															+        """
														
 
															+        account_list = self.get_account_list()
														
 
															+        for account in account_list[1:]:
														
 
															+            self.crawler_each_account(account)
														
--- a/run_baidu_video_crawler.py
+++ b/run_baidu_video_crawler.py
@@ -0,0 +1,9 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+from coldStartTasks.crawler.baidu import BaiduVideoCrawler
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    task = BaiduVideoCrawler()
														
 
															+    task.connect_db()
														
 
															+    task.deal()
	`@@ -0,0 +1 @@`
			`+from .video_crawler import BaiduVideoCrawler`