Toutiao crawler

luojunhui 1 month ago
commit 127d013941

File diff suppressed because it is too large
+ 7548 - 0
applications/js/toutiao.js


+ 4 - 0
applications/pipeline/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .crawler_pipeline import video_crawler_pipeline

+ 61 - 0
applications/pipeline/crawler_pipeline.py

@@ -0,0 +1,61 @@
+"""
+@author: luojunhui
+"""
+import os
+
+from applications.utils import download_gzh_video
+from applications.utils import download_toutiao_video
+from applications.utils import upload_to_oss
+
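+# sentinel returned when an item is skipped; callers must treat it as read-only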
+empty_dict = {}
+
+
+def whether_duplicate_video_title(video_title, db_client):
+    """
+    whether duplicate video title
+    """
+    sql = f"""
+            select id from publish_single_video_source
+            where article_title = %s;
+    """
+    duplicate_id = db_client.fetch(query=sql, params=(video_title,))
+    if duplicate_id:
+        return True
+    return False
+
+
+def video_crawler_pipeline(video_item, db_client) -> dict:
+    """
+    video crawler pipeline
+    """
+    # skip videos whose title already exists in publish_single_video_source
+    video_title = video_item['article_title']
+    if whether_duplicate_video_title(video_title, db_client):
+        return empty_dict
+
+    # TODO: filter sensitive words in the video title (not implemented yet)
+
+    # download video
+    article_url = video_item['article_url']
+    platform = video_item['platform']
+
+    match platform:
+        case "toutiao":
+            video_path = download_toutiao_video(article_url)
+        case "gzh":
+            video_path = download_gzh_video(article_url)
+        case "hksp":
+            video_path = ''
+        case "sph":
+            video_path = ''
+        case _:
+            return empty_dict
+
+    if video_path:
+        # upload video to oss
+        oss_path = upload_to_oss(video_path)
+        video_item['video_oss_path'] = oss_path
+        os.remove(video_path)
+        return video_item
+    else:
+        return empty_dict

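For reference, a minimal sketch of how this pipeline is driven (the dict keys are the ones the pipeline reads; the title and URL are placeholders, and real items carry more fields, see crawler_each_video further down):

    from applications.db import DatabaseConnector
    from applications.pipeline import video_crawler_pipeline
    from config import long_articles_config

    db_client = DatabaseConnector(db_config=long_articles_config)
    db_client.connect()

    item = {
        "article_title": "placeholder title",
        "article_url": "https://example.com/video.mp4",  # placeholder
        "platform": "toutiao",
    }
    # returns {} for a duplicate title or a failed download; otherwise the
    # same dict with "video_oss_path" filled in
    result = video_crawler_pipeline(video_item=item, db_client=db_client)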
+ 1 - 0
applications/utils/__init__.py

@@ -6,6 +6,7 @@ from .cold_start import get_inner_account_set
 from .common import *
 from .download_video import download_gzh_video
 from .download_video import download_sph_video
+from .download_video import download_toutiao_video
 from .item import Item
 from .save_to_db import insert_into_single_video_source_table
 from .upload import upload_to_oss

+ 17 - 0
applications/utils/common.py

@@ -20,3 +20,20 @@ def str_to_md5(strings):
     # get the MD5 hash as a hex string
     md5_value = md5_hash.hexdigest()
     return md5_value
+
+
+def proxy():
+    """
+    KuaiDaili tunnel proxy
+    """
+    # tunnel domain:port
+    tunnel = "j685.kdltps.com:15818"
+
+    # username/password authentication
+    username = "t14070979713487"
+    password = "hqwanfvy"
+    proxies = {
+        "http": f"http://{username}:{password}@{tunnel}/",
+        "https": f"http://{username}:{password}@{tunnel}/",
+    }
+    return proxies

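A quick connectivity check for the tunnel (a minimal sketch; httpbin.org is an assumed external echo service, not part of this repo):

    import requests
    from applications.utils import proxy

    # should report the tunnel's exit IP rather than the local one
    resp = requests.get("https://httpbin.org/ip", proxies=proxy(), timeout=10)
    print(resp.status_code, resp.json())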
+ 15 - 0
applications/utils/download_video.py

@@ -139,3 +139,18 @@ def decrypt_sph_video(video_path: str, key: int, save_path: str) -> None:
     except Exception as e:
         print(traceback.format_exc())
         raise RuntimeError(f"Decryption failed: {str(e)}") from e
+
+
+def download_toutiao_video(video_url: str) -> str:
+    """
+    download a toutiao video to static/ and return the local file path
+    """
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    response = requests.get(video_url, headers=headers, stream=True)
+    # fail fast on a non-2xx response instead of writing an error page to disk
+    response.raise_for_status()
+    # stream the body to disk in 8 KB chunks to avoid holding the video in memory
+    with open(save_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+
+    return save_path

+ 1 - 1
coldStartTasks/crawler/__init__.py

@@ -2,4 +2,4 @@
 @author: luojunhui
 """
 from .weixin_account_crawler import WeixinAccountCrawler
-from .weixin_video_crawler import WeixinVideoCrawler
+from .weixin_video_crawler import WeixinVideoCrawler

+ 4 - 0
coldStartTasks/crawler/toutiao/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .blogger import get_toutiao_account_video_list

+ 34 - 0
coldStartTasks/crawler/toutiao/blogger.py

@@ -0,0 +1,34 @@
+"""
+@author: luojunhui
+"""
+import requests
+
+from applications.utils import proxy
+from .use_js import call_js_function
+
+
+def get_toutiao_account_video_list(account_id: str, cookie: str, max_behot_time=0) -> dict:
+    """
+    get toutiao account video list
+    :param account_id: toutiao account id
+    :param cookie: toutiao web cookie (may expire; its lifetime is not guaranteed)
+    :param max_behot_time: pagination cursor; 0 fetches the newest page
+    :return: toutiao account video list
+    """
+    ms_token = 'mFs9gU4FJc23gFWPvBfQxFsBRrx1xBEJD_ZRTAolHfPrae84kTEBaHQR3s8ToiLX4-U9hgATTZ2cVHlSixmj5YCTOPoVM-43gOt3aVHkxfXHEuUtTJe-wUEs%3D'
+    query_params = [
+        0,
+        1,
+        14,
+        'category=pc_user_hot&token={}&aid=24&app_name=toutiao_web&msToken={}'.format(account_id, ms_token),
+        '',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
+    ]
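+    # a_bogus is the anti-bot signature parameter computed by applications/js/toutiao.js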
+    a_bogus = call_js_function(query_params)
+    url = f'https://www.toutiao.com/api/pc/list/user/feed?category=pc_profile_video&token={account_id}&max_behot_time={max_behot_time}&hot_video=0&entrance_gid=&aid=24&app_name=toutiao_web&msToken={ms_token}&a_bogus={a_bogus}'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+        'cookie': cookie
+    }
+    # timeout keeps a stalled tunnel connection from hanging the crawler
+    response = requests.get(url, headers=headers, proxies=proxy(), timeout=30)
+    return response.json()

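For reference, the response fields the crawler task below depends on (shape as consumed in tasks/crawler_toutiao_account_videos.py; the API itself is undocumented):

    resp = get_toutiao_account_video_list(account_id, cookie, max_behot_time=0)
    assert resp["message"] == "success"
    videos = resp["data"]                    # page of video dicts, newest first
    has_more = resp["has_more"]              # True while more pages remain
    cursor = resp["next"]["max_behot_time"]  # pass back in to fetch the next page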
+ 25 - 0
coldStartTasks/crawler/toutiao/use_js.py

@@ -0,0 +1,25 @@
+"""
+@author: luojunhui
+"""
+import json
+import subprocess
+
+from config import toutiao_js_path
+
+
+def call_js_function(arguments_list):
+    """
+    call js function
+    """
+    # serialize the argument list to JSON so the Node.js script can read it from argv
+    args_json = json.dumps(arguments_list)
+    # run the signer script with Node.js and capture its stdout
+    result = subprocess.run(
+        ['node', toutiao_js_path, args_json],
+        capture_output=True,
+        text=True
+    )
+    if result.returncode == 0:
+        return result.stdout.strip()
+    else:
+        raise Exception(f"Error: {result.stderr}")

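Because the wrapper just shells out to Node.js, the signer can also be exercised by hand; the bracketed values below are placeholders following the argument order used in blogger.py:

    node applications/js/toutiao.js '[0, 1, 14, "category=pc_user_hot&token=<account_id>&aid=24&app_name=toutiao_web&msToken=<ms_token>", "", "Mozilla/5.0 ..."]'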
+ 4 - 1
config/__init__.py

@@ -104,4 +104,7 @@ gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
 gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
 
 # sph decrypt key
-decrypt_key_path = 'applications/so/libsph_decrypt.so'
+decrypt_key_path = 'applications/so/libsph_decrypt.so'
+
+# toutiao js path
+toutiao_js_path = 'applications/js/toutiao.js'

+ 104 - 0
tasks/crawler_toutiao_account_videos.py

@@ -0,0 +1,104 @@
+"""
+@author: luojunhui
+"""
+import time
+from tqdm import tqdm
+
+from applications.db import DatabaseConnector
+from applications.pipeline import video_crawler_pipeline
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from config import apolloConfig, long_articles_config
+
+config = apolloConfig()
+cookie = config.getConfigValue("toutiao_blogger_cookie")
+
+
+class CrawlerToutiaoAccountVideos:
+    """
+    toutiao blogger crawler
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def get_account_list(self):
+        """
+        get account list
+        """
+        return
+
+    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
+        """
+        get each account video list
+        """
+        min_behot_time = 1704038400
+        current_cursor = max_behot_time
+        has_more = True
+
+        while has_more:
+            response = get_toutiao_account_video_list(account_id=account_id, cookie=cookie,
+                                                      max_behot_time=current_cursor)
+            if response['message'] != 'success':
+                print("error")
+                break
+
+            video_list = response['data']
+            has_more = response['has_more']
+            current_cursor = response['next']['max_behot_time']
+
+            if not video_list:
+                break
+
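+            # the feed is newest-first: once even the newest item in a page
+            # predates min_behot_time, stop paging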
+            max_timestamp_in_this_group = video_list[0]['publish_time']
+            if max_timestamp_in_this_group < min_behot_time:
+                break
+
+            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
+            for video in crawler_video_list_bar:
+                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
+                self.crawler_each_video(video)
+
+            if has_more:
+                time.sleep(3)
+            else:
+                break
+
+    def crawler_each_video(self, video_data):
+        """
+        crawler each video data
+        """
+        video_item = Item()
+        video_id = video_data['video_id']
+        title = video_data['title']
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video_data['user']['user_id'])
+        video_item.add("out_account_name", video_data['source'])
+        video_item.add("publish_timestamp", video_data['publish_time'])
+        video_item.add("platform", "toutiao")
+        video_item.add("read_cnt", video_data['read_count'])
+        media = video_data['video']
+        url = media["download_addr"]['url_list'][0]
+        video_item.add("article_url", url)
+        video_item.add("source_account", 0)
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = video_crawler_pipeline(
+                video_item=video_item.item,
+                db_client=self.db_client
+            )
+            # the pipeline returns an empty dict for duplicates and failed downloads
+            if item_with_oss_path:
+                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+        except Exception as e:
+            print(f"crawl video {video_id} failed: {e}")
+
+    def deal(self):
+        """
+        class entrance
+        """
+        account_id = 'MS4wLjABAAAAXp7v7A9VfXh-Pfo1TwejlJViATS7aqxuLnBHjaEb8tx1nDTLe7jF7KsNAR9RoVWk'
+        self.crawler_each_account_video_list(account_id)

+ 10 - 0
toutiao_video_crawler.py

@@ -0,0 +1,10 @@
+"""
+@author: luojunhui
+"""
+
+from tasks.crawler_toutiao_account_videos import CrawlerToutiaoAccountVideos
+
+
+if __name__ == '__main__':
+    crawler = CrawlerToutiaoAccountVideos()
+    crawler.deal()

Some files were not shown because too many files changed in this diff