
change backup url

luojunhui 1 month ago
Commit
37acd32531

+ 5 - 1
applications/utils/__init__.py

@@ -1,4 +1,8 @@
 """
 utils
 """
-from .cold_start import *
+from .cold_start import *
+from .common import *
+from .download_video import download_gzh_video
+from .download_video import download_sph_video
+from .upload import upload_to_oss
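
With these re-exports in place, callers can pull the new helpers straight from applications.utils. A minimal usage sketch (the article URL below is a placeholder):

    from applications.utils import download_gzh_video, upload_to_oss

    # Hypothetical article URL; download_gzh_video returns a local path or None.
    local_path = download_gzh_video("https://mp.weixin.qq.com/s/EXAMPLE")
    if local_path:
        oss_key = upload_to_oss(local_path)
        print("uploaded as", oss_key)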

+ 21 - 0
applications/utils/common.py

@@ -0,0 +1,21 @@
+"""
+@author: luojunhui
+"""
+import hashlib
+
+
+def str_to_md5(strings):
+    """
+    字符串转化为 md5 值
+    :param strings:
+    :return:
+    """
+    # 将字符串转换为字节
+    original_bytes = strings.encode('utf-8')
+    # 创建一个md5 hash对象
+    md5_hash = hashlib.md5()
+    # 更新hash对象,传入原始字节
+    md5_hash.update(original_bytes)
+    # 获取16进制形式的MD5哈希值
+    md5_value = md5_hash.hexdigest()
+    return md5_value
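
str_to_md5 is a thin wrapper over hashlib, so for any UTF-8 string it matches the usual one-liner; a quick check with a known digest:

    import hashlib

    assert str_to_md5("hello") == hashlib.md5("hello".encode("utf-8")).hexdigest()
    # both yield '5d41402abc4b2a76b9719d911017c592'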

+ 142 - 0
applications/utils/download_video.py

@@ -0,0 +1,142 @@
+"""
+@author: luojunhui
+"""
+import os
+import re
+import html
+from uuid import uuid4
+
+import cffi
+import requests
+from fake_useragent import FakeUserAgent
+
+from applications.utils.common import str_to_md5
+from config import decrypt_key_path
+
+headers = {
+    'Content-Type': 'application/json',
+    'User-Agent': FakeUserAgent().chrome
+}
+
+
+def extract_video_url_from_article(article_url):
+    """
+    Extract the real video URL from a WeChat article page.
+    :param article_url:
+    :return:
+    """
+    response = requests.get(
+        url=article_url,
+        headers={'User-Agent': FakeUserAgent().random},
+    )
+    html_text = response.text
+    # The embedded URL is stored with \xNN hex escapes; decode those, then unescape HTML entities.
+    w = re.search(
+        r"mp_video_trans_info.*url:\s*\(\'(.*?)\'\)\.replace", html_text, re.S | re.M
+    ).group(1)
+    url = html.unescape(
+        re.sub(
+            r"\\x[0-9a-fA-F]{2}", lambda x: bytes.fromhex(x.group().replace("\\x", "")).decode(), w
+        )
+    )
+    return url
+
+
+def download_gzh_video(article_url):
+    """
+    下载公众号视频
+    :param article_url:
+    :return:
+    """
+    try:
+        video_url = extract_video_url_from_article(article_url)
+    except Exception as e:
+        return
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh,zh-CN;q=0.9',
+        'Connection': 'keep-alive',
+        'Origin': 'https://mp.weixin.qq.com',
+        'Referer': 'https://mp.weixin.qq.com/',
+        'Sec-Fetch-Dest': 'video',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'cross-site',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"'
+    }
+    res = requests.get(video_url, headers=headers)
+    with open(save_path, "wb") as f:
+        f.write(res.content)
+
+    # Treat anything at or below 10 KB as a failed or placeholder download.
+    TEN_KB = 1024 * 10
+    if os.path.getsize(save_path) > TEN_KB:
+        return save_path
+    else:
+        return None
+
+
+def download_sph_video(download_url, key):
+    """
+    download video, decrypt video and save to local
+    """
+    file_id = uuid4().hex
+    encrypted_path = f"static/encrypted_{file_id}.mp4"
+    decrypted_path = f"static/decrypted_{file_id}.mp4"
+
+    try:
+        with requests.get(download_url, headers=headers, stream=True) as response:
+            response.raise_for_status()
+
+            with open(encrypted_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:  # filter out keep-alive chunks
+                        f.write(chunk)
+
+        decrypt_sph_video(encrypted_path, key, decrypted_path)
+        os.remove(encrypted_path)
+        return decrypted_path
+
+    except Exception as e:
+        for path in [encrypted_path, decrypted_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
+        raise RuntimeError(f"Video processing failed: {str(e)}") from e
+
+
+def decrypt_sph_video(video_path: str, key: int, save_path: str) -> None:
+    """
+    Decrypt video file using C library.
+    Args:
+        video_path: Path to encrypted video file
+        key: 32-bit unsigned integer decryption key
+        save_path: Path to save decrypted video
+    Raises:
+        RuntimeError: If decryption fails
+    """
+    ffi = cffi.FFI()
+    ffi.cdef('void decrypt(unsigned char *data, const size_t data_length, const uint32_t key);')
+
+    try:
+        lib = ffi.dlopen(decrypt_key_path)
+
+        with open(video_path, 'rb') as f:
+            encrypted_data = f.read()
+
+        c_data = ffi.new('unsigned char[]', list(encrypted_data))
+        # The cipher covers at most the first 128 KiB; clamp the length so shorter files don't overrun the buffer.
+        lib.decrypt(c_data, min(len(encrypted_data), 2 ** 17), key)
+        decrypted_data = bytes(ffi.buffer(c_data, len(encrypted_data))[:])
+
+        with open(save_path, 'wb') as f:
+            f.write(decrypted_data)
+
+    except Exception as e:
+        raise RuntimeError(f"Decryption failed: {str(e)}") from e
+
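
A quick way to exercise download_gzh_video end to end; note that the save paths are hard-coded under static/, which has to exist beforehand, and the URL below is a placeholder (extract_video_url_from_article only matches a real article page):

    import os

    os.makedirs("static", exist_ok=True)  # save paths are hard-coded under static/
    path = download_gzh_video("https://mp.weixin.qq.com/s/EXAMPLE")  # placeholder URL
    if path is None:
        print("extraction failed or the file was under the 10 KB threshold")
    else:
        print(f"saved {os.path.getsize(path)} bytes to {path}")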

+ 23 - 0
applications/utils/upload.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+import oss2
+from uuid import uuid4
+
+
+def upload_to_oss(local_video_path):
+    """
+    把视频上传到 oss
+    :return:
+    """
+    oss_video_key = "long_articles/video/" + str(uuid4())
+    access_key_id = "LTAIP6x1l3DXfSxm"
+    access_key_secret = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
+    endpoint = "oss-cn-hangzhou.aliyuncs.com"
+    bucket_name = "art-pubbucket"
+    bucket = oss2.Bucket(
+        oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name
+    )
+    bucket.put_object_from_file(key=oss_video_key, filename=local_video_path)
+    return oss_video_key
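
upload_to_oss returns the object key rather than a URL. Assuming the bucket allows public reads, the uploaded object would be reachable at the standard OSS virtual-hosted-style address:

    oss_key = upload_to_oss("static/example.mp4")  # hypothetical local file
    video_url = f"https://art-pubbucket.oss-cn-hangzhou.aliyuncs.com/{oss_key}"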

+ 4 - 1
config/__init__.py

@@ -92,4 +92,7 @@ moon_shot = {
 
 #GeWe
 gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
-gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+
+# sph decrypt key
+decrypt_key_path = 'applications/so/libsph_decrypt.so'
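
Because decrypt_key_path is relative, it resolves against the process working directory; a minimal sanity check before starting the crawler (assumes it is launched from the repository root):

    import os
    from config import decrypt_key_path

    assert os.path.exists(decrypt_key_path), f"missing shared library: {decrypt_key_path}"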

+ 9 - 0
crawler_sph_video.py

@@ -0,0 +1,9 @@
+"""
+@author: luojunhui
+"""
+from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
+
+if __name__ == '__main__':
+    crawler_channel_account_videos = CrawlerChannelAccountVideos()
+    account_id = 'v2_060000231003b20faec8c5eb8a1cc3d1c902e43cb0774ec288165f96c810e3553f5069c92d73@finder'
+    crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")

+ 2 - 1
requirements.txt

@@ -21,4 +21,5 @@ openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
 playwright~=1.49.1
-volcengine-python-sdk[ark]
+volcengine-python-sdk[ark]
+tenacity~=9.0.0

+ 54 - 8
tasks/crawler_channel_account_videos.py

@@ -3,11 +3,19 @@
 @tool: pycharm && deepseek
 """
 import json
+import time
+import traceback
 
+from applications import log
 from applications.db import DatabaseConnector
+from applications.utils import download_sph_video
+from applications.utils import str_to_md5
+from applications.utils import upload_to_oss
 from config import long_articles_config
 from coldStartTasks.crawler.channels import get_channel_account_videos
 
+NO_SOURCE_ACCOUNT = 0
+
 
 class CrawlerChannelAccountVideos:
     """
@@ -16,6 +24,7 @@ class CrawlerChannelAccountVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
+        self.success_crawler_video_count = 0
 
     def get_channel_account_list(self):
         """
@@ -37,19 +46,56 @@ class CrawlerChannelAccountVideos:
                 video_id = video['id']
                 account_name = video['nickname']
                 object_desc = video['objectDesc']
+                publish_timestamp = video['createtime']
                 title = object_desc['description']
                 media = object_desc['media'][0]
                 url = media['Url']
                 decode_key = media['decodeKey']
                 url_token = media['urlToken']
                 download_url = url + url_token
-                print(json.dumps(video, ensure_ascii=False, indent=4))
-        else:
-            print(f"crawler channel account {channel_account_name} videos failed")
-            return
+                try:
+                    decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
+                    oss_path = upload_to_oss(decrypt_path)
+                    insert_sql = f"""
+                        insert into publish_single_video_source
+                        (content_trace_id, article_title, out_account_id, out_account_name, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, platform, source_account)
+                        values
+                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                    """
 
+                    try:
+                        self.db_client.save(
+                            query=insert_sql,
+                            params=(
+                                "video{}".format(str_to_md5(video_id)),
+                                title,
+                                channel_account_id,
+                                account_name,
+                                oss_path,
+                                publish_timestamp,
+                                int(time.time()),
+                                video_id,
+                                "sph",
+                                NO_SOURCE_ACCOUNT
+                            ),
+                        )
+                        self.success_crawler_video_count += 1
+                    except Exception as e:
+                        log(
+                            task="baidu_video_crawler",
+                            function="save_each_video",
+                            message="save video failed",
+                            data={
+                                "error": str(e),
+                                "traceback": traceback.format_exc(),
+                                "video_id": video_id,
+                                "oss_path": oss_path,
+                            },
+                        )
 
-if __name__ == '__main__':
-    crawler_channel_account_videos = CrawlerChannelAccountVideos()
-    account_id = 'v2_060000231003b20faec8c5eb8a1cc3d1c902e43cb0774ec288165f96c810e3553f5069c92d73@finder'
-    crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")
+                except Exception as e:
+                    print("download video error:", e)
+
+        else:
+            print(f"crawler channel account {channel_account_name} videos failed")
+            return
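
For reference, crawler_each_account relies on each video item in the API response carrying the fields read above; the expected shape, reconstructed from the parsing code with placeholder values:

    video = {
        "id": "export_video_id",            # stored as url_unique_md5
        "nickname": "account nickname",     # out_account_name
        "createtime": 1735689600,           # publish_timestamp
        "objectDesc": {
            "description": "video title",   # article_title
            "media": [{
                "Url": "https://finder.video.qq.com/...",  # placeholder
                "urlToken": "&token=...",                  # appended to Url
                "decodeKey": 123456789,                    # key for download_sph_video
            }],
        },
    }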