
change backup url

luojunhui 1 month ago
Commit
37acd32531

+ 5 - 1
applications/utils/__init__.py

@@ -1,4 +1,8 @@
 """
 utils
 """
-from .cold_start import *
+from .cold_start import *
+from .common import *
+from .download_video import download_gzh_video
+from .download_video import download_sph_video
+from .upload import upload_to_oss
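
With these re-exports in place, callers can pull the new helpers straight from applications.utils. A minimal usage sketch (the article URL below is a placeholder):

    from applications.utils import download_gzh_video, upload_to_oss

    # Hypothetical article URL; download_gzh_video returns a local path or None.
    local_path = download_gzh_video("https://mp.weixin.qq.com/s/EXAMPLE")
    if local_path:
        oss_key = upload_to_oss(local_path)
        print("uploaded as", oss_key)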

+ 21 - 0
applications/utils/common.py

@@ -0,0 +1,21 @@
+"""
+@author: luojunhui
+"""
+import hashlib
+
+
+def str_to_md5(strings):
+    """
+    字符串转化为 md5 值
+    :param strings:
+    :return:
+    """
+    # 将字符串转换为字节
+    original_bytes = strings.encode('utf-8')
+    # 创建一个md5 hash对象
+    md5_hash = hashlib.md5()
+    # 更新hash对象,传入原始字节
+    md5_hash.update(original_bytes)
+    # 获取16进制形式的MD5哈希值
+    md5_value = md5_hash.hexdigest()
+    return md5_value
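
str_to_md5 is a thin wrapper over hashlib, so for any UTF-8 string it matches the usual one-liner; a quick check with a known digest:

    import hashlib

    assert str_to_md5("hello") == hashlib.md5("hello".encode("utf-8")).hexdigest()
    # both yield '5d41402abc4b2a76b9719d911017c592'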

+ 142 - 0
applications/utils/download_video.py

@@ -0,0 +1,142 @@
+"""
+@author: luojunhui
+"""
+import os
+import re
+import html
+from uuid import uuid4
+
+import cffi
+import requests
+from fake_useragent import FakeUserAgent
+
+from applications.utils.common import str_to_md5
+from config import decrypt_key_path
+
+headers = {
+    'Content-Type': 'application/json',
+    'User-Agent': FakeUserAgent().chrome
+}
+
+
+def extract_video_url_from_article(article_url):
+    """
+    Extract the real video URL from a WeChat article page.
+    :param article_url:
+    :return:
+    """
+    response = requests.get(
+        url=article_url,
+        headers={'User-Agent': FakeUserAgent().random},
+    )
+    html_text = response.text
+    # The embedded URL is stored with \xNN hex escapes; decode those, then unescape HTML entities.
+    w = re.search(
+        r"mp_video_trans_info.*url:\s*\(\'(.*?)\'\)\.replace", html_text, re.S | re.M
+    ).group(1)
+    url = html.unescape(
+        re.sub(
+            r"\\x[0-9a-fA-F]{2}", lambda x: bytes.fromhex(x.group().replace("\\x", "")).decode(), w
+        )
+    )
+    return url
+
+
+def download_gzh_video(article_url):
+    """
+    下载公众号视频
+    :param article_url:
+    :return:
+    """
+    try:
+        video_url = extract_video_url_from_article(article_url)
+    except Exception as e:
+        return
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh,zh-CN;q=0.9',
+        'Connection': 'keep-alive',
+        'Origin': 'https://mp.weixin.qq.com',
+        'Referer': 'https://mp.weixin.qq.com/',
+        'Sec-Fetch-Dest': 'video',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'cross-site',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"'
+    }
+    res = requests.get(video_url, headers=headers)
+    with open(save_path, "wb") as f:
+        f.write(res.content)
+
+    # Treat anything at or below 10 KB as a failed or placeholder download.
+    TEN_KB = 1024 * 10
+    if os.path.getsize(save_path) > TEN_KB:
+        return save_path
+    else:
+        return None
+
+
+def download_sph_video(download_url, key):
+    """
+    download video, decrypt video and save to local
+    """
+    file_id = uuid4().hex
+    encrypted_path = f"static/encrypted_{file_id}.mp4"
+    decrypted_path = f"static/decrypted_{file_id}.mp4"
+
+    try:
+        with requests.get(download_url, headers=headers, stream=True) as response:
+            response.raise_for_status()
+
+            with open(encrypted_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:  # filter out keep-alive chunks
+                        f.write(chunk)
+
+        decrypt_sph_video(encrypted_path, key, decrypted_path)
+        os.remove(encrypted_path)
+        return decrypted_path
+
+    except Exception as e:
+        for path in [encrypted_path, decrypted_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
+        raise RuntimeError(f"Video processing failed: {str(e)}") from e
+
+
+def decrypt_sph_video(video_path: str, key: int, save_path: str) -> None:
+    """
+    Decrypt video file using C library.
+    Args:
+        video_path: Path to encrypted video file
+        key: 32-bit unsigned integer decryption key
+        save_path: Path to save decrypted video
+    Raises:
+        RuntimeError: If decryption fails
+    """
+    ffi = cffi.FFI()
+    ffi.cdef('void decrypt(unsigned char *data, const size_t data_length, const uint32_t key);')
+
+    try:
+        lib = ffi.dlopen(decrypt_key_path)
+
+        with open(video_path, 'rb') as f:
+            encrypted_data = f.read()
+
+        c_data = ffi.new('unsigned char[]', list(encrypted_data))
+        # The cipher covers at most the first 128 KiB; clamp the length so shorter files don't overrun the buffer.
+        lib.decrypt(c_data, min(len(encrypted_data), 2 ** 17), key)
+        decrypted_data = bytes(ffi.buffer(c_data, len(encrypted_data))[:])
+
+        with open(save_path, 'wb') as f:
+            f.write(decrypted_data)
+
+    except Exception as e:
+        raise RuntimeError(f"Decryption failed: {str(e)}") from e
+
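
A quick way to exercise download_gzh_video end to end; note that the save paths are hard-coded under static/, which has to exist beforehand, and the URL below is a placeholder (extract_video_url_from_article only matches a real article page):

    import os

    os.makedirs("static", exist_ok=True)  # save paths are hard-coded under static/
    path = download_gzh_video("https://mp.weixin.qq.com/s/EXAMPLE")  # placeholder URL
    if path is None:
        print("extraction failed or the file was under the 10 KB threshold")
    else:
        print(f"saved {os.path.getsize(path)} bytes to {path}")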

+ 23 - 0
applications/utils/upload.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+import oss2
+from uuid import uuid4
+
+
+def upload_to_oss(local_video_path):
+    """
+    把视频上传到 oss
+    :return:
+    """
+    oss_video_key = "long_articles/video/" + str(uuid4())
+    access_key_id = "LTAIP6x1l3DXfSxm"
+    access_key_secret = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
+    endpoint = "oss-cn-hangzhou.aliyuncs.com"
+    bucket_name = "art-pubbucket"
+    bucket = oss2.Bucket(
+        oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name
+    )
+    bucket.put_object_from_file(key=oss_video_key, filename=local_video_path)
+    return oss_video_key
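
upload_to_oss returns the object key rather than a URL. Assuming the bucket allows public reads, the uploaded object would be reachable at the standard OSS virtual-hosted-style address:

    oss_key = upload_to_oss("static/example.mp4")  # hypothetical local file
    video_url = f"https://art-pubbucket.oss-cn-hangzhou.aliyuncs.com/{oss_key}"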

+ 4 - 1
config/__init__.py

@@ -92,4 +92,7 @@ moon_shot = {
 
 #GeWe
 gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
-gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+
+# sph decrypt key
+decrypt_key_path = 'applications/so/libsph_decrypt.so'
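
Because decrypt_key_path is relative, it resolves against the process working directory; a minimal sanity check before starting the crawler (assumes it is launched from the repository root):

    import os
    from config import decrypt_key_path

    assert os.path.exists(decrypt_key_path), f"missing shared library: {decrypt_key_path}"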

+ 9 - 0
crawler_sph_video.py

@@ -0,0 +1,9 @@
+"""
+@author: luojunhui
+"""
+from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
+
+if __name__ == '__main__':
+    crawler_channel_account_videos = CrawlerChannelAccountVideos()
+    account_id = 'v2_060000231003b20faec8c5eb8a1cc3d1c902e43cb0774ec288165f96c810e3553f5069c92d73@finder'
+    crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")

+ 2 - 1
requirements.txt

@@ -21,4 +21,5 @@ openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
 playwright~=1.49.1
-volcengine-python-sdk[ark]
+volcengine-python-sdk[ark]
+tenacity~=9.0.0

+ 54 - 8
tasks/crawler_channel_account_videos.py

@@ -3,11 +3,19 @@
 @tool: pycharm && deepseek
 """
 import json
+import time
+import traceback
 
+from applications import log
 from applications.db import DatabaseConnector
+from applications.utils import download_sph_video
+from applications.utils import str_to_md5
+from applications.utils import upload_to_oss
 from config import long_articles_config
 from coldStartTasks.crawler.channels import get_channel_account_videos
 
+NO_SOURCE_ACCOUNT = 0
+
 
 class CrawlerChannelAccountVideos:
     """
@@ -16,6 +24,7 @@ class CrawlerChannelAccountVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
+        self.success_crawler_video_count = 0
 
     def get_channel_account_list(self):
         """
@@ -37,19 +46,56 @@ class CrawlerChannelAccountVideos:
                 video_id = video['id']
                 account_name = video['nickname']
                 object_desc = video['objectDesc']
+                publish_timestamp = video['createtime']
                 title = object_desc['description']
                 media = object_desc['media'][0]
                 url = media['Url']
                 decode_key = media['decodeKey']
                 url_token = media['urlToken']
                 download_url = url + url_token
-                print(json.dumps(video, ensure_ascii=False, indent=4))
-        else:
-            print(f"crawler channel account {channel_account_name} videos failed")
-            return
+                try:
+                    decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
+                    oss_path = upload_to_oss(decrypt_path)
+                    insert_sql = f"""
+                        insert into publish_single_video_source
+                        (content_trace_id, article_title, out_account_id, out_account_name, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, platform, source_account)
+                        values
+                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                    """
 
+                    try:
+                        self.db_client.save(
+                            query=insert_sql,
+                            params=(
+                                "video{}".format(str_to_md5(video_id)),
+                                title,
+                                channel_account_id,
+                                account_name,
+                                oss_path,
+                                publish_timestamp,
+                                int(time.time()),
+                                video_id,
+                                "sph",
+                                NO_SOURCE_ACCOUNT
+                            ),
+                        )
+                        self.success_crawler_video_count += 1
+                    except Exception as e:
+                        log(
+                            task="baidu_video_crawler",
+                            function="save_each_video",
+                            message="save video failed",
+                            data={
+                                "error": str(e),
+                                "traceback": traceback.format_exc(),
+                                "video_id": video_id,
+                                "oss_path": oss_path,
+                            },
+                        )
 
-if __name__ == '__main__':
-    crawler_channel_account_videos = CrawlerChannelAccountVideos()
-    account_id = 'v2_060000231003b20faec8c5eb8a1cc3d1c902e43cb0774ec288165f96c810e3553f5069c92d73@finder'
-    crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")
+                except Exception as e:
+                    print("download video error:", e)
+
+        else:
+            print(f"crawler channel account {channel_account_name} videos failed")
+            return
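
For reference, crawler_each_account relies on each video item in the API response carrying the fields read above; the expected shape, reconstructed from the parsing code with placeholder values:

    video = {
        "id": "export_video_id",            # stored as url_unique_md5
        "nickname": "account nickname",     # out_account_name
        "createtime": 1735689600,           # publish_timestamp
        "objectDesc": {
            "description": "video title",   # article_title
            "media": [{
                "Url": "https://finder.video.qq.com/...",  # placeholder
                "urlToken": "&token=...",                  # appended to Url
                "decodeKey": 123456789,                    # key for download_sph_video
            }],
        },
    }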