@@ -0,0 +1,219 @@
+"""
|
|
|
+@author: luojunhui
|
|
|
+@tool: pycharm && deepseek
|
|
|
+"""
|
|
|
+
|
|
|
+import re
|
|
|
+import os
|
|
|
+import traceback
|
|
|
+import time
|
|
|
+
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications import log
|
|
|
+from applications.const import ChannelVideoCrawlerConst
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from applications.utils import download_sph_video
|
|
|
+from applications.utils import insert_into_single_video_source_table
|
|
|
+from applications.utils import Item
|
|
|
+from applications.utils import str_to_md5
|
|
|
+from applications.utils import upload_to_oss
|
|
|
+from config import long_articles_config
|
|
|
+from coldStartTasks.crawler.channels import get_channel_account_videos
|
|
|
+
|
|
|
+const = ChannelVideoCrawlerConst()
|
|
|
+
|
|
|
+
|
|
|
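+# Note: "sph" below is the platform tag for the channel videos being crawled
+# (most likely WeChat Channels / 视频号, inferred from the identifiers in this
+# file; the original does not spell it out).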
+class CrawlerChannelAccountVideos:
+    """
+    crawl videos from channel accounts and store them in the video source table
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def whether_video_exists(self, title: str) -> bool:
+        """
+        check whether a video with the same title has already been crawled
+        """
+        sql = """
+            select id from publish_single_video_source
+            where article_title = %s;
+        """
+        duplicate_id = self.db_client.fetch(query=sql, params=(title,))
+        return bool(duplicate_id)
+
+    def get_channel_account_list(self) -> list[dict]:
+        """
+        get channel account list from database
+        """
+        sql = f"""select account_id, max_cursor from sph_account_for_videos where status = {const.CHANNEL_ACCOUNT_GOOD_STATUS};"""
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
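+    # Expected shape of each `video` dict handled below (reconstructed from the
+    # field accesses in this file; illustrative, not an official schema):
+    # {
+    #     "id": ...,
+    #     "username": ...,          # account id on the platform
+    #     "nickname": ...,          # account display name
+    #     "createtime": ...,        # publish timestamp
+    #     "objectDesc": {
+    #         "description": "...", # used as the article title
+    #         "media": [
+    #             {"Url": "...", "urlToken": "...", "decodeKey": "...", "VideoPlayLen": "..."}
+    #         ],
+    #     },
+    # }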
+    def crawler_each_video(self, video: dict) -> None:
+        """
+        download and decrypt a single video, upload it to oss,
+        and save its metadata to the database
+        """
+        object_desc = video["objectDesc"]
+        title = object_desc["description"]
+        if self.whether_video_exists(title):
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title exists",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        # measure the title by its Chinese characters only
+        cleaned_title = re.sub(r"[^\u4e00-\u9fff]", "", title)
+        if len(cleaned_title) < const.MIN_TITLE_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title is too short",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        video_length = object_desc["media"][0]["VideoPlayLen"]
+        if video_length and int(video_length) > const.MAX_VIDEO_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video length is too long",
+                data={"video_id": video["id"], "title": title, "length": video_length},
+            )
+            return
+
+        video_item = Item()
+        video_id = video["id"]
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video["username"])
+        video_item.add("out_account_name", video["nickname"])
+        video_item.add("publish_timestamp", video["createtime"])
+        video_item.add("platform", "sph")
+        media = object_desc["media"][0]
+        # the playable url is the base url plus its token; the stream itself is
+        # encrypted and must be decrypted with decode_key after download
+        url = media["Url"]
+        decode_key = media["decodeKey"]
+        url_token = media["urlToken"]
+        download_url = url + url_token
+        try:
+            decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
+            oss_path = upload_to_oss(decrypt_path)
+            video_item.add("video_oss_path", oss_path)
+            video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
+            video_item.check(source="video")
+            insert_into_single_video_source_table(self.db_client, video_item.item)
+            # remove the local decrypted file once it is safely in oss and the db
+            os.remove(decrypt_path)
+        except Exception as e:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="download video failed",
+                data={
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                    "video_id": video["id"],
+                },
+            )
+
+    def crawler_each_account(self, channel_account: dict, last_buffer: str = "") -> None:
+        """
+        crawl a channel account's videos page by page,
+        using a loop instead of recursion
+        """
+        channel_account_id = channel_account["account_id"]
+        max_cursor = channel_account.get("max_cursor") or const.DEFAULT_CURSOR
+        current_last_buffer = last_buffer
+        has_more = True
+
+        while has_more:
+            response = get_channel_account_videos(channel_account_id, last_buffer=current_last_buffer)
+            if response["ret"] != 200:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="crawler_each_account",
+                    message="get_channel_account_videos failed",
+                    data={
+                        "response": response,
+                        "channel_account_id": channel_account_id,
+                        "max_cursor": max_cursor,
+                    },
+                )
+                break
+
+            response_data = response["data"]
+            current_last_buffer = response_data["lastBuffer"]  # update the pagination cursor
+            has_more = response_data["continueFlag"]  # whether another page exists
+            video_list = response_data["object"]
+
+            if not video_list:
+                break
+
+            # stop once this page's newest video is older than the saved cursor
+            create_timestamp = video_list[0]["createtime"]
+            if create_timestamp < max_cursor:
+                break
+
+            crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
+            for video in crawl_video_list_bar:
+                crawl_video_list_bar.set_postfix({"video_id": video["id"]})
+                self.crawler_each_video(video)
+
+            # be polite between pages
+            if has_more:
+                time.sleep(const.SLEEP_SECOND)
+
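+    # Cursor bookkeeping (as implemented above and below): crawler_each_account
+    # stops paging once it reaches videos older than the stored max_cursor, and
+    # update_account_max_cursor then advances that cursor to the newest
+    # publish_timestamp actually saved for the account.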
+    def update_account_max_cursor(self, account_id: str) -> None:
+        """
+        advance the account's max_cursor to the newest crawled publish_timestamp
+        """
+        select_sql = """
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = %s;
+        """
+        response_mysql = self.db_client.fetch(query=select_sql, params=(account_id,))
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = """
+                update sph_account_for_videos
+                set max_cursor = %s
+                where account_id = %s;
+            """
+            self.db_client.save(
+                query=update_sql, params=(max_publish_timestamp, account_id)
+            )
+
+    def deal(self):
+        """
+        crawl videos for every channel account, then update each account's cursor
+        """
+        account_list = self.get_channel_account_list()
+        account_crawler_bar = tqdm(account_list, desc="crawler channel account videos")
+        for account in account_crawler_bar:
+            try:
+                account_crawler_bar.set_postfix({"account_id": account["account_id"]})
+                self.crawler_each_account(channel_account=account)
+                self.update_account_max_cursor(account["account_id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="deal",
+                    message="crawler channel account videos failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "account_id": account["account_id"],
+                    },
+                )
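+
+
+# Minimal entry-point sketch (an assumption for illustration; the original diff
+# defines no __main__ block and the class may be driven from elsewhere):
+if __name__ == "__main__":
+    crawler = CrawlerChannelAccountVideos()
+    crawler.deal()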