luojunhui, 2 months ago
Parent
Current commit
35d974234a
69 files changed, with 11,126 insertions and 385 deletions
  1. account_cold_start_daily.py (+1 -1)
  2. applications/api/__init__.py (+4 -4)
  3. applications/api/deep_seek_api_by_volcanoengine.py (+26 -0)
  4. applications/api/gewe_api.py (+108 -0)
  5. applications/api/moon_shot_api.py (+88 -13)
  6. applications/api/nlp_api.py (+54 -3)
  7. applications/const/__init__.py (+172 -7)
  8. applications/db/__init__.py (+12 -8)
  9. applications/exception/spider_error.py (+4 -3)
  10. applications/functions.py (+28 -3)
  11. applications/js/toutiao.js (+7548 -0)
  12. applications/llm_sensitivity.py (+3 -3)
  13. applications/pipeline/__init__.py (+4 -0)
  14. applications/pipeline/crawler_pipeline.py (+83 -0)
  15. applications/so/libsph_decrypt.so (binary)
  16. applications/utils/__init__.py (+14 -0)
  17. applications/utils/cold_start.py (+30 -0)
  18. applications/utils/common.py (+61 -0)
  19. applications/utils/download_video.py (+156 -0)
  20. applications/utils/fetch_info_from_aigc.py (+58 -0)
  21. applications/utils/item.py (+69 -0)
  22. applications/utils/save_to_db.py (+52 -0)
  23. applications/utils/upload.py (+23 -0)
  24. applications/wxSpiderApi.py (+59 -24)
  25. article_association_task.py (+53 -0)
  26. cal_account_read_rate_avg_daily.py (+81 -117)
  27. coldStartTasks/crawler/__init__.py (+1 -1)
  28. coldStartTasks/crawler/baidu/__init__.py (+1 -0)
  29. coldStartTasks/crawler/baidu/account_crawler.py (+1 -1)
  30. coldStartTasks/crawler/baidu/baidu_spider.py (+96 -0)
  31. coldStartTasks/crawler/baidu/video_crawler.py (+269 -0)
  32. coldStartTasks/crawler/channels/__init__.py (+6 -0)
  33. coldStartTasks/crawler/channels/blogger.py (+22 -0)
  34. coldStartTasks/crawler/channels/search.py (+41 -0)
  35. coldStartTasks/crawler/toutiao/__init__.py (+4 -0)
  36. coldStartTasks/crawler/toutiao/blogger.py (+64 -0)
  37. coldStartTasks/crawler/toutiao/use_js.py (+25 -0)
  38. coldStartTasks/crawler/wechat/__init__.py (+4 -0)
  39. coldStartTasks/crawler/wechat/article_association.py (+210 -0)
  40. coldStartTasks/crawler/weixinCategoryCrawler.py (+1 -25)
  41. coldStartTasks/crawler/weixin_account_association_crawler.py (+1 -1)
  42. coldStartTasks/crawler/weixin_account_crawler.py (+3 -1)
  43. coldStartTasks/crawler/weixin_video_crawler.py (+8 -3)
  44. coldStartTasks/filter/title_similarity_task.py (+62 -16)
  45. coldStartTasks/publish/basic.py (+276 -0)
  46. coldStartTasks/publish/publishCategoryArticles.py (+29 -1)
  47. coldStartTasks/publish/publish_article_association_articles.py (+125 -0)
  48. coldStartTasks/publish/publish_video_to_pq_for_audit.py (+17 -3)
  49. config/__init__.py (+20 -1)
  50. config/crontab_backup (+30 -7)
  51. crawler_sph_video.py (+9 -0)
  52. requirements.txt (+3 -1)
  53. run_baidu_video_crawler.py (+8 -0)
  54. run_title_rewrite_task.py (+9 -0)
  55. sh/run_article_association.sh (+26 -0)
  56. sh/run_baidu_video_crawler.sh (+26 -0)
  57. sh/run_gzh_video_crawler.sh (+26 -0)
  58. sh/run_sph_video_crawler.sh (+26 -0)
  59. sh/run_title_rewrite_task.sh (+26 -0)
  60. sh/run_toutiao_account_video_crawler.sh (+26 -0)
  61. sh/run_video_publish_and_audit.sh (+0 -2)
  62. tasks/article_summary_task.py (+1 -1)
  63. tasks/crawler_channel_account_videos.py (+224 -0)
  64. tasks/crawler_toutiao_account_videos.py (+208 -0)
  65. tasks/title_rewrite_task.py (+270 -0)
  66. tasks/update_published_articles_minigram_detail.py (+12 -3)
  67. title_similarity_score_task.py (+26 -1)
  68. toutiao_video_crawler.py (+10 -0)
  69. updateAccountV3.py (+83 -131)

+ 1 - 1
account_cold_start_daily.py

@@ -54,7 +54,7 @@ class AccountColdStartDailyTask(object):
             # 抓取完成之后,给抓取到的标题进行相似度打分
             cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
             cold_start_title_similarity_task.init_database()
-            cold_start_title_similarity_task.run()
+            cold_start_title_similarity_task.run(meta_source='article')
 
             bot(
                 title="账号冷启动任务,抓取完成",

+ 4 - 4
applications/api/__init__.py

@@ -1,8 +1,8 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
-from .deep_seek_by_byte_dance_api import fetch_deepseek_response
-from .google_ai_api import GoogleAIAPI
-from .moon_shot_api import generate_mini_program_title
+from .deep_seek_api_by_volcanoengine import fetch_deepseek_response
+from .moon_shot_api import fetch_moon_shot_response
 from .nlp_api import similarity_between_title_list
 from .nlp_api import similarity_between_title_list
-
+from .gewe_api import WechatChannelAPI
+from .google_ai_api import GoogleAIAPI

+ 26 - 0
applications/api/deep_seek_api_by_volcanoengine.py

@@ -0,0 +1,26 @@
+"""
+@author: luojunhui
+"""
+from volcenginesdkarkruntime import Ark
+
+from config import deep_seek_model
+from config import deep_seek_default_model
+from config import deep_seek_api_key_byte_dance
+
+
+def fetch_deepseek_response(model, prompt):
+    """
+    deep_seek方法
+    """
+    client = Ark(
+        api_key=deep_seek_api_key_byte_dance,
+        timeout=1800,
+        max_retries=2,
+        )
+    response = client.chat.completions.create(
+        model=deep_seek_model.get(model, deep_seek_default_model),
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content
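A minimal usage sketch for the new Volcano Engine DeepSeek wrapper; the "DeepSeek-V3" alias is an assumed key of the deep_seek_model mapping in config, and unknown aliases fall back to deep_seek_default_model:

# usage sketch, not part of this commit; the model alias is an assumption
from applications.api import fetch_deepseek_response

prompt = "请用一句话概括这个标题的主题:中秋团圆饭,全家笑开颜"
answer = fetch_deepseek_response(model="DeepSeek-V3", prompt=prompt)
print(answer)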

+ 108 - 0
applications/api/gewe_api.py

@@ -0,0 +1,108 @@
+"""
+@author: luojunhui
+"""
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
+from requests.exceptions import RequestException
+import requests
+import json
+from typing import Optional, Dict
+
+COMMON_RETRY = dict(
+    stop=stop_after_attempt(3),  # 总共尝试3次
+    wait=wait_exponential(min=2, max=30),
+    retry=retry_if_exception_type((RequestException, TimeoutError)),
+    reraise=True  # 重试耗尽后重新抛出异常
+)
+
+
+class WechatChannelAPI:
+    """
+    wechat channel api by gw
+    """
+
+    def __init__(self, base_url: str, token: str, app_id: str):
+        self.base_url = base_url
+        self.token = token
+        self.app_id = app_id
+
+    @retry(**COMMON_RETRY)
+    def search(
+            self,
+            search_key: str,
+            search_type: int,
+            page: int = 0,
+            cookie: str = "",
+            search_id: str = "",
+            offset: int = 0,
+    ) -> Optional[Dict]:
+        """
+        搜索微信视频号内容(支持重试)
+
+        :param search_key: 搜索关键字
+        :param search_type: 搜索类型,1: 搜索所有视频,2: 搜索视频号账号
+        :param page: 页码
+        :param cookie: 登录后的cookie
+        :param search_id: 搜索id
+        :param offset: 偏移量
+        :return: 返回搜索结果字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/search"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "content": search_key,
+            "category": search_type,
+            "filter": 0,
+            "page": page,
+            "cookie": cookie,
+            "searchId": search_id,
+            "offset": offset,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"API请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None
+
+    @retry(**COMMON_RETRY)
+    def get_channel_video_list(
+            self, user_id: str, last_buffer: str = ""
+    ) -> Optional[Dict]:
+        """
+        获取视频号账号的视频列表(支持重试)
+
+        :param user_id: 视频号账号ID
+        :param last_buffer: 分页标记,用于获取下一页数据
+        :return: 返回视频列表字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/userPage"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "lastBuffer": last_buffer,
+            "toUserName": user_id,
+            "maxId": 0,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"获取视频列表请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None
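A hedged usage sketch for WechatChannelAPI; the gateway address, token and app id below are placeholders, and search_type follows the docstring (1 = search videos, 2 = search channel accounts):

# usage sketch with placeholder credentials, not part of this commit
from applications.api import WechatChannelAPI

gewe = WechatChannelAPI(
    base_url="http://gewe-gateway.example.com",  # placeholder gateway
    token="your-gewe-token",                     # placeholder token
    app_id="your-app-id",                        # placeholder appId
)

search_result = gewe.search(search_key="健康养生", search_type=2)
if search_result:
    print(search_result.get("data"))

# page through an account's videos; pass the lastBuffer returned by the previous call
video_page = gewe.get_channel_video_list(user_id="v2_xxxxxxxx", last_buffer="")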

+ 88 - 13
applications/api/moon_shot_api.py

@@ -1,45 +1,120 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
+import json
 from openai import OpenAI
 from openai import OpenAI
 
 
-mini_program_title_generate_prompt = """
+from config import moon_shot
+
+generate_program_title_prompt = """
     请将以上标题改写成适合小程序点击和传播的小程序标题,小程序标题的写作规范如下,请学习后进行小程序标题的编写。直接输出最终的小程序标题
     小程序标题写作规范:
     1.要点前置:将最重要的信息放在标题的最前面,以快速吸引读者的注意力。例如,“5月一辈子同学,三辈子亲,送给我的老同学,听哭无数人!”中的“5月”和“一辈子同学,三辈子亲”都是重要的信息点。
     2.激发情绪:使用能够触动人心的语言,激发读者的情感共鸣。如“只剩两人同学聚会,看后感动落泪。”使用“感动落泪”激发读者的同情和怀旧情绪。
     3.使用数字和特殊符号:数字可以提供具体性,而特殊符号如“🔴”、“😄”、“🔥”等可以吸引视觉注意力,增加点击率。
-    4.悬念和好奇心:创建悬念或提出问题,激发读者的好奇心。例如,“太神奇了!长江水位下降,重庆出现惊奇一幕!”中的“惊奇一幕”就是一个悬念。
+    4.悬念和好奇心:创建悬念或提出问题,激发读者的好奇心。
     5.名人效应:如果内容与知名人士相关,提及他们的名字可以增加标题的吸引力。
     6.社会价值观:触及读者的文化和社会价值观,如家庭、友情、国家荣誉等。
     7.标点符号的运用:使用感叹号、问号等标点来增强语气和情感表达。
     8.直接的语言:使用直白、口语化的语言,易于理解,如“狗屁股,笑死我了!”。
     9.热点人物或事件:提及当前的热点人物或事件,利用热点效应吸引读者。
     10.字数适中:保持标题在10-20个字之间,既不过长也不过短,确保信息的完整性和吸引力。
-    11.适当的紧迫感:使用“最新”、“首次”、“紧急”等词汇,创造一种紧迫感,促使读者立即行动。
-    12.情感或价值诉求:使用如“感动”、“泪目”、“经典”等词汇,直接与读者的情感或价值观产生共鸣。
+    11.情感或价值诉求:使用如“感动”、“泪目”、“经典”等词汇,直接与读者的情感或价值观产生共鸣。
     避免误导:确保标题准确反映内容,避免夸大或误导读者。
     """
 
+get_title_safe_score_prompt = """
+    请你学习一下内容规范,以下标题可能会违反了某条内容规范。请你对标题做一个内容风险评级,1-10分,等级越高内容违规风险越大。 
+    请直接输出内容风险评级的分数,不要输出你的理由、分析等内容。 
+    输出:
+        只需要输出危险分级分数,不要输出任何其他内容。
+    内容规范为: 
+    4.2 色俗内容 
+        4.2.1 散布淫秽、色情内容,包括但不限于招嫖、寻找一夜情、性伴侣等。 
+        4.2.2 发布有色情意味的情色文字、情色视频、情色漫画等内容。 
+        4.2.3 以投稿/爆料等形式描述约炮经历、性交体验、偷情、涉隐私部位偷拍等伤风败俗的话题内容。 
+        4.2.4 以低俗的配图引诱用户阅读文章、关注微信公众号。包含性撩拨、性挑逗画面;疑似女性性高潮/性虐场面;偷拍的沐浴/更衣/如厕/亲热等私密画面;女性故意露出敏感部位 (纯裸露的胸、生殖器官)以及敏感部位未打码的真人写真/艺术摄影等。 
+        4.2.5 文内以低俗类的动图或引导图,诱导用户点击进而跳转至另一篇图文页或关注某个公众号。 
+        4.2.6 文章主要描述PUA撩妹、撩汉等相关话题,且引导用户关注公众号/加个人微信号/加群。 
+    4.11 煽动、夸大、误导类内容 平台鼓励创作者提供准确、清晰、能体现文章内容主旨的标题,不允许通过标题噱头诱导用户点击或误导用户。
+        包括但不限于以下情况: 
+        4.11.1 标题含有危害人身安全、恐吓侮辱、惊悚、极端内容,或者以命令式语气强迫用户阅读。 
+        4.11.2 标题无依据夸大事件严重程度、紧急程度、受影响面以及事件引发的情绪。 
+        4.11.3 标题以浮夸的描述,反常识强调某种食物/行为对人体健康的影响,煽动人群要/不要去做某行为。 
+        4.11.4 非官方通知或者公告,但标题假借官方名义煽动获取流量,或以信息来源机密、看完即删来诱导用户。 
+        4.11.5 标题故意隐藏关键信息,或无中生有部分信息,给用户造成误导。 
+        4.12 违反国家法律法规禁止的内容 
+            (1)违反宪法确定的基本原则的; 
+            (2)危害国家安全,泄露国家秘密,颠覆国家政权,破坏国家统一的; 
+            (3)损害国家荣誉和利益的; 
+            (4)煽动民族仇恨、民族歧视,破坏民族团结的; 
+            (5)破坏国家宗教政策,宣扬邪教和封建迷信的; 
+            (6)散布不实信息,扰乱社会秩序,破坏社会稳定的; 
+            (7)散布淫秽、色情、赌博、暴力、恐怖或者教唆犯罪的; 
+            (8)侮辱或者诽谤他人,侵害他人合法权益的; 
+            (9)煽动非法集会、结社、游行、示威、聚众扰乱社会秩序; 
+            (10)以非法民间组织名义活动的; 
+            (11)不符合《即时通信工具公众信息服务发展管理暂行规定》及遵守法律法规、社会主义制度、国家利益、公民合法利益、公共秩序、社会道德风尚和信息真实性等“七条底线”要求的; 
+            (12)含有法律、行政法规禁止的其他内容的。
+    输入的标题是: 
+    """
+
+make_title_safe_prompt = """
+    以下每行为一个文章的标题,请用尽量平实的语言对以上标题进行改写,保持在10~15字左右,请注意:
+    1. 不要虚构或改变标题的含义。
+    2. 不要用笃定的语气描述存疑的可能性,不要将表述可能性的问句改为肯定句。
+    直接输出改写后的标题列表。
+    在改写完成后,再输出一次,在改写的标题前增加和标题情感、语气匹配的特殊符号,如:“🔴”、“😄”、“🔥”、“😨”等等
+    输出:
+        输出结果是Dict, 格式为: 
+        {
+        "title_v1": 请填写第一次输出的标题,
+        "title_v2": 请填写第二次输出的标题
+        }
+    输入的标题是: 
+        """
+
 
 
-def generate_mini_program_title(ori_title):
+def fetch_moon_shot_response(task, input_text, output_type="text"):
     """
-    prompt + kimi + ori_title generate new title
-    :param ori_title:
-    :return:
+    调用kimi的api获取结果
     """
+    # generate prompt
+    match task:
+        case "generate_kimi_title":
+            prompt = input_text + '\n' + generate_program_title_prompt
+        case "get_title_safe_score":
+            prompt = get_title_safe_score_prompt + input_text
+        case "make_title_safe":
+            prompt = make_title_safe_prompt + input_text
+        case _:
+            prompt = input_text
+
+    # init client
     client = OpenAI(
-        api_key='sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q',
-        base_url="https://api.moonshot.cn/v1"
+        api_key=moon_shot['api_key'],
+        base_url=moon_shot['base_url']
     )
+
+    # get response format
+    if output_type == "json":
+        response_format = {"type": "json_object"}
+    else:
+        response_format = {"type": "text"}
+
     chat_completion = client.chat.completions.create(
         messages=[
             {
                 "role": "user",
-                "content": ori_title + "\n" + mini_program_title_generate_prompt
+                "content": prompt,
             }
         ],
-        model="moonshot-v1-32k",
+        model=moon_shot['model'],
+        response_format=response_format,
     )
     response = chat_completion.choices[0].message.content
-    return response.split("\n")[0]
+    if output_type == "json":
+        response_json = json.loads(response)
+        return response_json
+
+    return response
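A short sketch of the rewritten Kimi helper; the task names mirror the match arms above, and output_type="json" only makes sense for prompts that request a dict (here make_title_safe):

# usage sketch, not part of this commit; titles are illustrative
from applications.api import fetch_moon_shot_response

mini_program_title = fetch_moon_shot_response(
    task="generate_kimi_title",
    input_text="老同学聚会只来了两个人",
)

safe_titles = fetch_moon_shot_response(
    task="make_title_safe",
    input_text="这种食物千万别再吃了",
    output_type="json",
)
# the prompt asks the model to return {"title_v1": ..., "title_v2": ...}
print(mini_program_title, safe_titles.get("title_v1"), safe_titles.get("title_v2"))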

+ 54 - 3
applications/api/nlp_api.py

@@ -2,6 +2,10 @@
 @author: luojunhui
 """
 import requests
+import traceback
+from requests.exceptions import RequestException, JSONDecodeError
+
+from applications.aliyunLogApi import log
 
 
 def similarity_between_title_list(target_title_list: list[str], base_title_list: list[str]) -> list[list[float]]:
@@ -11,7 +15,9 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
     :param base_title_list: base title_list
     :return: list of similarity
     """
+
     url = 'http://61.48.133.26:6060/nlp'
+    url_backup = 'http://192.168.203.4:6060/nlp'
     body = {
         "data": {
             "text_list_a": target_title_list,
@@ -20,7 +26,52 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
         "function": "similarities_cross",
         "function": "similarities_cross",
         "use_cache": False
         "use_cache": False
     }
     }
-    response_json = requests.post(url, json=body, timeout=120).json()
-    score_array = response_json['score_list_list']
-    return score_array
 
 
+    try:
+        response = requests.post(url, json=body, timeout=120)
+        if response.status_code != 200:
+            response = requests.post(url_backup, json=body, timeout=120)
+    except RequestException as e:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message="nlp server web error",
+            data={
+                "e": str(e),
+                "error_msg": traceback.format_exc()
+            }
+        )
+        # use back up
+        response = requests.post(url_backup, json=body, timeout=120)
+
+    if response.status_code != 200:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message='nlp server request error',
+            data={
+                "status_code": response.status_code,
+                "response_text": response.text[:200]  # 截取部分内容避免过大
+            }
+        )
+        return []
+
+    try:
+        response_json = response.json()
+        score_array = response_json['score_list_list']
+    except (JSONDecodeError, KeyError) as e:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message='nlp server response error',
+            data={
+                "error_type": type(e).__name__,
+                "raw_response": response.text[:200]
+            }
+        )
+        return []
+
+    return score_array
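With this change the scorer degrades to an empty list instead of raising when both NLP endpoints fail, so callers should guard against []; a small sketch:

# usage sketch, not part of this commit
from applications.api import similarity_between_title_list

target_titles = ["中秋团圆饭,全家笑开颜"]
base_titles = ["一辈子同学,三辈子亲", "只剩两人的同学聚会"]

score_matrix = similarity_between_title_list(target_titles, base_titles)
if not score_matrix:
    print("nlp service unavailable, skip scoring this batch")
else:
    # score_matrix[i][j] is the similarity of target i against base j
    best_score = max(score_matrix[0])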

+ 172 - 7
applications/const/__init__.py

@@ -4,7 +4,7 @@
 """
 """
 
 
 
 
-class coldStartTaskConst:
+class ColdStartTaskConst:
     """
     """
     冷启动任务常量配置
     冷启动任务常量配置
     """
     """
@@ -12,6 +12,44 @@ class coldStartTaskConst:
     INIT_STATUS = 1  # 文章初始状态
     BAD_STATUS = 0  # 低质量文章状态
 
+    # 常量
+    ACCOUNT_GOOD_STATUS = 1
+
+    # 账号是否每日抓取
+    ACCOUNT_DAILY_SCRAPE = 1
+    ACCOUNT_NOT_DAILY_SCRAPE = 0
+
+    # 默认值
+    DEFAULT_VIEW_COUNT = 0
+    DEFAULT_LIKE_COUNT = 0
+    DEFAULT_ARTICLE_STATUS = 1
+    DEFAULT_TIMESTAMP = 1717171200
+
+    # 标题sensitivity
+    TITLE_SENSITIVE = 1
+    TITLE_NOT_SENSITIVE = 0
+
+    # 文章联想深度
+    ARTICLE_ASSOCIATION_MAX_DEPTH = 4
+
+    # 相关分百分位阈值
+    PERCENT_THRESHOLD = 95
+
+    # 相关性分阈值
+    CORRELATION_THRESHOLD = 0.5
+
+    # 阅读量阈值
+    READ_COUNT_THRESHOLD = 1000
+
+    # 阅读均值倍数阈值
+    READ_AVG_THRESHOLD = 1.3
+
+    # 群发类型
+    BULK_PUBLISH_TYPE = 9
+
+    # 种子文章数量
+    SEED_ARTICLE_LIMIT_NUM = 60
+
 
 
 class updatePublishedMsgTaskConst:
     """
@@ -41,7 +79,7 @@ class updatePublishedMsgTaskConst:
     # 服务号
     SERVICE_TYPE = 2
     # 监测周期(秒)
-    MONITOR_PERIOD = 60 * 60 * 24 * 7
+    MONITOR_PERIOD = 60 * 60 * 24 * 3
 
     # 新号抓文章周期
     NEW_ACCOUNT_CRAWL_PERIOD = 60 * 60 * 24 * 30
@@ -50,7 +88,7 @@ class updatePublishedMsgTaskConst:
     SUBSCRIBE_FAIL_RATE_THRESHOLD = 0.3
 
 
-class updateAccountReadRateTaskConst:
+class UpdateAccountReadRateTaskConst:
     """
     更新账号阅读率常量配置
     """
@@ -66,8 +104,14 @@ class updateAccountReadRateTaskConst:
     # 文章位置
     ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
 
+    # 默认粉丝
+    DEFAULT_FANS = 0
 
-class updateAccountReadAvgTaskConst:
+    # 最低粉丝量
+    MIN_FANS = 1000
+
+
+class UpdateAccountReadAvgTaskConst:
     """
     更新账号阅读均值常量配置
     """
@@ -86,6 +130,19 @@ class updateAccountReadAvgTaskConst:
     ARTICLES_DAILY = 1
     TOULIU = 2
 
+    # 默认粉丝
+    DEFAULT_FANS = 0
+
+    # index list
+    ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
+
+    # 默认点赞
+    DEFAULT_LIKE = 0
+
+    # 状态
+    USING_STATUS = 1
+    NOT_USING_STATUS = 0
+
 
 
 class WeixinVideoCrawlerConst:
     """
@@ -139,18 +196,23 @@ class WeixinVideoCrawlerConst:
     DEFAULT_ACCOUNT_UID = 76862180
 
     # 每天发送的审核视频数量
-    MAX_VIDEO_NUM = 500
+    MAX_VIDEO_NUM = 1000
+
+    # 单次发布视频审核量
+    MAX_VIDEO_NUM_PER_PUBLISH = 350
 
     # 标题状态
     TITLE_DEFAULT_STATUS = 0
     TITLE_EXIT_STATUS = 1
     TITLE_FESTIVAL_STATUS = 2
-    TITLE_DUPLICATE_STATUS = 3
-    TITLE_SHORT_STATUS = 4
+    TITLE_SHORT_STATUS = 3
 
     # 标题最短长度
     TITLE_MIN_LENGTH = 15
 
+    # safe score
+    TITLE_SAFE_SCORE_THRESHOLD = 7
+
 
 class UpdateMiniProgramDetailConst(updatePublishedMsgTaskConst):
     """
@@ -207,6 +269,109 @@ class ArticleCollectorConst:
     ARTICLE_UNKNOWN_CODE = 10000
 
 
+class BaiduVideoCrawlerConst:
+    """
+    const for baidu video crawler
+    """
+    # account status
+    BAIDU_ACCOUNT_GOOD_STATUS = 1
+    BAIDU_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 17040384000000
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # timestamp To Cursor
+    TIMESTAMP_TO_CURSOR = 10000
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+
+class TitleRewriteTaskConst:
+    """
+    title rewrite task const
+    """
+    # title rewrite status
+    TITLE_REWRITE_INIT_STATUS = 0
+    TITLE_REWRITE_SUCCESS_STATUS = 1
+    TITLE_REWRITE_FAIL_STATUS = 99
+    TITLE_REWRITE_LOCK_STATUS = 101
+
+    # article status
+    ARTICLE_AUDIT_PASSED_STATUS = 1
+    ARTICLE_POSITIVE_STATUS = 0
+
+    # title useful status
+    TITLE_USEFUL_STATUS = 1
+
+    # prompt version
+    PROMPT_VERSION = "xx_250228"  # 信欣2025-02-28提供
+
+    # block expire time 1h
+    TITLE_REWRITE_LOCK_TIME = 60 * 60
+
+
+class ChannelVideoCrawlerConst:
+    """
+    const for baidu video crawler
+    """
+    # account status
+    CHANNEL_ACCOUNT_GOOD_STATUS = 1
+    CHANNEL_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 1704038400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+    # title length min
+    MIN_TITLE_LENGTH = 10
+
+    # max video length(second)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep second
+    SLEEP_SECOND = 2
+
+
+class ToutiaoVideoCrawlerConst:
+    """
+    const for toutiao video crawler
+    """
+    # platform
+    PLATFORM = "toutiao"
+
+    # account status
+    TOUTIAO_ACCOUNT_GOOD_STATUS = 1
+    TOUTIAO_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2021-01-01 00:00:00
+    DEFAULT_CURSOR = 1609430400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # title length min
+    MIN_TITLE_LENGTH = 10
+
+    # max video length(second)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep second
+    SLEEP_SECOND = 3
+
+
+
+
+
+
 # 视频转文本任务
 class VideoToTextConst:
     """

+ 12 - 8
applications/db/__init__.py

@@ -30,12 +30,12 @@ class DatabaseConnector:
         """
         """
         try:
         try:
             self.connection = pymysql.connect(
             self.connection = pymysql.connect(
-                host=self.db_config.get('host', 'localhost'),
-                user=self.db_config['user'],
-                password=self.db_config['password'],
-                db=self.db_config['db'],
-                port=self.db_config.get('port', 3306),
-                charset=self.db_config.get('charset', 'utf8mb4')
+                host=self.db_config.get("host", "localhost"),
+                user=self.db_config["user"],
+                password=self.db_config["password"],
+                db=self.db_config["db"],
+                port=self.db_config.get("port", 3306),
+                charset=self.db_config.get("charset", "utf8mb4"),
             )
             )
         except pymysql.MySQLError as e:
         except pymysql.MySQLError as e:
             raise ConnectionError(f"无法连接到数据库: {e}")
             raise ConnectionError(f"无法连接到数据库: {e}")
@@ -48,9 +48,10 @@ class DatabaseConnector:
             self.connection.close()
             self.connection = None
 
-    def fetch(self, query, cursor_type=None):
+    def fetch(self, query, cursor_type=None, params=None):
         """
         执行单条查询语句,并返回结果。
+        :param params: 查询传参
         :param cursor_type: 输出的返回格式
         :param query: 查询语句
         :return: 查询结果列表
@@ -61,7 +62,10 @@ class DatabaseConnector:
 
 
         try:
             with self.connection.cursor(cursor_type) as cursor:
-                cursor.execute(query)
+                if params:
+                    cursor.execute(query, params)
+                else:
+                    cursor.execute(query)
                 result = cursor.fetchall()
                 return result
         except pymysql.MySQLError as e:
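The new params argument hands values to pymysql's own escaping instead of formatting them into the SQL string. A hedged sketch; the constructor/connect() usage and the long_articles_config name are assumptions based on how the connector is used elsewhere in this commit:

# usage sketch, not part of this commit; config name and connect() call are assumptions
from pymysql.cursors import DictCursor

from applications.db import DatabaseConnector
from config import long_articles_config

db_client = DatabaseConnector(long_articles_config)
db_client.connect()

sql = "select id from publish_single_video_source where article_title = %s;"
rows = db_client.fetch(query=sql, params=("某个标题",), cursor_type=DictCursor)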

+ 4 - 3
applications/exception/spider_error.py

@@ -7,9 +7,9 @@ from applications import log
 
 
 
 
 class SpiderError(Exception):
-    """数据库查询异常"""
+    """spider_task_error"""
 
-    def __init__(self, error=None, spider=None, url=None):
+    def __init__(self, platform=None, error=None, spider=None, url=None):
         """
         :param error: 异常对象,可选,用于提供更详细的错误信息。
         :param spider: 爬虫任务
@@ -22,7 +22,8 @@ class SpiderError(Exception):
         }
         log(
             task="spider_task",
-            function="log_spider_error",
+            function="{}".format(platform),
+            message="{} 抓取失败".format(spider),
             data=error_obj
         )
         super().__init__(json.dumps(error_obj, ensure_ascii=False, indent=4))

+ 28 - 3
applications/functions.py

@@ -153,11 +153,11 @@ class Functions(object):
         快代理
         """
         # 隧道域名:端口号
-        tunnel = "l901.kdltps.com:15818"
+        tunnel = "j685.kdltps.com:15818"
 
         # 用户名密码方式
-        username = "t11983523373311"
-        password = "mtuhdr2z"
+        username = "t14070979713487"
+        password = "hqwanfvy"
         proxies = {
             "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
             "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
@@ -296,3 +296,28 @@ class Functions(object):
         params = parse_qs(urlparse(url).query)
         info = params.get(key, [])
         return info[0] if info else None
+
+    @classmethod
+    def download_baidu_videos(cls, video_url, save_path):
+        """
+        :param video_url: baidu video url
+        :param save_path: save path
+        """
+        if os.path.exists(save_path):
+            return save_path
+
+        response = requests.get(
+            video_url,
+            headers={
+                'User-Agent': FakeUserAgent().chrome,
+                "Accept": "*/*",
+                "Accept-Language": "zh-CN,zh;q=0.9"
+            }
+        )
+        with open(save_path, 'wb') as f:
+            f.write(response.content)
+        TEN_KB = 1024 * 10
+        if os.path.getsize(save_path) > TEN_KB:
+            return save_path
+        else:
+            return None

File diff suppressed because it is too large
+ 7548 - 0
applications/js/toutiao.js


+ 3 - 3
applications/llm_sensitivity.py

@@ -8,8 +8,8 @@ from openai import OpenAI
 
 
 def request_llm_api(prompt, text):
     client = OpenAI(
-        api_key='sk-c1b18099dadc4dd1b48239bdde184f6c',
-        base_url="https://api.deepseek.com"
+        api_key='5e275c38-44fd-415f-abcf-4b59f6377f72',
+        base_url="https://ark.cn-beijing.volces.com/api/v3"
     )
     chat_completion = client.chat.completions.create(
         messages=[
@@ -18,7 +18,7 @@ def request_llm_api(prompt, text):
                 "content": prompt + text,
                 "content": prompt + text,
             }
             }
         ],
         ],
-        model="deepseek-chat",
+        model="ep-20250213194558-rrmr2", # deepseek-v3
         temperature=0.2,
         temperature=0.2,
         response_format={"type": "json_object"}
         response_format={"type": "json_object"}
     )
     )

+ 4 - 0
applications/pipeline/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .crawler_pipeline import scrape_video_entities_process

+ 83 - 0
applications/pipeline/crawler_pipeline.py

@@ -0,0 +1,83 @@
+"""
+@author: luojunhui
+"""
+
+import os
+import json
+
+from applications import log
+
+from applications.utils import download_gzh_video
+from applications.utils import download_toutiao_video
+from applications.utils import upload_to_oss
+
+from config import apolloConfig
+
+my_config = apolloConfig()
+
+empty_dict = {}
+sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    title sensitive words filter
+    """
+    for word in sensitive_word_list:
+        if word in title:
+            return True
+
+    return False
+
+
+def whether_duplicate_video_title(video_title: str, db_client) -> bool:
+    """
+    whether duplicate video title
+    """
+    sql = f"""
+        select id from publish_single_video_source
+        where article_title = %s;
+    """
+    duplicate_id = db_client.fetch(query=sql, params=(video_title,))
+    if duplicate_id:
+        return True
+
+    return False
+
+
+def scrape_video_entities_process(video_item, db_client) -> dict:
+    """
+    video crawler pipeline
+    """
+    article_url = video_item["article_url"]
+    platform = video_item["platform"]
+    video_title = video_item["article_title"]
+    # whether title sensitive
+    if whether_title_sensitive(video_title):
+        return empty_dict
+
+    # whether duplicate video title
+    if whether_duplicate_video_title(video_title, db_client):
+        return empty_dict
+
+    # download video
+    match platform:
+        case "toutiao":
+            video_path = download_toutiao_video(article_url)
+        case "gzh":
+            video_path = download_gzh_video(article_url)
+        case "hksp":
+            video_path = ""
+        case "sph":
+            video_path = ""
+        case _:
+            return empty_dict
+
+    if video_path:
+        # upload video to oss
+        oss_path = upload_to_oss(video_path)
+        video_item["video_oss_path"] = oss_path
+        os.remove(video_path)
+        return video_item
+    else:
+        return empty_dict
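A hedged sketch of how a crawler hands one item to this pipeline; the item values are illustrative, and the DatabaseConnector construction mirrors its use elsewhere in this commit (an assumption):

# usage sketch, not part of this commit
from applications.db import DatabaseConnector
from applications.pipeline import scrape_video_entities_process
from config import long_articles_config  # assumed config object

db_client = DatabaseConnector(long_articles_config)
db_client.connect()

video_item = {
    "platform": "gzh",
    "article_title": "一段让人泪目的老兵采访",
    "article_url": "https://mp.weixin.qq.com/s/xxxx",  # placeholder link
}
processed_item = scrape_video_entities_process(video_item, db_client)
if processed_item:
    # the pipeline downloaded the video, uploaded it to oss and filled video_oss_path
    print(processed_item["video_oss_path"])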

Binary
applications/so/libsph_decrypt.so


+ 14 - 0
applications/utils/__init__.py

@@ -0,0 +1,14 @@
+"""
+utils
+"""
+from .cold_start import whether_title_sensitive
+from .cold_start import get_inner_account_set
+from .common import *
+from .download_video import download_gzh_video
+from .download_video import download_sph_video
+from .download_video import download_toutiao_video
+from .item import Item
+from .save_to_db import insert_into_single_video_source_table
+from .upload import upload_to_oss
+from .fetch_info_from_aigc import fetch_account_fans
+from .fetch_info_from_aigc import fetch_publishing_account_list

+ 30 - 0
applications/utils/cold_start.py

@@ -0,0 +1,30 @@
+"""
+@author: luojunhui
+"""
+import json
+
+from applications import aiditApi
+from config import apolloConfig
+
+config = apolloConfig()
+sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    : param title:
+    判断视频是否的标题是否包含敏感词
+    """
+    for word in sensitive_word_list:
+        if word in title:
+            return True
+    return False
+
+
+def get_inner_account_set() -> set:
+    """
+    get inner account set
+    """
+    accounts = aiditApi.get_publish_account_from_aigc()
+    gh_id_list = [i['ghId'] for i in accounts]
+    return set(gh_id_list)

+ 61 - 0
applications/utils/common.py

@@ -0,0 +1,61 @@
+"""
+@author: luojunhui
+"""
+
+import hashlib
+
+from requests import RequestException
+from tenacity import (
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
+
+
+def str_to_md5(strings):
+    """
+    字符串转化为 md5 值
+    :param strings:
+    :return:
+    """
+    # 将字符串转换为字节
+    original_bytes = strings.encode("utf-8")
+    # 创建一个md5 hash对象
+    md5_hash = hashlib.md5()
+    # 更新hash对象,传入原始字节
+    md5_hash.update(original_bytes)
+    # 获取16进制形式的MD5哈希值
+    md5_value = md5_hash.hexdigest()
+    return md5_value
+
+
+def proxy():
+    """
+    快代理
+    """
+    # 隧道域名:端口号
+    tunnel = "j685.kdltps.com:15818"
+
+    # 用户名密码方式
+    username = "t14070979713487"
+    password = "hqwanfvy"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
+
+
+def request_retry(retry_times, min_retry_delay, max_retry_delay):
+    """
+    :param retry_times:
+    :param min_retry_delay:
+    :param max_retry_delay:
+    """
+    common_retry = dict(
+        stop=stop_after_attempt(retry_times),
+        wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
+        retry=retry_if_exception_type((RequestException, TimeoutError)),
+        reraise=True  # 重试耗尽后重新抛出异常
+    )
+    return common_retry
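request_retry only packages keyword arguments for tenacity's retry decorator; a sketch of how a crawler function might combine it with the tunnel proxy, assuming both helpers are re-exported through the wildcard import in applications/utils/__init__.py:

# usage sketch, not part of this commit
import requests
from tenacity import retry

from applications.utils import proxy, request_retry

retry_policy = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)


@retry(**retry_policy)
def fetch_json(url: str) -> dict:
    # retried on RequestException / TimeoutError, re-raised after 3 attempts
    response = requests.get(url, proxies=proxy(), timeout=30)
    response.raise_for_status()
    return response.json()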

+ 156 - 0
applications/utils/download_video.py

@@ -0,0 +1,156 @@
+"""
+@author: luojunhui
+"""
+
+import os
+import re
+import html
+import cffi
+import traceback
+
+import requests
+from uuid import uuid4
+from fake_useragent import FakeUserAgent
+
+from applications.utils.common import str_to_md5
+from config import decrypt_key_path
+
+headers = {"Content-Type": "application/json", "User-Agent": FakeUserAgent().chrome}
+
+
+def extract_video_url_from_article(article_url):
+    """
+    :param article_url:
+    :return:
+    """
+    response = requests.get(
+        url=article_url,
+        headers={"User-Agent": FakeUserAgent().random},
+    )
+    html_text = response.text
+    w = re.search(
+        r"mp_video_trans_info.*url:\s*\(\'(.*?)\'\)\.replace", html_text, re.S | re.M
+    ).group(1)
+    url = html.unescape(
+        re.sub(
+            r"\\x\d+", lambda x: bytes.fromhex(x.group().replace("\\x", "")).decode(), w
+        )
+    )
+    return url
+
+
+def download_gzh_video(article_url):
+    """
+    下载公众号视频
+    :param article_url:
+    :return:
+    """
+    try:
+        video_url = extract_video_url_from_article(article_url)
+    except Exception as e:
+        return
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    headers = {
+        "Accept": "*/*",
+        "Accept-Language": "zh,zh-CN;q=0.9",
+        "Connection": "keep-alive",
+        "Origin": "https://mp.weixin.qq.com",
+        "Referer": "https://mp.weixin.qq.com/",
+        "Sec-Fetch-Dest": "video",
+        "Sec-Fetch-Mode": "cors",
+        "Sec-Fetch-Site": "cross-site",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+        "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"',
+    }
+    res = requests.get(video_url, headers=headers)
+    with open(save_path, "wb") as f:
+        f.write(res.content)
+
+    TEN_KB = 1024 * 10
+    if os.path.getsize(save_path) > TEN_KB:
+        return save_path
+    else:
+        return None
+
+
+def download_sph_video(download_url, key):
+    """
+    download video, decrypt video and save to local
+    """
+    file_id = uuid4().hex
+    encrypted_path = f"static/encrypted_{file_id}.mp4"
+    decrypted_path = f"static/decrypted_{file_id}.mp4"
+
+    try:
+        with requests.get(download_url, headers=headers, stream=True) as response:
+            response.raise_for_status()
+
+            with open(encrypted_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:  # filter out keep-alive chunks
+                        f.write(chunk)
+
+        decrypt_sph_video(encrypted_path, key, decrypted_path)
+        os.remove(encrypted_path)
+        return decrypted_path
+
+    except Exception as e:
+        print(traceback.format_exc())
+        for path in [encrypted_path, decrypted_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
+        raise RuntimeError(f"Video processing failed: {str(e)}") from e
+
+
+def decrypt_sph_video(video_path: str, key: int, save_path: str) -> None:
+    """
+    Decrypt video file using C library.
+    Args:
+        video_path: Path to encrypted video file
+        key: 32-bit unsigned integer decryption key
+        save_path: Path to save decrypted video
+    Raises:
+        RuntimeError: If decryption fails
+    """
+    print("key is {}".format(key))
+    ffi = cffi.FFI()
+
+    try:
+        lib = ffi.dlopen(decrypt_key_path)
+        ffi.cdef(
+            "void decrypt(unsigned char *data, const size_t data_length, const uint32_t key);"
+        )
+
+        with open(video_path, "rb") as f:
+            encrypted_data = f.read()
+
+        c_data = ffi.new("unsigned char[]", list(encrypted_data))
+        lib.decrypt(c_data, 2**17, int(key))
+        decrypted_data = bytes(ffi.buffer(c_data, len(encrypted_data))[:])
+
+        with open(save_path, "wb") as f:
+            f.write(decrypted_data)
+
+    except Exception as e:
+        print(traceback.format_exc())
+        raise RuntimeError(f"Decryption failed: {str(e)}") from e
+
+
+def download_toutiao_video(video_url: str) -> str:
+    """
+    download toutiao video
+    """
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    response = requests.get(video_url, headers=headers, stream=True)
+    with open(save_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+
+    return save_path
+
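A hedged sketch tying the download helpers to upload_to_oss; the URLs and the decryption key are placeholders, and the sph flow assumes the per-video key that the channels crawler obtains alongside the download URL:

# usage sketch with placeholder urls/key, not part of this commit
import os

from applications.utils import download_gzh_video, download_sph_video, upload_to_oss

local_path = download_gzh_video("https://mp.weixin.qq.com/s/xxxx")  # placeholder article link
if local_path:
    oss_path = upload_to_oss(local_path)
    os.remove(local_path)

# encrypted sph streams additionally need the per-video key for libsph_decrypt.so
decrypted_path = download_sph_video(
    download_url="https://finder.example.com/encrypted.mp4",  # placeholder
    key=123456,
)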

+ 58 - 0
applications/utils/fetch_info_from_aigc.py

@@ -0,0 +1,58 @@
+"""
+fetch info from aigc database system
+"""
+from collections import defaultdict
+from typing import List, Dict
+
+from pymysql.cursors import DictCursor
+
+
+def fetch_publishing_account_list(db_client) -> List[Dict]:
+    """
+    fetch account_list from aigc database
+    """
+    fetch_sql = f"""
+        SELECT DISTINCT
+            t3.`name` as account_name,
+            t3.gh_id as gh_id,
+            t3.follower_count as fans,
+            t6.account_source_name as account_source,
+            t6.mode_type as mode_type,
+            t6.account_type as account_type,
+            t6.`status` as status
+        FROM
+            publish_plan t1
+            JOIN publish_plan_account t2 ON t1.id = t2.plan_id
+            JOIN publish_account t3 ON t2.account_id = t3.id
+            LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
+            LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
+            LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
+        WHERE
+            t1.plan_status = 1
+            AND t3.channel = 5
+            GROUP BY t3.id;
+    """
+    account_list = db_client.fetch(
+        query=fetch_sql,
+        cursor_type=DictCursor
+    )
+    return account_list
+
+def fetch_account_fans(db_client, start_date: str) -> Dict:
+    """
+    fetch account fans from aigc database
+    """
+    sql = f"""
+        SELECT t1.date_str, t1.fans_count, t2.gh_id
+        FROM datastat_wx t1 JOIN publish_account t2 ON t1.account_id = t2.id
+        WHERE t2.channel = 5
+            AND t2.status = 1 
+            AND t1.date_str >= '{start_date}' 
+        ORDER BY t1.date_str;
+        """
+    result = db_client.fetch(sql)
+    fans_dict = defaultdict(dict)
+    for dt, fans, gh_id in result:
+        fans_dict.setdefault(gh_id, {})[dt] = fans
+    return fans_dict
+

+ 69 - 0
applications/utils/item.py

@@ -0,0 +1,69 @@
+"""
+@author: luojunhui
+"""
+
+import time
+
+default_single_video_table_fields = {
+    "platform": "gzh",
+    "article_title": None,
+    "content_trace_id": None,
+    "read_cnt": 0,
+    "article_index": None,
+    "out_account_name": None,
+    "article_url": None,
+    "url_unique_md5": None,
+    "category": None,
+    "publish_timestamp": None,
+    "out_account_id": None,
+    "cover_url": None,
+    "crawler_timestamp": int(time.time()),
+    "source_account": 1,
+    "article_publish_type": None,
+    "like_cnt": 0,
+    "bad_status": 0,
+    "tags": None,
+    "video_oss_path": None,
+}
+
+
+class Item(object):
+    """
+    format save to article meta table or single video source table
+    """
+
+    def __init__(self):
+        self.item = {}
+
+    def add(self, key, value):
+        """
+        add key value to item
+        """
+        self.item[key] = value
+
+    def check_video_item(self):
+        """
+        check video item
+        """
+        fields = list(default_single_video_table_fields.keys())
+        for field in fields:
+            if self.item.get(field, None) is not None:
+                continue
+            else:
+                self.item[field] = default_single_video_table_fields[field]
+
+    def check_article_item(self):
+        """
+        check article item
+        """
+        return
+
+    def check(self, source):
+        """
+        check item
+        """
+        match source:
+            case "video":
+                self.check_video_item()
+            case "article":
+                self.check_article_item()
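A short sketch of the Item contract: add whatever fields the crawler extracted, call check("video"), and every missing column falls back to default_single_video_table_fields before the row is inserted (values below are illustrative):

# usage sketch, not part of this commit
from applications.utils import Item

video_item = Item()
video_item.add("platform", "toutiao")
video_item.add("article_title", "冬季进补的三个小常识")
video_item.add("article_url", "https://example.com/video/1")  # placeholder url
video_item.add("url_unique_md5", "0f343b0931126a20f133d67c2b018a3b")
video_item.add("out_account_id", "demo_account_id")
video_item.add("out_account_name", "示例账号")
video_item.add("content_trace_id", "demo_trace_id")
video_item.check(source="video")

# untouched columns now hold the defaults, e.g. read_cnt == 0, source_account == 1
print(video_item.item["read_cnt"], video_item.item["source_account"])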

+ 52 - 0
applications/utils/save_to_db.py

@@ -0,0 +1,52 @@
+"""
+@author: luojunhui
+"""
+
+import traceback
+from applications.aliyunLogApi import log
+
+
+def insert_into_single_video_source_table(db_client, video_item):
+    """
+    insert video into single video source table
+    """
+    insert_sql = f"""
+        INSERT INTO publish_single_video_source
+        (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
+        values
+        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+    try:
+        db_client.save(
+            query=insert_sql,
+            params=(
+                video_item["content_trace_id"],
+                video_item["article_title"],
+                video_item["out_account_id"],
+                video_item["out_account_name"],
+                video_item["read_cnt"],
+                video_item["like_cnt"],
+                video_item["article_url"],
+                video_item["cover_url"],
+                video_item["video_oss_path"],
+                video_item["publish_timestamp"],
+                video_item["crawler_timestamp"],
+                video_item["url_unique_md5"],
+                video_item["category"],
+                video_item["tags"],
+                video_item["platform"],
+                video_item["source_account"],
+            ),
+        )
+    except Exception as e:
+        log(
+            task="{}_video_crawler".format(video_item["platform"]),
+            function="save_each_video",
+            message="save video failed",
+            data={
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+                "video_id": video_item["url_unique_md5"],
+                "oss_path": video_item["video_oss_path"],
+            },
+        )

+ 23 - 0
applications/utils/upload.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+
+import oss2
+from uuid import uuid4
+
+
+def upload_to_oss(local_video_path):
+    """
+    把视频上传到 oss
+    :return:
+    """
+    oss_video_key = "long_articles/video/" + str(uuid4())
+    access_key_id = "LTAIP6x1l3DXfSxm"
+    access_key_secret = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
+    endpoint = "oss-cn-hangzhou.aliyuncs.com"
+    bucket_name = "art-pubbucket"
+    bucket = oss2.Bucket(
+        oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name
+    )
+    bucket.put_object_from_file(key=oss_video_key, filename=local_video_path)
+    return oss_video_key

+ 59 - 24
applications/wxSpiderApi.py

@@ -1,9 +1,12 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
+
 import json
 import json
+import time
 import requests
 import requests
 
 
+from applications.aliyunLogApi import log
 from applications.decoratorApi import retryOnNone
 from applications.decoratorApi import retryOnNone
 
 
 
 
@@ -11,13 +14,12 @@ class WeixinSpider(object):
     """
     """
     Update account articles
     Update account articles
     """
     """
+
     # ip = "8.217.190.241"
     # ip = "8.217.190.241"
     # ip = "47.98.154.124"
     # ip = "47.98.154.124"
     # port = "8888"
     # port = "8888"
     base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
     base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
-    headers = {
-        "Content-Type": "application/json"
-    }
+    headers = {"Content-Type": "application/json"}
 
 
     @classmethod
     @classmethod
     @retryOnNone()
     @retryOnNone()
@@ -27,11 +29,10 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/keyword".format(cls.base_url)
-        payload = json.dumps({
-            "keyword": title,
-            "cursor": page
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps({"keyword": title, "cursor": page})
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -45,13 +46,17 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/detail".format(cls.base_url)
-        payload = json.dumps({
-            "content_link": content_link,
-            "is_count": is_count,
-            "is_ad": False,
-            "is_cache": is_cache
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps(
+            {
+                "content_link": content_link,
+                "is_count": is_count,
+                "is_ad": False,
+                "is_cache": is_cache,
+            }
+        )
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -60,12 +65,14 @@ class WeixinSpider(object):
         """
         """
         :return:
         :return:
         """
         """
-        url = '{}/blogger'.format(cls.base_url)
+        url = "{}/blogger".format(cls.base_url)
         payload = {
         payload = {
-            'account_id': ghId,
-            'cursor': index,
+            "account_id": ghId,
+            "cursor": index,
         }
         }
-        response = requests.post(url=url, headers=cls.headers, data=json.dumps(payload), timeout=120)
+        response = requests.post(
+            url=url, headers=cls.headers, data=json.dumps(payload), timeout=120
+        )
         return response.json()
         return response.json()
 
 
     @classmethod
     @classmethod
@@ -76,9 +83,11 @@ class WeixinSpider(object):
         :param content_url:
         :return:
         """
-        url = '{}/account_info'.format(cls.base_url)
+        url = "{}/account_info".format(cls.base_url)
         data = {"content_link": content_url}
-        response = requests.request("POST", url=url, headers=cls.headers, json=data, timeout=120)
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, json=data, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -89,8 +98,34 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/recommend".format(cls.base_url)
-        payload = json.dumps(
-            {"content_link": content_link}
+        payload = json.dumps({"content_link": content_link})
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, data=payload, timeout=120
+        )
+        response_json = response.json()
+        if response_json["code"] != 0:
+            return cls.get_recommend_articles(content_link)
+        time.sleep(3)
+        return response.json()
+
+    @classmethod
+    def get_recommend_articles_v2(cls, content_link) -> dict:
+        """
+        use content link to get recommend articles
+        :param content_link:
+        :return:
+        """
+        url = "http://datapi.top/wxapi/relatedarticle"
+        payload = {
+            'url': content_link,
+            'token': '401e4d3c85068bb5'
+        }
+        response = requests.request("POST", url, headers={}, data=payload, timeout=120)
+        log(
+            task="article_association_crawler",
+            function="get_recommend_articles_v2",
+            message="获取推荐链接,付费接口",
+            data={"content_link": content_link, "response": response.json()},
         )
-        response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
+        time.sleep(3)
         return response.json()

+ 53 - 0
article_association_task.py

@@ -0,0 +1,53 @@
+"""
+@author: luojunhui
+"""
+import traceback
+from argparse import ArgumentParser
+
+from applications import bot
+from coldStartTasks.crawler.wechat import ArticleAssociationCrawler
+from coldStartTasks.publish.publish_article_association_articles import ArticleAssociationPublish
+
+
+def main():
+    """
+    main function
+    """
+    parser = ArgumentParser()
+    parser.add_argument("--biz_date", type=str, help="format 2025-01-01")
+    args = parser.parse_args()
+
+    if args.biz_date:
+        biz_date = args.biz_date
+    else:
+        biz_date = None
+    try:
+        article_association_crawler = ArticleAssociationCrawler()
+        article_association_crawler.deal(biz_date=biz_date)
+    except Exception as e:
+        bot(
+            title="It occurred an Exception in ArticleAssociationCrawler",
+            detail={
+                "Error": str(e),
+                "Traceback": traceback.format_exc()
+            },
+            mention=False,
+        )
+
+    # publish
+    try:
+        article_association_publish = ArticleAssociationPublish()
+        article_association_publish.deal()
+    except Exception as e:
+        bot(
+            title="It occurred an Exception in ArticleAssociationPublish",
+            detail={
+                "Error": str(e),
+                "Traceback": traceback.format_exc()
+            },
+            mention=False,
+        )
+
+
+if __name__ == "__main__":
+    main()

+ 81 - 117
cal_account_read_rate_avg_daily.py

@@ -7,14 +7,21 @@ from tqdm import tqdm
 from pandas import DataFrame
 from argparse import ArgumentParser
 from datetime import datetime
+from pymysql.cursors import DictCursor
 
-from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot, Functions, create_feishu_columns_sheet
-from applications.const import updateAccountReadRateTaskConst
-from config import apolloConfig
+from applications import bot, Functions, log
+from applications import create_feishu_columns_sheet
+from applications.db import DatabaseConnector
+from applications.const import UpdateAccountReadRateTaskConst
+from applications.utils import fetch_publishing_account_list
+from applications.utils import fetch_account_fans
+from config import apolloConfig, long_articles_config, piaoquan_crawler_config, denet_config
 
 
-const = updateAccountReadRateTaskConst()
+
+
+const = UpdateAccountReadRateTaskConst()
 config = apolloConfig()
 unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
+backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
 functions = Functions()
 read_rate_table = "long_articles_read_rate"
@@ -37,75 +44,7 @@ def filter_outlier_data(group, key='show_view_count'):
     return filtered_group
     return filtered_group
 
 
 
-def get_account_fans_by_dt(db_client) -> dict:
-    """
-    获取每个账号发粉丝,通过日期来区分
-    :return:
-    """
-    sql = f"""
-        SELECT 
-            t1.date_str, 
-            t1.fans_count, 
-            t2.gh_id
-        FROM datastat_wx t1
-        JOIN publish_account t2 ON t1.account_id = t2.id
-        WHERE 
-            t2.channel = 5 
-        AND t2.status = 1 
-        AND t1.date_str >= '2024-07-01' 
-        ORDER BY t1.date_str;
-    """
-    result = db_client.select(sql)
-    D = {}
-    for line in result:
-        dt = line[0]
-        fans = line[1]
-        gh_id = line[2]
-        if D.get(gh_id):
-            D[gh_id][dt] = fans
-        else:
-            D[gh_id] = {dt: fans}
-    return D
-
-
-def get_publishing_accounts(db_client) -> list[dict]:
-    """
-    获取每日正在发布的账号
-    :return:
-    """
-    sql = f"""
-    SELECT DISTINCT
-        t3.`name`,
-        t3.gh_id,
-        t3.follower_count,
-        t6.account_source_name,
-        t6.mode_type,
-        t6.account_type,
-        t6.`status`
-    FROM
-        publish_plan t1
-        JOIN publish_plan_account t2 ON t1.id = t2.plan_id
-        JOIN publish_account t3 ON t2.account_id = t3.id
-        LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
-        LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
-        LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
-    WHERE
-        t1.plan_status = 1
-        AND t3.channel = 5
-        -- AND t3.follower_count > 0
-        GROUP BY t3.id;
-    """
-    account_list = db_client.select(sql)
-    result_list = [
-        {
-            "account_name": i[0],
-            "gh_id": i[1]
-        } for i in account_list
-    ]
-    return result_list
-
-
-def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
+def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -> list[dict]:
     """
     """
     get articles details
     get articles details
     :return:
     :return:
@@ -116,47 +55,37 @@ def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
             FROM 
                 official_articles_v2
             WHERE 
-                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}';
+                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
             """
             """
-    result = db_client.select(sql)
-    response_list = [
-        {
-            "ghId": i[0],
-            "accountName": i[1],
-            "ItemIndex": i[2],
-            "show_view_count": i[3],
-            "publish_timestamp": i[4]
-        }
-        for i in result
-    ]
+    response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
     return response_list


-def cal_account_read_rate(gh_id_tuple) -> DataFrame:
+def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
     """
     """
     计算账号位置的阅读率
     计算账号位置的阅读率
     :return:
     :return:
     """
     """
-    pq_db = PQMySQL()
-    de_db = DeNetMysql()
     response = []
     response = []
-    fans_dict_each_day = get_account_fans_by_dt(db_client=de_db)
-    account_article_detail = get_account_articles_detail(
-        db_client=pq_db,
-        gh_id_tuple=gh_id_tuple
-    )
-    for line in account_article_detail:
+    for line in article_list:
         gh_id = line['ghId']
         gh_id = line['ghId']
         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
-        fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
+        fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+        if not fans:
+            fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
         if not fans:
         if not fans:
-            fans = int(unauthorized_account.get(gh_id, 0))
+            fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
+            log(
+                task='cal_read_rate_avg_task',
+                function='cal_account_read_rate',
+                message='未获取到粉丝,使用备份粉丝表',
+                data=line
+            )
         line['fans'] = fans
         line['fans'] = fans
-        if fans > 1000:
+        if fans > const.MIN_FANS:
             line['readRate'] = line['show_view_count'] / fans if fans else 0
             line['readRate'] = line['show_view_count'] / fans if fans else 0
             response.append(line)
             response.append(line)
-    return DataFrame(response,
-                     columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
+    return DataFrame(response, columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
 
 
 
 
 def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
 def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
@@ -168,7 +97,7 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
     min_time = max_time - const.STATISTICS_PERIOD

     # 通过
-    filterDataFrame = df[
+    filter_dataframe = df[
         (df["ghId"] == gh_id)
         & (min_time <= df["publish_timestamp"])
         & (df["publish_timestamp"] <= max_time)
@@ -176,13 +105,13 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
         ]

     # 用二倍标准差过滤
-    finalDF = filter_outlier_data(filterDataFrame)
+    final_dataframe = filter_outlier_data(filter_dataframe)

     return {
-        "read_rate_avg": finalDF['readRate'].mean(),
-        "max_publish_time": finalDF['publish_timestamp'].max(),
-        "min_publish_time": finalDF['publish_timestamp'].min(),
-        "records": len(finalDF)
+        "read_rate_avg": final_dataframe['readRate'].mean(),
+        "max_publish_time": final_dataframe['publish_timestamp'].max(),
+        "min_publish_time": final_dataframe['publish_timestamp'].min(),
+        "records": len(final_dataframe)
     }


@@ -204,7 +133,7 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
         WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
         ORDER BY dt_version DESC limit 1;
     """
-    result = db_client.select(select_sql)
+    result = db_client.fetch(select_sql)
     if result:
         account_name = result[0][0]
         previous_read_rate_avg = result[0][1]
@@ -246,6 +175,9 @@ def update_single_day(dt, account_list, article_df, lam):
         string_format='%Y-%m-%d'
     )

+    # processed_account_set
+    processed_account_set = set()
+
     for account in tqdm(account_list, desc=dt):
         for index in const.ARTICLE_INDEX_LIST:
             read_rate_detail = cal_avg_account_read_rate(
@@ -259,7 +191,9 @@ def update_single_day(dt, account_list, article_df, lam):
             min_publish_time = read_rate_detail['min_publish_time']
             articles_count = read_rate_detail['records']
             if articles_count:
-                if index in {1, 2}:
+                processed_account_set.add(account['gh_id'])
+                # check read rate in position 1 and 2
+                if index in [1, 2]:
                     error_obj = check_each_position(
                         db_client=lam,
                         gh_id=account['gh_id'],
@@ -269,6 +203,7 @@ def update_single_day(dt, account_list, article_df, lam):
                     )
                     if error_obj:
                         error_list.append(error_obj)
+                # insert into database
                 try:
                     if not read_rate_avg:
                         continue
@@ -278,8 +213,8 @@ def update_single_day(dt, account_list, article_df, lam):
                         values
                         (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                     """
-                    lam.update(
-                        sql=insert_sql,
+                    lam.save(
+                        query=insert_sql,
                         params=(
                             account['account_name'],
                             account['gh_id'],
@@ -294,14 +229,17 @@ def update_single_day(dt, account_list, article_df, lam):
                         )
                     )
                 except Exception as e:
+                    print(e)
                     insert_error_list.append(str(e))

+    # bot sql error
     if insert_error_list:
         bot(
             title="更新阅读率均值,存在sql 插入失败",
             detail=insert_error_list
         )

+    # bot outliers
     if error_list:
         columns = [
             create_feishu_columns_sheet(sheet_type="plain_text", sheet_name="account_name", display_name="账号名称"),
@@ -314,7 +252,7 @@ def update_single_day(dt, account_list, article_df, lam):
                                         display_name="相对变化率")
         ]
         bot(
-            title="更新阅读率均值,头次出现异常值通知",
+            title="阅读率均值表异常信息, 总共处理{}个账号".format(len(processed_account_set)),
             detail={
                 "columns": columns,
                 "rows": error_list
@@ -323,12 +261,14 @@ def update_single_day(dt, account_list, article_df, lam):
             mention=False
         )

+    # if no error, send success info
     if not error_list and not insert_error_list:
         bot(
-            title="阅读率均值表更新成功",
+            title="阅读率均值表更新成功, 总共处理{}个账号".format(len(processed_account_set)),
             detail={
                 "日期": dt
-            }
+            },
+            mention=False
         )


@@ -347,12 +287,36 @@ def main() -> None:
     else:
         dt = datetime.today().strftime('%Y-%m-%d')

-    lam = longArticlesMySQL()
-    de = DeNetMysql()
-    account_list = get_publishing_accounts(db_client=de)
-    df = cal_account_read_rate(tuple([i['gh_id'] for i in account_list]))
+    # init stat period
+    max_time = functions.str_to_timestamp(date_string=dt)
+    min_time = max_time - const.STATISTICS_PERIOD
+    min_stat_date = functions.timestamp_to_str(timestamp=min_time, string_format='%Y-%m-%d')
+
+    # init database connector
+    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
+    long_articles_db_client.connect()
+
+    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
+    piaoquan_crawler_db_client.connect()
+
+    denet_db_client = DatabaseConnector(db_config=denet_config)
+    denet_db_client.connect()
+
+    # get account list
+    account_list = fetch_publishing_account_list(db_client=denet_db_client)
+
+    # get fans dict
+    fans_dict = fetch_account_fans(db_client=denet_db_client, start_date=min_stat_date)
+
+    # get data frame from official_articles_v2
+    gh_id_tuple = tuple([i['gh_id'] for i in account_list])
+    article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
+
+    # cal account read rate and make a dataframe
+    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)

-    update_single_day(dt, account_list, df, lam)
+    # update each day's data
+    update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)


 if __name__ == '__main__':

+ 1 - 1
coldStartTasks/crawler/__init__.py

@@ -2,4 +2,4 @@
 @author: luojunhui
 """
 from .weixin_account_crawler import WeixinAccountCrawler
-from .weixin_video_crawler import WeixinVideoCrawler
+from .weixin_video_crawler import WeixinVideoCrawler

+ 1 - 0
coldStartTasks/crawler/baidu/__init__.py

@@ -0,0 +1 @@
+from .video_crawler import BaiduVideoCrawler

+ 1 - 1
coldStartTasks/publish/publishArticleAssociationArticles.py → coldStartTasks/crawler/baidu/account_crawler.py

@@ -1,4 +1,4 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
-发布i2i文章
 """
 """
+

+ 96 - 0
coldStartTasks/crawler/baidu/baidu_spider.py

@@ -0,0 +1,96 @@
+"""
+@author: luojunhui
+"""
+import base64
+import uuid
+
+import requests
+
+from fake_useragent import FakeUserAgent
+
+from applications.exception import SpiderError
+from applications import Functions
+
+functions = Functions()
+
+
+def baidu_account_video_crawler(account_id, cursor=None):
+    """
+    baidu account video crawler
+    :param account_id: 百度账号id
+    :param cursor: 游标, 默认为None,表示从最新的开始爬取
+    success requests:
+    """
+    cookie_str = uuid.uuid4().__str__().replace('-', '').upper()
+    url = "https://haokan.baidu.com/web/author/listall?"
+    params = {
+        'app_id': account_id,
+        'ctime': cursor,
+        'rn': 10,
+        'searchAfter': '',
+        '_api': 1
+    }
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh,zh-CN;q=0.9',
+        'Connection': 'keep-alive',
+        'Referer': 'https://haokan.baidu.com/author/{}'.format(account_id),
+        'User-Agent': FakeUserAgent().chrome,
+        'x-requested-with': 'xmlhttprequest',
+        'Cookie': 'BAIDUID={}:FG=1; BAIDUID_BFESS={}:FG=1'.format(cookie_str, cookie_str)
+    }
+    try:
+        response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
+        response_json = response.json()
+        if response_json['errmsg'] == '成功':
+            response_data = response_json['data']
+            return response_data
+        else:
+            raise SpiderError(
+                platform="baidu",
+                spider="account_video_crawler",
+                error=response_json['errmsg'],
+                url=url
+            )
+
+    except Exception as e:
+        raise SpiderError(
+            platform="baidu",
+            spider="account_video_crawler",
+            error=str(e),
+            url=url
+        )
+
+
+def baidu_single_video_crawler(video_id):
+    """
+    baidu video crawler
+    :param video_id: 视频id
+    """
+    url = "https://haokan.baidu.com/v"
+    params = {
+        'vid': video_id,
+        '_format': 'json'
+    }
+    base_64_string = base64.b64encode(str(uuid.uuid4()).encode()).decode()
+    headers = {
+        'Accept': '*/*',
+        'cookie': "BIDUPSID={}".format(base_64_string),
+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'Referer': 'https://haokan.baidu.com',
+        'User-Agent': FakeUserAgent().chrome,
+    }
+    try:
+        response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
+        response_json = response.json()
+        return response_json['data']['apiData']['curVideoMeta']
+    except Exception as e:
+        raise SpiderError(
+            platform="baidu",
+            spider="single_video_crawler",
+            error=str(e),
+            url=url
+        )

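
A minimal usage sketch for the two helpers above: page through an author feed with the ctime cursor, then resolve each vid to full metadata. The field names (results, has_more, ctime, content.vid) are the ones video_crawler.py below relies on; the loop itself is illustrative, not the task's actual control flow.

from coldStartTasks.crawler.baidu.baidu_spider import (
    baidu_account_video_crawler,
    baidu_single_video_crawler,
)

def iter_account_videos(account_id, stop_cursor=0):
    # paginate the author feed; cursor=None starts from the newest videos
    cursor = None
    while True:
        page = baidu_account_video_crawler(account_id, cursor=cursor)
        for item in page.get("results", []):
            if item.get("type") == "video":
                # resolve the short feed entry into full video metadata
                yield baidu_single_video_crawler(item["content"]["vid"])
        if not page.get("has_more"):
            break
        cursor = page.get("ctime")
        if stop_cursor and cursor and cursor < stop_cursor:
            break
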
+ 269 - 0
coldStartTasks/crawler/baidu/video_crawler.py

@@ -0,0 +1,269 @@
+"""
+@author: luojunhui
+@description: video crawler
+"""
+
+import os
+import json
+import time
+import traceback
+from typing import List, Dict
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import Functions
+from applications import bot, log
+from applications.const import BaiduVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.exception import SpiderError
+from config import long_articles_config
+from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
+from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
+
+const = BaiduVideoCrawlerConst()
+empty_list = []
+functions = Functions()
+
+
+class BaiduVideoCrawler(object):
+    """
+    baidu video crawler
+    """
+
+    def __init__(self):
+        self.db = None
+        self.success_crawler_video_count = 0
+        self.connect_db()
+
+    def connect_db(self) -> None:
+        """
+        connect db
+        """
+        self.db = DatabaseConnector(db_config=long_articles_config)
+        self.db.connect()
+
+    def get_account_list(self) -> List[Dict]:
+        """
+        get account list
+        """
+        sql = f"""
+            select account_id, account_name, max_cursor 
+            from baidu_account_for_videos
+            where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
+        """
+        account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def whether_video_exists(self, title: str) -> bool:
+        """
+        whether video exists, use video_id && title
+        """
+        # check title
+        sql = f"""
+            select id from publish_single_video_source
+            where article_title = %s;
+        """
+        duplicate_id = self.db.fetch(query=sql, params=(title,))
+        if duplicate_id:
+            print(title + " video exists")
+            return True
+
+        return False
+
+    def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None:
+        """
+        download and save each video
+        """
+        # print(json.dumps(video, ensure_ascii=False, indent=4))
+        video_id = video["id"]
+        title = video["title"]
+
+        # judge whether video exists
+        if self.whether_video_exists(title):
+            return
+
+        read_cnt = video.get("playcnt", 0)
+        like_cnt = video.get("like_num", 0)
+        publish_timestamp = video["publish_time"]
+        # duration = video['duration']
+        cover_url = video["poster"]
+        video_url = video["playurl"]
+        # sensitive_flag = video.get('sensitive_flag')
+        video_more_info = video.get("contentcms_intervene_data")
+        if video_more_info:
+            video_category_list = video_more_info.get("category_v2")
+            if video_category_list:
+                video_category = video_category_list[0]
+            else:
+                video_category = None
+        else:
+            video_category = None
+        manual_tags = video.get("manual_tags")
+
+        video_path = os.path.join(const.LOCAL_PATH_DIR, "{}.mp4".format(video_id))
+        download_path = functions.download_baidu_videos(video_url, video_path)
+        if download_path:
+            oss_path = functions.upload_to_oss(local_video_path=download_path)
+            insert_sql = f"""
+                INSERT INTO publish_single_video_source
+                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
+                values
+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+            """
+            try:
+                self.db.save(
+                    query=insert_sql,
+                    params=(
+                        "video{}".format(functions.str_to_md5(video_id)),
+                        title,
+                        account_id,
+                        account_name,
+                        read_cnt,
+                        like_cnt,
+                        video_url,
+                        cover_url,
+                        oss_path,
+                        publish_timestamp,
+                        int(time.time()),
+                        video_id,
+                        video_category,
+                        (
+                            json.dumps(manual_tags, ensure_ascii=False)
+                            if manual_tags
+                            else None
+                        ),
+                        "hksp",
+                        const.NO_SOURCE_ACCOUNT_STATUS,
+                    ),
+                )
+                self.success_crawler_video_count += 1
+            except Exception as e:
+                log(
+                    task="baidu_video_crawler",
+                    function="save_each_video",
+                    message="save video failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video_id": video_id,
+                        "oss_path": oss_path,
+                    },
+                )
+        else:
+            print(f"download video failed, video_id: {video_id}")
+
+    def save_video_list(
+        self, account_id: str, account_name: str, video_list: List[Dict]
+    ) -> None:
+        """
+        save video list
+        """
+        progress_bar = tqdm(video_list, desc="crawler account: {}".format(account_name))
+        for video_obj in progress_bar:
+            if video_obj["type"] == "video":
+                video_id = video_obj["content"]["vid"]
+                try:
+                    video_detail = baidu_single_video_crawler(video_id)
+                    self.save_each_video(
+                        video=video_detail,
+                        account_id=account_id,
+                        account_name=account_name,
+                    )
+                    progress_bar.set_postfix({"videoId": video_id})
+                except SpiderError as e:
+                    print("save single video fail", e)
+                    continue
+            else:
+                continue
+
+    def crawler_each_account(self, account: Dict, cursor=None) -> None:
+        """
+        crawler each account
+        response_strategy
+        """
+        account_id = account["account_id"]
+        max_cursor = account["max_cursor"]
+        if not max_cursor:
+            max_cursor = const.DEFAULT_CURSOR
+        account_name = account["account_name"]
+        try:
+            response_json = baidu_account_video_crawler(account_id, cursor=cursor)
+
+            video_list = response_json.get("results", empty_list)
+            if video_list:
+                self.save_video_list(
+                    account_id=account_id,
+                    account_name=account_name,
+                    video_list=video_list,
+                )
+            # check next page
+            has_next_page = response_json.get("has_more", False)
+            if has_next_page:
+                next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR)
+                if next_cursor < max_cursor:
+                    print("No more videos after 2024-01-01")
+                    return
+                else:
+                    return self.crawler_each_account(account, next_cursor)
+        except SpiderError as e:
+            print(e)
+            return
+
+    def update_cursor(self, account_id: str) -> None:
+        """
+        update cursor for each account
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = '{account_id}';
+        """
+        response_mysql = self.db.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+        if max_publish_timestamp:
+            max_cursor = max_publish_timestamp * const.TIMESTAMP_TO_CURSOR
+            update_sql = f"""
+                update baidu_account_for_videos
+                set max_cursor = %s
+                where account_id = %s;
+            """
+            self.db.save(query=update_sql, params=(max_cursor, account_id))
+
+    def deal(self) -> None:
+        """
+        deal
+        """
+        account_list = self.get_account_list()
+        success_cnt = 0
+        fail_cnt = 0
+        account_list_process_bar = tqdm(account_list, desc="process account list")
+        for account in account_list_process_bar:
+            try:
+                account_list_process_bar.set_postfix(
+                    {"account_name": account["account_name"]}
+                )
+                self.crawler_each_account(account)
+                self.update_cursor(account["account_id"])
+                success_cnt += 1
+            except Exception as e:
+                fail_cnt += 1
+                log(
+                    task="baidu_video_crawler",
+                    function="deal",
+                    message="crawler each account failed",
+                    data={
+                        "account_id": account["account_id"],
+                        "account_name": account["account_name"],
+                        "error": str(e),
+                        "trace_back": traceback.format_exc(),
+                    },
+                )
+        bot(
+            title="baidu video crawler task finished",
+            detail={
+                "success_crawl_account_num": success_cnt,
+                "fail_crawl_account_num": fail_cnt,
+                "success_crawl_video_num": self.success_crawler_video_count,
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt),
+            },
+            mention=False,
+        )

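
The class is meant to be driven end-to-end through deal(); a minimal driver sketch (scheduling and CLI wiring are left to the caller):

from coldStartTasks.crawler.baidu import BaiduVideoCrawler

if __name__ == "__main__":
    task = BaiduVideoCrawler()
    task.deal()  # crawls each good-status account, saves videos, then advances max_cursor
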
+ 6 - 0
coldStartTasks/crawler/channels/__init__.py

@@ -0,0 +1,6 @@
+"""
+@author: luojunhui
+@tool: pycharm && deepseek
+"""
+from .blogger import get_channel_account_videos
+from .search import search_in_wechat_channel

+ 22 - 0
coldStartTasks/crawler/channels/blogger.py

@@ -0,0 +1,22 @@
+"""
+@author: luojunhui
+"""
+
+import requests
+import json
+
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
+
+
+def get_channel_account_videos(user_id, last_buffer=""):
+    """
+    get channel account videos
+    """
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
+    )
+    result = channel_api.get_channel_video_list(user_id, last_buffer)
+    return result

+ 41 - 0
coldStartTasks/crawler/channels/search.py

@@ -0,0 +1,41 @@
+"""
+@author: luojunhui
+"""
+
+from typing import Dict
+
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
+
+
+def search_in_wechat_channel(
+    search_key: str,
+    search_type: int,
+    page: int = 0,
+    cookie: str = "",
+    search_id: str = "",
+    offset: int = 0,
+) -> Dict:
+    """
+    :param search_key: 搜索关键字
+    :param search_type: 搜索类型,1: 搜索所有视频, 2: 搜索视频号账号
+    :param page: 页码
+    :param cookie: 登录后的cookie
+    :param search_id: 搜索id
+    :param offset: 偏移量
+    :return: result_list
+    """
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
+    )
+    result = channel_api.search(
+        search_key=search_key,
+        search_type=search_type,
+        page=page,
+        cookie=cookie,
+        search_id=search_id,
+        offset=offset
+    )
+    return result

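
A sketch of how the two channel helpers combine: search for an account (search_type=2), then page its videos. The exact response schema comes from the gewe API and is not shown in this diff, so user_id below is a placeholder rather than a documented field.

from coldStartTasks.crawler.channels import search_in_wechat_channel, get_channel_account_videos

accounts = search_in_wechat_channel(search_key="健康养生", search_type=2)
user_id = "v2_placeholder_user_id"  # placeholder: take the account id from the search result
videos = get_channel_account_videos(user_id, last_buffer="")
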
+ 4 - 0
coldStartTasks/crawler/toutiao/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .blogger import get_toutiao_account_video_list

+ 64 - 0
coldStartTasks/crawler/toutiao/blogger.py

@@ -0,0 +1,64 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import json
+import requests
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from .use_js import call_js_function
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_toutiao_account_video_list(
+    account_id: str, cookie: str, max_behot_time=0
+) -> dict | None:
+    """
+    get toutiao account video list
+    :param account_id: toutiao account id
+    :param cookie: cookie maybe expire not quite sure
+    :param max_behot_time: max behot time
+    :return: toutiao account video list
+    """
+    ms_token = "mFs9gU4FJc23gFWPvBfQxFsBRrx1xBEJD_ZRTAolHfPrae84kTEBaHQR3s8ToiLX4-U9hgATTZ2cVHlSixmj5YCTOPoVM-43gOt3aVHkxfXHEuUtTJe-wUEs%3D"
+    query_params = [
+        0,
+        1,
+        14,
+        "category=pc_user_hot&token={}&aid=24&app_name=toutiao_web&msToken={}".format(
+            account_id, ms_token
+        ),
+        "",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    ]
+    a_bogus = call_js_function(query_params)
+    url = f"https://www.toutiao.com/api/pc/list/user/feed?category=pc_profile_video&token={account_id}&max_behot_time={max_behot_time}&hot_video=0&entrance_gid=&aid=24&app_name=toutiao_web&msToken={ms_token}&a_bogus={a_bogus}"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+        "cookie": cookie,
+    }
+    try:
+        response = requests.get(url, headers=headers, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="toutiao account crawler",
+            function="get_toutiao_account_video_list",
+            message=f"API请求失败: {e}",
+            data={"account_id": account_id},
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="toutiao account crawler",
+            function="get_toutiao_account_video_list",
+            message=f"响应解析失败: {e}",
+            data={"account_id": account_id},
+        )
+    return None

+ 25 - 0
coldStartTasks/crawler/toutiao/use_js.py

@@ -0,0 +1,25 @@
+"""
+@author: luojunhui
+"""
+import json
+import subprocess
+
+from config import toutiao_js_path
+
+
+def call_js_function(arguments_list):
+    """
+    call js function
+    """
+    # 将参数转换为JSON字符串
+    args_json = json.dumps(arguments_list)
+    # 调用Node.js执行脚本
+    result = subprocess.run(
+        ['node', toutiao_js_path, args_json],
+        capture_output=True,
+        text=True
+    )
+    if result.returncode == 0:
+        return result.stdout.strip()
+    else:
+        raise Exception(f"Error: {result.stderr}")

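
A sketch of the call pattern used by blogger.py above: the argument list is serialised to JSON and handed to the Node script at toutiao_js_path; the stripped stdout is the computed signature. The parameter values here are illustrative only.

from coldStartTasks.crawler.toutiao.use_js import call_js_function

query_params = [0, 1, 14, "category=pc_user_hot&token=<account_id>&aid=24", "", "Mozilla/5.0"]
a_bogus = call_js_function(query_params)  # raises if the node process exits non-zero
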
+ 4 - 0
coldStartTasks/crawler/wechat/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .article_association import ArticleAssociationCrawler

+ 210 - 0
coldStartTasks/crawler/wechat/article_association.py

@@ -0,0 +1,210 @@
+"""
+@author: luojunhui
+"""
+
+import time
+import traceback
+from datetime import datetime
+
+import numpy as np
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+
+from applications import WeixinSpider, log
+from applications.api import similarity_between_title_list
+from applications.const import ColdStartTaskConst
+from applications.db import DatabaseConnector
+from applications.functions import Functions
+from applications.utils import get_inner_account_set
+from applications.utils import whether_title_sensitive
+from config import long_articles_config
+
+spider = WeixinSpider()
+functions = Functions()
+const = ColdStartTaskConst()
+
+
+class ArticleAssociationCrawler(object):
+    """
+    article association crawler task
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+        self.inner_account_set = get_inner_account_set()
+
+    def get_seed_url_list(self, biz_date):
+        """
+        获取种子url列表
+        """
+        sql = f"""
+            select gh_id, title, link
+            from datastat_sort_strategy
+            where date_str > DATE_FORMAT(DATE_SUB('{biz_date}', INTERVAL 2 DAY), '%Y%m%d') 
+                and view_count > {const.READ_COUNT_THRESHOLD} 
+                and read_rate > {const.READ_AVG_THRESHOLD} 
+                and type = {const.BULK_PUBLISH_TYPE}
+            order by read_rate desc 
+            limit {const.SEED_ARTICLE_LIMIT_NUM};
+        """
+        seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return seed_article_list
+
+    def get_level_up_title_list(self):
+        """
+        获取晋级文章标题列表
+        status: 1 表示文章已经溯源完成
+        deleted: 0 表示文章正常
+        level = 'autoArticlePoolLevel1' 表示头条
+        """
+        sql = f"""
+            select distinct title 
+            from article_pool_promotion_source 
+            where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
+        """
+        mysql_response = self.db_client.fetch(query=sql)
+        title_list = [i[0] for i in mysql_response]
+        return title_list
+
+    def get_recommend_url_list_with_depth(
+        self, seed_url, source_title, source_account, base_title_list, depth=1
+    ):
+        """
+        @param seed_url: good url from data_sort_strategy
+        @param depth: association depth
+        @param source_title: article title
+        @param source_account: article account
+        """
+        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
+            return
+
+        res = spider.get_recommend_articles(content_link=seed_url)
+        related_articles = res["data"]["data"]["list"]
+        if related_articles:
+            title_list = [i["title"] for i in related_articles]
+            similarity_array = similarity_between_title_list(
+                title_list, base_title_list
+            )
+
+            recommend_articles = []
+            for index, score_list in enumerate(similarity_array):
+                sorted_score_list = sorted(score_list)
+                percent_threshold_score = np.percentile(
+                    sorted_score_list, const.PERCENT_THRESHOLD
+                )
+                if percent_threshold_score < const.CORRELATION_THRESHOLD:
+                    continue
+
+                else:
+                    article_obj = related_articles[index]
+                    article_obj["score"] = percent_threshold_score
+                    recommend_articles.append(article_obj)
+
+            recommend_process_bar = tqdm(
+                recommend_articles, desc="save recommend articles"
+            )
+            for article in recommend_process_bar:
+                obj = {
+                    "title": article["title"],
+                    "url": article["url"],
+                    "gh_id": article["username"],
+                    "index": article["idx"],
+                    "send_time": article["send_time"],
+                    "read_cnt": article["read_num"],
+                    "depth": depth,
+                    "source_article_title": source_title,
+                    "source_account": source_account,
+                }
+                self.insert_recommend_article(obj)
+                recommend_process_bar.set_postfix(
+                    {"title": article["title"], "depth": depth}
+                )
+                self.get_recommend_url_list_with_depth(
+                    seed_url=obj["url"],
+                    source_title=obj["title"],
+                    source_account=obj["gh_id"],
+                    base_title_list=base_title_list,
+                    depth=depth + 1,
+                )
+        else:
+            return
+
+    def insert_recommend_article(self, obj):
+        """
+        insert recommend article
+        """
+        # whether account inside
+        if obj["gh_id"] in self.inner_account_set:
+            return
+
+        # whether article title exists
+        title = obj["title"]
+        select_sql = "select article_id from crawler_meta_article where title = %s;"
+        res = self.db_client.fetch(query=select_sql, params=(title,))
+        if res:
+            return
+
+        # whether title sensitive
+        title_sensitivity = (
+            const.TITLE_SENSITIVE
+            if whether_title_sensitive(title)
+            else const.TITLE_NOT_SENSITIVE
+        )
+
+        # insert this article
+        insert_sql = f"""
+            insert into crawler_meta_article 
+            (platform, mode, category, out_account_id, article_index, title, link, read_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account, title_sensitivity)
+            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+        self.db_client.save(
+            query=insert_sql,
+            params=(
+                "weixin",
+                "recommend",
+                "article_association",
+                obj["gh_id"],
+                obj["index"],
+                obj["title"],
+                obj["url"],
+                obj["read_cnt"],
+                obj["send_time"],
+                int(time.time()),
+                const.DEFAULT_ARTICLE_STATUS,
+                functions.generateGzhId(obj["url"]),
+                obj["source_article_title"],
+                obj["source_account"],
+                title_sensitivity,
+            ),
+        )
+
+    def deal(self, biz_date=None):
+        """
+        class entrance
+        :param biz_date:
+        """
+        if biz_date is None:
+            biz_date = datetime.today().strftime("%Y-%m-%d")
+
+        seed_article_list = self.get_seed_url_list(biz_date)
+        deal_bar = tqdm(seed_article_list, desc="article association crawler")
+        base_title_list = self.get_level_up_title_list()
+        for article in deal_bar:
+            try:
+                self.get_recommend_url_list_with_depth(
+                    seed_url=article["link"],
+                    source_title=article["title"],
+                    source_account=article["gh_id"],
+                    base_title_list=base_title_list,
+                )
+                deal_bar.set_postfix({"article_title": article["title"]})
+            except Exception as e:
+                log(
+                    task="article_association_crawler",
+                    function="deal",
+                    message=f"article association crawler error, article title: {article['title']}, error: {e}",
+                    data={"article": article, "traceback": traceback.format_exc()},
+                )

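
Direct invocation needs nothing beyond an optional business date; a sketch, with the date value as a placeholder:

from coldStartTasks.crawler.wechat import ArticleAssociationCrawler

crawler = ArticleAssociationCrawler()
crawler.deal(biz_date="2025-02-20")  # omit biz_date to default to today
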
+ 1 - 25
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -8,7 +8,7 @@ import time
 from tqdm import tqdm
 from pymysql.cursors import DictCursor

-from applications import WeixinSpider, Functions, llm_sensitivity, log
+from applications import WeixinSpider, Functions, log
 from coldStartTasks.filter import article_crawler_duplicate_filter
 from config import apolloConfig

@@ -158,18 +158,6 @@ class weixinCategory(object):
                     print(e)
         return success_records

-    def update_article_sensitive_status(self, category, unique_index, status):
-        """
-        更新文章敏感状态
-        :return:
-        """
-        update_sql = f"""
-            update crawler_meta_article
-            set llm_sensitivity = %s
-            where category = %s and unique_index = %s;
-        """
-        self.db_client_lam.update(sql=update_sql, params=(status, category, unique_index))
-
     def update_latest_account_timestamp(self, gh_id):
         """
         更新账号的最新时间戳
@@ -242,18 +230,6 @@ class weixinCategory(object):
                 print("success")
                 print("success")
             except Exception as e:
             except Exception as e:
                 print("fail because of {}".format(e))
                 print("fail because of {}".format(e))
-        success_titles = [x['title'] for x in success_records]
-        if success_titles:
-            try:
-                sensitive_results = llm_sensitivity.check_titles(success_titles)
-                for record, sensitive_result in zip(success_records, sensitive_results):
-                    self.update_article_sensitive_status(
-                        category=category,
-                        unique_index=record['unique_index'],
-                        status=sensitive_result['hit_rule']
-                    )
-            except Exception as e:
-                print("failed to update sensitive status: {}".format(e))
 
 
     def deal(self, category_list, date_str):
     def deal(self, category_list, date_str):
         """
         """

+ 1 - 1
coldStartTasks/crawler/weixin_account_association_crawler.py

@@ -229,7 +229,7 @@ class AccountAssociationCrawler(object):
                     task="account_association",
                     task="account_association",
                     function="run_account_association",
                     function="run_account_association",
                     data={
                     data={
-                        "biz_date": biz_date,
+                        "biz_date": biz_date.strftime("%Y-%m-%d"),
                         "article": article,
                         "article": article,
                         "trace_back": traceback.format_exc(),
                         "trace_back": traceback.format_exc(),
                         "error": f"{e}"
                         "error": f"{e}"

+ 3 - 1
coldStartTasks/crawler/weixin_account_crawler.py

@@ -44,7 +44,9 @@ class WeixinAccountCrawler(object):
         sql = f"""
         sql = f"""
             SELECT id, article_url
             SELECT id, article_url
             FROM publish_single_video_source
             FROM publish_single_video_source
-            WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT};
+            WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT} 
+            and bad_status = {const.TITLE_DEFAULT_STATUS}
+            and platform = 'gzh' limit 1000;
         """
         """
         article_url_list = self.db_client.select(sql, cursor_type=DictCursor)
         article_url_list = self.db_client.select(sql, cursor_type=DictCursor)
         return article_url_list
         return article_url_list

+ 8 - 3
coldStartTasks/crawler/weixin_video_crawler.py

@@ -87,7 +87,8 @@ class WeixinVideoCrawler(object):
         select_sql = f"""
         select_sql = f"""
             SELECT gh_id, account_name, latest_crawler_timestamp
             SELECT gh_id, account_name, latest_crawler_timestamp
             FROM weixin_account_for_videos
             FROM weixin_account_for_videos
-            WHERE status = {const.ACCOUNT_CRAWL_STATUS};
+            WHERE status = {const.ACCOUNT_CRAWL_STATUS}
+            ORDER BY latest_crawler_timestamp;
         """
         """
         response = self.db_client.select(select_sql, DictCursor)
         response = self.db_client.select(select_sql, DictCursor)
         return response
         return response
@@ -158,10 +159,15 @@ class WeixinVideoCrawler(object):
                     url_unique = functions.generateGzhId(article_url)
                     # 判断该视频链接是否下载,若已经下载则直接跳过
                     if self.is_downloaded(url_unique):
+                        print("url exists")
+                        continue
+
+                    title = article.get("Title", None)
+                    if not title:
                         continue

                     # 判断标题是否重复
-                    if video_crawler_duplicate_filter(article_url, self.db_client):
+                    if video_crawler_duplicate_filter(title, self.db_client):
                         log(
                             task='weixin_video_crawler',
                             function="insert_msg_list",
@@ -174,7 +180,6 @@ class WeixinVideoCrawler(object):
                         download_path = functions.download_gzh_video(article_url)
                         if download_path:
                             oss_path = functions.upload_to_oss(local_video_path=download_path)
-                            title = article.get("Title", None)
                             position = article.get("ItemIndex", None)
                             cover_url = article.get("CoverImgUrl", None)
                             show_desc = article.get("ShowDesc", None)

+ 62 - 16
coldStartTasks/filter/title_similarity_task.py

@@ -54,7 +54,7 @@ class ColdStartTitleSimilarityTask(object):
         title_list = [i[0] for i in mysql_response]
         return title_list

-    def get_title_from_meta_base(self, limit):
+    def get_article_title_from_meta_base(self, limit):
         """
         获取meta_base表中文章标题列表
         status: 1 表示文章初始化状态
@@ -70,17 +70,56 @@ class ColdStartTitleSimilarityTask(object):
         mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return mysql_response

-    def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
+    def get_video_title_from_meta_table(self, limit):
         """
-        批量更新crawler_meta_article
+        获取meta_base表中视频标题列表
+        audit_status = 0 表示视频初始化状态
+        """
+        if limit:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null 
+                    and bad_status = 0
+                limit {limit};
+            """
+        else:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null
+                    and bad_status = 0;
+            """
+        mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return mysql_response
+
+    def update_meta_database_batch(self, meta_source: str, update_data_list: list[tuple]) -> int:
         """
         """
-        sql = """
-            update crawler_meta_article
-            set score = case article_id
-                {}
-            end
-            where article_id in %s and score is null;
+        批量更新crawler_meta_article
         """
         """
+        match meta_source:
+            case "video":
+                sql = """
+                    update publish_single_video_source
+                    set score = case id
+                        {}
+                    end
+                    where id in %s and score is null;
+                """
+            case "article":
+                sql = """
+                    update crawler_meta_article
+                    set score = case article_id
+                        {}
+                    end
+                    where article_id in %s and score is null;
+                """
+            case _:
+                print("source_type is not valid")
+                return 0
+
         case_statement = []
         article_id_list = []
         params = []
@@ -95,22 +134,29 @@ class ColdStartTitleSimilarityTask(object):
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows

-    def run(self, limit=None):
+    def run(self, meta_source, limit=None):
         """
         执行任务
         """
-        target_article_list = self.get_title_from_meta_base(limit=limit)
-        if not target_article_list:
+        match meta_source:
+            case "article":
+                target_list = self.get_article_title_from_meta_base(limit=limit)
+            case "video":
+                target_list = self.get_video_title_from_meta_table(limit=limit)
+            case _:
+                print("meta_source is not valid")
+                return
+
+        if not target_list:
             print("No more articles to process.")
             print("No more articles to process.")
             return
             return
 
 
         base_title_list = self.get_level_up_title_list()
         base_title_list = self.get_level_up_title_list()
-
-        batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
+        batch_task_list = chunks(target_list, ARTICLE_BATCH)
 
 
         for batch_task in batch_task_list:
         for batch_task in batch_task_list:
             try:
             try:
-                batch_target_title_list = [i['title'] for i in batch_task]
+                batch_target_title_list = [i['title'][:30] for i in batch_task]
                 similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
                 similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
 
 
                 update_data_list = []
                 update_data_list = []
@@ -119,7 +165,7 @@ class ColdStartTitleSimilarityTask(object):
                     percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
                     update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))

-                affected_rows = self.update_meta_article_batch(update_data_list)
+                affected_rows = self.update_meta_database_batch(meta_source=meta_source, update_data_list=update_data_list)

                 print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
             except Exception as e:

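
A usage sketch of the reworked task: the meta_source switch selects which meta table gets scored. The no-argument constructor is an assumption, since the class's __init__ is not shown in this hunk.

from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask

task = ColdStartTitleSimilarityTask()        # constructor arguments not shown in this hunk
task.run(meta_source="article", limit=1000)  # scores crawler_meta_article
task.run(meta_source="video")                # scores publish_single_video_source
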
+ 276 - 0
coldStartTasks/publish/basic.py

@@ -0,0 +1,276 @@
+"""
+@author: luojunhui
+"""
+
+import json
+import time
+import datetime
+import pandas as pd
+import traceback
+
+from pandas import DataFrame
+from tqdm import tqdm
+
+from applications import log, aiditApi, bot
+from applications.const import ColdStartTaskConst
+from config import apolloConfig
+
+const = ColdStartTaskConst()
+config = apolloConfig()
+
+category_cold_start_threshold = json.loads(
+    config.getConfigValue("category_cold_start_threshold")
+)
+READ_TIMES_THRESHOLD = category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
+READ_THRESHOLD = category_cold_start_threshold.get("READ_THRESHOLD", 5000)
+LIMIT_TITLE_LENGTH = category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
+TITLE_LENGTH_MAX = category_cold_start_threshold.get("TITLE_LENGTH_MAX", 50)
+
+
+def get_article_from_meta_table(db_client, category: str, platform: str) -> DataFrame:
+    """
+    get article from meta data
+    :param db_client: database connector
+    :param category: article category
+    :param platform: article platform
+    :return: article dataframe
+    """
+    sql = f"""
+        select 
+            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
+        from crawler_meta_article
+        where category = "{category}" and platform = "{platform}" and title_sensitivity = {const.TITLE_NOT_SENSITIVE}
+        order by score desc;
+    """
+    article_list = db_client.fetch(sql)
+    log(
+        task="category_publish_task",
+        function="get_articles_from_meta_table",
+        message="获取品类文章总数",
+        data={"total_articles": len(article_list), "category": category},
+    )
+    article_df = pd.DataFrame(
+        article_list,
+        columns=[
+            "article_id",
+            "gh_id",
+            "position",
+            "title",
+            "link",
+            "read_cnt",
+            "status",
+            "llm_sensitivity",
+            "score",
+        ],
+    )
+    return article_df
+
+
+def update_published_articles_status(db_client) -> None:
+    """
+    filter published articles
+    """
+    category_map = json.loads(config.getConfigValue("category_cold_start_map"))
+    category_list = list(category_map.keys())
+    processing_bar = tqdm(category_list, desc="update_published_articles")
+    for category in processing_bar:
+        plan_id = category_map.get(category)
+        if plan_id:
+            article_list = aiditApi.get_generated_article_list(plan_id)
+            title_list = [i[1] for i in article_list]
+            if title_list:
+                update_sql = f"""
+                        update crawler_meta_article
+                        set status = %s 
+                        where title in %s and status = %s;
+                """
+                affected_rows = db_client.save(
+                    query=update_sql,
+                    params=(
+                        const.PUBLISHED_STATUS,
+                        tuple(title_list),
+                        const.INIT_STATUS,
+                    ),
+                )
+                processing_bar.set_postfix(
+                    {"category": category, "affected_rows": affected_rows}
+                )
+        else:
+            return
+
+
+def filter_by_read_times(article_df: DataFrame) -> DataFrame:
+    """
+    filter by read times
+    """
+    article_df["average_read"] = article_df.groupby(["gh_id", "position"])[
+        "read_cnt"
+    ].transform("mean")
+    article_df["read_times"] = article_df["read_cnt"] / article_df["average_read"]
+    filter_df = article_df[article_df["read_times"] >= READ_TIMES_THRESHOLD]
+    return filter_df
+
+
+def filter_by_status(article_df: DataFrame) -> DataFrame:
+    """
+    filter by status
+    """
+    filter_df = article_df[article_df["status"] == const.INIT_STATUS]
+    return filter_df
+
+
+def filter_by_read_cnt(article_df: DataFrame) -> DataFrame:
+    """
+    filter by read cnt
+    """
+    filter_df = article_df[article_df["read_cnt"] >= READ_THRESHOLD]
+    return filter_df
+
+
+def filter_by_title_length(article_df: DataFrame) -> DataFrame:
+    """
+    filter by title length
+    """
+    filter_df = article_df[
+        (article_df["title"].str.len() >= LIMIT_TITLE_LENGTH)
+        & (article_df["title"].str.len() <= TITLE_LENGTH_MAX)
+    ]
+    return filter_df
+
+
+def filter_by_sensitive_words(article_df: DataFrame) -> DataFrame:
+    """
+    filter by sensitive words
+    """
+    filter_df = article_df[
+        (~article_df["title"].str.contains("农历"))
+        & (~article_df["title"].str.contains("太极"))
+        & (~article_df["title"].str.contains("节"))
+        & (~article_df["title"].str.contains("早上好"))
+        & (~article_df["title"].str.contains("赖清德"))
+        & (~article_df["title"].str.contains("普京"))
+        & (~article_df["title"].str.contains("俄"))
+        & (~article_df["title"].str.contains("南海"))
+        & (~article_df["title"].str.contains("台海"))
+        & (~article_df["title"].str.contains("解放军"))
+        & (~article_df["title"].str.contains("蔡英文"))
+        & (~article_df["title"].str.contains("中国"))
+    ]
+    return filter_df
+
+
+def filter_by_similarity_score(article_df: DataFrame, score) -> DataFrame:
+    """
+    filter by similarity score
+    """
+    filter_df = article_df[article_df["score"] >= score]
+    return filter_df
+
+
+def insert_into_article_crawler_plan(
+    db_client, crawler_plan_id, crawler_plan_name, create_timestamp
+):
+    """
+    insert into article crawler plan
+    """
+    insert_sql = f"""
+        insert into article_crawler_plan (crawler_plan_id, name, create_timestamp)
+        values (%s, %s, %s);
+    """
+    try:
+        db_client.save(
+            query=insert_sql,
+            params=(crawler_plan_id, crawler_plan_name, create_timestamp),
+        )
+    except Exception as e:
+        bot(
+            title="品类冷启任务,记录抓取计划id失败",
+            detail={
+                "error": str(e),
+                "error_msg": traceback.format_exc(),
+                "crawler_plan_id": crawler_plan_id,
+                "crawler_plan_name": crawler_plan_name,
+            },
+        )
+
+
+def create_crawler_plan(url_list, plan_tag, platform) -> tuple:
+    """
+    create crawler plan
+    """
+    crawler_plan_response = aiditApi.auto_create_crawler_task(
+        plan_id=None,
+        plan_name="自动绑定-{}--{}--{}".format(
+            plan_tag, datetime.date.today().__str__(), len(url_list)
+        ),
+        plan_tag=plan_tag,
+        article_source=platform,
+        url_list=url_list,
+    )
+    log(
+        task="category_publish_task",
+        function="publish_filter_articles",
+        message="成功创建抓取计划",
+        data=crawler_plan_response,
+    )
+    # save to db
+    create_timestamp = int(time.time()) * 1000
+    crawler_plan_id = crawler_plan_response["data"]["id"]
+    crawler_plan_name = crawler_plan_response["data"]["name"]
+    return crawler_plan_id, crawler_plan_name, create_timestamp
+
+
+def bind_to_generate_plan(category, crawler_plan_id, crawler_plan_name, platform):
+    """
+    auto bind to generate plan
+    """
+    match platform:
+        case "weixin":
+            input_source_channel = 5
+        case "toutiao":
+            input_source_channel = 6
+        case _:
+            input_source_channel = 5
+
+    new_crawler_task_list = [
+        {
+            "contentType": 1,
+            "inputSourceType": 2,
+            "inputSourceSubType": None,
+            "fieldName": None,
+            "inputSourceValue": crawler_plan_id,
+            "inputSourceLabel": crawler_plan_name,
+            "inputSourceModal": 3,
+            "inputSourceChannel": input_source_channel,
+        }
+    ]
+    category_map = json.loads(config.getConfigValue("category_cold_start_map"))
+    generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
+        crawler_task_list=new_crawler_task_list, generate_task_id=category_map[category]
+    )
+    log(
+        task="category_publish_task",
+        function="publish_filter_articles",
+        message="成功绑定到生成计划",
+        data=generate_plan_response,
+    )
+
+
+def update_article_status_after_publishing(db_client, article_id_list):
+    """
+    update article status after publishing
+    """
+    update_sql = f"""
+        update crawler_meta_article
+        set status = %s
+        where article_id in %s and status = %s;
+    """
+    affect_rows = db_client.save(
+        query=update_sql,
+        params=(const.PUBLISHED_STATUS, tuple(article_id_list), const.INIT_STATUS),
+    )
+    if affect_rows != len(article_id_list):
+        bot(
+            title="品类冷启任务中,出现更新状文章状态失败异常",
+            detail={"affected_rows": affect_rows, "task_rows": len(article_id_list)},
+        )
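
Taken together, these helpers form one publish pipeline: create the crawler plan, record it, bind it to the category's generate plan, then mark the source articles as published. A sketch of the intended call order (the caller is assumed to supply a connected db_client and an already-filtered DataFrame; the same flow is used by ArticleAssociationPublish further down in this diff):

    from coldStartTasks.publish.basic import (
        create_crawler_plan,
        insert_into_article_crawler_plan,
        bind_to_generate_plan,
        update_article_status_after_publishing,
    )

    def publish_one_category(db_client, filtered_df, category, platform="weixin"):
        # sketch only: chain the basic.py helpers for a single category
        url_list = filtered_df["link"].values.tolist()
        if not url_list:
            return
        plan_id, plan_name, created_at = create_crawler_plan(url_list, plan_tag=category, platform=platform)
        insert_into_article_crawler_plan(db_client, plan_id, plan_name, created_at)
        bind_to_generate_plan(category, plan_id, plan_name, platform)
        update_article_status_after_publishing(db_client, filtered_df["article_id"].values.tolist())
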

+ 29 - 1
coldStartTasks/publish/publishCategoryArticles.py

@@ -9,7 +9,7 @@ import traceback
 
 from pandas import DataFrame
 
-from applications import aiditApi, log, bot
+from applications import aiditApi, log, bot, llm_sensitivity
 from config import apolloConfig
 
 apollo = apolloConfig()
@@ -297,6 +297,18 @@ class CategoryColdStartTask(object):
         )
         return zero_level_funnel_df
 
+    def update_article_sensitive_status(self, article_id, status):
+        """
+        更新文章敏感状态
+        :return:
+        """
+        update_sql = f"""
+            update crawler_meta_article
+            set llm_sensitivity = %s
+            where article_id = %s;
+        """
+        self.db_client.update(sql=update_sql, params=(status, article_id))
+
     def publish_filter_articles(self, category, articles_df, article_source):
         """
         过滤文章
@@ -315,6 +327,22 @@ class CategoryColdStartTask(object):
             case _:
                 return
 
+        success_titles = filtered_articles_df['title'].values.tolist()
+        article_id_list = filtered_articles_df['article_id'].values.tolist()
+        if success_titles:
+            try:
+                sensitive_results = llm_sensitivity.check_titles(success_titles)
+                for article_id, sensitive_result in zip(article_id_list, sensitive_results):
+                    self.update_article_sensitive_status(
+                        article_id=article_id,
+                        status=sensitive_result['hit_rule']
+                    )
+                    if sensitive_result['hit_rule'] > TITLE_NOT_SENSITIVE:
+                        filtered_articles_df = filtered_articles_df[filtered_articles_df['article_id'] != article_id]
+
+            except Exception as e:
+                print("failed to update sensitive status: {}".format(e))
+
         url_list = filtered_articles_df['link'].values.tolist()
         if url_list:
             # create_crawler_plan

+ 125 - 0
coldStartTasks/publish/publish_article_association_articles.py

@@ -0,0 +1,125 @@
+"""
+@author: luojunhui
+"""
+
+from pandas import DataFrame
+
+from applications import bot
+from applications.const import ColdStartTaskConst
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+from coldStartTasks.publish.basic import filter_by_status
+from coldStartTasks.publish.basic import filter_by_sensitive_words
+from coldStartTasks.publish.basic import filter_by_title_length
+from coldStartTasks.publish.basic import update_published_articles_status
+from coldStartTasks.publish.basic import get_article_from_meta_table
+from coldStartTasks.publish.basic import update_article_status_after_publishing
+from coldStartTasks.publish.basic import create_crawler_plan
+from coldStartTasks.publish.basic import insert_into_article_crawler_plan
+from coldStartTasks.publish.basic import bind_to_generate_plan
+
+const = ColdStartTaskConst()
+
+
+def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame:
+    """
+    filter articles before create plan
+    """
+    total_length = article_df.shape[0]
+
+    # filter by status
+    filter_df = filter_by_status(article_df)
+    filter_length0 = filter_df.shape[0]
+
+    # filter by sensitive words
+    filter_df = filter_by_sensitive_words(filter_df)
+    filter_length1 = filter_df.shape[0]
+
+    # filter by title length
+    filter_df = filter_by_title_length(filter_df)
+    filter_length2 = filter_df.shape[0]
+
+    bot(
+        title="文章联想任务,开始创建抓取计划",
+        detail={
+            "文章总数": total_length,
+            "发布状态过滤": "过滤: {}, 剩余: {}".format(
+                total_length - filter_length0, filter_length0
+            ),
+            "敏感词过滤": "过滤: {}, 剩余: {}".format(
+                filter_length0 - filter_length1, filter_length1
+            ),
+            "标题长度过滤": "过滤: {}, 剩余: {}".format(
+                filter_length1 - filter_length2, filter_length2
+            ),
+        },
+        mention=False,
+    )
+
+    return filter_df
+
+
+class ArticleAssociationPublish(object):
+    """
+    publish i2i articles
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def deal(self):
+        """
+        class entrance
+        """
+        # update published articles
+        update_published_articles_status(db_client=self.db_client)
+
+        # get data from meta table
+        article_dataframe = get_article_from_meta_table(
+            db_client=self.db_client, category="article_association", platform="weixin"
+        )
+
+        # filter articles
+        filter_dataframe = filter_articles_before_create_plan(article_dataframe)
+
+        # create crawler plan
+        url_list = filter_dataframe["link"].values.tolist()
+        if url_list:
+            crawler_plan_id, crawler_plan_name, create_timestamp = create_crawler_plan(
+                url_list=url_list, plan_tag="article_association", platform="weixin"
+            )
+
+            # insert crawler plan
+            insert_into_article_crawler_plan(
+                db_client=self.db_client,
+                crawler_plan_id=crawler_plan_id,
+                crawler_plan_name=crawler_plan_name,
+                create_timestamp=create_timestamp,
+            )
+
+            # bind to generate plan
+            bind_to_generate_plan(
+                category="article_association",
+                crawler_plan_id=crawler_plan_id,
+                crawler_plan_name=crawler_plan_name,
+                platform="weixin",
+            )
+
+            # update status
+            article_id_list = filter_dataframe["article_id"].values.tolist()
+            update_article_status_after_publishing(
+                db_client=self.db_client, article_id_list=article_id_list
+            )
+
+            bot(
+                title="文章联想任务,创建抓取计划成功",
+                detail={
+                    "抓取计划id": crawler_plan_id,
+                    "抓取计划名称": crawler_plan_name,
+                    "抓取条数": len(url_list),
+                    "冷启动类型": "article_association",
+                },
+                mention=False,
+            )

+ 17 - 3
coldStartTasks/publish/publish_video_to_pq_for_audit.py

@@ -12,7 +12,7 @@ from pymysql.cursors import DictCursor
 from applications import log
 from applications import PQAPI
 from applications.const import WeixinVideoCrawlerConst
-from applications.api import generate_mini_program_title
+from applications.api import fetch_moon_shot_response
 from applications.db import DatabaseConnector
 from config import long_articles_config
 
@@ -36,11 +36,14 @@ class PublishVideosForAudit(object):
         """
         """
         already_published_count = self.get_published_articles_today()
         already_published_count = self.get_published_articles_today()
         rest_count = const.MAX_VIDEO_NUM - already_published_count
         rest_count = const.MAX_VIDEO_NUM - already_published_count
+
+        limit_count = min(rest_count, const.MAX_VIDEO_NUM_PER_PUBLISH)
         sql = f"""
         sql = f"""
             SELECT id, article_title, video_oss_path 
             SELECT id, article_title, video_oss_path 
             FROM publish_single_video_source 
             FROM publish_single_video_source 
             WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
             WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
-            LIMIT {rest_count};
+            ORDER BY score DESC
+            LIMIT {limit_count};
             """
             """
         response = self.db_client.fetch(sql, cursor_type=DictCursor)
         response = self.db_client.fetch(sql, cursor_type=DictCursor)
         return response
         return response
@@ -153,7 +156,18 @@ class PublishVideosForAudit(object):
         title = self.db_client.fetch(select_sql, cursor_type=DictCursor)[0]['article_title']
 
         try:
-            mini_program_title = generate_mini_program_title(title)
+            # generate kimi title
+            mini_program_title = fetch_moon_shot_response(task='generate_kimi_title', input_text=title)
+
+            # score kimi title
+            kimi_safe_title = None
+            title_safe_score = fetch_moon_shot_response(task='get_title_safe_score', input_text=mini_program_title)
+            if int(title_safe_score) > const.TITLE_SAFE_SCORE_THRESHOLD:
+                kimi_safe_title_obj = fetch_moon_shot_response(task='make_title_safe', input_text=title, output_type='json')
+                kimi_safe_title = kimi_safe_title_obj['title_v2']
+
+            mini_program_title = kimi_safe_title if kimi_safe_title else mini_program_title
+
             update_sql = f"""
             update_sql = f"""
             UPDATE publish_single_video_source SET mini_program_title = %s WHERE audit_video_id = %s;
             UPDATE publish_single_video_source SET mini_program_title = %s WHERE audit_video_id = %s;
             """
             """

+ 20 - 1
config/__init__.py

@@ -81,6 +81,15 @@ piaoquan_crawler_config = {
     'charset': 'utf8mb4'
 }
 
+
+# moonshot model config(kimi)
+moon_shot = {
+    "api_key": "sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q",
+    "model": "moonshot-v1-32k",
+    "base_url": "https://api.moonshot.cn/v1"
+}
+
+
 deep_seek_model = {
     "DeepSeek-R1": "ep-20250213194143-d8q4t",
     "DeepSeek-V3": "ep-20250213194558-rrmr2"
@@ -88,4 +97,14 @@ deep_seek_model = {
 
 deep_seek_default_model = "ep-20250213194558-rrmr2"
 
-deep_seek_api_key_byte_dance = '5e275c38-44fd-415f-abcf-4b59f6377f72'
+deep_seek_api_key_byte_dance = '5e275c38-44fd-415f-abcf-4b59f6377f72'
+
+#GeWe
+gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
+gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+
+# sph decrypt key
+decrypt_key_path = 'applications/so/libsph_decrypt.so'
+
+# toutiao js path
+toutiao_js_path = 'applications/js/toutiao.js'

+ 30 - 7
config/crontab_backup

@@ -1,11 +1,26 @@
-# 凌晨1点30执行更新小程序信息任务
-30 1 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_minigram_info_daily.sh
+# 每天凌晨 4点,下午 4 点各执行一次头条视频抓取
+0 4,16 * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_account_video_crawler.sh
+
+# 每15分钟执行一次今日头条推荐流抓取
+*/15 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_recommend.sh
+
+# 每10分钟执行一次从aigc系统获取发布文章
+*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_article_info_from_aigc.sh
+
+# 每10分钟执行一次标题相似度计算任务
+*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_title_similarity_task.sh
+
+# 凌晨2点30执行更新小程序信息任务
+30 2 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_minigram_info_daily.sh
 
 # 每天上午10点30执行文章退场 && 晋升任务
 30 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_title_exit_v1.sh
 
 # 每天上午4点执行账号冷启动任务
-0 4 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_cold_start_daily.sh
+0 1 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_cold_start_daily.sh
+
+# 每日上午9点执行账号联想任务
+0 9 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_association.sh
 
 # 每天 10 点执行前一天的阅读率均值代码
 0 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_account_read_rate_avg.sh
@@ -13,18 +28,24 @@
 # 每天10点40执行阅读均值任务
 40 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_account_avg_v3.sh
 
+# 每天11点执行文章联想任务
+0 11 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_association.sh
+
 # 每小时执行一次校验视频状态
 20 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_check_video_status_hourly.sh
 
-# 每天凌晨4:30, 8:30, 15:30执行视频发布和审核流程
-30 4,8,15 * * * bash /root/luojunhui/LongArticlesJob/sh/run_video_publish_and_audit.sh
+# 每天凌晨4:30 15:30执行视频发布和审核流程
+30 4,15 * * * bash /root/luojunhui/LongArticlesJob/sh/run_video_publish_and_audit.sh
 
 # 每天 上午8:30, 下午1:00, 晚上8:50执行
 
 30 8 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
-20 13 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
+20 14 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
 50 20 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
 
+# 每天上午9点,下午2点,晚上9点执行v2代码
+# 0 9,14,21 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily_v2.sh
+
 
 # 每天上午 9:30 点,下午 2 点,晚上 7 点执行下架视频任务
 
@@ -37,8 +58,10 @@
 
 # 每天早上9点,下午2:30, 晚上7:30
 
-30 9,14 * * * bash /root/luojunhui/LongArticlesJob/sh/published_articles_monitor.sh
+0 10,16,20 * * * bash /root/luojunhui/LongArticlesJob/sh/published_articles_monitor.sh
 
+# 每晚11点开始执行百度视频
+0 23 * * * bash /root/luojunhui/LongArticlesJob/sh/run_baidu_video_crawler.sh
 
 # check kimo balance hourly
 

+ 9 - 0
crawler_sph_video.py

@@ -0,0 +1,9 @@
+"""
+@author: luojunhui
+"""
+
+from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
+
+if __name__ == "__main__":
+    crawler_channel_account_videos = CrawlerChannelAccountVideos()
+    crawler_channel_account_videos.deal()

+ 3 - 1
requirements.txt

@@ -20,4 +20,6 @@ protobuf~=3.20.3
 openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
-playwright~=1.49.1
+playwright~=1.49.1
+volcengine-python-sdk[ark]
+tenacity~=9.0.0

+ 8 - 0
run_baidu_video_crawler.py

@@ -0,0 +1,8 @@
+"""
+@author: luojunhui
+"""
+from coldStartTasks.crawler.baidu import BaiduVideoCrawler
+
+if __name__ == '__main__':
+    task = BaiduVideoCrawler()
+    task.deal()

+ 9 - 0
run_title_rewrite_task.py

@@ -0,0 +1,9 @@
+"""
+@author: luojunhui
+"""
+from tasks.title_rewrite_task import TitleRewriteTask
+
+
+if __name__ == '__main__':
+    task = TitleRewriteTask()
+    task.deal()

+ 26 - 0
sh/run_article_association.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/article_association_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 article_association_task.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - article_association_task.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart article_association_task.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 article_association_task.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted article_association_task.py"
+fi

+ 26 - 0
sh/run_baidu_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/baidu_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_baidu_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_baidu_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_baidu_video_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_baidu_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_baidu_video_crawler.py"
+fi

+ 26 - 0
sh/run_gzh_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/gzh_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_video_account_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_video_account_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_video_account_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_video_account_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_video_account_crawler.py"
+fi

+ 26 - 0
sh/run_sph_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/sph_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 crawler_sph_video.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - crawler_sph_video.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart crawler_sph_video.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 crawler_sph_video.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted crawler_sph_video.py"
+fi

+ 26 - 0
sh/run_title_rewrite_task.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/title_rewrite_task_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_title_rewrite_task.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_title_rewrite_task.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_title_rewrite_task.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_title_rewrite_task.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_title_rewrite_task.py"
+fi

+ 26 - 0
sh/run_toutiao_account_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/toutiao_account_video_crawler_task_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 toutiao_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - toutiao_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart toutiao_video_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 toutiao_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted toutiao_video_crawler.py"
+fi

+ 0 - 2
sh/run_video_publish_and_audit.sh

@@ -21,8 +21,6 @@ else
     conda activate tasks
 
     # 在后台运行 Python 脚本并重定向日志输出
-    nohup python3 run_video_account_crawler.py >> "${LOG_FILE}" 2>&1 &
-    sleep 180
     nohup python3 run_video_publish_and_audit.py >> "${LOG_FILE}" 2>&1 &
     echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_video_publish_and_audit.py"
 fi

+ 1 - 1
tasks/article_summary_task.py

@@ -69,7 +69,7 @@ class ArticleSummaryTask(object):
         """
         """
         rollback_rows = self.db_client.save(
         rollback_rows = self.db_client.save(
             query=update_sql,
             query=update_sql,
-            params=(const.SUMMARY_INIT_STATUS, const.SUMMARY_LOCK, timestamp_threshold),
+            params=(const.INIT_STATUS, const.PROCESSING_STATUS, timestamp_threshold),
         )
         )
 
 
         return rollback_rows
         return rollback_rows

+ 224 - 0
tasks/crawler_channel_account_videos.py

@@ -0,0 +1,224 @@
+"""
+@author: luojunhui
+@tool: pycharm && deepseek
+"""
+
+import re
+import os
+import traceback
+import time
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ChannelVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.utils import download_sph_video
+from applications.utils import insert_into_single_video_source_table
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import upload_to_oss
+from config import long_articles_config
+from coldStartTasks.crawler.channels import get_channel_account_videos
+
+const = ChannelVideoCrawlerConst()
+
+
+class CrawlerChannelAccountVideos:
+    """
+    crawler channel account videos
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def whether_video_exists(self, title: str) -> bool:
+        """
+        whether video exists, use video_id && title
+        """
+        # check title
+        sql = f"""
+            select id from publish_single_video_source
+            where article_title = %s;
+        """
+        duplicate_id = self.db_client.fetch(query=sql, params=(title,))
+        if duplicate_id:
+            return True
+
+        return False
+
+    def get_channel_account_list(self) -> list[dict]:
+        """
+        get channel account list from database
+        """
+        sql = f"""
+            select account_id, max_cursor 
+            from sph_account_for_videos 
+            where status = {const.CHANNEL_ACCOUNT_GOOD_STATUS}
+            order by max_cursor;"""
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def crawler_each_video(self, video: dict) -> None:
+        """
+        download each video
+        save video and decrypt video
+        upload video to oss
+        """
+        object_desc = video["objectDesc"]
+        title = object_desc["description"]
+        if self.whether_video_exists(title):
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title exists",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        cleaned_title = re.sub(r"[^\u4e00-\u9fff]", "", title)
+        if len(cleaned_title) < const.MIN_TITLE_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title is too short",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        video_length = video["objectDesc"]["media"][0]["VideoPlayLen"]
+        if video_length and int(video_length) > const.MAX_VIDEO_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video length is too long",
+                data={"video_id": video["id"], "title": title, "length": video_length},
+            )
+            return
+
+        video_item = Item()
+        video_id = video["id"]
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video["username"])
+        video_item.add("out_account_name", video["nickname"])
+        video_item.add("publish_timestamp", video["createtime"])
+        video_item.add("platform", "sph")
+        video_item.add("crawler_timestamp", int(time.time()))
+        media = object_desc["media"][0]
+        url = media["Url"]
+        decode_key = media["decodeKey"]
+        url_token = media["urlToken"]
+        download_url = url + url_token
+        try:
+            decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
+            oss_path = upload_to_oss(decrypt_path)
+            video_item.add("video_oss_path", oss_path)
+            video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
+            video_item.check(source="video")
+            insert_into_single_video_source_table(self.db_client, video_item.item)
+            os.remove(decrypt_path)
+        except Exception as e:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="download video failed",
+                data={
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                    "video_id": video["id"],
+                },
+            )
+
+    def crawler_each_account(self, channel_account: dict, last_buffer: str = "") -> None:
+        """
+        通过循环替代递归,分页爬取频道账号视频
+        """
+        channel_account_id = channel_account["account_id"]
+        max_cursor = channel_account.get("max_cursor") or const.DEFAULT_CURSOR
+        current_last_buffer = last_buffer
+        has_more = True
+
+        while has_more:
+            response = get_channel_account_videos(channel_account_id, last_buffer=current_last_buffer)
+            if response["ret"] != 200:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="crawler_each_video",
+                    message="get_channel_account_videos failed",
+                    data={
+                        "response": response,
+                        "channel_account_id": channel_account_id,
+                        "max_cursor": max_cursor,
+                    },
+                )
+                break
+
+            response_data = response["data"]
+            current_last_buffer = response_data["lastBuffer"]  # 更新分页游标
+            has_more = response_data["continueFlag"]  # 是否还有下一页
+            video_list = response_data["object"]
+
+            if not video_list:
+                break
+
+            create_timestamp = video_list[0]["createtime"]
+            if create_timestamp < max_cursor:
+                break
+
+            crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
+            for video in crawl_video_list_bar:
+                crawl_video_list_bar.set_postfix({"video_id": video["id"]})
+                self.crawler_each_video(video)
+
+            if has_more:
+                time.sleep(const.SLEEP_SECOND)
+            else:
+                break
+
+    def update_account_max_cursor(self, account_id: str) -> None:
+        """
+        update account max cursor
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = '{account_id}';
+        """
+        response_mysql = self.db_client.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = f"""
+                        update sph_account_for_videos
+                        set max_cursor = %s
+                        where account_id = %s;
+                    """
+            self.db_client.save(
+                query=update_sql, params=(max_publish_timestamp, account_id)
+            )
+
+    def deal(self):
+        """
+        deal channel account videos
+        """
+        account_list = self.get_channel_account_list()
+        account_crawler_bar = tqdm(account_list, desc="crawler channel account videos")
+        for account in account_crawler_bar:
+            try:
+                account_crawler_bar.set_postfix({"account_id": account["account_id"]})
+                self.crawler_each_account(channel_account=account)
+                self.update_account_max_cursor(account["account_id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="deal",
+                    message="crawler channel account videos failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "account_id": account["account_id"],
+                    },
+                )

+ 208 - 0
tasks/crawler_toutiao_account_videos.py

@@ -0,0 +1,208 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ToutiaoVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from config import apolloConfig, long_articles_config
+
+const = ToutiaoVideoCrawlerConst()
+config = apolloConfig()
+cookie = config.getConfigValue("toutiao_blogger_cookie")
+
+
+class CrawlerToutiaoAccountVideos:
+    """
+    toutiao blogger crawler
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def get_account_list(self):
+        """
+        get account list
+        """
+        sql = f"""
+            select account_id, max_cursor
+            from video_meta_accounts
+            where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
+        """
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def crawler_each_account_video_list(
+        self, account_id: str, max_cursor: int | None, max_behot_time: int = 0
+    ):
+        """
+        account_id: toutiao account id
+        max_cursor: crawler latest cursor for each account
+        max_behot_time: max behot time from toutiao, use to switch to next page
+        """
+        has_more = True
+        current_cursor = max_behot_time
+        max_cursor = max_cursor or const.DEFAULT_CURSOR
+
+        while has_more:
+            response = get_toutiao_account_video_list(
+                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
+            )
+            if not response:
+                break
+
+            if response["message"] != "success":
+                log(
+                    task="crawler_toutiao_account_videos",
+                    function="crawler_toutiao_account_videos",
+                    message="get response from toutiao failed",
+                    data={"account_id": account_id, "response": response},
+                )
+                break
+
+            video_list = response["data"]
+            has_more = response["has_more"]
+            current_cursor = response["next"]["max_behot_time"]
+
+            if not video_list:
+                break
+
+            max_timestamp_in_this_group = video_list[0]["publish_time"]
+            if max_timestamp_in_this_group < max_cursor:
+                break
+
+            # do crawler each video
+            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
+            for video in crawler_video_list_bar:
+                try:
+                    crawler_video_list_bar.set_postfix({"video_id": video["id"]})
+                    self.crawler_each_video(video)
+
+                except Exception as e:
+                    log(
+                        task="crawler_toutiao_account_videos",
+                        function="crawler_each_account_video_list",
+                        message="crawler each video failed",
+                        data={
+                            "account_id": account_id,
+                            "video_info": video,
+                            "error": str(e),
+                            "traceback": traceback.format_exc(),
+                        },
+                    )
+
+            if has_more:
+                time.sleep(const.SLEEP_SECOND)
+            else:
+                break
+
+    def crawler_each_video(self, video_data):
+        """
+        crawler each video data
+        """
+        video_item = Item()
+        video_id = video_data["group_id"]
+        title = video_data["title"]
+        media = video_data["video"]
+        url = media["download_addr"]["url_list"][0]
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video_data["user"]["user_id"])
+        video_item.add("out_account_name", video_data["source"])
+        video_item.add("publish_timestamp", video_data["publish_time"])
+        video_item.add("platform", const.PLATFORM)
+        video_item.add("read_cnt", video_data.get("read_count", 0))
+        video_item.add("article_url", url)
+        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    self.db_client, item_with_oss_path
+                )
+        except Exception as e:
+            log(
+                task="crawler_toutiao_account_videos",
+                function="crawler_toutiao_account_videos",
+                message="etl failed",
+                data={
+                    "video_item": video_item.item,
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                }
+            )
+
+    def update_account_max_cursor(self, account_id: str) -> None:
+        """
+        update account max cursor
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor 
+            from publish_single_video_source 
+            where out_account_id = '{account_id}' and platform = '{const.PLATFORM}';
+        """
+        response_mysql = self.db_client.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = f"""
+                update video_meta_accounts
+                set max_cursor = %s
+                where account_id = %s and platform = %s;
+            """
+            self.db_client.save(
+                query=update_sql,
+                params=(max_publish_timestamp, account_id, const.PLATFORM),
+            )
+
+    def deal(self) -> None:
+        """
+        class entrance
+        """
+        account_list = self.get_account_list()
+        account_list_bar = tqdm(account_list, desc="crawler toutiao accounts")
+        for account in account_list_bar:
+            account_id = account["account_id"]
+            max_cursor = account["max_cursor"]
+            try:
+                # crawl each account
+                account_list_bar.set_postfix({"account_id": account_id})
+                self.crawler_each_account_video_list(
+                    account_id=account_id, max_cursor=max_cursor
+                )
+                self.update_account_max_cursor(account_id)
+
+            except Exception as e:
+                # add log and bot
+                log(
+                    task="crawler_toutiao_account_videos",
+                    function="deal",
+                    message=account_id,
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                )

+ 270 - 0
tasks/title_rewrite_task.py

@@ -0,0 +1,270 @@
+"""
+@author: luojunhui
+"""
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.api import fetch_deepseek_response
+from applications.const import TitleRewriteTaskConst
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+const = TitleRewriteTaskConst()
+
+
+def generate_prompt(ori_title):
+    """
+    生成prompt
+    """
+    prompt = f"""
+    请将以下标题改写成适合公众号中小程序点击和传播的文章标题,文章标题的写作规范如下,请学习后进行文章标题的编写。直接输出最终的文章标题,文章标题撰写规范如下:
+    1. 标题结构:要点前置,信息明确
+        核心信息前置:标题开头直接点出文章的核心内容或亮点,吸引读者注意。例如:
+          “我国存款最安全的五大银行,永远都不会倒闭,你知道是哪五家吗?”
+          “亩产7000斤,被误认成萝卜却曾是‘救命粮’,如今成我国出口名蔬”。
+        简洁明了:标题通常在20字以内,信息集中且易于理解。
+        悬念前置结构:前半句设置反常/冲突场景(如"刑满释放蹬三轮")+后半句用结果反转制造悬念("政府领导登门分配工作")
+        多要素拼接:通过冒号/逗号分隔不同叙事主体(地域+人物冲突+权威评价),如"辽宁女子住高档小区被敲门,法院判决意外"
+    
+    2. 情绪表达:激发共鸣,引发好奇
+        情感共鸣:通过情感化的语言触动读者,泪崩/守护/抱头痛哭等情感冲击词,配合家庭伦理场景
+        例如:
+          “老母亲分家产,给亲闺女30万,给养女一筐青菜,养女意外摔倒,看到筐子里的东西,瞬间愣住了”。
+          “儿子卖车卖房给母亲治病,母亲去世后儿媳收拾房间,打开床底柜,儿子突然痛哭”。
+        悬念与好奇心:通过提问或制造悬念,激发读者点击欲望。例如:
+          “你知道是哪五家吗?”
+          “打开床底柜,儿子突然痛哭”。
+        冲突性情绪词:拍桌大骂/气愤不已/眼红不已/算计等强对抗性词汇
+        结果反差刺激:用"风光善终/价值过亿/判决意外"等违反预期的结果
+    
+    3. 语言风格:口语化、接地气
+        口语化表达:使用通俗易懂的语言,贴近读者生活。
+        刻意使用"赶都赶不走/各吃各的/我就知道你在家"等市井化用语。
+        例如:
+          “狗屎运?江西男子钓鱼时发现青鱼尸骸,扒开后捡到鸡蛋大小的青鱼石”。
+          “聪明的女人,不会帮婆家3种忙,而蠢女人才一再插手”。
+        接地气的词汇:使用“狗屎运”“蠢女人”等口语化词汇,增强亲切感。
+        身份反差构建:突出人物命运转折(老农→亿万富翁/囚犯→政府帮扶对象)
+        权威背书暗示:"专家气愤/法院判决/网友评价"等第三方视角增强可信度
+    
+    4. 标点运用:增强语气,突出重点
+        问号与感叹号:通过问号制造悬念,感叹号强化情感。
+        在关键转折点使用("太气人了!/赔不了!")
+        问号制造互动:如"容嬷嬷是校花?"激发读者验证心理
+        例如:
+          “你知道是哪五家吗?”
+          “太无耻了!湖南,一名厨师被公司派到云南‘出差’被拒……”
+        引号与冒号:用于突出关键词或转折点。
+        破折号递进:用"——"引导关键信息("吃不完最好扔掉——")
+        例如:
+          “被误认成萝卜却曾是‘救命粮’”。
+          “女子归还后,失主拒绝支付报酬,还说:要有格局”。
+    
+    5. 热点与话题性:结合社会热点或争议
+        社会热点:结合当前热点事件或争议话题,吸引关注。例如:
+          “上海:男子超市连续购买46枚过期咸鸭蛋,2天分46次交易,向厂家索赔金14万,法院判了!”
+        争议性话题:通过争议性内容引发讨论。例如:
+          “李玉成终于说出实话,公开吐槽马玉琴年纪太大,结婚28年疑似后悔”。
+    
+    6. 数字与具体细节:增强可信度与吸引力
+        数字的运用:通过具体数字增强标题的可信度和吸引力。例如:
+          “亩产7000斤”。
+          “22年河南男子跳河救人,体力耗尽留遗言”。
+        细节描述:通过细节让标题更具画面感。例如:
+          “打开床底柜,儿子突然痛哭”。
+          “扒开后捡到鸡蛋大小的青鱼石”。
+    
+    7. 价值诉求:传递实用信息或情感价值
+        实用信息:提供对读者有价值的信息。例如:
+          “我国存款最安全的五大银行,永远都不会倒闭”。
+          “72岁老人每天一个蒸苹果,半年后体检,看到指标变化让他乐开了花”。
+        情感价值:通过情感故事或人生哲理打动读者。例如:
+          “父母越老越能暴露家庭最真实的一面:当父母70岁,子女不该抱有这三种期待”。
+    
+    8. 名人效应与历史情怀:增强吸引力
+        名人效应:提及名人或历史事件,吸引关注。例如:
+          “难怪王扶林说陈晓旭不够漂亮,看看他选的原黛玉候选人,那才叫美”。
+          “1975年‘下馆子’的老照片,2元能吃些什么,勾起那段最难忘的时光”。
+    
+    9.隐藏传播逻辑:通过标题中暗含的、能触发人性弱点(如猎奇、贪婪、同情)或社会痛点的心理机制,通过潜意识刺激读者点击欲望
+       人性弱点触发:贪婪(200万保单)、猎奇(林彪密件)、窥私(家庭算计)
+       生存焦虑关联:医疗(脑瘫儿)、养老(子女不孝)、食品安全(二次加热)
+       身份代入设计:选择"老太太/外甥女/退休母亲"等易引发群体共鸣的角色
+    输入的标题是: '{ori_title}'
+    """
+    return prompt
+
+
+class TitleRewriteTask:
+    """
+    标题重写任务
+    """
+
+    def __init__(self):
+        self.db = DatabaseConnector(db_config=long_articles_config)
+        self.db.connect()
+
+    def roll_back_blocked_tasks(self):
+        """
+        rollback blocked tasks
+        """
+        sql = f"""
+            select id, title_rewrite_status_update_timestamp
+            from publish_single_video_source
+            where title_rewrite_status = {const.TITLE_REWRITE_LOCK_STATUS};
+        """
+        article_list = self.db.fetch(query=sql, cursor_type=DictCursor)
+        if article_list:
+            blocked_id_list = [
+                i["id"]
+                for i in article_list
+                if (
+                    int(time.time())
+                    - i["title_rewrite_status_update_timestamp"]
+                )
+                > const.TITLE_REWRITE_LOCK_TIME
+            ]
+            if blocked_id_list:
+                update_sql = f"""
+                    update publish_single_video_source
+                    set title_rewrite_status = %s
+                    where id in %s and title_rewrite_status = %s;
+                """
+                self.db.save(
+                    query=update_sql,
+                    params=(
+                        const.TITLE_REWRITE_INIT_STATUS,
+                        tuple(blocked_id_list),
+                        const.TITLE_REWRITE_LOCK_STATUS,
+                    )
+                )
+
+    def get_articles_batch(self, batch_size=1000):
+        """
+        从数据库中获取文章
+        """
+        sql = f"""
+            select content_trace_id, article_title
+            from publish_single_video_source 
+            where bad_status = {const.ARTICLE_POSITIVE_STATUS} 
+                and audit_status = {const.ARTICLE_AUDIT_PASSED_STATUS} 
+                and title_rewrite_status = {const.TITLE_REWRITE_INIT_STATUS}
+                and platform in ('hksp', 'sph')
+            limit {batch_size};
+        """
+        res = self.db.fetch(query=sql, cursor_type=DictCursor)
+        return res
+
+    def update_title_rewrite_status(self, content_trace_id, ori_status, new_status):
+        """
+        更新标题重写状态
+        """
+        sql = f"""
+            update publish_single_video_source
+            set title_rewrite_status = %s, title_rewrite_status_update_timestamp = %s
+            where content_trace_id = %s and title_rewrite_status= %s;
+        """
+        affected_rows = self.db.save(
+            query=sql, params=(new_status, int(time.time()), content_trace_id, ori_status)
+        )
+        return affected_rows
+
+    def insert_into_rewrite_table(self, content_trace_id, new_title):
+        """
+        insert into rewrite_table
+        """
+        insert_sql = f"""
+            insert into video_title_rewrite
+            (content_trace_id, new_title, status, prompt_version)
+            values (%s, %s, %s, %s);
+        """
+        self.db.save(
+            query=insert_sql,
+            params=(
+                content_trace_id,
+                new_title,
+                const.TITLE_USEFUL_STATUS,
+                const.PROMPT_VERSION
+            ),
+        )
+
+    def rewrite_each_article(self, article):
+        """
+        rewrite each article
+        """
+        content_trace_id = article["content_trace_id"]
+        article_title = article["article_title"]
+
+        # lock each task
+        affected_rows = self.update_title_rewrite_status(
+            content_trace_id=content_trace_id,
+            ori_status=const.TITLE_REWRITE_INIT_STATUS,
+            new_status=const.TITLE_REWRITE_LOCK_STATUS,
+        )
+        if not affected_rows:
+            return
+
+        try:
+            prompt = generate_prompt(article_title)
+            new_title = fetch_deepseek_response(model="default", prompt=prompt)
+
+            # insert into rewrite table
+            self.insert_into_rewrite_table(
+                content_trace_id=content_trace_id, new_title=new_title
+            )
+
+            # unlock
+            self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=const.TITLE_REWRITE_LOCK_STATUS,
+                new_status=const.TITLE_REWRITE_SUCCESS_STATUS,
+            )
+        except Exception as e:
+            log(
+                task="title rewrite task",
+                function="rewrite_each_article",
+                message=content_trace_id,
+                status="fail",
+                data={
+                    "error_message": str(e),
+                    "error_type": type(e).__name__,
+                    "traceback": traceback.format_exc(),
+                },
+            )
+            self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=const.TITLE_REWRITE_LOCK_STATUS,
+                new_status=const.TITLE_REWRITE_FAIL_STATUS,
+            )
+
+    def deal(self):
+        """
+        get tasks && deal tasks
+        """
+        # rollback blocked tasks
+        try:
+            self.roll_back_blocked_tasks()
+        except Exception as e:
+            log(
+                task="title rewrite task",
+                function="roll_back_blocked_tasks",
+                message="roll back blocked tasks fail",
+                status="fail",
+                data={
+                    "error_message": str(e),
+                    "error_type": type(e).__name__,
+                    "traceback": traceback.format_exc()
+                }
+            )
+
+        # process tasks
+        articles = self.get_articles_batch()
+        bar = tqdm(articles, desc="title rewrite task")
+        for article in bar:
+            self.rewrite_each_article(article)
+            bar.set_postfix({"content_trace_id": article["content_trace_id"]})

+ 12 - 3
tasks/update_published_articles_minigram_detail.py

@@ -120,7 +120,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
         :return:
         """
         select_sql = f"""
-            SELECT first_uv, split0, split1, split2
+            SELECT first_uv, split0, split0_head, split0_recommend, split1, split1_head, split1_recommend, split2, split2_head, split2_recommend
             FROM changwen_data_rootsourceid
             WHERE root_source_id = '{root_source_id}' AND dt = '{dt}';
         """
@@ -235,7 +235,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
                 return article_info
 
         else:
-            return article_info
+            return EMPTY_DICT
 
     def get_root_source_id_for_three_days(self, biz_date: str) -> List[Dict]:
         """
@@ -263,7 +263,10 @@ class UpdatePublishedArticlesMinigramDetail(object):
             # do update job
             update_sql = f"""
                 UPDATE {DETAIL_TABLE}
-                SET first_level = %s, fission_0 = %s, fission_1 = %s, fission_2 = %s
+                SET first_level = %s, 
+                    fission_0 = %s, fission_0_head = %s, fission_0_recommend = %s, 
+                    fission_1 = %s, fission_1_head = %s, fission_1_recommend = %s, 
+                    fission_2 = %s, fission_2_head = %s, fission_2_recommend = %s
                 WHERE root_source_id = %s and recall_dt = %s;
             """
             self.piaoquan_crawler_db_client.save(
@@ -271,8 +274,14 @@ class UpdatePublishedArticlesMinigramDetail(object):
                 params=(
                     mini_program_detail['first_uv'],
                     mini_program_detail['split0'],
+                    mini_program_detail['split0_head'],
+                    mini_program_detail['split0_recommend'],
                     mini_program_detail['split1'],
+                    mini_program_detail['split1_head'],
+                    mini_program_detail['split1_recommend'],
                     mini_program_detail['split2'],
+                    mini_program_detail['split2_head'],
+                    mini_program_detail['split2_recommend'],
                     root_source_id,
                     recall_dt
                 )

+ 26 - 1
title_similarity_score_task.py

@@ -1,6 +1,8 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
+import traceback
+from applications import bot
 from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
 
 
@@ -8,4 +10,27 @@ if __name__ == '__main__':
     batch_size = 3000
     task = ColdStartTitleSimilarityTask()
     task.init_database()
-    task.run(limit=batch_size)
+    # process video
+    try:
+        task.run(meta_source="video")
+    except Exception as e:
+        bot(
+            title="视频冷启池nlp任务异常",
+            mention=False,
+            detail={
+                "traceback": traceback.format_exc(),
+                "error": f"{e}"
+            }
+        )
+    # process article
+    try:
+        task.run(meta_source="article")
+    except Exception as e:
+        bot(
+            title="文章冷启池nlp任务异常",
+            mention=False,
+            detail={
+                "traceback": traceback.format_exc(),
+                "error": f"{e}"
+            }
+        )

+ 10 - 0
toutiao_video_crawler.py

@@ -0,0 +1,10 @@
+"""
+@author: luojunhui
+"""
+
+from tasks.crawler_toutiao_account_videos import CrawlerToutiaoAccountVideos
+
+
+if __name__ == '__main__':
+    crawler = CrawlerToutiaoAccountVideos()
+    crawler.deal()

+ 83 - 131
updateAccountV3.py

@@ -7,151 +7,104 @@ import time
 from tqdm import tqdm
 from datetime import datetime, timedelta
 from argparse import ArgumentParser
+from pymysql.cursors import DictCursor
 
-from applications import PQMySQL, DeNetMysql, longArticlesMySQL
-from applications.const import updateAccountReadAvgTaskConst
+from applications.const import UpdateAccountReadAvgTaskConst
+from applications.db import DatabaseConnector
+from applications.utils import fetch_account_fans
+from applications.utils import fetch_publishing_account_list
 from config import apolloConfig
+from config import long_articles_config, denet_config, piaoquan_crawler_config
 
+read_rate_table = "long_articles_read_rate"
+read_avg_table = "account_avg_info_v3"
 config = apolloConfig()
+const = UpdateAccountReadAvgTaskConst()
 unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
 touliu_accounts = set(json.loads(config.getConfigValue("touliu_gh_id_list")))
-
-
-def get_account_fans_by_dt(db_client) -> dict:
-    """
-    获取每个账号发粉丝,通过日期来区分
-    :return:
-    """
-    sql = f"""
-        SELECT 
-            t1.date_str, 
-            t1.fans_count, 
-            t2.gh_id
-        FROM datastat_wx t1
-        JOIN publish_account t2 ON t1.account_id = t2.id
-        WHERE 
-            t2.channel = 5 
-        AND t2.status = 1 
-        AND t1.date_str >= '2024-09-01' 
-        ORDER BY t1.date_str;
-    """
-    result = db_client.select(sql)
-    D = {}
-    for line in result:
-        dt = line[0]
-        fans = line[1]
-        gh_id = line[2]
-        if D.get(gh_id):
-            D[gh_id][dt] = fans
-        else:
-            D[gh_id] = {dt: fans}
-    return D
-
+backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
 
 class UpdateAccountInfoVersion3(object):
     """
-    更新账号信息 v3
+    更新账号的平均阅读率
     """
 
     def __init__(self):
-        self.const = updateAccountReadAvgTaskConst()
-        self.pq = PQMySQL()
-        self.de = DeNetMysql()
-        self.lam = longArticlesMySQL()
+        # init piaoquan crawler db client
+        self.piaoquan_crawler_db_client = DatabaseConnector(piaoquan_crawler_config)
+        self.piaoquan_crawler_db_client.connect()
+
+        # init long articles db client
+        self.long_articles_db_client = DatabaseConnector(long_articles_config)
+        self.long_articles_db_client.connect()
+
+        #  init aigc db client
+        self.denet_db_client = DatabaseConnector(denet_config)
+        self.denet_db_client.connect()

-    def get_account_position_read_rate(self, dt):
+    def fetch_read_rate_avg_for_each_account(self, dt):
         """
         """
         从长文数据库获取账号阅读均值
         从长文数据库获取账号阅读均值
         :return:
         :return:
         """
         """
         dt = int(dt.replace("-", ""))
         dt = int(dt.replace("-", ""))
         sql = f"""
         sql = f"""
-            SELECT 
-                gh_id, position, read_rate_avg
-            FROM
-                long_articles_read_rate
-            WHERE dt_version = {dt};
+            select gh_id, position, read_rate_avg
+            from {read_rate_table}
+            where dt_version = {dt};
         """
         """
-
-        result = self.lam.select(sql)
+        fetch_response_list = self.long_articles_db_client.fetch(query=sql, cursor_type=DictCursor)
         account_read_rate_dict = {}
-        for item in result:
-            gh_id = item[0]
-            position = item[1]
-            rate = item[2]
-            key = "{}_{}".format(gh_id, position)
-            account_read_rate_dict[key] = rate
+        for item in fetch_response_list:
+            key = "{}_{}".format(item['gh_id'], item['position'])
+            account_read_rate_dict[key] = item['read_rate_avg']
         return account_read_rate_dict

-    def get_publishing_accounts(self):
-        """
-        获取每日正在发布的账号
-        :return:
-        """
-        sql = f"""
-        SELECT DISTINCT
-            t3.`name`,
-            t3.gh_id,
-            t3.follower_count,
-            t6.account_source_name,
-            t6.mode_type,
-            t6.account_type,
-            t6.`status`
-        FROM
-            publish_plan t1
-            JOIN publish_plan_account t2 ON t1.id = t2.plan_id
-            JOIN publish_account t3 ON t2.account_id = t3.id
-            LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
-            LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
-            LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
-        WHERE
-            t1.plan_status = 1
-            AND t3.channel = 5
-            GROUP BY t3.id;
-        """
-        account_list = self.de.select(sql)
-        result_list = [
-            {
-                "account_name": i[0],
-                "gh_id": i[1],
-                "fans": i[2],
-                "account_source_name": i[3],
-                "mode_type": i[4],
-                "account_type": i[5],
-                "status": i[6]
-            } for i in account_list
-        ]
-        return result_list
-
     def do_task_list(self, dt):
         """
         do it
         """
-        fans_dict = get_account_fans_by_dt(db_client=self.de)
-        account_list = self.get_publishing_accounts()
-        rate_dict = self.get_account_position_read_rate(dt)
+        # get fans dict from aigc
+        fans_dict = fetch_account_fans(self.denet_db_client, dt)
+
+        # get publishing account list from aigc
+        account_list = fetch_publishing_account_list(self.denet_db_client)
+
+        # fetch each account's read avg for each position
+        read_rate_avg_dict = self.fetch_read_rate_avg_for_each_account(dt)
+
         for account in tqdm(account_list, desc=dt):
             gh_id = account["gh_id"]
-            business_type = self.const.TOULIU if gh_id in touliu_accounts else self.const.ARTICLES_DAILY
-            fans = fans_dict.get(gh_id, {}).get(dt, 0)
+            business_type = const.TOULIU if gh_id in touliu_accounts else const.ARTICLES_DAILY
+            fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+
+            # use unauthorized account's fans if not found in aigc
+            if not fans:
+                fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
+
+            # use backup account's fans if not found in aigc
             if not fans:
-                fans = int(unauthorized_account.get(gh_id, 0))
+                fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
+
             if fans:
-                for index in range(1, 9):
+                for index in const.ARTICLE_INDEX_LIST:
                     gh_id_position = "{}_{}".format(gh_id, index)
                     gh_id_position = "{}_{}".format(gh_id, index)
-                    if rate_dict.get(gh_id_position):
-                        rate = rate_dict[gh_id_position]
-                        read_avg = fans * rate
-                        print(rate, read_avg)
+                    if read_rate_avg_dict.get(gh_id_position):
+                        # fetch read rate avg
+                        read_rate_avg = read_rate_avg_dict[gh_id_position]
+                        # cal read avg
+                        read_avg = fans * read_rate_avg
+
+                        # insert into database
                         insert_sql = f"""
                         insert_sql = f"""
-                        INSERT INTO account_avg_info_v3
-                        (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
-                        values
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                            insert into {read_avg_table}
+                            (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
+                            values
+                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                         """
                         """
                         try:
                         try:
-                            self.pq.update(
-                                sql=insert_sql,
+                            self.piaoquan_crawler_db_client.save(
+                                query=insert_sql,
                                 params=(
                                     gh_id,
                                     index,
@@ -159,29 +112,29 @@ class UpdateAccountInfoVersion3(object):
                                     account['account_name'],
                                     fans,
                                     read_avg,
-                                    0,
-                                    1,
+                                    const.DEFAULT_LIKE,
+                                    const.USING_STATUS,
                                     account['account_type'],
                                     account['mode_type'],
-                                    account['account_source_name'],
+                                    account['account_source'],
                                     account['status'],
                                     business_type,
-                                    rate
+                                    read_rate_avg
                                 )
                             )
                         except Exception as e:
-                            updateSQL = f"""
-                            UPDATE account_avg_info_v3
-                            set fans = %s, read_avg = %s, read_rate_avg = %s
-                            where gh_id = %s and position = %s and update_time = %s
+                            update_sql = f"""
+                                update {read_avg_table}
+                                set fans = %s, read_avg = %s, read_rate_avg = %s
+                                where gh_id = %s and position = %s and update_time = %s
                             """
                             """
                             try:
                             try:
-                                affected_rows = self.pq.update(
-                                    sql=updateSQL,
+                                self.piaoquan_crawler_db_client.save(
+                                    query=update_sql,
                                     params=(
                                         fans,
                                         read_avg,
-                                        rate,
+                                        read_rate_avg,
                                         account['gh_id'],
                                         index,
                                         dt
@@ -192,17 +145,16 @@ class UpdateAccountInfoVersion3(object):
 
 
                         # 修改前一天的状态为 0
                         update_status_sql = f"""
-                        UPDATE account_avg_info_v3
-                        SET status = %s
-                        where update_time != %s and gh_id = %s and position = %s;
+                            update {read_avg_table}
+                            set status = %s
+                            where update_time != %s and gh_id = %s and position = %s;
                         """
                         """
-                        rows_affected = self.pq.update(
-                            sql=update_status_sql,
+                        self.piaoquan_crawler_db_client.save(
+                            query=update_status_sql,
                             params=(
-                                0, dt, account['gh_id'], index
+                                const.NOT_USING_STATUS, dt, account['gh_id'], index
                             )
                         )
-                        print("修改成功")


 def main():
@@ -215,15 +167,15 @@ def main():
                         help="Run only once for date in format of %Y-%m-%d. \
                         help="Run only once for date in format of %Y-%m-%d. \
                                 If no specified, run as daily jobs.")
                                 If no specified, run as daily jobs.")
     args = parser.parse_args()
     args = parser.parse_args()
-    Up = UpdateAccountInfoVersion3()
+    update_account_read_avg_task = UpdateAccountInfoVersion3()
     if args.run_date:
-        Up.do_task_list(dt=args.run_date)
+        update_account_read_avg_task.do_task_list(dt=args.run_date)
     else:
         dt_object = datetime.fromtimestamp(int(time.time()))
         one_day = timedelta(days=1)
         yesterday = dt_object - one_day
         yesterday_str = yesterday.strftime('%Y-%m-%d')
-        Up.do_task_list(dt=yesterday_str)
+        update_account_read_avg_task.do_task_list(dt=yesterday_str)


 if __name__ == '__main__':
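Review note: do_task_list still relies on the insert raising an exception before it falls back to the update branch. Assuming account_avg_info_v3 has a unique key over (gh_id, position, update_time), which the fallback implies but this diff does not confirm, the two statements could be collapsed into a single upsert. A sketch under that assumption (the upsert_read_avg helper is illustrative, not part of this commit; db_client.save is the same call used in the diff):

    from applications.db import DatabaseConnector

    READ_AVG_TABLE = "account_avg_info_v3"


    def upsert_read_avg(db_client: DatabaseConnector, row: tuple) -> None:
        # row follows the same 14-column order as insert_sql in do_task_list
        sql = f"""
            insert into {READ_AVG_TABLE}
            (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status,
             account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            on duplicate key update
                fans = values(fans), read_avg = values(read_avg), read_rate_avg = values(read_rate_avg);
        """
        db_client.save(query=sql, params=row)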

Some files were not shown because too many files changed in this diff