@@ -7,14 +7,21 @@ from tqdm import tqdm
from pandas import DataFrame
from argparse import ArgumentParser
from datetime import datetime
+from pymysql.cursors import DictCursor

-from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot, Functions, create_feishu_columns_sheet
-from applications.const import updateAccountReadRateTaskConst
-from config import apolloConfig
+from applications import bot, Functions, log
+from applications import create_feishu_columns_sheet
+from applications.db import DatabaseConnector
+from applications.const import UpdateAccountReadRateTaskConst
+from applications.utils import fetch_publishing_account_list
+from applications.utils import fetch_account_fans
+from config import apolloConfig, long_articles_config, piaoquan_crawler_config, denet_config

-const = updateAccountReadRateTaskConst()
+
+const = UpdateAccountReadRateTaskConst()
config = apolloConfig()
unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
+backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
functions = Functions()
read_rate_table = "long_articles_read_rate"
@@ -37,75 +44,7 @@ def filter_outlier_data(group, key='show_view_count'):
    return filtered_group


-def get_account_fans_by_dt(db_client) -> dict:
-    """
-    获取每个账号发粉丝,通过日期来区分
-    :return:
-    """
-    sql = f"""
-        SELECT
-            t1.date_str,
-            t1.fans_count,
-            t2.gh_id
-        FROM datastat_wx t1
-        JOIN publish_account t2 ON t1.account_id = t2.id
-        WHERE
-            t2.channel = 5
-            AND t2.status = 1
-            AND t1.date_str >= '2024-07-01'
-        ORDER BY t1.date_str;
-    """
-    result = db_client.select(sql)
-    D = {}
-    for line in result:
-        dt = line[0]
-        fans = line[1]
-        gh_id = line[2]
-        if D.get(gh_id):
-            D[gh_id][dt] = fans
-        else:
-            D[gh_id] = {dt: fans}
-    return D
-
-
-def get_publishing_accounts(db_client) -> list[dict]:
-    """
-    获取每日正在发布的账号
-    :return:
-    """
-    sql = f"""
-        SELECT DISTINCT
-            t3.`name`,
-            t3.gh_id,
-            t3.follower_count,
-            t6.account_source_name,
-            t6.mode_type,
-            t6.account_type,
-            t6.`status`
-        FROM
-            publish_plan t1
-            JOIN publish_plan_account t2 ON t1.id = t2.plan_id
-            JOIN publish_account t3 ON t2.account_id = t3.id
-            LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
-            LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
-            LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
-        WHERE
-            t1.plan_status = 1
-            AND t3.channel = 5
-            -- AND t3.follower_count > 0
-        GROUP BY t3.id;
-    """
-    account_list = db_client.select(sql)
-    result_list = [
-        {
-            "account_name": i[0],
-            "gh_id": i[1]
-        } for i in account_list
-    ]
-    return result_list
-
-
-def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
+def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -> list[dict]:
    """
    get articles details
    :return:
@@ -116,47 +55,37 @@ def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
        FROM
            official_articles_v2
        WHERE
-            ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}';
+            ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
    """
-    result = db_client.select(sql)
-    response_list = [
-        {
-            "ghId": i[0],
-            "accountName": i[1],
-            "ItemIndex": i[2],
-            "show_view_count": i[3],
-            "publish_timestamp": i[4]
-        }
-        for i in result
-    ]
+    response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
    return response_list


-def cal_account_read_rate(gh_id_tuple) -> DataFrame:
+def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
    """
    计算账号位置的阅读率
    :return:
    """
-    pq_db = PQMySQL()
-    de_db = DeNetMysql()
    response = []
-    fans_dict_each_day = get_account_fans_by_dt(db_client=de_db)
-    account_article_detail = get_account_articles_detail(
-        db_client=pq_db,
-        gh_id_tuple=gh_id_tuple
-    )
-    for line in account_article_detail:
+    for line in article_list:
        gh_id = line['ghId']
        dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
-        fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
+        fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+        if not fans:
+            fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
        if not fans:
-            fans = int(unauthorized_account.get(gh_id, 0))
+            fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
+            log(
+                task='cal_read_rate_avg_task',
+                function='cal_account_read_rate',
+                message='未获取到粉丝,使用备份粉丝表',
+                data=line
+            )
        line['fans'] = fans
-        if fans > 1000:
+        if fans > const.MIN_FANS:
            line['readRate'] = line['show_view_count'] / fans if fans else 0
            response.append(line)
-    return DataFrame(response,
-                     columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
+    return DataFrame(response, columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])


def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
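The DictCursor switch above removes the hand-written tuple-to-dict mapping in get_account_articles_detail. A minimal sketch of the assumed row shape, based only on this diff (the fetch signature and the sample values are illustrative, not taken from the library):

# Before: a default cursor yields positional tuples that callers must re-key by hand.
row = ("gh_0001", "demo-account", 1, 1500, 1730000000)
article = {
    "ghId": row[0],
    "accountName": row[1],
    "ItemIndex": row[2],
    "show_view_count": row[3],
    "publish_timestamp": row[4],
}

# After: db_client.fetch(query=sql, cursor_type=DictCursor) is assumed to return
# one dict per row, keyed by the selected column names, e.g.:
expected = {
    "ghId": "gh_0001",
    "accountName": "demo-account",
    "ItemIndex": 1,
    "show_view_count": 1500,
    "publish_timestamp": 1730000000,
}
assert article == expected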
@@ -168,7 +97,7 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
    min_time = max_time - const.STATISTICS_PERIOD

    # 通过
-    filterDataFrame = df[
+    filter_dataframe = df[
        (df["ghId"] == gh_id)
        & (min_time <= df["publish_timestamp"])
        & (df["publish_timestamp"] <= max_time)
@@ -176,13 +105,13 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
        ]

    # 用二倍标准差过滤
-    finalDF = filter_outlier_data(filterDataFrame)
+    final_dataframe = filter_outlier_data(filter_dataframe)

    return {
-        "read_rate_avg": finalDF['readRate'].mean(),
-        "max_publish_time": finalDF['publish_timestamp'].max(),
-        "min_publish_time": finalDF['publish_timestamp'].min(),
-        "records": len(finalDF)
+        "read_rate_avg": final_dataframe['readRate'].mean(),
+        "max_publish_time": final_dataframe['publish_timestamp'].max(),
+        "min_publish_time": final_dataframe['publish_timestamp'].min(),
+        "records": len(final_dataframe)
    }
@@ -204,7 +133,7 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
        WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
        ORDER BY dt_version DESC limit 1;
    """
-    result = db_client.select(select_sql)
+    result = db_client.fetch(select_sql)
    if result:
        account_name = result[0][0]
        previous_read_rate_avg = result[0][1]
@@ -246,6 +175,9 @@ def update_single_day(dt, account_list, article_df, lam):
        string_format='%Y-%m-%d'
    )

+    # accounts that produced at least one read-rate record
+    processed_account_set = set()
+
    for account in tqdm(account_list, desc=dt):
        for index in const.ARTICLE_INDEX_LIST:
            read_rate_detail = cal_avg_account_read_rate(
@@ -259,7 +191,9 @@ def update_single_day(dt, account_list, article_df, lam):
            min_publish_time = read_rate_detail['min_publish_time']
            articles_count = read_rate_detail['records']
            if articles_count:
-                if index in {1, 2}:
+                processed_account_set.add(account['gh_id'])
+                # check read rate in position 1 and 2
+                if index in [1, 2]:
                    error_obj = check_each_position(
                        db_client=lam,
                        gh_id=account['gh_id'],
@@ -269,6 +203,7 @@ def update_single_day(dt, account_list, article_df, lam):
                    )
                    if error_obj:
                        error_list.append(error_obj)
+                # insert into database
                try:
                    if not read_rate_avg:
                        continue
@@ -278,8 +213,8 @@ def update_single_day(dt, account_list, article_df, lam):
                        values
                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                    """
-                    lam.update(
-                        sql=insert_sql,
+                    lam.save(
+                        query=insert_sql,
                        params=(
                            account['account_name'],
                            account['gh_id'],
@@ -294,14 +229,17 @@ def update_single_day(dt, account_list, article_df, lam):
                        )
                    )
                except Exception as e:
+                    print(e)
                    insert_error_list.append(str(e))

+    # send a bot alert if any sql insert failed
    if insert_error_list:
        bot(
            title="更新阅读率均值,存在sql 插入失败",
            detail=insert_error_list
        )

+    # send a bot alert for read-rate outliers
    if error_list:
        columns = [
            create_feishu_columns_sheet(sheet_type="plain_text", sheet_name="account_name", display_name="账号名称"),
@@ -314,7 +252,7 @@ def update_single_day(dt, account_list, article_df, lam):
                                        display_name="相对变化率")
        ]
        bot(
-            title="更新阅读率均值,头次出现异常值通知",
+            title="阅读率均值表异常信息, 总共处理{}个账号".format(len(processed_account_set)),
            detail={
                "columns": columns,
                "rows": error_list
@@ -323,12 +261,14 @@ def update_single_day(dt, account_list, article_df, lam):
            mention=False
        )

+    # if no error, send success info
    if not error_list and not insert_error_list:
        bot(
-            title="阅读率均值表,更新成功",
+            title="阅读率均值表更新成功, 总共处理{}个账号".format(len(processed_account_set)),
            detail={
                "日期": dt
-            }
+            },
+            mention=False
        )
@@ -347,12 +287,36 @@ def main() -> None:
    else:
        dt = datetime.today().strftime('%Y-%m-%d')

-    lam = longArticlesMySQL()
-    de = DeNetMysql()
-    account_list = get_publishing_accounts(db_client=de)
-    df = cal_account_read_rate(tuple([i['gh_id'] for i in account_list]))
+    # init stat period
+    max_time = functions.str_to_timestamp(date_string=dt)
+    min_time = max_time - const.STATISTICS_PERIOD
+    min_stat_date = functions.timestamp_to_str(timestamp=min_time, string_format='%Y-%m-%d')
+
+    # init database connectors
+    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
+    long_articles_db_client.connect()
+
+    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
+    piaoquan_crawler_db_client.connect()
+
+    denet_db_client = DatabaseConnector(db_config=denet_config)
+    denet_db_client.connect()
+
+    # get account list
+    account_list = fetch_publishing_account_list(db_client=denet_db_client)
+
+    # get fans dict
+    fans_dict = fetch_account_fans(db_client=denet_db_client, start_date=min_stat_date)
+
+    # fetch article detail list from official_articles_v2
+    gh_id_tuple = tuple([i['gh_id'] for i in account_list])
+    article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
+
+    # calculate account read rate and build a dataframe
+    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)

-    update_single_day(dt, account_list, df, lam)
+    # update each day's data
+    update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)


if __name__ == '__main__':
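For reference, a hedged sketch of the data shapes the refactored pipeline passes around: fetch_account_fans is assumed to return the same gh_id -> date -> fans mapping that the deleted get_account_fans_by_dt built, and the read-rate arithmetic below uses illustrative numbers only:

# Assumed shape of fans_dict (mirrors the removed get_account_fans_by_dt output):
# gh_id -> {date string -> fans count}.
fans_dict = {
    "gh_0001": {"2024-12-01": 12000, "2024-12-02": 12100},
}

# With an article row showing show_view_count = 1500 published on 2024-12-01,
# cal_account_read_rate records readRate = 1500 / 12000 = 0.125, as long as the
# fans count clears const.MIN_FANS (rows below that threshold get no readRate).
fans = fans_dict["gh_0001"]["2024-12-01"]
read_rate = 1500 / fans
assert read_rate == 0.125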