luojunhui 7 maanden geleden
bovenliggende
commit
35d974234a
69 gewijzigde bestanden met toevoegingen van 11126 en 385 verwijderingen
  1. 1 1
      account_cold_start_daily.py
  2. 4 4
      applications/api/__init__.py
  3. 26 0
      applications/api/deep_seek_api_by_volcanoengine.py
  4. 108 0
      applications/api/gewe_api.py
  5. 88 13
      applications/api/moon_shot_api.py
  6. 54 3
      applications/api/nlp_api.py
  7. 172 7
      applications/const/__init__.py
  8. 12 8
      applications/db/__init__.py
  9. 4 3
      applications/exception/spider_error.py
  10. 28 3
      applications/functions.py
  11. 7548 0
      applications/js/toutiao.js
  12. 3 3
      applications/llm_sensitivity.py
  13. 4 0
      applications/pipeline/__init__.py
  14. 83 0
      applications/pipeline/crawler_pipeline.py
  15. BIN
      applications/so/libsph_decrypt.so
  16. 14 0
      applications/utils/__init__.py
  17. 30 0
      applications/utils/cold_start.py
  18. 61 0
      applications/utils/common.py
  19. 156 0
      applications/utils/download_video.py
  20. 58 0
      applications/utils/fetch_info_from_aigc.py
  21. 69 0
      applications/utils/item.py
  22. 52 0
      applications/utils/save_to_db.py
  23. 23 0
      applications/utils/upload.py
  24. 59 24
      applications/wxSpiderApi.py
  25. 53 0
      article_association_task.py
  26. 81 117
      cal_account_read_rate_avg_daily.py
  27. 1 1
      coldStartTasks/crawler/__init__.py
  28. 1 0
      coldStartTasks/crawler/baidu/__init__.py
  29. 1 1
      coldStartTasks/crawler/baidu/account_crawler.py
  30. 96 0
      coldStartTasks/crawler/baidu/baidu_spider.py
  31. 269 0
      coldStartTasks/crawler/baidu/video_crawler.py
  32. 6 0
      coldStartTasks/crawler/channels/__init__.py
  33. 22 0
      coldStartTasks/crawler/channels/blogger.py
  34. 41 0
      coldStartTasks/crawler/channels/search.py
  35. 4 0
      coldStartTasks/crawler/toutiao/__init__.py
  36. 64 0
      coldStartTasks/crawler/toutiao/blogger.py
  37. 25 0
      coldStartTasks/crawler/toutiao/use_js.py
  38. 4 0
      coldStartTasks/crawler/wechat/__init__.py
  39. 210 0
      coldStartTasks/crawler/wechat/article_association.py
  40. 1 25
      coldStartTasks/crawler/weixinCategoryCrawler.py
  41. 1 1
      coldStartTasks/crawler/weixin_account_association_crawler.py
  42. 3 1
      coldStartTasks/crawler/weixin_account_crawler.py
  43. 8 3
      coldStartTasks/crawler/weixin_video_crawler.py
  44. 62 16
      coldStartTasks/filter/title_similarity_task.py
  45. 276 0
      coldStartTasks/publish/basic.py
  46. 29 1
      coldStartTasks/publish/publishCategoryArticles.py
  47. 125 0
      coldStartTasks/publish/publish_article_association_articles.py
  48. 17 3
      coldStartTasks/publish/publish_video_to_pq_for_audit.py
  49. 20 1
      config/__init__.py
  50. 30 7
      config/crontab_backup
  51. 9 0
      crawler_sph_video.py
  52. 3 1
      requirements.txt
  53. 8 0
      run_baidu_video_crawler.py
  54. 9 0
      run_title_rewrite_task.py
  55. 26 0
      sh/run_article_association.sh
  56. 26 0
      sh/run_baidu_video_crawler.sh
  57. 26 0
      sh/run_gzh_video_crawler.sh
  58. 26 0
      sh/run_sph_video_crawler.sh
  59. 26 0
      sh/run_title_rewrite_task.sh
  60. 26 0
      sh/run_toutiao_account_video_crawler.sh
  61. 0 2
      sh/run_video_publish_and_audit.sh
  62. 1 1
      tasks/article_summary_task.py
  63. 224 0
      tasks/crawler_channel_account_videos.py
  64. 208 0
      tasks/crawler_toutiao_account_videos.py
  65. 270 0
      tasks/title_rewrite_task.py
  66. 12 3
      tasks/update_published_articles_minigram_detail.py
  67. 26 1
      title_similarity_score_task.py
  68. 10 0
      toutiao_video_crawler.py
  69. 83 131
      updateAccountV3.py

+ 1 - 1
account_cold_start_daily.py

@@ -54,7 +54,7 @@ class AccountColdStartDailyTask(object):
             # 抓取完成之后,给抓取到的标题进行相似度打分
             cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
             cold_start_title_similarity_task.init_database()
-            cold_start_title_similarity_task.run()
+            cold_start_title_similarity_task.run(meta_source='article')
 
             bot(
                 title="账号冷启动任务,抓取完成",

+ 4 - 4
applications/api/__init__.py

@@ -1,8 +1,8 @@
 """
 @author: luojunhui
 """
-from .deep_seek_by_byte_dance_api import fetch_deepseek_response
-from .google_ai_api import GoogleAIAPI
-from .moon_shot_api import generate_mini_program_title
+from .deep_seek_api_by_volcanoengine import fetch_deepseek_response
+from .moon_shot_api import fetch_moon_shot_response
 from .nlp_api import similarity_between_title_list
-
+from .gewe_api import WechatChannelAPI
+from .google_ai_api import GoogleAIAPI

+ 26 - 0
applications/api/deep_seek_api_by_volcanoengine.py

@@ -0,0 +1,26 @@
+"""
+@author: luojunhui
+"""
+from volcenginesdkarkruntime import Ark
+
+from config import deep_seek_model
+from config import deep_seek_default_model
+from config import deep_seek_api_key_byte_dance
+
+
+def fetch_deepseek_response(model, prompt):
+    """
+    deep_seek方法
+    """
+    client = Ark(
+        api_key=deep_seek_api_key_byte_dance,
+        timeout=1800,
+        max_retries=2,
+        )
+    response = client.chat.completions.create(
+        model=deep_seek_model.get(model, deep_seek_default_model),
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content

+ 108 - 0
applications/api/gewe_api.py

@@ -0,0 +1,108 @@
+"""
+@author: luojunhui
+"""
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
+from requests.exceptions import RequestException
+import requests
+import json
+from typing import Optional, Dict
+
+COMMON_RETRY = dict(
+    stop=stop_after_attempt(3),  # 总共尝试3次
+    wait=wait_exponential(min=2, max=30),
+    retry=retry_if_exception_type((RequestException, TimeoutError)),
+    reraise=True  # 重试耗尽后重新抛出异常
+)
+
+
+class WechatChannelAPI:
+    """
+    wechat channel api by gw
+    """
+
+    def __init__(self, base_url: str, token: str, app_id: str):
+        self.base_url = base_url
+        self.token = token
+        self.app_id = app_id
+
+    @retry(**COMMON_RETRY)
+    def search(
+            self,
+            search_key: str,
+            search_type: int,
+            page: int = 0,
+            cookie: str = "",
+            search_id: str = "",
+            offset: int = 0,
+    ) -> Optional[Dict]:
+        """
+        搜索微信视频号内容(支持重试)
+
+        :param search_key: 搜索关键字
+        :param search_type: 搜索类型,1: 搜索所有视频,2: 搜索视频号账号
+        :param page: 页码
+        :param cookie: 登录后的cookie
+        :param search_id: 搜索id
+        :param offset: 偏移量
+        :return: 返回搜索结果字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/search"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "content": search_key,
+            "category": search_type,
+            "filter": 0,
+            "page": page,
+            "cookie": cookie,
+            "searchId": search_id,
+            "offset": offset,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"API请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None
+
+    @retry(**COMMON_RETRY)
+    def get_channel_video_list(
+            self, user_id: str, last_buffer: str = ""
+    ) -> Optional[Dict]:
+        """
+        获取视频号账号的视频列表(支持重试)
+
+        :param user_id: 视频号账号ID
+        :param last_buffer: 分页标记,用于获取下一页数据
+        :return: 返回视频列表字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/userPage"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "lastBuffer": last_buffer,
+            "toUserName": user_id,
+            "maxId": 0,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"获取视频列表请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None

+ 88 - 13
applications/api/moon_shot_api.py

@@ -1,45 +1,120 @@
 """
 @author: luojunhui
 """
+import json
 from openai import OpenAI
 
-mini_program_title_generate_prompt = """
+from config import moon_shot
+
+generate_program_title_prompt = """
     请将以上标题改写成适合小程序点击和传播的小程序标题,小程序标题的写作规范如下,请学习后进行小程序标题的编写。直接输出最终的小程序标题
     小程序标题写作规范:
     1.要点前置:将最重要的信息放在标题的最前面,以快速吸引读者的注意力。例如,“5月一辈子同学,三辈子亲,送给我的老同学,听哭无数人!”中的“5月”和“一辈子同学,三辈子亲”都是重要的信息点。
     2.激发情绪:使用能够触动人心的语言,激发读者的情感共鸣。如“只剩两人同学聚会,看后感动落泪。”使用“感动落泪”激发读者的同情和怀旧情绪。
     3.使用数字和特殊符号:数字可以提供具体性,而特殊符号如“🔴”、“😄”、“🔥”等可以吸引视觉注意力,增加点击率。
-    4.悬念和好奇心:创建悬念或提出问题,激发读者的好奇心。例如,“太神奇了!长江水位下降,重庆出现惊奇一幕!”中的“惊奇一幕”就是一个悬念。
+    4.悬念和好奇心:创建悬念或提出问题,激发读者的好奇心。
     5.名人效应:如果内容与知名人士相关,提及他们的名字可以增加标题的吸引力。
     6.社会价值观:触及读者的文化和社会价值观,如家庭、友情、国家荣誉等。
     7.标点符号的运用:使用感叹号、问号等标点来增强语气和情感表达。
     8.直接的语言:使用直白、口语化的语言,易于理解,如“狗屁股,笑死我了!”。
     9.热点人物或事件:提及当前的热点人物或事件,利用热点效应吸引读者。
     10.字数适中:保持标题在10-20个字之间,既不过长也不过短,确保信息的完整性和吸引力。
-    11.适当的紧迫感:使用“最新”、“首次”、“紧急”等词汇,创造一种紧迫感,促使读者立即行动。
-    12.情感或价值诉求:使用如“感动”、“泪目”、“经典”等词汇,直接与读者的情感或价值观产生共鸣。
+    11.情感或价值诉求:使用如“感动”、“泪目”、“经典”等词汇,直接与读者的情感或价值观产生共鸣。
     避免误导:确保标题准确反映内容,避免夸大或误导读者。
     """
 
+get_title_safe_score_prompt = """
+    请你学习一下内容规范,以下标题可能会违反了某条内容规范。请你对标题做一个内容风险评级,1-10分,等级越高内容违规风险越大。 
+    请直接输出内容风险评级的分数,不要输出你的理由、分析等内容。 
+    输出:
+        只需要输出危险分级分数,不要输出任何其他内容。
+    内容规范为: 
+    4.2 色俗内容 
+        4.2.1 散布淫秽、色情内容,包括但不限于招嫖、寻找一夜情、性伴侣等。 
+        4.2.2 发布有色情意味的情色文字、情色视频、情色漫画等内容。 
+        4.2.3 以投稿/爆料等形式描述约炮经历、性交体验、偷情、涉隐私部位偷拍等伤风败俗的话题内容。 
+        4.2.4 以低俗的配图引诱用户阅读文章、关注微信公众号。包含性撩拨、性挑逗画面;疑似女性性高潮/性虐场面;偷拍的沐浴/更衣/如厕/亲热等私密画面;女性故意露出敏感部位 (纯裸露的胸、生殖器官)以及敏感部位未打码的真人写真/艺术摄影等。 
+        4.2.5 文内以低俗类的动图或引导图,诱导用户点击进而跳转至另一篇图文页或关注某个公众号。 
+        4.2.6 文章主要描述PUA撩妹、撩汉等相关话题,且引导用户关注公众号/加个人微信号/加群。 
+    4.11 煽动、夸大、误导类内容 平台鼓励创作者提供准确、清晰、能体现文章内容主旨的标题,不允许通过标题噱头诱导用户点击或误导用户。
+        包括但不限于以下情况: 
+        4.11.1 标题含有危害人身安全、恐吓侮辱、惊悚、极端内容,或者以命令式语气强迫用户阅读。 
+        4.11.2 标题无依据夸大事件严重程度、紧急程度、受影响面以及事件引发的情绪。 
+        4.11.3 标题以浮夸的描述,反常识强调某种食物/行为对人体健康的影响,煽动人群要/不要去做某行为。 
+        4.11.4 非官方通知或者公告,但标题假借官方名义煽动获取流量,或以信息来源机密、看完即删来诱导用户。 
+        4.11.5 标题故意隐藏关键信息,或无中生有部分信息,给用户造成误导。 
+        4.12 违反国家法律法规禁止的内容 
+            (1)违反宪法确定的基本原则的; 
+            (2)危害国家安全,泄露国家秘密,颠覆国家政权,破坏国家统一的; 
+            (3)损害国家荣誉和利益的; 
+            (4)煽动民族仇恨、民族歧视,破坏民族团结的; 
+            (5)破坏国家宗教政策,宣扬邪教和封建迷信的; 
+            (6)散布不实信息,扰乱社会秩序,破坏社会稳定的; 
+            (7)散布淫秽、色情、赌博、暴力、恐怖或者教唆犯罪的; 
+            (8)侮辱或者诽谤他人,侵害他人合法权益的; 
+            (9)煽动非法集会、结社、游行、示威、聚众扰乱社会秩序; 
+            (10)以非法民间组织名义活动的; 
+            (11)不符合《即时通信工具公众信息服务发展管理暂行规定》及遵守法律法规、社会主义制度、国家利益、公民合法利益、公共秩序、社会道德风尚和信息真实性等“七条底线”要求的; 
+            (12)含有法律、行政法规禁止的其他内容的。
+    输入的标题是: 
+    """
+
+make_title_safe_prompt = """
+    以下每行为一个文章的标题,请用尽量平实的语言对以上标题进行改写,保持在10~15字左右,请注意:
+    1. 不要虚构或改变标题的含义。
+    2. 不要用笃定的语气描述存疑的可能性,不要将表述可能性的问句改为肯定句。
+    直接输出改写后的标题列表。
+    在改写完成后,再输出一次,在改写的标题前增加和标题情感、语气匹配的特殊符号,如:“🔴”、“😄”、“🔥”、“😨”等等
+    输出:
+        输出结果是Dict, 格式为: 
+        {
+        "title_v1": 请填写第一次输出的标题,
+        "title_v2": 请填写第二次输出的标题
+        }
+    输入的标题是: 
+        """
+
 
-def generate_mini_program_title(ori_title):
+def fetch_moon_shot_response(task, input_text, output_type="text"):
     """
-    prompt + kimi + ori_title generate new title
-    :param ori_title:
-    :return:
+    调用kimi的api获取结果
     """
+    # generate prompt
+    match task:
+        case "generate_kimi_title":
+            prompt = input_text + '\n' + generate_program_title_prompt
+        case "get_title_safe_score":
+            prompt = get_title_safe_score_prompt + input_text
+        case "make_title_safe":
+            prompt = make_title_safe_prompt + input_text
+        case _:
+            prompt = input_text
+
+    # init client
     client = OpenAI(
-        api_key='sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q',
-        base_url="https://api.moonshot.cn/v1"
+        api_key=moon_shot['api_key'],
+        base_url=moon_shot['base_url']
     )
+
+    # get response format
+    if output_type == "json":
+        response_format = {"type": "json_object"}
+    else:
+        response_format = {"type": "text"}
+
     chat_completion = client.chat.completions.create(
         messages=[
             {
                 "role": "user",
-                "content": ori_title + "\n" + mini_program_title_generate_prompt
+                "content": prompt,
             }
         ],
-        model="moonshot-v1-32k",
+        model=moon_shot['model'],
+        response_format=response_format,
     )
     response = chat_completion.choices[0].message.content
-    return response.split("\n")[0]
+    if output_type == "json":
+        response_json = json.loads(response)
+        return response_json
+
+    return response

+ 54 - 3
applications/api/nlp_api.py

@@ -2,6 +2,10 @@
 @author: luojunhui
 """
 import requests
+import traceback
+from requests.exceptions import RequestException, JSONDecodeError
+
+from applications.aliyunLogApi import log
 
 
 def similarity_between_title_list(target_title_list: list[str], base_title_list: list[str]) -> list[list[float]]:
@@ -11,7 +15,9 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
     :param base_title_list: base title_list
     :return: list of similarity
     """
+
     url = 'http://61.48.133.26:6060/nlp'
+    url_backup = 'http://192.168.203.4:6060/nlp'
     body = {
         "data": {
             "text_list_a": target_title_list,
@@ -20,7 +26,52 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
         "function": "similarities_cross",
         "use_cache": False
     }
-    response_json = requests.post(url, json=body, timeout=120).json()
-    score_array = response_json['score_list_list']
-    return score_array
 
+    try:
+        response = requests.post(url, json=body, timeout=120)
+        if response.status_code != 200:
+            response = requests.post(url_backup, json=body, timeout=120)
+    except RequestException as e:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message="nlp server web error",
+            data={
+                "e": str(e),
+                "error_msg": traceback.format_exc()
+            }
+        )
+        # fall back to the backup nlp server
+        response = requests.post(url_backup, json=body, timeout=120)
+
+    if response.status_code != 200:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message='nlp server request error',
+            data={
+                "status_code": response.status_code,
+                "response_text": response.text[:200]  # 截取部分内容避免过大
+            }
+        )
+        return []
+
+    try:
+        response_json = response.json()
+        score_array = response_json['score_list_list']
+    except (JSONDecodeError, KeyError) as e:
+        log(
+            task="nlp",
+            function="similarity_between_title_list",
+            status="fail",
+            message='nlp server response error',
+            data={
+                "error_type": type(e).__name__,
+                "raw_response": response.text[:200]
+            }
+        )
+        return []
+
+    return score_array

+ 172 - 7
applications/const/__init__.py

@@ -4,7 +4,7 @@
 """
 
 
-class coldStartTaskConst:
+class ColdStartTaskConst:
     """
     冷启动任务常量配置
     """
@@ -12,6 +12,44 @@ class coldStartTaskConst:
     INIT_STATUS = 1  # 文章初始状态
     BAD_STATUS = 0  # 低质量文章状态
 
+    # 常量
+    ACCOUNT_GOOD_STATUS = 1
+
+    # 账号是否每日抓取
+    ACCOUNT_DAILY_SCRAPE = 1
+    ACCOUNT_NOT_DAILY_SCRAPE = 0
+
+    # 默认值
+    DEFAULT_VIEW_COUNT = 0
+    DEFAULT_LIKE_COUNT = 0
+    DEFAULT_ARTICLE_STATUS = 1
+    DEFAULT_TIMESTAMP = 1717171200
+
+    # 标题sensitivity
+    TITLE_SENSITIVE = 1
+    TITLE_NOT_SENSITIVE = 0
+
+    # 文章联想深度
+    ARTICLE_ASSOCIATION_MAX_DEPTH = 4
+
+    # 相关分百分位阈值
+    PERCENT_THRESHOLD = 95
+
+    # 相关性分阈值
+    CORRELATION_THRESHOLD = 0.5
+
+    # 阅读量阈值
+    READ_COUNT_THRESHOLD = 1000
+
+    # 阅读均值倍数阈值
+    READ_AVG_THRESHOLD = 1.3
+
+    # 群发类型
+    BULK_PUBLISH_TYPE = 9
+
+    # 种子文章数量
+    SEED_ARTICLE_LIMIT_NUM = 60
+
 
 class updatePublishedMsgTaskConst:
     """
@@ -41,7 +79,7 @@ class updatePublishedMsgTaskConst:
     # 服务号
     SERVICE_TYPE = 2
     # 监测周期(秒)
-    MONITOR_PERIOD = 60 * 60 * 24 * 7
+    MONITOR_PERIOD = 60 * 60 * 24 * 3
 
     # 新号抓文章周期
     NEW_ACCOUNT_CRAWL_PERIOD = 60 * 60 * 24 * 30
@@ -50,7 +88,7 @@ class updatePublishedMsgTaskConst:
     SUBSCRIBE_FAIL_RATE_THRESHOLD = 0.3
 
 
-class updateAccountReadRateTaskConst:
+class UpdateAccountReadRateTaskConst:
     """
     更新账号阅读率常量配置
     """
@@ -66,8 +104,14 @@ class updateAccountReadRateTaskConst:
     # 文章位置
     ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
 
+    # 默认粉丝
+    DEFAULT_FANS = 0
 
-class updateAccountReadAvgTaskConst:
+    # 最低粉丝量
+    MIN_FANS = 1000
+
+
+class UpdateAccountReadAvgTaskConst:
     """
     更新账号阅读均值常量配置
     """
@@ -86,6 +130,19 @@ class updateAccountReadAvgTaskConst:
     ARTICLES_DAILY = 1
     TOULIU = 2
 
+    # 默认粉丝
+    DEFAULT_FANS = 0
+
+    # index list
+    ARTICLE_INDEX_LIST = [1, 2, 3, 4, 5, 6, 7, 8]
+
+    # 默认点赞
+    DEFAULT_LIKE = 0
+
+    # 状态
+    USING_STATUS = 1
+    NOT_USING_STATUS = 0
+
 
 class WeixinVideoCrawlerConst:
     """
@@ -139,18 +196,23 @@ class WeixinVideoCrawlerConst:
     DEFAULT_ACCOUNT_UID = 76862180
 
     # 每天发送的审核视频数量
-    MAX_VIDEO_NUM = 500
+    MAX_VIDEO_NUM = 1000
+
+    # 单次发布视频审核量
+    MAX_VIDEO_NUM_PER_PUBLISH = 350
 
     # 标题状态
     TITLE_DEFAULT_STATUS = 0
     TITLE_EXIT_STATUS = 1
     TITLE_FESTIVAL_STATUS = 2
-    TITLE_DUPLICATE_STATUS = 3
-    TITLE_SHORT_STATUS = 4
+    TITLE_SHORT_STATUS = 3
 
     # 标题最短长度
     TITLE_MIN_LENGTH = 15
 
+    # safe score
+    TITLE_SAFE_SCORE_THRESHOLD = 7
+
 
 class UpdateMiniProgramDetailConst(updatePublishedMsgTaskConst):
     """
@@ -207,6 +269,109 @@ class ArticleCollectorConst:
     ARTICLE_UNKNOWN_CODE = 10000
 
 
+class BaiduVideoCrawlerConst:
+    """
+    const for baidu video crawler
+    """
+    # account status
+    BAIDU_ACCOUNT_GOOD_STATUS = 1
+    BAIDU_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 17040384000000
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # timestamp To Cursor
+    TIMESTAMP_TO_CURSOR = 10000
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+
+class TitleRewriteTaskConst:
+    """
+    title rewrite task const
+    """
+    # title rewrite status
+    TITLE_REWRITE_INIT_STATUS = 0
+    TITLE_REWRITE_SUCCESS_STATUS = 1
+    TITLE_REWRITE_FAIL_STATUS = 99
+    TITLE_REWRITE_LOCK_STATUS = 101
+
+    # article status
+    ARTICLE_AUDIT_PASSED_STATUS = 1
+    ARTICLE_POSITIVE_STATUS = 0
+
+    # title useful status
+    TITLE_USEFUL_STATUS = 1
+
+    # prompt version
+    PROMPT_VERSION = "xx_250228"  # 信欣2025-02-28提供
+
+    # lock expire time 1h
+    TITLE_REWRITE_LOCK_TIME = 60 * 60
+
+
+class ChannelVideoCrawlerConst:
+    """
+    const for channel video crawler
+    """
+    # account status
+    CHANNEL_ACCOUNT_GOOD_STATUS = 1
+    CHANNEL_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 1704038400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+    # title length min
+    MIN_TITLE_LENGTH = 10
+
+    # max video length(second)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep second
+    SLEEP_SECOND = 2
+
+
+class ToutiaoVideoCrawlerConst:
+    """
+    const for toutiao video crawler
+    """
+    # platform
+    PLATFORM = "toutiao"
+
+    # account status
+    TOUTIAO_ACCOUNT_GOOD_STATUS = 1
+    TOUTIAO_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2021-01-01 00:00:00
+    DEFAULT_CURSOR = 1609430400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # title length min
+    MIN_TITLE_LENGTH = 10
+
+    # max video length(second)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep second
+    SLEEP_SECOND = 3
+
+
+
+
+
+
 # 视频转文本任务
 class VideoToTextConst:
     """

+ 12 - 8
applications/db/__init__.py

@@ -30,12 +30,12 @@ class DatabaseConnector:
         """
         try:
             self.connection = pymysql.connect(
-                host=self.db_config.get('host', 'localhost'),
-                user=self.db_config['user'],
-                password=self.db_config['password'],
-                db=self.db_config['db'],
-                port=self.db_config.get('port', 3306),
-                charset=self.db_config.get('charset', 'utf8mb4')
+                host=self.db_config.get("host", "localhost"),
+                user=self.db_config["user"],
+                password=self.db_config["password"],
+                db=self.db_config["db"],
+                port=self.db_config.get("port", 3306),
+                charset=self.db_config.get("charset", "utf8mb4"),
             )
         except pymysql.MySQLError as e:
             raise ConnectionError(f"无法连接到数据库: {e}")
@@ -48,9 +48,10 @@ class DatabaseConnector:
             self.connection.close()
             self.connection = None
 
-    def fetch(self, query, cursor_type=None):
+    def fetch(self, query, cursor_type=None, params=None):
         """
         执行单条查询语句,并返回结果。
+        :param params: 查询传参
         :param cursor_type: 输出的返回格式
         :param query: 查询语句
         :return: 查询结果列表
@@ -61,7 +62,10 @@ class DatabaseConnector:
 
         try:
             with self.connection.cursor(cursor_type) as cursor:
-                cursor.execute(query)
+                if params:
+                    cursor.execute(query, params)
+                else:
+                    cursor.execute(query)
                 result = cursor.fetchall()
                 return result
         except pymysql.MySQLError as e:

+ 4 - 3
applications/exception/spider_error.py

@@ -7,9 +7,9 @@ from applications import log
 
 
 class SpiderError(Exception):
-    """数据库查询异常"""
+    """spider_task_error"""
 
-    def __init__(self, error=None, spider=None, url=None):
+    def __init__(self, platform=None, error=None, spider=None, url=None):
         """
         :param error: 异常对象,可选,用于提供更详细的错误信息。
         :param spider: 爬虫任务
@@ -22,7 +22,8 @@ class SpiderError(Exception):
         }
         log(
             task="spider_task",
-            function="log_spider_error",
+            function="{}".format(platform),
+            message="{} 抓取失败".format(spider),
             data=error_obj
         )
         super().__init__(json.dumps(error_obj, ensure_ascii=False, indent=4))

+ 28 - 3
applications/functions.py

@@ -153,11 +153,11 @@ class Functions(object):
         快代理
         """
         # 隧道域名:端口号
-        tunnel = "l901.kdltps.com:15818"
+        tunnel = "j685.kdltps.com:15818"
 
         # 用户名密码方式
-        username = "t11983523373311"
-        password = "mtuhdr2z"
+        username = "t14070979713487"
+        password = "hqwanfvy"
         proxies = {
             "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
             "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
@@ -296,3 +296,28 @@ class Functions(object):
         params = parse_qs(urlparse(url).query)
         info = params.get(key, [])
         return info[0] if info else None
+
+    @classmethod
+    def download_baidu_videos(cls, video_url, save_path):
+        """
+        :param video_url: baidu video url
+        :param save_path: save path
+        """
+        if os.path.exists(save_path):
+            return save_path
+
+        response = requests.get(
+            video_url,
+            headers={
+                'User-Agent': FakeUserAgent().chrome,
+                "Accept": "*/*",
+                "Accept-Language": "zh-CN,zh;q=0.9"
+            }
+        )
+        with open(save_path, 'wb') as f:
+            f.write(response.content)
+        TEN_KB = 1024 * 10
+        if os.path.getsize(save_path) > TEN_KB:
+            return save_path
+        else:
+            return None

File diff suppressed because it is too large
+ 7548 - 0
applications/js/toutiao.js


+ 3 - 3
applications/llm_sensitivity.py

@@ -8,8 +8,8 @@ from openai import OpenAI
 
 def request_llm_api(prompt, text):
     client = OpenAI(
-        api_key='sk-c1b18099dadc4dd1b48239bdde184f6c',
-        base_url="https://api.deepseek.com"
+        api_key='5e275c38-44fd-415f-abcf-4b59f6377f72',
+        base_url="https://ark.cn-beijing.volces.com/api/v3"
     )
     chat_completion = client.chat.completions.create(
         messages=[
@@ -18,7 +18,7 @@ def request_llm_api(prompt, text):
                 "content": prompt + text,
             }
         ],
-        model="deepseek-chat",
+        model="ep-20250213194558-rrmr2", # deepseek-v3
         temperature=0.2,
         response_format={"type": "json_object"}
     )

+ 4 - 0
applications/pipeline/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .crawler_pipeline import scrape_video_entities_process

+ 83 - 0
applications/pipeline/crawler_pipeline.py

@@ -0,0 +1,83 @@
+"""
+@author: luojunhui
+"""
+
+import os
+import json
+
+from applications import log
+
+from applications.utils import download_gzh_video
+from applications.utils import download_toutiao_video
+from applications.utils import upload_to_oss
+
+from config import apolloConfig
+
+my_config = apolloConfig()
+
+empty_dict = {}
+sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    title sensitive words filter
+    """
+    for word in sensitive_word_list:
+        if word in title:
+            return True
+
+    return False
+
+
+def whether_duplicate_video_title(video_title: str, db_client) -> bool:
+    """
+    whether duplicate video title
+    """
+    sql = f"""
+        select id from publish_single_video_source
+        where article_title = %s;
+    """
+    duplicate_id = db_client.fetch(query=sql, params=(video_title,))
+    if duplicate_id:
+        return True
+
+    return False
+
+
+def scrape_video_entities_process(video_item, db_client) -> dict:
+    """
+    video crawler pipeline
+    """
+    article_url = video_item["article_url"]
+    platform = video_item["platform"]
+    video_title = video_item["article_title"]
+    # whether title sensitive
+    if whether_title_sensitive(video_title):
+        return empty_dict
+
+    # whether duplicate video title
+    if whether_duplicate_video_title(video_title, db_client):
+        return empty_dict
+
+    # download video
+    match platform:
+        case "toutiao":
+            video_path = download_toutiao_video(article_url)
+        case "gzh":
+            video_path = download_gzh_video(article_url)
+        case "hksp":
+            video_path = ""
+        case "sph":
+            video_path = ""
+        case _:
+            return empty_dict
+
+    if video_path:
+        # upload video to oss
+        oss_path = upload_to_oss(video_path)
+        video_item["video_oss_path"] = oss_path
+        os.remove(video_path)
+        return video_item
+    else:
+        return empty_dict

BIN
applications/so/libsph_decrypt.so


+ 14 - 0
applications/utils/__init__.py

@@ -0,0 +1,14 @@
+"""
+utils
+"""
+from .cold_start import whether_title_sensitive
+from .cold_start import get_inner_account_set
+from .common import *
+from .download_video import download_gzh_video
+from .download_video import download_sph_video
+from .download_video import download_toutiao_video
+from .item import Item
+from .save_to_db import insert_into_single_video_source_table
+from .upload import upload_to_oss
+from .fetch_info_from_aigc import fetch_account_fans
+from .fetch_info_from_aigc import fetch_publishing_account_list

+ 30 - 0
applications/utils/cold_start.py

@@ -0,0 +1,30 @@
+"""
+@author: luojunhui
+"""
+import json
+
+from applications import aiditApi
+from config import apolloConfig
+
+config = apolloConfig()
+sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    :param title: 视频标题
+    判断视频的标题是否包含敏感词
+    """
+    for word in sensitive_word_list:
+        if word in title:
+            return True
+    return False
+
+
+def get_inner_account_set() -> set:
+    """
+    get inner account set
+    """
+    accounts = aiditApi.get_publish_account_from_aigc()
+    gh_id_list = [i['ghId'] for i in accounts]
+    return set(gh_id_list)

+ 61 - 0
applications/utils/common.py

@@ -0,0 +1,61 @@
+"""
+@author: luojunhui
+"""
+
+import hashlib
+
+from requests import RequestException
+from tenacity import (
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
+
+
+def str_to_md5(strings):
+    """
+    字符串转化为 md5 值
+    :param strings:
+    :return:
+    """
+    # 将字符串转换为字节
+    original_bytes = strings.encode("utf-8")
+    # 创建一个md5 hash对象
+    md5_hash = hashlib.md5()
+    # 更新hash对象,传入原始字节
+    md5_hash.update(original_bytes)
+    # 获取16进制形式的MD5哈希值
+    md5_value = md5_hash.hexdigest()
+    return md5_value
+
+
+def proxy():
+    """
+    快代理
+    """
+    # 隧道域名:端口号
+    tunnel = "j685.kdltps.com:15818"
+
+    # 用户名密码方式
+    username = "t14070979713487"
+    password = "hqwanfvy"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
+
+
+def request_retry(retry_times, min_retry_delay, max_retry_delay):
+    """
+    :param retry_times:
+    :param min_retry_delay:
+    :param max_retry_delay:
+    """
+    common_retry = dict(
+        stop=stop_after_attempt(retry_times),
+        wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
+        retry=retry_if_exception_type((RequestException, TimeoutError)),
+        reraise=True  # 重试耗尽后重新抛出异常
+    )
+    return common_retry

+ 156 - 0
applications/utils/download_video.py

@@ -0,0 +1,156 @@
+"""
+@author: luojunhui
+"""
+
+import os
+import re
+import html
+import cffi
+import traceback
+
+import requests
+from uuid import uuid4
+from fake_useragent import FakeUserAgent
+
+from applications.utils.common import str_to_md5
+from config import decrypt_key_path
+
+headers = {"Content-Type": "application/json", "User-Agent": FakeUserAgent().chrome}
+
+
+def extract_video_url_from_article(article_url):
+    """
+    Fetch an article page and extract the video url embedded in its
+    ``mp_video_trans_info`` script section.
+    :param article_url: url of the article page
+    :return: the video url with ``\\xNN`` escapes and html entities decoded
+    :raises ValueError: if the page does not contain the expected video block
+    """
+    response = requests.get(
+        url=article_url,
+        headers={"User-Agent": FakeUserAgent().random},
+        # without a timeout a stalled connection would block forever
+        timeout=120,
+    )
+    matched = re.search(
+        r"mp_video_trans_info.*url:\s*\(\'(.*?)\'\)\.replace",
+        response.text,
+        re.S | re.M,
+    )
+    if matched is None:
+        raise ValueError("no video url found in article: {}".format(article_url))
+
+    # An escape is exactly two hex digits. The previous pattern ``\\x\d+`` missed
+    # escapes containing a-f (e.g. ``\x3d``) and swallowed digits following an escape.
+    decoded = re.sub(
+        r"\\x([0-9a-fA-F]{2})",
+        lambda m: bytes.fromhex(m.group(1)).decode(),
+        matched.group(1),
+    )
+    return html.unescape(decoded)
+
+
+def download_gzh_video(article_url):
+    """
+    Download the video embedded in an official-account (gzh) article.
+    :param article_url: url of the article page
+    :return: local path of the saved file, or None when no video url could be
+        extracted or the downloaded file is not larger than 10 KB
+    """
+    try:
+        video_url = extract_video_url_from_article(article_url)
+    except Exception:
+        # best effort: an article without an extractable video yields None
+        return None
+
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    request_headers = {
+        "Accept": "*/*",
+        "Accept-Language": "zh,zh-CN;q=0.9",
+        "Connection": "keep-alive",
+        "Origin": "https://mp.weixin.qq.com",
+        "Referer": "https://mp.weixin.qq.com/",
+        "Sec-Fetch-Dest": "video",
+        "Sec-Fetch-Mode": "cors",
+        "Sec-Fetch-Site": "cross-site",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+        "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"macOS"',
+    }
+    # stream to disk instead of holding the whole video in memory; the timeout
+    # keeps a stalled connection from blocking forever
+    with requests.get(
+        video_url, headers=request_headers, stream=True, timeout=120
+    ) as res:
+        with open(save_path, "wb") as f:
+            for chunk in res.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+
+    min_valid_size = 1024 * 10
+    if os.path.getsize(save_path) > min_valid_size:
+        return save_path
+
+    # the payload is too small to be a real video: do not leave it on disk
+    os.remove(save_path)
+    return None
+
+
+def download_sph_video(download_url, key):
+    """
+    Download an encrypted video, decrypt it and keep only the decrypted file.
+    :param download_url: url of the encrypted video
+    :param key: decryption key forwarded to decrypt_sph_video
+    :return: local path of the decrypted file
+    :raises RuntimeError: if the download or the decryption fails; partially
+        written files are removed before raising
+    """
+    file_id = uuid4().hex
+    encrypted_path = f"static/encrypted_{file_id}.mp4"
+    decrypted_path = f"static/decrypted_{file_id}.mp4"
+
+    try:
+        # the timeout keeps a stalled connection from blocking forever
+        with requests.get(
+            download_url, headers=headers, stream=True, timeout=120
+        ) as response:
+            response.raise_for_status()
+
+            with open(encrypted_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:  # filter out keep-alive chunks
+                        f.write(chunk)
+
+        decrypt_sph_video(encrypted_path, key, decrypted_path)
+        os.remove(encrypted_path)
+        return decrypted_path
+
+    except Exception as e:
+        print(traceback.format_exc())
+        # remove whatever was written so failed attempts do not pile up on disk
+        for path in [encrypted_path, decrypted_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
+        raise RuntimeError(f"Video processing failed: {str(e)}") from e
+
+
+def decrypt_sph_video(video_path: str, key: int, save_path: str) -> None:
+    """
+    Decrypt video file using C library.
+    Args:
+        video_path: Path to encrypted video file
+        key: 32-bit unsigned integer decryption key
+        save_path: Path to save decrypted video
+    Raises:
+        RuntimeError: If decryption fails
+    """
+    print("key is {}".format(key))
+    ffi = cffi.FFI()
+    # length handed to the C routine; presumably only this leading part of the
+    # file is encrypted — confirm against the library
+    decrypt_length = 2**17
+
+    try:
+        lib = ffi.dlopen(decrypt_key_path)
+        ffi.cdef(
+            "void decrypt(unsigned char *data, const size_t data_length, const uint32_t key);"
+        )
+
+        with open(video_path, "rb") as f:
+            encrypted_data = f.read()
+
+        data_length = len(encrypted_data)
+        # The C routine is told the buffer holds decrypt_length bytes, so the
+        # buffer must be at least that large even for shorter files; the
+        # zero-filled padding is dropped again below.
+        c_data = ffi.new("unsigned char[]", max(data_length, decrypt_length))
+        # copy the bytes directly instead of building a per-byte python list
+        ffi.memmove(c_data, encrypted_data, data_length)
+        lib.decrypt(c_data, decrypt_length, int(key))
+        decrypted_data = bytes(ffi.buffer(c_data, data_length)[:])
+
+        with open(save_path, "wb") as f:
+            f.write(decrypted_data)
+
+    except Exception as e:
+        print(traceback.format_exc())
+        raise RuntimeError(f"Decryption failed: {str(e)}") from e
+
+
+def download_toutiao_video(video_url: str) -> str:
+    """
+    Download a toutiao video to the local static directory.
+    :param video_url: direct url of the video
+    :return: local path of the saved file (named by the md5 of the url)
+    :raises requests.HTTPError: if the server answers with an error status, so
+        an error page is never stored as a video
+    """
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    # context manager releases the connection; timeout avoids hanging forever
+    with requests.get(
+        video_url, headers=headers, stream=True, timeout=120
+    ) as response:
+        response.raise_for_status()
+        with open(save_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+
+    return save_path
+

+ 58 - 0
applications/utils/fetch_info_from_aigc.py

@@ -0,0 +1,58 @@
+"""
+fetch info from aigc database system
+"""
+from collections import defaultdict
+from typing import List, Dict
+
+from pymysql.cursors import DictCursor
+
+
+def fetch_publishing_account_list(db_client) -> List[Dict]:
+    """
+    Query the accounts attached to active publish plans (plan_status = 1,
+    channel = 5).
+    :param db_client: connector exposing fetch(query=..., cursor_type=...)
+    :return: whatever db_client.fetch returns for a DictCursor query; each row
+        carries account_name, gh_id, fans, account_source, mode_type,
+        account_type and status
+    """
+    query_text = """
+        SELECT DISTINCT
+            t3.`name` as account_name,
+            t3.gh_id as gh_id,
+            t3.follower_count as fans,
+            t6.account_source_name as account_source,
+            t6.mode_type as mode_type,
+            t6.account_type as account_type,
+            t6.`status` as status
+        FROM
+            publish_plan t1
+            JOIN publish_plan_account t2 ON t1.id = t2.plan_id
+            JOIN publish_account t3 ON t2.account_id = t3.id
+            LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
+            LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
+            LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
+        WHERE
+            t1.plan_status = 1
+            AND t3.channel = 5
+            GROUP BY t3.id;
+    """
+    return db_client.fetch(query=query_text, cursor_type=DictCursor)
+
+
+def fetch_account_fans(db_client, start_date: str) -> Dict:
+    """
+    Fetch the daily fans count of each account since start_date.
+    :param db_client: connector exposing fetch(query=..., params=...)
+    :param start_date: lower bound (inclusive) compared against date_str
+    :return: mapping gh_id -> {date_str: fans_count}
+    """
+    # start_date is sent as a bound parameter instead of being formatted into
+    # the statement, so its content can never alter the query
+    sql = """
+        SELECT t1.date_str, t1.fans_count, t2.gh_id
+        FROM datastat_wx t1 JOIN publish_account t2 ON t1.account_id = t2.id
+        WHERE t2.channel = 5
+            AND t2.status = 1
+            AND t1.date_str >= %s
+        ORDER BY t1.date_str;
+        """
+    rows = db_client.fetch(query=sql, params=(start_date,))
+    fans_dict = defaultdict(dict)
+    for dt, fans, gh_id in rows:
+        fans_dict[gh_id][dt] = fans
+    return fans_dict
+
+

+ 69 - 0
applications/utils/item.py

@@ -0,0 +1,69 @@
+"""
+@author: luojunhui
+"""
+
+import time
+
+# Default value of every column of the single video source table.
+# NOTE: "crawler_timestamp" below is evaluated once, at import time; Item
+# does not use that frozen value and computes the timestamp when defaults are applied.
+default_single_video_table_fields = {
+    "platform": "gzh",
+    "article_title": None,
+    "content_trace_id": None,
+    "read_cnt": 0,
+    "article_index": None,
+    "out_account_name": None,
+    "article_url": None,
+    "url_unique_md5": None,
+    "category": None,
+    "publish_timestamp": None,
+    "out_account_id": None,
+    "cover_url": None,
+    "crawler_timestamp": int(time.time()),
+    "source_account": 1,
+    "article_publish_type": None,
+    "like_cnt": 0,
+    "bad_status": 0,
+    "tags": None,
+    "video_oss_path": None,
+}
+
+
+class Item(object):
+    """
+    format save to article meta table or single video source table
+    """
+
+    def __init__(self):
+        # collected key / value pairs of the record
+        self.item = {}
+
+    def add(self, key, value):
+        """
+        add key value to item
+        """
+        self.item[key] = value
+
+    def check_video_item(self):
+        """
+        Fill every video field that is missing or None with its default.
+        The default crawler_timestamp is the current time of this call, not
+        the time the module was imported.
+        """
+        for field, default in default_single_video_table_fields.items():
+            if self.item.get(field) is not None:
+                continue
+            if field == "crawler_timestamp":
+                default = int(time.time())
+            self.item[field] = default
+
+    def check_article_item(self):
+        """
+        check article item (currently a no-op)
+        """
+        return
+
+    def check(self, source):
+        """
+        Run the check matching source: "video" or "article".
+        Any other source is ignored.
+        """
+        if source == "video":
+            self.check_video_item()
+        elif source == "article":
+            self.check_article_item()

+ 52 - 0
applications/utils/save_to_db.py

@@ -0,0 +1,52 @@
+"""
+@author: luojunhui
+"""
+
+import traceback
+from applications.aliyunLogApi import log
+
+
+def insert_into_single_video_source_table(db_client, video_item):
+    """
+    Insert one video record into publish_single_video_source.
+    :param db_client: connector exposing save(query=..., params=...)
+    :param video_item: mapping holding one value per inserted column
+    Any exception raised while building the parameters or saving is logged
+    through log() and not re-raised.
+    """
+    # keys of video_item, in the order of the column list of the statement
+    ordered_fields = (
+        "content_trace_id",
+        "article_title",
+        "out_account_id",
+        "out_account_name",
+        "read_cnt",
+        "like_cnt",
+        "article_url",
+        "cover_url",
+        "video_oss_path",
+        "publish_timestamp",
+        "crawler_timestamp",
+        "url_unique_md5",
+        "category",
+        "tags",
+        "platform",
+        "source_account",
+    )
+    statement = """
+        INSERT INTO publish_single_video_source
+        (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
+        values
+        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+    try:
+        row_values = tuple(video_item[field] for field in ordered_fields)
+        db_client.save(query=statement, params=row_values)
+    except Exception as error:
+        failure_detail = {
+            "error": str(error),
+            "traceback": traceback.format_exc(),
+            "video_id": video_item["url_unique_md5"],
+            "oss_path": video_item["video_oss_path"],
+        }
+        log(
+            task="{}_video_crawler".format(video_item["platform"]),
+            function="save_each_video",
+            message="save video failed",
+            data=failure_detail,
+        )

+ 23 - 0
applications/utils/upload.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+
+import oss2
+from uuid import uuid4
+
+
+def upload_to_oss(local_video_path):
+    """
+    Upload a local video file to the oss bucket.
+    :param local_video_path: path of the file to upload
+    :return: the generated object key ("long_articles/video/" + a random uuid4)
+    """
+    # NOTE(review): the access key pair is hard-coded in source; consider loading it from configuration
+    auth = oss2.Auth("LTAIP6x1l3DXfSxm", "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon")
+    bucket = oss2.Bucket(auth, "oss-cn-hangzhou.aliyuncs.com", "art-pubbucket")
+    object_key = "long_articles/video/{}".format(uuid4())
+    bucket.put_object_from_file(key=object_key, filename=local_video_path)
+    return object_key

+ 59 - 24
applications/wxSpiderApi.py

@@ -1,9 +1,12 @@
 """
 @author: luojunhui
 """
+
 import json
+import time
 import requests
 
+from applications.aliyunLogApi import log
 from applications.decoratorApi import retryOnNone
 
 
@@ -11,13 +14,12 @@ class WeixinSpider(object):
     """
     Update account articles
     """
+
     # ip = "8.217.190.241"
     # ip = "47.98.154.124"
     # port = "8888"
     base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
-    headers = {
-        "Content-Type": "application/json"
-    }
+    headers = {"Content-Type": "application/json"}
 
     @classmethod
     @retryOnNone()
@@ -27,11 +29,10 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/keyword".format(cls.base_url)
-        payload = json.dumps({
-            "keyword": title,
-            "cursor": page
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps({"keyword": title, "cursor": page})
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -45,13 +46,17 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/detail".format(cls.base_url)
-        payload = json.dumps({
-            "content_link": content_link,
-            "is_count": is_count,
-            "is_ad": False,
-            "is_cache": is_cache
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps(
+            {
+                "content_link": content_link,
+                "is_count": is_count,
+                "is_ad": False,
+                "is_cache": is_cache,
+            }
+        )
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -60,12 +65,14 @@ class WeixinSpider(object):
         """
         :return:
         """
-        url = '{}/blogger'.format(cls.base_url)
+        url = "{}/blogger".format(cls.base_url)
         payload = {
-            'account_id': ghId,
-            'cursor': index,
+            "account_id": ghId,
+            "cursor": index,
         }
-        response = requests.post(url=url, headers=cls.headers, data=json.dumps(payload), timeout=120)
+        response = requests.post(
+            url=url, headers=cls.headers, data=json.dumps(payload), timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -76,9 +83,11 @@ class WeixinSpider(object):
         :param content_url:
         :return:
         """
-        url = '{}/account_info'.format(cls.base_url)
+        url = "{}/account_info".format(cls.base_url)
         data = {"content_link": content_url}
-        response = requests.request("POST", url=url, headers=cls.headers, json=data, timeout=120)
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, json=data, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -89,8 +98,34 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/recommend".format(cls.base_url)
-        payload = json.dumps(
-            {"content_link": content_link}
+        payload = json.dumps({"content_link": content_link})
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, data=payload, timeout=120
+        )
+        response_json = response.json()
+        if response_json["code"] != 0:
+            return cls.get_recommend_articles(content_link)
+        time.sleep(3)
+        return response.json()
+
+    @classmethod
+    def get_recommend_articles_v2(cls, content_link) -> dict:
+        """
+        use content link to get recommend articles
+        :param content_link:
+        :return:
+        """
+        url = "http://datapi.top/wxapi/relatedarticle"
+        payload = {
+            'url': content_link,
+            'token': '401e4d3c85068bb5'
+        }
+        response = requests.request("POST", url, headers={}, data=payload, timeout=120)
+        log(
+            task="article_association_crawler",
+            function="get_recommend_articles_v2",
+            message="获取推荐链接,付费接口",
+            data={"content_link": content_link, "response": response.json()},
         )
-        response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
+        time.sleep(3)
         return response.json()

+ 53 - 0
article_association_task.py

@@ -0,0 +1,53 @@
+"""
+@author: luojunhui
+"""
+import traceback
+from argparse import ArgumentParser
+
+from applications import bot
+from coldStartTasks.crawler.wechat import ArticleAssociationCrawler
+from coldStartTasks.publish.publish_article_association_articles import ArticleAssociationPublish
+
+
+def main():
+    """
+    Run the article association crawler, then the publish step.
+    Each step is guarded on its own: an exception is reported through bot()
+    and does not prevent the following step from running.
+    """
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument("--biz_date", type=str, help="format 2025-01-01")
+    # a missing or empty --biz_date is passed on as None
+    biz_date = arg_parser.parse_args().biz_date or None
+
+    # crawl
+    try:
+        ArticleAssociationCrawler().deal(biz_date=biz_date)
+    except Exception as crawl_error:
+        crawl_detail = {
+            "Error": str(crawl_error),
+            "Traceback": traceback.format_exc(),
+        }
+        bot(
+            title="It occurred an Exception in ArticleAssociationCrawler",
+            detail=crawl_detail,
+            mention=False,
+        )
+
+    # publish
+    try:
+        ArticleAssociationPublish().deal()
+    except Exception as publish_error:
+        publish_detail = {
+            "Error": str(publish_error),
+            "Traceback": traceback.format_exc(),
+        }
+        bot(
+            title="It occurred an Exception in ArticleAssociationPublish",
+            detail=publish_detail,
+            mention=False,
+        )
+
+
+if __name__ == "__main__":
+    main()

+ 81 - 117
cal_account_read_rate_avg_daily.py

@@ -7,14 +7,21 @@ from tqdm import tqdm
 from pandas import DataFrame
 from argparse import ArgumentParser
 from datetime import datetime
+from pymysql.cursors import DictCursor
 
-from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot, Functions, create_feishu_columns_sheet
-from applications.const import updateAccountReadRateTaskConst
-from config import apolloConfig
+from applications import bot, Functions, log
+from applications import create_feishu_columns_sheet
+from applications.db import DatabaseConnector
+from applications.const import UpdateAccountReadRateTaskConst
+from applications.utils import fetch_publishing_account_list
+from applications.utils import fetch_account_fans
+from config import apolloConfig, long_articles_config, piaoquan_crawler_config, denet_config
 
-const = updateAccountReadRateTaskConst()
+
+const = UpdateAccountReadRateTaskConst()
 config = apolloConfig()
 unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
+backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
 functions = Functions()
 read_rate_table = "long_articles_read_rate"
 
@@ -37,75 +44,7 @@ def filter_outlier_data(group, key='show_view_count'):
     return filtered_group
 
 
-def get_account_fans_by_dt(db_client) -> dict:
-    """
-    获取每个账号发粉丝,通过日期来区分
-    :return:
-    """
-    sql = f"""
-        SELECT 
-            t1.date_str, 
-            t1.fans_count, 
-            t2.gh_id
-        FROM datastat_wx t1
-        JOIN publish_account t2 ON t1.account_id = t2.id
-        WHERE 
-            t2.channel = 5 
-        AND t2.status = 1 
-        AND t1.date_str >= '2024-07-01' 
-        ORDER BY t1.date_str;
-    """
-    result = db_client.select(sql)
-    D = {}
-    for line in result:
-        dt = line[0]
-        fans = line[1]
-        gh_id = line[2]
-        if D.get(gh_id):
-            D[gh_id][dt] = fans
-        else:
-            D[gh_id] = {dt: fans}
-    return D
-
-
-def get_publishing_accounts(db_client) -> list[dict]:
-    """
-    获取每日正在发布的账号
-    :return:
-    """
-    sql = f"""
-    SELECT DISTINCT
-        t3.`name`,
-        t3.gh_id,
-        t3.follower_count,
-        t6.account_source_name,
-        t6.mode_type,
-        t6.account_type,
-        t6.`status`
-    FROM
-        publish_plan t1
-        JOIN publish_plan_account t2 ON t1.id = t2.plan_id
-        JOIN publish_account t3 ON t2.account_id = t3.id
-        LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
-        LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
-        LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
-    WHERE
-        t1.plan_status = 1
-        AND t3.channel = 5
-        -- AND t3.follower_count > 0
-        GROUP BY t3.id;
-    """
-    account_list = db_client.select(sql)
-    result_list = [
-        {
-            "account_name": i[0],
-            "gh_id": i[1]
-        } for i in account_list
-    ]
-    return result_list
-
-
-def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
+def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -> list[dict]:
     """
     get articles details
     :return:
@@ -116,47 +55,37 @@ def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
             FROM 
                 official_articles_v2
             WHERE 
-                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}';
+                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
             """
-    result = db_client.select(sql)
-    response_list = [
-        {
-            "ghId": i[0],
-            "accountName": i[1],
-            "ItemIndex": i[2],
-            "show_view_count": i[3],
-            "publish_timestamp": i[4]
-        }
-        for i in result
-    ]
+    response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
     return response_list
 
 
-def cal_account_read_rate(gh_id_tuple) -> DataFrame:
+def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
     """
     计算账号位置的阅读率
     :return:
     """
-    pq_db = PQMySQL()
-    de_db = DeNetMysql()
     response = []
-    fans_dict_each_day = get_account_fans_by_dt(db_client=de_db)
-    account_article_detail = get_account_articles_detail(
-        db_client=pq_db,
-        gh_id_tuple=gh_id_tuple
-    )
-    for line in account_article_detail:
+    for line in article_list:
         gh_id = line['ghId']
         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
-        fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
+        fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+        if not fans:
+            fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
         if not fans:
-            fans = int(unauthorized_account.get(gh_id, 0))
+            fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
+            log(
+                task='cal_read_rate_avg_task',
+                function='cal_account_read_rate',
+                message='未获取到粉丝,使用备份粉丝表',
+                data=line
+            )
         line['fans'] = fans
-        if fans > 1000:
+        if fans > const.MIN_FANS:
             line['readRate'] = line['show_view_count'] / fans if fans else 0
             response.append(line)
-    return DataFrame(response,
-                     columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
+    return DataFrame(response, columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
 
 
 def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
@@ -168,7 +97,7 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
     min_time = max_time - const.STATISTICS_PERIOD
 
     # 通过
-    filterDataFrame = df[
+    filter_dataframe = df[
         (df["ghId"] == gh_id)
         & (min_time <= df["publish_timestamp"])
         & (df["publish_timestamp"] <= max_time)
@@ -176,13 +105,13 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
         ]
 
     # 用二倍标准差过滤
-    finalDF = filter_outlier_data(filterDataFrame)
+    final_dataframe = filter_outlier_data(filter_dataframe)
 
     return {
-        "read_rate_avg": finalDF['readRate'].mean(),
-        "max_publish_time": finalDF['publish_timestamp'].max(),
-        "min_publish_time": finalDF['publish_timestamp'].min(),
-        "records": len(finalDF)
+        "read_rate_avg": final_dataframe['readRate'].mean(),
+        "max_publish_time": final_dataframe['publish_timestamp'].max(),
+        "min_publish_time": final_dataframe['publish_timestamp'].min(),
+        "records": len(final_dataframe)
     }
 
 
@@ -204,7 +133,7 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
         WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
         ORDER BY dt_version DESC limit 1;
     """
-    result = db_client.select(select_sql)
+    result = db_client.fetch(select_sql)
     if result:
         account_name = result[0][0]
         previous_read_rate_avg = result[0][1]
@@ -246,6 +175,9 @@ def update_single_day(dt, account_list, article_df, lam):
         string_format='%Y-%m-%d'
     )
 
+    # processed_account_set
+    processed_account_set = set()
+
     for account in tqdm(account_list, desc=dt):
         for index in const.ARTICLE_INDEX_LIST:
             read_rate_detail = cal_avg_account_read_rate(
@@ -259,7 +191,9 @@ def update_single_day(dt, account_list, article_df, lam):
             min_publish_time = read_rate_detail['min_publish_time']
             articles_count = read_rate_detail['records']
             if articles_count:
-                if index in {1, 2}:
+                processed_account_set.add(account['gh_id'])
+                # check read rate in position 1 and 2
+                if index in [1, 2]:
                     error_obj = check_each_position(
                         db_client=lam,
                         gh_id=account['gh_id'],
@@ -269,6 +203,7 @@ def update_single_day(dt, account_list, article_df, lam):
                     )
                     if error_obj:
                         error_list.append(error_obj)
+                # insert into database
                 try:
                     if not read_rate_avg:
                         continue
@@ -278,8 +213,8 @@ def update_single_day(dt, account_list, article_df, lam):
                         values
                         (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                     """
-                    lam.update(
-                        sql=insert_sql,
+                    lam.save(
+                        query=insert_sql,
                         params=(
                             account['account_name'],
                             account['gh_id'],
@@ -294,14 +229,17 @@ def update_single_day(dt, account_list, article_df, lam):
                         )
                     )
                 except Exception as e:
+                    print(e)
                     insert_error_list.append(str(e))
 
+    # bot sql error
     if insert_error_list:
         bot(
             title="更新阅读率均值,存在sql 插入失败",
             detail=insert_error_list
         )
 
+    # bot outliers
     if error_list:
         columns = [
             create_feishu_columns_sheet(sheet_type="plain_text", sheet_name="account_name", display_name="账号名称"),
@@ -314,7 +252,7 @@ def update_single_day(dt, account_list, article_df, lam):
                                         display_name="相对变化率")
         ]
         bot(
-            title="更新阅读率均值,头次出现异常值通知",
+            title="阅读率均值表异常信息, 总共处理{}个账号".format(len(processed_account_set)),
             detail={
                 "columns": columns,
                 "rows": error_list
@@ -323,12 +261,14 @@ def update_single_day(dt, account_list, article_df, lam):
             mention=False
         )
 
+    # if no error, send success info
     if not error_list and not insert_error_list:
         bot(
-            title="阅读率均值表更新成功",
+            title="阅读率均值表更新成功, 总共处理{}个账号".format(len(processed_account_set)),
             detail={
                 "日期": dt
-            }
+            },
+            mention=False
         )
 
 
@@ -347,12 +287,36 @@ def main() -> None:
     else:
         dt = datetime.today().strftime('%Y-%m-%d')
 
-    lam = longArticlesMySQL()
-    de = DeNetMysql()
-    account_list = get_publishing_accounts(db_client=de)
-    df = cal_account_read_rate(tuple([i['gh_id'] for i in account_list]))
+    # init stat period
+    max_time = functions.str_to_timestamp(date_string=dt)
+    min_time = max_time - const.STATISTICS_PERIOD
+    min_stat_date = functions.timestamp_to_str(timestamp=min_time, string_format='%Y-%m-%d')
+
+    # init database connector
+    long_articles_db_client = DatabaseConnector(db_config=long_articles_config)
+    long_articles_db_client.connect()
+
+    piaoquan_crawler_db_client = DatabaseConnector(db_config=piaoquan_crawler_config)
+    piaoquan_crawler_db_client.connect()
+
+    denet_db_client = DatabaseConnector(db_config=denet_config)
+    denet_db_client.connect()
+
+    # get account list
+    account_list = fetch_publishing_account_list(db_client=denet_db_client)
+
+    # get fans dict
+    fans_dict = fetch_account_fans(db_client=denet_db_client, start_date=min_stat_date)
+
+    # get data frame from official_articles_v2
+    gh_id_tuple = tuple([i['gh_id'] for i in account_list])
+    article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
+
+    # cal account read rate and make a dataframe
+    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)
 
-    update_single_day(dt, account_list, df, lam)
+    # update each day's data
+    update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)
 
 
 if __name__ == '__main__':

+ 1 - 1
coldStartTasks/crawler/__init__.py

@@ -2,4 +2,4 @@
 @author: luojunhui
 """
 from .weixin_account_crawler import WeixinAccountCrawler
-from .weixin_video_crawler import WeixinVideoCrawler
+from .weixin_video_crawler import WeixinVideoCrawler

+ 1 - 0
coldStartTasks/crawler/baidu/__init__.py

@@ -0,0 +1 @@
+from .video_crawler import BaiduVideoCrawler

+ 1 - 1
coldStartTasks/publish/publishArticleAssociationArticles.py → coldStartTasks/crawler/baidu/account_crawler.py

@@ -1,4 +1,4 @@
 """
 @author: luojunhui
-发布i2i文章
 """
+

+ 96 - 0
coldStartTasks/crawler/baidu/baidu_spider.py

@@ -0,0 +1,96 @@
+"""
+@author: luojunhui
+"""
+import base64
+import uuid
+
+import requests
+
+from fake_useragent import FakeUserAgent
+
+from applications.exception import SpiderError
+from applications import Functions
+
+functions = Functions()
+
+
+def baidu_account_video_crawler(account_id, cursor=None):
+    """
+    Fetch one page of the video list of a baidu (haokan) account.
+    :param account_id: baidu account id
+    :param cursor: paging cursor sent as ``ctime``; None (the default) starts
+        from the newest videos
+    :return: the ``data`` field of the response
+    :raises SpiderError: if the request fails, the response cannot be parsed,
+        or the api reports an error message
+    """
+    cookie_str = uuid.uuid4().hex.upper()
+    url = "https://haokan.baidu.com/web/author/listall?"
+    params = {
+        'app_id': account_id,
+        'ctime': cursor,
+        'rn': 10,
+        'searchAfter': '',
+        '_api': 1
+    }
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh,zh-CN;q=0.9',
+        'Connection': 'keep-alive',
+        'Referer': 'https://haokan.baidu.com/author/{}'.format(account_id),
+        'User-Agent': FakeUserAgent().chrome,
+        'x-requested-with': 'xmlhttprequest',
+        'Cookie': 'BAIDUID={}:FG=1; BAIDUID_BFESS={}:FG=1'.format(cookie_str, cookie_str)
+    }
+    try:
+        response = requests.request(
+            "GET",
+            url,
+            headers=headers,
+            params=params,
+            proxies=functions.proxy(),
+            # without a timeout a stalled proxy connection would block forever
+            timeout=120,
+        )
+        response_json = response.json()
+        if response_json['errmsg'] == '成功':
+            return response_json['data']
+        raise SpiderError(
+            platform="baidu",
+            spider="account_video_crawler",
+            error=response_json['errmsg'],
+            url=url
+        )
+
+    except SpiderError:
+        # already the final error: do not wrap it a second time below
+        raise
+    except Exception as e:
+        raise SpiderError(
+            platform="baidu",
+            spider="account_video_crawler",
+            error=str(e),
+            url=url
+        ) from e
+
+
+def baidu_single_video_crawler(video_id):
+    """
+    Fetch the metadata of a single baidu (haokan) video.
+    :param video_id: video id sent as ``vid``
+    :return: ``data.apiData.curVideoMeta`` of the json response
+    :raises SpiderError: if the request fails or the response does not have
+        the expected structure
+    """
+    url = "https://haokan.baidu.com/v"
+    params = {
+        'vid': video_id,
+        '_format': 'json'
+    }
+    base_64_string = base64.b64encode(str(uuid.uuid4()).encode()).decode()
+    headers = {
+        'Accept': '*/*',
+        'cookie': "BIDUPSID={}".format(base_64_string),
+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'Referer': 'https://haokan.baidu.com',
+        'User-Agent': FakeUserAgent().chrome,
+    }
+    try:
+        response = requests.request(
+            "GET",
+            url,
+            headers=headers,
+            params=params,
+            proxies=functions.proxy(),
+            # without a timeout a stalled proxy connection would block forever
+            timeout=120,
+        )
+        response_json = response.json()
+        return response_json['data']['apiData']['curVideoMeta']
+    except Exception as e:
+        raise SpiderError(
+            platform="baidu",
+            spider="single_video_crawler",
+            error=str(e),
+            url=url
+        ) from e

+ 269 - 0
coldStartTasks/crawler/baidu/video_crawler.py

@@ -0,0 +1,269 @@
+"""
+@author: luojunhui
+@description: video crawler
+"""
+
+import os
+import json
+import time
+import traceback
+from typing import List, Dict
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import Functions
+from applications import bot, log
+from applications.const import BaiduVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.exception import SpiderError
+from config import long_articles_config
+from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
+from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
+
+const = BaiduVideoCrawlerConst()
+empty_list = []
+functions = Functions()
+
+
+class BaiduVideoCrawler(object):
+    """
+    baidu video crawler
+    """
+
+    def __init__(self):
+        self.db = None
+        self.success_crawler_video_count = 0
+        self.connect_db()
+
+    def connect_db(self) -> None:
+        """
+        connect db
+        """
+        self.db = DatabaseConnector(db_config=long_articles_config)
+        self.db.connect()
+
+    def get_account_list(self) -> List[Dict]:
+        """
+        get account list
+        """
+        sql = f"""
+            select account_id, account_name, max_cursor 
+            from baidu_account_for_videos
+            where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
+        """
+        account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def whether_video_exists(self, title: str) -> bool:
+        """
+        whether video exists, use video_id && title
+        """
+        # check title
+        sql = f"""
+            select id from publish_single_video_source
+            where article_title = %s;
+        """
+        duplicate_id = self.db.fetch(query=sql, params=(title,))
+        if duplicate_id:
+            print(title + " video exists")
+            return True
+
+        return False
+
+    def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None:
+        """
+        download and save each video
+        """
+        # print(json.dumps(video, ensure_ascii=False, indent=4))
+        video_id = video["id"]
+        title = video["title"]
+
+        # judge whether video exists
+        if self.whether_video_exists(title):
+            return
+
+        read_cnt = video.get("playcnt", 0)
+        like_cnt = video.get("like_num", 0)
+        publish_timestamp = video["publish_time"]
+        # duration = video['duration']
+        cover_url = video["poster"]
+        video_url = video["playurl"]
+        # sensitive_flag = video.get('sensitive_flag')
+        video_more_info = video.get("contentcms_intervene_data")
+        if video_more_info:
+            video_category_list = video_more_info.get("category_v2")
+            if video_category_list:
+                video_category = video_category_list[0]
+            else:
+                video_category = None
+        else:
+            video_category = None
+        manual_tags = video.get("manual_tags")
+
+        video_path = os.path.join(const.LOCAL_PATH_DIR, "{}.mp4".format(video_id))
+        download_path = functions.download_baidu_videos(video_url, video_path)
+        if download_path:
+            oss_path = functions.upload_to_oss(local_video_path=download_path)
+            insert_sql = f"""
+                INSERT INTO publish_single_video_source
+                (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
+                values
+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+            """
+            try:
+                self.db.save(
+                    query=insert_sql,
+                    params=(
+                        "video{}".format(functions.str_to_md5(video_id)),
+                        title,
+                        account_id,
+                        account_name,
+                        read_cnt,
+                        like_cnt,
+                        video_url,
+                        cover_url,
+                        oss_path,
+                        publish_timestamp,
+                        int(time.time()),
+                        video_id,
+                        video_category,
+                        (
+                            json.dumps(manual_tags, ensure_ascii=False)
+                            if manual_tags
+                            else None
+                        ),
+                        "hksp",
+                        const.NO_SOURCE_ACCOUNT_STATUS,
+                    ),
+                )
+                self.success_crawler_video_count += 1
+            except Exception as e:
+                log(
+                    task="baidu_video_crawler",
+                    function="save_each_video",
+                    message="save video failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video_id": video_id,
+                        "oss_path": oss_path,
+                    },
+                )
+        else:
+            print(f"download video failed, video_id: {video_id}")
+
+    def save_video_list(
+        self, account_id: str, account_name: str, video_list: List[Dict]
+    ) -> None:
+        """
+        save video list
+        """
+        progress_bar = tqdm(video_list, desc="crawler account: {}".format(account_name))
+        for video_obj in progress_bar:
+            if video_obj["type"] == "video":
+                video_id = video_obj["content"]["vid"]
+                try:
+                    video_detail = baidu_single_video_crawler(video_id)
+                    self.save_each_video(
+                        video=video_detail,
+                        account_id=account_id,
+                        account_name=account_name,
+                    )
+                    progress_bar.set_postfix({"videoId": video_id})
+                except SpiderError as e:
+                    print("save single video fail", e)
+                    continue
+            else:
+                continue
+
+    def crawler_each_account(self, account: Dict, cursor=None) -> None:
+        """
+        crawler each account
+        response_strategy
+        """
+        account_id = account["account_id"]
+        max_cursor = account["max_cursor"]
+        if not max_cursor:
+            max_cursor = const.DEFAULT_CURSOR
+        account_name = account["account_name"]
+        try:
+            response_json = baidu_account_video_crawler(account_id, cursor=cursor)
+
+            video_list = response_json.get("results", empty_list)
+            if video_list:
+                self.save_video_list(
+                    account_id=account_id,
+                    account_name=account_name,
+                    video_list=video_list,
+                )
+            # check next page
+            has_next_page = response_json.get("has_more", False)
+            if has_next_page:
+                next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR)
+                if next_cursor < max_cursor:
+                    print("No more videos after 2024-01-01")
+                    return
+                else:
+                    return self.crawler_each_account(account, next_cursor)
+        except SpiderError as e:
+            print(e)
+            return
+
+    def update_cursor(self, account_id: str) -> None:
+        """
+        update cursor for each account
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = '{account_id}';
+        """
+        response_mysql = self.db.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+        if max_publish_timestamp:
+            max_cursor = max_publish_timestamp * const.TIMESTAMP_TO_CURSOR
+            update_sql = f"""
+                update baidu_account_for_videos
+                set max_cursor = %s
+                where account_id = %s;
+            """
+            self.db.save(query=update_sql, params=(max_cursor, account_id))
+
+    def deal(self) -> None:
+        """
+        deal
+        """
+        account_list = self.get_account_list()
+        success_cnt = 0
+        fail_cnt = 0
+        account_list_process_bar = tqdm(account_list, desc="process account list")
+        for account in account_list_process_bar:
+            try:
+                account_list_process_bar.set_postfix(
+                    {"account_name": account["account_name"]}
+                )
+                self.crawler_each_account(account)
+                self.update_cursor(account["account_id"])
+                success_cnt += 1
+            except Exception as e:
+                fail_cnt += 1
+                log(
+                    task="baidu_video_crawler",
+                    function="deal",
+                    message="crawler each account failed",
+                    data={
+                        "account_id": account["account_id"],
+                        "account_name": account["account_name"],
+                        "error": str(e),
+                        "trace_back": traceback.format_exc(),
+                    },
+                )
+        bot(
+            title="baidu video crawler task finished",
+            detail={
+                "success_crawl_account_num": success_cnt,
+                "fail_crawl_account_num": fail_cnt,
+                "success_crawl_video_num": self.success_crawler_video_count,
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt),
+            },
+            mention=False,
+        )

+ 6 - 0
coldStartTasks/crawler/channels/__init__.py

@@ -0,0 +1,6 @@
+"""
+@author: luojunhui
+@tool: pycharm && deepseek
+"""
+from .blogger import get_channel_account_videos
+from .search import search_in_wechat_channel

+ 22 - 0
coldStartTasks/crawler/channels/blogger.py

@@ -0,0 +1,22 @@
+"""
+@author: luojunhui
+"""
+
+import requests
+import json
+
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
+
+
+def get_channel_account_videos(user_id, last_buffer=""):
+    """
+    get channel account videos
+    """
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
+    )
+    result = channel_api.get_channel_video_list(user_id, last_buffer)
+    return result

+ 41 - 0
coldStartTasks/crawler/channels/search.py

@@ -0,0 +1,41 @@
+"""
+@author: luojunhui
+"""
+
+from typing import Dict
+
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
+
+
+def search_in_wechat_channel(
+    search_key: str,
+    search_type: int,
+    page: int = 0,
+    cookie: str = "",
+    search_id: str = "",
+    offset: int = 0,
+) -> Dict:
+    """
+    :param search_key: 搜索关键字
+    :param search_type: 搜索类型,1: 搜索所有视频, 2: 搜索视频号账号
+    :param page: 页码
+    :param cookie: 登录后的cookie
+    :param search_id: 搜索id
+    :param offset: 偏移量
+    :return: result_list
+    """
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
+    )
+    result = channel_api.search(
+        search_key=search_key,
+        search_type=search_type,
+        page=page,
+        cookie=cookie,
+        search_id=search_id,
+        offset=offset
+    )
+    return result

+ 4 - 0
coldStartTasks/crawler/toutiao/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .blogger import get_toutiao_account_video_list

+ 64 - 0
coldStartTasks/crawler/toutiao/blogger.py

@@ -0,0 +1,64 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import json
+import requests
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from .use_js import call_js_function
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_toutiao_account_video_list(
+    account_id: str, cookie: str, max_behot_time=0
+) -> dict | None:
+    """
+    get toutiao account video list
+    :param account_id: toutiao account id
+    :param cookie: cookie maybe expire not quite sure
+    :param max_behot_time: max behot time
+    :return: toutiao account video list
+    """
+    ms_token = "mFs9gU4FJc23gFWPvBfQxFsBRrx1xBEJD_ZRTAolHfPrae84kTEBaHQR3s8ToiLX4-U9hgATTZ2cVHlSixmj5YCTOPoVM-43gOt3aVHkxfXHEuUtTJe-wUEs%3D"
+    query_params = [
+        0,
+        1,
+        14,
+        "category=pc_user_hot&token={}&aid=24&app_name=toutiao_web&msToken={}".format(
+            account_id, ms_token
+        ),
+        "",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    ]
+    a_bogus = call_js_function(query_params)
+    url = f"https://www.toutiao.com/api/pc/list/user/feed?category=pc_profile_video&token={account_id}&max_behot_time={max_behot_time}&hot_video=0&entrance_gid=&aid=24&app_name=toutiao_web&msToken={ms_token}&a_bogus={a_bogus}"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+        "cookie": cookie,
+    }
+    try:
+        response = requests.get(url, headers=headers, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="toutiao account crawler",
+            function="get_toutiao_account_video_list",
+            message=f"API请求失败: {e}",
+            data={"account_id": account_id},
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="toutiao account crawler",
+            function="get_toutiao_account_video_list",
+            message=f"响应解析失败: {e}",
+            data={"account_id": account_id},
+        )
+    return None

+ 25 - 0
coldStartTasks/crawler/toutiao/use_js.py

@@ -0,0 +1,25 @@
+"""
+@author: luojunhui
+"""
+import json
+import subprocess
+
+from config import toutiao_js_path
+
+
+def call_js_function(arguments_list):
+    """
+    call js function
+    """
+    # 将参数转换为JSON字符串
+    args_json = json.dumps(arguments_list)
+    # 调用Node.js执行脚本
+    result = subprocess.run(
+        ['node', toutiao_js_path, args_json],
+        capture_output=True,
+        text=True
+    )
+    if result.returncode == 0:
+        return result.stdout.strip()
+    else:
+        raise Exception(f"Error: {result.stderr}")

+ 4 - 0
coldStartTasks/crawler/wechat/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .article_association import ArticleAssociationCrawler

+ 210 - 0
coldStartTasks/crawler/wechat/article_association.py

@@ -0,0 +1,210 @@
+"""
+@author: luojunhui
+"""
+
+import time
+import traceback
+from datetime import datetime
+
+import numpy as np
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+
+from applications import WeixinSpider, log
+from applications.api import similarity_between_title_list
+from applications.const import ColdStartTaskConst
+from applications.db import DatabaseConnector
+from applications.functions import Functions
+from applications.utils import get_inner_account_set
+from applications.utils import whether_title_sensitive
+from config import long_articles_config
+
+spider = WeixinSpider()
+functions = Functions()
+const = ColdStartTaskConst()
+
+
+class ArticleAssociationCrawler(object):
+    """
+    article association crawler task
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+        self.inner_account_set = get_inner_account_set()
+
+    def get_seed_url_list(self, biz_date):
+        """
+        获取种子url列表
+        """
+        sql = f"""
+            select gh_id, title, link
+            from datastat_sort_strategy
+            where date_str > DATE_FORMAT(DATE_SUB('{biz_date}', INTERVAL 2 DAY), '%Y%m%d') 
+                and view_count > {const.READ_COUNT_THRESHOLD} 
+                and read_rate > {const.READ_AVG_THRESHOLD} 
+                and type = {const.BULK_PUBLISH_TYPE}
+            order by read_rate desc 
+            limit {const.SEED_ARTICLE_LIMIT_NUM};
+        """
+        seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return seed_article_list
+
+    def get_level_up_title_list(self):
+        """
+        获取晋级文章标题列表
+        status: 1 表示文章已经溯源完成
+        deleted: 0 表示文章正常
+        level = 'autoArticlePoolLevel1' 表示头条
+        """
+        sql = f"""
+            select distinct title 
+            from article_pool_promotion_source 
+            where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
+        """
+        mysql_response = self.db_client.fetch(query=sql)
+        title_list = [i[0] for i in mysql_response]
+        return title_list
+
+    def get_recommend_url_list_with_depth(
+        self, seed_url, source_title, source_account, base_title_list, depth=1
+    ):
+        """
+        @param seed_url: good url from data_sort_strategy
+        @param depth: association depth
+        @param source_title: article title
+        @param source_account: article account
+        """
+        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
+            return
+
+        res = spider.get_recommend_articles(content_link=seed_url)
+        related_articles = res["data"]["data"]["list"]
+        if related_articles:
+            title_list = [i["title"] for i in related_articles]
+            similarity_array = similarity_between_title_list(
+                title_list, base_title_list
+            )
+
+            recommend_articles = []
+            for index, score_list in enumerate(similarity_array):
+                sorted_score_list = sorted(score_list)
+                percent_threshold_score = np.percentile(
+                    sorted_score_list, const.PERCENT_THRESHOLD
+                )
+                if percent_threshold_score < const.CORRELATION_THRESHOLD:
+                    continue
+
+                else:
+                    article_obj = related_articles[index]
+                    article_obj["score"] = percent_threshold_score
+                    recommend_articles.append(article_obj)
+
+            recommend_process_bar = tqdm(
+                recommend_articles, desc="save recommend articles"
+            )
+            for article in recommend_process_bar:
+                obj = {
+                    "title": article["title"],
+                    "url": article["url"],
+                    "gh_id": article["username"],
+                    "index": article["idx"],
+                    "send_time": article["send_time"],
+                    "read_cnt": article["read_num"],
+                    "depth": depth,
+                    "source_article_title": source_title,
+                    "source_account": source_account,
+                }
+                self.insert_recommend_article(obj)
+                recommend_process_bar.set_postfix(
+                    {"title": article["title"], "depth": depth}
+                )
+                self.get_recommend_url_list_with_depth(
+                    seed_url=obj["url"],
+                    source_title=obj["title"],
+                    source_account=obj["gh_id"],
+                    base_title_list=base_title_list,
+                    depth=depth + 1,
+                )
+        else:
+            return
+
+    def insert_recommend_article(self, obj):
+        """
+        insert recommend article
+        """
+        # whether account inside
+        if obj["gh_id"] in self.inner_account_set:
+            return
+
+        # whether article title exists
+        title = obj["title"]
+        select_sql = "select article_id from crawler_meta_article where title = %s;"
+        res = self.db_client.fetch(query=select_sql, params=(title,))
+        if res:
+            return
+
+        # whether title sensitive
+        title_sensitivity = (
+            const.TITLE_SENSITIVE
+            if whether_title_sensitive(title)
+            else const.TITLE_NOT_SENSITIVE
+        )
+
+        # insert this article
+        insert_sql = f"""
+            insert into crawler_meta_article 
+            (platform, mode, category, out_account_id, article_index, title, link, read_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account, title_sensitivity)
+            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+        self.db_client.save(
+            query=insert_sql,
+            params=(
+                "weixin",
+                "recommend",
+                "article_association",
+                obj["gh_id"],
+                obj["index"],
+                obj["title"],
+                obj["url"],
+                obj["read_cnt"],
+                obj["send_time"],
+                int(time.time()),
+                const.DEFAULT_ARTICLE_STATUS,
+                functions.generateGzhId(obj["url"]),
+                obj["source_article_title"],
+                obj["source_account"],
+                title_sensitivity,
+            ),
+        )
+
+    def deal(self, biz_date=None):
+        """
+        class entrance
+        :param biz_date:
+        """
+        if biz_date is None:
+            biz_date = datetime.today().strftime("%Y-%m-%d")
+
+        seed_article_list = self.get_seed_url_list(biz_date)
+        deal_bar = tqdm(seed_article_list, desc="article association crawler")
+        base_title_list = self.get_level_up_title_list()
+        for article in deal_bar:
+            try:
+                self.get_recommend_url_list_with_depth(
+                    seed_url=article["link"],
+                    source_title=article["title"],
+                    source_account=article["gh_id"],
+                    base_title_list=base_title_list,
+                )
+                deal_bar.set_postfix({"article_title": article["title"]})
+            except Exception as e:
+                log(
+                    task="article_association_crawler",
+                    function="deal",
+                    message=f"article association crawler error, article title: {article['title']}, error: {e}",
+                    data={"article": article, "traceback": traceback.format_exc()},
+                )

+ 1 - 25
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -8,7 +8,7 @@ import time
 from tqdm import tqdm
 from pymysql.cursors import DictCursor
 
-from applications import WeixinSpider, Functions, llm_sensitivity, log
+from applications import WeixinSpider, Functions, log
 from coldStartTasks.filter import article_crawler_duplicate_filter
 from config import apolloConfig
 
@@ -158,18 +158,6 @@ class weixinCategory(object):
                     print(e)
         return success_records
 
-    def update_article_sensitive_status(self, category, unique_index, status):
-        """
-        更新文章敏感状态
-        :return:
-        """
-        update_sql = f"""
-            update crawler_meta_article
-            set llm_sensitivity = %s
-            where category = %s and unique_index = %s;
-        """
-        self.db_client_lam.update(sql=update_sql, params=(status, category, unique_index))
-
     def update_latest_account_timestamp(self, gh_id):
         """
         更新账号的最新时间戳
@@ -242,18 +230,6 @@ class weixinCategory(object):
                 print("success")
             except Exception as e:
                 print("fail because of {}".format(e))
-        success_titles = [x['title'] for x in success_records]
-        if success_titles:
-            try:
-                sensitive_results = llm_sensitivity.check_titles(success_titles)
-                for record, sensitive_result in zip(success_records, sensitive_results):
-                    self.update_article_sensitive_status(
-                        category=category,
-                        unique_index=record['unique_index'],
-                        status=sensitive_result['hit_rule']
-                    )
-            except Exception as e:
-                print("failed to update sensitive status: {}".format(e))
 
     def deal(self, category_list, date_str):
         """

+ 1 - 1
coldStartTasks/crawler/weixin_account_association_crawler.py

@@ -229,7 +229,7 @@ class AccountAssociationCrawler(object):
                     task="account_association",
                     function="run_account_association",
                     data={
-                        "biz_date": biz_date,
+                        "biz_date": biz_date.strftime("%Y-%m-%d"),
                         "article": article,
                         "trace_back": traceback.format_exc(),
                         "error": f"{e}"

+ 3 - 1
coldStartTasks/crawler/weixin_account_crawler.py

@@ -44,7 +44,9 @@ class WeixinAccountCrawler(object):
         sql = f"""
             SELECT id, article_url
             FROM publish_single_video_source
-            WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT};
+            WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT} 
+            and bad_status = {const.TITLE_DEFAULT_STATUS}
+            and platform = 'gzh' limit 1000;
         """
         article_url_list = self.db_client.select(sql, cursor_type=DictCursor)
         return article_url_list

+ 8 - 3
coldStartTasks/crawler/weixin_video_crawler.py

@@ -87,7 +87,8 @@ class WeixinVideoCrawler(object):
         select_sql = f"""
             SELECT gh_id, account_name, latest_crawler_timestamp
             FROM weixin_account_for_videos
-            WHERE status = {const.ACCOUNT_CRAWL_STATUS};
+            WHERE status = {const.ACCOUNT_CRAWL_STATUS}
+            ORDER BY latest_crawler_timestamp;
         """
         response = self.db_client.select(select_sql, DictCursor)
         return response
@@ -158,10 +159,15 @@ class WeixinVideoCrawler(object):
                     url_unique = functions.generateGzhId(article_url)
                     # 判断该视频链接是否下载,若已经下载则直接跳过
                     if self.is_downloaded(url_unique):
+                        print("url exists")
+                        continue
+
+                    title = article.get("Title", None)
+                    if not title:
                         continue
 
                     # 判断标题是否重复
-                    if video_crawler_duplicate_filter(article_url, self.db_client):
+                    if video_crawler_duplicate_filter(title, self.db_client):
                         log(
                             task='weixin_video_crawler',
                             function="insert_msg_list",
@@ -174,7 +180,6 @@ class WeixinVideoCrawler(object):
                         download_path = functions.download_gzh_video(article_url)
                         if download_path:
                             oss_path = functions.upload_to_oss(local_video_path=download_path)
-                            title = article.get("Title", None)
                             position = article.get("ItemIndex", None)
                             cover_url = article.get("CoverImgUrl", None)
                             show_desc = article.get("ShowDesc", None)

+ 62 - 16
coldStartTasks/filter/title_similarity_task.py

@@ -54,7 +54,7 @@ class ColdStartTitleSimilarityTask(object):
         title_list = [i[0] for i in mysql_response]
         return title_list
 
-    def get_title_from_meta_base(self, limit):
+    def get_article_title_from_meta_base(self, limit):
         """
         获取meta_base表中文章标题列表
         status: 1 表示文章初始化状态
@@ -70,17 +70,56 @@ class ColdStartTitleSimilarityTask(object):
         mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return mysql_response
 
-    def update_meta_article_batch(self, update_data_list: list[tuple]) -> int:
+    def get_video_title_from_meta_table(self, limit):
         """
-        批量更新crawler_meta_article
+        获取meta_base表中视频标题列表
+        audit_status = 0 表示视频初始化状态
+        """
+        if limit:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null 
+                    and bad_status = 0
+                limit {limit};
+            """
+        else:
+            sql = f"""
+                select id as article_id, article_title as title 
+                from publish_single_video_source 
+                where audit_status = 0 
+                    and score is null
+                    and bad_status = 0;
+            """
+        mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return mysql_response
+
+    def update_meta_database_batch(self, meta_source: str, update_data_list: list[tuple]) -> int:
         """
-        sql = """
-            update crawler_meta_article
-            set score = case article_id
-                {}
-            end
-            where article_id in %s and score is null;
+        批量更新crawler_meta_article
         """
+        match meta_source:
+            case "video":
+                sql = """
+                    update publish_single_video_source
+                    set score = case id
+                        {}
+                    end
+                    where id in %s and score is null;
+                """
+            case "article":
+                sql = """
+                    update crawler_meta_article
+                    set score = case article_id
+                        {}
+                    end
+                    where article_id in %s and score is null;
+                """
+            case _:
+                print("source_type is not valid")
+                return 0
+
         case_statement = []
         article_id_list = []
         params = []
@@ -95,22 +134,29 @@ class ColdStartTitleSimilarityTask(object):
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows
 
-    def run(self, limit=None):
+    def run(self, meta_source, limit=None):
         """
         执行任务
         """
-        target_article_list = self.get_title_from_meta_base(limit=limit)
-        if not target_article_list:
+        match meta_source:
+            case "article":
+                target_list = self.get_article_title_from_meta_base(limit=limit)
+            case "video":
+                target_list = self.get_video_title_from_meta_table(limit=limit)
+            case _:
+                print("meta_source is not valid")
+                return
+
+        if not target_list:
             print("No more articles to process.")
             return
 
         base_title_list = self.get_level_up_title_list()
-
-        batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
+        batch_task_list = chunks(target_list, ARTICLE_BATCH)
 
         for batch_task in batch_task_list:
             try:
-                batch_target_title_list = [i['title'] for i in batch_task]
+                batch_target_title_list = [i['title'][:30] for i in batch_task]
                 similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
 
                 update_data_list = []
@@ -119,7 +165,7 @@ class ColdStartTitleSimilarityTask(object):
                     percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
                     update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
 
-                affected_rows = self.update_meta_article_batch(update_data_list)
+                affected_rows = self.update_meta_database_batch(meta_source=meta_source, update_data_list=update_data_list)
 
                 print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
             except Exception as e:

+ 276 - 0
coldStartTasks/publish/basic.py

@@ -0,0 +1,276 @@
+"""
+@author: luojunhui
+"""
+
+import json
+import time
+import datetime
+import pandas as pd
+import traceback
+
+from pandas import DataFrame
+from tqdm import tqdm
+
+from applications import log, aiditApi, bot
+from applications.const import ColdStartTaskConst
+from config import apolloConfig
+
+# shared constant holder and apollo configuration client used by the helpers in this module
+const = ColdStartTaskConst()
+config = apolloConfig()
+
+# filter thresholds come from the "category_cold_start_threshold" config entry (a JSON object);
+# the second argument of each .get() is the fallback used when the key is absent
+category_cold_start_threshold = json.loads(
+    config.getConfigValue("category_cold_start_threshold")
+)
+READ_TIMES_THRESHOLD = category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
+READ_THRESHOLD = category_cold_start_threshold.get("READ_THRESHOLD", 5000)
+LIMIT_TITLE_LENGTH = category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
+TITLE_LENGTH_MAX = category_cold_start_threshold.get("TITLE_LENGTH_MAX", 50)
+
+
+def get_article_from_meta_table(db_client, category: str, platform: str) -> DataFrame:
+    """
+    get article from meta data
+    :param db_client: database connector
+    :param category: article category
+    :param platform: article platform
+    :return: article dataframe
+    """
+    sql = f"""
+        select 
+            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
+        from crawler_meta_article
+        where category = "{category}" and platform = "{platform}" and title_sensitivity = {const.TITLE_NOT_SENSITIVE}
+        order by score desc;
+    """
+    article_list = db_client.fetch(sql)
+    log(
+        task="category_publish_task",
+        function="get_articles_from_meta_table",
+        message="获取品类文章总数",
+        data={"total_articles": len(article_list), "category": category},
+    )
+    article_df = pd.DataFrame(
+        article_list,
+        columns=[
+            "article_id",
+            "gh_id",
+            "position",
+            "title",
+            "link",
+            "read_cnt",
+            "status",
+            "llm_sensitivity",
+            "score",
+        ],
+    )
+    return article_df
+
+
+def update_published_articles_status(db_client) -> None:
+    """
+    filter published articles
+    """
+    category_map = json.loads(config.getConfigValue("category_cold_start_map"))
+    category_list = list(category_map.keys())
+    processing_bar = tqdm(category_list, desc="update_published_articles")
+    for category in processing_bar:
+        plan_id = category_map.get(category)
+        if plan_id:
+            article_list = aiditApi.get_generated_article_list(plan_id)
+            title_list = [i[1] for i in article_list]
+            if title_list:
+                update_sql = f"""
+                        update crawler_meta_article
+                        set status = %s 
+                        where title in %s and status = %s;
+                """
+                affected_rows = db_client.save(
+                    query=update_sql,
+                    params=(
+                        const.PUBLISHED_STATUS,
+                        tuple(title_list),
+                        const.INIT_STATUS,
+                    ),
+                )
+                processing_bar.set_postfix(
+                    {"category": category, "affected_rows": affected_rows}
+                )
+        else:
+            return
+
+
+def filter_by_read_times(article_df: DataFrame) -> DataFrame:
+    """
+    Keep articles whose read count is at least READ_TIMES_THRESHOLD times the
+    mean read count of their (gh_id, position) group.
+
+    Note: the helper columns "average_read" and "read_times" are written onto
+    the dataframe that is passed in, so the caller's object is modified.
+
+    :param article_df: dataframe with "gh_id", "position" and "read_cnt" columns
+    :return: the rows of article_df that reach the threshold
+    """
+    # mean read_cnt of each (gh_id, position) group, aligned back to every row
+    article_df["average_read"] = article_df.groupby(["gh_id", "position"])[
+        "read_cnt"
+    ].transform("mean")
+    # ratio of each article's reads to the mean of its group
+    article_df["read_times"] = article_df["read_cnt"] / article_df["average_read"]
+    filter_df = article_df[article_df["read_times"] >= READ_TIMES_THRESHOLD]
+    return filter_df
+
+
+def filter_by_status(article_df: DataFrame) -> DataFrame:
+    """
+    filter by status
+    """
+    filter_df = article_df[article_df["status"] == const.INIT_STATUS]
+    return filter_df
+
+
+def filter_by_read_cnt(article_df: DataFrame) -> DataFrame:
+    """
+    filter by read cnt
+    """
+    filter_df = article_df[article_df["read_cnt"] >= READ_THRESHOLD]
+    return filter_df
+
+
+def filter_by_title_length(article_df: DataFrame) -> DataFrame:
+    """
+    filter by title length
+    """
+    filter_df = article_df[
+        (article_df["title"].str.len() >= LIMIT_TITLE_LENGTH)
+        & (article_df["title"].str.len() <= TITLE_LENGTH_MAX)
+    ]
+    return filter_df
+
+
+def filter_by_sensitive_words(article_df: DataFrame) -> DataFrame:
+    """
+    Drop every article whose title contains one of the hard-coded blocked words.
+
+    :param article_df: dataframe with a string "title" column
+    :return: the rows whose title contains none of the blocked words
+    """
+    # one str.contains check per blocked word; the negated masks are AND-ed,
+    # so a row survives only if none of the words occurs in its title
+    # NOTE(review): a missing (None/NaN) title would presumably make the "~"
+    # negation fail - confirm titles are never null, or pass na=False
+    filter_df = article_df[
+        (~article_df["title"].str.contains("农历"))
+        & (~article_df["title"].str.contains("太极"))
+        & (~article_df["title"].str.contains("节"))
+        & (~article_df["title"].str.contains("早上好"))
+        & (~article_df["title"].str.contains("赖清德"))
+        & (~article_df["title"].str.contains("普京"))
+        & (~article_df["title"].str.contains("俄"))
+        & (~article_df["title"].str.contains("南海"))
+        & (~article_df["title"].str.contains("台海"))
+        & (~article_df["title"].str.contains("解放军"))
+        & (~article_df["title"].str.contains("蔡英文"))
+        & (~article_df["title"].str.contains("中国"))
+    ]
+    return filter_df
+
+
+def filter_by_similarity_score(article_df: DataFrame, score) -> DataFrame:
+    """
+    Keep only the rows whose "score" column is at least the given score.
+
+    :param article_df: dataframe with a "score" column
+    :param score: inclusive lower bound for the "score" column
+    :return: the matching rows
+    """
+    filter_df = article_df[article_df["score"] >= score]
+    return filter_df
+
+
+def insert_into_article_crawler_plan(
+    db_client, crawler_plan_id, crawler_plan_name, create_timestamp
+):
+    """
+    Record a newly created crawler plan in the article_crawler_plan table.
+
+    A failure of the insert is not raised to the caller; it is reported
+    through bot() together with the traceback and the plan id/name.
+
+    :param db_client: database connector exposing save(query=..., params=...)
+    :param crawler_plan_id: id of the crawler plan
+    :param crawler_plan_name: name of the crawler plan
+    :param create_timestamp: creation timestamp stored with the plan
+    """
+    insert_sql = f"""
+        insert into article_crawler_plan (crawler_plan_id, name, create_timestamp)
+        values (%s, %s, %s);
+    """
+    try:
+        db_client.save(
+            query=insert_sql,
+            params=(crawler_plan_id, crawler_plan_name, create_timestamp),
+        )
+    except Exception as e:
+        # best effort: alert instead of interrupting the publish flow
+        bot(
+            title="品类冷启任务,记录抓取计划id失败",
+            detail={
+                "error": str(e),
+                "error_msg": traceback.format_exc(),
+                "crawler_plan_id": crawler_plan_id,
+                "crawler_plan_name": crawler_plan_name,
+            },
+        )
+
+
+def create_crawler_plan(url_list, plan_tag, platform) -> tuple:
+    """
+    Create a crawler plan for the given urls through aiditApi and log the response.
+
+    The plan name is built from the tag, today's date and the number of urls.
+
+    :param url_list: article urls the plan should crawl
+    :param plan_tag: tag passed to the plan and embedded in its name
+    :param platform: passed to the API as article_source
+    :return: (crawler_plan_id, crawler_plan_name, create_timestamp)
+    """
+    crawler_plan_response = aiditApi.auto_create_crawler_task(
+        plan_id=None,
+        plan_name="自动绑定-{}--{}--{}".format(
+            plan_tag, datetime.date.today().__str__(), len(url_list)
+        ),
+        plan_tag=plan_tag,
+        article_source=platform,
+        url_list=url_list,
+    )
+    log(
+        task="category_publish_task",
+        function="publish_filter_articles",
+        message="成功创建抓取计划",
+        data=crawler_plan_response,
+    )
+    # save to db
+    # the time is truncated to whole seconds before scaling, so the value is
+    # in milliseconds but always ends in 000
+    create_timestamp = int(time.time()) * 1000
+    crawler_plan_id = crawler_plan_response["data"]["id"]
+    crawler_plan_name = crawler_plan_response["data"]["name"]
+    return crawler_plan_id, crawler_plan_name, create_timestamp
+
+
+def bind_to_generate_plan(category, crawler_plan_id, crawler_plan_name, platform):
+    """
+    Attach a crawler plan as an input source of the category's generate plan.
+
+    The generate plan id is looked up under the category key of the
+    "category_cold_start_map" config entry; a category missing from that map
+    raises KeyError. The API response is logged.
+
+    :param category: key into the category_cold_start_map config
+    :param crawler_plan_id: id of the crawler plan to bind
+    :param crawler_plan_name: name of the crawler plan, sent as its label
+    :param platform: "weixin" or "toutiao"; any other value is treated like "weixin"
+    """
+    # input source channel code sent to the API: 5 for weixin (also the fallback), 6 for toutiao
+    match platform:
+        case "weixin":
+            input_source_channel = 5
+        case "toutiao":
+            input_source_channel = 6
+        case _:
+            input_source_channel = 5
+
+    # NOTE(review): the meaning of the remaining numeric codes is defined by
+    # the aidit API and is not visible in this module
+    new_crawler_task_list = [
+        {
+            "contentType": 1,
+            "inputSourceType": 2,
+            "inputSourceSubType": None,
+            "fieldName": None,
+            "inputSourceValue": crawler_plan_id,
+            "inputSourceLabel": crawler_plan_name,
+            "inputSourceModal": 3,
+            "inputSourceChannel": input_source_channel,
+        }
+    ]
+    category_map = json.loads(config.getConfigValue("category_cold_start_map"))
+    generate_plan_response = aiditApi.bind_crawler_task_to_generate_task(
+        crawler_task_list=new_crawler_task_list, generate_task_id=category_map[category]
+    )
+    log(
+        task="category_publish_task",
+        function="publish_filter_articles",
+        message="成功绑定到生成计划",
+        data=generate_plan_response,
+    )
+
+
+def update_article_status_after_publishing(db_client, article_id_list):
+    """
+    update article status after publishing
+    """
+    update_sql = f"""
+        update crawler_meta_article
+        set status = %s
+        where article_id in %s and status = %s;
+    """
+    affect_rows = db_client.save(
+        query=update_sql,
+        params=(const.PUBLISHED_STATUS, tuple(article_id_list), const.INIT_STATUS),
+    )
+    if affect_rows != len(article_id_list):
+        bot(
+            title="品类冷启任务中,出现更新状文章状态失败异常",
+            detail={"affected_rows": affect_rows, "task_rows": len(article_id_list)},
+        )

+ 29 - 1
coldStartTasks/publish/publishCategoryArticles.py

@@ -9,7 +9,7 @@ import traceback
 
 from pandas import DataFrame
 
-from applications import aiditApi, log, bot
+from applications import aiditApi, log, bot, llm_sensitivity
 from config import apolloConfig
 
 apollo = apolloConfig()
@@ -297,6 +297,18 @@ class CategoryColdStartTask(object):
         )
         return zero_level_funnel_df
 
+    def update_article_sensitive_status(self, article_id, status):
+        """
+        Store the LLM sensitivity result of one article.
+
+        Writes status into the llm_sensitivity column of the
+        crawler_meta_article row with the given article_id.
+
+        :param article_id: id of the article to update
+        :param status: value written to llm_sensitivity
+        :return: None
+        """
+        update_sql = f"""
+            update crawler_meta_article
+            set llm_sensitivity = %s
+            where article_id = %s;
+        """
+        self.db_client.update(sql=update_sql, params=(status, article_id))
+
     def publish_filter_articles(self, category, articles_df, article_source):
         """
         过滤文章
@@ -315,6 +327,22 @@ class CategoryColdStartTask(object):
             case _:
                 return
 
+        success_titles = filtered_articles_df['title'].values.tolist()
+        article_id_list = filtered_articles_df['article_id'].values.tolist()
+        if success_titles:
+            try:
+                sensitive_results = llm_sensitivity.check_titles(success_titles)
+                for article_id, sensitive_result in zip(article_id_list, sensitive_results):
+                    self.update_article_sensitive_status(
+                        article_id=article_id,
+                        status=sensitive_result['hit_rule']
+                    )
+                    if sensitive_result['hit_rule'] > TITLE_NOT_SENSITIVE:
+                        filtered_articles_df = filtered_articles_df[filtered_articles_df['article_id'] != article_id]
+
+            except Exception as e:
+                print("failed to update sensitive status: {}".format(e))
+
         url_list = filtered_articles_df['link'].values.tolist()
         if url_list:
             # create_crawler_plan

+ 125 - 0
coldStartTasks/publish/publish_article_association_articles.py

@@ -0,0 +1,125 @@
+"""
+@author: luojunhui
+"""
+
+from pandas import DataFrame
+
+from applications import bot
+from applications.const import ColdStartTaskConst
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+from coldStartTasks.publish.basic import filter_by_status
+from coldStartTasks.publish.basic import filter_by_sensitive_words
+from coldStartTasks.publish.basic import filter_by_title_length
+from coldStartTasks.publish.basic import update_published_articles_status
+from coldStartTasks.publish.basic import get_article_from_meta_table
+from coldStartTasks.publish.basic import update_article_status_after_publishing
+from coldStartTasks.publish.basic import create_crawler_plan
+from coldStartTasks.publish.basic import insert_into_article_crawler_plan
+from coldStartTasks.publish.basic import bind_to_generate_plan
+
+const = ColdStartTaskConst()
+
+
+def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame:
+    """
+    Apply the status, sensitive-word and title-length filters in that order
+    and report how many rows each step removed.
+
+    The per-step counts are sent through bot() without mentioning anyone.
+
+    :param article_df: candidate articles
+    :return: the rows that passed all three filters
+    """
+    total_length = article_df.shape[0]
+
+    # filter by status
+    filter_df = filter_by_status(article_df)
+    filter_length0 = filter_df.shape[0]
+
+    # filter by sensitive words
+    filter_df = filter_by_sensitive_words(filter_df)
+    filter_length1 = filter_df.shape[0]
+
+    # filter by title length
+    filter_df = filter_by_title_length(filter_df)
+    filter_length2 = filter_df.shape[0]
+
+    # each entry reports "removed by this step / remaining after this step"
+    bot(
+        title="文章联想任务,开始创建抓取计划",
+        detail={
+            "文章总数": total_length,
+            "发布状态过滤": "过滤: {}, 剩余: {}".format(
+                total_length - filter_length0, filter_length0
+            ),
+            "敏感词过滤": "过滤: {}, 剩余: {}".format(
+                filter_length0 - filter_length1, filter_length1
+            ),
+            "标题长度过滤": "过滤: {}, 剩余: {}".format(
+                filter_length1 - filter_length2, filter_length2
+            ),
+        },
+        mention=False,
+    )
+
+    return filter_df
+
+
+class ArticleAssociationPublish(object):
+    """
+    Publish "article_association" (i2i) articles: select candidates from the
+    meta table, create a crawler plan for them and bind it to the generate plan.
+    """
+
+    def __init__(self):
+        # connection to the long_articles database, opened on construction
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def deal(self):
+        """
+        Class entrance: run the whole publish flow once.
+
+        Nothing is created when no article survives the filters.
+        """
+        # update published articles
+        update_published_articles_status(db_client=self.db_client)
+
+        # get data from meta table
+        article_dataframe = get_article_from_meta_table(
+            db_client=self.db_client, category="article_association", platform="weixin"
+        )
+
+        # filter articles
+        filter_dataframe = filter_articles_before_create_plan(article_dataframe)
+
+        # create crawler plan
+        url_list = filter_dataframe["link"].values.tolist()
+        if url_list:
+            crawler_plan_id, crawler_plan_name, create_timestamp = create_crawler_plan(
+                url_list=url_list, plan_tag="article_association", platform="weixin"
+            )
+
+            # insert crawler plan
+            insert_into_article_crawler_plan(
+                db_client=self.db_client,
+                crawler_plan_id=crawler_plan_id,
+                crawler_plan_name=crawler_plan_name,
+                create_timestamp=create_timestamp,
+            )
+
+            # bind to generate plan
+            bind_to_generate_plan(
+                category="article_association",
+                crawler_plan_id=crawler_plan_id,
+                crawler_plan_name=crawler_plan_name,
+                platform="weixin",
+            )
+
+            # update status
+            article_id_list = filter_dataframe["article_id"].values.tolist()
+            update_article_status_after_publishing(
+                db_client=self.db_client, article_id_list=article_id_list
+            )
+
+            # report the created plan without mentioning anyone
+            bot(
+                title="文章联想任务,创建抓取计划成功",
+                detail={
+                    "抓取计划id": crawler_plan_id,
+                    "抓取计划名称": crawler_plan_name,
+                    "抓取条数": len(url_list),
+                    "冷启动类型": "article_association",
+                },
+                mention=False,
+            )

+ 17 - 3
coldStartTasks/publish/publish_video_to_pq_for_audit.py

@@ -12,7 +12,7 @@ from pymysql.cursors import DictCursor
 from applications import log
 from applications import PQAPI
 from applications.const import WeixinVideoCrawlerConst
-from applications.api import generate_mini_program_title
+from applications.api import fetch_moon_shot_response
 from applications.db import DatabaseConnector
 from config import long_articles_config
 
@@ -36,11 +36,14 @@ class PublishVideosForAudit(object):
         """
         already_published_count = self.get_published_articles_today()
         rest_count = const.MAX_VIDEO_NUM - already_published_count
+
+        limit_count = min(rest_count, const.MAX_VIDEO_NUM_PER_PUBLISH)
         sql = f"""
             SELECT id, article_title, video_oss_path 
             FROM publish_single_video_source 
             WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
-            LIMIT {rest_count};
+            ORDER BY score DESC
+            LIMIT {limit_count};
             """
         response = self.db_client.fetch(sql, cursor_type=DictCursor)
         return response
@@ -153,7 +156,18 @@ class PublishVideosForAudit(object):
         title = self.db_client.fetch(select_sql, cursor_type=DictCursor)[0]['article_title']
 
         try:
-            mini_program_title = generate_mini_program_title(title)
+            # generate kimi title
+            mini_program_title = fetch_moon_shot_response(task='generate_kimi_title', input_text=title)
+
+            # score kimi title
+            kimi_safe_title = None
+            title_safe_score = fetch_moon_shot_response(task='get_title_safe_score', input_text=mini_program_title)
+            if int(title_safe_score) > const.TITLE_SAFE_SCORE_THRESHOLD:
+                kimi_safe_title_obj = fetch_moon_shot_response(task='make_title_safe', input_text=title, output_type='json')
+                kimi_safe_title = kimi_safe_title_obj['title_v2']
+
+            mini_program_title = kimi_safe_title if kimi_safe_title else mini_program_title
+
             update_sql = f"""
             UPDATE publish_single_video_source SET mini_program_title = %s WHERE audit_video_id = %s;
             """

+ 20 - 1
config/__init__.py

@@ -81,6 +81,15 @@ piaoquan_crawler_config = {
     'charset': 'utf8mb4'
 }
 
+
+# moonshot model config(kimi)
+# NOTE(review): the api_key is committed in plain text - consider loading it
+# from the environment or a secret store and rotating this key
+moon_shot = {
+    "api_key": "sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q",
+    "model": "moonshot-v1-32k",
+    "base_url": "https://api.moonshot.cn/v1"
+}
+
+
 deep_seek_model = {
     "DeepSeek-R1": "ep-20250213194143-d8q4t",
     "DeepSeek-V3": "ep-20250213194558-rrmr2"
@@ -88,4 +97,14 @@ deep_seek_model = {
 
 deep_seek_default_model = "ep-20250213194558-rrmr2"
 
-deep_seek_api_key_byte_dance = '5e275c38-44fd-415f-abcf-4b59f6377f72'
+deep_seek_api_key_byte_dance = '5e275c38-44fd-415f-abcf-4b59f6377f72'
+
+# GeWe API credentials
+# NOTE(review): the token is committed in plain text - consider loading it
+# from the environment or a secret store and rotating it
+gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
+gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
+
+# sph decrypt key
+decrypt_key_path = 'applications/so/libsph_decrypt.so'
+
+# toutiao js path
+toutiao_js_path = 'applications/js/toutiao.js'

+ 30 - 7
config/crontab_backup

@@ -1,11 +1,26 @@
-# 凌晨1点30执行更新小程序信息任务
-30 1 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_minigram_info_daily.sh
+# 每天凌晨 4点,下午 4 点各执行一次头条视频抓取
+0 4,16 * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_account_video_crawler.sh
+
+# 每15分钟执行一次今日头条推荐流抓取
+*/15 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_toutiao_recommend.sh
+
+# 每10分钟执行一次从aigc系统获取发布文章
+*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_article_info_from_aigc.sh
+
+# 每10分钟执行一次标题相似度计算任务
+*/10 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_title_similarity_task.sh
+
+# 凌晨2点30执行更新小程序信息任务
+30 2 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_minigram_info_daily.sh
 
 # 每天上午10点30执行文章退场 && 晋升任务
 30 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_title_exit_v1.sh
 
 # 每天凌晨1点执行账号冷启动任务
-0 4 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_cold_start_daily.sh
+0 1 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_cold_start_daily.sh
+
+# 每日上午9点执行账号联想任务
+0 9 * * * bash /root/luojunhui/LongArticlesJob/sh/run_account_association.sh
 
 # 每天 10 点执行前一天的阅读率均值代码
 0 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_account_read_rate_avg.sh
@@ -13,18 +28,24 @@
 # 每天10点40执行阅读均值任务
 40 10 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_account_avg_v3.sh
 
+# 每天11点执行文章联想任务
+0 11 * * * bash /root/luojunhui/LongArticlesJob/sh/run_article_association.sh
+
 # 每小时执行一次校验视频状态
 20 * * * * bash /root/luojunhui/LongArticlesJob/sh/run_check_video_status_hourly.sh
 
-# 每天凌晨4:30, 8:30, 15:30执行视频发布和审核流程
-30 4,8,15 * * * bash /root/luojunhui/LongArticlesJob/sh/run_video_publish_and_audit.sh
+# 每天凌晨4:30 15:30执行视频发布和审核流程
+30 4,15 * * * bash /root/luojunhui/LongArticlesJob/sh/run_video_publish_and_audit.sh
 
 # 每天 上午8:30, 下午2:20, 晚上8:50执行
 
 30 8 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
-20 13 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
+20 14 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
 50 20 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily.sh
 
+# 每天上午9点,下午2点,晚上9点执行v2代码
+# 0 9,14,21 * * * bash /root/luojunhui/LongArticlesJob/sh/run_update_published_articles_daily_v2.sh
+
 
 # 每天上午 9:30 点,下午 2 点,晚上 7 点执行下架视频任务
 
@@ -37,8 +58,10 @@
 
 # 每天上午10点,下午4点,晚上8点
 
-30 9,14 * * * bash /root/luojunhui/LongArticlesJob/sh/published_articles_monitor.sh
+0 10,16,20 * * * bash /root/luojunhui/LongArticlesJob/sh/published_articles_monitor.sh
 
+# 每晚11点开始执行百度视频
+0 23 * * * bash /root/luojunhui/LongArticlesJob/sh/run_baidu_video_crawler.sh
 
 # check kimi balance hourly
 

+ 9 - 0
crawler_sph_video.py

@@ -0,0 +1,9 @@
+"""
+Entry script: build a CrawlerChannelAccountVideos task and call its deal() method.
+
+@author: luojunhui
+"""
+
+from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
+
+if __name__ == "__main__":
+    # only runs when executed as a script, not on import
+    crawler_channel_account_videos = CrawlerChannelAccountVideos()
+    crawler_channel_account_videos.deal()

+ 3 - 1
requirements.txt

@@ -20,4 +20,6 @@ protobuf~=3.20.3
 openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
-playwright~=1.49.1
+playwright~=1.49.1
+volcengine-python-sdk[ark]
+tenacity~=9.0.0

+ 8 - 0
run_baidu_video_crawler.py

@@ -0,0 +1,8 @@
+"""
+Entry script: build a BaiduVideoCrawler task and call its deal() method.
+
+@author: luojunhui
+"""
+from coldStartTasks.crawler.baidu import BaiduVideoCrawler
+
+if __name__ == '__main__':
+    # only runs when executed as a script, not on import
+    task = BaiduVideoCrawler()
+    task.deal()

+ 9 - 0
run_title_rewrite_task.py

@@ -0,0 +1,9 @@
+"""
+Entry script: build a TitleRewriteTask and call its deal() method.
+
+@author: luojunhui
+"""
+from tasks.title_rewrite_task import TitleRewriteTask
+
+
+if __name__ == '__main__':
+    # only runs when executed as a script, not on import
+    task = TitleRewriteTask()
+    task.deal()

+ 26 - 0
sh/run_article_association.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/article_association_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 article_association_task.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - article_association_task.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart article_association_task.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 article_association_task.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted article_association_task.py"
+fi

+ 26 - 0
sh/run_baidu_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/baidu_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_baidu_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_baidu_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_baidu_video_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_baidu_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_baidu_video_crawler.py"
+fi

+ 26 - 0
sh/run_gzh_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/gzh_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_video_account_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_video_account_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_video_account_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_video_account_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_video_account_crawler.py"
+fi

+ 26 - 0
sh/run_sph_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/sph_video_crawler_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 crawler_sph_video.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - crawler_sph_video.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart crawler_sph_video.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 crawler_sph_video.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted crawler_sph_video.py"
+fi

+ 26 - 0
sh/run_title_rewrite_task.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/title_rewrite_task_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_title_rewrite_task.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_title_rewrite_task.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_title_rewrite_task.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 run_title_rewrite_task.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_title_rewrite_task.py"
+fi

+ 26 - 0
sh/run_toutiao_account_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# 获取当前日期,格式为 YYYY-MM-DD
+CURRENT_DATE=$(date +%F)
+
+# 日志文件路径,包含日期
+LOG_FILE="/root/luojunhui/logs/toutiao_account_video_crawler_task_log_$CURRENT_DATE.txt"
+
+# 重定向整个脚本的输出到带日期的日志文件
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 toutiao_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - toutiao_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart toutiao_video_crawler.py"
+    # 切换到指定目录
+    cd /root/luojunhui/LongArticlesJob
+
+    # 激活 Conda 环境
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # 在后台运行 Python 脚本并重定向日志输出
+    nohup python3 toutiao_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted toutiao_video_crawler.py"
+fi

+ 0 - 2
sh/run_video_publish_and_audit.sh

@@ -21,8 +21,6 @@ else
     conda activate tasks
 
     # 在后台运行 Python 脚本并重定向日志输出
-    nohup python3 run_video_account_crawler.py >> "${LOG_FILE}" 2>&1 &
-    sleep 180
     nohup python3 run_video_publish_and_audit.py >> "${LOG_FILE}" 2>&1 &
     echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_video_publish_and_audit.py"
 fi

+ 1 - 1
tasks/article_summary_task.py

@@ -69,7 +69,7 @@ class ArticleSummaryTask(object):
         """
         rollback_rows = self.db_client.save(
             query=update_sql,
-            params=(const.SUMMARY_INIT_STATUS, const.SUMMARY_LOCK, timestamp_threshold),
+            params=(const.INIT_STATUS, const.PROCESSING_STATUS, timestamp_threshold),
         )
 
         return rollback_rows

+ 224 - 0
tasks/crawler_channel_account_videos.py

@@ -0,0 +1,224 @@
+"""
+@author: luojunhui
+@tool: pycharm && deepseek
+"""
+
+import re
+import os
+import traceback
+import time
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ChannelVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.utils import download_sph_video
+from applications.utils import insert_into_single_video_source_table
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import upload_to_oss
+from config import long_articles_config
+from coldStartTasks.crawler.channels import get_channel_account_videos
+
+const = ChannelVideoCrawlerConst()
+
+
+class CrawlerChannelAccountVideos:
+    """
+    crawler channel account videos
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def whether_video_exists(self, title: str) -> bool:
+        """
+        Return True when a stored row already has exactly this title.
+
+        Only article_title is compared; the video id is not part of the lookup.
+        """
+        # the title comes from crawled data, so it is bound as a query parameter
+        sql = """
+            select id from publish_single_video_source
+            where article_title = %s;
+        """
+        duplicate_id = self.db_client.fetch(query=sql, params=(title,))
+        # an empty/falsy fetch result means no stored row carries this title
+        return bool(duplicate_id)
+
+    def get_channel_account_list(self) -> list[dict]:
+        """
+        get channel account list from database
+        """
+        sql = f"""
+            select account_id, max_cursor 
+            from sph_account_for_videos 
+            where status = {const.CHANNEL_ACCOUNT_GOOD_STATUS}
+            order by max_cursor;"""
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def crawler_each_video(self, video: dict) -> None:
+        """
+        Filter one video, then download/decrypt it, upload to oss and save the row.
+        The local decrypted file is always removed, even when a later step fails.
+        """
+        object_desc = video["objectDesc"]
+        title = object_desc["description"]
+        if self.whether_video_exists(title):
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title exists",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        cleaned_title = re.sub(r"[^\u4e00-\u9fff]", "", title)  # keep CJK ideographs only
+        if len(cleaned_title) < const.MIN_TITLE_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title is too short",
+                data={"video_id": video["id"], "title": title},
+            )
+            return
+
+        video_length = video["objectDesc"]["media"][0]["VideoPlayLen"]
+        if video_length and int(video_length) > const.MAX_VIDEO_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video length is too long",
+                data={"video_id": video["id"], "title": title, "length": video_length},
+            )
+            return
+
+        video_item = Item()
+        video_id = video["id"]
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video["username"])
+        video_item.add("out_account_name", video["nickname"])
+        video_item.add("publish_timestamp", video["createtime"])
+        video_item.add("platform", "sph")
+        video_item.add("crawler_timestamp", int(time.time()))
+        media = object_desc["media"][0]
+        decode_key = media["decodeKey"]
+        download_url = media["Url"] + media["urlToken"]
+        decrypt_path = None  # stays None when the download itself fails
+        try:
+            decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
+            oss_path = upload_to_oss(decrypt_path)
+            video_item.add("video_oss_path", oss_path)
+            video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
+            video_item.check(source="video")
+            insert_into_single_video_source_table(self.db_client, video_item.item)
+        except Exception as e:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="download video failed",
+                data={
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                    "video_id": video["id"],
+                },
+            )
+        finally:
+            if decrypt_path and os.path.exists(decrypt_path):
+                os.remove(decrypt_path)
+
+    def crawler_each_account(self, channel_account: dict, last_buffer: str = "") -> None:
+        """
+        Crawl one channel account page by page, using a loop instead of recursion.
+        """
+        channel_account_id = channel_account["account_id"]
+        max_cursor = channel_account.get("max_cursor") or const.DEFAULT_CURSOR
+        current_last_buffer = last_buffer
+        has_more = True
+
+        while has_more:
+            response = get_channel_account_videos(channel_account_id, last_buffer=current_last_buffer)
+            if response["ret"] != 200:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="crawler_each_account",
+                    message="get_channel_account_videos failed",
+                    data={
+                        "response": response,
+                        "channel_account_id": channel_account_id,
+                        "max_cursor": max_cursor,
+                    },
+                )
+                break
+
+            response_data = response["data"]
+            current_last_buffer = response_data["lastBuffer"]  # paging cursor for the next request
+            has_more = response_data["continueFlag"]  # whether another page exists
+            video_list = response_data["object"]
+
+            if not video_list:
+                break
+
+            create_timestamp = video_list[0]["createtime"]
+            if create_timestamp < max_cursor:  # first item of the page predates the stored cursor
+                break
+
+            crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
+            for video in crawl_video_list_bar:
+                crawl_video_list_bar.set_postfix({"video_id": video["id"]})
+                self.crawler_each_video(video)
+
+            if has_more:
+                time.sleep(const.SLEEP_SECOND)
+            else:
+                break
+
+    def update_account_max_cursor(self, account_id: str) -> None:
+        """
+        Store the newest crawled publish_timestamp as the account's max_cursor.
+        """
+        # bind account_id as a parameter instead of formatting it into the SQL text
+        select_sql = """
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = %s;
+        """
+        response_mysql = self.db_client.fetch(query=select_sql, params=(account_id,))
+        max_publish_timestamp = response_mysql[0][0]
+
+        # max() is NULL when the account has no stored videos; keep the old cursor then
+        if max_publish_timestamp:
+            update_sql = """
+                update sph_account_for_videos set max_cursor = %s
+                where account_id = %s;
+            """
+            params = (max_publish_timestamp, account_id)
+            self.db_client.save(query=update_sql, params=params)
+
+    def deal(self):
+        """
+        deal channel account videos
+        """
+        account_list = self.get_channel_account_list()
+        account_crawler_bar = tqdm(account_list, desc="crawler channel account videos")
+        for account in account_crawler_bar:
+            try:
+                account_crawler_bar.set_postfix({"account_id": account["account_id"]})
+                self.crawler_each_account(channel_account=account)
+                self.update_account_max_cursor(account["account_id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="deal",
+                    message="crawler channel account videos failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "account_id": account["account_id"],
+                    },
+                )

+ 208 - 0
tasks/crawler_toutiao_account_videos.py

@@ -0,0 +1,208 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ToutiaoVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from config import apolloConfig, long_articles_config
+
+const = ToutiaoVideoCrawlerConst()
+config = apolloConfig()
+cookie = config.getConfigValue("toutiao_blogger_cookie")
+
+
+class CrawlerToutiaoAccountVideos:
+    """
+    toutiao blogger crawler
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def get_account_list(self):
+        """
+        get account list
+        """
+        sql = f"""
+            select account_id, max_cursor
+            from video_meta_accounts
+            where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
+        """
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
+
+    def crawler_each_account_video_list(
+        self, account_id: str, max_cursor: int | None, max_behot_time: int = 0
+    ):
+        """
+        account_id: toutiao account id
+        max_cursor: stored cursor of the account; falls back to const.DEFAULT_CURSOR
+        max_behot_time: paging value sent to toutiao, used to switch to the next page
+        """
+        has_more = True
+        current_cursor = max_behot_time
+        max_cursor = max_cursor or const.DEFAULT_CURSOR
+
+        while has_more:
+            response = get_toutiao_account_video_list(
+                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
+            )
+            if not response:
+                break
+
+            if response["message"] != "success":
+                log(
+                    task="crawler_toutiao_account_videos",
+                    function="crawler_each_account_video_list",
+                    message="get response from toutiao failed",
+                    data={"account_id": account_id, "response": response},
+                )
+                break
+
+            video_list = response["data"]
+            has_more = response["has_more"]
+            current_cursor = response["next"]["max_behot_time"]
+
+            if not video_list:
+                break
+
+            max_timestamp_in_this_group = video_list[0]["publish_time"]
+            if max_timestamp_in_this_group < max_cursor:
+                break
+
+            # crawl each video; one failing video is logged and does not stop the page
+            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
+            for video in crawler_video_list_bar:
+                try:
+                    crawler_video_list_bar.set_postfix({"video_id": video["id"]})
+                    self.crawler_each_video(video)
+
+                except Exception as e:
+                    log(
+                        task="crawler_toutiao_account_videos",
+                        function="crawler_each_account_video_list",
+                        message="crawler each video failed",
+                        data={
+                            "account_id": account_id,
+                            "video_info": video,
+                            "error": str(e),
+                            "traceback": traceback.format_exc(),
+                        },
+                    )
+
+            if has_more:
+                time.sleep(const.SLEEP_SECOND)
+            else:
+                break
+
+    def crawler_each_video(self, video_data):
+        """
+        Build the item for one toutiao video, run the video pipeline and insert the result.
+        """
+        video_item = Item()
+        video_id = video_data["group_id"]
+        title = video_data["title"]
+        media = video_data["video"]
+        url = media["download_addr"]["url_list"][0]
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
+        video_item.add("url_unique_md5", video_id)
+        video_item.add("article_title", title)
+        video_item.add("out_account_id", video_data["user"]["user_id"])
+        video_item.add("out_account_name", video_data["source"])
+        video_item.add("publish_timestamp", video_data["publish_time"])
+        video_item.add("platform", const.PLATFORM)
+        video_item.add("read_cnt", video_data.get("read_count", 0))
+        video_item.add("article_url", url)
+        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert; an error here propagates to the caller
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    self.db_client, item_with_oss_path
+                )
+        except Exception as e:
+            log(
+                task="crawler_toutiao_account_videos",
+                function="crawler_each_video",
+                message="etl failed",
+                data={
+                    "video_item": video_item.item,
+                    "error": str(e),
+                    "traceback": traceback.format_exc(),
+                }
+            )
+
+    def update_account_max_cursor(self, account_id: str) -> None:
+        """
+        Store the newest crawled publish_timestamp as the account's max_cursor.
+        """
+        select_sql = """
+            select max(publish_timestamp) as max_cursor
+            from publish_single_video_source
+            where out_account_id = %s and platform = %s;
+        """
+        # values are bound as parameters rather than formatted into the SQL text
+        response_mysql = self.db_client.fetch(
+            query=select_sql, params=(account_id, const.PLATFORM)
+        )
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = """
+                update video_meta_accounts set max_cursor = %s
+                where account_id = %s and platform = %s;
+            """
+            update_params = (max_publish_timestamp, account_id, const.PLATFORM)
+            self.db_client.save(query=update_sql, params=update_params)
+
+    def deal(self) -> None:
+        """
+        class entrance
+        """
+        account_list = self.get_account_list()
+        account_list_bar = tqdm(account_list, desc="crawler toutiao accounts")
+        for account in account_list_bar:
+            account_id = account["account_id"]
+            max_cursor = account["max_cursor"]
+            try:
+                # crawl each account
+                account_list_bar.set_postfix({"account_id": account_id})
+                self.crawler_each_account_video_list(
+                    account_id=account_id, max_cursor=max_cursor
+                )
+                self.update_account_max_cursor(account_id)
+
+            except Exception as e:
+                # add log and bot
+                log(
+                    task="crawler_toutiao_account_videos",
+                    function="deal",
+                    message=account_id,
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                )

+ 270 - 0
tasks/title_rewrite_task.py

@@ -0,0 +1,270 @@
+"""
+@author: luojunhui
+"""
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.api import fetch_deepseek_response
+from applications.const import TitleRewriteTaskConst
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+const = TitleRewriteTaskConst()
+
+
+def generate_prompt(ori_title):
+    """
+    生成prompt
+    """
+    prompt = f"""
+    请将以下标题改写成适合公众号中小程序点击和传播的文章标题,文章标题的写作规范如下,请学习后进行文章标题的编写。直接输出最终的文章标题,文章标题撰写规范如下:
+    1. 标题结构:要点前置,信息明确
+        核心信息前置:标题开头直接点出文章的核心内容或亮点,吸引读者注意。例如:
+          “我国存款最安全的五大银行,永远都不会倒闭,你知道是哪五家吗?”
+          “亩产7000斤,被误认成萝卜却曾是‘救命粮’,如今成我国出口名蔬”。
+        简洁明了:标题通常在20字以内,信息集中且易于理解。
+        悬念前置结构:前半句设置反常/冲突场景(如"刑满释放蹬三轮")+后半句用结果反转制造悬念("政府领导登门分配工作")
+        多要素拼接:通过冒号/逗号分隔不同叙事主体(地域+人物冲突+权威评价),如"辽宁女子住高档小区被敲门,法院判决意外"
+    
+    2. 情绪表达:激发共鸣,引发好奇
+        情感共鸣:通过情感化的语言触动读者,泪崩/守护/抱头痛哭等情感冲击词,配合家庭伦理场景
+        例如:
+          “老母亲分家产,给亲闺女30万,给养女一筐青菜,养女意外摔倒,看到筐子里的东西,瞬间愣住了”。
+          “儿子卖车卖房给母亲治病,母亲去世后儿媳收拾房间,打开床底柜,儿子突然痛哭”。
+        悬念与好奇心:通过提问或制造悬念,激发读者点击欲望。例如:
+          “你知道是哪五家吗?”
+          “打开床底柜,儿子突然痛哭”。
+        冲突性情绪词:拍桌大骂/气愤不已/眼红不已/算计等强对抗性词汇
+        结果反差刺激:用"风光善终/价值过亿/判决意外"等违反预期的结果
+    
+    3. 语言风格:口语化、接地气
+        口语化表达:使用通俗易懂的语言,贴近读者生活。
+        刻意使用"赶都赶不走/各吃各的/我就知道你在家"等市井化用语。
+        例如:
+          “狗屎运?江西男子钓鱼时发现青鱼尸骸,扒开后捡到鸡蛋大小的青鱼石”。
+          “聪明的女人,不会帮婆家3种忙,而蠢女人才一再插手”。
+        接地气的词汇:使用“狗屎运”“蠢女人”等口语化词汇,增强亲切感。
+        身份反差构建:突出人物命运转折(老农→亿万富翁/囚犯→政府帮扶对象)
+        权威背书暗示:"专家气愤/法院判决/网友评价"等第三方视角增强可信度
+    
+    4. 标点运用:增强语气,突出重点
+        问号与感叹号:通过问号制造悬念,感叹号强化情感。
+        在关键转折点使用("太气人了!/赔不了!")
+        问号制造互动:如"容嬷嬷是校花?"激发读者验证心理
+        例如:
+          “你知道是哪五家吗?”
+          “太无耻了!湖南,一名厨师被公司派到云南‘出差’被拒……”
+        引号与冒号:用于突出关键词或转折点。
+        破折号递进:用"——"引导关键信息("吃不完最好扔掉——")
+        例如:
+          “被误认成萝卜却曾是‘救命粮’”。
+          “女子归还后,失主拒绝支付报酬,还说:要有格局”。
+    
+    5. 热点与话题性:结合社会热点或争议
+        社会热点:结合当前热点事件或争议话题,吸引关注。例如:
+          “上海:男子超市连续购买46枚过期咸鸭蛋,2天分46次交易,向厂家索赔金14万,法院判了!”
+        争议性话题:通过争议性内容引发讨论。例如:
+          “李玉成终于说出实话,公开吐槽马玉琴年纪太大,结婚28年疑似后悔”。
+    
+    6. 数字与具体细节:增强可信度与吸引力
+        数字的运用:通过具体数字增强标题的可信度和吸引力。例如:
+          “亩产7000斤”。
+          “22年河南男子跳河救人,体力耗尽留遗言”。
+        细节描述:通过细节让标题更具画面感。例如:
+          “打开床底柜,儿子突然痛哭”。
+          “扒开后捡到鸡蛋大小的青鱼石”。
+    
+    7. 价值诉求:传递实用信息或情感价值
+        实用信息:提供对读者有价值的信息。例如:
+          “我国存款最安全的五大银行,永远都不会倒闭”。
+          “72岁老人每天一个蒸苹果,半年后体检,看到指标变化让他乐开了花”。
+        情感价值:通过情感故事或人生哲理打动读者。例如:
+          “父母越老越能暴露家庭最真实的一面:当父母70岁,子女不该抱有这三种期待”。
+    
+    8. 名人效应与历史情怀:增强吸引力
+        名人效应:提及名人或历史事件,吸引关注。例如:
+          “难怪王扶林说陈晓旭不够漂亮,看看他选的原黛玉候选人,那才叫美”。
+          “1975年‘下馆子’的老照片,2元能吃些什么,勾起那段最难忘的时光”。
+    
+    9.隐藏传播逻辑:通过标题中暗含的、能触发人性弱点(如猎奇、贪婪、同情)或社会痛点的心理机制,通过潜意识刺激读者点击欲望
+       人性弱点触发:贪婪(200万保单)、猎奇(林彪密件)、窥私(家庭算计)
+       生存焦虑关联:医疗(脑瘫儿)、养老(子女不孝)、食品安全(二次加热)
+       身份代入设计:选择"老太太/外甥女/退休母亲"等易引发群体共鸣的角色
+    输入的标题是: '{ori_title}'
+    """
+    return prompt
+
+
+class TitleRewriteTask:
+    """
+    标题重写任务
+    """
+
+    def __init__(self):
+        self.db = DatabaseConnector(db_config=long_articles_config)
+        self.db.connect()
+
+    def roll_back_blocked_tasks(self):
+        """
+        Reset rows that stayed in the lock status longer than TITLE_REWRITE_LOCK_TIME.
+        """
+        sql = f"""
+            select id, title_rewrite_status_update_timestamp
+            from publish_single_video_source
+            where title_rewrite_status = {const.TITLE_REWRITE_LOCK_STATUS};
+        """
+        article_list = self.db.fetch(query=sql, cursor_type=DictCursor)
+        if not article_list:
+            return
+
+        # one clock reading for the whole batch; a missing timestamp counts as 0,
+        # so such a row is treated as expired instead of raising TypeError
+        now = int(time.time())
+        blocked_id_list = [
+            i["id"]
+            for i in article_list
+            if now - (i["title_rewrite_status_update_timestamp"] or 0)
+            > const.TITLE_REWRITE_LOCK_TIME
+        ]
+        if not blocked_id_list:
+            return
+
+        update_sql = """
+            update publish_single_video_source
+            set title_rewrite_status = %s
+            where id in %s and title_rewrite_status = %s;
+        """
+        self.db.save(
+            query=update_sql,
+            params=(const.TITLE_REWRITE_INIT_STATUS, tuple(blocked_id_list), const.TITLE_REWRITE_LOCK_STATUS),
+        )
+
+    def get_articles_batch(self, batch_size=1000):
+        """
+        从数据库中获取文章
+        """
+        sql = f"""
+            select content_trace_id, article_title
+            from publish_single_video_source 
+            where bad_status = {const.ARTICLE_POSITIVE_STATUS} 
+                and audit_status = {const.ARTICLE_AUDIT_PASSED_STATUS} 
+                and title_rewrite_status = {const.TITLE_REWRITE_INIT_STATUS}
+                and platform in ('hksp', 'sph')
+            limit {batch_size};
+        """
+        res = self.db.fetch(query=sql, cursor_type=DictCursor)
+        return res
+
+    def update_title_rewrite_status(self, content_trace_id, ori_status, new_status):
+        """
+        更新标题重写状态
+        """
+        sql = f"""
+            update publish_single_video_source
+            set title_rewrite_status = %s, title_rewrite_status_update_timestamp = %s
+            where content_trace_id = %s and title_rewrite_status= %s;
+        """
+        affected_rows = self.db.save(
+            query=sql, params=(new_status, int(time.time()), content_trace_id, ori_status)
+        )
+        return affected_rows
+
+    def insert_into_rewrite_table(self, content_trace_id, new_title):
+        """
+        insert into rewrite_table
+        """
+        insert_sql = f"""
+            insert into video_title_rewrite
+            (content_trace_id, new_title, status, prompt_version)
+            values (%s, %s, %s, %s);
+        """
+        self.db.save(
+            query=insert_sql,
+            params=(
+                content_trace_id,
+                new_title,
+                const.TITLE_USEFUL_STATUS,
+                const.PROMPT_VERSION
+            ),
+        )
+
+    def rewrite_each_article(self, article):
+        """
+        Lock one row, request a rewritten title, store it and mark the row success/fail.
+        """
+        content_trace_id = article["content_trace_id"]
+        article_title = article["article_title"]
+
+        # lock each task; when nothing was updated the row is skipped
+        affected_rows = self.update_title_rewrite_status(
+            content_trace_id=content_trace_id,
+            ori_status=const.TITLE_REWRITE_INIT_STATUS,
+            new_status=const.TITLE_REWRITE_LOCK_STATUS,
+        )
+        if not affected_rows:
+            return
+
+        try:
+            prompt = generate_prompt(article_title)
+            new_title = fetch_deepseek_response(model="default", prompt=prompt)
+            if not new_title:  # a falsy response must not be stored as a usable title
+                raise ValueError("empty title returned by deepseek")
+
+            # insert into rewrite table
+            self.insert_into_rewrite_table(content_trace_id=content_trace_id, new_title=new_title)
+
+            # unlock
+            self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=const.TITLE_REWRITE_LOCK_STATUS,
+                new_status=const.TITLE_REWRITE_SUCCESS_STATUS,
+            )
+        except Exception as e:
+            log(
+                task="title rewrite task",
+                function="rewrite_each_article",
+                message=content_trace_id,
+                status="fail",
+                data={
+                    "error_message": str(e),
+                    "error_type": type(e).__name__,
+                    "traceback": traceback.format_exc(),
+                },
+            )
+            self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=const.TITLE_REWRITE_LOCK_STATUS,
+                new_status=const.TITLE_REWRITE_FAIL_STATUS,
+            )
+
+    def deal(self):
+        """
+        Roll back stale locked tasks (best effort), then rewrite one batch of titles.
+        """
+        # rollback blocked tasks; a failure here is logged and the batch still runs
+        try:
+            self.roll_back_blocked_tasks()
+        except Exception as e:
+            log(
+                task="title rewrite task",
+                function="roll_back_blocked_tasks",
+                message="roll back blocked tasks fail",
+                status="fail",
+                data={
+                    "error_message": str(e),
+                    "error_type": type(e).__name__,
+                    "traceback": traceback.format_exc()
+                }
+            )
+
+        # process tasks; label the bar before the work so it shows the current item
+        articles = self.get_articles_batch()
+        bar = tqdm(articles, desc="title rewrite task")
+        for article in bar:
+            bar.set_postfix({"content_trace_id": article["content_trace_id"]})
+            self.rewrite_each_article(article)

+ 12 - 3
tasks/update_published_articles_minigram_detail.py

@@ -120,7 +120,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
         :return:
         """
         select_sql = f"""
-            SELECT first_uv, split0, split1, split2
+            SELECT first_uv, split0, split0_head, split0_recommend, split1, split1_head, split1_recommend, split2, split2_head, split2_recommend
             FROM changwen_data_rootsourceid
             WHERE root_source_id = '{root_source_id}' AND dt = '{dt}';
         """
@@ -235,7 +235,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
                 return article_info
 
         else:
-            return article_info
+            return EMPTY_DICT
 
     def get_root_source_id_for_three_days(self, biz_date: str) -> List[Dict]:
         """
@@ -263,7 +263,10 @@ class UpdatePublishedArticlesMinigramDetail(object):
             # do update job
             update_sql = f"""
                 UPDATE {DETAIL_TABLE}
-                SET first_level = %s, fission_0 = %s, fission_1 = %s, fission_2 = %s
+                SET first_level = %s, 
+                    fission_0 = %s, fission_0_head = %s, fission_0_recommend = %s, 
+                    fission_1 = %s, fission_1_head = %s, fission_1_recommend = %s, 
+                    fission_2 = %s, fission_2_head = %s, fission_2_recommend = %s
                 WHERE root_source_id = %s and recall_dt = %s;
             """
             self.piaoquan_crawler_db_client.save(
@@ -271,8 +274,14 @@ class UpdatePublishedArticlesMinigramDetail(object):
                 params=(
                     mini_program_detail['first_uv'],
                     mini_program_detail['split0'],
+                    mini_program_detail['split0_head'],
+                    mini_program_detail['split0_recommend'],
                     mini_program_detail['split1'],
+                    mini_program_detail['split1_head'],
+                    mini_program_detail['split1_recommend'],
                     mini_program_detail['split2'],
+                    mini_program_detail['split2_head'],
+                    mini_program_detail['split2_recommend'],
                     root_source_id,
                     recall_dt
                 )

+ 26 - 1
title_similarity_score_task.py

@@ -1,6 +1,8 @@
 """
 @author: luojunhui
 """
+import traceback
+from applications import bot
 from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
 
 
@@ -8,4 +10,27 @@ if __name__ == '__main__':
     batch_size = 3000
     task = ColdStartTitleSimilarityTask()
     task.init_database()
-    task.run(limit=batch_size)
+    # process video
+    try:
+        task.run(meta_source="video")
+    except Exception as e:
+        bot(
+            title="视频冷启池nlp任务异常",
+            mention=False,
+            detail={
+                "traceback": traceback.format_exc(),
+                "error": f"{e}"
+            }
+        )
+    # process article
+    try:
+        task.run(meta_source="article")
+    except Exception as e:
+        bot(
+            title="文章冷启池nlp任务异常",
+            mention=False,
+            detail={
+                "traceback": traceback.format_exc(),
+                "error": f"{e}"
+            }
+        )

+ 10 - 0
toutiao_video_crawler.py

@@ -0,0 +1,10 @@
+"""
+@author: luojunhui
+"""
+
+from tasks.crawler_toutiao_account_videos import CrawlerToutiaoAccountVideos
+
+
+if __name__ == '__main__':
+    crawler = CrawlerToutiaoAccountVideos()
+    crawler.deal()

+ 83 - 131
updateAccountV3.py

@@ -7,151 +7,104 @@ import time
 from tqdm import tqdm
 from datetime import datetime, timedelta
 from argparse import ArgumentParser
+from pymysql.cursors import DictCursor
 
-from applications import PQMySQL, DeNetMysql, longArticlesMySQL
-from applications.const import updateAccountReadAvgTaskConst
+from applications.const import UpdateAccountReadAvgTaskConst
+from applications.db import DatabaseConnector
+from applications.utils import fetch_account_fans
+from applications.utils import fetch_publishing_account_list
 from config import apolloConfig
+from config import long_articles_config, denet_config, piaoquan_crawler_config
 
+read_rate_table = "long_articles_read_rate"
+read_avg_table = "account_avg_info_v3"
 config = apolloConfig()
+const = UpdateAccountReadAvgTaskConst()
 unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
 touliu_accounts = set(json.loads(config.getConfigValue("touliu_gh_id_list")))
-
-
-def get_account_fans_by_dt(db_client) -> dict:
-    """
-    获取每个账号发粉丝,通过日期来区分
-    :return:
-    """
-    sql = f"""
-        SELECT 
-            t1.date_str, 
-            t1.fans_count, 
-            t2.gh_id
-        FROM datastat_wx t1
-        JOIN publish_account t2 ON t1.account_id = t2.id
-        WHERE 
-            t2.channel = 5 
-        AND t2.status = 1 
-        AND t1.date_str >= '2024-09-01' 
-        ORDER BY t1.date_str;
-    """
-    result = db_client.select(sql)
-    D = {}
-    for line in result:
-        dt = line[0]
-        fans = line[1]
-        gh_id = line[2]
-        if D.get(gh_id):
-            D[gh_id][dt] = fans
-        else:
-            D[gh_id] = {dt: fans}
-    return D
-
+backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
 
 class UpdateAccountInfoVersion3(object):
     """
-    更新账号信息 v3
+    更新账号的平均阅读率
     """
 
     def __init__(self):
-        self.const = updateAccountReadAvgTaskConst()
-        self.pq = PQMySQL()
-        self.de = DeNetMysql()
-        self.lam = longArticlesMySQL()
+        # init piaoquan crawler db client
+        self.piaoquan_crawler_db_client = DatabaseConnector(piaoquan_crawler_config)
+        self.piaoquan_crawler_db_client.connect()
+
+        # init long articles db client
+        self.long_articles_db_client = DatabaseConnector(long_articles_config)
+        self.long_articles_db_client.connect()
+
+        #  init aigc db client
+        self.denet_db_client = DatabaseConnector(denet_config)
+        self.denet_db_client.connect()
 
-    def get_account_position_read_rate(self, dt):
+    def fetch_read_rate_avg_for_each_account(self, dt):
         """
         从长文数据库获取账号阅读均值
         :return:
         """
         dt = int(dt.replace("-", ""))
         sql = f"""
-            SELECT 
-                gh_id, position, read_rate_avg
-            FROM
-                long_articles_read_rate
-            WHERE dt_version = {dt};
+            select gh_id, position, read_rate_avg
+            from {read_rate_table}
+            where dt_version = {dt};
         """
-
-        result = self.lam.select(sql)
+        fetch_response_list = self.long_articles_db_client.fetch(query=sql, cursor_type=DictCursor)
         account_read_rate_dict = {}
-        for item in result:
-            gh_id = item[0]
-            position = item[1]
-            rate = item[2]
-            key = "{}_{}".format(gh_id, position)
-            account_read_rate_dict[key] = rate
+        for item in fetch_response_list:
+            key = "{}_{}".format(item['gh_id'], item['position'])
+            account_read_rate_dict[key] = item['read_rate_avg']
         return account_read_rate_dict
 
-    def get_publishing_accounts(self):
-        """
-        获取每日正在发布的账号
-        :return:
-        """
-        sql = f"""
-        SELECT DISTINCT
-            t3.`name`,
-            t3.gh_id,
-            t3.follower_count,
-            t6.account_source_name,
-            t6.mode_type,
-            t6.account_type,
-            t6.`status`
-        FROM
-            publish_plan t1
-            JOIN publish_plan_account t2 ON t1.id = t2.plan_id
-            JOIN publish_account t3 ON t2.account_id = t3.id
-            LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
-            LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
-            LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
-        WHERE
-            t1.plan_status = 1
-            AND t3.channel = 5
-            GROUP BY t3.id;
-        """
-        account_list = self.de.select(sql)
-        result_list = [
-            {
-                "account_name": i[0],
-                "gh_id": i[1],
-                "fans": i[2],
-                "account_source_name": i[3],
-                "mode_type": i[4],
-                "account_type": i[5],
-                "status": i[6]
-            } for i in account_list
-        ]
-        return result_list
-
     def do_task_list(self, dt):
         """
         do it
         """
-        fans_dict = get_account_fans_by_dt(db_client=self.de)
-        account_list = self.get_publishing_accounts()
-        rate_dict = self.get_account_position_read_rate(dt)
+        # get fans dict from aigc
+        fans_dict = fetch_account_fans(self.denet_db_client, dt)
+
+        # get publishing account list from aigc
+        account_list = fetch_publishing_account_list(self.denet_db_client)
+
+        # fetch each account's read avg for each position
+        read_rate_avg_dict = self.fetch_read_rate_avg_for_each_account(dt)
+
         for account in tqdm(account_list, desc=dt):
             gh_id = account["gh_id"]
-            business_type = self.const.TOULIU if gh_id in touliu_accounts else self.const.ARTICLES_DAILY
-            fans = fans_dict.get(gh_id, {}).get(dt, 0)
+            business_type = const.TOULIU if gh_id in touliu_accounts else const.ARTICLES_DAILY
+            fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+
+            # use unauthorized account's fans if not found in aigc
+            if not fans:
+                fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
+
+            # use backup account's fans if not found in aigc
             if not fans:
-                fans = int(unauthorized_account.get(gh_id, 0))
+                fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
+
             if fans:
-                for index in range(1, 9):
+                for index in const.ARTICLE_INDEX_LIST:
                     gh_id_position = "{}_{}".format(gh_id, index)
-                    if rate_dict.get(gh_id_position):
-                        rate = rate_dict[gh_id_position]
-                        read_avg = fans * rate
-                        print(rate, read_avg)
+                    if read_rate_avg_dict.get(gh_id_position):
+                        # fetch read rate avg
+                        read_rate_avg = read_rate_avg_dict[gh_id_position]
+                        # cal read avg
+                        read_avg = fans * read_rate_avg
+
+                        # insert into database
                         insert_sql = f"""
-                        INSERT INTO account_avg_info_v3
-                        (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
-                        values
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                            insert into {read_avg_table}
+                            (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
+                            values
+                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                         """
                         try:
-                            self.pq.update(
-                                sql=insert_sql,
+                            self.piaoquan_crawler_db_client.save(
+                                query=insert_sql,
                                 params=(
                                     gh_id,
                                     index,
@@ -159,29 +112,29 @@ class UpdateAccountInfoVersion3(object):
                                     account['account_name'],
                                     fans,
                                     read_avg,
-                                    0,
-                                    1,
+                                    const.DEFAULT_LIKE,
+                                    const.USING_STATUS,
                                     account['account_type'],
                                     account['mode_type'],
-                                    account['account_source_name'],
+                                    account['account_source'],
                                     account['status'],
                                     business_type,
-                                    rate
+                                    read_rate_avg
                                 )
                             )
                         except Exception as e:
-                            updateSQL = f"""
-                            UPDATE account_avg_info_v3
-                            set fans = %s, read_avg = %s, read_rate_avg = %s
-                            where gh_id = %s and position = %s and update_time = %s
+                            update_sql = f"""
+                                update {read_avg_table}
+                                set fans = %s, read_avg = %s, read_rate_avg = %s
+                                where gh_id = %s and position = %s and update_time = %s
                             """
                             try:
-                                affected_rows = self.pq.update(
-                                    sql=updateSQL,
+                                self.piaoquan_crawler_db_client.save(
+                                    query=update_sql,
                                     params=(
                                         fans,
                                         read_avg,
-                                        rate,
+                                        read_rate_avg,
                                         account['gh_id'],
                                         index,
                                         dt
@@ -192,17 +145,16 @@ class UpdateAccountInfoVersion3(object):
 
                         # 修改前一天的状态为 0
                         update_status_sql = f"""
-                        UPDATE account_avg_info_v3
-                        SET status = %s
-                        where update_time != %s and gh_id = %s and position = %s;
+                            update {read_avg_table}
+                            set status = %s
+                            where update_time != %s and gh_id = %s and position = %s;
                         """
-                        rows_affected = self.pq.update(
-                            sql=update_status_sql,
+                        self.piaoquan_crawler_db_client.save(
+                            query=update_status_sql,
                             params=(
-                                0, dt, account['gh_id'], index
+                                const.NOT_USING_STATUS, dt, account['gh_id'], index
                             )
                         )
-                        print("修改成功")
 
 
 def main():
@@ -215,15 +167,15 @@ def main():
                         help="Run only once for date in format of %Y-%m-%d. \
                                 If no specified, run as daily jobs.")
     args = parser.parse_args()
-    Up = UpdateAccountInfoVersion3()
+    update_account_read_avg_task = UpdateAccountInfoVersion3()
     if args.run_date:
-        Up.do_task_list(dt=args.run_date)
+        update_account_read_avg_task.do_task_list(dt=args.run_date)
     else:
         dt_object = datetime.fromtimestamp(int(time.time()))
         one_day = timedelta(days=1)
         yesterday = dt_object - one_day
         yesterday_str = yesterday.strftime('%Y-%m-%d')
-        Up.do_task_list(dt=yesterday_str)
+        update_account_read_avg_task.do_task_list(dt=yesterday_str)
 
 
 if __name__ == '__main__':

Some files were not shown because too many files changed in this diff