
article cold start task improve

luojunhui, 1 month ago
Parent
Current commit
fa554f726c

+ 45 - 42
account_cold_start_daily.py

@@ -7,13 +7,48 @@ import traceback
 from argparse import ArgumentParser
 
 from applications import longArticlesMySQL, bot
-from cold_start.crawler.weixinCategoryCrawler import weixinCategory
+from tasks.crawler_tasks.crawler_articles import CrawlerDailyScrapeAccountArticles
+from tasks.crawler_tasks.crawler_articles import CrawlerAssociationAccountArticles
 from cold_start.publish.publishCategoryArticles import CategoryColdStartTask
 from cold_start.filter.title_similarity_task import ColdStartTitleSimilarityTask
 
-DEFAULT_CATEGORY_LIST = ['1030-手动挑号', 'account_association']
+DEFAULT_METHOD_LIST = ['account_association']
 
 
+def crawler_task(method_list, date_str):
+    """
+    :return:
+    """
+    # 初始化category抓取类
+    try:
+        daily_scrape_tasks = CrawlerDailyScrapeAccountArticles()
+        daily_scrape_tasks.deal(method_list=method_list)
+
+        association_scrape_tasks = CrawlerAssociationAccountArticles()
+        association_scrape_tasks.deal(date_str=date_str)
+
+        # 抓取完成之后,给抓取到的标题进行相似度打分
+        cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
+        cold_start_title_similarity_task.init_database()
+        cold_start_title_similarity_task.run(meta_source='article')
+
+        bot(
+            title="账号冷启动任务,抓取完成",
+            detail={
+                "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
+                "method": method_list
+            },
+            mention=False
+        )
+    except Exception as e:
+        bot(
+            title="账号抓取冷启动任务,抓取失败",
+            detail={
+                "error": str(e),
+                "error_msg": traceback.format_exc()
+            }
+        )
+
 
 class AccountColdStartDailyTask(object):
     """
@@ -43,37 +78,6 @@ class AccountColdStartDailyTask(object):
             )
             return False
 
-    def crawler_task(self, category_list, date_str):
-        """
-        :return:
-        """
-        # 初始化category抓取类
-        try:
-            weixin_category_crawler = weixinCategory(db_client=self.db_client)
-            weixin_category_crawler.deal(category_list=category_list, date_str=date_str)
-
-            # 抓取完成之后,给抓取到的标题进行相似度打分
-            cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
-            cold_start_title_similarity_task.init_database()
-            cold_start_title_similarity_task.run(meta_source='article')
-
-            bot(
-                title="账号冷启动任务,抓取完成",
-                detail={
-                    "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
-                    "category": category_list
-                },
-                mention=False
-            )
-        except Exception as e:
-            bot(
-                title="账号抓取冷启动任务,抓取失败",
-                detail={
-                    "error": str(e),
-                    "error_msg": traceback.format_exc()
-                }
-            )
-
     def publish_article_task(self, category_list, article_source):
         """
         将账号文章发布到aigc抓取计划,并且绑定生成计划
@@ -105,21 +109,18 @@ class AccountColdStartDailyTask(object):
             )
 
 
-def main(date_str, category_list=None, article_source=None):
+def main(date_str, method_list=None, article_source=None):
     """
     main job, use crontab to do job daily
     :return:
     """
-    if not category_list:
-        category_list = DEFAULT_CATEGORY_LIST
+    if not method_list:
+        method_list = DEFAULT_METHOD_LIST
     if not article_source:
         article_source = 'weixin'
     task = AccountColdStartDailyTask()
     if task.init_db():
-        task.publish_article_task(category_list=category_list, article_source=article_source)
-
-        if article_source == 'weixin':
-            task.crawler_task(category_list=category_list, date_str=date_str)
+        task.publish_article_task(category_list=method_list, article_source=article_source)
 
 
 if __name__ == '__main__':
@@ -135,11 +136,13 @@ if __name__ == '__main__':
     # 执行头条发布
     main(
         date_str=run_date,
-        category_list=['history', 'tech', 'finance', 'entertainment'],
+        method_list=['history', 'tech', 'finance', 'entertainment'],
         article_source='toutiao'
     )
-
     # 执行微信抓取发布
     main(date_str=run_date)
 
+    # 执行抓取
+    crawler_task(method_list=DEFAULT_METHOD_LIST, date_str=run_date)
 

+ 4 - 1
applications/pipeline/__init__.py

@@ -2,4 +2,7 @@
 @author: luojunhui
 """
 from .account_pipeline import scrape_account_entities_process
-from .crawler_pipeline import scrape_video_entities_process
+from .crawler_pipeline import scrape_article_entities_process
+from .crawler_pipeline import scrape_video_entities_process
+from .crawler_pipeline import whether_duplicate_article_title
+from .crawler_pipeline import whether_title_sensitive

+ 30 - 0
applications/pipeline/crawler_pipeline.py

@@ -86,3 +86,33 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
         return video_item
     else:
         return empty_dict
+
+
+def whether_duplicate_article_title(article_title: str, db_client) -> bool:
+    """
+    whether duplicate article title
+    """
+    sql = f"""
+        select article_id from crawler_meta_article
+        where title = %s;
+    """
+    duplicate_id = db_client.fetch(query=sql, params=(article_title,))
+    if duplicate_id:
+        return True
+
+    return False
+
+
+def scrape_article_entities_process(article_item, db_client) -> dict:
+    """
+    article crawler pipeline
+    """
+    article_title = article_item["article_title"]
+    if whether_title_sensitive(article_title):
+        article_item['title_sensitive'] = 1
+        return article_item
+
+    if whether_duplicate_article_title(article_title, db_client):
+        return empty_dict
+
+    return article_item
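
A minimal usage sketch of the new article pipeline entry point (not part of this commit). The stub DB client and the item shape are assumptions for illustration; the real caller passes a connected DatabaseConnector, and importing the pipeline assumes its sensitive-word config is reachable.

from applications.pipeline import scrape_article_entities_process

class StubDBClient:
    # stand-in for DatabaseConnector: pretend no duplicate title exists
    def fetch(self, query, params=None):
        return []

item = {"article_title": "示例标题"}
result = scrape_article_entities_process(item, StubDBClient())
# returns the item itself (with title_sensitive = 1 if the title hits a sensitive word);
# an empty dict means the title already exists in crawler_meta_article and should be skipped
print(result)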

+ 71 - 0
applications/utils/common.py

@@ -4,6 +4,7 @@
 
 import hashlib
 
+from datetime import datetime, timezone
 from requests import RequestException
 from urllib.parse import urlparse, parse_qs
 from tenacity import (
@@ -92,3 +93,73 @@ def extract_root_source_id(path: str) -> dict:
         return res
     else:
         return {}
+
+
+def show_desc_to_sta(show_desc):
+    """
+    parse a WeChat ShowDesc string into view/like/pay/reward counts
+    """
+    def decode_show_v(show_v):
+        """
+
+        :param show_v:
+        :return:
+        """
+        foo = show_v.replace('千', 'e3').replace('万', 'e4').replace('亿', 'e8')
+        foo = eval(foo)
+        return int(foo)
+
+    def decode_show_k(show_k):
+        """
+
+        :param show_k:
+        :return:
+        """
+        this_dict = {
+            '阅读': 'show_view_count',  # 文章
+            '看过': 'show_view_count',  # 图文
+            '观看': 'show_view_count',  # 视频
+            '赞': 'show_like_count',
+            '付费': 'show_pay_count',
+            '赞赏': 'show_zs_count',
+        }
+        if show_k not in this_dict:
+            print(f'error from decode_show_k, show_k not found: {show_k}')
+        return this_dict.get(show_k, 'show_unknown')
+
+    show_desc = show_desc.replace('+', '')
+    sta = {}
+    for show_kv in show_desc.split('\u2004\u2005'):
+        if not show_kv:
+            continue
+        show_k, show_v = show_kv.split('\u2006')
+        k = decode_show_k(show_k)
+        v = decode_show_v(show_v)
+        sta[k] = v
+    res = {
+        'show_view_count': sta.get('show_view_count', 0),
+        'show_like_count': sta.get('show_like_count', 0),
+        'show_pay_count': sta.get('show_pay_count', 0),
+        'show_zs_count': sta.get('show_zs_count', 0),
+    }
+    return res
+
+
+def generate_gzh_id(url):
+    """
+    build a stable md5 id from the biz/idx/sn parts of a gzh article url
+    """
+    biz = url.split("biz=")[1].split("&")[0]
+    idx = url.split("&idx=")[1].split("&")[0]
+    sn = url.split("&sn=")[1].split("&")[0]
+    url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
+    md5_hash = hashlib.md5()
+    md5_hash.update(url_bit)
+    md5_value = md5_hash.hexdigest()
+    return md5_value
+
+
+def timestamp_to_str(timestamp, string_format='%Y-%m-%d %H:%M:%S') -> str:
+    """
+    :param string_format:
+    :param timestamp:
+    """
+    dt_object = datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
+    date_string = dt_object.strftime(string_format)
+    return date_string
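
A quick, illustrative check of the new helpers (not part of this commit). The import path mirrors the one used by gzh_article_crawler.py below; the sample values are made up.

from applications.utils import show_desc_to_sta, generate_gzh_id, timestamp_to_str

desc = "阅读\u20061.2万\u2004\u2005赞\u2006500"
print(show_desc_to_sta(desc))
# {'show_view_count': 12000, 'show_like_count': 500, 'show_pay_count': 0, 'show_zs_count': 0}

url = "https://mp.weixin.qq.com/s?__biz=MzA3&mid=1&idx=2&sn=abc123&chksm=xyz"
print(generate_gzh_id(url))          # md5 of "MzA3-2-abc123"

print(timestamp_to_str(1717171200))  # '2024-06-01 00:00:00' when local time is UTC+8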

+ 0 - 1
applications/utils/item.py

@@ -2,7 +2,6 @@
 @author: luojunhui
 """
 
-
 import time
 
 default_single_video_table_fields = {

+ 0 - 277
cold_start/crawler/weixinCategoryCrawler.py

@@ -1,277 +0,0 @@
-"""
-@author: luojunhui
-抓取全局品类文章
-"""
-import json
-import time
-
-from tqdm import tqdm
-from pymysql.cursors import DictCursor
-
-from applications import WeixinSpider, Functions, log
-from cold_start.filter import article_crawler_duplicate_filter
-from config import apolloConfig
-
-# 常量
-ACCOUNT_GOOD_STATUS = 1
-
-# 账号是否每日抓取
-ACCOUNT_DAILY_SCRAPE = 1
-ACCOUNT_NOT_DAILY_SCRAPE = 0
-
-# 默认值
-DEFAULT_VIEW_COUNT = 0
-DEFAULT_LIKE_COUNT = 0
-DEFAULT_ARTICLE_STATUS = 1
-DEFAULT_TIMESTAMP = 1717171200
-
-# 标题sensitivity
-TITLE_SENSITIVE = 1
-TITLE_NOT_SENSITIVE = 0
-
-config = apolloConfig()
-sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
-
-
-def whether_title_sensitive(title: str) -> bool:
-    """
-    : param title:
-    判断视频是否的标题是否包含敏感词
-    """
-    for word in sensitive_word_list:
-        if word in title:
-            return True
-    return False
-
-
-class weixinCategory(object):
-    """
-    微信全局品类账号抓取
-    """
-
-    def __init__(self, db_client):
-        self.db_client_lam = db_client
-        self.spider = WeixinSpider()
-        self.function = Functions()
-
-    def get_account_list(self, account_category):
-        """
-        获取账号
-        :param account_category 品类
-        :return:
-        """
-        sql = f"""
-            select gh_id, account_source, account_name, account_category, latest_update_time
-            from long_articles_accounts 
-            where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS} and daily_scrape = {ACCOUNT_DAILY_SCRAPE};
-            """
-        account_tuple = self.db_client_lam.select(sql)
-        result = [
-            {
-                "gh_id": i[0],
-                "platform": i[1],
-                "account_name": i[2],
-                "category": i[3],
-                "latest_timestamp": i[4],
-            }
-            for i in account_tuple
-        ]
-        return result
-
-    def get_association_account_list(self, date_str):
-        """
-        获取账号联想的轮询账号
-        """
-        group_id = date_str[-1]
-        sql = f"""
-            select account_id, gh_id, account_name, latest_update_time
-            from long_articles_accounts
-            where account_category = 'account_association' and is_using = {ACCOUNT_DAILY_SCRAPE} and daily_scrape = {ACCOUNT_NOT_DAILY_SCRAPE};
-        """
-        account_list = self.db_client_lam.select(sql, cursor_type=DictCursor)
-        today_crawler_account_list = [i for i in account_list if str(i['account_id'])[-1] == group_id]
-        return today_crawler_account_list
-
-    def insert_data_into_db(self, gh_id, category, article_list):
-        """
-        将数据更新到数据库
-        :return:
-
-        """
-        success_records = []
-        for article_obj in article_list:
-            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
-            for obj in detail_article_list:
-                try:
-                    # 判断文章是否存在相同的标题
-                    if article_crawler_duplicate_filter(
-                            new_article_title=obj["Title"], db_client=self.db_client_lam
-                    ):
-                        log(
-                            function="weixinCategory",
-                            task="weixinCategory",
-                            message="文章去重",
-                            data={"title": obj["Title"]}
-                        )
-                        continue
-
-                    # 判断标题是否包含敏感词
-                    title_sensitivity = TITLE_SENSITIVE if whether_title_sensitive(obj["Title"]) else TITLE_NOT_SENSITIVE
-                    show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
-                    show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
-                    show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
-                    unique_idx = self.function.generateGzhId(obj["ContentUrl"])
-                    insert_sql = f"""
-                        insert into crawler_meta_article
-                        (
-                         platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
-                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
-                        )
-                        VALUES 
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                    """
-                    self.db_client_lam.update(
-                        sql=insert_sql,
-                        params=(
-                            "weixin",
-                            "account",
-                            category,
-                            gh_id,
-                            obj['ItemIndex'],
-                            obj["Title"],
-                            obj["ContentUrl"],
-                            show_view_count,
-                            show_like_count,
-                            obj["Digest"],
-                            obj["send_time"],
-                            int(time.time()),
-                            DEFAULT_ARTICLE_STATUS,
-                            unique_idx,
-                            obj.get("llm_sensitivity", -1),
-                            title_sensitivity
-                        ),
-                    )
-                    success_records.append({
-                        'unique_index': unique_idx, 'title': obj['Title']
-                    })
-                except Exception as e:
-                    print(e)
-        return success_records
-
-    def update_latest_account_timestamp(self, gh_id):
-        """
-        更新账号的最新时间戳
-        :return:
-        """
-        select_sql = f"""
-            SELECT publish_time 
-            From crawler_meta_article 
-            WHERE out_account_id = '{gh_id}'
-            ORDER BY publish_time DESC LIMIT 1;
-        """
-        result = self.db_client_lam.select(select_sql)
-        time_stamp = result[0][0]
-        dt_str = self.function.timestamp_to_str(time_stamp)
-        update_sql = f"""
-            update long_articles_accounts
-            set latest_update_time = %s
-            where gh_id = %s;
-        """
-        self.db_client_lam.update(sql=update_sql, params=(dt_str, gh_id))
-
-    def update_each_account(self, gh_id, category, latest_time_stamp, index=None):
-        """
-        更新账号文章
-        :return:
-        """
-        response = self.spider.update_msg_list(ghId=gh_id, index=index)
-        msg_list = response.get("data", {}).get("data")
-        if msg_list:
-            last_article_in_this_msg = msg_list[-1]
-            success_records = self.insert_data_into_db(
-                gh_id=gh_id, category=category, article_list=msg_list
-            )
-            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
-            if latest_time_stamp < last_time_stamp_in_this_msg:
-                next_cursor = response["data"]["next_cursor"]
-                return success_records + self.update_each_account(
-                    gh_id=gh_id,
-                    latest_time_stamp=latest_time_stamp,
-                    category=category,
-                    index=next_cursor,
-                )
-            else:
-                # 更新最近抓取时间
-                self.update_latest_account_timestamp(gh_id=gh_id)
-                print("账号时间更新成功")
-                return success_records
-        else:
-            print("No more data")
-            return []
-
-    def crawler_each_category(self, account_list, category):
-        """
-        抓取每个品类
-        :return:
-        """
-        success_records = []
-        for account in tqdm(account_list, desc="crawler_each_category"):
-            try:
-                gh_id = account['gh_id']
-                try:
-                    timestamp = int(account['latest_timestamp'].timestamp())
-                except Exception as e:
-                    timestamp = DEFAULT_TIMESTAMP
-                success_records += self.update_each_account(
-                    gh_id=gh_id,
-                    category=category,
-                    latest_time_stamp=timestamp
-                )
-                print("success")
-            except Exception as e:
-                print("fail because of {}".format(e))
-
-    def deal(self, category_list, date_str):
-        """
-        :param category_list:
-        :param date_str: YYYY-MM-DD
-        :return:
-        """
-        # daily 品类账号抓取
-        for category in category_list:
-            account_list = self.get_account_list(category)
-            self.crawler_each_category(account_list=account_list, category=category)
-
-        # 账号联想账号轮询抓取
-        association_account_list = self.get_association_account_list(date_str)
-        self.crawler_each_category(account_list=association_account_list, category="account_association")
-
-    def deal_accounts(self, account_list):
-        """
-        input account list
-        :param account_list: 具体账号抓取,只抓一页
-        :return:
-        """
-        account_tuple = tuple(account_list)
-        sql = f"""
-            SELECT gh_id, account_name, account_category, latest_update_time 
-            FROM long_articles_accounts
-            WHERE account_name in {account_tuple};
-        """
-        response = self.db_client_lam.select(sql)
-        for account in tqdm(response):
-            try:
-                gh_id = account[0]
-                category = account[2]
-                try:
-                    latest_timestamp = account[3].timestamp()
-                except Exception as e:
-                    print(e)
-                    latest_timestamp = DEFAULT_TIMESTAMP
-                self.update_each_account(
-                    gh_id=gh_id,
-                    category=category,
-                    latest_time_stamp=latest_timestamp
-                )
-            except Exception as e:
-                print(e)

+ 45 - 70
cold_start/publish/publishCategoryArticles.py

@@ -13,7 +13,7 @@ from applications import aiditApi, log, bot, llm_sensitivity
 from config import apolloConfig
 
 apollo = apolloConfig()
-DAILY_CRAWLER_MAX_NUM = 1000
+DAILY_CRAWLER_MAX_NUM = 100
 SIMILARITY_MIN_SCORE = 0.4
 TITLE_NOT_SENSITIVE = 0
 
@@ -86,14 +86,18 @@ class CategoryColdStartTask(object):
 
         """
         sql = f"""
-        SELECT 
-            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score, category_by_ai
-        FROM
-            crawler_meta_article
-        WHERE 
-            category = "{category}" and platform = "{article_source}" and title_sensitivity = {TITLE_NOT_SENSITIVE}
-        ORDER BY score DESC;
-        """
+            select 
+                article_id, title, link,  llm_sensitivity, score, category_by_ai
+            from crawler_meta_article t1 
+            join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
+            where category = '{category}' 
+                and platform = '{article_source}' 
+                and title_sensitivity = {TITLE_NOT_SENSITIVE} 
+                and t1.status = {self.INIT_STATUS}
+                and t1.read_cnt / t2.read_avg >= {self.READ_TIMES_THRESHOLD}
+                and t1.read_cnt >= {self.READ_THRESHOLD}
+            ORDER BY score DESC;
+            """
         article_list = self.db_client.select(sql)
         log(
             task="category_publish_task",
@@ -105,8 +109,7 @@ class CategoryColdStartTask(object):
             }
         )
         article_df = DataFrame(article_list,
-                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status',
-                                        'llm_sensitivity', 'score', 'category_by_ai'])
+                               columns=['article_id', 'title', 'link', 'llm_sensitivity', 'score', 'category_by_ai'])
         return article_df
 
     def filter_each_category(self, category):
@@ -120,12 +123,9 @@ class CategoryColdStartTask(object):
             if title_list:
                 # update
                 update_sql = f"""
-                UPDATE 
-                    crawler_meta_article
-                SET
-                    status = %s
-                WHERE
-                    title in %s and status = %s;
+                update crawler_meta_article
+                set status = %s
+                where title in %s and status = %s;
                 """
                 affected_rows = self.db_client.update(
                     sql=update_sql,
@@ -164,55 +164,36 @@ class CategoryColdStartTask(object):
         :return:
         """
         update_sql = f"""
-                    UPDATE 
-                        crawler_meta_article
-                    SET
-                        status = %s
-                    WHERE
-                        article_id in %s and status = %s;
-                    """
+                    update crawler_meta_article
+                    set status = %s
+                    where article_id in %s and status = %s;
+        """
         affect_rows = self.db_client.update(
             sql=update_sql,
             params=(self.PUBLISHED_STATUS, tuple(article_id_list), self.INIT_STATUS)
         )
-        if affect_rows != len(article_id_list):
-            bot(
-                title="品类冷启任务中,出现更新状文章状态失败异常",
-                detail={
-                    "affected_rows": affect_rows,
-                    "task_rows": len(article_id_list)
-                }
-            )
+        # if affect_rows != len(article_id_list):
+        #     bot(
+        #         title="品类冷启任务中,出现更新状文章状态失败异常",
+        #         detail={
+        #             "affected_rows": affect_rows,
+        #             "task_rows": len(article_id_list)
+        #         }
+        #     )
 
     def filter_weixin_articles(self, articles_df, category):
         """
         微信抓取文章过滤漏斗
         """
-        articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
-        articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         total_length = articles_df.shape[0]
-        # 第0层过滤已经发布的文章
-        filter_df = articles_df[articles_df['status'] == self.INIT_STATUS]
-        length_level0 = filter_df.shape[0]
-
-        # 第一层漏斗通过阅读均值倍数过滤
-        filter_df = filter_df[filter_df['read_times'] >= self.READ_TIMES_THRESHOLD]
-        length_level1 = filter_df.shape[0]
-
-        # 第二层漏斗通过阅读量过滤
-        filter_df = filter_df[
-            filter_df['read_cnt'] >= self.READ_THRESHOLD
-            ]
-        length_level2 = filter_df.shape[0]
-
-        # 第三层漏斗通过标题长度过滤
-        filter_df = filter_df[
-            (filter_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
-            & (filter_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
+        # 第1层漏斗通过标题长度过滤
+        filter_df = articles_df[
+            (articles_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
+            & (articles_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
             ]
-        length_level3 = filter_df.shape[0]
+        length_level1 = filter_df.shape[0]
 
-        # 第层通过敏感词过滤
+        # 第2层通过敏感词过滤
         filter_df = filter_df[
             (~filter_df['title'].str.contains('农历'))
             & (~filter_df['title'].str.contains('太极'))
@@ -227,23 +208,23 @@ class CategoryColdStartTask(object):
             & (~filter_df['title'].str.contains('蔡英文'))
             & (~filter_df['title'].str.contains('中国'))
             ]
-        length_level4 = filter_df.shape[0]
-        # 第层通过LLM敏感度过滤
+        length_level2 = filter_df.shape[0]
+        # 第3层通过LLM敏感度过滤
         filter_df = filter_df[
             ~(filter_df['llm_sensitivity'] > 0)
         ]
-        length_level5 = filter_df.shape[0]
+        length_level3 = filter_df.shape[0]
 
-        # 第层通过相关性分数过滤
+        # 第4层通过相关性分数过滤
         filter_df = filter_df[filter_df['score'] > SIMILARITY_MIN_SCORE]
-        length_level6 = filter_df.shape[0]
+        length_level4 = filter_df.shape[0]
 
         log(
             task="category_publish_task",
             function="publish_filter_articles",
             message="过滤后文章总数",
             data={
-                "total_articles": length_level5,
+                "total_articles": length_level4,
                 "category": category
             }
         )
@@ -251,21 +232,15 @@ class CategoryColdStartTask(object):
             title="冷启任务发布通知",
             detail={
                 "总文章数量": total_length,
-                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(
-                    total_length - length_level0, length_level0),
-                "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level0 - length_level1, length_level1),
-                "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level1 - length_level2, length_level2),
                 "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level2 - length_level3, length_level3),
+                    total_length - length_level1, length_level1),
                 "通过敏感词过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level3 - length_level4, length_level4),
+                    length_level1 - length_level2, length_level2),
                 "通过LLM敏感度过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level4 - length_level5, length_level5
+                    length_level2 - length_level3, length_level3
                 ),
                 "通过相关性分数过滤": "过滤数量: {}    剩余数量: {}".format(
-                    length_level5 - length_level6, length_level6
+                    length_level3 - length_level4, length_level4
                 ),
                 "品类": category,
                 "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,

+ 2 - 1
requirements.txt

@@ -27,4 +27,5 @@ scikit-learn~=1.6.1
 google~=3.0.0
 cffi~=1.17.1
 lxml~=5.3.2
-scipy~=1.15.2
+scipy~=1.15.2
+pydantic~=2.10.6

+ 2 - 0
tasks/crawler_tasks/crawler_articles/__init__.py

@@ -0,0 +1,2 @@
+from .gzh_article_crawler import CrawlerAssociationAccountArticles
+from .gzh_article_crawler import CrawlerDailyScrapeAccountArticles

+ 275 - 0
tasks/crawler_tasks/crawler_articles/gzh_article_crawler.py

@@ -0,0 +1,275 @@
+"""
+@author: luojunhui
+抓取全局品类文章
+"""
+
+import datetime
+import time
+import traceback
+from typing import Dict, List
+
+from tqdm import tqdm
+from pymysql.cursors import DictCursor
+
+from applications.db import DatabaseConnector
+from applications.pipeline import (
+    whether_title_sensitive,
+    whether_duplicate_article_title,
+)
+from applications.utils import show_desc_to_sta, generate_gzh_id, timestamp_to_str
+from cold_start.crawler.wechat import get_article_list_from_account
+from config import long_articles_config
+
+
+class Const:
+    ACCOUNT_GOOD_STATUS = 1
+
+    # 账号是否每日抓取
+    ACCOUNT_DAILY_SCRAPE = 1
+    ACCOUNT_NOT_DAILY_SCRAPE = 0
+
+    # 默认值
+    DEFAULT_VIEW_COUNT = 0
+    DEFAULT_LIKE_COUNT = 0
+    DEFAULT_ARTICLE_STATUS = 1
+    DEFAULT_TIMESTAMP = 1717171200
+
+    # 标题sensitivity
+    TITLE_SENSITIVE = 1
+    TITLE_NOT_SENSITIVE = 0
+
+
+class GzhArticleCrawler(Const):
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+
+    def get_latest_timestamp(self, account: dict) -> int:
+        try:
+            timestamp = int(account["latest_update_time"].timestamp())
+        except Exception as e:
+            timestamp = self.DEFAULT_TIMESTAMP
+        return timestamp
+
+    def insert_article_into_meta(self, gh_id, account_mode, article_list):
+        """
+        将数据更新到数据库
+        :return:
+        """
+        for article_obj in article_list:
+            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
+            for obj in detail_article_list:
+                try:
+                    if whether_duplicate_article_title(obj["Title"], self.db_client):
+                        continue
+
+                    # 判断标题是否包含敏感词
+                    title_sensitivity = (
+                        self.TITLE_SENSITIVE
+                        if whether_title_sensitive(obj["Title"])
+                        else self.TITLE_NOT_SENSITIVE
+                    )
+                    show_stat = show_desc_to_sta(obj["ShowDesc"])
+                    show_view_count = show_stat.get(
+                        "show_view_count", self.DEFAULT_VIEW_COUNT
+                    )
+                    show_like_count = show_stat.get(
+                        "show_like_count", self.DEFAULT_LIKE_COUNT
+                    )
+                    unique_idx = generate_gzh_id(obj["ContentUrl"])
+                    insert_sql = f"""
+                        insert into crawler_meta_article
+                        (
+                         platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt,
+                         description, publish_time, crawler_time, status, unique_index, llm_sensitivity, title_sensitivity
+                        )
+                        VALUES 
+                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                    """
+                    self.db_client.save(
+                        query=insert_sql,
+                        params=(
+                            "weixin",
+                            "account",
+                            account_mode,
+                            gh_id,
+                            obj["ItemIndex"],
+                            obj["Title"],
+                            obj["ContentUrl"],
+                            show_view_count,
+                            show_like_count,
+                            obj["Digest"],
+                            obj["send_time"],
+                            int(time.time()),
+                            self.DEFAULT_ARTICLE_STATUS,
+                            unique_idx,
+                            obj.get("llm_sensitivity", -1),
+                            title_sensitivity,
+                        ),
+                    )
+                except Exception as e:
+                    print(e)
+
+    def update_latest_account_timestamp(self, gh_id):
+        """
+        更新账号的最新时间戳
+        :return:
+        """
+        select_sql = f"""
+            SELECT publish_time 
+            From crawler_meta_article 
+            WHERE out_account_id = '{gh_id}'
+            ORDER BY publish_time DESC LIMIT 1;
+        """
+        result = self.db_client.fetch(select_sql)
+        time_stamp = result[0][0]
+        dt_str = timestamp_to_str(time_stamp)
+        update_sql = f"""
+            update long_articles_accounts
+            set latest_update_time = %s
+            where gh_id = %s;
+        """
+        self.db_client.save(query=update_sql, params=(dt_str, gh_id))
+
+    def crawl_each_account(self, gh_id, account_mode, latest_time_stamp):
+        """
+        更新账号文章
+        :return:
+        """
+        current_cursor = None
+        while True:
+            # fetch response from weixin
+            response = get_article_list_from_account(
+                account_id=gh_id, index=current_cursor
+            )
+            print(response)
+            msg_list = response.get("data", {}).get("data")
+            if not msg_list:
+                break
+
+            # process current page
+            self.insert_article_into_meta(gh_id, account_mode, msg_list)
+
+            # whether crawl next page
+            last_article_in_this_page = msg_list[-1]
+            last_time_stamp_in_this_msg = last_article_in_this_page["AppMsg"][
+                "BaseInfo"
+            ]["UpdateTime"]
+            if last_time_stamp_in_this_msg > latest_time_stamp:
+                self.update_latest_account_timestamp(gh_id)
+                break
+
+            # update cursor for next page
+            current_cursor = response.get("data", {}).get("next_cursor")
+            if not current_cursor:
+                break
+
+    def crawl_account_list(self, account_list, account_method):
+        for account in tqdm(account_list):
+            try:
+                gh_id = account["gh_id"]
+                account_name = account["account_name"]
+                latest_timestamp = self.get_latest_timestamp(account)
+                self.crawl_each_account(gh_id, account_method, latest_timestamp)
+                self.update_account_read_avg_info(gh_id, account_name)
+
+            except Exception as e:
+                print(f"fail because of {e}")
+                print(traceback.format_exc())
+
+    def update_account_read_avg_info(self, gh_id, account_name):
+        """
+        calculate the per-position read average for the account (last 30 articles per position)
+        """
+        position_list = [i for i in range(1, 9)]
+        today_dt = datetime.date.today().isoformat()
+        for position in position_list:
+            fetch_query = f"""
+                select read_cnt, from_unixtime(publish_time, "%Y-%m-%d") as publish_dt from crawler_meta_article
+                where out_account_id = '{gh_id}' and article_index = {position}
+                order by publish_time desc limit 30;
+            """
+            fetch_response = self.db_client.fetch(fetch_query, cursor_type=DictCursor)
+            if fetch_response:
+                read_cnt_list = [i["read_cnt"] for i in fetch_response]
+                n = len(read_cnt_list)
+                read_avg = sum(read_cnt_list) / n
+                max_publish_dt = fetch_response[0]["publish_dt"]
+                remark = f"从{max_publish_dt}开始计算,往前算{len(fetch_response)}天"
+                insert_query = f"""
+                    insert ignore into 
+                        crawler_meta_article_accounts_read_avg
+                        (gh_id, account_name, position, read_avg, dt, status, remark)
+                        values
+                        (%s, %s, %s, %s, %s, %s, %s);
+                """
+                insert_rows = self.db_client.save(
+                    insert_query,
+                    params=(
+                        gh_id,
+                        account_name,
+                        position,
+                        read_avg,
+                        today_dt,
+                        1,
+                        remark,
+                    ),
+                )
+                if insert_rows:
+                    update_query = f"""
+                        update crawler_meta_article_accounts_read_avg
+                        set status = %s 
+                        where gh_id = %s and position = %s and dt < %s;
+                    """
+                    self.db_client.save(update_query, (0, gh_id, position, today_dt))
+
+
+class CrawlerDailyScrapeAccountArticles(GzhArticleCrawler):
+
+    def get_account_list(self, account_method: str) -> List[Dict]:
+        """
+        获取账号
+        :param account_method:
+        :return:
+        """
+        query = f"""
+            select gh_id, account_source, account_name, account_category, latest_update_time
+            from long_articles_accounts 
+            where account_category = '{account_method}' and is_using = {self.ACCOUNT_GOOD_STATUS} and daily_scrape = {self.ACCOUNT_DAILY_SCRAPE};
+        """
+        account_list = self.db_client.fetch(query, cursor_type=DictCursor)
+        return account_list
+
+    def deal(self, method_list):
+        """
+        :param method_list:
+        :return:
+        """
+        # daily 品类账号抓取
+        for account_method in method_list:
+            account_list = self.get_account_list(account_method)
+            self.crawl_account_list(account_list, account_method)
+
+
+class CrawlerAssociationAccountArticles(GzhArticleCrawler):
+
+    def get_association_account_list(self, date_str):
+        """
+        获取账号联想的轮询账号
+        """
+        group_id = date_str[-1]
+        query = f"""
+            select account_id, gh_id, account_name, latest_update_time
+            from long_articles_accounts
+            where account_category = 'account_association' and is_using = {self.ACCOUNT_DAILY_SCRAPE} and daily_scrape = {self.ACCOUNT_NOT_DAILY_SCRAPE};
+        """
+        account_list = self.db_client.fetch(query, cursor_type=DictCursor)
+        today_crawler_account_list = [
+            i for i in account_list if str(i["account_id"])[-1] == group_id
+        ]
+        return today_crawler_account_list
+
+    def deal(self, date_str):
+        account_list = self.get_association_account_list(date_str)
+        self.crawl_account_list(account_list, "account_association")