Преглед изворни кода

article_association_crawler

luojunhui пре 4 месеци
родитељ
комит
fda7a1cbca

+ 27 - 1
applications/const/__init__.py

@@ -4,7 +4,7 @@
 """
 
 
-class coldStartTaskConst:
+class ColdStartTaskConst:
     """
     冷启动任务常量配置
     """
@@ -12,6 +12,32 @@ class coldStartTaskConst:
     INIT_STATUS = 1  # 文章初始状态
     BAD_STATUS = 0  # 低质量文章状态
 
+    # 常量
+    ACCOUNT_GOOD_STATUS = 1
+
+    # 账号是否每日抓取
+    ACCOUNT_DAILY_SCRAPE = 1
+    ACCOUNT_NOT_DAILY_SCRAPE = 0
+
+    # 默认值
+    DEFAULT_VIEW_COUNT = 0
+    DEFAULT_LIKE_COUNT = 0
+    DEFAULT_ARTICLE_STATUS = 1
+    DEFAULT_TIMESTAMP = 1717171200
+
+    # title sensitivity flags
+    TITLE_SENSITIVE = 1
+    TITLE_NOT_SENSITIVE = 0
+
+    # 文章联想深度
+    ARTICLE_ASSOCIATION_MAX_DEPTH = 4
+
+    # 相关分百分位阈值
+    PERCENT_THRESHOLD = 95
+
+    # 相关性分阈值
+    CORRELATION_THRESHOLD = 0.5
+
 
 class updatePublishedMsgTaskConst:
     """

+ 4 - 0
applications/utils/__init__.py

@@ -0,0 +1,4 @@
+"""
+utils
+"""
+from .cold_start import *

+ 30 - 0
applications/utils/cold_start.py

@@ -0,0 +1,30 @@
+"""
+@author: luojunhui
+"""
+import json
+
+from applications import aiditApi
+from config import apolloConfig
+
+config = apolloConfig()
+sensitive_word_list = json.loads(config.getConfigValue("sensitive_word_list"))
+
+
def whether_title_sensitive(title: str) -> bool:
    """
    Check whether the title contains any configured sensitive word.

    :param title: article title to check
    :return: True if any word from ``sensitive_word_list`` appears in the title
    """
    # substring match against the Apollo-configured word list
    return any(word in title for word in sensitive_word_list)
+
+
def get_inner_account_set() -> set:
    """
    Return the set of gh_ids for every publish account registered in AIGC.
    """
    publish_accounts = aiditApi.get_publish_account_from_aigc()
    return {account['ghId'] for account in publish_accounts}

+ 23 - 0
applications/wxSpiderApi.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 import json
+import time
 import requests
 
 from applications.decoratorApi import retryOnNone
@@ -93,4 +94,26 @@ class WeixinSpider(object):
             {"content_link": content_link}
         )
         response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
+        response_json = response.json()
+        if response_json['code'] != 0:
+            return cls.get_recommend_articles(content_link)
+        time.sleep(3)
         return response.json()
+
+    @classmethod
+    def get_recommend_articles_v2(cls, content_link) -> dict:
+        """
+        use content link to get recommend articles
+        :param content_link:
+        :return:
+        """
+        url = 'http://datapi.top/wxapi/relatedarticle'
+        payload = json.dumps(
+            {
+                "content_link": content_link,
+                "token": "401e4d3c85068bb5"
+            }
+        )
+        response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
+        time.sleep(3)
+        return response.json()

+ 9 - 0
article_association_task.py

@@ -0,0 +1,9 @@
+"""
+@author: luojunhui
+"""
+from coldStartTasks.crawler.wechat import ArticleAssociationCrawler
+
+
if __name__ == '__main__':
    # entry point: run the article-association crawler once
    crawler_task = ArticleAssociationCrawler()
    crawler_task.deal()

+ 4 - 0
coldStartTasks/crawler/wechat/__init__.py

@@ -0,0 +1,4 @@
+"""
+@author: luojunhui
+"""
+from .article_association import ArticleAssociationCrawler

+ 179 - 0
coldStartTasks/crawler/wechat/article_association.py

@@ -0,0 +1,179 @@
+"""
+@author: luojunhui
+"""
+import time
+import numpy as np
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+
+from applications import WeixinSpider
+from applications.api import similarity_between_title_list
+from applications.const import ColdStartTaskConst
+from applications.db import DatabaseConnector
+from applications.functions import Functions
+from applications.utils import get_inner_account_set
+from applications.utils import whether_title_sensitive
+from config import long_articles_config
+
+spider = WeixinSpider()
+functions = Functions()
+const = ColdStartTaskConst()
+
+
class ArticleAssociationCrawler(object):
    """
    Article-association crawler.

    Starting from high-performing seed articles, recursively fetch WeChat
    "related article" recommendations, score each candidate title against the
    promoted-title base list, and persist candidates whose similarity passes
    the configured threshold into ``crawler_meta_article``.
    """

    def __init__(self):
        # connection to the long_articles database
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        # gh_ids of our own accounts; their articles must not be re-crawled
        self.inner_account_set = get_inner_account_set()

    def get_seed_url_list(self):
        """
        Fetch seed articles: recent, well-read items with a high read rate.

        :return: list of dicts with keys ``gh_id``, ``title``, ``link``
        """
        # plain string: no interpolation needed, so no f-string
        sql = """
            select gh_id, title, link
            from datastat_sort_strategy
            where date_str > '20250220' and view_count > 1000 and read_rate > 1.3 and type = 9
            order by read_rate desc limit 30;
        """
        return self.db_client.fetch(query=sql, cursor_type=DictCursor)

    def get_level_up_title_list(self):
        """
        Fetch titles of promoted ("level up") articles used as the similarity base.

        status = 1: source tracing finished
        deleted = 0: article is live
        level = 'autoArticlePoolLevel1': top-position articles

        :return: list of distinct titles
        """
        sql = """
        select distinct title from article_pool_promotion_source where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
        """
        rows = self.db_client.fetch(query=sql)
        return [row[0] for row in rows]

    def get_recommend_url_list_with_depth(self, seed_url, source_title, source_account, base_title_list, depth=1):
        """
        Recursively crawl recommended articles, up to ARTICLE_ASSOCIATION_MAX_DEPTH.

        @param seed_url: good url from datastat_sort_strategy (or a previous level)
        @param source_title: title of the article that recommended this one
        @param source_account: gh_id of the recommending account
        @param base_title_list: promoted titles used as the similarity base
        @param depth: current association depth (1-based)
        """
        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
            return

        res = spider.get_recommend_articles(content_link=seed_url)
        related_articles = res['data']['data']['list']
        if not related_articles:
            return

        title_list = [i['title'] for i in related_articles]
        similarity_array = similarity_between_title_list(title_list, base_title_list)

        # keep only candidates whose high-percentile similarity passes the threshold
        recommend_articles = []
        for index, score_list in enumerate(similarity_array):
            # np.percentile does not require sorted input; pre-sorting was redundant
            percent_threshold_score = np.percentile(score_list, const.PERCENT_THRESHOLD)
            if percent_threshold_score < const.CORRELATION_THRESHOLD:
                continue
            article_obj = related_articles[index]
            article_obj['score'] = percent_threshold_score
            recommend_articles.append(article_obj)

        recommend_process_bar = tqdm(recommend_articles, desc="save recommend articles")
        for article in recommend_process_bar:
            obj = {
                "title": article['title'],
                "url": article['url'],
                "gh_id": article['username'],
                "index": article['idx'],
                "send_time": article['send_time'],
                "read_cnt": article['read_num'],
                "depth": depth,
                "source_article_title": source_title,
                "source_account": source_account,
            }
            self.insert_recommend_article(obj)
            recommend_process_bar.set_postfix({"title": article['title'], "depth": depth})
            # recurse one level deeper with this candidate as the new seed
            self.get_recommend_url_list_with_depth(
                seed_url=obj["url"],
                source_title=obj["title"],
                source_account=obj["gh_id"],
                base_title_list=base_title_list,
                depth=depth + 1
            )

    def insert_recommend_article(self, obj):
        """
        Insert one recommended article into crawler_meta_article.

        Skips silently when the title already exists or the article belongs to
        one of our own (inner) accounts.

        :param obj: dict built in get_recommend_url_list_with_depth
        """
        # dedup: skip if an article with the same title was already crawled
        title = obj['title']
        select_sql = "select article_id from crawler_meta_article where title = %s;"
        if self.db_client.fetch(query=select_sql, params=(title,)):
            return

        # skip articles published by our own accounts
        if obj['gh_id'] in self.inner_account_set:
            return

        # mark whether the title contains a sensitive word
        title_sensitivity = const.TITLE_SENSITIVE if whether_title_sensitive(title) else const.TITLE_NOT_SENSITIVE

        insert_sql = """
            insert into crawler_meta_article 
            (platform, mode, category, out_account_id, article_index, title, link, read_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account, title_sensitivity)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "weixin",
                "recommend",
                "article_association",
                obj["gh_id"],
                obj["index"],
                obj["title"],
                obj["url"],
                obj["read_cnt"],
                obj["send_time"],
                int(time.time()),
                const.DEFAULT_ARTICLE_STATUS,
                functions.generateGzhId(obj["url"]),
                obj['source_article_title'],
                obj['source_account'],
                title_sensitivity
            )
        )

    def deal(self):
        """
        Class entrance: crawl associations for every seed article.

        Failures on one seed are logged and do not stop the remaining seeds.
        """
        seed_article_list = self.get_seed_url_list()
        deal_bar = tqdm(seed_article_list, desc="article association crawler")
        base_title_list = self.get_level_up_title_list()
        for article in deal_bar:
            try:
                self.get_recommend_url_list_with_depth(
                    seed_url=article["link"],
                    source_title=article["title"],
                    source_account=article["gh_id"],
                    base_title_list=base_title_list
                )
                deal_bar.set_postfix({"article_title": article["title"]})
            except Exception as e:
                # best-effort: report the failing seed and move on
                print(e)
                print(article)
                continue