
article_association_crawler

luojunhui · 3 months ago · commit a68a939b93

+ 12 - 0
applications/const/__init__.py

@@ -38,6 +38,18 @@ class ColdStartTaskConst:
     # correlation score threshold
     CORRELATION_THRESHOLD = 0.5
 
+    # read count threshold
+    READ_COUNT_THRESHOLD = 1000
+
+    # read-rate threshold (multiple of the account's average reads)
+    READ_AVG_THRESHOLD = 1.3
+
+    # bulk-publish message type
+    BULK_PUBLISH_TYPE = 9
+
+    # seed article count limit
+    SEED_ARTICLE_LIMIT_NUM = 30
+
 
 class updatePublishedMsgTaskConst:
     """

+ 40 - 37
applications/wxSpiderApi.py

@@ -1,11 +1,12 @@
 """
 @author: luojunhui
 """
+
 import json
 import time
 import requests
 
-from applications import log
+from applications.aliyunLogApi import log
 from applications.decoratorApi import retryOnNone
 
 
@@ -13,13 +14,12 @@ class WeixinSpider(object):
     """
     Update account articles
     """
+
     # ip = "8.217.190.241"
     # ip = "47.98.154.124"
     # port = "8888"
     base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
-    headers = {
-        "Content-Type": "application/json"
-    }
+    headers = {"Content-Type": "application/json"}
 
     @classmethod
     @retryOnNone()
@@ -29,11 +29,10 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/keyword".format(cls.base_url)
-        payload = json.dumps({
-            "keyword": title,
-            "cursor": page
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps({"keyword": title, "cursor": page})
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -47,13 +46,17 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/detail".format(cls.base_url)
-        payload = json.dumps({
-            "content_link": content_link,
-            "is_count": is_count,
-            "is_ad": False,
-            "is_cache": is_cache
-        })
-        response = requests.request("POST", url, headers=cls.headers, data=payload, timeout=120)
+        payload = json.dumps(
+            {
+                "content_link": content_link,
+                "is_count": is_count,
+                "is_ad": False,
+                "is_cache": is_cache,
+            }
+        )
+        response = requests.request(
+            "POST", url, headers=cls.headers, data=payload, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -62,12 +65,14 @@ class WeixinSpider(object):
         """
         :return:
         """
-        url = '{}/blogger'.format(cls.base_url)
+        url = "{}/blogger".format(cls.base_url)
         payload = {
-            'account_id': ghId,
-            'cursor': index,
+            "account_id": ghId,
+            "cursor": index,
         }
-        response = requests.post(url=url, headers=cls.headers, data=json.dumps(payload), timeout=120)
+        response = requests.post(
+            url=url, headers=cls.headers, data=json.dumps(payload), timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -78,9 +83,11 @@ class WeixinSpider(object):
         :param content_url:
         :return:
         """
-        url = '{}/account_info'.format(cls.base_url)
+        url = "{}/account_info".format(cls.base_url)
         data = {"content_link": content_url}
-        response = requests.request("POST", url=url, headers=cls.headers, json=data, timeout=120)
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, json=data, timeout=120
+        )
         return response.json()
 
     @classmethod
@@ -91,12 +98,12 @@ class WeixinSpider(object):
         :return:
         """
         url = "{}/recommend".format(cls.base_url)
-        payload = json.dumps(
-            {"content_link": content_link}
+        payload = json.dumps({"content_link": content_link})
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, data=payload, timeout=120
         )
-        response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
         response_json = response.json()
-        if response_json['code'] != 0:
+        if response_json["code"] != 0:
             return cls.get_recommend_articles(content_link)
         time.sleep(3)
         return response.json()
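
Reviewer note: as committed, this method retries itself recursively whenever code != 0, with no depth bound, and parses the body twice (response_json, then a second response.json()). A minimal iterative sketch of a bounded variant; the function name and max_retries are illustrative, not part of this commit:

    import json
    import time
    import requests

    def get_recommend_articles_bounded(base_url, headers, content_link, max_retries=3):
        """Bounded retry on non-zero code; returns None when retries are exhausted."""
        url = "{}/recommend".format(base_url)
        payload = json.dumps({"content_link": content_link})
        for _ in range(max_retries):
            response = requests.post(url, headers=headers, data=payload, timeout=120)
            response_json = response.json()  # parse once, reuse below
            time.sleep(3)  # pace both retries and the success path, as upstream does
            if response_json["code"] == 0:
                return response_json
        return None  # a None return lets a @retryOnNone-style wrapper take over
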
@@ -108,22 +115,18 @@ class WeixinSpider(object):
         :param content_link:
         :return:
         """
-        url = 'http://datapi.top/wxapi/relatedarticle'
+        url = "http://datapi.top/wxapi/relatedarticle"
         payload = json.dumps(
-            {
-                "content_link": content_link,
-                "token": "401e4d3c85068bb5"
-            }
+            {"content_link": content_link, "token": "401e4d3c85068bb5"}
+        )
+        response = requests.request(
+            "POST", url=url, headers=cls.headers, data=payload, timeout=120
         )
-        response = requests.request("POST", url=url, headers=cls.headers, data=payload, timeout=120)
         log(
             task="article_association_crawler",
             function="get_recommend_articles_v2",
+            message="fetch recommended links via paid API",
-            data={
-                "content_link": content_link,
-                "response": response.json()
-            }
+            data={"content_link": content_link, "response": response.json()},
         )
         time.sleep(3)
-        return response.json()
+        return response.json()
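
For reference, a minimal usage sketch of the reformatted client; the methods are classmethods, and the content link below is a placeholder:

    from applications.wxSpiderApi import WeixinSpider

    # no instance needed; placeholder mp.weixin.qq.com link
    res = WeixinSpider.get_recommend_articles(
        content_link="https://mp.weixin.qq.com/s/xxxx"
    )
    related = res["data"]["data"]["list"]  # the shape article_association.py consumes
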

+ 20 - 2
article_association_task.py

@@ -1,9 +1,27 @@
 """
 @author: luojunhui
 """
+from argparse import ArgumentParser
+
 from coldStartTasks.crawler.wechat import ArticleAssociationCrawler
 
 
-if __name__ == '__main__':
+def main():
+    """
+    main function
+    """
+    parser = ArgumentParser()
+    parser.add_argument("--biz_date", type=str, help="business date, YYYY-MM-DD, e.g. 2025-01-01")
+    args = parser.parse_args()
+
+    # argparse leaves biz_date as None when the flag is omitted; normalize "" too
+    biz_date = args.biz_date or None
+
     article_association_crawler = ArticleAssociationCrawler()
-    article_association_crawler.deal()
+    article_association_crawler.deal(biz_date=biz_date)
+
+
+if __name__ == "__main__":
+    main()
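
The new entry point makes the business date injectable for backfills (python article_association_task.py --biz_date 2025-01-01); omitting the flag leaves biz_date as None, and deal() falls back to today. Programmatic use mirrors the CLI:

    from coldStartTasks.crawler.wechat import ArticleAssociationCrawler

    crawler = ArticleAssociationCrawler()
    crawler.deal(biz_date="2025-01-01")  # or biz_date=None to default to today
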

+ 61 - 32
coldStartTasks/crawler/wechat/article_association.py

@@ -1,14 +1,18 @@
 """
 @author: luojunhui
 """
+
 import time
+import traceback
+from datetime import datetime
+
 import numpy as np
 
 from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 
-from applications import WeixinSpider
+from applications import WeixinSpider, log
 from applications.api import similarity_between_title_list
 from applications.const import ColdStartTaskConst
 from applications.db import DatabaseConnector
@@ -32,15 +36,19 @@ class ArticleAssociationCrawler(object):
         self.db_client.connect()
         self.inner_account_set = get_inner_account_set()
 
-    def get_seed_url_list(self):
+    def get_seed_url_list(self, biz_date):
         """
         fetch the seed url list
         """
         sql = f"""
             select gh_id, title, link
             from datastat_sort_strategy
-            where date_str > '20250220' and view_count > 1000 and read_rate > 1.3 and type = 9
-            order by read_rate desc limit 30;
+            where date_str > DATE_FORMAT(DATE_SUB('{biz_date}', INTERVAL 2 DAY), '%Y%m%d') 
+                and view_count > {const.READ_COUNT_THRESHOLD} 
+                and read_rate > {const.READ_AVG_THRESHOLD} 
+                and type = {const.BULK_PUBLISH_TYPE}
+            order by read_rate desc 
+            limit {const.SEED_ARTICLE_LIMIT_NUM};
         """
         seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return seed_article_list
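
The hardcoded '20250220' cutoff becomes a rolling two-day window anchored on biz_date. A sketch of the same cutoff computed in Python; note that biz_date is interpolated into the SQL via an f-string, which is acceptable for this internal CLI argument but would need parameterization for untrusted input:

    from datetime import datetime, timedelta

    def seed_window_start(biz_date: str) -> str:
        """Mirror of DATE_FORMAT(DATE_SUB(biz_date, INTERVAL 2 DAY), '%Y%m%d')."""
        dt = datetime.strptime(biz_date, "%Y-%m-%d") - timedelta(days=2)
        return dt.strftime("%Y%m%d")

    print(seed_window_start("2025-01-01"))  # 20241230
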
@@ -59,7 +67,9 @@ class ArticleAssociationCrawler(object):
         title_list = [i[0] for i in mysql_response]
         return title_list
 
-    def get_recommend_url_list_with_depth(self, seed_url, source_title, source_account, base_title_list, depth=1):
+    def get_recommend_url_list_with_depth(
+        self, seed_url, source_title, source_account, base_title_list, depth=1
+    ):
         """
         @param seed_url: a high-performing url from datastat_sort_strategy
         @param depth: association depth
@@ -70,44 +80,52 @@ class ArticleAssociationCrawler(object):
             return
 
         res = spider.get_recommend_articles(content_link=seed_url)
-        related_articles = res['data']['data']['list']
+        related_articles = res["data"]["data"]["list"]
         if related_articles:
-            title_list = [i['title'] for i in related_articles]
-            similarity_array = similarity_between_title_list(title_list, base_title_list)
+            title_list = [i["title"] for i in related_articles]
+            similarity_array = similarity_between_title_list(
+                title_list, base_title_list
+            )
 
             recommend_articles = []
             for index, score_list in enumerate(similarity_array):
                 sorted_score_list = sorted(score_list)
-                percent_threshold_score = np.percentile(sorted_score_list, const.PERCENT_THRESHOLD)
+                percent_threshold_score = np.percentile(
+                    sorted_score_list, const.PERCENT_THRESHOLD
+                )
                 if percent_threshold_score < const.CORRELATION_THRESHOLD:
                     continue
 
                 else:
                     article_obj = related_articles[index]
-                    article_obj['score'] = percent_threshold_score
+                    article_obj["score"] = percent_threshold_score
                     recommend_articles.append(article_obj)
 
-            recommend_process_bar = tqdm(recommend_articles, desc="save recommend articles")
+            recommend_process_bar = tqdm(
+                recommend_articles, desc="save recommend articles"
+            )
             for article in recommend_process_bar:
                 obj = {
-                    "title": article['title'],
-                    "url": article['url'],
-                    "gh_id": article['username'],
-                    "index": article['idx'],
-                    "send_time": article['send_time'],
-                    "read_cnt": article['read_num'],
+                    "title": article["title"],
+                    "url": article["url"],
+                    "gh_id": article["username"],
+                    "index": article["idx"],
+                    "send_time": article["send_time"],
+                    "read_cnt": article["read_num"],
                     "depth": depth,
                     "source_article_title": source_title,
                     "source_account": source_account,
                 }
                 self.insert_recommend_article(obj)
-                recommend_process_bar.set_postfix({"title": article['title'], "depth": depth})
+                recommend_process_bar.set_postfix(
+                    {"title": article["title"], "depth": depth}
+                )
                 self.get_recommend_url_list_with_depth(
                     seed_url=obj["url"],
                     source_title=obj["title"],
                     source_account=obj["gh_id"],
                     base_title_list=base_title_list,
-                    depth=depth + 1
+                    depth=depth + 1,
                 )
         else:
             return
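
Each saved recommendation is re-expanded with depth + 1, and the guard at the top of the method (only its trailing return is visible in this hunk) stops the recursion once depth exceeds the configured limit. A self-contained sketch of the pattern, with a hypothetical fetcher standing in for the spider call plus similarity filtering, and the DEPTH_LIMIT value assumed:

    DEPTH_LIMIT = 2  # assumed value; the real bound lives in ColdStartTaskConst

    def fetch_recommendations(url):
        # hypothetical stand-in for spider.get_recommend_articles + filtering
        return [url + "/a", url + "/b"] if url.count("/") < 3 else []

    def crawl(url, depth=1):
        """Depth-guarded recursion mirroring get_recommend_url_list_with_depth."""
        if depth > DEPTH_LIMIT:
            return
        for child in fetch_recommendations(url):
            print(f"depth={depth} -> {child}")
            crawl(child, depth=depth + 1)

    crawl("seed")  # expands two levels; the guard cuts off depth 3
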
@@ -117,18 +135,22 @@ class ArticleAssociationCrawler(object):
         insert recommend article
         """
         # skip articles published by inner (our own) accounts
-        if obj['gh_id'] in self.inner_account_set:
+        if obj["gh_id"] in self.inner_account_set:
             return
 
         # skip when an article with this title already exists
-        title = obj['title']
+        title = obj["title"]
         select_sql = "select article_id from crawler_meta_article where title = %s;"
         res = self.db_client.fetch(query=select_sql, params=(title,))
         if res:
             return
 
         # check whether the title is sensitive
-        title_sensitivity = const.TITLE_SENSITIVE if whether_title_sensitive(title) else const.TITLE_NOT_SENSITIVE
+        title_sensitivity = (
+            const.TITLE_SENSITIVE
+            if whether_title_sensitive(title)
+            else const.TITLE_NOT_SENSITIVE
+        )
 
         # insert this article
         insert_sql = f"""
@@ -151,17 +173,21 @@ class ArticleAssociationCrawler(object):
                 int(time.time()),
                 const.DEFAULT_ARTICLE_STATUS,
                 functions.generateGzhId(obj["url"]),
-                obj['source_article_title'],
-                obj['source_account'],
-                title_sensitivity
-            )
+                obj["source_article_title"],
+                obj["source_account"],
+                title_sensitivity,
+            ),
         )
 
-    def deal(self):
+    def deal(self, biz_date=None):
         """
         class entrance
+        :param biz_date: business date, YYYY-MM-DD; defaults to today when None
         """
-        seed_article_list = self.get_seed_url_list()
+        if biz_date is None:
+            biz_date = datetime.today().strftime("%Y-%m-%d")
+
+        seed_article_list = self.get_seed_url_list(biz_date)
         deal_bar = tqdm(seed_article_list, desc="article association crawler")
         base_title_list = self.get_level_up_title_list()
         for article in deal_bar:
@@ -170,10 +196,13 @@ class ArticleAssociationCrawler(object):
                     seed_url=article["link"],
                     source_title=article["title"],
                     source_account=article["gh_id"],
-                    base_title_list=base_title_list
+                    base_title_list=base_title_list,
                 )
                 deal_bar.set_postfix({"article_title": article["title"]})
             except Exception as e:
-                print(e)
-                print(article)
-                continue
+                log(
+                    task="article_association_crawler",
+                    function="deal",
+                    message=f"article association crawler error, article title: {article['title']}, error: {e}",
+                    data={"article": article, "traceback": traceback.format_exc()},
+                )
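
On the filtering itself: a candidate survives only when the PERCENT_THRESHOLD-th percentile of its title-similarity scores against base_title_list clears CORRELATION_THRESHOLD (0.5 per this commit). The pre-sort before np.percentile is redundant, since percentile does not require sorted input. A worked sketch with hypothetical scores and an assumed PERCENT_THRESHOLD of 95 (its real value is defined elsewhere in ColdStartTaskConst):

    import numpy as np

    # hypothetical similarity scores of one candidate against the base titles
    score_list = [0.2, 0.4, 0.55, 0.7, 0.9]

    PERCENT_THRESHOLD = 95       # assumed value, not shown in this diff
    CORRELATION_THRESHOLD = 0.5  # from applications/const in this commit

    percent_threshold_score = np.percentile(score_list, PERCENT_THRESHOLD)
    keep = percent_threshold_score >= CORRELATION_THRESHOLD
    print(round(float(percent_threshold_score), 2), keep)  # 0.86 True
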