luojunhui 4 miesięcy temu
rodzic
commit
dd6cf71b9c

+ 9 - 1
applications/const.py

@@ -72,4 +72,12 @@ class updateAccountReadAvgTaskConst:
 
     # 发文模式
     ARTICLES_DAILY = 1
-    TOULIU = 2
+    TOULIU = 2
+
+
+class ArticleAssociationTaskConst:
+    """
+    Constant configuration for the article association task.
+    """
+    # Status code that marks a spider API request as successful
+    SPIDER_API_SUCCESS_CODE = 0

+ 18 - 0
applications/longArticlesMysql.py

@@ -41,6 +41,24 @@ class longArticlesMySQL(object):
         result = cursor.fetchall()
         return result
 
+    @classmethod
+    def select_json(cls, sql):
+        """
+        查询
+        :param sql:
+        :return:
+        """
+        cursor = cls.connection.cursor()
+        cursor.execute(sql)
+        result = cursor.fetchall()
+        json_data = [
+            dict(
+                zip([column[0] for column in cursor.description], row)
+            )
+            for row in result
+        ]
+        return json_data
+
     @classmethod
     def insertMany(cls, sql, params_list):
         """

+ 160 - 0
coldStartTasks/crawler/weixin_article_association.py

@@ -0,0 +1,160 @@
+"""
+@author: luojunhui
+"""
+import json
+import time
+from typing import AnyStr, List, Dict
+
+from tqdm import tqdm
+
+from applications import longArticlesMySQL, Functions, WeixinSpider
+from applications.const import ArticleAssociationTaskConst
+
+
+# Module-level helper instances shared by the functions defined below
+functions = Functions()
+db_client = longArticlesMySQL()
+spider = WeixinSpider()
+const = ArticleAssociationTaskConst()
+
+
+def get_good_articles() -> List[Dict]:
+    """
+    获取表现好的文章
+    :return:
+    """
+    sql = f"""
+        SELECT account_name, gh_id, view_count, read_rate, link, title
+        FROM datastat_sort_strategy
+        WHERE 
+            type = 9 
+            and position = 1 
+            and date_str > '20241101' 
+            and fans > 300000 
+            and view_count > 5000
+            and read_rate > 1.1;
+    """
+    article_list = db_client.select_json(sql)
+    return article_list
+
+
+def get_recommend_article_list_for_each_article(account_name: AnyStr, article_url: AnyStr, title: AnyStr) -> List[Dict]:
+    """
+    获取推荐文章
+    :param title:
+    :param account_name:
+    :param article_url:
+    :return:
+    """
+    recommend_response = spider.get_recommend_articles(content_link=article_url)
+    if recommend_response['code'] == const.SPIDER_API_SUCCESS_CODE:
+        recommend_article_list = recommend_response['data']['data']['list']
+        filter_recommend_article_list = [
+            {
+                "seed_account_name": account_name,
+                "seed_url": article_url,
+                "seed_title": title,
+                "recommend_title": recommend_article['title'],
+                "recommend_account_name": recommend_article['nickname'],
+                "recommend_gh_id": recommend_article['username'],
+                "recommend_url": recommend_article['url'],
+                "recommend_send_timestamp": recommend_article['send_time'],
+                "recommend_read": recommend_article['read_num'],
+                "recommend_like": recommend_article['old_like_num'],
+                "recommend_index": recommend_article['idx'],
+                "recommend_time": int(time.time())
+            }
+            for recommend_article in recommend_article_list if recommend_article['nickname'] != account_name
+        ]
+        return filter_recommend_article_list
+    else:
+        return []
+
+
+def get_recommend_article_list_task() -> None:
+    """
+    获取推荐文章
+    :return:
+    """
+    article_list = get_good_articles()
+    for article_detail_tuple in tqdm(article_list[:1], desc="article list"):
+        account_name = article_detail_tuple['account_name']
+        url = article_detail_tuple['link']
+        title = article_detail_tuple['title']
+        recommend_article_list = get_recommend_article_list_for_each_article(
+            account_name=account_name,
+            article_url=url,
+            title=title
+        )
+        insert_recommend_list_into_meta(recommend_article_list)
+
+
+def insert_recommend_list_into_meta(recommend_article_list: List[Dict]) -> None:
+    """
+    插入数据
+    :param recommend_article_list:
+    :return:
+    """
+    if not recommend_article_list:
+        return
+    for recommend_obj in recommend_article_list:
+        try:
+            insert_sql = f"""
+                INSERT INTO crawler_meta_article
+                (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account)
+                VALUES
+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """
+            db_client.update(
+                insert_sql,
+                params=(
+                    "weixin",
+                    "association",
+                    "article_association",
+                    recommend_obj['recommend_gh_id'],
+                    recommend_obj['recommend_index'],
+                    recommend_obj['recommend_title'],
+                    recommend_obj['recommend_url'],
+                    recommend_obj['recommend_read'],
+                    recommend_obj['recommend_like'],
+                    recommend_obj['recommend_send_timestamp'],
+                    int(time.time()),
+                    1,
+                    functions.generateGzhId(url=recommend_obj['recommend_url']),
+                    recommend_obj['seed_title'],
+                    recommend_obj['seed_account_name'],
+                )
+            )
+        except Exception as e:
+            print("insert error", e)
+            update_sql = f"""
+                UPDATE crawler_meta_article
+                SET
+                    read_cnt = %s,
+                    like_cnt = %s,
+                    source_article_title = %s,
+                    source_account = %s
+                WHERE
+                    unique_index = %s and category = %s;
+            """
+            try:
+                db_client.update(
+                    update_sql,
+                    params=(
+                        recommend_obj['recommend_read'],
+                        recommend_obj['recommend_like'],
+                        recommend_obj['seed_title'],
+                        recommend_obj['seed_account_name'],
+                        functions.generateGzhId(url=recommend_obj['recommend_url']),
+                        "article_association",
+                    )
+                )
+            except Exception as e:
+                print("update error", e)
+
+
+def main():
+    """
+    Entry point: run the recommended-article collection task.
+    :return: None
+    """
+    get_recommend_article_list_task()