Преглед на файлове

account association improve

luojunhui преди 9 месеца
родител
ревизия
751c1ae850
променен е 1 файл, в който са добавени 162 реда и са изтрити 0 реда
  1. 162 0
      coldStartTasks/crawler/toutiao_recommend_crawler.py

+ 162 - 0
coldStartTasks/crawler/toutiao_recommend_crawler.py

@@ -0,0 +1,162 @@
+"""
+@author: luojunhui
+"""
+import json
+import time
+import traceback
+from typing import Dict
+
+import requests
+from tqdm import tqdm
+
+from applications import bot
+from applications import log
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+
+class ToutiaoRecommendCrawler(object):
+    """
+    今日头条推荐流
+    """
+    def __init__(self) -> None:
+        self.db_client = None
+
+    def init_database(self) -> None:
+        """
+        初始化数据库
+        :return:
+        """
+        try:
+            self.db_client = DatabaseConnector(db_config=long_articles_config)
+            self.db_client.connect()
+        except Exception as e:
+            bot(
+                title="今日头条推荐流文章抓取任务数据库连接失败",
+                detail={
+                    "error": str(e),
+                    "error_stack": traceback.format_exc()
+                }
+            )
+
+    def get_history_recommendation(self) -> Dict:
+        """
+        获取历史推荐流文章
+        :return:
+        """
+        url = 'https://www.toutiao.com/api/pc/list/feed'
+        params = {
+            'channel_id': '3189398965',
+            'min_behot_time': '0',
+            'offset': '0',
+            'refresh_count': '1',
+            'category': 'pc_profile_channel',
+            'client_extra_params': '{"short_video_item":"filter"}',
+            'aid': '24',
+            'app_name': 'toutiao_web',
+            'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
+            'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
+        }
+        headers = {
+            'Accept': 'application/json, text/plain, */*',
+            'Accept-Language': 'zh,zh-CN;q=0.9',
+            'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
+            'Priority': 'u=1, i',
+            'Referer': 'https://www.toutiao.com/',
+            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+        }
+        response = requests.get(url, params=params, headers=headers)
+        return response.json()
+
+    def insert_each_article(self, item: Dict) -> Dict:
+        """
+        提取文章信息
+        :param article_info:
+        :return:
+        """
+        item_id = item.get('item_id')
+        article_url = item['article_url']
+        like_count = item['like_count']
+        read_count = item['read_count']
+        title = item['title']
+        user_info = item['user_info']
+        user_name = user_info.get('name')
+        user_id = user_info.get('user_id')
+        abstract = item['Abstract']
+        publish_time = item['publish_time']
+        insert_sql = f"""
+            INSERT INTO crawler_meta_article
+            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, unique_index)
+            VALUES
+            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+        self.db_client.save(
+            query=insert_sql,
+            params=(
+                "toutiao",
+                "recommend",
+                "history",
+                user_id,
+                title,
+                article_url,
+                read_count,
+                like_count,
+                abstract,
+                publish_time,
+                int(time.time()),
+                item_id
+            )
+        )
+
+    def process_recommendation(self, recommendation) -> Dict:
+        """
+        处理推荐流文章
+        :param recommendation:
+        :return:
+        """
+        for item in tqdm(recommendation['data']):
+            if item.get('article_url'):
+                video_flag = item.get('has_video')
+                if not video_flag:
+                    try:
+                        self.insert_each_article(item)
+                    except Exception as e:
+                        error_data = {
+                            "error": str(e),
+                            "error_stack": traceback.format_exc()
+                        }
+                        log(
+                            task='toutiao_recommend',
+                            message='头条推荐流文章插入失败',
+                            data=error_data,
+                            status='fail'
+                        )
+
+
+def main():
+    """
+    主函数
+    :return:
+    """
+    toutiao_recommend_crawler = ToutiaoRecommendCrawler()
+    toutiao_recommend_crawler.init_database()
+    for i in range(10):
+        try:
+            article_list = toutiao_recommend_crawler.get_history_recommendation()
+        except Exception as e:
+            error_data = {
+                "error": str(e),
+                "error_stack": traceback.format_exc()
+            }
+            print(error_data)
+            continue
+        toutiao_recommend_crawler.process_recommendation(article_list)
+
+
+main()  # NOTE(review): no __main__ guard, so this also runs when the module is imported