toutiao_recommend_crawler.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import time
  6. import traceback
  7. from typing import Dict
  8. import requests
  9. from tqdm import tqdm
  10. from applications import bot
  11. from applications import log
  12. from applications.db import DatabaseConnector
  13. from config import long_articles_config
  14. class ToutiaoRecommendCrawler(object):
  15. """
  16. 今日头条推荐流
  17. """
  18. def __init__(self) -> None:
  19. self.db_client = None
  20. def init_database(self) -> None:
  21. """
  22. 初始化数据库
  23. :return:
  24. """
  25. try:
  26. self.db_client = DatabaseConnector(db_config=long_articles_config)
  27. self.db_client.connect()
  28. except Exception as e:
  29. bot(
  30. title="今日头条推荐流文章抓取任务数据库连接失败",
  31. detail={
  32. "error": str(e),
  33. "error_stack": traceback.format_exc()
  34. }
  35. )
  36. def get_history_recommendation(self) -> Dict:
  37. """
  38. 获取历史推荐流文章
  39. :return:
  40. """
  41. url = 'https://www.toutiao.com/api/pc/list/feed'
  42. params = {
  43. 'channel_id': '3189398965',
  44. 'min_behot_time': '0',
  45. 'offset': '0',
  46. 'refresh_count': '1',
  47. 'category': 'pc_profile_channel',
  48. 'client_extra_params': '{"short_video_item":"filter"}',
  49. 'aid': '24',
  50. 'app_name': 'toutiao_web',
  51. 'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
  52. 'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
  53. }
  54. headers = {
  55. 'Accept': 'application/json, text/plain, */*',
  56. 'Accept-Language': 'zh,zh-CN;q=0.9',
  57. 'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
  58. 'Priority': 'u=1, i',
  59. 'Referer': 'https://www.toutiao.com/',
  60. 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
  61. 'sec-ch-ua-mobile': '?0',
  62. 'sec-ch-ua-platform': '"macOS"',
  63. 'sec-fetch-dest': 'empty',
  64. 'sec-fetch-mode': 'cors',
  65. 'sec-fetch-site': 'same-origin',
  66. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  67. }
  68. response = requests.get(url, params=params, headers=headers)
  69. return response.json()
  70. def insert_each_article(self, item: Dict) -> Dict:
  71. """
  72. 提取文章信息
  73. :param article_info:
  74. :return:
  75. """
  76. item_id = item.get('item_id')
  77. article_url = item['article_url']
  78. like_count = item['like_count']
  79. read_count = item['read_count']
  80. title = item['title']
  81. user_info = item['user_info']
  82. user_name = user_info.get('name')
  83. user_id = user_info.get('user_id')
  84. abstract = item['Abstract']
  85. publish_time = item['publish_time']
  86. insert_sql = f"""
  87. INSERT INTO crawler_meta_article
  88. (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, unique_index)
  89. VALUES
  90. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  91. """
  92. self.db_client.save(
  93. query=insert_sql,
  94. params=(
  95. "toutiao",
  96. "recommend",
  97. "history",
  98. user_id,
  99. title,
  100. article_url,
  101. read_count,
  102. like_count,
  103. abstract,
  104. publish_time,
  105. int(time.time()),
  106. item_id
  107. )
  108. )
  109. def process_recommendation(self, recommendation) -> Dict:
  110. """
  111. 处理推荐流文章
  112. :param recommendation:
  113. :return:
  114. """
  115. for item in tqdm(recommendation['data']):
  116. if item.get('article_url'):
  117. video_flag = item.get('has_video')
  118. if not video_flag:
  119. try:
  120. self.insert_each_article(item)
  121. except Exception as e:
  122. error_data = {
  123. "error": str(e),
  124. "error_stack": traceback.format_exc()
  125. }
  126. log(
  127. task='toutiao_recommend',
  128. message='头条推荐流文章插入失败',
  129. data=error_data,
  130. status='fail'
  131. )
  132. def main():
  133. """
  134. 主函数
  135. :return:
  136. """
  137. toutiao_recommend_crawler = ToutiaoRecommendCrawler()
  138. toutiao_recommend_crawler.init_database()
  139. for i in range(10):
  140. try:
  141. article_list = toutiao_recommend_crawler.get_history_recommendation()
  142. except Exception as e:
  143. error_data = {
  144. "error": str(e),
  145. "error_stack": traceback.format_exc()
  146. }
  147. print(error_data)
  148. continue
  149. toutiao_recommend_crawler.process_recommendation(article_list)
  150. main()