weixin_article_association.py

  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import time
  6. from typing import AnyStr, List, Dict
  7. from tqdm import tqdm
  8. from applications import longArticlesMySQL, Functions, WeixinSpider
  9. from applications.const import ArticleAssociationTaskConst
  10. functions = Functions()
  11. db_client = longArticlesMySQL()
  12. spider = WeixinSpider()
  13. const = ArticleAssociationTaskConst()
def get_good_articles() -> List[Dict]:
    """
    Fetch well-performing articles
    :return: List[Dict], the articles matched by the query
    """
    sql = """
        SELECT account_name, gh_id, view_count, read_rate, link, title
        FROM datastat_sort_strategy
        WHERE
            type = 9
            and position = 1
            and date_str > '20241101'
            and fans > 300000
            and view_count > 5000
            and read_rate > 1.1
            and status = 1;
    """
    article_list = db_client.select_json(sql)
    return article_list


def get_recommend_article_list_for_each_article(account_name: str, article_url: str, title: str) -> List[Dict]:
    """
    Fetch the articles recommended alongside a seed article
    :param title: seed article title
    :param account_name: seed account name
    :param article_url: seed article link
    :return: list of recommendation records, empty if the spider call fails
    """
    recommend_response = spider.get_recommend_articles(content_link=article_url)
    if recommend_response['code'] == const.SPIDER_API_SUCCESS_CODE:
        recommend_article_list = recommend_response['data']['data']['list']
        # keep only recommendations published by accounts other than the seed account
        filter_recommend_article_list = [
            {
                "seed_account_name": account_name,
                "seed_url": article_url,
                "seed_title": title,
                "recommend_title": recommend_article['title'],
                "recommend_account_name": recommend_article['nickname'],
                "recommend_gh_id": recommend_article['username'],
                "recommend_url": recommend_article['url'],
                "recommend_send_timestamp": recommend_article['send_time'],
                "recommend_read": recommend_article['read_num'],
                "recommend_like": recommend_article['old_like_num'],
                "recommend_index": recommend_article['idx'],
                "recommend_time": int(time.time())
            }
            for recommend_article in recommend_article_list
            if recommend_article['nickname'] != account_name
        ]
        return filter_recommend_article_list
    else:
        return []


def insert_recommend_list_into_meta(recommend_article_list: List[Dict]) -> None:
    """
    Insert recommendation records into the crawler meta table
    :param recommend_article_list: records built by get_recommend_article_list_for_each_article
    :return:
    """
    if not recommend_article_list:
        return
    for recommend_obj in recommend_article_list:
        try:
            insert_sql = """
                INSERT INTO crawler_meta_article
                (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account)
                VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            db_client.update(
                insert_sql,
                params=(
                    "weixin",
                    "association",
                    "article_association",
                    recommend_obj['recommend_gh_id'],
                    recommend_obj['recommend_index'],
                    recommend_obj['recommend_title'],
                    recommend_obj['recommend_url'],
                    recommend_obj['recommend_read'],
                    recommend_obj['recommend_like'],
                    recommend_obj['recommend_send_timestamp'],
                    int(time.time()),
                    1,
                    functions.generateGzhId(url=recommend_obj['recommend_url']),
                    recommend_obj['seed_title'],
                    recommend_obj['seed_account_name'],
                )
            )
        except Exception as e:
            # the insert typically fails when the unique_index already exists,
            # so fall back to refreshing the existing row
            print("insert error", e)
            update_sql = """
                UPDATE crawler_meta_article
                SET
                    read_cnt = %s,
                    like_cnt = %s,
                    source_article_title = %s,
                    source_account = %s
                WHERE
                    unique_index = %s and category = %s;
            """
            try:
                db_client.update(
                    update_sql,
                    params=(
                        recommend_obj['recommend_read'],
                        recommend_obj['recommend_like'],
                        recommend_obj['seed_title'],
                        recommend_obj['seed_account_name'],
                        functions.generateGzhId(url=recommend_obj['recommend_url']),
                        "article_association",
                    )
                )
            except Exception as e:
                print("update error", e)


def do_i2i_crawler_task() -> None:
    """
    Crawl the recommended articles for each well-performing seed article
    :return:
    """
    article_list = get_good_articles()
    # NOTE: the [:1] slice processes only the first seed article,
    # which looks like a debugging leftover
    for article_detail_tuple in tqdm(article_list[:1], desc="article list"):
        account_name = article_detail_tuple['account_name']
        url = article_detail_tuple['link']
        title = article_detail_tuple['title']
        recommend_article_list = get_recommend_article_list_for_each_article(
            account_name=account_name,
            article_url=url,
            title=title
        )
        insert_recommend_list_into_meta(recommend_article_list)


def main():
    """
    Main entry point
    :return:
    """
    do_i2i_crawler_task()
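

# Assumed entry-point guard: the captured portion of the file ends at main(),
# so this invocation is a sketch of how the script is presumably run
if __name__ == "__main__":
    main()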