# weixin_article_association.py
"""
@author: luojunhui
"""
import json
import time
from typing import AnyStr, List, Dict

from tqdm import tqdm

from applications import longArticlesMySQL, Functions, WeixinSpider
from applications.const import ArticleAssociationTaskConst

# Module-level singletons shared by every task function below.
functions = Functions()  # helpers; generateGzhId() is used to build dedup keys
db_client = longArticlesMySQL()  # MySQL client used for both reads and writes
spider = WeixinSpider()  # WeChat spider API wrapper
const = ArticleAssociationTaskConst()  # task constants (e.g. SPIDER_API_SUCCESS_CODE)
  14. def get_good_articles() -> List[Dict]:
  15. """
  16. 获取表现好的文章
  17. :return:
  18. """
  19. sql = f"""
  20. SELECT account_name, gh_id, view_count, read_rate, link, title
  21. FROM datastat_sort_strategy
  22. WHERE
  23. type = 9
  24. and position = 1
  25. and date_str > '20241101'
  26. and fans > 300000
  27. and view_count > 5000
  28. and read_rate > 1.1;
  29. """
  30. article_list = db_client.select_json(sql)
  31. return article_list
  32. def get_recommend_article_list_for_each_article(account_name: AnyStr, article_url: AnyStr, title: AnyStr) -> List[Dict]:
  33. """
  34. 获取推荐文章
  35. :param title:
  36. :param account_name:
  37. :param article_url:
  38. :return:
  39. """
  40. recommend_response = spider.get_recommend_articles(content_link=article_url)
  41. if recommend_response['code'] == const.SPIDER_API_SUCCESS_CODE:
  42. recommend_article_list = recommend_response['data']['data']['list']
  43. filter_recommend_article_list = [
  44. {
  45. "seed_account_name": account_name,
  46. "seed_url": article_url,
  47. "seed_title": title,
  48. "recommend_title": recommend_article['title'],
  49. "recommend_account_name": recommend_article['nickname'],
  50. "recommend_gh_id": recommend_article['username'],
  51. "recommend_url": recommend_article['url'],
  52. "recommend_send_timestamp": recommend_article['send_time'],
  53. "recommend_read": recommend_article['read_num'],
  54. "recommend_like": recommend_article['old_like_num'],
  55. "recommend_index": recommend_article['idx'],
  56. "recommend_time": int(time.time())
  57. }
  58. for recommend_article in recommend_article_list if recommend_article['nickname'] != account_name
  59. ]
  60. return filter_recommend_article_list
  61. else:
  62. return []
  63. def get_recommend_article_list_task() -> None:
  64. """
  65. 获取推荐文章
  66. :return:
  67. """
  68. article_list = get_good_articles()
  69. for article_detail_tuple in tqdm(article_list[:1], desc="article list"):
  70. account_name = article_detail_tuple['account_name']
  71. url = article_detail_tuple['link']
  72. title = article_detail_tuple['title']
  73. recommend_article_list = get_recommend_article_list_for_each_article(
  74. account_name=account_name,
  75. article_url=url,
  76. title=title
  77. )
  78. insert_recommend_list_into_meta(recommend_article_list)
  79. def insert_recommend_list_into_meta(recommend_article_list: List[Dict]) -> None:
  80. """
  81. 插入数据
  82. :param recommend_article_list:
  83. :return:
  84. """
  85. if not recommend_article_list:
  86. return
  87. for recommend_obj in recommend_article_list:
  88. try:
  89. insert_sql = f"""
  90. INSERT INTO crawler_meta_article
  91. (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account)
  92. VALUES
  93. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  94. """
  95. db_client.update(
  96. insert_sql,
  97. params=(
  98. "weixin",
  99. "association",
  100. "article_association",
  101. recommend_obj['recommend_gh_id'],
  102. recommend_obj['recommend_index'],
  103. recommend_obj['recommend_title'],
  104. recommend_obj['recommend_url'],
  105. recommend_obj['recommend_read'],
  106. recommend_obj['recommend_like'],
  107. recommend_obj['recommend_send_timestamp'],
  108. int(time.time()),
  109. 1,
  110. functions.generateGzhId(url=recommend_obj['recommend_url']),
  111. recommend_obj['seed_title'],
  112. recommend_obj['seed_account_name'],
  113. )
  114. )
  115. except Exception as e:
  116. print("insert error", e)
  117. update_sql = f"""
  118. UPDATE crawler_meta_article
  119. SET
  120. read_cnt = %s,
  121. like_cnt = %s,
  122. source_article_title = %s,
  123. source_account = %s
  124. WHERE
  125. unique_index = %s and category = %s;
  126. """
  127. try:
  128. db_client.update(
  129. update_sql,
  130. params=(
  131. recommend_obj['recommend_read'],
  132. recommend_obj['recommend_like'],
  133. recommend_obj['seed_title'],
  134. recommend_obj['seed_account_name'],
  135. functions.generateGzhId(url=recommend_obj['recommend_url']),
  136. "article_association",
  137. )
  138. )
  139. except Exception as e:
  140. print("update error", e)
  141. def main():
  142. """
  143. 主函数
  144. :return:
  145. """
  146. get_recommend_article_list_task()