article_association.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import numpy as np
  6. from pymysql.cursors import DictCursor
  7. from tqdm import tqdm
  8. from applications import WeixinSpider
  9. from applications.api import similarity_between_title_list
  10. from applications.const import ColdStartTaskConst
  11. from applications.db import DatabaseConnector
  12. from applications.functions import Functions
  13. from applications.utils import get_inner_account_set
  14. from applications.utils import whether_title_sensitive
  15. from config import long_articles_config
  16. spider = WeixinSpider()
  17. functions = Functions()
  18. const = ColdStartTaskConst()


class ArticleAssociationCrawler(object):
    """
    article association crawler task
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        # in-house accounts whose articles are skipped at insert time
        self.inner_account_set = get_inner_account_set()

    def get_seed_url_list(self):
        """
        fetch the seed url list
        """
        sql = """
            select gh_id, title, link
            from datastat_sort_strategy
            where date_str > '20250220' and view_count > 1000 and read_rate > 1.3 and type = 9
            order by read_rate desc limit 30;
        """
        seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
        return seed_article_list
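
    # A seed row fetched with DictCursor is a dict keyed by the selected
    # columns, e.g. {"gh_id": ..., "title": ..., "link": ...}; the link is
    # what seeds the recommendation crawl below.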

    def get_level_up_title_list(self):
        """
        fetch the list of promoted article titles
        status = 1 means source tracing of the article is complete
        deleted = 0 means the article is active
        level = 'autoArticlePoolLevel1' means headline articles
        """
        sql = """
            select distinct title
            from article_pool_promotion_source
            where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
        """
        mysql_response = self.db_client.fetch(query=sql)
        # the default cursor returns tuples, so take the first column of each row
        title_list = [i[0] for i in mysql_response]
        return title_list

    def get_recommend_url_list_with_depth(self, seed_url, source_title, source_account, base_title_list, depth=1):
        """
        @param seed_url: good url from datastat_sort_strategy
        @param depth: association depth
        @param source_title: article title
        @param source_account: article account
        """
        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
            return

        res = spider.get_recommend_articles(content_link=seed_url)
        related_articles = res['data']['data']['list']
        if related_articles:
            title_list = [i['title'] for i in related_articles]
            similarity_array = similarity_between_title_list(title_list, base_title_list)

            recommend_articles = []
            for index, score_list in enumerate(similarity_array):
                # score each candidate by the PERCENT_THRESHOLD-th percentile of its
                # similarities against the promoted-title pool and keep it only if
                # that score clears CORRELATION_THRESHOLD
                sorted_score_list = sorted(score_list)
                percent_threshold_score = np.percentile(sorted_score_list, const.PERCENT_THRESHOLD)
                if percent_threshold_score < const.CORRELATION_THRESHOLD:
                    continue
                article_obj = related_articles[index]
                article_obj['score'] = percent_threshold_score
                recommend_articles.append(article_obj)

            recommend_process_bar = tqdm(recommend_articles, desc="save recommend articles")
            for article in recommend_process_bar:
                obj = {
                    "title": article['title'],
                    "url": article['url'],
                    "gh_id": article['username'],
                    "index": article['idx'],
                    "send_time": article['send_time'],
                    "read_cnt": article['read_num'],
                    "depth": depth,
                    "source_article_title": source_title,
                    "source_account": source_account,
                }
                self.insert_recommend_article(obj)
                recommend_process_bar.set_postfix({"title": article['title'], "depth": depth})
                # recurse on each accepted article until the max depth is reached
                self.get_recommend_url_list_with_depth(
                    seed_url=obj["url"],
                    source_title=obj["title"],
                    source_account=obj["gh_id"],
                    base_title_list=base_title_list,
                    depth=depth + 1
                )
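
    # Example of the percentile filter above (illustrative numbers, not the
    # real ColdStartTaskConst values): with PERCENT_THRESHOLD = 80 and
    # CORRELATION_THRESHOLD = 0.5, a candidate scoring [0.1, 0.2, 0.3, 0.9]
    # against the promoted titles has an 80th percentile of ~0.54 and is kept,
    # while [0.1, 0.2, 0.3, 0.4] gives ~0.34 and is dropped.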

    def insert_recommend_article(self, obj):
        """
        insert recommend article
        """
        # skip articles published by in-house accounts
        if obj['gh_id'] in self.inner_account_set:
            return

        # skip articles whose title already exists in the meta table
        title = obj['title']
        select_sql = "select article_id from crawler_meta_article where title = %s;"
        res = self.db_client.fetch(query=select_sql, params=(title,))
        if res:
            return

        # flag sensitive titles
        title_sensitivity = const.TITLE_SENSITIVE if whether_title_sensitive(title) else const.TITLE_NOT_SENSITIVE

        # insert this article
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, article_index, title, link, read_cnt,
             publish_time, crawler_time, status, unique_index, source_article_title, source_account, title_sensitivity)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "weixin",
                "recommend",
                "article_association",
                obj["gh_id"],
                obj["index"],
                obj["title"],
                obj["url"],
                obj["read_cnt"],
                obj["send_time"],
                int(time.time()),
                const.DEFAULT_ARTICLE_STATUS,
                functions.generateGzhId(obj["url"]),
                obj['source_article_title'],
                obj['source_account'],
                title_sensitivity
            )
        )
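
    # Note on dedup: the title lookup above is the only filter this method
    # applies itself, while unique_index (functions.generateGzhId over the
    # article url) gives a url-level key; whether crawler_meta_article also
    # enforces it with a UNIQUE constraint is an assumption, not shown here.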

    def deal(self):
        """
        class entrance
        """
        seed_article_list = self.get_seed_url_list()
        deal_bar = tqdm(seed_article_list, desc="article association crawler")
        base_title_list = self.get_level_up_title_list()
        for article in deal_bar:
            try:
                self.get_recommend_url_list_with_depth(
                    seed_url=article["link"],
                    source_title=article["title"],
                    source_account=article["gh_id"],
                    base_title_list=base_title_list
                )
                deal_bar.set_postfix({"article_title": article["title"]})
            except Exception as e:
                # log the failure and continue with the next seed article
                print(e)
                print(article)
                continue
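

# Usage sketch (not in the original file): run the crawler end to end.
# Assumes config.long_articles_config points at a reachable MySQL instance
# and that WeixinSpider can call the recommend-articles endpoint.
if __name__ == "__main__":
    article_association_crawler = ArticleAssociationCrawler()
    article_association_crawler.deal()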