# article_association.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import traceback
  6. from datetime import datetime
  7. import numpy as np
  8. from pymysql.cursors import DictCursor
  9. from tqdm import tqdm
  10. from applications import WeixinSpider, log
  11. from applications.api import similarity_between_title_list
  12. from applications.const import ColdStartTaskConst
  13. from applications.db import DatabaseConnector
  14. from applications.functions import Functions
  15. from applications.utils import get_inner_account_set
  16. from applications.utils import whether_title_sensitive
  17. from config import long_articles_config
  18. spider = WeixinSpider()
  19. functions = Functions()
  20. const = ColdStartTaskConst()


class ArticleAssociationCrawler(object):
    """
    Article association crawler task
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        self.inner_account_set = get_inner_account_set()

    def get_seed_url_list(self, biz_date):
        """
        Fetch the seed URL list: bulk-published articles from the last two days
        whose view count and read rate clear the configured thresholds.
        """
        sql = f"""
            select gh_id, title, link
            from datastat_sort_strategy
            where date_str > DATE_FORMAT(DATE_SUB('{biz_date}', INTERVAL 2 DAY), '%Y%m%d')
                and view_count > {const.READ_COUNT_THRESHOLD}
                and read_rate > {const.READ_AVG_THRESHOLD}
                and type = {const.BULK_PUBLISH_TYPE}
            order by read_rate desc
            limit {const.SEED_ARTICLE_LIMIT_NUM};
        """
        seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
        return seed_article_list

    def get_level_up_title_list(self):
        """
        Fetch the title list of promoted articles.
        status = 1: the article's source tracing is complete
        deleted = 0: the article is in a normal state
        level = 'autoArticlePoolLevel1': headline articles
        """
        sql = """
            select distinct title
            from article_pool_promotion_source
            where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
        """
        mysql_response = self.db_client.fetch(query=sql)
        title_list = [i[0] for i in mysql_response]
        return title_list

    def get_recommend_url_list_with_depth(
        self, seed_url, source_title, source_account, base_title_list, depth=1
    ):
        """
        Recursively crawl the articles recommended alongside a seed article.
        @param seed_url: good url from datastat_sort_strategy
        @param depth: association depth
        @param source_title: article title
        @param source_account: article account
        @param base_title_list: promoted-article titles used as the similarity baseline
        """
        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
            return

        res = spider.get_recommend_articles(content_link=seed_url)
        related_articles = res["data"]["data"]["list"]
        if related_articles:
            title_list = [i["title"] for i in related_articles]
            similarity_array = similarity_between_title_list(
                title_list, base_title_list
            )

            recommend_articles = []
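            # Each row of similarity_array holds one candidate title's
            # similarity scores against every promoted title; a candidate is
            # kept only if its PERCENT_THRESHOLD-th percentile score clears
            # CORRELATION_THRESHOLD. Illustrative numbers only (the real
            # thresholds live in ColdStartTaskConst): with PERCENT_THRESHOLD
            # = 80, np.percentile([0.1, 0.3, 0.8], 80) == 0.6 via linear
            # interpolation, which would pass a CORRELATION_THRESHOLD of 0.5.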
            for index, score_list in enumerate(similarity_array):
                sorted_score_list = sorted(score_list)
                percent_threshold_score = np.percentile(
                    sorted_score_list, const.PERCENT_THRESHOLD
                )
                if percent_threshold_score < const.CORRELATION_THRESHOLD:
                    continue
                else:
                    article_obj = related_articles[index]
                    article_obj["score"] = percent_threshold_score
                    recommend_articles.append(article_obj)

            recommend_process_bar = tqdm(
                recommend_articles, desc="save recommend articles"
            )
            for article in recommend_process_bar:
                obj = {
                    "title": article["title"],
                    "url": article["url"],
                    "gh_id": article["username"],
                    "index": article["idx"],
                    "send_time": article["send_time"],
                    "read_cnt": article["read_num"],
                    "depth": depth,
                    "source_article_title": source_title,
                    "source_account": source_account,
                }
                self.insert_recommend_article(obj)
                recommend_process_bar.set_postfix(
                    {"title": article["title"], "depth": depth}
                )
                # depth-first: follow each saved article's own recommendations
                self.get_recommend_url_list_with_depth(
                    seed_url=obj["url"],
                    source_title=obj["title"],
                    source_account=obj["gh_id"],
                    base_title_list=base_title_list,
                    depth=depth + 1,
                )
        else:
            return

    def insert_recommend_article(self, obj):
        """
        Insert a recommended article, skipping inner accounts and
        duplicate titles, and flagging sensitive titles.
        """
        # skip articles from inner accounts
        if obj["gh_id"] in self.inner_account_set:
            return

        # skip articles whose title has already been crawled
        title = obj["title"]
        select_sql = "select article_id from crawler_meta_article where title = %s;"
        res = self.db_client.fetch(query=select_sql, params=(title,))
        if res:
            return

        # flag sensitive titles
        title_sensitivity = (
            const.TITLE_SENSITIVE
            if whether_title_sensitive(title)
            else const.TITLE_NOT_SENSITIVE
        )

        # insert the article
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, article_index, title, link,
             read_cnt, publish_time, crawler_time, status, unique_index,
             source_article_title, source_account, title_sensitivity)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "weixin",
                "recommend",
                "article_association",
                obj["gh_id"],
                obj["index"],
                obj["title"],
                obj["url"],
                obj["read_cnt"],
                obj["send_time"],
                int(time.time()),
                const.DEFAULT_ARTICLE_STATUS,
                functions.generateGzhId(obj["url"]),  # unique_index derived from the url
                obj["source_article_title"],
                obj["source_account"],
                title_sensitivity,
            ),
        )

    def deal(self, biz_date=None):
        """
        Class entrance.
        :param biz_date: business date string ("%Y-%m-%d"); defaults to today
        """
        if biz_date is None:
            biz_date = datetime.today().strftime("%Y-%m-%d")

        seed_article_list = self.get_seed_url_list(biz_date)
        deal_bar = tqdm(seed_article_list, desc="article association crawler")
        base_title_list = self.get_level_up_title_list()
        for article in deal_bar:
            try:
                self.get_recommend_url_list_with_depth(
                    seed_url=article["link"],
                    source_title=article["title"],
                    source_account=article["gh_id"],
                    base_title_list=base_title_list,
                )
                deal_bar.set_postfix({"article_title": article["title"]})
            except Exception as e:
                log(
                    task="article_association_crawler",
                    function="deal",
                    message=f"article association crawler error, article title: {article['title']}, error: {e}",
                    data={"article": article, "traceback": traceback.format_exc()},
                )
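

# Minimal entry-point sketch (an assumption, not part of the original file):
# how the task would presumably be launched when run as a script.
if __name__ == "__main__":
    crawler = ArticleAssociationCrawler()
    # biz_date defaults to today; pass e.g. "2025-01-01" to backfill a day
    crawler.deal()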