article_association.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import traceback
  6. from datetime import datetime
  7. import numpy as np
  8. from pymysql.cursors import DictCursor
  9. from tqdm import tqdm
  10. from applications import WeixinSpider, log
  11. from applications.api import similarity_between_title_list
  12. from applications.const import ColdStartTaskConst
  13. from applications.db import DatabaseConnector
  14. from applications.functions import Functions
  15. from applications.utils import get_inner_account_set
  16. from applications.utils import whether_title_sensitive
  17. from config import long_articles_config
  18. spider = WeixinSpider()
  19. functions = Functions()
  20. const = ColdStartTaskConst()


class ArticleAssociationCrawler(object):
    """
    article association crawler task
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        self.inner_account_set = get_inner_account_set()

    def get_seed_url_list(self, biz_date):
        """
        Fetch the list of seed articles: bulk-published articles from the last
        two days whose view count and read rate exceed the configured thresholds.
        """
        sql = f"""
            select gh_id, title, link
            from datastat_sort_strategy
            where date_str > DATE_FORMAT(DATE_SUB('{biz_date}', INTERVAL 2 DAY), '%Y%m%d')
                and view_count > {const.READ_COUNT_THRESHOLD}
                and read_rate > {const.READ_AVG_THRESHOLD}
                and type = {const.BULK_PUBLISH_TYPE}
            order by read_rate desc
            limit {const.SEED_ARTICLE_LIMIT_NUM};
        """
        seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
        return seed_article_list

    def get_level_up_title_list(self):
        """
        Fetch the titles of promoted ("level-up") articles.
        status = 1: source tracing for the article is complete
        deleted = 0: the article is active (not deleted)
        level = 'autoArticlePoolLevel1': headline (first-position) articles
        """
        sql = """
            select distinct title
            from article_pool_promotion_source
            where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0;
        """
        mysql_response = self.db_client.fetch(query=sql)
        title_list = [i[0] for i in mysql_response]
        return title_list

    def get_recommend_url_list_with_depth(
        self, seed_url, source_title, source_account, base_title_list, depth=1
    ):
        """
        Recursively crawl the recommended ("related") articles of a seed article.
        @param seed_url: good url from datastat_sort_strategy
        @param source_title: title of the source article
        @param source_account: account (gh_id) of the source article
        @param base_title_list: level-up titles used as the relevance baseline
        @param depth: current association depth
        """
        if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH:
            return

        res = spider.get_recommend_articles(content_link=seed_url)
        related_articles = res["data"]["data"]["list"]
        if related_articles:
            title_list = [i["title"] for i in related_articles]
            similarity_array = similarity_between_title_list(
                title_list, base_title_list
            )

            recommend_articles = []
            for index, score_list in enumerate(similarity_array):
                sorted_score_list = sorted(score_list)
                # percentile of this title's similarity scores against the baseline titles
                percent_threshold_score = np.percentile(
                    sorted_score_list, const.PERCENT_THRESHOLD
                )
                if percent_threshold_score < const.CORRELATION_THRESHOLD:
                    continue
                else:
                    article_obj = related_articles[index]
                    article_obj["score"] = percent_threshold_score
                    recommend_articles.append(article_obj)

            recommend_process_bar = tqdm(
                recommend_articles, desc="save recommend articles"
            )
            for article in recommend_process_bar:
                obj = {
                    "title": article["title"],
                    "url": article["url"],
                    "gh_id": article["username"],
                    "index": article["idx"],
                    "send_time": article["send_time"],
                    "read_cnt": article["read_num"],
                    "depth": depth,
                    "source_article_title": source_title,
                    "source_account": source_account,
                }
                self.insert_recommend_article(obj)
                recommend_process_bar.set_postfix(
                    {"title": article["title"], "depth": depth}
                )
                self.get_recommend_url_list_with_depth(
                    seed_url=obj["url"],
                    source_title=obj["title"],
                    source_account=obj["gh_id"],
                    base_title_list=base_title_list,
                    depth=depth + 1,
                )
        else:
            return
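
    # Worked example of the relevance filter above (illustrative numbers only;
    # the real PERCENT_THRESHOLD / CORRELATION_THRESHOLD values come from
    # ColdStartTaskConst and are not shown here): suppose a candidate title scores
    # [0.12, 0.35, 0.41, 0.78] against four baseline titles, PERCENT_THRESHOLD = 95
    # and CORRELATION_THRESHOLD = 0.5. Then np.percentile(sorted(scores), 95) ≈ 0.72,
    # which is >= 0.5, so the candidate is kept; a title whose 95th-percentile score
    # falls below 0.5 would be skipped.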

    def insert_recommend_article(self, obj):
        """
        insert recommend article
        """
        # skip articles published by internal accounts
        if obj["gh_id"] in self.inner_account_set:
            return

        # skip articles whose title already exists in the crawler meta table
        title = obj["title"]
        select_sql = "select article_id from crawler_meta_article where title = %s;"
        res = self.db_client.fetch(query=select_sql, params=(title,))
        if res:
            return

        # flag sensitive titles
        title_sensitivity = (
            const.TITLE_SENSITIVE
            if whether_title_sensitive(title)
            else const.TITLE_NOT_SENSITIVE
        )

        # insert the article
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, article_index, title, link,
             read_cnt, publish_time, crawler_time, status, unique_index,
             source_article_title, source_account, title_sensitivity)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "weixin",
                "recommend",
                "article_association",
                obj["gh_id"],
                obj["index"],
                obj["title"],
                obj["url"],
                obj["read_cnt"],
                obj["send_time"],
                int(time.time()),
                const.DEFAULT_ARTICLE_STATUS,
                functions.generateGzhId(obj["url"]),
                obj["source_article_title"],
                obj["source_account"],
                title_sensitivity,
            ),
        )

    def deal(self, biz_date=None):
        """
        class entrance
        :param biz_date: business date in '%Y-%m-%d' format; defaults to today
        """
        if biz_date is None:
            biz_date = datetime.today().strftime("%Y-%m-%d")

        seed_article_list = self.get_seed_url_list(biz_date)
        deal_bar = tqdm(seed_article_list, desc="article association crawler")
        base_title_list = self.get_level_up_title_list()
        for article in deal_bar:
            try:
                self.get_recommend_url_list_with_depth(
                    seed_url=article["link"],
                    source_title=article["title"],
                    source_account=article["gh_id"],
                    base_title_list=base_title_list,
                )
                deal_bar.set_postfix({"article_title": article["title"]})
            except Exception as e:
                log(
                    task="article_association_crawler",
                    function="deal",
                    message=f"article association crawler error, article title: {article['title']}, error: {e}",
                    data={"article": article, "traceback": traceback.format_exc()},
                )
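

# Minimal usage sketch (an assumption, not part of the original module): the task is
# presumably triggered by a scheduler that instantiates the crawler and calls deal().
# The commented-out backfill date below is a hypothetical example value.
if __name__ == "__main__":
    article_association_crawler = ArticleAssociationCrawler()
    # run for today's business date; pass e.g. biz_date="2024-01-01" to backfill a day
    article_association_crawler.deal()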