exit_article_with_title.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import traceback
  6. from datetime import datetime, timedelta
  7. from applications import PQMySQL, longArticlesMySQL, bot, log
  8. from applications.aiditApi import get_generated_article_list
  9. from config import apolloConfig
  10. config = apolloConfig()
  11. article_exit_threshold = json.loads(config.getConfigValue("article_exit_threshold"))
  12. def get_level_up_articles() -> set:
  13. """
  14. :return:
  15. """
  16. generate_pool_ids = [
  17. "20240804003153130851174",
  18. "20240802171417146947657",
  19. "20240802143345289374071",
  20. ]
  21. good_title_set = set()
  22. for pool_id in generate_pool_ids:
  23. articles = get_generated_article_list(pool_id)
  24. titles = [article[1] for article in articles]
  25. good_title_set.update(titles)
  26. return good_title_set
  27. class ArticleTitleStatusManager(object):
  28. """
  29. 文章退场表格维护
  30. """
  31. def __init__(self):
  32. self.INIT_STATUS = 0
  33. self.pq_client = None
  34. self.lam_client = None
  35. def init_database(self) -> bool:
  36. """
  37. 初始化数据库
  38. :return:
  39. """
  40. try:
  41. self.pq_client = PQMySQL()
  42. except Exception as e:
  43. bot(
  44. title="文章退场管理任务,数据库连接失败",
  45. detail={
  46. "e": str(e),
  47. "error_msg": traceback.format_exc(),
  48. "server": "old server"
  49. }
  50. )
  51. return False
  52. try:
  53. self.lam_client = longArticlesMySQL()
  54. except Exception as e:
  55. bot(
  56. title="文章退场管理任务,数据库连接失败",
  57. detail={
  58. "e": str(e),
  59. "error_msg": traceback.format_exc(),
  60. "server": "new server"
  61. }
  62. )
  63. return True
  64. def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
  65. """
  66. 找出质量很差的文章标题,将该标题设置为退场状态
  67. :return:
  68. """
  69. sql = f"""
  70. SELECT
  71. title, max(read_rate) as max_rate, count(1) as title_count
  72. FROM
  73. datastat_sort_strategy
  74. WHERE position > 2 and fans > 10000
  75. GROUP BY title
  76. HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
  77. """
  78. articles = self.lam_client.select(sql)
  79. return [i[0] for i in articles]
  80. def get_bad_articles_v2(self, publish_date_threshold, discovery_times_threshold) -> list[str]:
  81. """
  82. 找出第一次发布在一个月之前,且发布次数大于5次的文章
  83. :param publish_date_threshold: 发布时间戳阈值
  84. :param discovery_times_threshold: 发布次数阈值
  85. :return:
  86. """
  87. sql = f"""
  88. SELECT
  89. title, count(1) as title_count, min(date_str) as min_date
  90. FROM
  91. datastat_sort_strategy
  92. WHERE position > 2 and fans > 10000
  93. GROUP BY title
  94. HAVING title_count >= {discovery_times_threshold} and min_date < {publish_date_threshold};
  95. """
  96. articles = self.lam_client.select(sql)
  97. return [i[0] for i in articles]
  98. def save_titles(self, title_list, status) -> int:
  99. """
  100. 修改标题状态
  101. :param status:
  102. :param title_list:
  103. :return: None
  104. """
  105. fail_list = []
  106. insert_count = 0
  107. for title in title_list:
  108. insert_sql = f"""
  109. INSERT INTO cold_start_title_pool
  110. (title, status)
  111. values
  112. (%s, %s)
  113. """
  114. try:
  115. self.lam_client.update(
  116. sql=insert_sql,
  117. params=(title, status)
  118. )
  119. insert_count += 1
  120. except Exception as e:
  121. update_sql = f"""
  122. UPDATE cold_start_title_pool
  123. SET status = %s
  124. where title = %s and status = %s;
  125. """
  126. try:
  127. self.lam_client.update(
  128. sql=update_sql,
  129. params=(status, title, self.INIT_STATUS)
  130. )
  131. except Exception as e:
  132. error_msg = traceback.format_exc()
  133. log(
  134. task="article_exit_with_title",
  135. function="save_titles",
  136. status="fail",
  137. data={
  138. "e": str(e),
  139. "error_msg": error_msg,
  140. }
  141. )
  142. fail_list.append(title)
  143. if fail_list:
  144. bot(
  145. title="冷启动文章标题退场,sql操作失败",
  146. detail=fail_list
  147. )
  148. return -1
  149. else:
  150. return insert_count
  151. def main():
  152. """
  153. main function
  154. :return:
  155. """
  156. UP_LEVEL_STATUS = 1
  157. ARTICLE_EXIT_STATUS = -1
  158. # 策略一:
  159. read_times_on_avg_threshold = article_exit_threshold['strategy_1']['read_times_on_avg']
  160. explore_times_threshold = article_exit_threshold['strategy_1']['explore_times_threshold']
  161. # 策略二:
  162. publish_times_threshold = article_exit_threshold['strategy_2']['publish_times_threshold']
  163. days_threshold = article_exit_threshold['strategy_2']['days_threshold']
  164. first_publish_date_threshold = (datetime.now() - timedelta(days=days_threshold)).strftime('%Y%m%d')
  165. article_title_manager = ArticleTitleStatusManager()
  166. article_title_manager.init_database()
  167. # 处理晋级标题
  168. up_level_title = get_level_up_articles()
  169. up_level_success_count = article_title_manager.save_titles(
  170. title_list=up_level_title,
  171. status=UP_LEVEL_STATUS
  172. )
  173. # 处理退场标题V1
  174. exit_article_list = article_title_manager.get_bad_articles(
  175. read_times_on_avg_threshold=read_times_on_avg_threshold,
  176. discovery_times_threshold=explore_times_threshold
  177. )
  178. exit_success_count = article_title_manager.save_titles(
  179. title_list=exit_article_list,
  180. status=ARTICLE_EXIT_STATUS)
  181. # 处理退场标题v2
  182. exit_article_list_v2 = article_title_manager.get_bad_articles_v2(
  183. publish_date_threshold=first_publish_date_threshold,
  184. discovery_times_threshold=publish_times_threshold
  185. )
  186. exit_success_count_v2 = article_title_manager.save_titles(
  187. title_list=exit_article_list_v2,
  188. status=ARTICLE_EXIT_STATUS)
  189. bot(
  190. title="冷启动文章晋级/退场完成",
  191. detail={
  192. "晋级文章数量": up_level_success_count,
  193. "策略1:退场文章数量": exit_success_count,
  194. "策略2:退场文章数量": exit_success_count_v2,
  195. "策略1:阅读均值倍数阈值": read_times_on_avg_threshold,
  196. "策略1:探索次数阈值": explore_times_threshold,
  197. "策略2:发布次数阈值": publish_times_threshold,
  198. "策略2:发布天数阈值": days_threshold
  199. },
  200. mention=False
  201. )
  202. if __name__ == '__main__':
  203. main()