exit_article_with_title.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. """
  2. @author: luojunhui
  3. """
  4. import traceback
  5. from datetime import datetime, timedelta
  6. from applications import PQMySQL, longArticlesMySQL, bot, log
  7. from applications.aiditApi import get_generated_article_list
  8. def get_level_up_articles() -> set:
  9. """
  10. :return:
  11. """
  12. generate_pool_ids = [
  13. "20240804003153130851174",
  14. "20240802171417146947657",
  15. "20240802143345289374071",
  16. ]
  17. good_title_set = set()
  18. for pool_id in generate_pool_ids:
  19. articles = get_generated_article_list(pool_id)
  20. titles = [article[1] for article in articles]
  21. good_title_set.update(titles)
  22. return good_title_set
  23. class ArticleTitleStatusManager(object):
  24. """
  25. 文章退场表格维护
  26. """
  27. def __init__(self):
  28. self.INIT_STATUS = 0
  29. self.pq_client = None
  30. self.lam_client = None
  31. def init_database(self) -> bool:
  32. """
  33. 初始化数据库
  34. :return:
  35. """
  36. try:
  37. self.pq_client = PQMySQL()
  38. except Exception as e:
  39. bot(
  40. title="文章退场管理任务,数据库连接失败",
  41. detail={
  42. "e": str(e),
  43. "error_msg": traceback.format_exc(),
  44. "server": "old server"
  45. }
  46. )
  47. return False
  48. try:
  49. self.lam_client = longArticlesMySQL()
  50. except Exception as e:
  51. bot(
  52. title="文章退场管理任务,数据库连接失败",
  53. detail={
  54. "e": str(e),
  55. "error_msg": traceback.format_exc(),
  56. "server": "new server"
  57. }
  58. )
  59. return True
  60. def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
  61. """
  62. 找出质量很差的文章标题,将该标题设置为退场状态
  63. :return:
  64. """
  65. sql = f"""
  66. SELECT
  67. title, max(read_rate) as max_rate, count(1) as title_count
  68. FROM
  69. datastat_sort_strategy
  70. WHERE position > 2 and fans > 10000
  71. GROUP BY title
  72. HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
  73. """
  74. articles = self.lam_client.select(sql)
  75. return [i[0] for i in articles]
  76. def get_bad_articles_v2(self, publish_date_threshold, discovery_times_threshold) -> list[str]:
  77. """
  78. 找出第一次发布在一个月之前,且发布次数大于5次的文章
  79. :param publish_date_threshold: 发布时间戳阈值
  80. :param discovery_times_threshold: 发布次数阈值
  81. :return:
  82. """
  83. sql = f"""
  84. SELECT
  85. title, count(1) as title_count, min(date_str) as min_date
  86. FROM
  87. datastat_sort_strategy
  88. WHERE position > 2 and fans > 10000
  89. GROUP BY title
  90. HAVING title_count >= {discovery_times_threshold} and min_date < {publish_date_threshold};
  91. """
  92. articles = self.lam_client.select(sql)
  93. return [i[0] for i in articles]
  94. def save_titles(self, title_list, status) -> int:
  95. """
  96. 修改标题状态
  97. :param status:
  98. :param title_list:
  99. :return: None
  100. """
  101. fail_list = []
  102. insert_count = 0
  103. for title in title_list:
  104. insert_sql = f"""
  105. INSERT INTO cold_start_title_pool
  106. (title, status)
  107. values
  108. (%s, %s)
  109. """
  110. try:
  111. self.lam_client.update(
  112. sql=insert_sql,
  113. params=(title, status)
  114. )
  115. insert_count += 1
  116. except Exception as e:
  117. update_sql = f"""
  118. UPDATE cold_start_title_pool
  119. SET status = %s
  120. where title = %s and status = %s;
  121. """
  122. try:
  123. self.lam_client.update(
  124. sql=update_sql,
  125. params=(status, title, self.INIT_STATUS)
  126. )
  127. except Exception as e:
  128. error_msg = traceback.format_exc()
  129. log(
  130. task="article_exit_with_title",
  131. function="save_titles",
  132. status="fail",
  133. data={
  134. "e": str(e),
  135. "error_msg": error_msg,
  136. }
  137. )
  138. fail_list.append(title)
  139. if fail_list:
  140. bot(
  141. title="冷启动文章标题退场,sql操作失败",
  142. detail=fail_list
  143. )
  144. return -1
  145. else:
  146. return insert_count
  147. def main():
  148. """
  149. main function
  150. :return:
  151. """
  152. UP_LEVEL_STATUS = 1
  153. ARTICLE_EXIT_STATUS = -1
  154. READ_TIMES_ON_AVG_THRESHOLD = 0.5
  155. DISCOVERY_TIMES_THRESHOLD = 10
  156. PUBLISH_TIMES_THRESHOLD = 8
  157. DAYS_THRESHOLD = 30
  158. FIRST_PUBLISH_DATE_THRESHOLD = (datetime.now() - timedelta(days=DAYS_THRESHOLD)).strftime('%Y%m%d')
  159. article_title_manager = ArticleTitleStatusManager()
  160. article_title_manager.init_database()
  161. # 处理晋级标题
  162. up_level_title = get_level_up_articles()
  163. up_level_success_count = article_title_manager.save_titles(
  164. title_list=up_level_title,
  165. status=UP_LEVEL_STATUS
  166. )
  167. # 处理退场标题V1
  168. exit_article_list = article_title_manager.get_bad_articles(
  169. read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
  170. discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
  171. )
  172. exit_success_count = article_title_manager.save_titles(
  173. title_list=exit_article_list,
  174. status=ARTICLE_EXIT_STATUS)
  175. # 处理退场标题v2
  176. exit_article_list_v2 = article_title_manager.get_bad_articles_v2(
  177. publish_date_threshold=FIRST_PUBLISH_DATE_THRESHOLD,
  178. discovery_times_threshold=PUBLISH_TIMES_THRESHOLD
  179. )
  180. exit_success_count_v2 = article_title_manager.save_titles(
  181. title_list=exit_article_list_v2,
  182. status=ARTICLE_EXIT_STATUS)
  183. bot(
  184. title="冷启动文章晋级/退场完成",
  185. detail={
  186. "晋级文章数量": up_level_success_count,
  187. "策略1:退场文章数量": exit_success_count,
  188. "策略2:退场文章数量": exit_success_count_v2,
  189. "策略1:阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
  190. "策略1:探索次数阈值": DISCOVERY_TIMES_THRESHOLD,
  191. "策略2:发布次数阈值": PUBLISH_TIMES_THRESHOLD,
  192. "策略2:发布天数阈值": DAYS_THRESHOLD
  193. },
  194. mention=False
  195. )
  196. if __name__ == '__main__':
  197. main()