exit_article_with_title.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. """
  2. @author: luojunhui
  3. """
  4. import traceback
  5. import pandas as pd
  6. from applications import PQMySQL, longArticlesMySQL, bot, log
  7. from applications.aiditApi import get_generated_article_list
  8. def get_level_up_articles() -> set:
  9. """
  10. :return:
  11. """
  12. generate_pool_ids = [
  13. "20240804003153130851174",
  14. "20240802171417146947657",
  15. "20240802143345289374071",
  16. ]
  17. good_title_set = set()
  18. for pool_id in generate_pool_ids:
  19. articles = get_generated_article_list(pool_id)
  20. titles = [article[1] for article in articles]
  21. good_title_set.update(titles)
  22. return good_title_set
  23. class ArticleTitleStatusManager(object):
  24. """
  25. 文章退场表格维护
  26. """
  27. def __init__(self):
  28. self.INIT_STATUS = 0
  29. self.pq_client = None
  30. self.lam_client = None
  31. def init_database(self) -> bool:
  32. """
  33. 初始化数据库
  34. :return:
  35. """
  36. try:
  37. self.pq_client = PQMySQL()
  38. except Exception as e:
  39. bot(
  40. title="文章退场管理任务,数据库连接失败",
  41. detail={
  42. "e": str(e),
  43. "error_msg": traceback.format_exc(),
  44. "server": "old server"
  45. }
  46. )
  47. return False
  48. try:
  49. self.lam_client = longArticlesMySQL()
  50. except Exception as e:
  51. bot(
  52. title="文章退场管理任务,数据库连接失败",
  53. detail={
  54. "e": str(e),
  55. "error_msg": traceback.format_exc(),
  56. "server": "new server"
  57. }
  58. )
  59. return True
  60. def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
  61. """
  62. 找出质量很差的文章标题,将该标题设置为退场状态
  63. :return:
  64. """
  65. sql = f"""
  66. SELECT
  67. title, max(read_rate) as max_rate, count(1) as title_count
  68. FROM
  69. datastat_sort_strategy
  70. WHERE position > 2 and fans > 10000
  71. GROUP BY title
  72. HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
  73. """
  74. articles = self.lam_client.select(sql)
  75. return [i[0] for i in articles]
  76. def save_titles(self, title_list, status) -> int:
  77. """
  78. 修改标题状态
  79. :param status:
  80. :param title_list:
  81. :return: None
  82. """
  83. fail_list = []
  84. insert_count = 0
  85. for title in title_list:
  86. insert_sql = f"""
  87. INSERT INTO cold_start_title_pool
  88. (title, status)
  89. values
  90. (%s, %s)
  91. """
  92. try:
  93. self.lam_client.update(
  94. sql=insert_sql,
  95. params=(title, status)
  96. )
  97. insert_count += 1
  98. except Exception as e:
  99. update_sql = f"""
  100. UPDATE cold_start_title_pool
  101. SET status = %s
  102. where title = %s and status = %s;
  103. """
  104. try:
  105. self.lam_client.update(
  106. sql=update_sql,
  107. params=(status, title, self.INIT_STATUS)
  108. )
  109. except Exception as e:
  110. error_msg = traceback.format_exc()
  111. log(
  112. task="article_exit_with_title",
  113. function="save_titles",
  114. status="fail",
  115. data={
  116. "e": str(e),
  117. "error_msg": error_msg,
  118. }
  119. )
  120. fail_list.append(title)
  121. if fail_list:
  122. bot(
  123. title="冷启动文章标题退场,sql操作失败",
  124. detail=fail_list
  125. )
  126. return -1
  127. else:
  128. return insert_count
  129. def main():
  130. """
  131. main function
  132. :return:
  133. """
  134. UP_LEVEL_STATUS = 1
  135. ARTICLE_EXIT_STATUS = -1
  136. READ_TIMES_ON_AVG_THRESHOLD = 0.5
  137. DISCOVERY_TIMES_THRESHOLD = 10
  138. article_title_manager = ArticleTitleStatusManager()
  139. article_title_manager.init_database()
  140. # 处理晋级标题
  141. up_level_title = get_level_up_articles()
  142. up_level_success_count = article_title_manager.save_titles(
  143. title_list=up_level_title,
  144. status=UP_LEVEL_STATUS
  145. )
  146. # 处理退场标题
  147. exit_article_list = article_title_manager.get_bad_articles(
  148. read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
  149. discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
  150. )
  151. exit_success_count = article_title_manager.save_titles(
  152. title_list=exit_article_list,
  153. status=ARTICLE_EXIT_STATUS)
  154. bot(
  155. title="冷启动文章晋级/退场完成",
  156. detail={
  157. "晋级文章数量": up_level_success_count,
  158. "退场文章数量": exit_success_count,
  159. "阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
  160. "探索次数阈值": DISCOVERY_TIMES_THRESHOLD
  161. },
  162. mention=False
  163. )
# Script entry point: run the promotion/exit job only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()