123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- """
- @author: luojunhui
- """
- import json
- import traceback
- from datetime import datetime, timedelta
- from applications import PQMySQL, longArticlesMySQL, bot, log
- from applications.aiditApi import get_generated_article_list
- from config import apolloConfig
# Apollo configuration client, created once when the module is imported.
config = apolloConfig()

# The exit thresholds are stored as a JSON string under the
# "article_exit_threshold" key; main() reads its "strategy_1" and
# "strategy_2" entries.
article_exit_threshold = json.loads(
    config.getConfigValue("article_exit_threshold")
)
def get_level_up_articles() -> set:
    """
    Collect the titles of every article in the hard-coded generate pools.

    Each pool is fetched with ``get_generated_article_list`` and the element
    at index 1 of every returned row is taken as the title.

    :return: set of titles gathered from all pools (duplicates collapsed)
    """
    promoted_pool_ids = (
        "20240804003153130851174",
        "20240802171417146947657",
        "20240802143345289374071",
    )
    return {
        row[1]
        for pool_id in promoted_pool_ids
        for row in get_generated_article_list(pool_id)
    }
class ArticleTitleStatusManager(object):
    """
    Maintain the article exit table (``cold_start_title_pool``).

    The manager reads publishing statistics from ``datastat_sort_strategy``
    and records a status per title in ``cold_start_title_pool``.
    """

    # Status value matched by the fallback UPDATE in save_titles: only rows
    # whose status still equals this value are overwritten.
    INIT_STATUS = 0

    def __init__(self):
        # Both clients are created lazily by init_database().
        self.pq_client = None
        self.lam_client = None

    @staticmethod
    def _connect(client_factory, server):
        """
        Build one database client, alerting through ``bot`` on failure.

        :param client_factory: zero-argument callable returning a client
        :param server: label included in the alert ("old server"/"new server")
        :return: the client, or None when construction raised
        """
        try:
            return client_factory()
        except Exception as e:
            bot(
                title="文章退场管理任务,数据库连接失败",
                detail={
                    "e": str(e),
                    "error_msg": traceback.format_exc(),
                    "server": server
                }
            )
            return None

    def init_database(self) -> bool:
        """
        Initialise both database clients.

        The second connection is only attempted when the first succeeded.

        :return: True when both clients were created, False as soon as one
            of them fails (an alert has been sent in that case)
        """
        self.pq_client = self._connect(PQMySQL, "old server")
        if self.pq_client is None:
            return False
        self.lam_client = self._connect(longArticlesMySQL, "new server")
        # A failed second connection must be reported as a failure as well;
        # falling through to True would leave callers with lam_client = None.
        return self.lam_client is not None

    def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
        """
        Find poorly performing titles that should be set to the exit status.

        A title qualifies when, among rows with position > 2 and
        fans > 10000, it appears at least ``discovery_times_threshold`` times
        and its highest ``read_rate`` is below ``read_times_on_avg_threshold``.

        :param read_times_on_avg_threshold: upper bound (exclusive) on max read_rate
        :param discovery_times_threshold: minimum number of rows for the title
        :return: list of matching titles
        """
        # NOTE(review): thresholds are interpolated into the SQL text; they
        # come from configuration, so keep them numeric.
        sql = f"""
            SELECT
                title, max(read_rate) as max_rate, count(1) as title_count
            FROM
                datastat_sort_strategy
            WHERE position > 2 and fans > 10000
            GROUP BY title
            HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
        """
        articles = self.lam_client.select(sql)
        return [row[0] for row in articles]

    def get_bad_articles_v2(self, publish_date_threshold, discovery_times_threshold) -> list[str]:
        """
        Find titles first published before a cut-off date and published often.

        A title qualifies when, among rows with position > 2 and
        fans > 10000, it appears at least ``discovery_times_threshold`` times
        and its earliest ``date_str`` is before ``publish_date_threshold``.

        :param publish_date_threshold: cut-off date; main() passes a
            ``%Y%m%d`` formatted string
        :param discovery_times_threshold: minimum number of publications
        :return: list of matching titles
        """
        # NOTE(review): the date threshold is interpolated unquoted, so it is
        # compared as a numeric literal - confirm against the date_str column type.
        sql = f"""
            SELECT
                title, count(1) as title_count, min(date_str) as min_date
            FROM
                datastat_sort_strategy
            WHERE position > 2 and fans > 10000
            GROUP BY title
            HAVING title_count >= {discovery_times_threshold} and min_date < {publish_date_threshold};
        """
        articles = self.lam_client.select(sql)
        return [row[0] for row in articles]

    def save_titles(self, title_list, status) -> int:
        """
        Store ``status`` for every title in ``title_list``.

        Each title is first inserted; when the insert raises, the existing
        row is updated instead, but only if it is still in ``INIT_STATUS``.
        Titles for which both statements raise are logged and reported
        through ``bot``.

        :param title_list: iterable of titles
        :param status: status value to store
        :return: number of titles inserted, or -1 when at least one title
            failed both the insert and the update
        """
        # Both statements are loop-invariant, so build them once.
        insert_sql = """
            INSERT INTO cold_start_title_pool
                (title, status)
            values
                (%s, %s)
        """
        update_sql = """
            UPDATE cold_start_title_pool
            SET status = %s
            where title = %s and status = %s;
        """
        fail_list = []
        insert_count = 0
        for title in title_list:
            try:
                self.lam_client.update(
                    sql=insert_sql,
                    params=(title, status)
                )
            except Exception:
                # Presumably the title already exists (TODO confirm the table
                # has a unique key on title); fall back to updating the row.
                try:
                    self.lam_client.update(
                        sql=update_sql,
                        params=(status, title, self.INIT_STATUS)
                    )
                except Exception as e:
                    error_msg = traceback.format_exc()
                    log(
                        task="article_exit_with_title",
                        function="save_titles",
                        status="fail",
                        data={
                            "e": str(e),
                            "error_msg": error_msg,
                        }
                    )
                    fail_list.append(title)
            else:
                insert_count += 1

        if fail_list:
            bot(
                title="冷启动文章标题退场,sql操作失败",
                detail=fail_list
            )
            return -1
        return insert_count
def main():
    """
    Run the cold-start title promotion / exit job once.

    Steps:
      1. read the thresholds of both exit strategies from
         ``article_exit_threshold``;
      2. connect to the databases, aborting when that fails;
      3. store the promoted titles, then the exit titles of strategy 1 and
         strategy 2;
      4. report the counts and thresholds through ``bot``.

    The reported counts are the return values of ``save_titles``, so a value
    of -1 means at least one title of that batch could not be saved.

    :return: None
    """
    UP_LEVEL_STATUS = 1
    ARTICLE_EXIT_STATUS = -1

    # Strategy 1: explored often enough but read rate stays too low.
    read_times_on_avg_threshold = article_exit_threshold['strategy_1']['read_times_on_avg']
    explore_times_threshold = article_exit_threshold['strategy_1']['explore_times_threshold']

    # Strategy 2: first published long ago and published many times.
    publish_times_threshold = article_exit_threshold['strategy_2']['publish_times_threshold']
    days_threshold = article_exit_threshold['strategy_2']['days_threshold']
    first_publish_date_threshold = (datetime.now() - timedelta(days=days_threshold)).strftime('%Y%m%d')

    article_title_manager = ArticleTitleStatusManager()
    if not article_title_manager.init_database():
        # init_database has already sent the alert; continuing would only
        # call methods on a missing database client.
        return

    # Promoted titles.
    up_level_title = get_level_up_articles()
    up_level_success_count = article_title_manager.save_titles(
        title_list=up_level_title,
        status=UP_LEVEL_STATUS
    )

    # Exit titles, strategy 1.
    exit_article_list = article_title_manager.get_bad_articles(
        read_times_on_avg_threshold=read_times_on_avg_threshold,
        discovery_times_threshold=explore_times_threshold
    )
    exit_success_count = article_title_manager.save_titles(
        title_list=exit_article_list,
        status=ARTICLE_EXIT_STATUS
    )

    # Exit titles, strategy 2.
    exit_article_list_v2 = article_title_manager.get_bad_articles_v2(
        publish_date_threshold=first_publish_date_threshold,
        discovery_times_threshold=publish_times_threshold
    )
    exit_success_count_v2 = article_title_manager.save_titles(
        title_list=exit_article_list_v2,
        status=ARTICLE_EXIT_STATUS
    )

    bot(
        title="冷启动文章晋级/退场完成",
        detail={
            "晋级文章数量": up_level_success_count,
            "策略1:退场文章数量": exit_success_count,
            "策略2:退场文章数量": exit_success_count_v2,
            "策略1:阅读均值倍数阈值": read_times_on_avg_threshold,
            "策略1:探索次数阈值": explore_times_threshold,
            "策略2:发布次数阈值": publish_times_threshold,
            "策略2:发布天数阈值": days_threshold
        },
        mention=False
    )
# Run the job only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|