""" @author: luojunhui """ import json import traceback from datetime import datetime, timedelta from applications import PQMySQL, longArticlesMySQL, bot, log from applications.aiditApi import get_generated_article_list from config import apolloConfig config = apolloConfig() article_exit_threshold = json.loads(config.getConfigValue("article_exit_threshold")) def get_level_up_articles() -> set: """ :return: """ generate_pool_ids = [ "20240804003153130851174", "20240802171417146947657", "20240802143345289374071", ] good_title_set = set() for pool_id in generate_pool_ids: articles = get_generated_article_list(pool_id) titles = [article[1] for article in articles] good_title_set.update(titles) return good_title_set class ArticleTitleStatusManager(object): """ 文章退场表格维护 """ def __init__(self): self.INIT_STATUS = 0 self.pq_client = None self.lam_client = None def init_database(self) -> bool: """ 初始化数据库 :return: """ try: self.pq_client = PQMySQL() except Exception as e: bot( title="文章退场管理任务,数据库连接失败", detail={ "e": str(e), "error_msg": traceback.format_exc(), "server": "old server" } ) return False try: self.lam_client = longArticlesMySQL() except Exception as e: bot( title="文章退场管理任务,数据库连接失败", detail={ "e": str(e), "error_msg": traceback.format_exc(), "server": "new server" } ) return True def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]: """ 找出质量很差的文章标题,将该标题设置为退场状态 :return: """ sql = f""" SELECT title, max(read_rate) as max_rate, count(1) as title_count FROM datastat_sort_strategy WHERE position > 2 and fans > 10000 GROUP BY title HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold}; """ articles = self.lam_client.select(sql) return [i[0] for i in articles] def get_bad_articles_v2(self, publish_date_threshold, discovery_times_threshold) -> list[str]: """ 找出第一次发布在一个月之前,且发布次数大于5次的文章 :param publish_date_threshold: 发布时间戳阈值 :param discovery_times_threshold: 发布次数阈值 :return: """ sql = f""" SELECT title, count(1) as title_count, min(date_str) as min_date FROM datastat_sort_strategy WHERE position > 2 and fans > 10000 GROUP BY title HAVING title_count >= {discovery_times_threshold} and min_date < {publish_date_threshold}; """ articles = self.lam_client.select(sql) return [i[0] for i in articles] def save_titles(self, title_list, status) -> int: """ 修改标题状态 :param status: :param title_list: :return: None """ fail_list = [] insert_count = 0 for title in title_list: insert_sql = f""" INSERT INTO cold_start_title_pool (title, status) values (%s, %s) """ try: self.lam_client.update( sql=insert_sql, params=(title, status) ) insert_count += 1 except Exception as e: update_sql = f""" UPDATE cold_start_title_pool SET status = %s where title = %s and status = %s; """ try: self.lam_client.update( sql=update_sql, params=(status, title, self.INIT_STATUS) ) except Exception as e: error_msg = traceback.format_exc() log( task="article_exit_with_title", function="save_titles", status="fail", data={ "e": str(e), "error_msg": error_msg, } ) fail_list.append(title) if fail_list: bot( title="冷启动文章标题退场,sql操作失败", detail=fail_list ) return -1 else: return insert_count def main(): """ main function :return: """ UP_LEVEL_STATUS = 1 ARTICLE_EXIT_STATUS = -1 # 策略一: read_times_on_avg_threshold = article_exit_threshold['strategy_1']['read_times_on_avg'] explore_times_threshold = article_exit_threshold['strategy_1']['explore_times_threshold'] # 策略二: publish_times_threshold = article_exit_threshold['strategy_2']['publish_times_threshold'] days_threshold = article_exit_threshold['strategy_2']['days_threshold'] first_publish_date_threshold = (datetime.now() - timedelta(days=days_threshold)).strftime('%Y%m%d') article_title_manager = ArticleTitleStatusManager() article_title_manager.init_database() # 处理晋级标题 up_level_title = get_level_up_articles() up_level_success_count = article_title_manager.save_titles( title_list=up_level_title, status=UP_LEVEL_STATUS ) # 处理退场标题V1 exit_article_list = article_title_manager.get_bad_articles( read_times_on_avg_threshold=read_times_on_avg_threshold, discovery_times_threshold=explore_times_threshold ) exit_success_count = article_title_manager.save_titles( title_list=exit_article_list, status=ARTICLE_EXIT_STATUS) # 处理退场标题v2 exit_article_list_v2 = article_title_manager.get_bad_articles_v2( publish_date_threshold=first_publish_date_threshold, discovery_times_threshold=publish_times_threshold ) exit_success_count_v2 = article_title_manager.save_titles( title_list=exit_article_list_v2, status=ARTICLE_EXIT_STATUS) bot( title="冷启动文章晋级/退场完成", detail={ "晋级文章数量": up_level_success_count, "策略1:退场文章数量": exit_success_count, "策略2:退场文章数量": exit_success_count_v2, "策略1:阅读均值倍数阈值": read_times_on_avg_threshold, "策略1:探索次数阈值": explore_times_threshold, "策略2:发布次数阈值": publish_times_threshold, "策略2:发布天数阈值": days_threshold }, mention=False ) if __name__ == '__main__': main()