"""
@author: luojunhui

Maintenance task for the cold-start title pool: marks titles that have been
promoted ("level up") and titles that should be retired ("exit").
"""
import traceback

import pandas as pd

from applications import PQMySQL, longArticlesMySQL, bot, log
from applications.aiditApi import get_generated_article_list


def get_level_up_articles() -> set:
    """
    Collect the titles found in the three promotion pools.

    :return: de-duplicated set of titles gathered from pool levels 2, 1 and 0
    """
    pool_level2 = "20240804003153130851174"
    pool_level1 = "20240802171417146947657"
    pool_level0 = "20240802143345289374071"

    good_title_set = set()
    # Pools are queried in the same order as before: level 2, level 1, level 0.
    for pool_id in (pool_level2, pool_level1, pool_level0):
        pool_result = get_generated_article_list(pool_id)
        # Element 1 of every returned row is used as the title
        # (presumably rows are tuples/lists -- verify against aiditApi).
        good_title_set.update(row[1] for row in pool_result)
    return good_title_set


class ArticleExitWithTitle(object):
    """
    Maintains the article exit (retirement) table.
    """

    def __init__(self):
        # Status value an existing row must have for the fallback UPDATE in
        # record_title_list to touch it.
        self.INIT_STATUS = 0
        # Both clients are created lazily by init_database().
        self.pq_client = None
        self.lam_client = None

    def init_database(self) -> bool:
        """
        Create both database clients.

        A bot alert is sent for whichever connection fails.

        :return: True only when both clients were created, otherwise False
        """
        try:
            self.pq_client = PQMySQL()
        except Exception as e:
            bot(
                title="文章退场管理任务,数据库连接失败",
                detail={
                    "e": str(e),
                    "error_msg": traceback.format_exc(),
                    "server": "old server"
                }
            )
            return False

        try:
            self.lam_client = longArticlesMySQL()
        except Exception as e:
            bot(
                title="文章退场管理任务,数据库连接失败",
                detail={
                    "e": str(e),
                    "error_msg": traceback.format_exc(),
                    "server": "new server"
                }
            )
            # Previously this branch fell through to `return True`, reporting
            # success even though lam_client was still None.
            return False

        return True

    def get_discovery_published_articles(self) -> pd.DataFrame:
        """
        Query per-title statistics from ``datastat_sort_strategy``.

        Only rows with ``position > 2`` and ``fans > 10000`` are considered;
        they are grouped by title.

        :return: DataFrame with columns ``title``, ``max_read_times_on_avg``
                 (``max(read_rate)``) and ``articles_count`` (row count)
        """
        sql = """
            SELECT title, max(read_rate), count(1) as title_count
            FROM datastat_sort_strategy
            WHERE position > 2 and fans > 10000
            GROUP BY title;
        """
        articles = self.pq_client.select(sql)
        article_df = pd.DataFrame(
            articles,
            columns=['title', 'max_read_times_on_avg', 'articles_count']
        )
        return article_df

    def bad_article_manager(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
        """
        Find the titles of poorly performing articles that should be retired.

        :param read_times_on_avg_threshold: a title qualifies only when its
            ``max_read_times_on_avg`` is strictly below this value
        :param discovery_times_threshold: a title qualifies only when its
            ``articles_count`` is strictly below this value
        :return: list of titles satisfying both conditions
        """
        article_df = self.get_discovery_published_articles()
        # NOTE(review): the count condition selects titles published FEWER
        # times than the threshold; confirm this direction is intended.
        low_read_rate = article_df['max_read_times_on_avg'] < read_times_on_avg_threshold
        low_publish_count = article_df['articles_count'] < discovery_times_threshold
        target_bad_dataframe = article_df[low_read_rate & low_publish_count]
        return target_bad_dataframe['title'].tolist()

    def record_title_list(self, title_list, status) -> int:
        """
        Persist a status for every title in ``cold_start_title_pool``.

        Each title is first inserted. When the INSERT raises, the method
        falls back to updating the existing row, but only if that row is
        still in ``INIT_STATUS``. Titles for which the UPDATE also raises are
        logged and collected; if any exist, a bot alert is sent.

        :param title_list: iterable of titles to record
        :param status: status value to store for each title
        :return: number of successful INSERTs, or -1 if any title failed
        """
        # The statements are loop-invariant, so build them once.
        insert_sql = """
            INSERT INTO cold_start_title_pool
            (title, status)
            values
            (%s, %s)
        """
        update_sql = """
            UPDATE cold_start_title_pool
            SET status = %s
            where title = %s and status = %s;
        """
        fail_list = []
        insert_count = 0
        for title in title_list:
            try:
                self.lam_client.update(
                    sql=insert_sql,
                    params=(title, status)
                )
                insert_count += 1
                continue
            except Exception as insert_error:
                # Keep the INSERT error under its own name so the fallback's
                # exception cannot shadow it.
                insert_error_msg = str(insert_error)

            try:
                self.lam_client.update(
                    sql=update_sql,
                    params=(status, title, self.INIT_STATUS)
                )
            except Exception as update_error:
                error_msg = traceback.format_exc()
                log(
                    task="article_exit_with_title",
                    function="record_title_list",
                    status="fail",
                    data={
                        "e": str(update_error),
                        "error_msg": error_msg,
                        "insert_error": insert_error_msg,
                    }
                )
                fail_list.append(title)

        if fail_list:
            bot(
                title="冷启动文章标题退场,sql操作失败",
                detail=fail_list
            )
            return -1
        return insert_count


def main():
    """
    Entry point: record promoted titles, then record retired titles, and
    send a summary notification when both steps finished without failures.

    :return: None
    """
    UP_LEVEL_STATUS = 1
    ARTICLE_EXIT_STATUS = -1
    READ_TIMES_ON_AVG_THRESHOLD = 0.5
    DISCOVERY_TIMES_THRESHOLD = 3

    article_title_manager = ArticleExitWithTitle()
    # init_database() already alerts on failure; without both clients the
    # steps below cannot work, so stop here instead of crashing on None.
    if not article_title_manager.init_database():
        return

    # Handle promoted titles.
    up_level_title = get_level_up_articles()
    up_level_success_count = article_title_manager.record_title_list(
        title_list=up_level_title,
        status=UP_LEVEL_STATUS
    )

    # Handle retired titles.
    exit_article_list = article_title_manager.bad_article_manager(
        read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
        discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
    )
    exit_success_count = article_title_manager.record_title_list(
        title_list=exit_article_list,
        status=ARTICLE_EXIT_STATUS
    )

    # record_title_list returns -1 on failure, so both counts must be >= 0.
    if exit_success_count >= 0 and up_level_success_count >= 0:
        # NOTE(review): the second key below looks like a typo in the
        # original message; it is runtime text and is left unchanged.
        bot(
            title="冷启动文章晋级, 退场完成",
            detail={
                "已经晋级文章数量": up_level_success_count,
                "已经退场文章数控": exit_success_count,
                "阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
                "探索次数阈值": DISCOVERY_TIMES_THRESHOLD
            },
            mention=False
        )


if __name__ == '__main__':
    main()