""" @author: luojunhui """ from pandas import DataFrame from applications import bot from applications.const import ColdStartTaskConst from applications.db import DatabaseConnector from config import long_articles_config from coldStartTasks.publish.basic import filter_by_status from coldStartTasks.publish.basic import filter_by_sensitive_words from coldStartTasks.publish.basic import filter_by_title_length from coldStartTasks.publish.basic import update_published_articles_status from coldStartTasks.publish.basic import get_article_from_meta_table from coldStartTasks.publish.basic import update_article_status_after_publishing from coldStartTasks.publish.basic import create_crawler_plan from coldStartTasks.publish.basic import insert_into_article_crawler_plan from coldStartTasks.publish.basic import bind_to_generate_plan const = ColdStartTaskConst() def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame: """ filter articles before create plan """ total_length = article_df.shape[0] # filter by status filter_df = filter_by_status(article_df) filter_length0 = filter_df.shape[0] # filter by sensitive words filter_df = filter_by_sensitive_words(filter_df) filter_length1 = filter_df.shape[0] # filter by title length filter_df = filter_by_title_length(filter_df) filter_length2 = filter_df.shape[0] bot( title="文章联想任务,开始创建抓取计划", detail={ "文章总数": total_length, "发布状态过滤": "过滤: {}, 剩余: {}".format( total_length - filter_length0, filter_length0 ), "敏感词过滤": "过滤: {}, 剩余: {}".format( filter_length0 - filter_length1, filter_length1 ), "标题长度过滤": "过滤: {}, 剩余: {}".format( filter_length1 - filter_length2, filter_length2 ), }, mention=False, ) return filter_df class ArticleAssociationPublish(object): """ publish i2i articles """ def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() def deal(self): """ class entrance """ # update published articles update_published_articles_status(db_client=self.db_client) # get data from meta table article_dataframe = get_article_from_meta_table( db_client=self.db_client, category="article_association", platform="weixin" ) # fileter articles filter_dataframe = filter_articles_before_create_plan(article_dataframe) # create crawler plan url_list = filter_dataframe["link"].values.tolist() if url_list: crawler_plan_id, crawler_plan_name, create_timestamp = create_crawler_plan( url_list=url_list, plan_tag="article_association", platform="weixin" ) # insert crawler plan insert_into_article_crawler_plan( db_client=self.db_client, crawler_plan_id=crawler_plan_id, crawler_plan_name=crawler_plan_name, create_timestamp=create_timestamp, ) # bind to generate plan bind_to_generate_plan( category="article_association", crawler_plan_id=crawler_plan_id, crawler_plan_name=crawler_plan_name, platform="weixin", ) # update status article_id_list = filter_dataframe["article_id"].values.tolist() update_article_status_after_publishing( db_client=self.db_client, article_id_list=article_id_list ) bot( title="文章联想任务,创建抓取计划成功", detail={ "抓取计划id": crawler_plan_id, "抓取计划名称": crawler_plan_name, "抓取条数": len(url_list), "冷启动类型": "article_association", }, mention=False, )