""" @author: luojunhui """ import datetime from applications import bot from applications.db import DatabaseConnector from config import long_articles_config from coldStartTasks.publish.basic import * const = ColdStartTaskConst() class ArticleAssociationPublish(object): """ publish i2i articles """ def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() def filter_articles_before_create_plan(self, article_df: DataFrame) -> DataFrame: """ filter articles before create plan """ total_length = article_df.shape[0] # filter by status filter_df = filter_by_status(article_df) filter_length0 = filter_df.shape[0] # filter by sensitive words filter_df = filter_by_sensitive_words(filter_df) filter_length1 = filter_df.shape[0] # filter by title length filter_df = filter_by_title_length(filter_df) filter_length2 = filter_df.shape[0] bot( title="文章联想任务,开始创建抓取计划", detail={ "文章总数": total_length, "发布状态过滤": "过滤: {}, 剩余: {}".format(total_length - filter_length0, filter_length0), "敏感词过滤": "过滤: {}, 剩余: {}".format(filter_length0 - filter_length1, filter_length1), "标题长度过滤": "过滤: {}, 剩余: {}".format(filter_length1 - filter_length2, filter_length2) }, mention=False ) return filter_df def deal(self): """ class entrance """ # update published articles update_published_articles_status(db_client=self.db_client) # get data from meta table article_dataframe = get_article_from_meta_table(db_client=self.db_client, category='article_association', platform='weixin') # fileter articles filter_dataframe = self.filter_articles_before_create_plan(article_dataframe) # create crawler plan url_list = filter_dataframe['link'].values.tolist() if url_list: # create_crawler_plan create_crawler_plan(db_client=self.db_client, url_list=url_list, plan_tag='article_association', platform='weixin') # change article status article_id_list = filtered_articles_df['article_id'].values.tolist() self.change_article_status_while_publishing(article_id_list=article_id_list)