123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- """
- @author: luojunhui
- """
- import datetime
- from applications import bot
- from applications.db import DatabaseConnector
- from config import long_articles_config
- from coldStartTasks.publish.basic import *
- const = ColdStartTaskConst()  # module-level constants holder; ColdStartTaskConst presumably arrives via the star import above (TODO confirm); not referenced by the code visible in this module
class ArticleAssociationPublish(object):
    """
    Publish i2i (article association) articles.

    One call to :meth:`deal` loads candidate articles from the meta table,
    runs them through the filters and, when any links survive, creates a
    crawler plan for them.
    """

    def __init__(self):
        """Build the long-articles database client and open its connection."""
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def filter_articles_before_create_plan(self, article_df: DataFrame) -> DataFrame:
        """
        Filter articles before a crawler plan is created.

        The status, sensitive-word and title-length filters are applied in
        that order, and a per-stage summary (rows removed / rows remaining)
        is sent through ``bot``.

        :param article_df: candidate articles to be filtered
        :return: the rows that remain after all three filters
        """
        total_length = article_df.shape[0]

        # filter by status
        filter_df = filter_by_status(article_df)
        filter_length0 = filter_df.shape[0]

        # filter by sensitive words
        filter_df = filter_by_sensitive_words(filter_df)
        filter_length1 = filter_df.shape[0]

        # filter by title length
        filter_df = filter_by_title_length(filter_df)
        filter_length2 = filter_df.shape[0]

        # each entry reports "removed by this stage, remaining after it"
        summary = "过滤: {}, 剩余: {}"
        bot(
            title="文章联想任务,开始创建抓取计划",
            detail={
                "文章总数": total_length,
                "发布状态过滤": summary.format(total_length - filter_length0, filter_length0),
                "敏感词过滤": summary.format(filter_length0 - filter_length1, filter_length1),
                "标题长度过滤": summary.format(filter_length1 - filter_length2, filter_length2),
            },
            mention=False,
        )
        return filter_df

    def deal(self):
        """
        Class entrance: run one publishing pass.

        Calls ``update_published_articles_status``, reads the
        ``article_association`` / ``weixin`` candidates from the meta table,
        filters them, and, only when at least one link remains, creates the
        crawler plan and then changes the status of the selected articles.
        """
        # update published articles
        update_published_articles_status(db_client=self.db_client)

        # get data from meta table
        article_dataframe = get_article_from_meta_table(
            db_client=self.db_client,
            category='article_association',
            platform='weixin',
        )

        # filter articles
        filter_dataframe = self.filter_articles_before_create_plan(article_dataframe)

        url_list = filter_dataframe['link'].values.tolist()
        if not url_list:
            # nothing survived the filters, so there is no plan to create
            # and no article status to change
            return

        # create crawler plan
        create_crawler_plan(
            db_client=self.db_client,
            url_list=url_list,
            plan_tag='article_association',
            platform='weixin',
        )

        # change article status
        # The ids must come from the same filtered frame the links came from;
        # the previous code read an unassigned name here and raised NameError.
        article_id_list = filter_dataframe['article_id'].values.tolist()
        # NOTE(review): change_article_status_while_publishing is not defined
        # in the visible part of this class -- confirm it exists (mixin or
        # later definition) or swap in the matching module-level helper.
        self.change_article_status_while_publishing(article_id_list=article_id_list)
|