publishArticleAssociationArticles.py

  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. from applications import bot
  6. from applications.db import DatabaseConnector
  7. from config import long_articles_config
  8. from coldStartTasks.publish.basic import *
  9. const = ColdStartTaskConst()

class ArticleAssociationPublish(object):
    """
    publish i2i articles
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def filter_articles_before_create_plan(self, article_df: DataFrame) -> DataFrame:
        """
        filter articles before creating the crawler plan
        """
        total_length = article_df.shape[0]

        # filter by publish status
        filter_df = filter_by_status(article_df)
        filter_length0 = filter_df.shape[0]

        # filter by sensitive words
        filter_df = filter_by_sensitive_words(filter_df)
        filter_length1 = filter_df.shape[0]

        # filter by title length
        filter_df = filter_by_title_length(filter_df)
        filter_length2 = filter_df.shape[0]

        # report how many articles each filter removed
        bot(
            title="Article association task: starting to create crawler plan",
            detail={
                "total articles": total_length,
                "publish status filter": "filtered: {}, remaining: {}".format(total_length - filter_length0, filter_length0),
                "sensitive word filter": "filtered: {}, remaining: {}".format(filter_length0 - filter_length1, filter_length1),
                "title length filter": "filtered: {}, remaining: {}".format(filter_length1 - filter_length2, filter_length2)
            },
            mention=False
        )

        return filter_df

    def deal(self):
        """
        class entrance
        """
        # update the status of articles that have already been published
        update_published_articles_status(db_client=self.db_client)

        # get candidate articles from the meta table
        article_dataframe = get_article_from_meta_table(
            db_client=self.db_client, category='article_association', platform='weixin'
        )

        # filter articles
        filter_dataframe = self.filter_articles_before_create_plan(article_dataframe)

        # create crawler plan from the remaining article links
        url_list = filter_dataframe['link'].values.tolist()
        if url_list:
            create_crawler_plan(
                db_client=self.db_client, url_list=url_list, plan_tag='article_association', platform='weixin'
            )

            # mark these articles as publishing so they are not picked up again
            # (assumes this status helper is exposed by coldStartTasks.publish.basic)
            article_id_list = filter_dataframe['article_id'].values.tolist()
            change_article_status_while_publishing(db_client=self.db_client, article_id_list=article_id_list)
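

# A minimal usage sketch, assuming the module is executed directly
# (e.g. by a scheduler or cron job); it only uses the class and its
# deal() entry point defined above.
if __name__ == "__main__":
    ArticleAssociationPublish().deal()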