"""
@author: luojunhui

Cold-start publishing task for the "article_association" category:
filter candidate articles, create a crawler plan from their links and
bind that plan to a generate plan.
"""

from pandas import DataFrame

from applications import bot
from applications.const import ColdStartTaskConst
from applications.db import DatabaseConnector
from config import long_articles_config

from cold_start.publish.basic import (
    filter_by_status,
    filter_by_sensitive_words,
    filter_by_title_length,
    update_published_articles_status,
    get_article_from_meta_table,
    update_article_status_after_publishing,
    create_crawler_plan,
    insert_into_article_crawler_plan,
    bind_to_generate_plan,
)

const = ColdStartTaskConst()


def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame:
    """
    Run the pre-plan filters over the candidate articles and report the result.

    The filters are applied in a fixed order (status, sensitive words,
    title length). After each one the number of removed and remaining rows
    is recorded, and a single summary is sent through ``bot`` once all
    filters have run.

    :param article_df: candidate articles, one row per article
    :return: the rows that survived every filter
    """
    summary_template = "过滤: {}, 剩余: {}"

    # (report label, filter function) pairs, applied in this exact order
    filter_stages = (
        ("发布状态过滤", filter_by_status),
        ("敏感词过滤", filter_by_sensitive_words),
        ("标题长度过滤", filter_by_title_length),
    )

    previous_count = article_df.shape[0]
    report = {"文章总数": previous_count}

    remaining_df = article_df
    for label, apply_filter in filter_stages:
        remaining_df = apply_filter(remaining_df)
        current_count = remaining_df.shape[0]
        report[label] = summary_template.format(
            previous_count - current_count, current_count
        )
        previous_count = current_count

    bot(
        title="文章联想任务,开始创建抓取计划",
        detail=report,
        mention=False,
    )
    return remaining_df


class ArticleAssociationPublish(object):
    """
    publish i2i articles

    Entry point is :meth:`deal`.
    """

    def __init__(self):
        # open the database connection used by every step of the task
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def deal(self):
        """
        Run the whole publishing task once.

        Steps, in order: refresh the status of published articles, load the
        "article_association" / "weixin" candidates from the meta table,
        filter them, and, when at least one link is left, create a crawler
        plan, record it, bind it to a generate plan, update the article
        statuses and send a success notification through ``bot``.
        Nothing further happens when no link survives the filters.
        """
        # update published articles
        update_published_articles_status(db_client=self.db_client)

        # get data from meta table
        candidates = get_article_from_meta_table(
            db_client=self.db_client,
            category="article_association",
            platform="weixin",
        )

        # filter articles
        publishable = filter_articles_before_create_plan(candidates)

        # create crawler plan (only when there is something to crawl)
        url_list = publishable["link"].values.tolist()
        if not url_list:
            return

        plan_info = create_crawler_plan(
            url_list=url_list, plan_tag="article_association", platform="weixin"
        )
        crawler_plan_id, crawler_plan_name, create_timestamp = plan_info

        # insert crawler plan
        insert_into_article_crawler_plan(
            db_client=self.db_client,
            crawler_plan_id=crawler_plan_id,
            crawler_plan_name=crawler_plan_name,
            create_timestamp=create_timestamp,
        )

        # bind to generate plan
        bind_to_generate_plan(
            category="article_association",
            crawler_plan_id=crawler_plan_id,
            crawler_plan_name=crawler_plan_name,
            platform="weixin",
        )

        # update status
        article_id_list = publishable["article_id"].values.tolist()
        update_article_status_after_publishing(
            db_client=self.db_client, article_id_list=article_id_list
        )

        success_detail = {
            "抓取计划id": crawler_plan_id,
            "抓取计划名称": crawler_plan_name,
            "抓取条数": len(url_list),
            "冷启动类型": "article_association",
        }
        bot(
            title="文章联想任务,创建抓取计划成功",
            detail=success_detail,
            mention=False,
        )