"""
@author: luojunhui
"""
from pandas import DataFrame

from applications import bot
from applications.const import ColdStartTaskConst
from applications.db import DatabaseConnector
from config import long_articles_config
from coldStartTasks.publish.basic import (
    filter_by_status,
    filter_by_sensitive_words,
    filter_by_title_length,
    update_published_articles_status,
    get_article_from_meta_table,
    update_article_status_after_publishing,
    create_crawler_plan,
    insert_into_article_crawler_plan,
    bind_to_generate_plan,
)

const = ColdStartTaskConst()
def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame:
    """
    Run the pre-publish filter chain (publish status -> sensitive words ->
    title length) over *article_df* and report per-stage drop counts via bot.

    :param article_df: candidate articles pulled from the meta table
    :return: the DataFrame surviving all three filters
    """
    total = article_df.shape[0]

    # apply the three filters in order, recording the survivor count of each stage
    by_status = filter_by_status(article_df)
    n_status = by_status.shape[0]

    by_sensitive = filter_by_sensitive_words(by_status)
    n_sensitive = by_sensitive.shape[0]

    by_title = filter_by_title_length(by_sensitive)
    n_title = by_title.shape[0]

    def _stage(before: int, after: int) -> str:
        # one-line "filtered / remaining" summary for the bot report
        return "过滤: {}, 剩余: {}".format(before - after, after)

    bot(
        title="文章联想任务,开始创建抓取计划",
        detail={
            "文章总数": total,
            "发布状态过滤": _stage(total, n_status),
            "敏感词过滤": _stage(n_status, n_sensitive),
            "标题长度过滤": _stage(n_sensitive, n_title),
        },
        mention=False,
    )
    return by_title
class ArticleAssociationPublish(object):
    """
    Publish i2i (article-association) cold-start articles: pull candidates
    from the meta table, filter them, and create an aidit crawler plan.
    """

    def __init__(self):
        # open a connection to the long-articles database for the whole run
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def deal(self):
        """
        Task entrance: refresh published-article status, fetch and filter
        association candidates, then create / register / bind a crawler plan
        and mark the chosen articles as published.
        """
        # refresh publish status before selecting candidates
        update_published_articles_status(db_client=self.db_client)

        # candidates from the meta table, restricted to this category/platform
        meta_df = get_article_from_meta_table(
            db_client=self.db_client, category="article_association", platform="weixin"
        )
        candidate_df = filter_articles_before_create_plan(meta_df)

        url_list = candidate_df["link"].values.tolist()
        if not url_list:
            # nothing survived the filters — no plan to create
            return

        plan_id, plan_name, created_at = create_crawler_plan(
            url_list=url_list, plan_tag="article_association", platform="weixin"
        )
        # record the plan locally
        insert_into_article_crawler_plan(
            db_client=self.db_client,
            crawler_plan_id=plan_id,
            crawler_plan_name=plan_name,
            create_timestamp=created_at,
        )
        # attach the crawler plan to the generate plan for this category
        bind_to_generate_plan(
            category="article_association",
            crawler_plan_id=plan_id,
            crawler_plan_name=plan_name,
            platform="weixin",
        )
        # mark the published articles so they are not picked up again
        update_article_status_after_publishing(
            db_client=self.db_client,
            article_id_list=candidate_df["article_id"].values.tolist(),
        )
        bot(
            title="文章联想任务,创建抓取计划成功",
            detail={
                "抓取计划id": plan_id,
                "抓取计划名称": plan_name,
                "抓取条数": len(url_list),
                "冷启动类型": "article_association",
            },
            mention=False,
        )
|