publishArticleAssociationArticles.py

  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. from applications import bot
  6. from applications.db import DatabaseConnector
  7. from config import long_articles_config
  8. from coldStartTasks.publish.basic import *
  9. const = ColdStartTaskConst()

class ArticleAssociationPublish(object):
    """
    publish i2i articles
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def filter_articles_before_create_plan(self, article_df: DataFrame) -> DataFrame:
        """
        filter articles before creating the crawler plan
        """
        total_length = article_df.shape[0]

        # filter by publish status
        filter_df = filter_by_status(article_df)
        filter_length0 = filter_df.shape[0]

        # filter by sensitive words
        filter_df = filter_by_sensitive_words(filter_df)
        filter_length1 = filter_df.shape[0]

        # filter by title length
        filter_df = filter_by_title_length(filter_df)
        filter_length2 = filter_df.shape[0]

        # report how many articles each filter removed
        bot(
            title="Article association task: starting to create crawler plan",
            detail={
                "total articles": total_length,
                "publish status filter": "filtered: {}, remaining: {}".format(total_length - filter_length0, filter_length0),
                "sensitive word filter": "filtered: {}, remaining: {}".format(filter_length0 - filter_length1, filter_length1),
                "title length filter": "filtered: {}, remaining: {}".format(filter_length1 - filter_length2, filter_length2)
            },
            mention=False
        )

        return filter_df

    def deal(self):
        """
        class entrance
        """
        # update the status of articles that have already been published
        update_published_articles_status(db_client=self.db_client)

        # get candidate articles from the meta table
        article_dataframe = get_article_from_meta_table(
            db_client=self.db_client, category='article_association', platform='weixin'
        )

        # filter articles
        filter_dataframe = self.filter_articles_before_create_plan(article_dataframe)

        # create crawler plan from the remaining article links
        url_list = filter_dataframe['link'].values.tolist()
        if url_list:
            create_crawler_plan(
                db_client=self.db_client, url_list=url_list, plan_tag='article_association', platform='weixin'
            )

            # mark these articles as publishing so they are not picked up again
            # (assumes this status helper is exposed by coldStartTasks.publish.basic)
            article_id_list = filter_dataframe['article_id'].values.tolist()
            change_article_status_while_publishing(db_client=self.db_client, article_id_list=article_id_list)
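

# A minimal usage sketch, assuming the module is executed directly
# (e.g. by a scheduler or cron job); it only uses the class and its
# deal() entry point defined above.
if __name__ == "__main__":
    ArticleAssociationPublish().deal()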