publish_article_association_articles.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. """
  2. @author: luojunhui
  3. """
  4. from pandas import DataFrame
  5. from applications import bot
  6. from applications.const import ColdStartTaskConst
  7. from applications.db import DatabaseConnector
  8. from config import long_articles_config
  9. from coldStartTasks.publish.basic import filter_by_status
  10. from coldStartTasks.publish.basic import filter_by_sensitive_words
  11. from coldStartTasks.publish.basic import filter_by_title_length
  12. from coldStartTasks.publish.basic import update_published_articles_status
  13. from coldStartTasks.publish.basic import get_article_from_meta_table
  14. from coldStartTasks.publish.basic import update_article_status_after_publishing
  15. from coldStartTasks.publish.basic import create_crawler_plan
  16. from coldStartTasks.publish.basic import insert_into_article_crawler_plan
  17. from coldStartTasks.publish.basic import bind_to_generate_plan
# Shared cold-start constants (presumably status codes used by the imported
# basic.* helpers) — TODO(review): not referenced directly in this module, confirm.
const = ColdStartTaskConst()
  19. def filter_articles_before_create_plan(article_df: DataFrame) -> DataFrame:
  20. """
  21. filter articles before create plan
  22. """
  23. total_length = article_df.shape[0]
  24. # filter by status
  25. filter_df = filter_by_status(article_df)
  26. filter_length0 = filter_df.shape[0]
  27. # filter by sensitive words
  28. filter_df = filter_by_sensitive_words(filter_df)
  29. filter_length1 = filter_df.shape[0]
  30. # filter by title length
  31. filter_df = filter_by_title_length(filter_df)
  32. filter_length2 = filter_df.shape[0]
  33. bot(
  34. title="文章联想任务,开始创建抓取计划",
  35. detail={
  36. "文章总数": total_length,
  37. "发布状态过滤": "过滤: {}, 剩余: {}".format(
  38. total_length - filter_length0, filter_length0
  39. ),
  40. "敏感词过滤": "过滤: {}, 剩余: {}".format(
  41. filter_length0 - filter_length1, filter_length1
  42. ),
  43. "标题长度过滤": "过滤: {}, 剩余: {}".format(
  44. filter_length1 - filter_length2, filter_length2
  45. ),
  46. },
  47. mention=False,
  48. )
  49. return filter_df
  50. class ArticleAssociationPublish(object):
  51. """
  52. publish i2i articles
  53. """
  54. def __init__(self):
  55. self.db_client = DatabaseConnector(db_config=long_articles_config)
  56. self.db_client.connect()
  57. def deal(self):
  58. """
  59. class entrance
  60. """
  61. # update published articles
  62. update_published_articles_status(db_client=self.db_client)
  63. # get data from meta table
  64. article_dataframe = get_article_from_meta_table(
  65. db_client=self.db_client, category="article_association", platform="weixin"
  66. )
  67. # fileter articles
  68. filter_dataframe = filter_articles_before_create_plan(article_dataframe)
  69. # create crawler plan
  70. url_list = filter_dataframe["link"].values.tolist()
  71. if url_list:
  72. crawler_plan_id, crawler_plan_name, create_timestamp = create_crawler_plan(
  73. url_list=url_list, plan_tag="article_association", platform="weixin"
  74. )
  75. # insert crawler plan
  76. insert_into_article_crawler_plan(
  77. db_client=self.db_client,
  78. crawler_plan_id=crawler_plan_id,
  79. crawler_plan_name=crawler_plan_name,
  80. create_timestamp=create_timestamp,
  81. )
  82. # bind to generate plan
  83. bind_to_generate_plan(
  84. category="article_association",
  85. crawler_plan_id=crawler_plan_id,
  86. crawler_plan_name=crawler_plan_name,
  87. platform="weixin",
  88. )
  89. # update status
  90. article_id_list = filter_dataframe["article_id"].values.tolist()
  91. update_article_status_after_publishing(
  92. db_client=self.db_client, article_id_list=article_id_list
  93. )
  94. bot(
  95. title="文章联想任务,创建抓取计划成功",
  96. detail={
  97. "抓取计划id": crawler_plan_id,
  98. "抓取计划名称": crawler_plan_name,
  99. "抓取条数": len(url_list),
  100. "冷启动类型": "article_association",
  101. },
  102. mention=False,
  103. )