# account_cold_start_daily.py
  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. import traceback
  6. from argparse import ArgumentParser
  7. from applications import longArticlesMySQL, bot
  8. from tasks.crawler_tasks.crawler_articles import CrawlerDailyScrapeAccountArticles
  9. from tasks.crawler_tasks.crawler_articles import CrawlerAssociationAccountArticles
  10. from cold_start.publish.publishCategoryArticles import CategoryColdStartTask
  11. from cold_start.filter.title_similarity_task import ColdStartTitleSimilarityTask
  12. DEFAULT_METHOD_LIST = ['1030-手动挑号', 'account_association']
  13. def crawler_task(method_list, date_str):
  14. """
  15. :return:
  16. """
  17. # 初始化category抓取类
  18. try:
  19. daily_scrape_tasks = CrawlerDailyScrapeAccountArticles()
  20. daily_scrape_tasks.deal(method_list=method_list)
  21. association_scrape_tasks = CrawlerAssociationAccountArticles()
  22. association_scrape_tasks.deal(date_str=date_str)
  23. # 抓取完成之后,给抓取到的标题进行相似度打分
  24. cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
  25. cold_start_title_similarity_task.init_database()
  26. cold_start_title_similarity_task.run(meta_source='article')
  27. bot(
  28. title="账号冷启动任务,抓取完成",
  29. detail={
  30. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  31. "method": method_list
  32. },
  33. mention=False
  34. )
  35. except Exception as e:
  36. bot(
  37. title="账号抓取冷启动任务,抓取失败",
  38. detail={
  39. "error": str(e),
  40. "error_msg": traceback.format_exc()
  41. }
  42. )
  43. class AccountColdStartDailyTask(object):
  44. """
  45. 账号冷启动代码
  46. """
  47. def __init__(self):
  48. """
  49. """
  50. self.db_client = None
  51. def init_db(self):
  52. """
  53. 初始化数据库
  54. :return:
  55. """
  56. try:
  57. self.db_client = longArticlesMySQL()
  58. return True
  59. except Exception as e:
  60. bot(
  61. title='账号抓取任务, 冷启动数据库连接失败',
  62. detail={
  63. "error": str(e),
  64. "error_msg": traceback.format_exc()
  65. }
  66. )
  67. return False
  68. def publish_article_task(self, category_list, article_source):
  69. """
  70. 将账号文章发布到aigc抓取计划,并且绑定生成计划
  71. :param category_list: 文章品类
  72. :param article_source: 文章来源(toutiao or weixin)
  73. :return:
  74. """
  75. try:
  76. weixin_category_publisher = CategoryColdStartTask(db_client=self.db_client)
  77. weixin_category_publisher.do_job(
  78. category_list=category_list,
  79. article_source=article_source
  80. )
  81. bot(
  82. title="账号冷启任务,发布完成",
  83. detail={
  84. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  85. "category": category_list
  86. },
  87. mention=False
  88. )
  89. except Exception as e:
  90. bot(
  91. title="账号发布冷启动任务,发布失败",
  92. detail={
  93. "error": str(e),
  94. "error_msg": traceback.format_exc()
  95. }
  96. )
  97. def main(method_list=None, article_source=None):
  98. """
  99. main job, use crontab to do job daily
  100. :return:
  101. """
  102. if not method_list:
  103. method_list = DEFAULT_METHOD_LIST
  104. if not article_source:
  105. article_source = 'weixin'
  106. task = AccountColdStartDailyTask()
  107. if task.init_db():
  108. task.publish_article_task(category_list=method_list, article_source=article_source)
  109. if __name__ == '__main__':
  110. parser = ArgumentParser()
  111. parser.add_argument("--run_date", help="--run_date format: %Y-%m-%d")
  112. args = parser.parse_args()
  113. if args.run_date:
  114. run_date = args.run_date
  115. else:
  116. run_date = datetime.date.today().isoformat()
  117. # 执行头条发布
  118. main(
  119. method_list=['history', 'tech', 'finance', 'entertainment'],
  120. article_source='toutiao'
  121. )
  122. # 执行微信抓取发布
  123. main()
  124. # 执行抓取
  125. crawler_task(
  126. method_list=DEFAULT_METHOD_LIST, date_str=run_date)