account_cold_start_daily.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. import traceback
  6. from argparse import ArgumentParser
  7. from applications import longArticlesMySQL, bot
  8. from coldStartTasks.crawler.weixinCategoryCrawler import weixinCategory
  9. from coldStartTasks.publish.publishCategoryArticles import CategoryColdStartTask
  10. from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarityTask
# Categories used by main() when the caller does not supply a category_list.
DEFAULT_CATEGORY_LIST = ['1030-手动挑号', 'account_association']
  12. class AccountColdStartDailyTask(object):
  13. """
  14. 账号冷启动代码
  15. """
  16. def __init__(self):
  17. """
  18. """
  19. self.db_client = None
  20. def init_db(self):
  21. """
  22. 初始化数据库
  23. :return:
  24. """
  25. try:
  26. self.db_client = longArticlesMySQL()
  27. return True
  28. except Exception as e:
  29. bot(
  30. title='账号抓取任务, 冷启动数据库连接失败',
  31. detail={
  32. "error": str(e),
  33. "error_msg": traceback.format_exc()
  34. }
  35. )
  36. return False
  37. def crawler_task(self, category_list, date_str):
  38. """
  39. :return:
  40. """
  41. # 初始化category抓取类
  42. try:
  43. weixin_category_crawler = weixinCategory(db_client=self.db_client)
  44. weixin_category_crawler.deal(category_list=category_list, date_str=date_str)
  45. # 抓取完成之后,给抓取到的标题进行相似度打分
  46. cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
  47. cold_start_title_similarity_task.init_database()
  48. cold_start_title_similarity_task.run(meta_source='article')
  49. bot(
  50. title="账号冷启动任务,抓取完成",
  51. detail={
  52. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  53. "category": category_list
  54. },
  55. mention=False
  56. )
  57. except Exception as e:
  58. bot(
  59. title="账号抓取冷启动任务,抓取失败",
  60. detail={
  61. "error": str(e),
  62. "error_msg": traceback.format_exc()
  63. }
  64. )
  65. def publish_task(self, category_list, article_source):
  66. """
  67. 将账号文章发布到aigc抓取计划,并且绑定生成计划
  68. :param category_list: 文章品类
  69. :param article_source: 文章来源(toutiao or weixin)
  70. :return:
  71. """
  72. try:
  73. weixin_category_publisher = CategoryColdStartTask(db_client=self.db_client)
  74. weixin_category_publisher.do_job(
  75. category_list=category_list,
  76. article_source=article_source
  77. )
  78. bot(
  79. title="账号冷启任务,发布完成",
  80. detail={
  81. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  82. "category": category_list
  83. },
  84. mention=False
  85. )
  86. except Exception as e:
  87. bot(
  88. title="账号发布冷启动任务,发布失败",
  89. detail={
  90. "error": str(e),
  91. "error_msg": traceback.format_exc()
  92. }
  93. )
  94. def main(date_str, category_list=None, article_source=None):
  95. """
  96. main job, use crontab to do job daily
  97. :return:
  98. """
  99. if not category_list:
  100. category_list = DEFAULT_CATEGORY_LIST
  101. if not article_source:
  102. article_source = 'weixin'
  103. task = AccountColdStartDailyTask()
  104. if task.init_db():
  105. if article_source == 'weixin':
  106. task.crawler_task(category_list=category_list, date_str=date_str)
  107. task.publish_task(category_list=category_list, article_source=article_source)
  108. if __name__ == '__main__':
  109. parser = ArgumentParser()
  110. parser.add_argument("--run_date", help="--run_date format: %Y-%m-%d")
  111. args = parser.parse_args()
  112. if args.run_date:
  113. run_date = args.run_date
  114. else:
  115. run_date = datetime.date.today().isoformat()
  116. # 执行微信抓取发布
  117. main(date_str=run_date)
  118. # 执行头条发布
  119. main(
  120. date_str=run_date,
  121. category_list=['history', 'tech', 'finance', 'entertainment'],
  122. article_source='toutiao'
  123. )