# Source file: account_cold_start_daily.py (4.3 KB)
"""
Daily account cold-start entry point.

@author: luojunhui
"""
  4. import datetime
  5. import traceback
  6. from argparse import ArgumentParser
  7. from applications import longArticlesMySQL, bot
  8. from cold_start.crawler.weixinCategoryCrawler import weixinCategory
  9. from cold_start.publish.publishCategoryArticles import CategoryColdStartTask
  10. from cold_start.filter.title_similarity_task import ColdStartTitleSimilarityTask
  11. DEFAULT_CATEGORY_LIST = ['1030-手动挑号', 'account_association']
  12. class AccountColdStartDailyTask(object):
  13. """
  14. 账号冷启动代码
  15. """
  16. def __init__(self):
  17. """
  18. """
  19. self.db_client = None
  20. def init_db(self):
  21. """
  22. 初始化数据库
  23. :return:
  24. """
  25. try:
  26. self.db_client = longArticlesMySQL()
  27. return True
  28. except Exception as e:
  29. bot(
  30. title='账号抓取任务, 冷启动数据库连接失败',
  31. detail={
  32. "error": str(e),
  33. "error_msg": traceback.format_exc()
  34. }
  35. )
  36. return False
  37. def crawler_task(self, category_list, date_str):
  38. """
  39. :return:
  40. """
  41. # 初始化category抓取类
  42. try:
  43. weixin_category_crawler = weixinCategory(db_client=self.db_client)
  44. weixin_category_crawler.deal(category_list=category_list, date_str=date_str)
  45. # 抓取完成之后,给抓取到的标题进行相似度打分
  46. cold_start_title_similarity_task = ColdStartTitleSimilarityTask()
  47. cold_start_title_similarity_task.init_database()
  48. cold_start_title_similarity_task.run(meta_source='article')
  49. bot(
  50. title="账号冷启动任务,抓取完成",
  51. detail={
  52. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  53. "category": category_list
  54. },
  55. mention=False
  56. )
  57. except Exception as e:
  58. bot(
  59. title="账号抓取冷启动任务,抓取失败",
  60. detail={
  61. "error": str(e),
  62. "error_msg": traceback.format_exc()
  63. }
  64. )
  65. def publish_article_task(self, category_list, article_source):
  66. """
  67. 将账号文章发布到aigc抓取计划,并且绑定生成计划
  68. :param category_list: 文章品类
  69. :param article_source: 文章来源(toutiao or weixin)
  70. :return:
  71. """
  72. try:
  73. weixin_category_publisher = CategoryColdStartTask(db_client=self.db_client)
  74. weixin_category_publisher.do_job(
  75. category_list=category_list,
  76. article_source=article_source
  77. )
  78. bot(
  79. title="账号冷启任务,发布完成",
  80. detail={
  81. "finish_time": datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
  82. "category": category_list
  83. },
  84. mention=False
  85. )
  86. except Exception as e:
  87. bot(
  88. title="账号发布冷启动任务,发布失败",
  89. detail={
  90. "error": str(e),
  91. "error_msg": traceback.format_exc()
  92. }
  93. )
  94. def main(date_str, category_list=None, article_source=None):
  95. """
  96. main job, use crontab to do job daily
  97. :return:
  98. """
  99. if not category_list:
  100. category_list = DEFAULT_CATEGORY_LIST
  101. if not article_source:
  102. article_source = 'weixin'
  103. task = AccountColdStartDailyTask()
  104. if task.init_db():
  105. task.publish_article_task(category_list=category_list, article_source=article_source)
  106. if article_source == 'weixin':
  107. task.crawler_task(category_list=category_list, date_str=date_str)
  108. if __name__ == '__main__':
  109. parser = ArgumentParser()
  110. parser.add_argument("--run_date", help="--run_date format: %Y-%m-%d")
  111. args = parser.parse_args()
  112. if args.run_date:
  113. run_date = args.run_date
  114. else:
  115. run_date = datetime.date.today().isoformat()
  116. # 执行头条发布
  117. main(
  118. date_str=run_date,
  119. category_list=['history', 'tech', 'finance', 'entertainment'],
  120. article_source='toutiao'
  121. )
  122. # 执行微信抓取发布
  123. main(date_str=run_date)