crawler_channel_account_videos.py

  1. """
  2. @author: luojunhui
  3. @tool: pycharm && deepseek
  4. """
  5. import json
  6. import time
  7. import traceback
  8. from applications import log
  9. from applications.db import DatabaseConnector
  10. from applications.utils import download_sph_video
  11. from applications.utils import str_to_md5
  12. from applications.utils import upload_to_oss
  13. from config import long_articles_config
  14. from coldStartTasks.crawler.channels import get_channel_account_videos
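
# 0 in the source_account column marks a video crawled without a linked
# source account (inferred from how NO_SOURCE_ACCOUNT is used below).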
NO_SOURCE_ACCOUNT = 0


class CrawlerChannelAccountVideos:
    """
    Crawler for channel account videos.
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()
        self.success_crawler_video_count = 0

    def get_channel_account_list(self):
        """
        Get the channel account list from the database.
        """
        # TODO: not implemented yet; currently returns None.
        return

    def crawler_each_account(self, channel_account_id: str, channel_account_name: str):
        """
        Crawl the videos of a single channel account.
        """
        response = get_channel_account_videos(channel_account_id)
        if response['ret'] == 200:
            response_data = response['data']
            last_buffer = response_data['lastBuffer']
            continue_flag = response_data['continueFlag']
            video_list = response_data['object']
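            # NOTE: lastBuffer / continueFlag are pagination cursors from the
            # API; this version fetches a single page and does not loop on
            # continue_flag to request the next one.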
            for video in video_list[:1]:  # only the first video on the page is processed
                video_id = str(video['id'])
                account_name = video['nickname']
                object_desc = video['objectDesc']
                publish_timestamp = video['createtime']
                title = object_desc['description']
                media = object_desc['media'][0]
                url = media['Url']
                decode_key = media['decodeKey']
                url_token = media['urlToken']
                download_url = url + url_token
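                # The media URL is only fetchable with its token appended, and
                # the downloaded payload is decrypted with decodeKey.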
                try:
                    # download and decrypt the video, then push it to OSS
                    decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
                    oss_path = upload_to_oss(decrypt_path)
                    insert_sql = """
                        insert into publish_single_video_source
                            (content_trace_id, article_title, out_account_id, out_account_name,
                             video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5,
                             platform, source_account)
                        values
                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                    """
                    try:
                        self.db_client.save(
                            query=insert_sql,
                            params=(
                                "video{}".format(str_to_md5(video_id)),
                                title,
                                channel_account_id,
                                account_name,
                                oss_path,
                                publish_timestamp,
                                int(time.time()),
                                video_id,
                                "sph",
                                NO_SOURCE_ACCOUNT,
                            ),
                        )
                        self.success_crawler_video_count += 1
                    except Exception as e:
                        log(
                            task="baidu_video_crawler",
                            function="save_each_video",
                            message="save video failed",
                            data={
                                "error": str(e),
                                "traceback": traceback.format_exc(),
                                "video_id": video_id,
                                "oss_path": oss_path,
                            },
                        )
                except Exception as e:
                    print("download video error:", e)
        else:
            print(f"crawler channel account {channel_account_name} videos failed")
        return
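

# Minimal usage sketch: the account id and name below are placeholders, since
# get_channel_account_list is still a stub and real accounts would normally
# come from the database.
if __name__ == "__main__":
    task = CrawlerChannelAccountVideos()
    task.crawler_each_account(
        channel_account_id="example_channel_account_id",  # placeholder
        channel_account_name="example_account",           # placeholder
    )
    print(f"successfully crawled {task.success_crawler_video_count} video(s)")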