sph_crawling_data.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import configparser
  2. import json
  3. import os
  4. import random
  5. import time
  6. import requests
  7. from common import Material, Oss, Common
  8. from common.sql_help import sqlCollect
  9. from data_channel.piaoquan import PQ
  10. from data_channel.shipinhao import SPH
  11. class SphHistory:
  12. @classmethod
  13. def remove_files(cls, video_path_url):
  14. """
  15. 删除指定目录下的所有文件和子目录
  16. """
  17. if os.path.exists(video_path_url) and os.path.isdir(video_path_url):
  18. for root, dirs, files in os.walk(video_path_url):
  19. for file in files:
  20. file_path = os.path.join(root, file)
  21. os.remove(file_path)
  22. for dir in dirs:
  23. dir_path = os.path.join(root, dir)
  24. os.rmdir(dir_path)
  25. @classmethod
  26. def create_folders(cls):
  27. """
  28. 根据标示和任务标示创建目录
  29. """
  30. # video_path_url = "/Users/tzld/Desktop/video_rewriting/path/sph_crawling/"
  31. video_path_url = '/root/video_rewriting/path/sph_crawling/'
  32. if not os.path.exists(video_path_url):
  33. os.makedirs(video_path_url)
  34. return video_path_url
  35. """获取视频号所有内容"""
  36. @classmethod
  37. def sph_data_info(cls):
  38. user_list = cls.get_sph_user()
  39. video_path_url = cls.create_folders()
  40. if user_list == None:
  41. return
  42. for user in user_list:
  43. Common.logger("sph_crawling").info(f"{user}开始获取数据")
  44. account_id = SPH.get_account_id(user)
  45. if account_id == False:
  46. print(f"{account_id}:没有获取到视频account_id,无法抓取数据")
  47. continue
  48. url = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"
  49. last_buffer = ""
  50. try:
  51. while True:
  52. headers = {
  53. 'Content-Type': 'application/json'
  54. }
  55. payload = json.dumps({
  56. "username": account_id,
  57. "last_buffer": last_buffer
  58. })
  59. response = requests.request("POST", url, headers=headers, data=payload)
  60. time.sleep(random.randint(1, 5))
  61. res_json = response.json()
  62. try:
  63. if len(res_json["DownloadAddress"]) == 0 or res_json["DownloadAddress"] == "" or res_json["DownloadAddress"] == None:
  64. break
  65. except:
  66. pass
  67. if "objectId" not in response.text or response.status_code != 200:
  68. break
  69. if len(res_json["UpMasterHomePage"]) == 0:
  70. break
  71. if not res_json["UpMasterHomePage"]:
  72. break
  73. last_buffer = res_json.get('last_buffer')
  74. for obj in res_json["UpMasterHomePage"]:
  75. Common.logger("sph_crawling").info(f"{user}扫描到一条数据")
  76. objectId = obj['objectId']
  77. objectNonceId = obj['objectNonceId']
  78. url = "http://61.48.133.26:30001/GetFinderDownloadAddress"
  79. payload = json.dumps({
  80. "objectId": objectId,
  81. "objectNonceId": objectNonceId
  82. })
  83. headers = {
  84. 'Content-Type': 'text/plain'
  85. }
  86. response = requests.request("POST", url, headers=headers, data=payload)
  87. time.sleep(random.randint(0, 1))
  88. video_obj = response.json()
  89. video_url = video_obj.get('DownloadAddress')
  90. if len(video_url) == 0:
  91. continue
  92. v_id = f"sph/{objectId}"
  93. oss_video_key = Oss.channel_upload_oss(video_url, v_id) # 视频发送OSS
  94. oss_video_key = oss_video_key.get("oss_object_key")
  95. share_cnt = int(obj['forward_count']) # 分享
  96. like_cnt = int(obj['like_count']) # 点赞
  97. video_title = video_obj.get('title').split("\n")[0].split("#")[0]
  98. cover = video_obj.get('thumb_url')
  99. jpg_path = PQ.download_video_jpg(cover, video_path_url, v_id) # 下载视频封面
  100. if os.path.isfile(jpg_path):
  101. oss_jpg_key = Oss.stitching_fm_upload_oss(jpg_path, v_id) # 封面发送OSS
  102. oss_cover_key = oss_jpg_key.get("oss_object_key")
  103. else:
  104. oss_cover_key = ''
  105. Common.logger("sph_crawling").info(f"{user}oss地址:视频{oss_video_key},封面{oss_cover_key}")
  106. create_time = obj['createtime'] # 发布时间
  107. user_name = obj['username'] # 用户名标示
  108. nick_name = obj['nickname'] # 用户名
  109. comment_count = obj['comment_count'] # 评论数
  110. fav_count = obj['fav_count'] # 大拇指点赞数
  111. sqlCollect.sph_data_info('视频号', objectId, video_url, cover, video_title, str(share_cnt), str(like_cnt), oss_video_key, oss_cover_key, nick_name, user_name, comment_count, fav_count, create_time)
  112. Common.logger("sph_crawling").info(f"{nick_name}插入数据成功")
  113. cls.remove_files(video_path_url)
  114. return "完成"
  115. except Exception as e:
  116. Common.logger("sph_crawling").info(f"{user}异常,异常信息{e}")
  117. cls.remove_files(video_path_url)
  118. continue
  119. @classmethod
  120. def get_sph_user(cls):
  121. data = sqlCollect.sph_channel_user_list()
  122. if data == None:
  123. user_list = Material.get_sph_user()
  124. if user_list:
  125. for user in user_list:
  126. sqlCollect.insert_sph_channel_user("视频号", user)
  127. else:
  128. return None
  129. result_list = [item for sublist in data for item in sublist]
  130. return result_list
  131. if __name__ == '__main__':
  132. SphHistory.sph_data_info()