sph_crawling_data.py

import json
import random
import time

import requests

from common import Material, Oss, Common
from common.sql_help import sqlCollect
from data_channel.shipinhao import SPH


class SphHistory:
    """Fetch the complete post history of WeChat Channels (视频号) accounts."""

    @classmethod
    def sph_data_info(cls):
        user_list = cls.get_sph_user()
        if user_list is None:
            return
        for user in user_list:
            Common.logger("sph_crawling").info(f"{user} starting data fetch")
            account_id = SPH.get_account_id(user)
            if account_id is False:
                print(f"{user}: no account_id retrieved, skipping this account")
                continue
            # Separate variable so the download endpoint below never clobbers
            # the paging URL between iterations.
            page_url = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"
            last_buffer = ""
            try:
                while True:
                    headers = {
                        'Content-Type': 'application/json'
                    }
                    payload = json.dumps({
                        "username": account_id,
                        "last_buffer": last_buffer
                    })
                    response = requests.post(page_url, headers=headers, data=payload)
                    time.sleep(random.randint(1, 5))  # pace requests to the gateway
                    res_json = response.json()
                    # An explicitly empty DownloadAddress field marks the end of
                    # the feed; a missing key just means a normal feed page.
                    if "DownloadAddress" in res_json and not res_json["DownloadAddress"]:
                        break
  41. if "objectId" not in response.text or response.status_code != 200:
  42. break
  43. if len(res_json["UpMasterHomePage"]) == 0:
  44. break
  45. if not res_json["UpMasterHomePage"]:
  46. break
  47. last_buffer = res_json.get('last_buffer')
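                    # Feed items carry only identifiers; a second request per item
                    # resolves (objectId, objectNonceId) into a download address.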
                    for obj in res_json["UpMasterHomePage"]:
                        objectId = obj['objectId']
                        objectNonceId = obj['objectNonceId']
                        download_api = "http://61.48.133.26:30001/GetFinderDownloadAddress"
                        payload = json.dumps({
                            "objectId": objectId,
                            "objectNonceId": objectNonceId
                        })
                        headers = {
                            'Content-Type': 'text/plain'
                        }
                        response = requests.post(download_api, headers=headers, data=payload)
                        time.sleep(random.randint(0, 1))
                        video_obj = response.json()
                        video_url = video_obj.get('DownloadAddress')
                        if not video_url:
                            continue
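                        # channel_upload_oss() presumably downloads the remote file
                        # and re-uploads it to OSS; "oss_object_key" is the key its
                        # result dict is observed to carry here.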
                        v_id = f"sph_{objectId}.mp4"
                        c_id = f"sph_{objectId}.jpg"
                        oss_video_path = Oss.channel_upload_oss(video_url, v_id)
                        oss_video_key = oss_video_path.get("oss_object_key")
                        share_cnt = int(obj['forward_count'])  # share count
                        like_cnt = int(obj['like_count'])  # like count
                        # Title: first line, before any hashtag.
                        video_title = (video_obj.get('title') or "").split("\n")[0].split("#")[0]
                        cover = video_obj.get('thumb_url')
                        oss_cover_path = Oss.channel_upload_oss(cover, c_id)
                        oss_cover_key = oss_cover_path.get("oss_object_key")
                        Common.logger("sph_crawling").info(
                            f"{user} OSS keys: video {oss_video_key}, cover {oss_cover_key}")
                        create_time = obj['createtime']  # publish time
                        user_name = obj['username']  # account identifier
                        nick_name = obj['nickname']  # display name
                        comment_count = obj['comment_count']  # comment count
                        fav_count = obj['fav_count']  # thumbs-up count
                        sqlCollect.sph_data_info('视频号', objectId, video_url, cover, video_title,
                                                 str(share_cnt), str(like_cnt), oss_video_key,
                                                 oss_cover_key, nick_name, user_name,
                                                 comment_count, fav_count, create_time)
                        Common.logger("sph_crawling").info(f"{nick_name} data inserted successfully")
  83. return "完成"
  84. except:
  85. continue

    @classmethod
    def get_sph_user(cls):
        data = sqlCollect.sph_channel_user_list()
        if data is None:
            user_list = Material.get_sph_user()
            if user_list:
                for user in user_list:
                    sqlCollect.insert_sph_channel_user("视频号", user)
                # data is still None on this first run, so return the freshly
                # seeded list directly rather than flattening None below.
                return user_list
            return None
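        # sph_channel_user_list() is assumed to return rows as one-element
        # sequences, hence the flatten into a plain list of account names.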
        result_list = [item for sublist in data for item in sublist]
        return result_list


if __name__ == '__main__':
    SphHistory.sph_data_info()