sph_crawling_data.py 7.2 KB

import json
import random
import time

import requests

from common import Material, Oss, Common, Feishu
from common.sql_help import sqlCollect
from data_channel.data_help import dataHelp
from data_channel.shipinhao import SPH


class SphHistory:
    """Crawl the full video history of every Shipinhao (WeChat Channels) account."""

    @classmethod
    def sph_data_info(cls):
        user_list = cls.get_sph_user()
        if user_list is None:
            return
        for user in user_list:
            Common.logger("sph_crawling").info(f"{user}: starting data fetch")
            account_id = SPH.get_account_id(user)
            if not account_id:
                print(f"{user}: failed to resolve account_id, cannot crawl this user")
                continue
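            # Page through the account's feed: each response carries a last_buffer
            # cursor that is echoed back in the next request to fetch the next page.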
            url = "http://61.48.133.26:30001/FinderGetUpMasterNextPage"
            last_buffer = ""
            try:
                count = 1
                while True:
                    headers = {
                        'Content-Type': 'application/json'
                    }
                    payload = json.dumps({
                        "username": account_id,
                        "last_buffer": last_buffer
                    })
                    response = requests.request("POST", url, headers=headers, data=payload)
                    time.sleep(random.randint(1, 5))
                    Common.logger("sph_crawling").info(f"{user}: fetching page {count}")
                    count += 1
                    # Stop paging on a bad status, an empty body, or a page without items.
                    if response.status_code != 200 or not response.text or "objectId" not in response.text:
                        break
                    res_json = response.json()
                    if not res_json.get("UpMasterHomePage"):
                        break
                    last_buffer = res_json.get('last_buffer')
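                    # Expected page shape, inferred from the field accesses below
                    # (the endpoint is undocumented, so treat this as an assumption):
                    # {"last_buffer": "...",
                    #  "UpMasterHomePage": [{"objectId": ..., "objectNonceId": ...,
                    #      "createtime": ..., "forward_count": ..., "like_count": ...,
                    #      "username": ..., "nickname": ..., "comment_count": ...,
                    #      "fav_count": ...}, ...]}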
                    try:
                        for obj in res_json["UpMasterHomePage"]:
                            Common.logger("sph_crawling").info(f"{user}: found one item")
                            objectId = obj['objectId']
                            object_id = sqlCollect.sph_data_info_v_id(objectId, "视频号")
                            if object_id:
                                continue  # already in the database, skip
                            objectNonceId = obj['objectNonceId']
                            url1 = "http://61.48.133.26:30001/GetFinderDownloadAddress"
                            payload = json.dumps({
                                "objectId": objectId,
                                "objectNonceId": objectNonceId
                            })
                            headers = {
                                'Content-Type': 'text/plain'
                            }
                            response = requests.request("POST", url1, headers=headers, data=payload)
                            time.sleep(random.randint(0, 1))
                            video_obj = response.json()
                            video_url = video_obj.get('DownloadAddress')
                            if not video_url:
                                continue
                            duration = dataHelp.video_duration(video_url)
                            cover = video_obj.get('thumb_url')
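                            # Mirror the video file and its cover image to OSS and keep
                            # the returned object keys for the database row.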
                            v_id = f"sph/{objectId}"
                            try:
                                Common.logger("sph_crawling").info(f"{user} video ID {objectId}, URL {video_url}: uploading video to OSS")
                                oss_video_key = Oss.channel_upload_oss(video_url, v_id)  # upload video to OSS
                                oss_video_key = oss_video_key.get("oss_object_key")
                                Common.logger("sph_crawling").info(f"{user}: video uploaded to OSS, key {oss_video_key}")
                                Common.logger("sph_crawling").info(f"{user} video ID {objectId}, cover {cover}: uploading cover to OSS")
                                oss_cover_key = Oss.channel_upload_oss(cover, f"sph/{objectId}.jpg")  # upload cover to OSS
                                oss_cover_key = oss_cover_key.get("oss_object_key")
                                Common.logger("sph_crawling").info(f"{user}: cover uploaded to OSS, key {oss_cover_key}")
                                create_time = obj['createtime']  # publish time
                            except Exception:
                                continue
                            share_cnt = int(obj['forward_count'])  # shares
                            like_cnt = int(obj['like_count'])  # likes
                            video_title = (video_obj.get('title') or "").split("\n")[0].split("#")[0]
                            user_name = obj['username']  # account identifier
                            nick_name = obj['nickname']  # display name
                            comment_count = obj['comment_count']  # comments
                            fav_count = obj['fav_count']  # thumbs-up count
                            sqlCollect.sph_data_info('视频号', objectId, video_url, cover, video_title, str(share_cnt), str(like_cnt), oss_video_key, oss_cover_key, nick_name, user_name, comment_count, fav_count, create_time, duration)
                            Common.logger("sph_crawling").info(f"{nick_name}: row inserted")
                    except Exception as e:
                        Common.logger("sph_crawling").info(f"{user} exception: {e}")
                        continue
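                # All pages consumed: mark this user finished and report the total.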
                sqlCollect.update_sph_channel_user_status(user)
                Common.logger("sph_crawling").info(f"{user}: user crawl complete")
                count = sqlCollect.sph_data_info_count(user, "视频号")
                text = (
                    f"**{user} crawl complete: {count[0]} rows collected**\n"
                )
                Feishu.finish_bot(text,
                                  "https://open.feishu.cn/open-apis/bot/v2/hook/029fa989-9847-4574-8e1b-5c396e665f16",
                                  "[ Shipinhao history crawl notification ]")
            except Exception as e:
                Common.logger("sph_crawling").info(f"{user} exception: {e}")
                Feishu.finish_bot(str(e),
                                  "https://open.feishu.cn/open-apis/bot/v2/hook/029fa989-9847-4574-8e1b-5c396e665f16",
                                  "[ Shipinhao crawl exception notification ]")
                continue
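
    # Build the list of accounts to crawl: prefer users already queued in the
    # database; otherwise seed the table from the Material configuration first.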
    @classmethod
    def get_sph_user(cls):
        data = sqlCollect.sph_channel_user_list()
        if data is None:
            user_list = Material.get_sph_user()
            if not user_list:
                return None
            for user in user_list:
                sqlCollect.insert_sph_channel_user("视频号", user)
            # Re-query so the flatten below sees the freshly inserted rows.
            data = sqlCollect.sph_channel_user_list()
            if data is None:
                return None
        result_list = [item for sublist in data for item in sublist]
        return result_list


if __name__ == '__main__':
    SphHistory.sph_data_info()
    # count = sqlCollect.sph_data_info_count("郑蓝旗", "视频号")
    # print(count)