# download.py
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/25
"""
从 微信小程序-本山祝福短视频 中,下载符合规则的视频
"""
import json
import os
import random
import sys
import time
from urllib import parse
import requests
import urllib3
sys.path.append(os.getcwd())
from main.common import Common
from main.publish import Publish
proxies = {"http": None, "https": None}
  19. class BSZF:
  20. # 已下载视频列表
  21. download_video_list = []
  22. # 过滤关键字
  23. @classmethod
  24. def sensitive_words(cls):
  25. sensitive_words = [
  26. "早上好",
  27. "晚上好",
  28. ]
  29. return sensitive_words
  30. @classmethod
  31. def get_recommend(cls):
  32. """
  33. 从本山祝福小程序首页推荐获取视频list:
  34. 1.在 benshanzhufu_videoid.txt 中去重
  35. 2.在 benshanzhufu_feeds.txt 中去重
  36. 3.添加视频信息到 benshanzhufu_feeds.txt
  37. """
  38. now = int(time.time() * 1000)
  39. url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
  40. header = {
  41. "Connection": "keep-alive",
  42. "vision": "1.1.0",
  43. "content-type": "application/x-www-form-urlencoded",
  44. "scene": "1008",
  45. "content-time": str(now),
  46. "token": "",
  47. "visitorKey": "165086930003741",
  48. "chatKey": "wx0fb8149da961d3b0",
  49. "cache-time": str(now),
  50. "Accept-Encoding": "gzip,compress,br,deflate",
  51. "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
  52. "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
  53. "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
  54. "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
  55. }
  56. parameter = {
  57. "cid": "",
  58. "page": random.randint(1, 75),
  59. "is_ads": 1,
  60. "model": "iPhone 11<iPhone12,1>",
  61. "mini_version": "8.0.20",
  62. "origin_channel": "3",
  63. "origin_type": "2",
  64. "origin_level": "0",
  65. "ini_id": "165086930003741"
  66. }
  67. params = parse.quote(json.dumps(parameter))
  68. url = url + str(params)
  69. try:
  70. urllib3.disable_warnings()
  71. r = requests.get(headers=header, url=url, proxies=proxies, verify=False)
  72. response = json.loads(r.content.decode("utf8"))
  73. if "data" not in response:
  74. Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
  75. time.sleep(3)
  76. else:
  77. feeds = response["data"]["list"]
  78. for i in range(len(feeds)):
  79. if "nid" not in feeds[i]:
  80. video_id = "0"
  81. Common.crawler_log().info("video_id:{}".format(video_id))
  82. else:
  83. video_id = feeds[i]["nid"]
  84. Common.crawler_log().info("video_id:{}".format(video_id))
  85. if "video_cover" not in feeds[i]:
  86. video_cover = "0"
  87. Common.crawler_log().info("video_cover不存在")
  88. else:
  89. video_cover = feeds[i]["video_cover"]
  90. Common.crawler_log().info("video_cover:{}".format(video_cover))
  91. if "video_url" not in feeds[i]:
  92. video_url = "0"
  93. Common.crawler_log().info("video_url:不存在")
  94. elif ".mp4" not in feeds[i]["video_url"]:
  95. video_url = "0"
  96. Common.crawler_log().info("video_url无效:".format(video_url))
  97. else:
  98. video_url = feeds[i]["video_url"]
  99. Common.crawler_log().info("video_url:{}".format(video_url))
  100. if "width" not in feeds[i] or "height" not in feeds[i]:
  101. video_width = "0"
  102. video_height = "0"
  103. video_resolution = str(video_width) + "*" + str(video_height)
  104. Common.crawler_log().info("无分辨率")
  105. else:
  106. video_width = feeds[i]["width"]
  107. video_height = feeds[i]["height"]
  108. video_resolution = str(video_width) + "*" + str(video_height)
  109. Common.crawler_log().info("video_resolution:{}".format(video_resolution))
  110. if "commentCount" not in feeds[i]:
  111. video_comment_cnt = "0"
  112. Common.crawler_log().info("video_comment_cnt:0")
  113. else:
  114. video_comment_cnt = feeds[i]["commentCount"]
  115. Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
  116. if "update_time" not in feeds[i]:
  117. video_send_time = "0"
  118. Common.crawler_log().info("video_send_time:不存在")
  119. else:
  120. video_send_time = feeds[i]["update_time"]
  121. Common.crawler_log().info("video_send_time:{}".format(
  122. time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))))
  123. # 视频标题过滤话题及处理特殊字符
  124. if "title" not in feeds[i]:
  125. video_title = "0"
  126. Common.crawler_log().info("video_title不存在")
  127. else:
  128. video_title = feeds[i]["title"].strip().replace("\n", "")\
  129. .replace("/", "").replace("本山祝福", "").replace(" ", "")\
  130. .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
  131. .replace("#", "").replace(".", "。").replace("\\", "")\
  132. .replace(":", "").replace("*", "").replace("?", "")\
  133. .replace("?", "").replace('"', "").replace("<", "")\
  134. .replace(">", "").replace("|", "")
  135. Common.crawler_log().info("video_title:{}".format(video_title))
  136. video_like_cnt = "10000"
  137. video_share_cnt = "10000"
  138. video_duration = "10000"
  139. video_play_cnt = "10000"
  140. user_name = "bszf"
  141. head_url = video_cover
  142. user_id = "10000"
  143. # 从 benshanzhufu_videoid.txt 中去重
  144. video_ids = Common.read_txt("benshanzhufu_videoid.txt")
  145. if video_id in [p_id.strip() for p_id in video_ids]:
  146. Common.crawler_log().info("该视频已下载:{}".format(video_title))
  147. pass
  148. else:
  149. Common.crawler_log().info("该视频未下载:{}".format(video_title))
  150. # 从 benshanzhufu_feeds.txt 中去重
  151. contents = Common.read_txt("benshanzhufu_feeds.txt")
  152. # benshanzhufu_feeds.txt 为空时,直接保存
  153. if len(contents) == 0 and video_id != "0" and video_url != "0" and video_title != "0":
  154. basic_time = int(time.time())
  155. Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
  156. with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
  157. f_a.write(str(basic_time) + " + " +
  158. str(video_id) + " + " +
  159. str(video_play_cnt) + " + " +
  160. str(video_title) + " + " +
  161. str(video_duration) + " + " +
  162. str(video_comment_cnt) + " + " +
  163. str(video_like_cnt) + " + " +
  164. str(video_share_cnt) + " + " +
  165. str(video_resolution) + " + " +
  166. str(video_send_time) + " + " +
  167. str(user_name) + " + " +
  168. str(head_url) + " + " +
  169. str(video_cover) + " + " +
  170. str(video_url) + " + " +
  171. str(user_id) + " + " +
  172. str("wx0fb8149da961d3b0") + "\n")
  173. else:
  174. if video_id in [content.split(" + ")[1] for content in contents]:
  175. Common.crawler_log().info("该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
  176. elif video_id == "0" or video_url == "0" or video_title == "0":
  177. Common.crawler_log().info("视频不存在")
  178. else:
  179. basic_time = int(time.time())
  180. Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
  181. with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
  182. f_a.write(str(basic_time) + " + " +
  183. str(video_id) + " + " +
  184. str(video_play_cnt) + " + " +
  185. str(video_title) + " + " +
  186. str(video_duration) + " + " +
  187. str(video_comment_cnt) + " + " +
  188. str(video_like_cnt) + " + " +
  189. str(video_share_cnt) + " + " +
  190. str(video_resolution) + " + " +
  191. str(video_send_time) + " + " +
  192. str(user_name) + " + " +
  193. str(head_url) + " + " +
  194. str(video_cover) + " + " +
  195. str(video_url) + " + " +
  196. str(user_id) + " + " +
  197. str("wx0fb8149da961d3b0") + "\n")
  198. except Exception as e:
  199. Common.crawler_log().error("获取视频 list 异常:{}".format(e))
  200. @classmethod
  201. def download_video(cls, env):
  202. """
  203. 下载视频
  204. 测试环境:env == dev
  205. 正式环境:env == prod
  206. """
  207. videos = Common.read_txt("benshanzhufu_feeds.txt")
  208. for video in videos:
  209. download_video_id = video.strip().split(" + ")[1]
  210. try:
  211. download_video_title = video.strip().split(" + ")[3]
  212. download_video_duration = video.strip().split(" + ")[4]
  213. download_video_play_cnt = video.strip().split(" + ")[2]
  214. download_video_comment_cnt = video.strip().split(" + ")[5]
  215. download_video_like_cnt = video.strip().split(" + ")[6]
  216. download_video_share_cnt = video.strip().split(" + ")[7]
  217. download_video_resolution = video.strip().split(" + ")[8]
  218. download_video_send_time = video.strip().split(" + ")[9]
  219. download_user_name = video.strip().split(" + ")[10]
  220. download_head_url = video.strip().split(" + ")[11]
  221. download_cover_url = video.strip().split(" + ")[12]
  222. download_video_url = video.strip().split(" + ")[13]
  223. download_video_session = video.strip().split(" + ")[-1]
  224. if any(word if word in download_video_title else False for word in cls.sensitive_words()) is True:
  225. Common.crawler_log().info("视频已中敏感词,删除该视频信息:{}".format(download_video_title))
  226. # 删除该视频在benshanzhufu_feeds.txt中的信息
  227. with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
  228. lines = f_r.readlines()
  229. with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
  230. for line in lines:
  231. if download_video_id in line.split(" + ")[1]:
  232. continue
  233. f_w.write(line)
  234. else:
  235. Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
  236. # 下载封面
  237. Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
  238. # 下载视频
  239. Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
  240. # 保存视频信息至 benshanzhufu_videoid.txt
  241. with open(r"./txt/benshanzhufu_videoid.txt", "a", encoding="UTF-8") as fa:
  242. fa.write(download_video_id + "\n")
  243. # 添加视频 ID 到 list
  244. cls.download_video_list.append(download_video_id)
  245. # 保存视频信息至 "./videos/{download_video_title}/info.txt"
  246. with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
  247. f_a.write(str(download_video_id) + "\n" +
  248. str(download_video_title) + "\n" +
  249. str(download_video_duration) + "\n" +
  250. str(download_video_play_cnt) + "\n" +
  251. str(download_video_comment_cnt) + "\n" +
  252. str(download_video_like_cnt) + "\n" +
  253. str(download_video_share_cnt) + "\n" +
  254. str(download_video_resolution) + "\n" +
  255. str(download_video_send_time) + "\n" +
  256. str(download_user_name) + "\n" +
  257. str(download_head_url) + "\n" +
  258. str(download_video_url) + "\n" +
  259. str(download_cover_url) + "\n" +
  260. str(download_video_session))
  261. # 上传视频
  262. if env == "dev":
  263. Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
  264. Publish.upload_and_publish("dev", "play")
  265. elif env == "prod":
  266. Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
  267. Publish.upload_and_publish("prod", "play")
  268. # 删除该视频在benshanzhufu_feeds.txt中的信息
  269. Common.crawler_log().info("删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
  270. with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r3:
  271. lines = f_r3.readlines()
  272. with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w3:
  273. for line in lines:
  274. if download_video_id in line.split(" + ")[1]:
  275. continue
  276. f_w3.write(line)
  277. except Exception as e:
  278. # 删除该视频在 recommend.txt中的信息
  279. Common.crawler_log().error("该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
  280. with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r4:
  281. lines = f_r4.readlines()
  282. with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w4:
  283. for line in lines:
  284. if download_video_id in line.split(" + ")[1]:
  285. continue
  286. f_w4.write(line)
  287. if __name__ == "__main__":
  288. bszf = BSZF()
  289. bszf.get_recommend()
  290. bszf.download_video("dev")