get_feeds.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/18
"""
Fetch the home-page recommended video list of the 看一看+ (Kanyikan+) WeChat mini program.
"""
import json
import os
import random
import sys
import time

import requests
import urllib3

sys.path.append(os.getcwd())
from main.common import Common

proxies = {"http": None, "https": None}
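
# Assumption: Common (from main/common.py in this repo) provides get_session(),
# crawler_log() and read_txt(); judging by how it is used below, read_txt()
# returns the file's lines as a list, with an empty list for an empty file.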


def get_feeds():
    """
    After fetching the video list:
    1. Deduplicate against kanyikan_videoid.txt (videos already downloaded)
    2. Deduplicate against kanyikan_feeds.txt (videos already queued)
    3. Append the remaining video info to kanyikan_feeds.txt
    """
    host = "https://search.weixin.qq.com"
    url = "/cgi-bin/recwxa/recwxavideolist?"
    get_video_list_session = Common.get_session()
    Common.crawler_log().info("Fetching video list with session: {}".format(get_video_list_session))
    header = {
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
                      "NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
    }
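    # Note: the User-Agent and Referer above mimic the WeChat iOS client hosting
    # the mini program (a servicewechat.com page-frame); the endpoint is assumed
    # to reject requests that do not look like they come from WeChat.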
    params = {
        "session": get_video_list_session,
        "offset": 0,
        "wxaVersion": "3.9.2",
        "count": "10",
        "channelid": "208",
        "scene": "310",
        "subscene": "1089",
        "clientVersion": "8.0.18",
        "sharesearchid": "0",
        "nettype": "wifi",
        "switchprofile": "0",
        "switchnewuser": "0",
    }
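    # The assembled request is, in effect:
    #   GET https://search.weixin.qq.com/cgi-bin/recwxa/recwxavideolist?session=<session>&offset=0&count=10&channelid=208&...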
    try:
        urllib3.disable_warnings()
        r = requests.get(host + url, headers=header, params=params, proxies=proxies, verify=False)
        response = json.loads(r.content.decode("utf8"))
        if "data" not in response:
            # Session expired: sleep 31-50 seconds at random, then retry
            Common.crawler_log().info("Session expired while fetching video list; sleeping 31-50 seconds")
            time.sleep(random.randint(31, 50))
            get_feeds()
        elif "items" not in response["data"]:
            # Empty response: sleep 1-3 minutes at random, then retry
            Common.crawler_log().info("Empty response while fetching video list; sleeping 1-3 minutes")
            time.sleep(random.randint(60, 180))
            get_feeds()
        else:
            items = response["data"]["items"]
            for i in range(len(items)):
                # Skip items that carry no video info
                if "videoInfo" not in items[i]:
                    Common.crawler_log().info("no video info")
                else:
                    # Video ID
                    video_id = items[i]["videoId"]
                    Common.crawler_log().info("video_id: {}".format(video_id))
                    # Video title, stripped of newlines and of characters that
                    # are illegal in file names
                    video_title = items[i]["title"].strip().replace("\n", "")\
                        .replace("/", "").replace("\\", "").replace("\r", "")\
                        .replace(":", "").replace("*", "").replace("?", "")\
                        .replace("?", "").replace('"', "").replace("<", "")\
                        .replace(">", "").replace("|", "")
                    Common.crawler_log().info("video_title: {}".format(video_title))
                    # Play count
                    video_play_cnt = items[i]["playCount"]
                    Common.crawler_log().info("video_play_cnt: {}".format(video_play_cnt))
                    # Like count
                    video_liked_cnt = items[i]["liked_cnt"]
                    Common.crawler_log().info("video_liked_cnt: {}".format(video_liked_cnt))
                    # Duration in seconds
                    video_duration = items[i]["mediaDuration"]
                    Common.crawler_log().info("video_duration: {}s".format(video_duration))
                    # Comment count
                    video_comment_cnt = items[i]["comment_cnt"]
                    Common.crawler_log().info("video_comment_cnt: {}".format(video_comment_cnt))
                    # Share count
                    video_shared_cnt = items[i]["shared_cnt"]
                    Common.crawler_log().info("video_shared_cnt: {}".format(video_shared_cnt))
                    # Publish time (a Unix timestamp)
                    video_send_date = items[i]["date"]
                    Common.crawler_log().info("video_send_date: {}".format(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(video_send_date))))
                    # Uploader name
                    video_user = items[i]["source"].strip().replace("\n", "")
                    Common.crawler_log().info("video_user: {}".format(video_user))
                    # Width and height; some items carry no short_video_info
                    if "short_video_info" not in items[i] or len(items[i]["short_video_info"]) == 0:
                        video_width = "0"
                        video_height = "0"
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.crawler_log().info("no resolution: {}".format(video_resolution))
                    else:
                        video_width = items[i]["short_video_info"]["width"]
                        video_height = items[i]["short_video_info"]["height"]
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.crawler_log().info("video_resolution: {}".format(video_resolution))
                    # Uploader avatar
                    video_user_cover = items[i]["bizIcon"]
                    Common.crawler_log().info("video_user_cover: {}".format(video_user_cover))
                    # Video cover, preferring the smart cover when present
                    if "smartCoverUrl" in items[i]:
                        video_cover = items[i]["smartCoverUrl"]
                    else:
                        video_cover = items[i]["thumbUrl"]
                    Common.crawler_log().info("video_cover: {}".format(video_cover))
                    # Play URL: prefer mpInfo, then ctnInfo, then the top-level urlInfo
                    if "mpInfo" in items[i]["videoInfo"]["videoCdnInfo"].keys():
                        if len(items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
                        else:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
                    elif "ctnInfo" in items[i]["videoInfo"]["videoCdnInfo"]:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
                    else:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
                    Common.crawler_log().info("video url: {}".format(url))
                    # Discard records with any missing field
                    if video_id == "" \
                            or video_send_date == "" \
                            or video_title.strip() == "" \
                            or video_play_cnt == "" \
                            or video_liked_cnt == "" \
                            or video_duration == "" \
                            or video_comment_cnt == "" \
                            or video_shared_cnt == "" \
                            or video_user == "" \
                            or video_user_cover == "" \
                            or video_cover == "" \
                            or url == "":
                        Common.crawler_log().info("invalid video")
                    else:
                        # Dedup against kanyikan_videoid.txt (already downloaded)
                        videoids = Common.read_txt("kanyikan_videoid.txt")
                        if video_id in [vid.strip() for vid in videoids]:
                            Common.crawler_log().info("video already downloaded: {}".format(video_title))
                        else:
                            Common.crawler_log().info("video not downloaded yet: {}".format(video_title))
                            # Crawl timestamp
                            basic_time = int(time.time())
                            # Dedup against kanyikan_feeds.txt (already queued);
                            # an empty file yields an empty list, so the first
                            # video is appended directly
                            contents = Common.read_txt("kanyikan_feeds.txt")
                            if video_id in [content.split(" + ")[1] for content in contents]:
                                Common.crawler_log().info("video already in kanyikan_feeds.txt: {}".format(video_title))
                            else:
                                Common.crawler_log().info("appending video info to kanyikan_feeds.txt: {}".format(video_title))
                                # One " + "-separated record per line:
                                # crawl_time + video_id + play_cnt + title + duration
                                # + comment_cnt + liked_cnt + shared_cnt + resolution
                                # + send_date + user + user_cover + cover + play_url + session
                                with open(r"./txt/kanyikan_feeds.txt", "a", encoding="utf8") as f:
                                    f.write(str(basic_time) + " + "
                                            + str(video_id) + " + "
                                            + str(video_play_cnt) + " + "
                                            + str(video_title) + " + "
                                            + str(video_duration) + " + "
                                            + str(video_comment_cnt) + " + "
                                            + str(video_liked_cnt) + " + "
                                            + str(video_shared_cnt) + " + "
                                            + str(video_resolution) + " + "
                                            + str(video_send_date) + " + "
                                            + str(video_user) + " + "
                                            + str(video_user_cover) + " + "
                                            + str(video_cover) + " + "
                                            + str(url) + " + "
                                            + Common.get_session() + "\n")
    except Exception as e:
        Common.crawler_log().error("exception while fetching video list: {}".format(e))


if __name__ == "__main__":
    get_feeds()
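
# Usage sketch (an assumption, not documented in this file): run from the repo
# root, so that sys.path.append(os.getcwd()) can resolve main.common and the
# relative "./txt/kanyikan_feeds.txt" path lands in the right directory, e.g.
#   python main/get_feeds.py   # hypothetical file location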