# get_feeds.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/4/18
  4. """
  5. 获取看一看+小程序,首页推荐视频列表
  6. """
  7. import json
  8. import os
  9. import random
  10. import sys
  11. import time
  12. import requests
  13. import urllib3
  14. from main.feishu_lib import Feishu
  15. sys.path.append(os.getcwd())
  16. from main.common import Common
  17. proxies = {"http": None, "https": None}
  18. # 敏感词库
  19. def kanyikan_sensitive_words(log_type):
  20. # 敏感词库列表
  21. word_list = []
  22. # 从云文档读取所有敏感词,添加到词库列表
  23. lists = Feishu.get_values_batch(log_type, "kanyikan", "rofdM5")
  24. for i in lists:
  25. for j in i:
  26. # 过滤空的单元格内容
  27. if j is None:
  28. pass
  29. else:
  30. word_list.append(j)
  31. return word_list
def get_feeds(log_type: str) -> None:
    """
    Crawl one page of the Kanyikan+ mini-program home-page recommendation
    feed and record new videos into Feishu.

    1. Fetch the recommended-video list from the mini-program home page.
    2. De-duplicate first against
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
    3. Then de-duplicate against
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
    4. Append surviving video info to
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM

    :param log_type: logger channel name (e.g. "recommend"); also forwarded
                     to every Feishu helper call.
    :return: None — all results are side effects (logging and sheet writes).
    """
    Common.logger(log_type).info("开始从推荐页获取视频列表")
    host = "https://search.weixin.qq.com"
    url = '/cgi-bin/recwxa/recwxavideolist?'
    # Session token for the recommendation API, supplied by the Common helper.
    video_list_session = Common.get_session(log_type)
    # Common.logger(log_type).info("获取视频list时,session:{}", video_list_session)
    header = {
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
                      "NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
    }
    params = {
        'session': video_list_session,
        "offset": 0,
        "wxaVersion": "3.9.2",
        "count": "10",
        "channelid": "208",
        "scene": '310',
        "subscene": '1089',
        "clientVersion": '8.0.18',
        "sharesearchid": '0',
        "nettype": 'wifi',
        "switchprofile": "0",
        "switchnewuser": "0",
    }
    try:
        urllib3.disable_warnings()
        r = requests.get(host + url, headers=header, params=params, proxies=proxies, verify=False)
        response = json.loads(r.content.decode("utf8"))
        if "data" not in response:
            # Session expired: back off, then retry recursively.
            # NOTE(review): the log message says 31-50s but the sleep is
            # 31-40s, and the retry recursion has no depth bound — confirm
            # both are intended.
            Common.logger(log_type).info("获取视频list时,session过期,随机睡眠 31-50 秒")
            time.sleep(random.randint(31, 40))
            get_feeds(log_type)
        elif "items" not in response["data"]:
            # Empty payload: back off 1-3 minutes, then retry recursively.
            Common.logger(log_type).info("获取视频list时,response:{},随机睡眠 1-3 分钟", response)
            time.sleep(random.randint(60, 180))
            get_feeds(log_type)
        else:
            items = response["data"]["items"]
            for i in range(len(items)):
                # Skip entries that carry no video payload.
                if "videoInfo" not in items[i]:
                    Common.logger(log_type).info("无视频信息")
                else:
                    # Video title, scrubbed of characters that would break
                    # file names / sheet cells, plus some hard-coded
                    # blocked words ("小年糕", "#", "Merge").
                    video_title = items[i]["title"].strip().replace("\n", "")\
                        .replace("/", "").replace("\\", "").replace("\r", "")\
                        .replace(":", "").replace("*", "").replace("?", "")\
                        .replace("?", "").replace('"', "").replace("<", "")\
                        .replace(">", "").replace("|", "").replace(" ", "")\
                        .replace("&NBSP", "").replace(".", "。").replace(" ", "")\
                        .replace("小年糕", "").replace("#", "").replace("Merge", "")
                    Common.logger(log_type).info('视频标题:{}', video_title)
                    # Video id
                    video_id = items[i]["videoId"]
                    Common.logger(log_type).info('视频ID:{}', video_id)
                    # Play count
                    video_play_cnt = items[i]["playCount"]
                    Common.logger(log_type).info('视频播放次数:{}', video_play_cnt)
                    # Like count
                    video_liked_cnt = items[i]["liked_cnt"]
                    Common.logger(log_type).info('视频点赞数:{}', video_liked_cnt)
                    # Comment count
                    video_comment_cnt = items[i]["comment_cnt"]
                    Common.logger(log_type).info('视频评论数:{}', video_comment_cnt)
                    # Share count
                    video_shared_cnt = items[i]["shared_cnt"]
                    Common.logger(log_type).info('视频分享数:{}', video_shared_cnt)
                    # Duration (seconds, per the log message)
                    video_duration = items[i]["mediaDuration"]
                    Common.logger(log_type).info('视频时长:{}秒', video_duration)
                    # Width/height; "0*0" when no resolution info is present.
                    if "short_video_info" not in items[i]:
                        video_width = "0"
                        video_height = "0"
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info("无分辨率:{}", video_resolution)
                    elif len(items[i]["short_video_info"]) == 0:
                        video_width = "0"
                        video_height = "0"
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info("无分辨率:{}", video_resolution)
                    else:
                        # Video width
                        video_width = items[i]["short_video_info"]["width"]
                        # Video height
                        video_height = items[i]["short_video_info"]["height"]
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info('视频宽高:{}', video_resolution)
                    # Publish time — presumably unix seconds, formatted
                    # below only for logging; TODO confirm against the API.
                    video_send_date = items[i]["date"]
                    Common.logger(log_type).info("视频发布时间:{}",
                                                 time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_send_date)))
                    # Uploader name
                    video_user = items[i]["source"].strip().replace("\n", "")
                    Common.logger(log_type).info('视频用户名:{}', video_user)
                    # Uploader openid; 0 when absent
                    if "openid" not in items[i]:
                        user_id = 0
                    else:
                        user_id = items[i]["openid"]
                    # Uploader avatar
                    video_user_cover = items[i]["bizIcon"]
                    Common.logger(log_type).info('视频用户头像:{}', video_user_cover)
                    # Cover image: prefer the smart cover when available
                    if "smartCoverUrl" in items[i]:
                        video_cover = items[i]["smartCoverUrl"]
                        Common.logger(log_type).info('视频封面:{}', video_cover)
                    else:
                        video_cover = items[i]["thumbUrl"]
                        Common.logger(log_type).info('视频封面:{}', video_cover)
                    # Playback URL, resolved from one of three CDN layouts.
                    # NOTE(review): this rebinds `url` (the request path above)
                    # — harmless since the request already happened, but worth
                    # renaming in a future change.
                    if "mpInfo" in items[i]["videoInfo"]["videoCdnInfo"].keys():
                        if len(items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
                            Common.logger(log_type).info('视频播放地址:{}', url)
                        else:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
                            Common.logger(log_type).info('视频播放地址:{}', url)
                    elif "ctnInfo" in items[i]["videoInfo"]["videoCdnInfo"]:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
                        Common.logger(log_type).info('视频播放地址:{}', url)
                    else:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
                        Common.logger(log_type).info('视频播放地址:{}', url)
                    # Drop records missing any required field.
                    if video_id == "" \
                            or video_send_date == "" \
                            or video_title.strip() == "" \
                            or video_play_cnt == "" \
                            or video_liked_cnt == "" \
                            or video_duration == "" \
                            or video_comment_cnt == "" \
                            or video_shared_cnt == "" \
                            or video_user == "" \
                            or video_user_cover == "" \
                            or video_cover == "" \
                            or url == "":
                        Common.logger(log_type).info("无效视频")
                    # Baseline threshold: play count >= 20000
                    elif int(video_play_cnt) < 20000:
                        Common.logger(log_type).info("播放量{} < 20000", video_play_cnt)
                    # Sensitive-word filter (lexicon re-fetched per video)
                    elif any(word if word in video_title else False
                             for word in kanyikan_sensitive_words(log_type)) is True:
                        Common.logger(log_type).info("视频已中敏感词:{}".format(video_title))
                    # De-dup against the downloaded sheet:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
                    elif video_id in [j for i in Feishu.get_values_batch(log_type, "kanyikan", "20ce0c") for j in i]:
                        Common.logger(log_type).info("该视频已下载:{}", video_title)
                    # De-dup against the feeds sheet:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
                    elif video_id in [j for i in Feishu.get_values_batch(log_type, "kanyikan", "SdCHOM") for j in i]:
                        Common.logger(log_type).info("该视频已在kanyikan_feeds中:{}", video_title)
                    else:
                        Common.logger(log_type).info("该视频未下载,添加至kanyikan_feeds:{}", video_title)
                        # Kanyikan+ worksheet: insert a fresh first row.
                        Feishu.insert_columns(log_type, "kanyikan", "SdCHOM", "ROWS", 1, 2)
                        # Crawl timestamp
                        get_feeds_time = int(time.time())
                        # Row payload — column order must match sheet columns A:P.
                        values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time)),
                                   "推荐榜",
                                   video_id,
                                   video_title,
                                   video_play_cnt,
                                   video_comment_cnt,
                                   video_liked_cnt,
                                   video_shared_cnt,
                                   video_duration,
                                   str(video_width) + "*" + str(video_height),
                                   time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_send_date)),
                                   video_user,
                                   user_id,
                                   video_user_cover,
                                   video_cover,
                                   url]]
                        time.sleep(1)
                        # Write the data into the freshly inserted row.
                        Feishu.update_values(log_type, "kanyikan", "SdCHOM", "A2:P2", values)
    except Exception as e:
        Common.logger(log_type).error("获取视频 list 时异常:{}", e)
if __name__ == "__main__":
    # Manual entry point: run one fetch pass on the "recommend" log channel.
    get_feeds("recommend")