# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/18
"""
Fetch the recommended-video list from the home page of the Kanyikan+ mini program.
"""
import json
import os
import random
import sys
import time

import requests
import urllib3

# The working directory must be on sys.path *before* the project-local
# ``main.*`` imports below, otherwise the first of them cannot rely on it.
sys.path.append(os.getcwd())
from main.feishu_lib import Feishu
from main.common import Common

# Explicitly bypass any system-configured HTTP(S) proxy.
proxies = {"http": None, "https": None}

# Upper bound (seconds) for the feed request, so a stalled connection cannot
# block the crawler forever.
REQUEST_TIMEOUT_SECONDS = 30

_TIME_FORMAT = "%Y/%m/%d %H:%M:%S"

# (old, new) pairs applied to a video title, in this exact order.
# NOTE(review): the "?" and " " pairs each appear twice; the duplicates may
# originally have been full-width / non-breaking variants -- confirm against
# the upstream file.
_TITLE_REPLACEMENTS = (
    ("\n", ""), ("/", ""), ("\\", ""), ("\r", ""),
    (":", ""), ("*", ""), ("?", ""),
    ("?", ""), ('"', ""), ("<", ""),
    (">", ""), ("|", ""), (" ", ""),
    ("&NBSP", ""), (".", "。"), (" ", ""),
    ("小年糕", ""), ("#", ""), ("Merge", ""),
)

# Errors that indicate one feed item is malformed (missing key, wrong type...).
_ITEM_PARSE_ERRORS = (LookupError, TypeError, ValueError, AttributeError)


# Sensitive-word list
def kanyikan_sensitive_words():
    """Return every non-empty cell of the Feishu sheet "rofdM5" as a flat list.

    The sheet is read through ``Feishu.get_values_batch``, which is iterated
    here as rows of cells; ``None`` cells are skipped.
    """
    rows = Feishu.get_values_batch("rofdM5")
    return [cell for row in rows for cell in row if cell is not None]


def _clean_title(raw_title):
    """Strip the title and apply ``_TITLE_REPLACEMENTS`` in order."""
    title = raw_title.strip()
    for old, new in _TITLE_REPLACEMENTS:
        title = title.replace(old, new)
    return title


def _extract_play_url(video_info):
    """Pick the playback URL from ``video_info["videoCdnInfo"]``.

    Preference order: the third ``mpInfo`` URL when more than two exist,
    else the first ``mpInfo`` URL, else the first ``ctnInfo`` URL, else the
    first top-level URL.
    """
    cdn_info = video_info["videoCdnInfo"]
    if "mpInfo" in cdn_info:
        url_info = cdn_info["mpInfo"]["urlInfo"]
        return url_info[2]["url"] if len(url_info) > 2 else url_info[0]["url"]
    if "ctnInfo" in cdn_info:
        return cdn_info["ctnInfo"]["urlInfo"][0]["url"]
    return cdn_info["urlInfo"][0]["url"]


def _parse_item(item):
    """Extract and log the fields of one feed item; return them as a dict.

    Raises one of ``_ITEM_PARSE_ERRORS`` when the item lacks an expected
    key or holds a value of an unexpected type.
    """
    video = {}

    video["title"] = _clean_title(item["title"])
    Common.logger().info('视频标题:{}', video["title"])

    video["id"] = item["videoId"]
    Common.logger().info('视频ID:{}', video["id"])

    video["play_cnt"] = item["playCount"]
    Common.logger().info('视频播放次数:{}', video["play_cnt"])

    video["liked_cnt"] = item["liked_cnt"]
    Common.logger().info('视频点赞数:{}', video["liked_cnt"])

    video["comment_cnt"] = item["comment_cnt"]
    Common.logger().info('视频评论数:{}', video["comment_cnt"])

    video["shared_cnt"] = item["shared_cnt"]
    Common.logger().info('视频分享数:{}', video["shared_cnt"])

    video["duration"] = item["mediaDuration"]
    Common.logger().info('视频时长:{}秒', video["duration"])

    # Width / height: fall back to "0*0" when the info is missing or empty.
    short_info = item.get("short_video_info")
    if short_info:
        video["width"] = short_info["width"]
        video["height"] = short_info["height"]
        resolution = str(video["width"]) + "*" + str(video["height"])
        Common.logger().info('视频宽高:{}', resolution)
    else:
        video["width"] = "0"
        video["height"] = "0"
        resolution = str(video["width"]) + "*" + str(video["height"])
        Common.logger().info("无分辨率:{}", resolution)

    # Publish time (passed to time.localtime, so expected to be a timestamp).
    video["send_date"] = item["date"]
    Common.logger().info(
        "视频发布时间:{}",
        time.strftime(_TIME_FORMAT, time.localtime(video["send_date"])))

    video["user"] = item["source"].strip().replace("\n", "")
    Common.logger().info('视频用户名:{}', video["user"])

    video["user_id"] = item.get("openid", 0)

    video["user_cover"] = item["bizIcon"]
    Common.logger().info('视频用户头像:{}', video["user_cover"])

    # Prefer the smart cover, otherwise use the thumbnail.
    if "smartCoverUrl" in item:
        video["cover"] = item["smartCoverUrl"]
    else:
        video["cover"] = item["thumbUrl"]
    Common.logger().info('视频封面:{}', video["cover"])

    video["url"] = _extract_play_url(item["videoInfo"])
    Common.logger().info('视频播放地址:{}', video["url"])

    return video


def _sheet_contains(sheet_id, value):
    """Return True when any cell of the given Feishu sheet equals ``value``."""
    rows = Feishu.get_values_batch(sheet_id)
    return any(cell == value for row in rows for cell in row)


def _append_to_feeds_sheet(video):
    """Insert a new first data row into sheet "SdCHOM" and fill it (A2:P2)."""
    Feishu.insert_columns("SdCHOM", "ROWS", 1, 2)
    get_feeds_time = int(time.time())
    values = [[
        time.strftime(_TIME_FORMAT, time.localtime(get_feeds_time)),
        "推荐榜",
        video["id"],
        video["title"],
        video["play_cnt"],
        video["comment_cnt"],
        video["liked_cnt"],
        video["shared_cnt"],
        video["duration"],
        str(video["width"]) + "*" + str(video["height"]),
        time.strftime(_TIME_FORMAT, time.localtime(video["send_date"])),
        video["user"],
        video["user_id"],
        video["user_cover"],
        video["cover"],
        video["url"],
    ]]
    # Short pause between the row insertion and the write.
    time.sleep(1)
    Feishu.update_values("SdCHOM", "A2:P2", values)


def _process_items(items):
    """Filter every feed item and record the ones that pass in sheet "SdCHOM".

    A malformed item is logged and skipped so that it does not discard the
    rest of the page. Errors from Feishu are not caught here.
    """
    # Loaded lazily, at most once per page, instead of once per item.
    sensitive_words = None

    for item in items:
        # Ignore entries that carry no video information.
        if "videoInfo" not in item:
            Common.logger().info("无视频信息")
            continue

        try:
            video = _parse_item(item)
        except _ITEM_PARSE_ERRORS as e:
            Common.logger().error("获取视频 list 时异常:{}", e)
            continue

        title = video["title"]
        video_id = video["id"]

        # Drop videos with any empty mandatory field.
        mandatory = (
            video_id, video["send_date"], title.strip(), video["play_cnt"],
            video["liked_cnt"], video["duration"], video["comment_cnt"],
            video["shared_cnt"], video["user"], video["user_cover"],
            video["cover"], video["url"],
        )
        if any(value == "" for value in mandatory):
            Common.logger().info("无效视频")
            continue

        # Basic threshold: play count >= 20000.
        if int(video["play_cnt"]) < 20000:
            Common.logger().info("播放量{} < 20000", video["play_cnt"])
            continue

        # Sensitive-word filter; empty cells never count as a match.
        if sensitive_words is None:
            sensitive_words = kanyikan_sensitive_words()
        if any(word and word in title for word in sensitive_words):
            Common.logger().info("视频已中敏感词:{}".format(title))
            continue

        # De-duplicate against the "already downloaded" sheet:
        # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
        if _sheet_contains("20ce0c", video_id):
            Common.logger().info("该视频已下载:{}", title)
            continue

        # De-duplicate against the feeds sheet itself (re-read for every item
        # because this loop appends to it):
        # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
        if _sheet_contains("SdCHOM", video_id):
            Common.logger().info("该视频已在kanyikan_feeds中:{}", title)
            continue

        Common.logger().info("该视频未下载,添加至kanyikan_feeds:{}", title)
        _append_to_feeds_sheet(video)


def _request_feed_page(video_list_session):
    """Request one page of the recommendation feed and return the decoded JSON."""
    host = "https://search.weixin.qq.com"
    url = '/cgi-bin/recwxa/recwxavideolist?'
    header = {
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
                      "NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
    }
    params = {
        'session': video_list_session,
        "offset": 0,
        "wxaVersion": "3.9.2",
        "count": "10",
        "channelid": "208",
        "scene": '310',
        "subscene": '1089',
        "clientVersion": '8.0.18',
        "sharesearchid": '0',
        "nettype": 'wifi',
        "switchprofile": "0",
        "switchnewuser": "0",
    }
    # Certificate verification is disabled for this request (verify=False),
    # so the matching urllib3 warning is silenced.
    urllib3.disable_warnings()
    r = requests.get(host + url, headers=header, params=params,
                     proxies=proxies, verify=False,
                     timeout=REQUEST_TIMEOUT_SECONDS)
    return json.loads(r.content.decode("utf8"))


def get_feeds():
    """
    1. Fetch the video list from the Kanyikan+ mini program home-page recommendations.
    2. De-duplicate against https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
    3. De-duplicate against https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
    4. Add the video info to https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM

    When the response has no "data" or no "items", the request is retried
    after a random sleep, with a freshly obtained session each time. Retries
    run in a loop (not by recursion) so the call stack does not grow.

    Returns None. Any exception raised while requesting or processing the
    page is logged and ends the call; an exception from
    ``Common.get_session()`` propagates to the caller.
    """
    while True:
        Common.logger().info("开始从推荐页获取视频列表")
        video_list_session = Common.get_session()
        # Common.logger().info("获取视频list时,session:{}", video_list_session)
        try:
            response = _request_feed_page(video_list_session)

            if "data" not in response:
                Common.logger().info("获取视频list时,session过期,随机睡眠 31-40 秒")
                # No "data" in the response: sleep 31-40 seconds, then retry.
                time.sleep(random.randint(31, 40))
                continue

            if "items" not in response["data"]:
                Common.logger().info("获取视频list时,返回空信息,随机睡眠 1-3 分钟")
                # Empty result: sleep 1-3 minutes, then retry.
                time.sleep(random.randint(60, 180))
                continue

            _process_items(response["data"]["items"])
        except Exception as e:
            Common.logger().error("获取视频 list 时异常:{}", e)
        return


if __name__ == "__main__":
    get_feeds()