# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/8
import json
import time

import requests
import urllib3

from main.common import Common
from main.publish import Publish


class Weishi:
    @staticmethod
    def weishi_download_rule(d_duration, d_width, d_height, d_play_cnt):
        """
        Basic rule deciding whether a video is worth downloading.
        :param d_duration: duration in seconds
        :param d_width: width in pixels
        :param d_height: height in pixels
        :param d_play_cnt: play count
        :return: True if the video satisfies the rule, False otherwise
        """
        # Duration must fall in [60s, 600s], at least one side of the
        # resolution must reach 720px, and plays must reach 100,000.
        if 600 >= int(float(d_duration)) >= 60:
            if int(d_width) >= 720 or int(d_height) >= 720:
                if int(d_play_cnt) >= 100000:
                    return True
                return False
            return False
        return False
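
    # A minimal sketch of how the rule evaluates (illustrative values only,
    # not taken from real feed data):
    #   weishi_download_rule("120", 720, 1280, 150000) -> True
    #   weishi_download_rule("30", 720, 1280, 150000)  -> False (under 60s)
    #   weishi_download_rule("120", 540, 640, 150000)  -> False (no side reaches 720px)
    #   weishi_download_rule("120", 720, 1280, 5000)   -> False (under 100,000 plays)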

    @classmethod
    def get_weishi_recommend(cls):
        """
        Fetch the recommended video list from the Weishi mini-program home page:
        1. Deduplicate against weishi_videoid.txt
        2. Deduplicate against weishi_feeds.txt
        3. Append new video info to weishi_feeds.txt
        """
        url = "https://api.weishi.qq.com/trpc.weishi.weishi_h5_proxy.weishi_h5_proxy/WxminiGetFeedList"
        cookies = {
            "wesee_authtype": "3",
            "wesee_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
            "wesee_openkey": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf189e2a5c1d532eeff172bc21cf2"
                             "6230941ccbc10243a7879e8165ca608c17060de606a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
            "wesee_personid": "1593522421826902",
            "wesee_refresh_token": "",
            "wesee_access_token": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf18"
                                  "9e2a5c1d532eeff172bc21cf26230941ccbc10243a7879e8165ca608c17060de6"
                                  "06a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
            "wesee_thr_appid": "wx75ee9f19b93e5c46",
            "wesee_ichid": "8"
        }
        json_data = {
            "req_body": {
                "requestType": 16,
                "isrefresh": 0,
                "isfirst": 0,
                "attachInfo": "",
                "scene_id": 22,
                "requestExt": {
                    "mini_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
                    "notLogin-personid": "1593522421826902"
                }
            },
            "req_header": {
                "mapExt": "{\"imageSize\":\"480\",\"adaptScene\":\"PicHDWebpLimitScene\"}"
            }
        }
        try:
            urllib3.disable_warnings()
            r = requests.post(url=url, cookies=cookies, json=json_data, verify=False)
            response = json.loads(r.content.decode("utf8"))
            if "rsp_body" not in response:
                Common.crawler_log().info("failed to fetch Weishi video list: {}, sleeping 10s".format(response))
                time.sleep(10)
            else:
                feeds = response["rsp_body"]["feeds"]
                for i in range(len(feeds)):
                    if "video" not in feeds[i]:
                        Common.crawler_log().info("no video info")
                    else:
                        # video ID
                        if "id" not in feeds[i]["video"]:
                            video_id = "0"
                        else:
                            video_id = feeds[i]["video"]["id"]
                        Common.crawler_log().info("video_id:{}".format(video_id))
                        # video title, stripped of characters that break the
                        # " + "-separated record format and file paths
                        video_title = feeds[i]["desc"].strip().replace("\n", "") \
                            .replace("/", "").replace("快手", "").replace(" ", "") \
                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
                        Common.crawler_log().info("video_title:{}".format(video_title))
                        # publish time
                        if "createTime" not in feeds[i]:
                            video_send_time = "0"
                            Common.crawler_log().info("video_send_time: missing")
                        else:
                            video_send_time = int(feeds[i]["createTime"]) * 1000
                            Common.crawler_log().info("video_send_time:{}".format(time.strftime(
                                "%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time) / 1000))))
                        # cover URL
                        if len(feeds[i]["images"]) == 0:
                            cover_url = "0"
                            Common.crawler_log().info("cover_url: missing")
                        else:
                            cover_url = feeds[i]["images"][0]["url"]
                            Common.crawler_log().info("cover_url:{}".format(cover_url))
                        # playback URL
                        if "url" not in feeds[i]["video"]:
                            video_url = "0"
                            Common.crawler_log().info("video_url: missing")
                        else:
                            video_url = feeds[i]["video"]["url"]
                            Common.crawler_log().info("video_url:{}".format(video_url))
                        # resolution
                        if "width" not in feeds[i]["video"] or "height" not in feeds[i]["video"]:
                            video_width = "0"
                            video_height = "0"
                            video_resolution = str(video_width) + "*" + str(video_height)
                            Common.crawler_log().info("no resolution info")
                        else:
                            video_width = feeds[i]["video"]["width"]
                            video_height = feeds[i]["video"]["height"]
                            video_resolution = str(video_width) + "*" + str(video_height)
                            Common.crawler_log().info("video_resolution:{}".format(video_resolution))
                        # duration, converted from milliseconds to seconds
                        if "duration" not in feeds[i]["video"]:
                            video_duration = "0"
                            Common.crawler_log().info("video_duration: missing")
                        else:
                            video_duration = int(int(feeds[i]["video"]["duration"]) / 1000)
                            Common.crawler_log().info("video_duration:{}s".format(video_duration))
                        # play count
                        if "playNum" not in feeds[i]["ugcData"]:
                            video_play_cnt = "0"
                        else:
                            video_play_cnt = feeds[i]["ugcData"]["playNum"]
                        Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
                        # like count
                        if "dingCount" not in feeds[i]["ugcData"]:
                            video_like_cnt = "0"
                        else:
                            video_like_cnt = feeds[i]["ugcData"]["dingCount"]
                        Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
                        # share count
                        if "shareNum" not in feeds[i]["ugcData"]:
                            video_share_cnt = "0"
                        else:
                            video_share_cnt = feeds[i]["ugcData"]["shareNum"]
                        Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
                        # comment count
                        if "totalCommentNum" not in feeds[i]["ugcData"]:
                            video_comment_cnt = "0"
                        else:
                            video_comment_cnt = feeds[i]["ugcData"]["totalCommentNum"]
                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
                        # user ID
                        user_id = feeds[i]["poster"]["id"]
                        Common.crawler_log().info("user_id:{}".format(user_id))
                        # user nickname, cleaned the same way as the title
                        user_name = feeds[i]["poster"]["nick"].strip().replace("\n", "") \
                            .replace("/", "").replace("快手", "").replace(" ", "") \
                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
                        Common.crawler_log().info("user_name:{}".format(user_name))
                        # avatar URL: prefer material.thumbURL, fall back to poster.avatar
                        if "thumbURL" not in feeds[i]["material"] and "avatar" not in feeds[i]["poster"]:
                            head_url = "0"
                            Common.crawler_log().info("head_url: missing")
                        elif "thumbURL" in feeds[i]["material"]:
                            head_url = feeds[i]["material"]["thumbURL"]
                            Common.crawler_log().info("head_url:{}".format(head_url))
                        else:
                            head_url = feeds[i]["poster"]["avatar"]
                            Common.crawler_log().info("head_url:{}".format(head_url))
                        # dedupe against weishi_videoid.txt (already-downloaded videos)
                        videos_ids = Common.read_txt("weishi_videoid.txt")
                        if video_id in [v_id.strip() for v_id in videos_ids]:
                            Common.crawler_log().info("video already downloaded: {}".format(video_title))
                        else:
                            Common.crawler_log().info("video not downloaded yet: {}".format(video_title))
                            # dedupe against weishi_feeds.txt; an empty file trivially
                            # contains no duplicates, so one code path covers both the
                            # empty and non-empty cases
                            contents = Common.read_txt("weishi_feeds.txt")
                            if video_id in [content.split(" + ")[1] for content in contents]:
                                Common.crawler_log().info("video already in weishi_feeds.txt: {}".format(video_title))
                            elif head_url == "0" or cover_url == "0" \
                                    or video_url == "0" or video_duration == "0" or video_id == "0":
                                Common.crawler_log().info("cover / playback URL / duration / avatar missing")
                            else:
                                basic_time = int(time.time())
                                Common.crawler_log().info("appending video info to weishi_feeds.txt: {}".format(video_title))
                                with open("./txt/weishi_feeds.txt", "a", encoding="utf8") as f_a:
                                    f_a.write(str(basic_time) + " + " + str(video_id)
                                              + " + " + str(video_play_cnt)
                                              + " + " + str(video_title)
                                              + " + " + str(video_duration)
                                              + " + " + str(video_comment_cnt)
                                              + " + " + str(video_like_cnt)
                                              + " + " + str(video_share_cnt)
                                              + " + " + str(video_resolution)
                                              + " + " + str(video_send_time)
                                              + " + " + str(user_name)
                                              + " + " + str(head_url)
                                              + " + " + str(cover_url)
                                              + " + " + str(video_url)
                                              + " + " + str(user_id)
                                              + " + " + "oWGa05FrwkuUvT-4n1qGeQuhVsc8" + "\n")
        except Exception as e:
            Common.crawler_log().error("exception while fetching Weishi video list: {}".format(e))
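
    # Record layout of weishi_feeds.txt, reconstructed from the write above.
    # Each line is one " + "-separated record; download_weishi_play_video()
    # reads fields back by these indexes:
    #    0 crawl_time    1 video_id    2 play_cnt    3 title       4 duration
    #    5 comment_cnt   6 like_cnt    7 share_cnt   8 resolution  9 send_time
    #   10 user_name    11 head_url   12 cover_url  13 video_url  14 user_id
    #   15 session (mini-program openid)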

    @classmethod
    def download_weishi_play_video(cls, env):
        """
        Download videos that pass the play-count rule.
        Test environment: env == "dev"
        Production environment: env == "prod"
        """
        videos = Common.read_txt("weishi_feeds.txt")
        for video in videos:
            # split the " + "-separated record once instead of re-splitting per field
            fields = video.strip().split(" + ")
            download_video_id = fields[1]
            download_video_title = fields[3]
            download_video_duration = fields[4]
            download_video_play_cnt = fields[2]
            download_video_comment_cnt = fields[5]
            download_video_like_cnt = fields[6]
            download_video_share_cnt = fields[7]
            download_video_resolution = fields[8]
            download_video_width = download_video_resolution.split("*")[0]
            download_video_height = download_video_resolution.split("*")[-1]
            download_video_send_time = fields[9]
            download_user_name = fields[10]
            download_head_url = fields[11]
            download_cover_url = fields[12]
            download_video_url = fields[13]
            download_video_session = fields[-1]
            if cls.weishi_download_rule(download_video_duration, download_video_width,
                                        download_video_height, download_video_play_cnt):
                Common.crawler_log().info("start downloading video: {}".format(download_video_title))
                # download the cover
                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
                # download the video
                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
                # record the video ID in weishi_videoid.txt
                with open("./txt/weishi_videoid.txt", "a", encoding="utf8") as fa:
                    fa.write(download_video_id + "\n")
                # save the video info to "./videos/{download_video_title}/info.txt"
                with open("./videos/" + download_video_title + "/info.txt", "a", encoding="utf8") as f_a:
                    f_a.write(str(download_video_id) + "\n"
                              + str(download_video_title) + "\n"
                              + str(download_video_duration) + "\n"
                              + str(download_video_play_cnt) + "\n"
                              + str(download_video_comment_cnt) + "\n"
                              + str(download_video_like_cnt) + "\n"
                              + str(download_video_share_cnt) + "\n"
                              + str(download_video_resolution) + "\n"
                              + str(download_video_send_time) + "\n"
                              + str(download_user_name) + "\n"
                              + str(download_head_url) + "\n"
                              + str(download_video_url) + "\n"
                              + str(download_cover_url) + "\n"
                              + str(download_video_session))
                # upload the video
                if env == "dev":
                    Common.crawler_log().info("start uploading video: {}".format(download_video_title))
                    Publish.upload_and_publish("dev", "play")
                elif env == "prod":
                    Common.crawler_log().info("start uploading video: {}".format(download_video_title))
                    Publish.upload_and_publish("prod", "play")
                Common.crawler_log().info("removing this video's record from weishi_feeds.txt: {}".format(download_video_title))
            else:
                Common.crawler_log().info("video fails the download rule, removing its record from weishi_feeds.txt: {}".format(download_video_title))
            # both branches end the same way: rewrite weishi_feeds.txt without this
            # video's record; compare the video_id field exactly, since a substring
            # check could also drop records whose IDs merely contain this one
            with open("./txt/weishi_feeds.txt", "r", encoding="utf8") as f_r:
                lines = f_r.readlines()
            with open("./txt/weishi_feeds.txt", "w", encoding="utf8") as f_w:
                for line in lines:
                    if line.split(" + ")[1] == download_video_id:
                        continue
                    f_w.write(line)
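
# Assumed behavior of the helpers in main.common (not verified here):
# Common.download_method(text="video", ...) is expected to create the
# "./videos/<title>/" directory before info.txt is written below it, and
# Common.read_txt() is expected to return the file's lines, or an empty
# list when the file does not exist yet.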


if __name__ == "__main__":
    Weishi.get_weishi_recommend()