123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- # -*- coding: utf-8 -*-
- # @Author: wangkun
- # @Time: 2022/4/18
- """
- 获取看一看+小程序,首页推荐视频列表
- """
- import json
- import os
- import random
- import sys
- import time
- import requests
- import urllib3
- sys.path.append(os.getcwd())
- from main.common import Common
# Proxy mapping handed to the requests call in this module as ``proxies=``.
# Both schemes are mapped to None.
# NOTE(review): presumably this is meant to bypass any system/environment
# proxy for the crawler's traffic -- confirm against the deployment setup.
proxies = {"http": None, "https": None}
# Full URL of the recommendation-feed endpoint (host + path, query appended by requests).
_FEED_URL = "https://search.weixin.qq.com" + "/cgi-bin/recwxa/recwxavideolist?"

# Seconds to wait for the endpoint; without a timeout requests can block forever.
_FEED_TIMEOUT_SECONDS = 30

# Characters removed from titles after stripping: line breaks, path separators,
# characters that are invalid in file names, and spaces.
# NOTE(review): the original replace chain listed "?" twice; the second entry
# may have been a full-width question mark before the text was re-encoded -- confirm.
_TITLE_STRIP_TABLE = str.maketrans("", "", '\n/\\\r:*?"<>| ')

# Feed queue file and the separator between the fields of one record.
_FEEDS_FILE = r"./txt/kanyikan_feeds.txt"
_FIELD_SEPARATOR = " + "

# Order in which parsed fields are written to a record (after the timestamp).
_RECORD_FIELDS = (
    "video_id", "play_cnt", "title", "duration", "comment_cnt", "liked_cnt",
    "shared_cnt", "resolution", "send_date", "user", "user_cover", "cover", "url",
)

# Fields that must not be the empty string for a video to be considered valid.
_REQUIRED_FIELDS = tuple(name for name in _RECORD_FIELDS if name != "resolution")


def _request_feed_items():
    """Call the feed endpoint until it returns an item list, then return it.

    Retries in a loop instead of by recursion so that a long outage cannot
    exhaust the interpreter's recursion limit. A fresh session is read from
    ``Common.get_session()`` before every attempt.

    Returns:
        The list found under ``response["data"]["items"]``.

    Raises:
        Whatever ``requests``/``json`` raise on network or decoding failures;
        the caller is expected to handle them.
    """
    header = {
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
                      "NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
    }
    urllib3.disable_warnings()

    while True:
        session = Common.get_session()
        Common.crawler_log().info("获取视频list时,session:{}".format(session))
        params = {
            'session': session,
            "offset": 0,
            "wxaVersion": "3.9.2",
            "count": "10",
            "channelid": "208",
            "scene": '310',
            "subscene": '1089',
            "clientVersion": '8.0.18',
            "sharesearchid": '0',
            "nettype": 'wifi',
            "switchprofile": "0",
            "switchnewuser": "0",
        }
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # because the original request did the same -- confirm it is still needed.
        reply = requests.get(_FEED_URL, headers=header, params=params,
                             proxies=proxies, verify=False,
                             timeout=_FEED_TIMEOUT_SECONDS)
        payload = json.loads(reply.content.decode("utf8"))

        if "data" not in payload:
            # No "data" key: treated as an expired session; wait 31-50 seconds.
            Common.crawler_log().info("获取视频list时,session过期,随机睡眠 31-50 秒")
            time.sleep(random.randint(31, 50))
            continue
        if "items" not in payload["data"]:
            # Empty feed: wait 1-3 minutes before asking again.
            Common.crawler_log().info("获取视频list时,返回空信息,随机睡眠 1-3 分钟")
            time.sleep(random.randint(60, 180))
            continue
        return payload["data"]["items"]


def _pick_play_url(item):
    """Return the play address of *item*, preferring mpInfo, then ctnInfo, then the bare urlInfo."""
    cdn_info = item["videoInfo"]["videoCdnInfo"]
    if "mpInfo" in cdn_info:
        candidates = cdn_info["mpInfo"]["urlInfo"]
        # The third entry is used when present, otherwise the first one.
        return candidates[2 if len(candidates) > 2 else 0]["url"]
    if "ctnInfo" in cdn_info:
        return cdn_info["ctnInfo"]["urlInfo"][0]["url"]
    return cdn_info["urlInfo"][0]["url"]


def _parse_item(item):
    """Extract the stored fields of one feed item into a dict, logging each value.

    Raises:
        KeyError, IndexError, TypeError, AttributeError: when the item lacks
        an expected field or a field has an unexpected shape.
    """
    log = Common.crawler_log()
    video = {}

    video["video_id"] = item["videoId"]
    log.info('视频ID:{}'.format(video["video_id"]))

    video["title"] = item["title"].strip().translate(_TITLE_STRIP_TABLE)
    log.info('视频标题:{}'.format(video["title"]))

    video["play_cnt"] = item["playCount"]
    log.info('视频播放次数:{}'.format(video["play_cnt"]))

    video["liked_cnt"] = item["liked_cnt"]
    log.info('视频点赞数:{}'.format(video["liked_cnt"]))

    video["duration"] = item["mediaDuration"]
    log.info('视频时长:{}秒'.format(video["duration"]))

    video["comment_cnt"] = item["comment_cnt"]
    log.info('视频评论数:{}'.format(video["comment_cnt"]))

    video["shared_cnt"] = item["shared_cnt"]
    log.info('视频分享数:{}'.format(video["shared_cnt"]))

    video["send_date"] = item["date"]
    log.info('视频发布时间:{}'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(video["send_date"]))))

    video["user"] = item["source"].strip().replace("\n", "")
    log.info('视频用户名:{}'.format(video["user"]))

    # Width/height are optional; a missing or empty block is recorded as 0*0.
    size_info = item.get("short_video_info")
    if not size_info:
        video["resolution"] = "0*0"
        log.info("无分辨率:{}".format(video["resolution"]))
    else:
        video["resolution"] = str(size_info["width"]) + "*" + str(size_info["height"])
        log.info('视频宽高:{}'.format(video["resolution"]))

    video["user_cover"] = item["bizIcon"]
    log.info('视频用户头像:{}'.format(video["user_cover"]))

    # Prefer the smart cover, fall back to the thumbnail.
    video["cover"] = item["smartCoverUrl"] if "smartCoverUrl" in item else item["thumbUrl"]
    log.info('视频封面:{}'.format(video["cover"]))

    video["url"] = _pick_play_url(item)
    log.info('视频播放地址:{}'.format(video["url"]))
    return video


def _is_valid(video):
    """Return True when none of the required fields is an empty string."""
    if video["title"].strip() == "":
        return False
    return not any(video[name] == "" for name in _REQUIRED_FIELDS)


def _read_queued_ids():
    """Return the video IDs already recorded in kanyikan_feeds.txt.

    Lines that do not contain at least two separator-delimited fields (for
    example blank lines) are ignored instead of raising IndexError.
    """
    queued = set()
    for line in Common.read_txt("kanyikan_feeds.txt"):
        parts = line.split(_FIELD_SEPARATOR)
        if len(parts) > 1:
            queued.add(parts[1])
    return queued


def get_feeds():
    """Fetch the home-page recommendation feed and queue new videos.

    For every item that carries video information:
      1. skip it if its ID is already listed in kanyikan_videoid.txt;
      2. skip it if its ID is already recorded in kanyikan_feeds.txt;
      3. otherwise append one record to ./txt/kanyikan_feeds.txt made of the
         current timestamp, the parsed fields and the current session, joined
         by " + ".

    An item that cannot be parsed is logged and skipped without aborting the
    rest of the batch. Any other failure (network, decoding, file access) is
    logged and ends the call; nothing is raised to the caller.

    Returns:
        None.
    """
    try:
        items = _request_feed_items()

        # Load both dedupe sources once per batch; IDs queued during this
        # batch are added to queued_ids so duplicates inside the batch are
        # still rejected.
        downloaded_ids = {line.strip() for line in Common.read_txt("kanyikan_videoid.txt")}
        queued_ids = _read_queued_ids()

        for item in items:
            # Items without video information are ignored.
            if "videoInfo" not in item:
                Common.crawler_log().info("无视频信息")
                continue

            try:
                video = _parse_item(item)
            except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
                Common.crawler_log().error("获取视频 list 时异常:{}".format(e))
                continue

            if not _is_valid(video):
                Common.crawler_log().info("无效视频")
                continue

            title = video["title"]
            video_key = str(video["video_id"])

            if video_key in downloaded_ids:
                Common.crawler_log().info("该视频已下载:{}".format(title))
                continue
            Common.crawler_log().info("该视频未下载:{}".format(title))

            if video_key in queued_ids:
                Common.crawler_log().info("该视频已在kanyikan_feeds.txt中:{}".format(title))
                continue

            Common.crawler_log().info("添加该视频信息至kanyikan_feeds.txt:{}".format(title))
            record = [int(time.time())]
            record.extend(video[name] for name in _RECORD_FIELDS)
            record.append(Common.get_session())
            with open(_FEEDS_FILE, "a", encoding="utf8") as f:
                f.write(_FIELD_SEPARATOR.join(str(value) for value in record) + "\n")
            queued_ids.add(video_key)
    except Exception as e:
        # Top-level boundary of the crawler step: log and give up on this pass.
        Common.crawler_log().error("获取视频 list 时异常:{}".format(e))
# Script entry point: run one feed-collection pass when executed directly.
if __name__ == "__main__":
    get_feeds()
|