# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/25
"""
Download rule-compliant videos from the WeChat mini program "本山祝福短视频".
"""
- import json
- import os
- import random
- import sys
- import time
- from urllib import parse
- import requests
- import urllib3
- sys.path.append(os.getcwd())
- from main.common import Common
- from main.publish import Publish
- proxies = {"http": None, "https": None}
class BSZF:
    """Crawler for the "本山祝福" WeChat mini program.

    The two public steps hand work to each other through text files:

    * ``./txt/benshanzhufu_feeds.txt`` -- queue of videos waiting to be
      downloaded, one `` + ``-separated record per line.
    * ``./txt/benshanzhufu_videoid.txt`` -- ids of videos already handled.

    Both files are read through ``Common.read_txt(<file name>)``.
    NOTE(review): that helper presumably resolves the names inside ``./txt/``
    -- confirm against ``main.common``.
    """

    # Mini-program id; sent as ``chatKey``/``Referer`` and stored as the last
    # field of every queued record.
    _APP_ID = "wx0fb8149da961d3b0"
    _VISITOR_KEY = "165086930003741"
    _RECOMMEND_URL = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
    # Seconds before the recommend request is abandoned instead of hanging.
    _REQUEST_TIMEOUT = 30

    _FEEDS_NAME = "benshanzhufu_feeds.txt"
    _VIDEOID_NAME = "benshanzhufu_videoid.txt"
    _FEEDS_PATH = "./txt/benshanzhufu_feeds.txt"
    _VIDEOID_PATH = "./txt/benshanzhufu_videoid.txt"

    _SEP = " + "
    # Placeholder stored when the API response lacks a field.
    _MISSING = "0"

    # Column order of one record in the feeds file.
    _FEED_FIELDS = (
        "basic_time",
        "video_id",
        "video_play_cnt",
        "video_title",
        "video_duration",
        "video_comment_cnt",
        "video_like_cnt",
        "video_share_cnt",
        "video_resolution",
        "video_send_time",
        "user_name",
        "head_url",
        "video_cover",
        "video_url",
        "user_id",
        "session",
    )
    # Download needs every column up to and including ``video_url``.
    _MIN_FEED_FIELDS = 14

    # Line order of ``./videos/<title>/info.txt``.
    _INFO_FIELDS = (
        "video_id",
        "video_title",
        "video_duration",
        "video_play_cnt",
        "video_comment_cnt",
        "video_like_cnt",
        "video_share_cnt",
        "video_resolution",
        "video_send_time",
        "user_name",
        "head_url",
        "video_url",
        "video_cover",
        "session",
    )

    # Applied in order to a stripped title. The title is later used as a
    # directory name, so path-hostile characters are removed. Order matters:
    # e.g. "/" is dropped before the brand name so "本山/祝福" is removed too.
    _TITLE_REPLACEMENTS = (
        ("\n", ""),
        ("/", ""),
        ("本山祝福", ""),
        (" ", ""),
        ("&NBSP", ""),
        ("\r", ""),
        ("#", ""),
        (".", "。"),
        ("\\", ""),
        (":", ""),
        ("*", ""),
        ("?", ""),
        ('"', ""),
        ("<", ""),
        (">", ""),
        ("|", ""),
    )

    # ------------------------------------------------------------------
    # Pure helpers (no I/O)
    # ------------------------------------------------------------------
    @classmethod
    def _clean_title(cls, raw_title):
        """Return ``raw_title`` stripped and with unwanted characters removed."""
        title = raw_title.strip()
        for old, new in cls._TITLE_REPLACEMENTS:
            title = title.replace(old, new)
        return title

    @classmethod
    def _is_complete(cls, info):
        """Return True when id, url and title are all usable.

        A value is unusable when it is the missing-field placeholder or empty
        (a title can become empty after sanitising).
        """
        unusable = (cls._MISSING, "", None)
        return all(
            info[key] not in unusable
            for key in ("video_id", "video_url", "video_title")
        )

    @classmethod
    def _format_feed_line(cls, basic_time, info):
        """Serialise ``info`` into one newline-terminated feeds-file record."""
        record = dict(info, basic_time=basic_time, session=cls._APP_ID)
        return cls._SEP.join(str(record[name]) for name in cls._FEED_FIELDS) + "\n"

    @classmethod
    def _parse_feed_line(cls, line):
        """Split one feeds-file record into a dict keyed by column name.

        Raises:
            ValueError: if the record has fewer columns than download needs.
        """
        fields = line.strip().split(cls._SEP)
        if len(fields) < cls._MIN_FEED_FIELDS:
            raise ValueError("malformed feed record: {!r}".format(line))
        record = dict(zip(cls._FEED_FIELDS[:cls._MIN_FEED_FIELDS], fields))
        record["session"] = fields[-1]
        return record

    @classmethod
    def _feed_id(cls, line):
        """Return the video id column of a feeds-file line, or None if absent."""
        fields = line.strip().split(cls._SEP)
        return fields[1] if len(fields) > 1 else None

    @classmethod
    def _without_video(cls, lines, video_id):
        """Return ``lines`` minus the records of ``video_id``.

        The id is compared for equality (not as a substring), so removing
        "12" leaves "123" alone. Lines without an id column can never be
        processed and are dropped as well.
        """
        return [line for line in lines if cls._feed_id(line) not in (None, video_id)]

    # ------------------------------------------------------------------
    # I/O helpers
    # ------------------------------------------------------------------
    @classmethod
    def _remove_from_feeds(cls, video_id):
        """Rewrite the feeds file without the records of ``video_id``."""
        with open(cls._FEEDS_PATH, "r", encoding="UTF-8") as f_r:
            lines = f_r.readlines()
        # Compute the survivors before reopening in "w" mode: opening
        # truncates the file, so nothing may fail between open and write.
        kept = cls._without_video(lines, video_id)
        with open(cls._FEEDS_PATH, "w", encoding="utf-8") as f_w:
            f_w.writelines(kept)

    @classmethod
    def _parse_feed(cls, feed):
        """Extract and log the fields of one item of the recommend list.

        Missing fields are replaced by the "0" placeholder. Counters the API
        does not provide are filled with the fixed value "10000".
        """
        # Stringify the id so it compares equal to ids read back from the
        # text files even when the API delivers a number.
        video_id = str(feed.get("nid", cls._MISSING))
        Common.crawler_log().info("video_id:{}".format(video_id))

        if "video_cover" in feed:
            video_cover = feed["video_cover"]
            Common.crawler_log().info("video_cover:{}".format(video_cover))
        else:
            video_cover = cls._MISSING
            Common.crawler_log().info("video_cover不存在")

        if "video_url" in feed:
            video_url = feed["video_url"]
            Common.crawler_log().info("video_url:{}".format(video_url))
        else:
            video_url = cls._MISSING
            Common.crawler_log().info("video_url:不存在")

        if "width" in feed and "height" in feed:
            video_resolution = str(feed["width"]) + "*" + str(feed["height"])
            Common.crawler_log().info("video_resolution:{}".format(video_resolution))
        else:
            video_resolution = cls._MISSING + "*" + cls._MISSING
            Common.crawler_log().info("无分辨率")

        if "commentCount" in feed:
            video_comment_cnt = feed["commentCount"]
            Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
        else:
            video_comment_cnt = cls._MISSING
            Common.crawler_log().info("video_comment_cnt:0")

        if "update_time" in feed:
            video_send_time = feed["update_time"]
            Common.crawler_log().info("video_send_time:{}".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))))
        else:
            video_send_time = cls._MISSING
            Common.crawler_log().info("video_send_time:不存在")

        if "title" in feed:
            video_title = cls._clean_title(feed["title"])
            Common.crawler_log().info("video_title:{}".format(video_title))
        else:
            video_title = cls._MISSING
            Common.crawler_log().info("video_title不存在")

        return {
            "video_id": video_id,
            "video_play_cnt": "10000",
            "video_title": video_title,
            "video_duration": "10000",
            "video_comment_cnt": video_comment_cnt,
            "video_like_cnt": "10000",
            "video_share_cnt": "10000",
            "video_resolution": video_resolution,
            "video_send_time": video_send_time,
            "user_name": "bszf",
            # The cover image doubles as the author avatar.
            "head_url": video_cover,
            "video_cover": video_cover,
            "video_url": video_url,
            "user_id": "10000",
        }

    # ------------------------------------------------------------------
    # Public steps
    # ------------------------------------------------------------------
    @classmethod
    def get_recommend(cls):
        """Fetch one random page of the home-page feed and queue new videos.

        A video is appended to the feeds file only when
        1. its id is not in benshanzhufu_videoid.txt (already handled),
        2. its id is not in benshanzhufu_feeds.txt (already queued), and
        3. its id, url and title are all present.

        Any exception is logged and swallowed; nothing is returned.
        """
        now = int(time.time() * 1000)
        header = {
            "Connection": "keep-alive",
            "vision": "1.1.0",
            "content-type": "application/x-www-form-urlencoded",
            "scene": "1008",
            "content-time": str(now),
            "token": "",
            "visitorKey": cls._VISITOR_KEY,
            "chatKey": cls._APP_ID,
            "cache-time": str(now),
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
            "Referer": "https://servicewechat.com/{}/2/page-frame.html".format(cls._APP_ID),
        }
        parameter = {
            "cid": "",
            "page": random.randint(1, 75),
            "is_ads": 1,
            "model": "iPhone 11<iPhone12,1>",
            "mini_version": "8.0.20",
            "origin_channel": "3",
            "origin_type": "2",
            "origin_level": "0",
            "ini_id": cls._VISITOR_KEY,
        }
        # The query is a URL-quoted JSON document appended to the base URL.
        url = cls._RECOMMEND_URL + parse.quote(json.dumps(parameter))

        try:
            urllib3.disable_warnings()
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as-is because the endpoint's certificate is unknown here.
            r = requests.get(url=url, headers=header, proxies=proxies,
                             verify=False, timeout=cls._REQUEST_TIMEOUT)
            response = json.loads(r.content.decode("utf8"))
            if "data" not in response:
                Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
                time.sleep(3)
                return

            feeds = response["data"]["list"]
            # Read both de-dup lists once; ids queued during this call are
            # added to ``queued_ids`` so the page is de-duplicated too.
            downloaded_ids = {p_id.strip() for p_id in Common.read_txt(cls._VIDEOID_NAME)}
            queued_ids = {cls._feed_id(line) for line in Common.read_txt(cls._FEEDS_NAME)}

            for feed in feeds:
                info = cls._parse_feed(feed)
                video_id = info["video_id"]
                video_title = info["video_title"]

                if video_id in downloaded_ids:
                    Common.crawler_log().info("该视频已下载:{}".format(video_title))
                    continue
                Common.crawler_log().info("该视频未下载:{}".format(video_title))

                if video_id in queued_ids:
                    Common.crawler_log().info("该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
                elif not cls._is_complete(info):
                    Common.crawler_log().info("视频不存在")
                else:
                    Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
                    with open(cls._FEEDS_PATH, "a", encoding="UTF-8") as f_a:
                        f_a.write(cls._format_feed_line(int(time.time()), info))
                    queued_ids.add(video_id)
        except Exception as e:
            Common.crawler_log().error("获取视频 list 异常:{}".format(e))

    @classmethod
    def download_video(cls, env):
        """Download, publish and de-queue every video in the feeds file.

        For each queued record: download cover and video, write
        ``./videos/<title>/info.txt``, upload when ``env`` is "dev" (test) or
        "prod" (production), record the id in benshanzhufu_videoid.txt and
        remove the record from the queue. A record that fails at any step is
        logged and removed from the queue as well.

        Args:
            env: "dev" or "prod"; any other value skips the upload.
        """
        for video in Common.read_txt(cls._FEEDS_NAME):
            # None for a line without an id column; such lines are purged by
            # the except branch below instead of crashing the loop.
            download_video_id = cls._feed_id(video)
            try:
                record = cls._parse_feed_line(video)
                download_video_title = record["video_title"]
                Common.crawler_log().info("开始下载视频:{}".format(download_video_title))

                # Cover first, then the video itself.
                Common.download_method(text="cover", d_name=download_video_title,
                                       d_url=record["video_cover"])
                Common.download_method(text="video", d_name=download_video_title,
                                       d_url=record["video_url"])

                # One value per line, no trailing newline.
                info_path = "./videos/" + download_video_title + "/info.txt"
                with open(info_path, "a", encoding="UTF-8") as f_a:
                    f_a.write("\n".join(str(record[name]) for name in cls._INFO_FIELDS))

                if env in ("dev", "prod"):
                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
                    Publish.upload_and_publish(env, "play")

                with open(cls._VIDEOID_PATH, "a", encoding="UTF-8") as fa:
                    fa.write(download_video_id + "\n")

                Common.crawler_log().info(
                    "删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
                cls._remove_from_feeds(download_video_id)
            except Exception as e:
                # Drop the offending record so it is not retried forever.
                Common.crawler_log().error(
                    "该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
                cls._remove_from_feeds(download_video_id)
if __name__ == "__main__":
    # Script entry point: refresh the queue of pending videos, then download
    # and publish everything queued against the test ("dev") environment.
    bszf = BSZF()
    bszf.get_recommend()
    bszf.download_video(env="dev")
|