|
@@ -0,0 +1,285 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# @Author: wangkun
|
|
|
+# @Time: 2022/4/25
|
|
|
+"""
|
|
|
+从 微信小程序-本山祝福短视频 中,下载符合规则的视频
|
|
|
+"""
|
|
|
+import json
|
|
|
+import os
|
|
|
+import random
|
|
|
+import sys
|
|
|
+import time
|
|
|
+from urllib import parse
|
|
|
+import requests
|
|
|
+import urllib3
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+from main.common import Common
|
|
|
+from main.publish import Publish
|
|
|
+
|
|
|
+proxies = {"http": None, "https": None}
|
|
|
+
|
|
|
+
|
|
|
class BSZF:
    """Crawler for the WeChat mini program "本山祝福" short-video feed.

    ``get_recommend`` fetches one page of the home feed and queues unseen
    videos in ``./txt/benshanzhufu_feeds.txt`` (one `` + ``-separated record
    per line). ``download_video`` drains that queue: it downloads cover and
    video, writes ``info.txt``, publishes, records the id in
    ``./txt/benshanzhufu_videoid.txt`` and removes the record from the queue.
    """

    # File names as passed to Common.read_txt, and the paths opened directly.
    # NOTE(review): presumably Common.read_txt resolves names under ./txt/ —
    # confirm against main.common.
    _FEEDS_NAME = "benshanzhufu_feeds.txt"
    _VIDEO_IDS_NAME = "benshanzhufu_videoid.txt"
    _FEEDS_PATH = "./txt/benshanzhufu_feeds.txt"
    _VIDEO_IDS_PATH = "./txt/benshanzhufu_videoid.txt"

    _FIELD_SEP = " + "
    _SESSION = "wx0fb8149da961d3b0"
    # Placeholder stored when the API response lacks a field.
    _MISSING = "0"
    # Seconds; without a timeout requests.get can block indefinitely.
    _REQUEST_TIMEOUT = 30
    # Number of leading fields download_video reads by position.
    _MIN_FEED_FIELDS = 14

    # Applied in order after str.strip(); order matters (e.g. "/" is removed
    # before "本山祝福" so "本山/祝福" is also dropped).
    # NOTE(review): the original chain listed the " " and "?" replacements
    # twice in a row; presumably one of each was a different (non-breaking or
    # full-width) character lost in transit — TODO confirm and re-add.
    _TITLE_REPLACEMENTS = (
        ("\n", ""),
        ("/", ""),
        ("本山祝福", ""),
        (" ", ""),
        ("&NBSP", ""),
        ("\r", ""),
        ("#", ""),
        (".", "。"),
        ("\\", ""),
        (":", ""),
        ("*", ""),
        ("?", ""),
        ('"', ""),
        ("<", ""),
        (">", ""),
        ("|", ""),
    )

    @classmethod
    def _clean_title(cls, raw):
        """Return *raw* stripped and with path-unsafe characters removed."""
        title = raw.strip()
        for old, new in cls._TITLE_REPLACEMENTS:
            title = title.replace(old, new)
        return title

    @classmethod
    def _parse_feed(cls, item):
        """Extract the stored fields from one feed entry.

        Returns ``(info, notes)``: ``info`` maps field names to values (with
        ``"0"`` for anything absent from *item*), ``notes`` is the list of
        log messages describing what was found.
        """
        missing = cls._MISSING
        notes = []

        # Normalise to str so the id compares equal to ids read back from disk.
        video_id = str(item["nid"]) if "nid" in item else missing
        notes.append("video_id:{}".format(video_id))

        if "video_cover" in item:
            video_cover = item["video_cover"]
            notes.append("video_cover:{}".format(video_cover))
        else:
            video_cover = missing
            notes.append("video_cover不存在")

        if "video_url" in item:
            video_url = item["video_url"]
            notes.append("video_url:{}".format(video_url))
        else:
            video_url = missing
            notes.append("video_url:不存在")

        if "width" in item and "height" in item:
            video_resolution = "{}*{}".format(item["width"], item["height"])
            notes.append("video_resolution:{}".format(video_resolution))
        else:
            video_resolution = "{0}*{0}".format(missing)
            notes.append("无分辨率")

        if "commentCount" in item:
            video_comment_cnt = item["commentCount"]
            notes.append("video_comment_cnt:{}".format(video_comment_cnt))
        else:
            video_comment_cnt = missing
            notes.append("video_comment_cnt:0")

        if "update_time" in item:
            video_send_time = item["update_time"]
            try:
                shown = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))
            except (TypeError, ValueError, OverflowError, OSError):
                # A non-numeric timestamp must not abort the whole batch.
                shown = video_send_time
            notes.append("video_send_time:{}".format(shown))
        else:
            video_send_time = missing
            notes.append("video_send_time:不存在")

        raw_title = item.get("title")
        if isinstance(raw_title, str):
            video_title = cls._clean_title(raw_title)
            notes.append("video_title:{}".format(video_title))
        else:
            video_title = missing
            notes.append("video_title不存在")

        info = {
            "video_id": video_id,
            "video_cover": video_cover,
            "video_url": video_url,
            "video_resolution": video_resolution,
            "video_comment_cnt": video_comment_cnt,
            "video_send_time": video_send_time,
            "video_title": video_title,
            # The feed does not supply these; fixed placeholder values.
            "video_like_cnt": "10000",
            "video_share_cnt": "10000",
            "video_duration": "10000",
            "video_play_cnt": "10000",
            "user_name": "bszf",
            "head_url": video_cover,
            "user_id": "10000",
        }
        return info, notes

    @classmethod
    def _classify(cls, info, downloaded_ids, queued_ids):
        """Decide what to do with a parsed feed entry.

        Returns ``"downloaded"`` (id already in the video-id file),
        ``"queued"`` (id already in the feeds file), ``"invalid"`` (id, url
        or title missing) or ``"new"`` (should be appended to the feeds file).
        """
        video_id = info["video_id"]
        if video_id in downloaded_ids:
            return "downloaded"
        if video_id in queued_ids:
            return "queued"
        missing = cls._MISSING
        if (video_id == missing
                or info["video_url"] == missing
                or info["video_title"] == missing):
            return "invalid"
        return "new"

    @classmethod
    def _format_feed_line(cls, info, basic_time):
        """Serialise *info* into one feeds-file line (newline included)."""
        fields = (
            basic_time,
            info["video_id"],
            info["video_play_cnt"],
            info["video_title"],
            info["video_duration"],
            info["video_comment_cnt"],
            info["video_like_cnt"],
            info["video_share_cnt"],
            info["video_resolution"],
            info["video_send_time"],
            info["user_name"],
            info["head_url"],
            info["video_cover"],
            info["video_url"],
            info["user_id"],
            cls._SESSION,
        )
        return cls._FIELD_SEP.join(str(field) for field in fields) + "\n"

    @classmethod
    def _parse_feed_line(cls, line):
        """Split one feeds-file line into a dict of named fields.

        Raises:
            ValueError: if the line has fewer fields than download_video reads.
        """
        fields = line.strip().split(cls._FIELD_SEP)
        if len(fields) < cls._MIN_FEED_FIELDS:
            raise ValueError(
                "malformed feed line: expected at least {} fields, got {}".format(
                    cls._MIN_FEED_FIELDS, len(fields)))
        keys = (
            "basic_time", "video_id", "play_cnt", "title", "duration",
            "comment_cnt", "like_cnt", "share_cnt", "resolution", "send_time",
            "user_name", "head_url", "cover_url", "video_url",
        )
        record = dict(zip(keys, fields))
        # The session id is always the last field of the line.
        record["session"] = fields[-1]
        return record

    @classmethod
    def _feed_ids(cls, lines):
        """Return the set of video ids (second field) found in *lines*."""
        ids = set()
        for line in lines:
            fields = line.strip().split(cls._FIELD_SEP)
            if len(fields) > 1:
                ids.add(fields[1])
        return ids

    @classmethod
    def _remove_from_feeds(cls, video_id, feeds_path=None):
        """Rewrite the feeds file without the record whose id is *video_id*.

        The id is compared for equality (not substring), and the surviving
        lines are computed before the file is reopened for writing so a
        failure cannot leave it truncated. Lines too short to carry an id
        (blank or malformed) are dropped as well, since they can never be
        processed. Pass ``video_id=None`` to drop only such lines.
        """
        path = cls._FEEDS_PATH if feeds_path is None else feeds_path
        with open(path, "r", encoding="UTF-8") as f_r:
            lines = f_r.readlines()
        kept = []
        for line in lines:
            fields = line.strip().split(cls._FIELD_SEP)
            if len(fields) > 1 and fields[1] != video_id:
                kept.append(line)
        with open(path, "w", encoding="utf-8") as f_w:
            f_w.writelines(kept)

    @classmethod
    def get_recommend(cls):
        """Fetch one random page of the home feed and queue unseen videos.

        For every entry in the response:
          1. skip it if its id is already in benshanzhufu_videoid.txt;
          2. skip it if its id is already in benshanzhufu_feeds.txt;
          3. skip it if id, url or title is missing;
          4. otherwise append its record to benshanzhufu_feeds.txt.

        Any exception is logged and swallowed; nothing is returned.
        """
        now = int(time.time() * 1000)
        header = {
            "Connection": "keep-alive",
            "vision": "1.1.0",
            "content-type": "application/x-www-form-urlencoded",
            "scene": "1008",
            "content-time": str(now),
            "token": "",
            "visitorKey": "165086930003741",
            "chatKey": "wx0fb8149da961d3b0",
            "cache-time": str(now),
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html",
        }
        parameter = {
            "cid": "",
            "page": random.randint(1, 75),
            "is_ads": 1,
            "model": "iPhone 11<iPhone12,1>",
            "mini_version": "8.0.20",
            "origin_channel": "3",
            "origin_type": "2",
            "origin_level": "0",
            "ini_id": "165086930003741",
        }
        # The API takes its arguments as URL-quoted JSON in one query value.
        url = ("https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
               + parse.quote(json.dumps(parameter)))

        try:
            urllib3.disable_warnings()
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, confirm it is still required.
            r = requests.get(headers=header, url=url, proxies=proxies,
                             verify=False, timeout=cls._REQUEST_TIMEOUT)
            response = json.loads(r.content.decode("utf8"))
            if "data" not in response:
                Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
                time.sleep(3)
                return

            feeds = response["data"]["list"]
            # Read both dedup files once; queued_ids is kept in sync below.
            downloaded_ids = {
                line.strip() for line in Common.read_txt(cls._VIDEO_IDS_NAME)}
            queued_ids = cls._feed_ids(Common.read_txt(cls._FEEDS_NAME))

            for item in feeds:
                info, notes = cls._parse_feed(item)
                for note in notes:
                    Common.crawler_log().info(note)
                video_title = info["video_title"]

                status = cls._classify(info, downloaded_ids, queued_ids)
                if status == "downloaded":
                    Common.crawler_log().info("该视频已下载:{}".format(video_title))
                    continue
                Common.crawler_log().info("该视频未下载:{}".format(video_title))

                if status == "queued":
                    Common.crawler_log().info(
                        "该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
                elif status == "invalid":
                    Common.crawler_log().info("视频不存在")
                else:
                    Common.crawler_log().info(
                        "添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
                    with open(cls._FEEDS_PATH, "a", encoding="UTF-8") as f_a:
                        f_a.write(cls._format_feed_line(info, int(time.time())))
                    queued_ids.add(info["video_id"])
        except Exception as e:
            # Top-level boundary of the crawl step: log and carry on.
            Common.crawler_log().error("获取视频 list 异常:{}".format(e))

    @classmethod
    def download_video(cls, env):
        """Download and publish every video queued in benshanzhufu_feeds.txt.

        Args:
            env: ``"dev"`` for the test environment or ``"prod"`` for
                production; any other value skips the upload step.

        A record that fails at any step is logged and removed from the feeds
        file so it is not retried forever.
        """
        for video in Common.read_txt(cls._FEEDS_NAME):
            if not video.strip():
                continue
            fields = video.strip().split(cls._FIELD_SEP)
            # Known before the try so the handler can remove the right record.
            download_video_id = fields[1] if len(fields) > 1 else None
            try:
                record = cls._parse_feed_line(video)
                download_video_title = record["title"]

                Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
                # Download the cover image.
                Common.download_method(
                    text="cover", d_name=download_video_title, d_url=record["cover_url"])
                # Download the video itself.
                Common.download_method(
                    text="video", d_name=download_video_title, d_url=record["video_url"])

                # Save the metadata to ./videos/{title}/info.txt, one value per line.
                info_lines = (
                    record["video_id"],
                    download_video_title,
                    record["duration"],
                    record["play_cnt"],
                    record["comment_cnt"],
                    record["like_cnt"],
                    record["share_cnt"],
                    record["resolution"],
                    record["send_time"],
                    record["user_name"],
                    record["head_url"],
                    record["video_url"],
                    record["cover_url"],
                    record["session"],
                )
                with open(r"./videos/" + download_video_title + "/info.txt",
                          "a", encoding="UTF-8") as f_a:
                    f_a.write("\n".join(info_lines))

                # Upload and publish.
                if env in ("dev", "prod"):
                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
                    Publish.upload_and_publish(env, "play")

                # Remember the id so get_recommend skips this video next time.
                with open(cls._VIDEO_IDS_PATH, "a", encoding="UTF-8") as fa:
                    fa.write(download_video_id + "\n")

                # Drop the processed record from the queue.
                Common.crawler_log().info(
                    "删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
                cls._remove_from_feeds(download_video_id)
            except Exception as e:
                # Broken record or failed step: drop it from the feeds file.
                Common.crawler_log().error(
                    "该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
                cls._remove_from_feeds(download_video_id)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: queue one page of recommendations, then process the
    # queue against the test ("dev") environment. Both steps are classmethods,
    # so they are invoked on the class itself.
    BSZF.get_recommend()
    BSZF.download_video("dev")
|