wangkun 3 years ago
parent
commit
18ecd5de30

+ 1 - 0
README.md

@@ -0,0 +1 @@
+Benshanzhufu (本山祝福) WeChat mini-program crawler

+ 3 - 0
logs/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 3 - 0
main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 139 - 0
main/common.py

@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Shared helpers: create logger / delete logs / download helpers / read txt files / count downloads
+"""
+from datetime import date, timedelta
+import datetime
+import logging
+import os
+import time
+import requests
+import urllib3
+
+proxies = {"http": None, "https": None}
+
+
+class Common:
+    # Current time, <class 'datetime.datetime'>, e.g. 2022-04-14 20:13:51.244472
+    now = datetime.datetime.now()
+    # Yesterday, <class 'str'>, e.g. 2022-04-13
+    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
+    # Today, <class 'datetime.date'>, e.g. 2022-04-14
+    today = date.today()
+    # Tomorrow, <class 'str'>, e.g. 2022-04-15
+    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
+
+    @staticmethod
+    def crawler_log():
+        """
+        Create and return the crawler logger
+        """
+        # Log directory
+        log_dir = r"./logs/"
+        log_path = os.getcwd() + os.sep + log_dir
+        if not os.path.isdir(log_path):
+            os.makedirs(log_path)
+
+        # Log format and file name
+        log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        date_format = "%Y-%m-%d %p %H:%M:%S"
+        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '.log'
+
+        # Initialize logging
+        logging.basicConfig(filename=log_path + log_name, level=logging.INFO, format=log_format, datefmt=date_format)
+        crawler_logger = logging.getLogger("crawler-log")
+
+        return crawler_logger
+
+    @classmethod
+    def del_logs(cls):
+        """
+        Remove redundant log files
+        :return: keeps only the 7 most recent logs
+        """
+        log_dir = r"./logs/"
+        all_files = sorted(os.listdir(log_dir))
+        all_logs = []
+        for log in all_files:
+            name = os.path.splitext(log)[-1]
+            if name == ".log":
+                all_logs.append(log)
+
+        if len(all_logs) > 7:
+            for file in all_logs[:len(all_logs) - 7]:
+                os.remove(log_dir + file)
+        cls.crawler_log().info("清除冗余日志成功")
+
+    @classmethod
+    def download_method(cls, text, d_name, d_url):
+        """
+        下载封面:text == "cover" ; 下载视频:text == "video"
+        需要下载的视频标题:d_title
+        视频封面,或视频播放地址:d_url
+        下载保存路径:"./files/{d_title}/"
+        """
+        # 首先创建一个保存该视频相关信息的文件夹
+        video_dir = "./videos/" + d_name + "/"
+        if not os.path.exists(video_dir):
+            os.mkdir(video_dir)
+
+        # Download the video
+        if text == "video":
+            # URL of the video to download
+            video_url = d_url
+            # local file name of the video
+            video_name = "video.mp4"
+
+            # download the video
+            urllib3.disable_warnings()
+            response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
+            try:
+                with open(video_dir + video_name, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=10240):
+                        f.write(chunk)
+                cls.crawler_log().info("==========视频下载完成==========")
+            except Exception as e:
+                cls.crawler_log().error("视频下载失败:{}".format(e))
+
+        # Download the cover
+        elif text == "cover":
+            # URL of the cover to download
+            cover_url = d_url
+            # local file name of the cover
+            cover_name = "image.jpg"
+
+            # download the cover
+            urllib3.disable_warnings()
+            response = requests.get(cover_url, proxies=proxies, verify=False)
+            try:
+                with open(video_dir + cover_name, "wb") as f:
+                    f.write(response.content)
+                cls.crawler_log().info("==========封面下载完成==========")
+            except Exception as e:
+                cls.crawler_log().error("封面下载失败:{}".format(e))
+
+    @staticmethod
+    def read_txt(t_name):
+        """
+        Read a txt file from ./txt/
+        :param t_name: file name
+        :return: file contents as a list of lines
+        """
+        with open(r"./txt/" + t_name, "r", encoding="UTF-8") as f:
+            return f.readlines()
+
+    @classmethod
+    def benshanzhufu_download_count(cls):
+        videoid_path = r"./txt/benshanzhufu_videoid.txt"
+        with open(videoid_path, "rb") as f:
+            count = len(f.readlines())
+        cls.crawler_log().info('累计下载视频数: {}\n'.format(count))
+
+
+if __name__ == "__main__":
+    common = Common()
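
A minimal usage sketch of the Common helpers added above. It assumes the script is run from the repository root (paths such as ./logs/ and ./txt/ are relative to the working directory); the log text is illustrative only:

    from main.common import Common

    logger = Common.crawler_log()          # writes to ./logs/<YYYY-MM-DD>.log
    logger.info("crawler started")
    Common.del_logs()                      # keep only the 7 most recent .log files
    known_ids = Common.read_txt("benshanzhufu_videoid.txt")
    logger.info("known video ids: {}".format(len(known_ids)))
    Common.benshanzhufu_download_count()   # logs the cumulative download count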

+ 91 - 0
main/demo.py

@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+from urllib import parse
+import json
+import random
+import time
+import urllib3
+import requests
+
+
+class Demo:
+    num = 1
+
+    @classmethod
+    def get_page_num(cls):
+        cls.num += 1
+        return cls.num
+
+    @classmethod
+    def get_feeds(cls):
+        now = int(time.time() * 1000)
+        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
+        header = {
+            "Connection": "keep-alive",
+            "vision": "1.1.0",
+            "content-type": "application/x-www-form-urlencoded",
+            "scene": "1008",
+            "content-time": str(now),
+            "token": "",
+            "visitorKey": "165086930003741",
+            "chatKey": "wx0fb8149da961d3b0",
+            "cache-time": str(now),
+            "Accept-Encoding": "gzip,compress,br,deflate",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
+                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
+            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
+        }
+        parameter = {
+                    "cid": "",
+                    "page": random.randint(1, 75),
+                    "is_ads": 1,
+                    "model": "iPhone 11<iPhone12,1>",
+                    "mini_version": "8.0.20",
+                    "origin_channel": "3",
+                    "origin_type": "2",
+                    "origin_level": "0",
+                    "ini_id": "165086930003741"
+        }
+
+        params = parse.quote(json.dumps(parameter))
+        url = url + str(params)
+
+        urllib3.disable_warnings()
+        r = requests.get(headers=header, url=url, verify=False)
+        response = json.loads(r.content.decode("utf8"))
+        data = response["data"]["list"]
+
+        for k, v in parameter.items():
+            print(f"{k}:{v}")
+
+        print("\n")
+
+        for video_list in data:
+            print(video_list)
+
+    @classmethod
+    def encode_params(cls):
+        data = {
+                "cid": "",
+                "page": 8,
+                "is_ads": 1,
+                "model": "iPhone 11<iPhone12,1>",
+                "mini_version": "8.0.20",
+                "origin_channel": "3",
+                "origin_type": "2",
+                "origin_level": "3",
+                "ini_id": "165086930003741"
+        }
+        text = parse.urlencode(data)
+        print(text)
+
+        data = str(data)
+        text1 = parse.quote(data)
+        print(text1)
+
+
+if __name__ == "__main__":
+    demo = Demo()
+    demo.get_feeds()
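
The request URL in get_feeds() is built by JSON-encoding the parameter dict and percent-escaping it with urllib.parse.quote before appending it to the query string. A standalone sketch of that encoding step (reduced parameter set, illustrative values):

    from urllib import parse
    import json

    base_url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
    parameter = {"cid": "", "page": 1, "is_ads": 1}
    request_url = base_url + parse.quote(json.dumps(parameter))
    print(request_url)  # the JSON payload ends up percent-encoded in the query string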

+ 285 - 0
main/download.py

@@ -0,0 +1,285 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Download videos that match the crawl rules from the 本山祝福 (Benshanzhufu) WeChat mini-program
+"""
+import json
+import os
+import random
+import sys
+import time
+from urllib import parse
+import requests
+import urllib3
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.publish import Publish
+
+proxies = {"http": None, "https": None}
+
+
+class BSZF:
+
+    @classmethod
+    def get_recommend(cls):
+        """
+        Fetch the recommended video list from the Benshanzhufu mini-program home page:
+            1. Deduplicate against benshanzhufu_videoid.txt
+            2. Deduplicate against benshanzhufu_feeds.txt
+            3. Append the video info to benshanzhufu_feeds.txt
+        """
+        now = int(time.time() * 1000)
+        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
+        header = {
+            "Connection": "keep-alive",
+            "vision": "1.1.0",
+            "content-type": "application/x-www-form-urlencoded",
+            "scene": "1008",
+            "content-time": str(now),
+            "token": "",
+            "visitorKey": "165086930003741",
+            "chatKey": "wx0fb8149da961d3b0",
+            "cache-time": str(now),
+            "Accept-Encoding": "gzip,compress,br,deflate",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
+                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
+            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
+        }
+        parameter = {
+            "cid": "",
+            "page": random.randint(1, 75),
+            "is_ads": 1,
+            "model": "iPhone 11<iPhone12,1>",
+            "mini_version": "8.0.20",
+            "origin_channel": "3",
+            "origin_type": "2",
+            "origin_level": "0",
+            "ini_id": "165086930003741"
+        }
+
+        params = parse.quote(json.dumps(parameter))
+        url = url + str(params)
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(headers=header, url=url, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            if "data" not in response:
+                Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
+                time.sleep(3)
+            else:
+                feeds = response["data"]["list"]
+                for i in range(len(feeds)):
+                    if "nid" not in feeds[i]:
+                        video_id = "0"
+                        Common.crawler_log().info("video_id:{}".format(video_id))
+                    else:
+                        video_id = feeds[i]["nid"]
+                        Common.crawler_log().info("video_id:{}".format(video_id))
+
+                    if "video_cover" not in feeds[i]:
+                        video_cover = "0"
+                        Common.crawler_log().info("video_cover不存在")
+                    else:
+                        video_cover = feeds[i]["video_cover"]
+                        Common.crawler_log().info("video_cover:{}".format(video_cover))
+
+                    if "video_url" not in feeds[i]:
+                        video_url = "0"
+                        Common.crawler_log().info("video_url:不存在")
+                    else:
+                        video_url = feeds[i]["video_url"]
+                        Common.crawler_log().info("video_url:{}".format(video_url))
+
+                    if "width" not in feeds[i] or "height" not in feeds[i]:
+                        video_width = "0"
+                        video_height = "0"
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("无分辨率")
+                    else:
+                        video_width = feeds[i]["width"]
+                        video_height = feeds[i]["height"]
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("video_resolution:{}".format(video_resolution))
+
+                    if "commentCount" not in feeds[i]:
+                        video_comment_cnt = "0"
+                        Common.crawler_log().info("video_comment_cnt:0")
+                    else:
+                        video_comment_cnt = feeds[i]["commentCount"]
+                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
+
+                    if "update_time" not in feeds[i]:
+                        video_send_time = "0"
+                        Common.crawler_log().info("video_send_time:不存在")
+                    else:
+                        video_send_time = feeds[i]["update_time"]
+                        Common.crawler_log().info("video_send_time:{}".format(
+                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))))
+
+                    # Strip hashtags and special characters from the video title
+                    if "title" not in feeds[i]:
+                        video_title = "0"
+                        Common.crawler_log().info("video_title不存在")
+                    else:
+                        video_title = feeds[i]["title"].strip().replace("\n", "")\
+                            .replace("/", "").replace("本山祝福", "").replace(" ", "")\
+                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
+                            .replace("#", "").replace(".", "。").replace("\\", "")\
+                            .replace(":", "").replace("*", "").replace("?", "")\
+                            .replace("?", "").replace('"', "").replace("<", "")\
+                            .replace(">", "").replace("|", "")
+                        Common.crawler_log().info("video_title:{}".format(video_title))
+
+                    video_like_cnt = "10000"
+                    video_share_cnt = "10000"
+                    video_duration = "10000"
+                    video_play_cnt = "10000"
+                    user_name = "bszf"
+                    head_url = video_cover
+                    user_id = "10000"
+
+                    # Deduplicate against benshanzhufu_videoid.txt
+                    video_ids = Common.read_txt("benshanzhufu_videoid.txt")
+                    if video_id in [p_id.strip() for p_id in video_ids]:
+                        Common.crawler_log().info("该视频已下载:{}".format(video_title))
+                    else:
+                        Common.crawler_log().info("该视频未下载:{}".format(video_title))
+
+                        # Deduplicate against benshanzhufu_feeds.txt
+                        contents = Common.read_txt("benshanzhufu_feeds.txt")
+                        # When benshanzhufu_feeds.txt is empty, save the record directly
+                        if len(contents) == 0 and video_id != "0" and video_url != "0" and video_title != "0":
+                            basic_time = int(time.time())
+                            Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
+                            with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
+                                f_a.write(str(basic_time) + " + " +
+                                          str(video_id) + " + " +
+                                          str(video_play_cnt) + " + " +
+                                          str(video_title) + " + " +
+                                          str(video_duration) + " + " +
+                                          str(video_comment_cnt) + " + " +
+                                          str(video_like_cnt) + " + " +
+                                          str(video_share_cnt) + " + " +
+                                          str(video_resolution) + " + " +
+                                          str(video_send_time) + " + " +
+                                          str(user_name) + " + " +
+                                          str(head_url) + " + " +
+                                          str(video_cover) + " + " +
+                                          str(video_url) + " + " +
+                                          str(user_id) + " + " +
+                                          str("wx0fb8149da961d3b0") + "\n")
+                        else:
+                            if video_id in [content.split(" + ")[1] for content in contents]:
+                                Common.crawler_log().info("该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
+                            elif video_id == "0" or video_url == "0" or video_title == "0":
+                                Common.crawler_log().info("视频不存在")
+                            else:
+                                basic_time = int(time.time())
+                                Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
+                                with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
+                                    f_a.write(str(basic_time) + " + " +
+                                              str(video_id) + " + " +
+                                              str(video_play_cnt) + " + " +
+                                              str(video_title) + " + " +
+                                              str(video_duration) + " + " +
+                                              str(video_comment_cnt) + " + " +
+                                              str(video_like_cnt) + " + " +
+                                              str(video_share_cnt) + " + " +
+                                              str(video_resolution) + " + " +
+                                              str(video_send_time) + " + " +
+                                              str(user_name) + " + " +
+                                              str(head_url) + " + " +
+                                              str(video_cover) + " + " +
+                                              str(video_url) + " + " +
+                                              str(user_id) + " + " +
+                                              str("wx0fb8149da961d3b0") + "\n")
+        except Exception as e:
+            Common.crawler_log().error("获取视频 list 异常:{}".format(e))
+
+    @classmethod
+    def download_video(cls, env):
+        """
+        Download the videos listed in benshanzhufu_feeds.txt
+        Test environment: env == "dev"
+        Production environment: env == "prod"
+        """
+        videos = Common.read_txt("benshanzhufu_feeds.txt")
+        for video in videos:
+            download_video_id = video.strip().split(" + ")[1]
+            try:
+                download_video_title = video.strip().split(" + ")[3]
+                download_video_duration = video.strip().split(" + ")[4]
+                download_video_play_cnt = video.strip().split(" + ")[2]
+                download_video_comment_cnt = video.strip().split(" + ")[5]
+                download_video_like_cnt = video.strip().split(" + ")[6]
+                download_video_share_cnt = video.strip().split(" + ")[7]
+                download_video_resolution = video.strip().split(" + ")[8]
+                download_video_send_time = video.strip().split(" + ")[9]
+                download_user_name = video.strip().split(" + ")[10]
+                download_head_url = video.strip().split(" + ")[11]
+                download_cover_url = video.strip().split(" + ")[12]
+                download_video_url = video.strip().split(" + ")[13]
+                download_video_session = video.strip().split(" + ")[-1]
+
+                Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
+                # Download the cover
+                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
+                # Download the video
+                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
+                # Save the video info to "./videos/{download_video_title}/info.txt"
+                with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
+                    f_a.write(str(download_video_id) + "\n" +
+                              str(download_video_title) + "\n" +
+                              str(download_video_duration) + "\n" +
+                              str(download_video_play_cnt) + "\n" +
+                              str(download_video_comment_cnt) + "\n" +
+                              str(download_video_like_cnt) + "\n" +
+                              str(download_video_share_cnt) + "\n" +
+                              str(download_video_resolution) + "\n" +
+                              str(download_video_send_time) + "\n" +
+                              str(download_user_name) + "\n" +
+                              str(download_head_url) + "\n" +
+                              str(download_video_url) + "\n" +
+                              str(download_cover_url) + "\n" +
+                              str(download_video_session))
+
+                # Upload the video
+                if env == "dev":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("dev", "play")
+                elif env == "prod":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("prod", "play")
+
+                # Record the video id in benshanzhufu_videoid.txt
+                with open(r"./txt/benshanzhufu_videoid.txt", "a", encoding="UTF-8") as fa:
+                    fa.write(download_video_id + "\n")
+
+                # Remove this video's record from benshanzhufu_feeds.txt
+                Common.crawler_log().info("删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
+                with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
+                    lines = f_r.readlines()
+                with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+            except Exception as e:
+                # On error, remove this video's record from benshanzhufu_feeds.txt
+                Common.crawler_log().error("该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
+                with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
+                    lines = f_r.readlines()
+                with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+
+
+if __name__ == "__main__":
+    bszf = BSZF()
+    bszf.get_recommend()
+    bszf.download_video("dev")
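
Each line that get_recommend() appends to ./txt/benshanzhufu_feeds.txt is one record of 16 fields joined by " + ", and download_video() reads the fields back by position. A small parsing sketch; the field names are descriptive labels inferred from the write order above, not names used in the code:

    FEED_FIELDS = ["basic_time", "video_id", "play_cnt", "title", "duration",
                   "comment_cnt", "like_cnt", "share_cnt", "resolution", "send_time",
                   "user_name", "head_url", "cover_url", "video_url", "user_id", "app_id"]

    def parse_feed_line(line):
        """Split one benshanzhufu_feeds.txt record into a dict keyed by field name."""
        return dict(zip(FEED_FIELDS, line.strip().split(" + ")))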

+ 249 - 0
main/publish.py

@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Upload videos to Alibaba Cloud OSS
+Publish videos to the admin backend
+"""
+import json
+import os
+import random
+import time
+
+import oss2
+import requests
+import urllib3
+from main.common import Common
+
+proxies = {"http": None, "https": None}
+
+
+class Publish:
+    @classmethod
+    def publish_video_dev(cls, request_data):
+        """
+        loginUid  in-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  external (source) video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (may be the current time)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  validity status of the video, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+
+    @classmethod
+    def publish_video_prod(cls, request_data):
+        """
+        loginUid  in-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  external (source) video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (may be the current time)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  validity status of the video, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+
+    @classmethod
+    def request_post(cls, request_url, request_data):
+        """
+        POST request to an HTTP API
+        :param request_url: API URL
+        :param request_data: request parameters
+        :return: res_data parsed from the JSON response
+        """
+        urllib3.disable_warnings()
+        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
+
+    # The code below demonstrates basic file upload, download, listing and deletion usage.
+
+    # First initialize AccessKeyId, AccessKeySecret, Endpoint and so on.
+    # Get them from environment variables, or replace placeholders such as "<your AccessKeyId>" with real values.
+    #
+    # Taking the Hangzhou region as an example, the Endpoint can be:
+    #   http://oss-cn-hangzhou.aliyuncs.com
+    #   https://oss-cn-hangzhou.aliyuncs.com
+    # accessed over HTTP and HTTPS respectively.
+    access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
+    access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
+    bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
+    # endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
+    endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
+
+    # Make sure the parameters above are filled in correctly
+    for param in (access_key_id, access_key_secret, bucket_name, endpoint):
+        assert '<' not in param, '请设置参数:' + param
+
+    # Create the Bucket object; all Object-related operations go through it
+    bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
+
+    """
+    Processing flow:
+    1. On a schedule (once a day at 1 a.m.) loop over the contents of the files directory; structure: files -> video folder -> video file + cover image + basic info
+    2. Upload the video file and cover to OSS
+    - video OSS path   longvideo/crawler_local/video/prod/<file name>
+    - cover OSS path   longvideo/crawler_local/image/prod/<file name>
+    3. Publish the video
+    - read the basic info and call the publish API
+    """
+    # env / date (e.g. 20220225) / file name
+    oss_file_path_video = r'longvideo/crawler_local/video/{}/{}/{}'
+    oss_file_path_image = r'longvideo/crawler_local/image/{}/{}/{}'
+
+    @classmethod
+    def put_file(cls, oss_file, local_file):
+        # Common.crawler_log().info("put oss file = {}, local file = {}".format(oss_file, local_file))
+        cls.bucket.put_object_from_file(oss_file, local_file)
+        Common.crawler_log().info("put oss file = {}, local file = {} success".format(oss_file, local_file))
+
+    # Remove a local file
+    @classmethod
+    def remove_local_file(cls, local_file):
+        # Common.crawler_log().info("remove local file = {}".format(local_file))
+        os.remove(local_file)
+        Common.crawler_log().info("remove local file = {} success".format(local_file))
+
+    # Remove a local directory
+    @classmethod
+    def remove_local_file_dir(cls, local_file):
+        # Common.crawler_log().info("remove local file dir = {}".format(local_file))
+        os.rmdir(local_file)
+        Common.crawler_log().info("remove local file dir = {} success".format(local_file))
+
+    local_file_path = '.\\videos'
+    video_file = 'video'
+    image_file = 'image'
+    info_file = 'info'
+    uids_dev_up = [6267140]
+    uids_dev_play = [6267141]
+    uids_prod_up = [20631241, 20631242]
+    uids_prod_play = [20631241, 20631242]
+
+    @classmethod
+    def upload_and_publish(cls, env, job):
+        """
+        Upload videos to OSS and publish them
+        :param env: test environment: "dev"; production: "prod"
+        :param job: rising list: "up"; play count: "play"
+        """
+        Common.crawler_log().info("upload_and_publish starting...")
+        today = time.strftime("%Y%m%d", time.localtime())
+        # all video folders under the videos directory
+        files = os.listdir(cls.local_file_path)
+        for f in files:
+            try:
+                # a single video folder
+                fi_d = os.path.join(cls.local_file_path, f)
+                # make sure it is a directory
+                if os.path.isdir(fi_d):
+                    Common.crawler_log().info('dir = {}'.format(fi_d))
+                    # list all files in this video folder
+                    dir_files = os.listdir(fi_d)
+                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
+                    now_timestamp = int(round(time.time() * 1000))
+                    data['crawlerTaskTimestamp'] = str(now_timestamp)
+                    global uid
+                    if env == "dev" and job == "up":
+                        uid = str(random.choice(cls.uids_dev_up))
+                    elif env == "dev" and job == "play":
+                        uid = str(random.choice(cls.uids_dev_play))
+                    elif env == "prod" and job == "up":
+                        uid = str(random.choice(cls.uids_prod_up))
+                    elif env == "prod" and job == "play":
+                        uid = str(random.choice(cls.uids_prod_play))
+                    data['loginUid'] = uid
+                    # all files under this single video folder
+                    for fi in dir_files:
+                        # full path of the file inside the video folder
+                        fi_path = fi_d + '\\' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        # read info.txt and copy its fields into data
+                        if cls.info_file in fi:
+                            f = open(fi_path, "r", encoding="UTF-8")
+                            # read the data; correctness is guaranteed when it is written, so no extra validation here
+                            for i in range(14):
+                                line = f.readline()
+                                line = line.replace('\n', '')
+                                if line is not None and len(line) != 0 and not line.isspace():
+                                    Common.crawler_log().info("line = {}".format(line))
+                                    if i == 0:
+                                        data['crawlerSrcId'] = line
+                                    elif i == 1:
+                                        data['title'] = line
+                                    elif i == 2:
+                                        data['totalTime'] = line
+                                    elif i == 8:
+                                        data['crawlerSrcPublishTimestamp'] = line
+                                else:
+                                    Common.crawler_log().warning("{} line is None".format(fi_path))
+                            f.close()
+                            # remove info.txt
+                            cls.remove_local_file(fi_path)
+                    # refresh the file list
+                    dir_files = os.listdir(fi_d)
+                    for fi in dir_files:
+                        fi_path = fi_d + '\\' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        # upload to OSS
+                        if cls.video_file in fi:
+                            global oss_video_file
+                            if env == "dev":
+                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_video_file = {}".format(oss_video_file))
+                            cls.put_file(oss_video_file, fi_path)
+                            data['videoPath'] = oss_video_file
+                            Common.crawler_log().info("videoPath = {}".format(oss_video_file))
+                        elif cls.image_file in fi:
+                            global oss_image_file
+                            if env == "dev":
+                                oss_image_file = cls.oss_file_path_image.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_image_file = {}".format(oss_image_file))
+                            cls.put_file(oss_image_file, fi_path)
+                            data['coverImgPath'] = oss_image_file
+                            Common.crawler_log().info("coverImgPath = {}".format(oss_image_file))
+                        # remove all local files
+                        cls.remove_local_file(fi_path)
+
+                    # publish
+                    if env == "dev":
+                        cls.publish_video_dev(data)
+                    elif env == "prod":
+                        cls.publish_video_prod(data)
+                    cls.remove_local_file_dir(fi_d)
+
+                else:
+                    Common.crawler_log().error('file not a dir = {}'.format(fi_d))
+            except Exception as e:
+                Common.crawler_log().exception('upload_and_publish error: {}'.format(e))
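
For reference, the OSS object keys produced by oss_file_path_video / oss_file_path_image follow the pattern env / date / source video id. A short sketch with made-up values (the id "123456" is illustrative only):

    import time

    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'

    today = time.strftime("%Y%m%d", time.localtime())
    video_key = oss_file_path_video.format("prod", today, "123456")
    image_key = oss_file_path_image.format("prod", today, "123456")
    # e.g. longvideo/crawler_local/video/prod/20220425/123456
    # and  longvideo/crawler_local/image/prod/20220425/123456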

+ 65 - 0
main/run.py

@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+import datetime
+import os
+import random
+import sys
+import time
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.download import BSZF
+
+
+class Main:
+    @classmethod
+    def download_job_dev(cls):
+        """
+        Test-environment job
+        """
+        Common.crawler_log().info("开始抓取本山祝福视频\n")
+        BSZF.get_recommend()
+        BSZF.download_video("dev")
+
+        # remove redundant logs
+        Common.del_logs()
+        # count cumulative downloads
+        Common.benshanzhufu_download_count()
+
+    @classmethod
+    def download_job_prod(cls):
+        """
+        Production job
+        """
+        while True:
+            prod_time = datetime.datetime.now()
+            if prod_time.hour < 9 or prod_time.hour > 12:
+                Common.crawler_log().info("结束抓取视频\n")
+                time.sleep(3)
+                break
+            else:
+                BSZF.get_recommend()
+                BSZF.download_video("prod")
+                time.sleep(random.randint(1, 3))
+
+        # remove redundant logs
+        Common.del_logs()
+        # count cumulative downloads
+        Common.benshanzhufu_download_count()
+
+    @classmethod
+    def main(cls):
+        while True:
+            while True:
+                main_time = datetime.datetime.now()
+                if 12 >= main_time.hour >= 9:
+                    Common.crawler_log().info("开始抓取本山祝福视频\n")
+                    cls.download_job_prod()
+                else:
+                    time.sleep(600)
+                    break
+
+
+if __name__ == "__main__":
+    main = Main()
+    main.main()
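
The production flow only crawls while the hour is between 9 and 12: main() enters download_job_prod() inside that window, and download_job_prod() breaks out of its loop as soon as the hour leaves it. A minimal sketch of the same window check:

    import datetime

    def in_crawl_window(now=None):
        """True while the crawler is allowed to run (hour 9 through 12, i.e. 09:00-12:59)."""
        now = now or datetime.datetime.now()
        return 9 <= now.hour <= 12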

+ 3 - 0
txt/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 0 - 0
txt/benshanzhufu_feeds.txt


+ 0 - 0
txt/benshanzhufu_videoid.txt


+ 3 - 0
videos/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 4 - 0
抓取规则.txt

@@ -0,0 +1,4 @@
+==========2022/4/27===========
+- Crawl everything
+- Crawl every day between 9:00 and 12:00
+==============================