wangkun 3 years ago
parent
commit
18ecd5de30

+ 1 - 0
README.md

@@ -0,0 +1 @@
+Benshanzhufu (本山祝福) WeChat mini-program crawler

+ 3 - 0
logs/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 3 - 0
main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 139 - 0
main/common.py

@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Shared helpers: create logger / delete logs / download helpers / read txt files / count downloads
+"""
+from datetime import date, timedelta
+import datetime
+import logging
+import os
+import time
+import requests
+import urllib3
+
+proxies = {"http": None, "https": None}
+
+
+class Common:
+    # Current time, <class 'datetime.datetime'>, e.g. 2022-04-14 20:13:51.244472
+    now = datetime.datetime.now()
+    # Yesterday, <class 'str'>, e.g. 2022-04-13
+    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
+    # Today, <class 'datetime.date'>, e.g. 2022-04-14
+    today = date.today()
+    # Tomorrow, <class 'str'>, e.g. 2022-04-15
+    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
+
+    @staticmethod
+    def crawler_log():
+        """
+        Create and return the crawler logger
+        """
+        # Log directory
+        log_dir = r"./logs/"
+        log_path = os.getcwd() + os.sep + log_dir
+        if not os.path.isdir(log_path):
+            os.makedirs(log_path)
+
+        # Log format and file name
+        log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        date_format = "%Y-%m-%d %p %H:%M:%S"
+        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '.log'
+
+        # Initialize logging
+        logging.basicConfig(filename=log_path + log_name, level=logging.INFO, format=log_format, datefmt=date_format)
+        crawler_logger = logging.getLogger("crawler-log")
+
+        return crawler_logger
+
+    @classmethod
+    def del_logs(cls):
+        """
+        Remove redundant log files
+        :return: keeps only the 7 most recent logs
+        """
+        log_dir = r"./logs/"
+        all_files = sorted(os.listdir(log_dir))
+        all_logs = []
+        for log in all_files:
+            name = os.path.splitext(log)[-1]
+            if name == ".log":
+                all_logs.append(log)
+
+        if len(all_logs) > 7:
+            for file in all_logs[:len(all_logs) - 7]:
+                os.remove(log_dir + file)
+        cls.crawler_log().info("清除冗余日志成功")
+
+    @classmethod
+    def download_method(cls, text, d_name, d_url):
+        """
+        下载封面:text == "cover" ; 下载视频:text == "video"
+        需要下载的视频标题:d_title
+        视频封面,或视频播放地址:d_url
+        下载保存路径:"./files/{d_title}/"
+        """
+        # 首先创建一个保存该视频相关信息的文件夹
+        video_dir = "./videos/" + d_name + "/"
+        if not os.path.exists(video_dir):
+            os.mkdir(video_dir)
+
+        # Download the video
+        if text == "video":
+            # URL of the video to download
+            video_url = d_url
+            # local file name of the video
+            video_name = "video.mp4"
+
+            # download the video
+            urllib3.disable_warnings()
+            response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
+            try:
+                with open(video_dir + video_name, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=10240):
+                        f.write(chunk)
+                cls.crawler_log().info("==========视频下载完成==========")
+            except Exception as e:
+                cls.crawler_log().error("视频下载失败:{}".format(e))
+
+        # Download the cover
+        elif text == "cover":
+            # URL of the cover to download
+            cover_url = d_url
+            # local file name of the cover
+            cover_name = "image.jpg"
+
+            # download the cover
+            urllib3.disable_warnings()
+            response = requests.get(cover_url, proxies=proxies, verify=False)
+            try:
+                with open(video_dir + cover_name, "wb") as f:
+                    f.write(response.content)
+                cls.crawler_log().info("==========封面下载完成==========")
+            except Exception as e:
+                cls.crawler_log().error("封面下载失败:{}".format(e))
+
+    @staticmethod
+    def read_txt(t_name):
+        """
+        Read a txt file from ./txt/
+        :param t_name: file name
+        :return: file contents as a list of lines
+        """
+        with open(r"./txt/" + t_name, "r", encoding="UTF-8") as f:
+            return f.readlines()
+
+    @classmethod
+    def benshanzhufu_download_count(cls):
+        videoid_path = r"./txt/benshanzhufu_videoid.txt"
+        with open(videoid_path, "rb") as f:
+            count = len(f.readlines())
+        cls.crawler_log().info('累计下载视频数: {}\n'.format(count))
+
+
+if __name__ == "__main__":
+    common = Common()
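
A minimal usage sketch of the Common helpers added above. It assumes the script is run from the repository root (paths such as ./logs/ and ./txt/ are relative to the working directory); the log text is illustrative only:

    from main.common import Common

    logger = Common.crawler_log()          # writes to ./logs/<YYYY-MM-DD>.log
    logger.info("crawler started")
    Common.del_logs()                      # keep only the 7 most recent .log files
    known_ids = Common.read_txt("benshanzhufu_videoid.txt")
    logger.info("known video ids: {}".format(len(known_ids)))
    Common.benshanzhufu_download_count()   # logs the cumulative download count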

+ 91 - 0
main/demo.py

@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+from urllib import parse
+import json
+import random
+import time
+import urllib3
+import requests
+
+
+class Demo:
+    num = 1
+
+    @classmethod
+    def get_page_num(cls):
+        cls.num += 1
+        return cls.num
+
+    @classmethod
+    def get_feeds(cls):
+        now = int(time.time() * 1000)
+        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
+        header = {
+            "Connection": "keep-alive",
+            "vision": "1.1.0",
+            "content-type": "application/x-www-form-urlencoded",
+            "scene": "1008",
+            "content-time": str(now),
+            "token": "",
+            "visitorKey": "165086930003741",
+            "chatKey": "wx0fb8149da961d3b0",
+            "cache-time": str(now),
+            "Accept-Encoding": "gzip,compress,br,deflate",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
+                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
+            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
+        }
+        parameter = {
+                    "cid": "",
+                    "page": random.randint(1, 75),
+                    "is_ads": 1,
+                    "model": "iPhone 11<iPhone12,1>",
+                    "mini_version": "8.0.20",
+                    "origin_channel": "3",
+                    "origin_type": "2",
+                    "origin_level": "0",
+                    "ini_id": "165086930003741"
+        }
+
+        params = parse.quote(json.dumps(parameter))
+        url = url + str(params)
+
+        urllib3.disable_warnings()
+        r = requests.get(headers=header, url=url, verify=False)
+        response = json.loads(r.content.decode("utf8"))
+        data = response["data"]["list"]
+
+        for k, v in parameter.items():
+            print(f"{k}:{v}")
+
+        print("\n")
+
+        for video_list in data:
+            print(video_list)
+
+    @classmethod
+    def encode_params(cls):
+        data = {
+                "cid": "",
+                "page": 8,
+                "is_ads": 1,
+                "model": "iPhone 11<iPhone12,1>",
+                "mini_version": "8.0.20",
+                "origin_channel": "3",
+                "origin_type": "2",
+                "origin_level": "3",
+                "ini_id": "165086930003741"
+        }
+        text = parse.urlencode(data)
+        print(text)
+
+        data = str(data)
+        text1 = parse.quote(data)
+        print(text1)
+
+
+if __name__ == "__main__":
+    demo = Demo()
+    demo.get_feeds()
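
The request URL in get_feeds() is built by JSON-encoding the parameter dict and percent-escaping it with urllib.parse.quote before appending it to the query string. A standalone sketch of that encoding step (reduced parameter set, illustrative values):

    from urllib import parse
    import json

    base_url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
    parameter = {"cid": "", "page": 1, "is_ads": 1}
    request_url = base_url + parse.quote(json.dumps(parameter))
    print(request_url)  # the JSON payload ends up percent-encoded in the query string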

+ 285 - 0
main/download.py

@@ -0,0 +1,285 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Download videos that match the crawl rules from the 本山祝福 (Benshanzhufu) WeChat mini-program
+"""
+import json
+import os
+import random
+import sys
+import time
+from urllib import parse
+import requests
+import urllib3
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.publish import Publish
+
+proxies = {"http": None, "https": None}
+
+
+class BSZF:
+
+    @classmethod
+    def get_recommend(cls):
+        """
+        Fetch the recommended video list from the Benshanzhufu mini-program home page:
+            1. Deduplicate against benshanzhufu_videoid.txt
+            2. Deduplicate against benshanzhufu_feeds.txt
+            3. Append the video info to benshanzhufu_feeds.txt
+        """
+        now = int(time.time() * 1000)
+        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
+        header = {
+            "Connection": "keep-alive",
+            "vision": "1.1.0",
+            "content-type": "application/x-www-form-urlencoded",
+            "scene": "1008",
+            "content-time": str(now),
+            "token": "",
+            "visitorKey": "165086930003741",
+            "chatKey": "wx0fb8149da961d3b0",
+            "cache-time": str(now),
+            "Accept-Encoding": "gzip,compress,br,deflate",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
+                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
+            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
+        }
+        parameter = {
+            "cid": "",
+            "page": random.randint(1, 75),
+            "is_ads": 1,
+            "model": "iPhone 11<iPhone12,1>",
+            "mini_version": "8.0.20",
+            "origin_channel": "3",
+            "origin_type": "2",
+            "origin_level": "0",
+            "ini_id": "165086930003741"
+        }
+
+        params = parse.quote(json.dumps(parameter))
+        url = url + str(params)
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(headers=header, url=url, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            if "data" not in response:
+                Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
+                time.sleep(3)
+            else:
+                feeds = response["data"]["list"]
+                for i in range(len(feeds)):
+                    if "nid" not in feeds[i]:
+                        video_id = "0"
+                        Common.crawler_log().info("video_id:{}".format(video_id))
+                    else:
+                        video_id = feeds[i]["nid"]
+                        Common.crawler_log().info("video_id:{}".format(video_id))
+
+                    if "video_cover" not in feeds[i]:
+                        video_cover = "0"
+                        Common.crawler_log().info("video_cover不存在")
+                    else:
+                        video_cover = feeds[i]["video_cover"]
+                        Common.crawler_log().info("video_cover:{}".format(video_cover))
+
+                    if "video_url" not in feeds[i]:
+                        video_url = "0"
+                        Common.crawler_log().info("video_url:不存在")
+                    else:
+                        video_url = feeds[i]["video_url"]
+                        Common.crawler_log().info("video_url:{}".format(video_url))
+
+                    if "width" not in feeds[i] or "height" not in feeds[i]:
+                        video_width = "0"
+                        video_height = "0"
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("无分辨率")
+                    else:
+                        video_width = feeds[i]["width"]
+                        video_height = feeds[i]["height"]
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("video_resolution:{}".format(video_resolution))
+
+                    if "commentCount" not in feeds[i]:
+                        video_comment_cnt = "0"
+                        Common.crawler_log().info("video_comment_cnt:0")
+                    else:
+                        video_comment_cnt = feeds[i]["commentCount"]
+                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
+
+                    if "update_time" not in feeds[i]:
+                        video_send_time = "0"
+                        Common.crawler_log().info("video_send_time:不存在")
+                    else:
+                        video_send_time = feeds[i]["update_time"]
+                        Common.crawler_log().info("video_send_time:{}".format(
+                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))))
+
+                    # Strip hashtags and special characters from the video title
+                    if "title" not in feeds[i]:
+                        video_title = "0"
+                        Common.crawler_log().info("video_title不存在")
+                    else:
+                        video_title = feeds[i]["title"].strip().replace("\n", "")\
+                            .replace("/", "").replace("本山祝福", "").replace(" ", "")\
+                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
+                            .replace("#", "").replace(".", "。").replace("\\", "")\
+                            .replace(":", "").replace("*", "").replace("?", "")\
+                            .replace("?", "").replace('"', "").replace("<", "")\
+                            .replace(">", "").replace("|", "")
+                        Common.crawler_log().info("video_title:{}".format(video_title))
+
+                    video_like_cnt = "10000"
+                    video_share_cnt = "10000"
+                    video_duration = "10000"
+                    video_play_cnt = "10000"
+                    user_name = "bszf"
+                    head_url = video_cover
+                    user_id = "10000"
+
+                    # Deduplicate against benshanzhufu_videoid.txt
+                    video_ids = Common.read_txt("benshanzhufu_videoid.txt")
+                    if video_id in [p_id.strip() for p_id in video_ids]:
+                        Common.crawler_log().info("该视频已下载:{}".format(video_title))
+                    else:
+                        Common.crawler_log().info("该视频未下载:{}".format(video_title))
+
+                        # Deduplicate against benshanzhufu_feeds.txt
+                        contents = Common.read_txt("benshanzhufu_feeds.txt")
+                        # When benshanzhufu_feeds.txt is empty, save the record directly
+                        if len(contents) == 0 and video_id != "0" and video_url != "0" and video_title != "0":
+                            basic_time = int(time.time())
+                            Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
+                            with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
+                                f_a.write(str(basic_time) + " + " +
+                                          str(video_id) + " + " +
+                                          str(video_play_cnt) + " + " +
+                                          str(video_title) + " + " +
+                                          str(video_duration) + " + " +
+                                          str(video_comment_cnt) + " + " +
+                                          str(video_like_cnt) + " + " +
+                                          str(video_share_cnt) + " + " +
+                                          str(video_resolution) + " + " +
+                                          str(video_send_time) + " + " +
+                                          str(user_name) + " + " +
+                                          str(head_url) + " + " +
+                                          str(video_cover) + " + " +
+                                          str(video_url) + " + " +
+                                          str(user_id) + " + " +
+                                          str("wx0fb8149da961d3b0") + "\n")
+                        else:
+                            if video_id in [content.split(" + ")[1] for content in contents]:
+                                Common.crawler_log().info("该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
+                            elif video_id == "0" or video_url == "0" or video_title == "0":
+                                Common.crawler_log().info("视频不存在")
+                            else:
+                                basic_time = int(time.time())
+                                Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
+                                with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
+                                    f_a.write(str(basic_time) + " + " +
+                                              str(video_id) + " + " +
+                                              str(video_play_cnt) + " + " +
+                                              str(video_title) + " + " +
+                                              str(video_duration) + " + " +
+                                              str(video_comment_cnt) + " + " +
+                                              str(video_like_cnt) + " + " +
+                                              str(video_share_cnt) + " + " +
+                                              str(video_resolution) + " + " +
+                                              str(video_send_time) + " + " +
+                                              str(user_name) + " + " +
+                                              str(head_url) + " + " +
+                                              str(video_cover) + " + " +
+                                              str(video_url) + " + " +
+                                              str(user_id) + " + " +
+                                              str("wx0fb8149da961d3b0") + "\n")
+        except Exception as e:
+            Common.crawler_log().error("获取视频 list 异常:{}".format(e))
+
+    @classmethod
+    def download_video(cls, env):
+        """
+        Download the videos listed in benshanzhufu_feeds.txt
+        Test environment: env == "dev"
+        Production environment: env == "prod"
+        """
+        videos = Common.read_txt("benshanzhufu_feeds.txt")
+        for video in videos:
+            download_video_id = video.strip().split(" + ")[1]
+            try:
+                download_video_title = video.strip().split(" + ")[3]
+                download_video_duration = video.strip().split(" + ")[4]
+                download_video_play_cnt = video.strip().split(" + ")[2]
+                download_video_comment_cnt = video.strip().split(" + ")[5]
+                download_video_like_cnt = video.strip().split(" + ")[6]
+                download_video_share_cnt = video.strip().split(" + ")[7]
+                download_video_resolution = video.strip().split(" + ")[8]
+                download_video_send_time = video.strip().split(" + ")[9]
+                download_user_name = video.strip().split(" + ")[10]
+                download_head_url = video.strip().split(" + ")[11]
+                download_cover_url = video.strip().split(" + ")[12]
+                download_video_url = video.strip().split(" + ")[13]
+                download_video_session = video.strip().split(" + ")[-1]
+
+                Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
+                # Download the cover
+                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
+                # Download the video
+                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
+                # Save the video info to "./videos/{download_video_title}/info.txt"
+                with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
+                    f_a.write(str(download_video_id) + "\n" +
+                              str(download_video_title) + "\n" +
+                              str(download_video_duration) + "\n" +
+                              str(download_video_play_cnt) + "\n" +
+                              str(download_video_comment_cnt) + "\n" +
+                              str(download_video_like_cnt) + "\n" +
+                              str(download_video_share_cnt) + "\n" +
+                              str(download_video_resolution) + "\n" +
+                              str(download_video_send_time) + "\n" +
+                              str(download_user_name) + "\n" +
+                              str(download_head_url) + "\n" +
+                              str(download_video_url) + "\n" +
+                              str(download_cover_url) + "\n" +
+                              str(download_video_session))
+
+                # Upload the video
+                if env == "dev":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("dev", "play")
+                elif env == "prod":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("prod", "play")
+
+                # Record the video id in benshanzhufu_videoid.txt
+                with open(r"./txt/benshanzhufu_videoid.txt", "a", encoding="UTF-8") as fa:
+                    fa.write(download_video_id + "\n")
+
+                # Remove this video's record from benshanzhufu_feeds.txt
+                Common.crawler_log().info("删除该视频在benshanzhufu_feeds.txt中的信息:{}".format(download_video_title))
+                with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
+                    lines = f_r.readlines()
+                with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+            except Exception as e:
+                # On error, remove this video's record from benshanzhufu_feeds.txt
+                Common.crawler_log().error("该视频信息异常,删除在benshanzhufu_feeds.txt中的信息:{}".format(e))
+                with open(r"./txt/benshanzhufu_feeds.txt", "r", encoding="UTF-8") as f_r:
+                    lines = f_r.readlines()
+                with open(r"./txt/benshanzhufu_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+
+
+if __name__ == "__main__":
+    bszf = BSZF()
+    bszf.get_recommend()
+    bszf.download_video("dev")
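
Each line that get_recommend() appends to ./txt/benshanzhufu_feeds.txt is one record of 16 fields joined by " + ", and download_video() reads the fields back by position. A small parsing sketch; the field names are descriptive labels inferred from the write order above, not names used in the code:

    FEED_FIELDS = ["basic_time", "video_id", "play_cnt", "title", "duration",
                   "comment_cnt", "like_cnt", "share_cnt", "resolution", "send_time",
                   "user_name", "head_url", "cover_url", "video_url", "user_id", "app_id"]

    def parse_feed_line(line):
        """Split one benshanzhufu_feeds.txt record into a dict keyed by field name."""
        return dict(zip(FEED_FIELDS, line.strip().split(" + ")))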

+ 249 - 0
main/publish.py

@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+Upload videos to Alibaba Cloud OSS
+Publish videos to the admin backend
+"""
+import json
+import os
+import random
+import time
+
+import oss2
+import requests
+import urllib3
+from main.common import Common
+
+proxies = {"http": None, "https": None}
+
+
+class Publish:
+    @classmethod
+    def publish_video_dev(cls, request_data):
+        """
+        loginUid  in-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  external (source) video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (may be the current time)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  validity status of the video, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+
+    @classmethod
+    def publish_video_prod(cls, request_data):
+        """
+        loginUid  in-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  external (source) video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (may be the current time)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  validity status of the video, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+
+    @classmethod
+    def request_post(cls, request_url, request_data):
+        """
+        POST request to an HTTP API
+        :param request_url: API URL
+        :param request_data: request parameters
+        :return: res_data parsed from the JSON response
+        """
+        urllib3.disable_warnings()
+        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
+
+    # The code below demonstrates basic file upload, download, listing and deletion usage.
+
+    # First initialize AccessKeyId, AccessKeySecret, Endpoint and so on.
+    # Get them from environment variables, or replace placeholders such as "<your AccessKeyId>" with real values.
+    #
+    # Taking the Hangzhou region as an example, the Endpoint can be:
+    #   http://oss-cn-hangzhou.aliyuncs.com
+    #   https://oss-cn-hangzhou.aliyuncs.com
+    # accessed over HTTP and HTTPS respectively.
+    access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
+    access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
+    bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
+    # endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
+    endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
+
+    # Make sure the parameters above are filled in correctly
+    for param in (access_key_id, access_key_secret, bucket_name, endpoint):
+        assert '<' not in param, '请设置参数:' + param
+
+    # Create the Bucket object; all Object-related operations go through it
+    bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
+
+    """
+    Processing flow:
+    1. On a schedule (once a day at 1 a.m.) loop over the contents of the files directory; structure: files -> video folder -> video file + cover image + basic info
+    2. Upload the video file and cover to OSS
+    - video OSS path   longvideo/crawler_local/video/prod/<file name>
+    - cover OSS path   longvideo/crawler_local/image/prod/<file name>
+    3. Publish the video
+    - read the basic info and call the publish API
+    """
+    # env / date (e.g. 20220225) / file name
+    oss_file_path_video = r'longvideo/crawler_local/video/{}/{}/{}'
+    oss_file_path_image = r'longvideo/crawler_local/image/{}/{}/{}'
+
+    @classmethod
+    def put_file(cls, oss_file, local_file):
+        # Common.crawler_log().info("put oss file = {}, local file = {}".format(oss_file, local_file))
+        cls.bucket.put_object_from_file(oss_file, local_file)
+        Common.crawler_log().info("put oss file = {}, local file = {} success".format(oss_file, local_file))
+
+    # Remove a local file
+    @classmethod
+    def remove_local_file(cls, local_file):
+        # Common.crawler_log().info("remove local file = {}".format(local_file))
+        os.remove(local_file)
+        Common.crawler_log().info("remove local file = {} success".format(local_file))
+
+    # Remove a local directory
+    @classmethod
+    def remove_local_file_dir(cls, local_file):
+        # Common.crawler_log().info("remove local file dir = {}".format(local_file))
+        os.rmdir(local_file)
+        Common.crawler_log().info("remove local file dir = {} success".format(local_file))
+
+    local_file_path = '.\\videos'
+    video_file = 'video'
+    image_file = 'image'
+    info_file = 'info'
+    uids_dev_up = [6267140]
+    uids_dev_play = [6267141]
+    uids_prod_up = [20631241, 20631242]
+    uids_prod_play = [20631241, 20631242]
+
+    @classmethod
+    def upload_and_publish(cls, env, job):
+        """
+        Upload videos to OSS and publish them
+        :param env: test environment: "dev"; production: "prod"
+        :param job: rising list: "up"; play count: "play"
+        """
+        Common.crawler_log().info("upload_and_publish starting...")
+        today = time.strftime("%Y%m%d", time.localtime())
+        # all video folders under the videos directory
+        files = os.listdir(cls.local_file_path)
+        for f in files:
+            try:
+                # a single video folder
+                fi_d = os.path.join(cls.local_file_path, f)
+                # make sure it is a directory
+                if os.path.isdir(fi_d):
+                    Common.crawler_log().info('dir = {}'.format(fi_d))
+                    # list all files in this video folder
+                    dir_files = os.listdir(fi_d)
+                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
+                    now_timestamp = int(round(time.time() * 1000))
+                    data['crawlerTaskTimestamp'] = str(now_timestamp)
+                    global uid
+                    if env == "dev" and job == "up":
+                        uid = str(random.choice(cls.uids_dev_up))
+                    elif env == "dev" and job == "play":
+                        uid = str(random.choice(cls.uids_dev_play))
+                    elif env == "prod" and job == "up":
+                        uid = str(random.choice(cls.uids_prod_up))
+                    elif env == "prod" and job == "play":
+                        uid = str(random.choice(cls.uids_prod_play))
+                    data['loginUid'] = uid
+                    # all files under this single video folder
+                    for fi in dir_files:
+                        # full path of the file inside the video folder
+                        fi_path = fi_d + '\\' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        # read info.txt and copy its fields into data
+                        if cls.info_file in fi:
+                            f = open(fi_path, "r", encoding="UTF-8")
+                            # read the data; correctness is guaranteed when it is written, so no extra validation here
+                            for i in range(14):
+                                line = f.readline()
+                                line = line.replace('\n', '')
+                                if line is not None and len(line) != 0 and not line.isspace():
+                                    Common.crawler_log().info("line = {}".format(line))
+                                    if i == 0:
+                                        data['crawlerSrcId'] = line
+                                    elif i == 1:
+                                        data['title'] = line
+                                    elif i == 2:
+                                        data['totalTime'] = line
+                                    elif i == 8:
+                                        data['crawlerSrcPublishTimestamp'] = line
+                                else:
+                                    Common.crawler_log().warning("{} line is None".format(fi_path))
+                            f.close()
+                            # remove info.txt
+                            cls.remove_local_file(fi_path)
+                    # refresh the file list
+                    dir_files = os.listdir(fi_d)
+                    for fi in dir_files:
+                        fi_path = fi_d + '\\' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        # upload to OSS
+                        if cls.video_file in fi:
+                            global oss_video_file
+                            if env == "dev":
+                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_video_file = {}".format(oss_video_file))
+                            cls.put_file(oss_video_file, fi_path)
+                            data['videoPath'] = oss_video_file
+                            Common.crawler_log().info("videoPath = {}".format(oss_video_file))
+                        elif cls.image_file in fi:
+                            global oss_image_file
+                            if env == "dev":
+                                oss_image_file = cls.oss_file_path_image.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_image_file = {}".format(oss_image_file))
+                            cls.put_file(oss_image_file, fi_path)
+                            data['coverImgPath'] = oss_image_file
+                            Common.crawler_log().info("coverImgPath = {}".format(oss_image_file))
+                        # remove all local files
+                        cls.remove_local_file(fi_path)
+
+                    # publish
+                    if env == "dev":
+                        cls.publish_video_dev(data)
+                    elif env == "prod":
+                        cls.publish_video_prod(data)
+                    cls.remove_local_file_dir(fi_d)
+
+                else:
+                    Common.crawler_log().error('file not a dir = {}'.format(fi_d))
+            except Exception as e:
+                Common.crawler_log().exception('upload_and_publish error: {}'.format(e))
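
For reference, the OSS object keys produced by oss_file_path_video / oss_file_path_image follow the pattern env / date / source video id. A short sketch with made-up values (the id "123456" is illustrative only):

    import time

    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'

    today = time.strftime("%Y%m%d", time.localtime())
    video_key = oss_file_path_video.format("prod", today, "123456")
    image_key = oss_file_path_image.format("prod", today, "123456")
    # e.g. longvideo/crawler_local/video/prod/20220425/123456
    # and  longvideo/crawler_local/image/prod/20220425/123456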

+ 65 - 0
main/run.py

@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+import datetime
+import os
+import random
+import sys
+import time
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.download import BSZF
+
+
+class Main:
+    @classmethod
+    def download_job_dev(cls):
+        """
+        Test-environment job
+        """
+        Common.crawler_log().info("开始抓取本山祝福视频\n")
+        BSZF.get_recommend()
+        BSZF.download_video("dev")
+
+        # remove redundant logs
+        Common.del_logs()
+        # count cumulative downloads
+        Common.benshanzhufu_download_count()
+
+    @classmethod
+    def download_job_prod(cls):
+        """
+        Production job
+        """
+        while True:
+            prod_time = datetime.datetime.now()
+            if prod_time.hour < 9 or prod_time.hour > 12:
+                Common.crawler_log().info("结束抓取视频\n")
+                time.sleep(3)
+                break
+            else:
+                BSZF.get_recommend()
+                BSZF.download_video("prod")
+                time.sleep(random.randint(1, 3))
+
+        # remove redundant logs
+        Common.del_logs()
+        # count cumulative downloads
+        Common.benshanzhufu_download_count()
+
+    @classmethod
+    def main(cls):
+        while True:
+            while True:
+                main_time = datetime.datetime.now()
+                if 12 >= main_time.hour >= 9:
+                    Common.crawler_log().info("开始抓取本山祝福视频\n")
+                    cls.download_job_prod()
+                else:
+                    time.sleep(600)
+                    break
+
+
+if __name__ == "__main__":
+    main = Main()
+    main.main()
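
The production flow only crawls while the hour is between 9 and 12: main() enters download_job_prod() inside that window, and download_job_prod() breaks out of its loop as soon as the hour leaves it. A minimal sketch of the same window check:

    import datetime

    def in_crawl_window(now=None):
        """True while the crawler is allowed to run (hour 9 through 12, i.e. 09:00-12:59)."""
        now = now or datetime.datetime.now()
        return 9 <= now.hour <= 12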

+ 3 - 0
txt/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 0 - 0
txt/benshanzhufu_feeds.txt


+ 0 - 0
txt/benshanzhufu_videoid.txt


+ 3 - 0
videos/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25

+ 4 - 0
抓取规则.txt

@@ -0,0 +1,4 @@
+==========2022/4/27===========
+- Crawl everything
+- Crawl every day between 9:00 and 12:00
+==============================