
Implement requirements: 2022/7/21

wangkun · 2 years ago
parent commit 7a28077065

+ 64 - 1
README.md

@@ -1 +1,64 @@
-Benshanzhufu (本山祝福) WeChat mini-program crawler
+Benshanzhufu (本山祝福) WeChat mini-program crawler
+https://git.yishihui.com/Server/crawler_benshanzhufu.git
+
+ffmpeg==1.4
+loguru==0.6.0
+oss2==2.15.0
+requests==2.27.1
+urllib3==1.26.9
+python==3.10
+
+Entry point:
+
+1.cd ./crawler_benshanzhufu
+
+2.python3 main/run_recommend.py
+
+
+
+==========2022/7/20===========
+
+Project restart:
+
+1. Hook up the Feishu sheet: https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb
+
+2. Refactor the crawler logic
+
+3. Duration limit: >= 60s
+
+4. Aspect limit: only download and upload when width >= height
+
+5. Sensitive-word filter on titles
+
+
+
+==========2022/7/18===========
+20631262
+20631263
+20631264
+20631265
+20631266  
+20631267
+20631268
+20631269
+20631271
+20631272  
+
+Publish 3 Benshanzhufu videos on each account
+
+
+==========2022/4/29===========
+- Add sensitive-word filtering
+
+- Crawl starting at 11:00 every day, capped at 200 videos
+
+- Upload accounts: [20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
+
+
+
+==========2022/4/27===========
+- Crawl everything
+
+- Crawl between 9:00 and 12:00 every day
+
+- Upload accounts: 20631241 / 20631242

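The README above names main/run_recommend.py as the entry point, but that script is not part of this diff. Purely as an illustration of how the pieces added in this commit fit together, a hypothetical version could look like the sketch below; the scheduling loop and its interval are assumptions, not the project's actual entry script.

# Hypothetical sketch of main/run_recommend.py (not part of this commit)
import os
import sys
import time
sys.path.append(os.getcwd())
from main.common import Common
from main.recommend import Recommend


def main(env="prod"):
    while True:
        Common.logger("recommend").info("=== start benshanzhufu recommend crawl ===")
        Recommend.get_recommend("recommend")          # fetch the feed into the Feishu sheet
        Recommend.download_publish("recommend", env)  # download, upload to OSS, publish
        Common.del_logs("recommend")                  # keep only the 6 most recent log files
        time.sleep(60)


if __name__ == "__main__":
    main()
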
+ 39 - 60
main/common.py

@@ -5,8 +5,8 @@
 公共方法,包含:生成log / 删除log / 下载方法 / 读取文件 / 统计下载数
 """
 from datetime import date, timedelta
+from loguru import logger
 import datetime
-import logging
 import os
 import time
 import requests
@@ -25,51 +25,62 @@ class Common:
     # 明天 <class 'str'>  2022-04-15
     tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
 
+    # 使用 logger 模块生成日志
     @staticmethod
-    def crawler_log():
+    def logger(log_type):
         """
-        生成 log 日志
+        使用 logger 模块生成日志
         """
         # 日志路径
-        log_dir = r"./logs/"
+        log_dir = "./logs/"
         log_path = os.getcwd() + os.sep + log_dir
         if not os.path.isdir(log_path):
             os.makedirs(log_path)
 
-        # 日志参数
-        log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-        date_format = "%Y-%m-%d %p %H:%M:%S"
-        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '.log'
+        # 日志文件名
+        if log_type == "recommend":
+            log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '-bszf-recommend.log'
+        else:
+            log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '-bszf.log'
+
+        # 日志不打印到控制台
+        logger.remove(handler_id=None)
 
-        # 日志初始化
-        logging.basicConfig(filename=log_path + log_name, level=logging.INFO, format=log_format, datefmt=date_format)
-        crawler_logger = logging.getLogger("crawler-log")
+        # rotation="500 MB",实现每 500MB 存储一个文件
+        # rotation="12:00",实现每天 12:00 创建一个文件
+        # rotation="1 week",每周创建一个文件
+        # retention="10 days",每隔10天之后就会清理旧的日志
+        # 初始化日志
+        logger.add(log_dir + log_name, level="INFO", rotation='00:00')
 
-        return crawler_logger
+        return logger
 
+    # 清除日志,保留最近 6 个文件
     @classmethod
-    def del_logs(cls):
+    def del_logs(cls, log_type):
         """
         清除冗余日志文件
-        :return: 保留最近 7 个日志
+        :d_dir: 需要删除的 log 地址
+        :return: 保留最近 6 个日志
         """
-        log_dir = r"./logs/"
-        all_files = sorted(os.listdir(log_dir))
+        logs_dir = "./logs/"
+        all_files = sorted(os.listdir(logs_dir))
         all_logs = []
         for log in all_files:
             name = os.path.splitext(log)[-1]
             if name == ".log":
                 all_logs.append(log)
 
-        if len(all_logs) <= 7:
+        if len(all_logs) <= 6:
             pass
         else:
-            for file in all_logs[:len(all_logs) - 7]:
-                os.remove(log_dir + file)
-        cls.crawler_log().info("清除冗余日志成功")
+            for file in all_logs[:len(all_logs) - 6]:
+                os.remove(logs_dir + file)
+        cls.logger(log_type).info("清除冗余日志成功")
 
+    # 封装下载视频或封面的方法
     @classmethod
-    def download_method(cls, text, d_name, d_url):
+    def download_method(cls, log_type, text, d_name, d_url):
         """
         下载封面:text == "cover" ; 下载视频:text == "video"
         需要下载的视频标题:d_title
@@ -78,6 +89,7 @@ class Common:
         """
         # 首先创建一个保存该视频相关信息的文件夹
         video_dir = "./videos/" + d_name + "/"
+        # video_dir = "./videos/"
         if not os.path.exists(video_dir):
             os.mkdir(video_dir)
 
@@ -95,9 +107,9 @@ class Common:
                 with open(video_dir + video_name, "wb") as f:
                     for chunk in response.iter_content(chunk_size=10240):
                         f.write(chunk)
-                cls.crawler_log().info("==========视频下载完成==========")
+                cls.logger(log_type).info("==========视频下载完成==========")
             except Exception as e:
-                cls.crawler_log().error("视频下载失败:{}".format(e))
+                cls.logger(log_type).exception("视频下载失败:{}", e)
 
         # 下载封面
         elif text == "cover":
@@ -105,6 +117,8 @@ class Common:
             cover_url = d_url
             # 封面名
             cover_name = "image.jpg"
+            # # 封面名
+            # cover_name = d_name + ".jpg"
 
             # 下载封面
             urllib3.disable_warnings()
@@ -112,44 +126,9 @@ class Common:
             try:
                 with open(video_dir + cover_name, "wb") as f:
                     f.write(response.content)
-                cls.crawler_log().info("==========封面下载完成==========")
+                cls.logger(log_type).info("==========封面下载完成==========")
             except Exception as e:
-                cls.crawler_log().error("封面下载失败:{}".format(e))
-
-    @staticmethod
-    def read_txt(t_name):
-        """
-        读取 txt 文件
-        :param t_name: 文件名
-        :return: 文件内容
-        """
-        with open(r"./txt/" + t_name, "r", encoding="UTF-8") as f:
-            return f.readlines()
-
-    @classmethod
-    def del_content_in_txt(cls, d_content, d_filename):
-        """
-        删除指定文本的指定内容
-        :param d_content: 删除的指定内容
-        :param d_filename: 指定的文本
-        :return: None
-        """
-        with open(r"./txt/" + d_filename, "r", encoding="UTF-8") as f_r:
-            lines = f_r.readlines()
-        with open(r"./txt/" + d_filename, "w", encoding="utf-8") as f_w:
-            for line in lines:
-                if d_content in line.split(" + ")[1]:
-                    continue
-                f_w.write(line)
-        cls.crawler_log().info("删除{}中的{}成功".format(d_filename, d_content))
-
-    @classmethod
-    def benshanzhufu_download_count(cls):
-        videoid_path = r"./txt/benshanzhufu_videoid.txt"
-        count = 0
-        for count, line in enumerate(open(videoid_path, "rb").readlines()):
-            count += 1
-        cls.crawler_log().info('累计下载视频数: {}\n'.format(count))
+                cls.logger(log_type).exception("封面下载失败:{}", e)
 
 
 if __name__ == "__main__":

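On the logging switch above (logging → loguru): Common.logger(log_type) adds a file sink named <YYYY-MM-DD>-bszf-recommend.log (or -bszf.log) under ./logs/, rotated at midnight, and returns the shared loguru logger; del_logs(log_type) then keeps only the six most recent .log files. A minimal usage sketch:

from main.common import Common

# Writes to ./logs/<YYYY-MM-DD>-bszf-recommend.log; rotation="00:00" starts a new file each day.
Common.logger("recommend").info("crawler started")

# Prune old logs, keeping the 6 most recent files.
Common.del_logs("recommend")
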
+ 5 - 3
main/demo.py

@@ -100,6 +100,8 @@ class Demo:
 
 
 if __name__ == "__main__":
-    demo = Demo()
-    demo.get_feeds()
-    demo.distent_videoid()
+    # demo = Demo()
+    # demo.get_feeds()
+    # demo.distent_videoid()
+
+    pass

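Items 3 and 4 of the 2022/7/20 rules in the README (duration >= 60s, only width >= height) are enforced against the downloaded file. A minimal sketch of that check, modeled on get_video_info_from_local in main/recommend.py and assuming the same ffmpeg binding that exposes ffmpeg.probe; the helper name is illustrative only.

import ffmpeg  # assumed binding exposing ffmpeg.probe, as used in main/recommend.py

def meets_download_rules(video_path):
    """Illustrative check: duration >= 60s and width >= height (landscape or square)."""
    probe = ffmpeg.probe(video_path)
    stream = next((s for s in probe["streams"] if s["codec_type"] == "video"), None)
    if stream is None:
        return False
    width = int(stream["width"])
    height = int(stream["height"])
    duration = float(stream["duration"])
    return duration >= 60 and width >= height
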
+ 0 - 321
main/download.py

@@ -1,321 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/25
-"""
-从 微信小程序-本山祝福短视频 中,下载符合规则的视频
-"""
-import json
-import os
-import random
-import sys
-import time
-from urllib import parse
-import requests
-import urllib3
-sys.path.append(os.getcwd())
-from main.common import Common
-from main.publish import Publish
-
-proxies = {"http": None, "https": None}
-
-
-class BSZF:
-    # 已下载视频列表
-    download_video_list = []
-
-    # 过滤关键字
-    @classmethod
-    def sensitive_words(cls):
-        sensitive_words = [
-            "早上好",
-            "晚上好",
-            "中午好",
-            "最美祝福",
-            "祝福",
-            "新年好",
-            "立春",
-            "雨水",
-            "惊蛰",
-            "春分",
-            "清明",
-            "谷雨",
-            "小暑",
-            "大暑",
-            "立秋",
-            "处暑",
-            "白露",
-            "秋分",
-            "寒露",
-            "霜降",
-            "立冬",
-            "小雪",
-            "大雪",
-            "冬至",
-            "小寒",
-            "大寒",
-        ]
-        return sensitive_words
-
-    @classmethod
-    def get_recommend(cls):
-        """
-        从本山祝福小程序首页推荐获取视频list:
-            1.在 benshanzhufu_videoid.txt 中去重
-            2.在 benshanzhufu_feeds.txt 中去重
-            3.添加视频信息到 benshanzhufu_feeds.txt
-        """
-        now = int(time.time() * 1000)
-        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
-        header = {
-            "Connection": "keep-alive",
-            "vision": "1.1.0",
-            "content-type": "application/x-www-form-urlencoded",
-            "scene": "1008",
-            "content-time": str(now),
-            "token": "",
-            "visitorKey": "165086930003741",
-            "chatKey": "wx0fb8149da961d3b0",
-            "cache-time": str(now),
-            "Accept-Encoding": "gzip,compress,br,deflate",
-            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
-                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
-                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
-            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
-        }
-        parameter = {
-            "cid": "",
-            "page": random.randint(1, 75),
-            "is_ads": 1,
-            "model": "iPhone 11<iPhone12,1>",
-            "mini_version": "8.0.20",
-            "origin_channel": "3",
-            "origin_type": "2",
-            "origin_level": "0",
-            "ini_id": "165086930003741"
-        }
-
-        params = parse.quote(json.dumps(parameter))
-        url = url + str(params)
-
-        try:
-            urllib3.disable_warnings()
-            r = requests.get(headers=header, url=url, proxies=proxies, verify=False)
-            response = json.loads(r.content.decode("utf8"))
-            if "data" not in response:
-                Common.crawler_log().error("获取本山祝福视频 list 出错:{},休眠 3s".format(response))
-                time.sleep(3)
-            else:
-                feeds = response["data"]["list"]
-                for i in range(len(feeds)):
-                    if "nid" not in feeds[i]:
-                        video_id = "0"
-                        Common.crawler_log().info("video_id:{}".format(video_id))
-                    else:
-                        video_id = feeds[i]["nid"]
-                        Common.crawler_log().info("video_id:{}".format(video_id))
-
-                    if "video_cover" not in feeds[i]:
-                        video_cover = "0"
-                        Common.crawler_log().info("video_cover不存在")
-                    else:
-                        video_cover = feeds[i]["video_cover"]
-                        Common.crawler_log().info("video_cover:{}".format(video_cover))
-
-                    if "video_url" not in feeds[i]:
-                        video_url = "0"
-                        Common.crawler_log().info("video_url:不存在")
-                    elif ".mp4" not in feeds[i]["video_url"]:
-                        video_url = "0"
-                        Common.crawler_log().info("video_url无效:".format(video_url))
-                    else:
-                        video_url = feeds[i]["video_url"]
-                        Common.crawler_log().info("video_url:{}".format(video_url))
-
-                    if "width" not in feeds[i] or "height" not in feeds[i]:
-                        video_width = "0"
-                        video_height = "0"
-                        video_resolution = str(video_width) + "*" + str(video_height)
-                        Common.crawler_log().info("无分辨率")
-                    else:
-                        video_width = feeds[i]["width"]
-                        video_height = feeds[i]["height"]
-                        video_resolution = str(video_width) + "*" + str(video_height)
-                        Common.crawler_log().info("video_resolution:{}".format(video_resolution))
-
-                    if "commentCount" not in feeds[i]:
-                        video_comment_cnt = "0"
-                        Common.crawler_log().info("video_comment_cnt:0")
-                    else:
-                        video_comment_cnt = feeds[i]["commentCount"]
-                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
-
-                    if "update_time" not in feeds[i]:
-                        video_send_time = "0"
-                        Common.crawler_log().info("video_send_time:不存在")
-                    else:
-                        video_send_time = feeds[i]["update_time"]
-                        Common.crawler_log().info("video_send_time:{}".format(
-                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)))))
-
-                    # 视频标题过滤话题及处理特殊字符
-                    if "title" not in feeds[i]:
-                        video_title = "0"
-                        Common.crawler_log().info("video_title不存在")
-                    else:
-                        video_title = feeds[i]["title"].strip().replace("\n", "")\
-                            .replace("/", "").replace("本山祝福", "").replace(" ", "")\
-                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
-                            .replace("#", "").replace(".", "。").replace("\\", "")\
-                            .replace(":", "").replace("*", "").replace("?", "")\
-                            .replace("?", "").replace('"', "").replace("<", "")\
-                            .replace(">", "").replace("|", "")
-                        Common.crawler_log().info("video_title:{}".format(video_title))
-
-                    video_like_cnt = "10000"
-                    video_share_cnt = "10000"
-                    video_duration = "10000"
-                    video_play_cnt = "10000"
-                    user_name = "bszf"
-                    head_url = video_cover
-                    user_id = "10000"
-
-                    # 从 benshanzhufu_videoid.txt 中去重
-                    video_ids = Common.read_txt("benshanzhufu_videoid.txt")
-                    if video_id in [p_id.strip() for p_id in video_ids]:
-                        Common.crawler_log().info("该视频已下载:{}".format(video_title))
-                        pass
-                    else:
-                        Common.crawler_log().info("该视频未下载:{}".format(video_title))
-
-                        # 从 benshanzhufu_feeds.txt 中去重
-                        contents = Common.read_txt("benshanzhufu_feeds.txt")
-                        # benshanzhufu_feeds.txt 为空时,直接保存
-                        if len(contents) == 0 and video_id != "0" and video_url != "0" and video_title != "0":
-                            basic_time = int(time.time())
-                            Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
-                            with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                f_a.write(str(basic_time) + " + " +
-                                          str(video_id) + " + " +
-                                          str(video_play_cnt) + " + " +
-                                          str(video_title) + " + " +
-                                          str(video_duration) + " + " +
-                                          str(video_comment_cnt) + " + " +
-                                          str(video_like_cnt) + " + " +
-                                          str(video_share_cnt) + " + " +
-                                          str(video_resolution) + " + " +
-                                          str(video_send_time) + " + " +
-                                          str(user_name) + " + " +
-                                          str(head_url) + " + " +
-                                          str(video_cover) + " + " +
-                                          str(video_url) + " + " +
-                                          str(user_id) + " + " +
-                                          str("wx0fb8149da961d3b0") + "\n")
-                        else:
-                            if video_id in [content.split(" + ")[1] for content in contents]:
-                                Common.crawler_log().info("该视频已在 benshanzhufu_feeds.txt 中:{}".format(video_title))
-                            elif video_id == "0" or video_url == "0" or video_title == "0":
-                                Common.crawler_log().info("视频不存在")
-                            else:
-                                basic_time = int(time.time())
-                                Common.crawler_log().info("添加视频信息至benshanzhufu_feeds.txt:{}".format(video_title))
-                                with open(r"./txt/benshanzhufu_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                    f_a.write(str(basic_time) + " + " +
-                                              str(video_id) + " + " +
-                                              str(video_play_cnt) + " + " +
-                                              str(video_title) + " + " +
-                                              str(video_duration) + " + " +
-                                              str(video_comment_cnt) + " + " +
-                                              str(video_like_cnt) + " + " +
-                                              str(video_share_cnt) + " + " +
-                                              str(video_resolution) + " + " +
-                                              str(video_send_time) + " + " +
-                                              str(user_name) + " + " +
-                                              str(head_url) + " + " +
-                                              str(video_cover) + " + " +
-                                              str(video_url) + " + " +
-                                              str(user_id) + " + " +
-                                              str("wx0fb8149da961d3b0") + "\n")
-        except Exception as e:
-            Common.crawler_log().error("获取视频 list 异常:{}".format(e))
-
-    @classmethod
-    def download_video(cls, env):
-        """
-        下载视频
-        测试环境:env == dev
-        正式环境:env == prod
-        """
-        videos = Common.read_txt("benshanzhufu_feeds.txt")
-        for video in videos:
-            download_video_id = video.strip().split(" + ")[1]
-            try:
-                download_video_title = video.strip().split(" + ")[3]
-                download_video_duration = video.strip().split(" + ")[4]
-                download_video_play_cnt = video.strip().split(" + ")[2]
-                download_video_comment_cnt = video.strip().split(" + ")[5]
-                download_video_like_cnt = video.strip().split(" + ")[6]
-                download_video_share_cnt = video.strip().split(" + ")[7]
-                download_video_resolution = video.strip().split(" + ")[8]
-                download_video_send_time = video.strip().split(" + ")[9]
-                download_user_name = video.strip().split(" + ")[10]
-                download_head_url = video.strip().split(" + ")[11]
-                download_cover_url = video.strip().split(" + ")[12]
-                download_video_url = video.strip().split(" + ")[13]
-                download_video_session = video.strip().split(" + ")[-1]
-
-                if any(word if word in download_video_title else False for word in cls.sensitive_words()) is True:
-                    Common.crawler_log().info("视频已中敏感词,删除该视频信息:{}".format(download_video_title))
-                    # 删除该视频在benshanzhufu_feeds.txt中的信息
-                    Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
-                else:
-                    Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
-                    # 下载封面
-                    Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
-                    # 下载视频
-                    Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
-                    # 保存视频信息至 benshanzhufu_videoid.txt
-                    with open(r"./txt/benshanzhufu_videoid.txt", "a", encoding="UTF-8") as fa:
-                        fa.write(download_video_id + "\n")
-                    # 添加视频 ID 到 list
-                    cls.download_video_list.append(download_video_id)
-                    # 保存视频信息至 "./videos/{download_video_title}/info.txt"
-                    with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
-                        f_a.write(str(download_video_id) + "\n" +
-                                  str(download_video_title) + "\n" +
-                                  str(download_video_duration) + "\n" +
-                                  str(download_video_play_cnt) + "\n" +
-                                  str(download_video_comment_cnt) + "\n" +
-                                  str(download_video_like_cnt) + "\n" +
-                                  str(download_video_share_cnt) + "\n" +
-                                  str(download_video_resolution) + "\n" +
-                                  str(download_video_send_time) + "\n" +
-                                  str(download_user_name) + "\n" +
-                                  str(download_head_url) + "\n" +
-                                  str(download_video_url) + "\n" +
-                                  str(download_cover_url) + "\n" +
-                                  str(download_video_session))
-
-                    # 上传视频
-                    if env == "dev":
-                        Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                        Publish.upload_and_publish("dev", "play")
-                    elif env == "prod":
-                        Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                        Publish.upload_and_publish("prod", "play")
-
-                    try:
-                        Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
-                    except Exception as e:
-                        Common.crawler_log().error("删除benshanzhufu_feeds.txt中的{}失败,重新删除:{}".format(download_video_id, e))
-                        Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
-
-            except Exception as e:
-                Common.crawler_log().error("下载视频异常:{}".format(e))
-                # 删除该视频在 recommend.txt 中的信息
-                Common.del_content_in_txt(download_video_id, "benshanzhufu_feeds.txt")
-
-
-if __name__ == "__main__":
-    bszf = BSZF()
-    bszf.get_recommend()
-    bszf.download_video("dev")

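Both the deleted download.py above and the new recommend.py gate titles with the expression any(word if word in title else False for word in ...) is True. For readers parsing that line, it is equivalent (for non-empty words) to the plainer form sketched below; note that the word list itself now comes from the Feishu sheet instead of the hard-coded list removed here.

def hits_sensitive_word(title, word_list):
    """True if any non-empty sensitive word occurs in the cleaned title."""
    return any(word and word in title for word in word_list)

# e.g. hits_sensitive_word("立春送祝福", ["立春", "雨水"]) -> True
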
+ 306 - 0
main/feishu_lib.py

@@ -0,0 +1,306 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/7/20
+import json
+import requests
+import urllib3
+from main.common import Common
+proxies = {"http": None, "https": None}
+
+
+class Feishu:
+    """
+    编辑飞书云文档
+    """
+    # 看一看爬虫数据表
+    kanyikan_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?"
+    # 快手爬虫数据表
+    kuaishou_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnp4SaJt37q6OOOrYzPMjQkg?"
+    # 微视爬虫数据表
+    weishi_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?"
+    # 小年糕爬虫数据表
+    xiaoniangao_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?"
+    # 数据监控表
+    crawler_monitor = "https://w42nne6hzg.feishu.cn/sheets/shtcnlZWYazInhf7Z60jkbLRJyd?"
+    # 本山祝福数据表
+    crawler_benshanzhufu = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?"
+
+    # 飞书路径token
+    @classmethod
+    def spreadsheettoken(cls, crawler):
+        """
+        :param crawler: 哪个爬虫
+        """
+        if crawler == "kanyikan":
+            return "shtcngRPoDYAi24x52j2nDuHMih"
+        elif crawler == "kuaishou":
+            return "shtcnp4SaJt37q6OOOrYzPMjQkg"
+        elif crawler == "weishi":
+            return "shtcn5YSWg91JfVGzj0SFZIRRPh"
+        elif crawler == "xiaoniangao":
+            return "shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+        elif crawler == "monitor":
+            return "shtcnlZWYazInhf7Z60jkbLRJyd"
+        elif crawler == "bszf":
+            return "shtcnGh2rrsPYM4iVNEBO7OqWrb"
+
+    # 获取飞书api token
+    @classmethod
+    def get_token(cls, log_type):
+        """
+        获取飞书api token
+        :return:
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+        post_data = {"app_id": "cli_a13ad2afa438d00b",  # 这里账号密码是发布应用的后台账号及密码
+                     "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
+
+        try:
+            urllib3.disable_warnings()
+            response = requests.post(url=url, data=post_data, proxies=proxies, verify=False)
+            tenant_access_token = response.json()["tenant_access_token"]
+            return tenant_access_token
+        except Exception as e:
+            Common.logger(log_type).error("获取飞书 api token 异常:{}", e)
+
+    # 获取表格元数据
+    @classmethod
+    def get_metainfo(cls, log_type, crawler):
+        """
+        获取表格元数据
+        :return:
+        """
+        get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                           + cls.spreadsheettoken(crawler) + "/metainfo"
+
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        params = {
+            "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
+            "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(url=get_metainfo_url, headers=headers, params=params, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            return response
+        except Exception as e:
+            Common.logger(log_type).error("获取表格元数据异常:{}", e)
+
+    # 读取工作表中所有数据
+    @classmethod
+    def get_values_batch(cls, log_type, crawler, sheetid):
+        """
+        读取工作表中所有数据
+        :param log_type: 启用哪个 log
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张表
+        :return: 所有数据
+        """
+        get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/values_batch_get"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        params = {
+            # 多个查询范围 如 url?ranges=range1,range2 ,其中 range 包含 sheetId 与单元格范围两部分
+            "ranges": sheetid,
+
+            # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+            # valueRenderOption=FormattedValue 计算并格式化单元格;
+            # valueRenderOption=Formula单元格中含有公式时返回公式本身;
+            # valueRenderOption=UnformattedValue计算但不对单元格进行格式化
+            "valueRenderOption": "ToString",
+
+            # dateTimeRenderOption=FormattedString 计算并将时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+            "dateTimeRenderOption": "",
+
+            # 返回的用户id类型,可选open_id,union_id
+            "user_id_type": "open_id"
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(url=get_values_batch_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            response = json.loads(r.content.decode("utf8"))
+            values = response["data"]["valueRanges"][0]["values"]
+            return values
+        except Exception as e:
+            Common.logger(log_type).error("读取工作表所有数据异常:{}", e)
+
+    # 工作表,插入行或列
+    @classmethod
+    def insert_columns(cls, log_type, crawler, sheetid, majordimension, startindex, endindex):
+        """
+        工作表插入行或列
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param majordimension:行或者列, ROWS、COLUMNS
+        :param startindex:开始位置
+        :param endindex:结束位置
+        """
+        insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                             + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        body = {
+            "dimension": {
+                "sheetId": sheetid,
+                "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
+                "startIndex": startindex,  # 开始的位置
+                "endIndex": endindex  # 结束的位置
+            },
+            "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=insert_columns_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Common.logger(log_type).info("插入行或列:{}", r.json()["msg"])
+        except Exception as e:
+            Common.logger(log_type).error("插入行或列异常:{}", e)
+
+    # 写入数据
+    @classmethod
+    def update_values(cls, log_type, crawler, sheetid, ranges, values):
+        """
+        写入数据
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param ranges:单元格范围
+        :param values:写入的具体数据,list
+        """
+        update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                            + cls.spreadsheettoken(crawler) + "/values_batch_update"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        body = {
+            "valueRanges": [
+                {
+                    "range": sheetid + "!" + ranges,
+                    "values": values
+                },
+            ],
+        }
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=update_values_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Common.logger(log_type).info("写入数据:{}", r.json()["msg"])
+        except Exception as e:
+            Common.logger(log_type).error("写入数据异常:{}", e)
+
+    # 合并单元格
+    @classmethod
+    def merge_cells(cls, log_type, crawler, sheetid, ranges):
+        """
+        合并单元格
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid:哪张工作表
+        :param ranges:需要合并的单元格范围
+        """
+        merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                          + cls.spreadsheettoken(crawler) + "/merge_cells"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+
+        body = {
+            "range": sheetid + "!" + ranges,
+            "mergeType": "MERGE_ROWS"
+        }
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=merge_cells_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Common.logger(log_type).info("合并单元格:{}", r.json()["msg"])
+        except Exception as e:
+            Common.logger(log_type).error("合并单元格异常:{}", e)
+
+    # 读取单元格数据
+    @classmethod
+    def get_range_value(cls, log_type, crawler, sheetid, cell):
+        """
+        读取单元格内容
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张工作表
+        :param cell: 哪个单元格
+        :return: 单元格内容
+        """
+        get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        params = {
+            # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+            # valueRenderOption=FormattedValue 计算并格式化单元格;
+            # valueRenderOption=Formula 单元格中含有公式时返回公式本身;
+            # valueRenderOption=UnformattedValue 计算但不对单元格进行格式化。
+            "valueRenderOption": "FormattedValue",
+
+            # dateTimeRenderOption=FormattedString 计算并对时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+            "dateTimeRenderOption": "",
+
+            # 返回的用户id类型,可选open_id,union_id
+            "user_id_type": "open_id"
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(url=get_range_value_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            return r.json()["data"]["valueRange"]["values"][0]
+        except Exception as e:
+            Common.logger(log_type).error("读取单元格数据异常:{}", e)
+
+    # 删除行或列,可选 ROWS、COLUMNS
+    @classmethod
+    def dimension_range(cls, log_type, crawler, sheetid, major_dimension, startindex, endindex):
+        """
+        删除行或列
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid:工作表
+        :param major_dimension:默认 ROWS ,可选 ROWS、COLUMNS
+        :param startindex:开始的位置
+        :param endindex:结束的位置
+        :return:
+        """
+        dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/dimension_range"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(log_type),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        body = {
+            "dimension": {
+                "sheetId": sheetid,
+                "majorDimension": major_dimension,
+                "startIndex": startindex,
+                "endIndex": endindex
+                }
+            }
+        try:
+            urllib3.disable_warnings()
+            r = requests.delete(url=dimension_range_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Common.logger(log_type).info("删除视频数据:{}", r.json()["msg"])
+        except Exception as e:
+            Common.logger(log_type).error("删除视频数据异常:{}", e)
+
+
+if __name__ == "__main__":
+    feishu = Feishu()
+
+    pass

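feishu_lib.py is a thin wrapper over the Feishu Sheets v2 REST API: every method first exchanges app_id/app_secret for a tenant_access_token, then calls a spreadsheet endpoint with that token. A minimal usage sketch of the wrapper, with the "bszf" spreadsheet key and the sheet id "CcHgO7" taken from recommend.py; the cell values written here are placeholders.

from main.feishu_lib import Feishu

# Read every row of the recommend_feeds sheet of the benshanzhufu spreadsheet.
rows = Feishu.get_values_batch("recommend", "bszf", "CcHgO7")
print(len(rows))

# Insert one blank row under the header, then write into it
# (recommend.py fills the whole A2:N2 record; only two cells are shown here).
Feishu.insert_columns("recommend", "bszf", "CcHgO7", "ROWS", 1, 2)
Feishu.update_values("recommend", "bszf", "CcHgO7", "A2:B2", [["2022/07/21 12:00:00", "推荐榜"]])
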
+ 59 - 53
main/publish.py

@@ -1,10 +1,6 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2022/4/25
-"""
-上传视频到阿里云 OSS
-上传视频到管理后台
-"""
 import json
 import os
 import random
@@ -20,7 +16,7 @@ proxies = {"http": None, "https": None}
 
 class Publish:
     @classmethod
-    def publish_video_dev(cls, request_data):
+    def publish_video_dev(cls, log_type, request_data):
         """
         loginUid  站内uid (随机)
         appType  默认:888888
@@ -36,16 +32,19 @@ class Publish:
         versionCode  版本 默认1
         :return:
         """
-        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        # Common.logger().info('publish request data: {}'.format(request_data))
         result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
-        Common.crawler_log().info('publish result: {}'.format(result))
+        # Common.logger(log_type).info('publish result: {}'.format(result))
+        video_id = result["data"]["id"]
+        # Common.logger(log_type).info('video_id: {}'.format(video_id))
         if result['code'] != 0:
-            Common.crawler_log().error('pushlish failure msg = {}'.format(result['msg']))
+            Common.logger(log_type).error('publish failure msg = {}'.format(result['msg']))
         else:
-            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+            Common.logger(log_type).info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+        return video_id
 
     @classmethod
-    def publish_video_prod(cls, request_data):
+    def publish_video_prod(cls, log_type, request_data):
         """
         loginUid  站内uid (随机)
         appType  默认:888888
@@ -61,13 +60,15 @@ class Publish:
         versionCode  版本 默认1
         :return:
         """
-        # Common.crawler_log().info('publish request data: {}'.format(request_data))
         result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
-        Common.crawler_log().info('publish result: {}'.format(result))
+        # Common.logger(log_type).info('publish result: {}'.format(result))
+        video_id = result["data"]["id"]
+        # Common.logger(log_type).info('video_id: {}'.format(video_id))
         if result['code'] != 0:
-            Common.crawler_log().error('pushlish failure msg = {}'.format(result['msg']))
+            Common.logger(log_type).error('publish failure msg = {}'.format(result['msg']))
         else:
-            Common.crawler_log().info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+            Common.logger(log_type).info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
+        return video_id
 
     @classmethod
     def request_post(cls, request_url, request_data):
@@ -115,46 +116,45 @@ class Publish:
     - 读取 基本信息 调用发布接口
     """
     # env 日期20220225 文件名
-    oss_file_path_video = r'longvideo/crawler_local/video/{}/{}/{}'
-    oss_file_path_image = r'longvideo/crawler_local/image/{}/{}/{}'
+    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
+    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'
 
     @classmethod
-    def put_file(cls, oss_file, local_file):
-        # Common.crawler_log().info("put oss file = {}, local file = {}".format(oss_file, local_file))
+    def put_file(cls, log_type, oss_file, local_file):
         cls.bucket.put_object_from_file(oss_file, local_file)
-        Common.crawler_log().info("put oss file = {}, local file = {} success".format(oss_file, local_file))
+        Common.logger(log_type).info("put oss file = {}, local file = {} success".format(oss_file, local_file))
 
     # 清除本地文件
     @classmethod
-    def remove_local_file(cls, local_file):
-        # Common.crawler_log().info("remove local file = {}".format(local_file))
+    def remove_local_file(cls, log_type, local_file):
         os.remove(local_file)
-        Common.crawler_log().info("remove local file = {} success".format(local_file))
+        Common.logger(log_type).info("remove local file = {} success".format(local_file))
 
     # 清除本地文件夹
     @classmethod
-    def remove_local_file_dir(cls, local_file):
-        # Common.crawler_log().info("remove local file dir = {}".format(local_file))
+    def remove_local_file_dir(cls, log_type, local_file):
         os.rmdir(local_file)
-        Common.crawler_log().info("remove local file dir = {} success".format(local_file))
+        Common.logger(log_type).info("remove local file dir = {} success".format(local_file))
 
-    local_file_path = '.\\videos'
+    local_file_path = './videos'
     video_file = 'video'
     image_file = 'image'
     info_file = 'info'
-    uids_dev_up = [6267141]
-    uids_dev_play = [6267140]
-    uids_prod_up = [20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
-    uids_prod_play = [20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
+    uids_dev_up = [6267140]
+    uids_dev_play = [6267141]
+    uids_prod_up = [20631262, 20631263, 20631264, 20631265, 20631266, 20631267, 20631268, 20631269, 20631271, 20631272]
+    uids_prod_play = [20631262, 20631263, 20631264, 20631265, 20631266, 20631267, 20631268, 20631269, 20631271,
+                      20631272]
 
     @classmethod
-    def upload_and_publish(cls, env, job):
+    def upload_and_publish(cls, log_type, env, job):
         """
         上传视频到 oss
+        :param log_type: 选择的 log
         :param env: 测试环境:dev,正式环境:prod
         :param job: 上升榜:up,播放量:play
         """
-        Common.crawler_log().info("upload_and_publish starting...")
+        Common.logger(log_type).info("upload_and_publish starting...")
         today = time.strftime("%Y%m%d", time.localtime())
         # videos 目录下的所有视频文件夹
         files = os.listdir(cls.local_file_path)
@@ -164,10 +164,13 @@ class Publish:
                 fi_d = os.path.join(cls.local_file_path, f)
                 # 确认为视频文件夹
                 if os.path.isdir(fi_d):
-                    Common.crawler_log().info('dir = {}'.format(fi_d))
+                    Common.logger(log_type).info('dir = {}'.format(fi_d))
                     # 列出所有视频文件夹
                     dir_files = os.listdir(fi_d)
-                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
+                    data = {'appType': '888888',
+                            'crawlerSrcCode': 'BENSHANZHUFU',
+                            'viewStatus': '1',
+                            'versionCode': '1'}
                     now_timestamp = int(round(time.time() * 1000))
                     data['crawlerTaskTimestamp'] = str(now_timestamp)
                     global uid
@@ -183,8 +186,8 @@ class Publish:
                     # 单个视频文件夹下的所有视频文件
                     for fi in dir_files:
                         # 视频文件夹下的所有文件路径
-                        fi_path = fi_d + '\\' + fi
-                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        fi_path = fi_d + '/' + fi
+                        Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
                         # 读取 info.txt,赋值给 data
                         if cls.info_file in fi:
                             f = open(fi_path, "r", encoding="UTF-8")
@@ -193,7 +196,7 @@ class Publish:
                                 line = f.readline()
                                 line = line.replace('\n', '')
                                 if line is not None and len(line) != 0 and not line.isspace():
-                                    Common.crawler_log().info("line = {}".format(line))
+                                    # Common.logger(log_type).info("line = {}".format(line))
                                     if i == 0:
                                         data['crawlerSrcId'] = line
                                     elif i == 1:
@@ -203,15 +206,15 @@ class Publish:
                                     elif i == 8:
                                         data['crawlerSrcPublishTimestamp'] = line
                                 else:
-                                    Common.crawler_log().warning("{} line is None".format(fi_path))
+                                    Common.logger(log_type).warning("{} line is None".format(fi_path))
                             f.close()
                             # remove info.txt
-                            cls.remove_local_file(fi_path)
+                            cls.remove_local_file(log_type, fi_path)
                     # 刷新数据
                     dir_files = os.listdir(fi_d)
                     for fi in dir_files:
-                        fi_path = fi_d + '\\' + fi
-                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        fi_path = fi_d + '/' + fi
+                        # Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
                         # 上传oss
                         if cls.video_file in fi:
                             global oss_video_file
@@ -219,31 +222,34 @@ class Publish:
                                 oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
                             elif env == "prod":
                                 oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
-                            Common.crawler_log().info("oss_video_file = {}".format(oss_video_file))
-                            cls.put_file(oss_video_file, fi_path)
+                            Common.logger(log_type).info("oss_video_file = {}".format(oss_video_file))
+                            cls.put_file(log_type, oss_video_file, fi_path)
                             data['videoPath'] = oss_video_file
-                            Common.crawler_log().info("videoPath = {}".format(oss_video_file))
+                            Common.logger(log_type).info("videoPath = {}".format(oss_video_file))
                         elif cls.image_file in fi:
                             global oss_image_file
                             if env == "dev":
                                 oss_image_file = cls.oss_file_path_image.format("env", today, data['crawlerSrcId'])
                             elif env == "prod":
                                 oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
-                            Common.crawler_log().info("oss_image_file = {}".format(oss_image_file))
-                            cls.put_file(oss_image_file, fi_path)
+                            Common.logger(log_type).info("oss_image_file = {}".format(oss_image_file))
+                            cls.put_file(log_type, oss_image_file, fi_path)
                             data['coverImgPath'] = oss_image_file
-                            Common.crawler_log().info("coverImgPath = {}".format(oss_image_file))
+                            Common.logger(log_type).info("coverImgPath = {}".format(oss_image_file))
                         # 全部remove
-                        cls.remove_local_file(fi_path)
+                        cls.remove_local_file(log_type, fi_path)
 
                     # 发布
                     if env == "dev":
-                        cls.publish_video_dev(data)
+                        video_id = cls.publish_video_dev(log_type, data)
                     elif env == "prod":
-                        cls.publish_video_prod(data)
-                    cls.remove_local_file_dir(fi_d)
+                        video_id = cls.publish_video_prod(log_type, data)
+                    else:
+                        video_id = cls.publish_video_dev(log_type, data)
+                    cls.remove_local_file_dir(log_type, fi_d)
+                    return video_id
 
                 else:
-                    Common.crawler_log().error('file not a dir = {}'.format(fi_d))
+                    Common.logger(log_type).error('file not a dir = {}'.format(fi_d))
             except Exception as e:
-                Common.crawler_log().exception('upload_and_publish error', e)
+                Common.logger(log_type).exception('upload_and_publish error', e)

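After this change Publish.upload_and_publish takes log_type first and returns the station-side video id parsed from the /crawler/video/send response. A minimal call, as recommend.py's download_publish is expected to invoke it, assuming ./videos/ already contains one downloaded folder with its video, image and info.txt files:

from main.publish import Publish

# env: "dev" -> videotest.yishihui.com, "prod" -> longvideoapi.piaoquantv.com
# job: "up" or "play" (selects which uid pool the video is published under)
our_video_id = Publish.upload_and_publish(log_type="recommend", env="prod", job="play")
print(our_video_id)
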
+ 418 - 0
main/recommend.py

@@ -0,0 +1,418 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+"""
+从 微信小程序-本山祝福短视频 中,下载符合规则的视频
+"""
+import json
+import os
+import random
+import shutil
+import sys
+import time
+from urllib import parse
+import ffmpeg
+import requests
+import urllib3
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.publish import Publish
+from main.feishu_lib import Feishu
+proxies = {"http": None, "https": None}
+
+
+class Recommend:
+    # 翻页参数
+    visitor_key = ""
+    page = 1
+
+    # 过滤词库
+    @classmethod
+    def sensitive_words(cls, log_type):
+        word_list = []
+        # 从云文档读取所有敏感词,添加到词库列表
+        lists = Feishu.get_values_batch(log_type, "bszf", "DjXfqG")
+        for i in lists:
+            for j in i:
+                # 过滤空的单元格内容
+                if j is None:
+                    pass
+                else:
+                    word_list.append(j)
+        return word_list
+
+    # 获取已下载视频宽高、时长等信息
+    @classmethod
+    def get_video_info_from_local(cls, video_path):
+        probe = ffmpeg.probe(video_path)
+        # print('video_path: {}'.format(video_path))
+        # format1 = probe['format']
+        # bit_rate = int(format1['bit_rate']) / 1000
+        # duration = format['duration']
+        # size = int(format1['size']) / 1024 / 1024
+        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
+        if video_stream is None:
+            print('No video stream found!')
+            return
+        width = int(video_stream['width'])
+        height = int(video_stream['height'])
+        # num_frames = int(video_stream['nb_frames'])
+        # fps = int(video_stream['r_frame_rate'].split('/')[0]) / int(video_stream['r_frame_rate'].split('/')[1])
+        duration = float(video_stream['duration'])
+        # print('width: {}'.format(width))
+        # print('height: {}'.format(height))
+        # print('num_frames: {}'.format(num_frames))
+        # print('bit_rate: {}k'.format(bit_rate))
+        # print('fps: {}'.format(fps))
+        # print('size: {}MB'.format(size))
+        # print('duration: {}'.format(duration))
+        return width, height, duration
+
+    # 推荐列表获取视频
+    @classmethod
+    def get_recommend(cls, log_type):
+        """
+        获取首页推荐视频列表,写入:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=CcHgO7
+        """
+        now = int(time.time() * 1000)
+        url = "https://bszf.wentingyou.cn/index.php/v111/index/index?parameter="
+        header = {
+            "Connection": "keep-alive",
+            "vision": "1.1.0",
+            "content-type": "application/x-www-form-urlencoded",
+            "scene": "1008",
+            "content-time": str(now),
+            "token": "",
+            "visitorKey": "165086930003741",
+            "chatKey": "wx0fb8149da961d3b0",
+            "cache-time": str(now),
+            "Accept-Encoding": "gzip,compress,br,deflate",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
+                          "MicroMessenger/8.0.20(0x1800142d) NetType/WIFI Language/zh_CN",
+            "Referer": "https://servicewechat.com/wx0fb8149da961d3b0/2/page-frame.html"
+        }
+        parameter = {
+            "cid": "",
+            "page": random.randint(1, 76),
+            "is_ads": 1,
+            "model": "iPhone 11<iPhone12,1>",
+            "mini_version": "8.0.25",
+            "origin_channel": "-1",
+            "origin_type": "2",
+            "origin_level": "0",
+            "ini_id": cls.visitor_key
+        }
+
+        params = parse.quote(json.dumps(parameter))
+        url = url + str(params)
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(headers=header, url=url, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            # 翻页
+            cls.visitor_key = r.json()["data"]["visitor_key"]
+            cls.page += 1
+
+            # Common.logger(log_type).info("visitor_key:{}", cls.visitor_key)
+            # Common.logger(log_type).info("page:{}\n", cls.page)
+            #
+            # for k, v in parameter.items():
+            #     Common.logger(log_type).info("{}:{}", k, v)
+            # Common.logger(log_type).info("\n")
+            #
+            # Common.logger(log_type).info("response:{}\n", response)
+
+            if "data" not in response:
+                Common.logger(log_type).warning("get_recommend, response:{}".format(response))
+                time.sleep(3)
+            else:
+                feeds = response["data"]["list"]
+                for i in range(len(feeds)):
+                    if "nid" not in feeds[i]:
+                        video_id = 0
+                    else:
+                        video_id = feeds[i]["nid"]
+
+                    if "video_cover" not in feeds[i]:
+                        cover_url = 0
+                    else:
+                        cover_url = feeds[i]["video_cover"]
+
+                    if "video_url" not in feeds[i]:
+                        video_url = 0
+                    elif ".mp4" not in feeds[i]["video_url"]:
+                        video_url = 0
+                    else:
+                        video_url = feeds[i]["video_url"]
+
+                    if "width" not in feeds[i] or "height" not in feeds[i]:
+                        video_width = 0
+                        video_height = 0
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                    else:
+                        video_width = feeds[i]["width"]
+                        video_height = feeds[i]["height"]
+                        video_resolution = str(video_width) + "*" + str(video_height)
+
+                    if "commentCount" not in feeds[i]:
+                        video_comment_cnt = 0
+                    else:
+                        video_comment_cnt = feeds[i]["commentCount"]
+
+                    if "update_time" not in feeds[i]:
+                        video_send_time = 0
+                    else:
+                        video_send_time = feeds[i]["update_time"]
+
+                    # 视频标题过滤话题及处理特殊字符
+                    if "title" not in feeds[i]:
+                        video_title = 0
+                    else:
+                        video_title = feeds[i]["title"].strip().replace("\n", "")\
+                            .replace("/", "").replace("本山祝福", "").replace(" ", "")\
+                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
+                            .replace("#", "").replace(".", "。").replace("\\", "")\
+                            .replace(":", "").replace("*", "").replace("?", "")\
+                            .replace("?", "").replace('"', "").replace("<", "")\
+                            .replace(">", "").replace("|", "")
+
+                    like_cnt = "0"
+                    share_cnt = "0"
+                    play_cnt = "0"
+                    user_name = "本山祝福"
+                    head_url = cover_url
+                    user_id = "benshanzhufu"
+                    Common.logger(log_type).info("video_title:{}".format(video_title))
+                    Common.logger(log_type).info("video_id:{}".format(video_id))
+                    # Common.logger(log_type).info("video_comment_cnt:{}".format(video_comment_cnt))
+                    # Common.logger(log_type).info("video_resolution:{}".format(video_resolution))
+                    Common.logger(log_type).info(
+                        "video_send_time:{}", time.strftime(
+                            "%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time))))
+                    # Common.logger(log_type).info("video_cover:{}".format(cover_url))
+                    Common.logger(log_type).info("video_url:{}".format(video_url))
+
+                    # 过滤无效视频
+                    if video_id == 0 or cover_url == 0 or video_url == 0:
+                        Common.logger(log_type).info("无效视频\n")
+                    # 已下载表去重:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=440018
+                    elif str(video_id) in [n for m in Feishu.get_values_batch(log_type, "bszf", "440018") for n in m]:
+                        Common.logger(log_type).info("视频已下载\n")
+                    # recommend_feeds表去重:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=CcHgO7
+                    elif str(video_id) in [n for m in Feishu.get_values_batch(log_type, "bszf", "CcHgO7") for n in m]:
+                        Common.logger(log_type).info("视频已在recommend_feeds表中\n")
+                    # 竖版视频表去重:https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=dAcOWt
+                    elif str(video_id) in [n for m in Feishu.get_values_batch(log_type, "bszf", "dAcOWt") for n in m]:
+                        Common.logger(log_type).info("视频已在竖版视频表中\n")
+                    else:
+                        time.sleep(1)
+                        Feishu.insert_columns(log_type, "bszf", "CcHgO7", "ROWS", 1, 2)
+                        get_feeds_time = int(time.time())
+                        values = [[str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time))),
+                                   "推荐榜",
+                                   str(video_id),
+                                   video_title,
+                                   play_cnt,
+                                   like_cnt,
+                                   share_cnt,
+                                   video_comment_cnt,
+                                   time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time))),
+                                   user_name,
+                                   user_id,
+                                   head_url,
+                                   cover_url,
+                                   video_url]]
+                        time.sleep(1)
+                        Feishu.update_values(log_type, "bszf", "CcHgO7", "A2:N2", values)
+                        Common.logger(log_type).info("添加至recommend_feeds成功\n")
+
+        except Exception as e:
+            Common.logger(log_type).error("get_recommend异常:{}".format(e))
+
+    # Download / upload
+    @classmethod
+    def download_publish(cls, log_type, env):
+        """
+        Download / upload videos
+        Test environment: env == dev
+        Production environment: env == prod
+        """
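+        # Note: each call processes at most one matching row of the recommend_feeds
+        # sheet and then returns; run_download_publish() keeps calling this method
+        # until only the header row is left.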
+        try:
+            recommend_feeds_sheet = Feishu.get_values_batch(log_type, "bszf", "CcHgO7")
+            for i in range(1, len(recommend_feeds_sheet)):
+                download_video_id = recommend_feeds_sheet[i][2]
+                download_video_title = recommend_feeds_sheet[i][3]
+                download_video_play_cnt = recommend_feeds_sheet[i][4]
+                download_video_comment_cnt = recommend_feeds_sheet[i][7]
+                download_video_like_cnt = recommend_feeds_sheet[i][5]
+                download_video_share_cnt = recommend_feeds_sheet[i][6]
+                download_video_send_time = recommend_feeds_sheet[i][8]
+                download_user_name = recommend_feeds_sheet[i][9]
+                download_user_id = recommend_feeds_sheet[i][10]
+                download_head_url = recommend_feeds_sheet[i][11]
+                download_cover_url = recommend_feeds_sheet[i][12]
+                download_video_url = recommend_feeds_sheet[i][13]
+
+                Common.logger(log_type).info("正在判断第{}行", i + 1)
+                Common.logger(log_type).info("download_video_title:{}", download_video_title)
+                Common.logger(log_type).info("download_video_send_time:{}", download_video_send_time)
+                Common.logger(log_type).info("download_video_url:{}", download_video_url)
+                # Common.logger(log_type).info("download_video_play_cnt:{}", download_video_play_cnt)
+                # Common.logger(log_type).info("download_video_id:{}", download_video_id)
+                # Common.logger(log_type).info("download_video_comment_cnt:{}", download_video_comment_cnt)
+                # Common.logger(log_type).info("download_video_like_cnt:{}", download_video_like_cnt)
+                # Common.logger(log_type).info("download_video_share_cnt:{}", download_video_share_cnt)
+                # Common.logger(log_type).info("download_user_name:{}", download_user_name)
+                # Common.logger(log_type).info("download_user_id:{}", download_user_id)
+                # Common.logger(log_type).info("download_head_url:{}", download_head_url)
+                # Common.logger(log_type).info("download_cover_url:{}", download_cover_url)
+
+                # Skip empty rows
+                if download_video_id is None or download_video_title is None or download_video_play_cnt is None:
+                    Common.logger(log_type).warning("空行,略过\n")
+                # Filter out titles containing sensitive words
+                elif any(word in download_video_title for word in cls.sensitive_words(log_type)):
+                    Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
+                    Common.logger(log_type).info("视频已中敏感词,删除成功\n")
+                    return
+                # Deduplicate against the downloaded-videos sheet
+                elif str(download_video_id) in [n for m in Feishu.get_values_batch(log_type, "bszf", "440018")
+                                                for n in m]:
+                    Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
+                    Common.logger(log_type).info("该视频已下载,删除成功\n")
+                    return
+                # Meets the download rules
+                else:
+                    # Download the video
+                    Common.download_method(log_type=log_type, text="video",
+                                           d_name=str(download_video_title), d_url=str(download_video_url))
+                    # Get the video resolution and duration
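+                    # get_video_info_from_local() is assumed to return [width, height, duration]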
+                    video_info = cls.get_video_info_from_local("./videos/" + download_video_title + "/video.mp4")
+                    download_video_resolution = str(video_info[0]) + "*" + str(video_info[1])
+                    download_video_duration = video_info[2]
+
+                    # Duration < 60s: delete it directly
+                    if int(download_video_duration) < 60:
+                        shutil.rmtree("./videos/" + download_video_title + "/")
+                        Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
+                        Common.logger(log_type).info("时长:{}<60秒,删除成功\n", int(download_video_duration))
+                        return
+                    # Portrait video: skip the upload and record it in the portrait-video sheet
+                    elif int(video_info[0]) < int(video_info[1]):
+                        # Delete the record from the recommend_feeds sheet
+                        Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
+                        Common.logger(log_type).info("宽:{}<高:{},删除成功", int(video_info[0]), int(video_info[1]))
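+                        # Note: the files already downloaded to ./videos/ are not removed in this branch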
+
+                        # Add to the portrait-video sheet
+                        time.sleep(1)
+                        Feishu.insert_columns(log_type, "bszf", "dAcOWt", "ROWS", 1, 2)
+                        # Write the data into the first row of the portrait-video sheet
+                        upload_time = int(time.time())
+                        values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
+                                   "推荐榜",
+                                   str(download_video_id),
+                                   str(download_video_title),
+                                   download_video_play_cnt,
+                                   download_video_like_cnt,
+                                   download_video_share_cnt,
+                                   download_video_comment_cnt,
+                                   int(download_video_duration),
+                                   str(download_video_resolution),
+                                   str(download_video_send_time),
+                                   str(download_user_name),
+                                   str(download_user_id),
+                                   str(download_head_url),
+                                   str(download_cover_url),
+                                   str(download_video_url)]]
+                        time.sleep(1)
+                        Feishu.update_values(log_type, "bszf", "dAcOWt", "A2:P2", values)
+                        Common.logger(log_type).info("写入竖版视频表成功\n")
+                        return
+                    else:
+                        # Download the cover image
+                        Common.download_method(log_type=log_type, text="cover",
+                                               d_name=str(download_video_title), d_url=str(download_cover_url))
+                        # Save the video info to "./videos/{download_video_title}/info.txt"
+                        with open("./videos/" + download_video_title
+                                  + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
+                            f_a.write(str(download_video_id) + "\n" +
+                                      str(download_video_title) + "\n" +
+                                      str(int(download_video_duration)) + "\n" +
+                                      str(download_video_play_cnt) + "\n" +
+                                      str(download_video_comment_cnt) + "\n" +
+                                      str(download_video_like_cnt) + "\n" +
+                                      str(download_video_share_cnt) + "\n" +
+                                      str(download_video_resolution) + "\n" +
+                                      str(int(time.mktime(
+                                          time.strptime(download_video_send_time, "%Y/%m/%d %H:%M:%S")))) + "\n" +
+                                      str(download_user_name) + "\n" +
+                                      str(download_head_url) + "\n" +
+                                      str(download_video_url) + "\n" +
+                                      str(download_cover_url) + "\n" +
+                                      "benshanzhufu")
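+                        # The field order above is assumed to be what Publish.upload_and_publish()
+                        # reads back from info.txt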
+                        Common.logger(log_type).info("==========视频信息已保存至info.txt==========")
+
+                        # Upload the video
+                        Common.logger(log_type).info("开始上传视频:{}".format(download_video_title))
+                        our_video_id = Publish.upload_and_publish(log_type, env, "play")
+                        our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
+                        Common.logger(log_type).info("视频上传完成:{}", download_video_title)
+
+                        # Save the video ID to the Feishu sheet: https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?sheet=440018
+                        Common.logger(log_type).info("保存视频ID至云文档:{}", download_video_title)
+                        # Insert a new first row into the video-ID sheet
+                        Feishu.insert_columns(log_type, "bszf", "440018", "ROWS", 1, 2)
+                        # Write the data into the first row of the video-ID sheet
+                        upload_time = int(time.time())
+                        values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
+                                   "推荐榜",
+                                   str(download_video_id),
+                                   str(download_video_title),
+                                   our_video_link,
+                                   download_video_play_cnt,
+                                   download_video_comment_cnt,
+                                   download_video_like_cnt,
+                                   download_video_share_cnt,
+                                   int(download_video_duration),
+                                   str(download_video_resolution),
+                                   str(download_video_send_time),
+                                   str(download_user_name),
+                                   str(download_user_id),
+                                   str(download_head_url),
+                                   str(download_cover_url),
+                                   str(download_video_url)]]
+                        time.sleep(1)
+                        Feishu.update_values(log_type, "bszf", "440018", "E2:V2", values)
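+                        # Columns A-D of sheet 440018 are assumed to be filled elsewhere,
+                        # which is why this write starts at column E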
+
+                        # Delete rows or columns; the dimension can be ROWS or COLUMNS
+                        Feishu.dimension_range(log_type, "bszf", "CcHgO7", "ROWS", i + 1, i + 1)
+                        Common.logger(log_type).info("视频:{},下载/上传成功\n", download_video_title)
+                        return
+
+        except Exception as e:
+            Common.logger(log_type).error("download_publish异常:{}", e)
+
+    # Run download / upload
+    @classmethod
+    def run_download_publish(cls, log_type, env):
+        try:
+            while True:
+                time.sleep(1)
+                recommend_feeds_sheet = Feishu.get_values_batch(log_type, "bszf", "CcHgO7")
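+                # Only the header row left means every feed row has been processed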
+                if len(recommend_feeds_sheet) == 1:
+                    Common.logger(log_type).info("下载/上传完成\n")
+                    break
+                else:
+                    cls.download_publish(log_type, env)
+                    time.sleep(random.randint(5, 10))
+        except Exception as e:
+            Common.logger(log_type).error("run_download_publish异常:{}", e)
+
+
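+# Local test entry; the production entry point is main/run_recommend.py (see README)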
+if __name__ == "__main__":
+    recommend = Recommend()
+    recommend.get_recommend("recommend")
+    recommend.run_download_publish("recommend", "dev")

+ 0 - 74
main/run.py

@@ -1,74 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/25
-import datetime
-import os
-import sys
-import time
-sys.path.append(os.getcwd())
-from main.common import Common
-from main.download import BSZF
-
-
-class Main:
-    @classmethod
-    def download_job_dev(cls):
-        """
-        测试环境脚本
-        """
-        if len(BSZF.download_video_list) >= 10:
-            Common.crawler_log().info("已下载视频数:{}".format(len(BSZF.download_video_list)))
-            time.sleep(1800)
-        else:
-            Common.crawler_log().info("开始抓取本山祝福视频\n")
-            BSZF.get_recommend()
-            BSZF.download_video("dev")
-
-        # 删除多余日志
-        Common.del_logs()
-        # 统计累计下载数量
-        Common.benshanzhufu_download_count()
-
-    @classmethod
-    def main_dev(cls):
-        while True:
-            while True:
-                main_dev_time = datetime.datetime.now()
-                if main_dev_time.hour >= 15:
-                    cls.download_job_dev()
-                else:
-                    break
-
-    @classmethod
-    def download_job_prod(cls):
-        """
-        正式环境脚本
-        """
-        if len(BSZF.download_video_list) >= 200:
-            Common.crawler_log().info("已下载视频数:{}".format(len(BSZF.download_video_list)))
-            time.sleep(1800)
-        else:
-            Common.crawler_log().info("开始抓取本山祝福视频\n")
-            BSZF.get_recommend()
-            BSZF.download_video("prod")
-
-        # 删除多余日志
-        Common.del_logs()
-        # 统计累计下载数量
-        Common.benshanzhufu_download_count()
-
-    @classmethod
-    def main(cls):
-        while True:
-            while True:
-                main_time = datetime.datetime.now()
-                if main_time.hour >= 10:
-                    cls.download_job_prod()
-                else:
-                    break
-
-
-if __name__ == "__main__":
-    main = Main()
-    main.main()
-    # main.main_dev()

+ 47 - 0
main/run_recommend.py

@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/25
+import datetime
+import os
+import random
+import sys
+import time
+
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.recommend import Recommend
+
+
+class Main:
+
+    @classmethod
+    def main(cls):
+        while True:
+            while True:
+                main_time = datetime.datetime.now()
+                if 12 >= main_time.hour >= 8:
+                    # Crawl videos
+                    Common.logger("recommend").info("开始抓取本山祝福视频\n")
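+                    # Fetch up to 10 pages; get_recommend() is assumed to advance
+                    # Recommend.visitor_key / Recommend.page between calls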
+                    for i in range(10):
+                        Common.logger("recommend").info("正在抓取第{}页\n", i+1)
+                        Recommend.get_recommend("recommend")
+                    # Reset pagination state
+                    Recommend.visitor_key = ""
+                    Recommend.page = 1
+                    # Download / upload
+                    Common.logger("recommend").info("开始下载/上传\n")
+                    Recommend.run_download_publish("recommend", "prod")
+                    # Clean up old logs
+                    Common.logger("recommend").info("开始清除日志")
+                    Common.del_logs("recommend")
+                    # Sleep
+                    Common.logger("recommend").info("休眠 30 - 60 分钟\n")
+                    time.sleep(random.randint(1800, 3600))
+                    break
+                else:
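+                    # Outside the 8-12 window: break the inner loop; the outer loop
+                    # re-enters it immediately, so this busy-waits until the crawl window opens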
+                    break
+
+
+if __name__ == "__main__":
+    main = Main()
+    main.main()

+ 0 - 3
txt/__init__.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/25

+ 0 - 0
txt/benshanzhufu_feeds.txt


+ 0 - 0
txt/benshanzhufu_videoid.txt


+ 0 - 3
videos/__init__.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/25

+ 0 - 12
抓取规则.txt

@@ -1,12 +0,0 @@
-==========2022/4/29===========
-- 增加敏感词过滤
-- 每天 11 点开始爬取,上限 200 条
-- 上传账号:[20631241, 20631242, 20631244, 20631245, 20631246, 20631247]
-==============================
-
-
-==========2022/4/27===========
-- 全爬
-- 每天9-12点进行爬取
-- 上传账号:20631241 / 20631242
-==============================