qingqu-git 3 years ago
commit
6dda8b1f2b

+ 63 - 0
.gitignore

@@ -0,0 +1,63 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+.DS_Store
+.idea/
+

+ 1 - 0
README.md

@@ -0,0 +1 @@
+Crawler for the Kuaishou and Weishi WeChat mini programs

+ 3 - 0
logs/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30

+ 3 - 0
main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/29

+ 180 - 0
main/common.py

@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30
+"""
+Shared helpers: log creation / log cleanup / download helpers / file reading / download counters
+"""
+from datetime import date, timedelta
+import datetime
+import logging
+import os
+import time
+import requests
+import urllib3
+
+
+class Common:
+    # Current time, e.g. <class 'datetime.datetime'>  2022-04-14 20:13:51.244472
+    now = datetime.datetime.now()
+    # Yesterday, e.g. <class 'str'>  2022-04-13
+    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
+    # Today, e.g. <class 'datetime.date'>  2022-04-14
+    today = date.today()
+    # Tomorrow, e.g. <class 'str'>  2022-04-15
+    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
+
+    @staticmethod
+    def crawler_log():
+        """
+        生成 log 日志
+        """
+        # 日志路径
+        log_dir = "./logs/"
+        log_path = os.getcwd() + os.sep + log_dir
+        if not os.path.isdir(log_path):
+            os.makedirs(log_path)
+
+        # 日志参数
+        log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        date_format = "%Y-%m-%d %p %H:%M:%S"
+        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + '.log'
+
+        # 日志初始化
+        logging.basicConfig(filename=log_path + log_name, level=logging.INFO, format=log_format, datefmt=date_format)
+        crawler_logger = logging.getLogger("crawler-log")
+
+        return crawler_logger
+
+    @classmethod
+    def del_logs(cls):
+        """
+        清除冗余日志文件
+        :return: 保留最近 7 个日志
+        """
+        log_dir = "./logs/"
+        all_files = sorted(os.listdir(log_dir))
+        all_logs = []
+        for log in all_files:
+            name = os.path.splitext(log)[-1]
+            if name == ".log":
+                all_logs.append(log)
+
+        if len(all_logs) <= 7:
+            pass
+        else:
+            for file in all_logs[:len(all_logs) - 7]:
+                os.remove(log_dir + file)
+        cls.crawler_log().info("清除冗余日志成功")
+
+    @classmethod
+    def download_method(cls, text, d_name, d_url):
+        """
+        下载封面:text == "cover" ; 下载视频:text == "video"
+        需要下载的视频标题:d_title
+        视频封面,或视频播放地址:d_url
+        下载保存路径:"./files/{d_title}/"
+        """
+        # 首先创建一个保存该视频相关信息的文件夹
+        video_dir = "./videos/" + d_name + "/"
+        if not os.path.exists(video_dir):
+            os.mkdir(video_dir)
+
+        # 下载视频
+        if text == "video":
+            # 需要下载的视频地址
+            video_url = d_url
+            # 视频名
+            video_name = "video.mp4"
+
+            # 下载视频
+            urllib3.disable_warnings()
+            response = requests.get(video_url, stream=True, verify=False)
+            try:
+                with open(video_dir + video_name, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=10240):
+                        f.write(chunk)
+                cls.crawler_log().info("==========视频下载完成==========")
+            except Exception as e:
+                cls.crawler_log().info("视频下载失败:{}".format(e))
+            # except FileNotFoundError:
+            #     cls.kuaishou_log().info("==========视频下载失败==========")
+
+        # 下载封面
+        elif text == "cover":
+            # 需要下载的封面地址
+            cover_url = d_url
+            # 封面名
+            cover_name = "image.jpg"
+
+            # 下载封面
+            urllib3.disable_warnings()
+            response = requests.get(cover_url, verify=False)
+            try:
+                with open(video_dir + cover_name, "wb") as f:
+                    f.write(response.content)
+                cls.crawler_log().info("==========封面下载完成==========")
+            except Exception as e:
+                cls.crawler_log().info("封面下载失败:{}".format(e))
+            # except FileNotFoundError:
+            #     cls.kuaishou_log().info("==========封面下载失败==========")
+
+    @staticmethod
+    def read_txt(t_name):
+        """
+        读取 txt 文件
+        :param t_name: 文件名
+        :return: 文件内容
+        """
+        with open("./txt/" + t_name, "r", encoding="utf8") as f:
+            return f.readlines()
+
+    @classmethod
+    def kuaishou_download_count(cls):
+        videoid_path = "./txt/kuaishou_videoid.txt"
+        count = 0
+        for count, line in enumerate(open(videoid_path, "rb").readlines()):
+            count += 1
+        cls.crawler_log().info('累计下载视频数: {}\n'.format(count))
+
+    @classmethod
+    def weishi_download_count(cls):
+        videoid_path = "./txt/weishi_videoid.txt"
+        count = 0
+        for count, line in enumerate(open(videoid_path, "rb").readlines()):
+            count += 1
+        cls.crawler_log().info('累计下载视频数: {}\n'.format(count))
+
+    @classmethod
+    def kuaishou_today_download_count(cls):
+        """
+        统计快手渠道当日下载视频数
+        :return:
+        """
+        # 创建空文件
+        with open("./txt/" + str(cls.today) + "_kuaishou_videoid.txt", "a") as f:
+            f.write("")
+        videoid_path = "./txt/" + str(cls.today) + "_kuaishou_videoid.txt"
+        count = 0
+        for count, line in enumerate(open(videoid_path, "rb").readlines()):
+            count += 1
+        return count
+
+    @classmethod
+    def del_yesterday_kuaishou_videoid_txt(cls):
+        """
+        删除快手渠道昨日下载视频数的 txt 文件
+        :return:
+        """
+        yesterday_kuaishou_videoid_txt_dir = "./txt/"
+        all_files = sorted(os.listdir(yesterday_kuaishou_videoid_txt_dir))
+        for file in all_files:
+            name = os.path.splitext(file)[0]
+            if name == cls.yesterday + "_kuaishou_videoid":
+                os.remove(yesterday_kuaishou_videoid_txt_dir + file)
+        Common.crawler_log().info("删除快手昨天下载统计文件成功")
+
+
+if __name__ == "__main__":
+    common = Common()
+    common.del_yesterday_kuaishou_videoid_txt()
+    print(common.kuaishou_today_download_count())

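Note: kuaishou_download_count, weishi_download_count and kuaishou_today_download_count in main/common.py all repeat the same line-counting pattern. A minimal sketch of a shared helper (count_lines is a hypothetical name, not part of this commit):

    import os

    def count_lines(path):
        # Count records in a one-ID-per-line txt file; a missing file counts as 0.
        if not os.path.exists(path):
            return 0
        with open(path, "rb") as f:
            return sum(1 for _ in f)

Each counter would then reduce to, e.g., count_lines("./txt/kuaishou_videoid.txt").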
+ 260 - 0
main/demo.py

@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/31
+from datetime import date, timedelta
+import datetime
+import json
+import re
+import time
+
+import requests
+import urllib3
+
+
+class Demo:
+    @classmethod
+    def demo1(cls):
+        download_video_resolution = "720*1280"
+        download_video_width = download_video_resolution.split("*")[0]
+        download_video_height = download_video_resolution.split("*")[-1]
+        print(download_video_resolution)
+        print(download_video_width)
+        print(download_video_height)
+
+    @classmethod
+    def time(cls):
+        # Recommended: millisecond timestamp
+        time1 = int(time.time()*1000)
+        print(time1)
+        # Not recommended: rounds to whole seconds first, so the ms part is lost
+        time2 = round(time.time())*1000
+        print(time2)
+
+        # Current time
+        now = datetime.datetime.now()
+        print(type(now))
+        print(f"now:{now}")
+        # Yesterday
+        yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
+        print(type(yesterday))
+        print(f"昨天:{yesterday}")
+        # Today
+        today = date.today()
+        print(type(today))
+        print(f"今天:{today}")
+        # Tomorrow
+        tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
+        print(type(tomorrow))
+        print(f"明天:{tomorrow}")
+
+    @classmethod
+    def get_douyin_feeds(cls):
+        """
+        Fetch the Douyin feed video list from https://www.douyin.com
+        """
+        url = "https://www.douyin.com/aweme/v1/web/tab/feed/?"
+        params = {
+            "device_platform": "webapp",
+            "aid": "6383",
+            "channel": "channel_pc_web",
+            "count": "10",
+            "refresh_index": "4",
+            "video_type_select": "0",
+            "version_code": "170400",
+            "version_name": "17.4.0",
+            "cookie_enabled": "true",
+            "screen_width": "1920",
+            "screen_height": "1080",
+            "browser_language": "zh-CN",
+            "browser_platform": "MacIntel",
+            "browser_name": "Chrome",
+            "browser_version": "99.0.4844.84",
+            "browser_online": "true",
+            "engine_name": "Blink",
+            "engine_version": "99.0.4844.84",
+            "os_name": "Mac OS",
+            "os_version": "10.15.7",
+            "platform": "PC",
+            "cpu_core_num": "8",
+            "device_memory": "8",
+            "downlink": "10",
+            "effective_type": "4g",
+            "round_trip_time": "50",
+            "msToken": "304uY1lV7HmHkR1G1QUaFqg0yrL5_WqrFOR8qCbl3hOsl8aSNI_18vIfpTGNhNRVZx7ysRiCHpcBKhpujTsbbC"
+                       "ZEDbG7pllZzlO3tlrBOs2TFYUgJdsvbw==",
+            "X-Bogus": "DFSzswVYPVsANat/Sl8eGc3WxM23",
+            "_signature": "qaJgTwAAy.aVqLslyfC7aKmiYF"
+        }
+        cookies = {
+            "_tea_utm_cache_6383": "undefined",
+            "ttwid": "1%7CETZk6sDMDSBgewWhKJXghFN4cwXTz0fLuhsLEngD_Nk%7C1648812136%7Cfa66fa81ccfe3f552f4"
+                     "e8b8327e72cbbc5e897141c25a5fcd32defaed1466d3e",
+            "passport_csrf_token": "e2d0f1ed9fd22463be9f389137a781ce",
+            "passport_csrf_token_default": "e2d0f1ed9fd22463be9f389137a781ce",
+            "s_v_web_id": "verify_l1h7nzwr_ABN0FA2f_BTrM_4zSH_8WPN_2KY2iZFmbhE2",
+            "_tea_utm_cache_1300": "undefined",
+            "_tea_utm_cache_2285": "undefined",
+            "ttcid": "3220eeda36a244beadd32a4b44d2044b31",
+            "douyin.com": "",
+            "__ac_nonce": "06247fb0f00f050ccc9b2",
+            "__ac_signature": "_02B4Z6wo00f01AN7DoAAAIDB5nv.qI7xGZQDWwoAAGKfo4rd5YCAYF8o5PyppIpsdKxV0k2NerO"
+                              "f1VEQr3eJftkpgon9tcveDVpmfY555vzTTvRznegS1ax3KJXnoav2ZdEoYzwR3wDszPCk5d",
+            "strategyABtestKey": "1648865029.449",
+            "AB_LOGIN_GUIDE_TIMESTAMP": "1648865029279",
+            "THEME_STAY_TIME": "299621",
+            "IS_HIDE_THEME_CHANGE": "1",
+            "home_can_add_dy_2_desktop": "0",
+            "tt_scid": "vUl8CBW1SMQp2l5GmUIja5A6ziY1LByrsoN.P-wvKuutiB8ftvlfK.9ZEeehNC5u821d",
+            "pwa_guide_count": "2",
+            "msToken": "EHCmp9Qw7PAChI3do-MQPjOR29hf4ZFLYNrGl89HkFKdO5Iwb8n7z5fpETrgim2zFTIkGT"
+                       "ObOxH7HCrHCLVEX5eAuwAS1A2sjKH4MHEfjfPqA06Lo4v9Pw==",
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.get(url=url, params=params, cookies=cookies, verify=False)
+            # response = json.loads(r.content.decode("utf8"))
+            print(r)
+            print(type(r.text))
+            print(r.text)
+        except Exception as e:
+            print(e)
+
+    @classmethod
+    def demo2(cls):
+        s = "0"
+        print(int(int(s) / 10))
+
+    @classmethod
+    def get_weishi_feeds(cls):
+        url = "https://api.weishi.qq.com/trpc.weishi.weishi_h5_proxy.weishi_h5_proxy/WxminiGetFeedList"
+        cookies = {
+            "wesee_authtype": "3",
+            "wesee_openid":	"oWGa05FrwkuUvT-4n1qGeQuhVsc8",
+            "wesee_openkey": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf189e2a5c1d532eeff172bc21cf2"
+                             "6230941ccbc10243a7879e8165ca608c17060de606a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
+            "wesee_personid": "1593522421826902",
+            "wesee_refresh_token": "",
+            "wesee_access_token": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf18"
+                                  "9e2a5c1d532eeff172bc21cf26230941ccbc10243a7879e8165ca608c17060de6"
+                                  "06a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
+            "wesee_thr_appid": "wx75ee9f19b93e5c46",
+            "wesee_ichid": "8"
+        }
+        json_data = {
+            "req_body": {
+                "requestType": 16,
+                "isrefresh": 0,
+                "isfirst": 0,
+                "attachInfo": "",
+                "scene_id": 22,
+                "requestExt": {
+                    "mini_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
+                    "notLogin-personid": "1593522421826902"
+                }
+            },
+            "req_header": {
+                "mapExt": "{\"imageSize\":\"480\",\"adaptScene\":\"PicHDWebpLimitScene\"}"
+            }
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=url, cookies=cookies, json=json_data, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            feeds = response["rsp_body"]["feeds"]
+            for feed in feeds:
+                print(feed)
+        except Exception as e:
+            print(e)
+
+    @classmethod
+    def edit_str(cls):
+        title_list = ["#上海战疫 上海累计感染超20万!这条被淹没的热搜,令全网泪目… 疫情一定要攻克,但所有人都不该遗忘这些弱者。#上海累计报告本土阳性感染者超20万例 #农民工",
+                      "#重庆地火村 #地火村 #旅行",
+                      "第79集 | 湖南最值得去的六个景区,每一个都是绝色…… #快手带你去旅行 #旅游胜地 #旅游",
+                      "霸王条款不废除,断供有多可怕。 #涨知识 #生活小常识 # 生活常识",
+                      "秦始皇还活着?地宫中有不明物体缓缓移动 #历史 #秦始皇 #新春寄语  @快手热点(O40300129)",
+                      "#夏日荷花  #国花牡丹 #昙花一现",
+                      "国内最良心的8个景区,这才是景区最该有的样子,看看你去过几个? #旅行  #旅游 ",
+                      "狗子呆在水里三天三夜,终于练成捕鱼神功,一口一个大鲶鱼 #狗狗  #神奇动物  #快手放映室  @快手热点(O40300129) ",
+                      "#集结吧光合创作者  养鸡小伙:喂鸡摆出各种造型,被称为鸡司令。",
+                      "89岁农民老艺人自食其力,街头卖艺表演“捏碎碗片”绝技,现场听到咔吱咔吱响,人狠功夫硬!这功夫已失传,以后再看不到了!#集结吧光合创作者 #农民 #街头表演  @快手光合作者助手(O40300118)  @快手热点(O40300129)  @我要上热门(O1907752910)",
+                      "我国最贵最有名的三棵树,你知道哪三棵吗?#旅游 #旅行攻略 #黄山迎客松",
+                      "潘长江带来热舞,蔡明 郭达也来了!太嗨了!歌词太棒了! @快手涨粉助手(O1815060199)  @快手热点(O40300129)  @快手平台帐号(O90041) #潘长江 #搞笑 #集结吧光合创作者",
+                      "#带你看世界 给大家带来一期烟花盛宴,希望大家能够喜欢,带上你的那个她一起来看吧 #烟花 #视觉震撼"
+                      ]
+        for title in title_list:
+            title_split1 = title.split(" #")
+            if title_split1[0] != "":
+                title1 = title_split1[0]
+            else:
+                title1 = title_split1[0]
+
+            title_split2 = title1.split(" #")
+            if title_split2[0] != "":
+                title2 = title_split2[0]
+            else:
+                title2 = title_split2[-1]
+
+            title_split3 = title2.split("@")
+            if title_split3[0] != "":
+                title3 = title_split3[0]
+            else:
+                title3 = title_split3[-1]
+
+            print(title3)
+            title = title3.replace("\n", "").replace("#", "").replace("/", "").replace("\r", "")
+            print(title)
+
+        # new_title = re.compile(r'(#)(.*)(#)')
+        # print(new_title.sub(r'', title))
+
+    @classmethod
+    def kuaishou_sensitive_words(cls):
+        sensitive_words = [
+            "汽车",
+            "电影解说",
+            "放映室",
+            "解说电影",
+            "断供",
+        ]
+        return sensitive_words
+
+    @classmethod
+    def sensitive_words(cls):
+        title_list = ["#上海战疫 上海累计感染超20万!这条被淹没的热搜,令全网泪目… 疫情一定要攻克,但所有人都不该遗忘这些弱者。#上海累计报告本土阳性感染者超20万例 #农民工",
+                      "#重庆地火村 #地火村 #旅行",
+                      "第79集 | 湖南最值得去的六个景区,每一个都是绝色…… #快手带你去旅行 #旅游胜地 #旅游",
+                      "霸王条款不废除,断供有多可怕。 #涨知识 #生活小常识 # 生活常识",
+                      "秦始皇还活着?地宫中有不明物体缓缓移动 #历史 #秦始皇 #新春寄语  @快手热点(O40300129)",
+                      "#夏日荷花  #国花牡丹 #昙花一现",
+                      "国内最良心的8个景区,这才是景区最该有的样子,看看你去过几个? #旅行  #旅游 ",
+                      "狗子呆在水里三天三夜,终于练成捕鱼神功,一口一个大鲶鱼 #狗狗  #神奇动物  #快手放映室  @快手热点(O40300129) ",
+                      "#集结吧光合创作者  养鸡小伙:喂鸡摆出各种造型,被称为鸡司令。",
+                      "89岁农民老艺人自食其力,街头卖艺表演“捏碎碗片”绝技,现场听到咔吱咔吱响,人狠功夫硬!这功夫已失传,以后再看不到了!#集结吧光合创作者 #农民 #街头表演  @快手光合作者助手(O40300118)  @快手热点(O40300129)  @我要上热门(O1907752910)",
+                      "我国最贵最有名的三棵树,你知道哪三棵吗?#旅游 #旅行攻略 #黄山迎客松",
+                      "潘长江带来热舞,蔡明 郭达也来了!太嗨了!歌词太棒了! @快手涨粉助手(O1815060199)  @快手热点(O40300129)  @快手平台帐号(O90041) #潘长江 #搞笑 #集结吧光合创作者",
+                      "#带你看世界 给大家带来一期烟花盛宴,希望大家能够喜欢,带上你的那个她一起来看吧 #烟花 #视觉震撼"
+                      ]
+        print(cls.kuaishou_sensitive_words())
+        for title in title_list:
+            # kuaishou_sensitive_words() returns a fresh list on every call, so
+            # calling .remove() on it had no effect; collect matches instead
+            matched = [word for word in cls.kuaishou_sensitive_words() if word in title]
+            if matched:
+                print(f"敏感词:{matched}")
+                print(f"敏感词视频:{title}")
+            else:
+                print(f"正常视频:{title}")
+
+
+if __name__ == "__main__":
+    demo = Demo()
+    # demo.demo1()
+    demo.time()
+    # demo.get_douyin_feeds()
+    # demo.demo2()
+    # demo.get_weishi_feeds()
+    # demo.edit_str()
+    # demo.sensitive_words()
+
+    pass

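Note: the split-based title cleaning in demo.py (and in main/download_kuaishou.py) can be expressed as a single regex pass, which the commented-out re.compile line hints at. A minimal sketch (clean_title is a hypothetical helper, not part of this commit):

    import re

    def clean_title(raw):
        # Split out "#topic" tags and "@user(id)" mentions, keep the first
        # non-empty remaining segment, then strip control characters.
        parts = [p.strip() for p in re.split(r"#\S*|@\S+", raw) if p.strip()]
        text = parts[0] if parts else ""
        return text.replace("\n", "").replace("\r", "").replace("/", "")

    print(clean_title("秦始皇还活着?地宫中有不明物体缓缓移动 #历史 #秦始皇 @快手热点(O40300129)"))
    # -> 秦始皇还活着?地宫中有不明物体缓缓移动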
+ 457 - 0
main/download_kuaishou.py

@@ -0,0 +1,457 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/29
+"""
+Download rule-matching videos from the Kuaishou short-video WeChat mini program.
+"""
+import json
+import time
+
+import requests
+import urllib3
+from main.common import Common
+from main.publish import Publish
+
+
+class KuaiShou:
+    # 已下载视频列表
+    download_video_list = []
+
+    @classmethod
+    def kuaishou_sensitive_words(cls):
+        sensitive_words = [
+            "集结吧光合创作者",
+            "电影解说",
+            "快来露两手",
+            "分享家常美食教程",
+            "光合作者助手",
+            "创作者中心",
+            "创作者学院",
+            "娱乐星熠计划",
+            "解说电影",
+            "电影剪辑",
+            "放映室",
+            "老剧",
+            "影视剪辑",
+            "精彩片段",
+            "冬日影娱大作战",
+            "春日追剧计划单",
+            "影视解说",
+            "中视频影视混剪计划",
+            "众志成城共抗疫情",
+            "我在追好剧",
+            "娱乐星灿计划",
+            "电影",
+            "电视剧",
+            "毛泽东",
+            "毛主席",
+            "周恩来",
+            "林彪",
+            "习近平",
+            "习大大",
+            "彭丽媛",
+            "怀旧经典影视",
+        ]
+        return sensitive_words
+
+    @staticmethod
+    def kuaishou_download_rule(d_duration, d_width, d_height,
+                               d_play_cnt, d_like_cnt, d_share_cnt):
+        """
+        下载视频的基本规则
+        :param d_duration: 时长
+        :param d_width: 宽
+        :param d_height: 高
+        :param d_play_cnt: 播放量
+        :param d_like_cnt: 点赞量
+        :param d_share_cnt: 分享量
+        :return: 满足规则,返回 True;反之,返回 False
+        """
+        return (60 <= int(float(d_duration)) <= 600
+                and (int(d_width) >= 720 or int(d_height) >= 720)
+                and int(d_play_cnt) >= 50000
+                and int(d_like_cnt) >= 50000
+                and int(d_share_cnt) >= 2000)
+
+    @classmethod
+    def kuaishou_get_recommend(cls):
+        """
+        Fetch the video list from the Kuaishou mini-program home-page feed:
+            1. Deduplicate against kuaishou_videoid.txt
+            2. Deduplicate against kuaishou_feeds.txt
+            3. Append the video info to kuaishou_feeds.txt
+        """
+        url = "https://wxmini-api.uyouqu.com/rest/wd/wechatApp/feed/recommend"
+        params = {
+            "__NS_sig3": "e6f6b281ea31e3d7d1bbb8b91f662576fc25f7c3a7a7a5a5aaaba8b2",
+            "__NS_sig3_origin": "3sCt3iAAAAAAAAAAAAAAAwEQBv2b8ewCwkZKaiAAAAAPg0soi"
+                                "e7GiOlU vF4zPrG1Nl6xvaoBgFd3MwTzOed9w=="
+        }
+        cookies = {
+            "did": "wxo_05f915ac6b1deca87db36cea1a0fd18fae6c",
+            "preMinaVersion": "v3.109.0",
+            "sid": "kuaishou.wechat.app",
+            "appId": "ks_wechat_small_app_2",
+            "clientid": "13",
+            "client_key": "f60ac815",
+            "kpn": "WECHAT_SMALL_APP",
+            "kpf": "OUTSIDE_ANDROID_H5",
+            "language": "zh_CN",
+            "smallAppVersion": "v3.109.0",
+            "session_key": "123005bcc551a92aac29cdb96190251c9f492c29d4ba6c502dc"
+                           "0d2f8b8d18df356a2f7a22d6924d1dd34b8554a64af49b1bb1a"
+                           "1236cd2f69c25d4ac2a2531ebcd28c179da14b222023f9e111c"
+                           "c4d3b064ac7b0915d8c9fdaccb59e4048e96a5c38a32b2ce9f4abf628053001",
+            "unionid": "V2:1230b56c8337908c3eecba63142a58daca05535c1f14bf67d3d8"
+                       "85cace91a7db335c5572d204762d075f24aa84412e2955711a12bb9"
+                       "2bd9c2290489ba7a733708a4a446de83822205ab727650489dda0db"
+                       "9d2a226c5ddb66d88a1f1373283a3d3b959611d816660028053001",
+            "eUserStableOpenId": "12303325e8710eb802137c70fd1fb65997a4e5e33d82"
+                                 "cddd409d335d096e20873e07ee472090133bc7a67e5c"
+                                 "749da045d9a31a12da4c4c26181d432b873ec39432f4"
+                                 "10196c6c2220323d0e6b562d1b3786aefb352b4e509c"
+                                 "d96f3466b7b2e5e74b904a94c40792d928053001",
+            "openId": "o5otV45DcV1EUsWw4fAUk_iq0YSA",
+            "eOpenUserId": "124074b7726c996283f25044a42e2c7427e929cd6d968c5342"
+                           "330e61fc8939e57b0da4ffe21887f3abc8784175f73e1a267d"
+                           "671247273806f293f64c9c8c2adc00a21a12bb92bd9c229048"
+                           "9ba7a733708a4a446de8382220534aa79c69b74866bb09187e"
+                           "eceec880fa1e0fa421b7df8b3289dab603b17c4828053001",
+            "kuaishou.wechat.app_st": "ChZrdWFpc2hvdS53ZWNoYXQuYXBwLnN0ErAB8aO"
+                                      "EcB6jh4CMSJ-p_4BJFCId0PKNa_5IeFfeV_tj7q"
+                                      "CjdXK0y13CSte6-KHbNK9BPo6Rjy3OGny0sh4Zb"
+                                      "5AUl3Q_zqVXe2TunW7_F3nlTdJOdZ6iVIhPrHa1"
+                                      "CM0Y-cG9gS4FDDzTvejfWaTI0CbjfNN0RZXzYVE"
+                                      "AUVT_BNgUVDtYBbEY792gPylMfXxwxKMSzkhaDe"
+                                      "eaHkGCWUj62FGCFYQ9Fw2W3d7suCXFsNylqT4aE"
+                                      "s8oNwmycUiygfvfKuoXlHkbeSIgOhEFMZ3ArImS"
+                                      "vFY_OwLJDHak1iXRO8g5TwzHTvBT3WcoBTAB",
+            "passToken": "ChNwYXNzcG9ydC5wYXNzLXRva2VuEpABI42IhPCJHfFngXC3i-vF"
+                         "3daRTB-EtnAYyE6HpfWcPoZ6VSRDvKrom_RvltQ2zKk1T3_FJteb"
+                         "mv7ZzQLD7IicnTypaGoeflb7KQVrAv50Mp_JL4ObfBu_xTiwI53t"
+                         "bTlM6iML0G7DFd16K5z0jZZ1xECKVQQbk_vIqnseUujFIWAsKcDz"
+                         "BqqfnQNbUU5DzDUkGhKgKyzmNjRDxLfpDU5SPFhJmG0iIGBZ_Vd-"
+                         "7eT8i_Xit9ZPM-zdFpnRZFveFE9iplMg8Z06KAUwAQ",
+            "userId": "2845397958"
+        }
+        json_data = {
+            "thirdPartyUserId": 2845397958,
+            "photoId": "5250352807040393911",
+            "forwardUserId": 2845397958,
+            "count": 10,
+            "portal": 2,
+            "pageType": 2,
+            "needLivestream": "true",
+            "extraRequestInfo": "{\"scene\":1074,\"fid\":\"2845397958\","
+                                "\"sharerUserId\":\"2845397958\",\"curPhotoIndex\":0,"
+                                "\"adShow\":true,\"weChatAd\":{},\"page\":0}",
+            "pcursor": 0,
+            "sourceFrom": 2,
+        }
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=url, params=params, cookies=cookies, json=json_data, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            if "feeds" not in response:
+                Common.crawler_log().info("获取快手视频 list 出错:{},休眠 10s".format(response))
+                time.sleep(10)
+            else:
+                feeds = response["feeds"]
+                for i in range(len(feeds)):
+                    if "photoId" not in feeds[i]:
+                        photo_id = "0"
+                        Common.crawler_log().info("photo_id:{}".format(photo_id))
+                    else:
+                        photo_id = feeds[i]["photoId"]
+                        Common.crawler_log().info("photo_id:{}".format(photo_id))
+
+                    if "viewCount" not in feeds[i]:
+                        video_play_cnt = "0"
+                        Common.crawler_log().info("video_play_cnt:0")
+                    else:
+                        video_play_cnt = feeds[i]["viewCount"]
+                        Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
+
+                    if "likeCount" not in feeds[i]:
+                        video_like_cnt = "0"
+                        Common.crawler_log().info("video_like_cnt:0")
+                    else:
+                        video_like_cnt = feeds[i]["likeCount"]
+                        Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
+
+                    if "headUrl" not in feeds[i]:
+                        head_url = "0"
+                        Common.crawler_log().info("head_url:不存在")
+                    else:
+                        head_url = feeds[i]["headUrl"]
+                        Common.crawler_log().info("head_url:{}".format(head_url))
+
+                    if len(feeds[i]["coverUrls"]) == 0:
+                        cover_url = "0"
+                        Common.crawler_log().info("cover_url:不存在")
+                    else:
+                        cover_url = feeds[i]["coverUrls"][0]["url"]
+                        Common.crawler_log().info("cover_url:{}".format(cover_url))
+
+                    if len(feeds[i]["mainMvUrls"]) == 0:
+                        video_url = "0"
+                        Common.crawler_log().info("video_url:不存在")
+                    else:
+                        video_url = feeds[i]["mainMvUrls"][0]["url"]
+                        Common.crawler_log().info("video_url:{}".format(video_url))
+
+                    if "shareCount" not in feeds[i]:
+                        video_share_cnt = "0"
+                        Common.crawler_log().info("video_share_cnt:0")
+                    else:
+                        video_share_cnt = feeds[i]["shareCount"]
+                        Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
+
+                    if "width" not in feeds[i] or "height"not in feeds[i]:
+                        video_width = "0"
+                        video_height = "0"
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("无分辨率")
+                    else:
+                        video_width = feeds[i]["width"]
+                        video_height = feeds[i]["height"]
+                        video_resolution = str(video_width) + "*" + str(video_height)
+                        Common.crawler_log().info("video_resolution:{}".format(video_resolution))
+
+                    if "commentCount" not in feeds[i]:
+                        video_comment_cnt = "0"
+                        Common.crawler_log().info("video_comment_cnt:0")
+                    else:
+                        video_comment_cnt = feeds[i]["commentCount"]
+                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
+
+                    if "duration" not in feeds[i]:
+                        video_duration = "0"
+                        Common.crawler_log().info("video_duration:不存在")
+                    else:
+                        video_duration = int(int(feeds[i]["duration"])/1000)
+                        Common.crawler_log().info("video_duration:{}秒".format(video_duration))
+
+                    if "timestamp" not in feeds[i]:
+                        video_send_time = "0"
+                        Common.crawler_log().info("video_send_time:不存在")
+                    else:
+                        video_send_time = feeds[i]["timestamp"]
+                        Common.crawler_log().info("video_send_time:{}".format(
+                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)/1000))))
+
+                    user_name = feeds[i]["userName"].strip().replace("\n", "")\
+                        .replace("/", "").replace("快手", "").replace(" ", "")\
+                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")
+                    Common.crawler_log().info("user_name:{}".format(user_name))
+
+                    user_id = feeds[i]["userId"]
+                    Common.crawler_log().info("user_id:{}".format(user_id))
+
+                    # Strip topic tags and special characters from the title
+                    kuaishou_title = feeds[i]["caption"]
+                    title_split1 = kuaishou_title.split(" #")
+                    if title_split1[0] != "":
+                        title1 = title_split1[0]
+                    else:
+                        title1 = title_split1[-1]
+
+                    title_split2 = title1.split(" #")
+                    if title_split2[0] != "":
+                        title2 = title_split2[0]
+                    else:
+                        title2 = title_split2[-1]
+
+                    title_split3 = title2.split("@")
+                    if title_split3[0] != "":
+                        title3 = title_split3[0]
+                    else:
+                        title3 = title_split3[-1]
+
+                    video_title = title3.strip().replace("\n", "")\
+                        .replace("/", "").replace("快手", "").replace(" ", "")\
+                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
+                        .replace("#", "").replace(".", "。")
+
+                    Common.crawler_log().info("video_title:{}".format(video_title))
+
+                    # Deduplicate against kuaishou_videoid.txt
+                    photo_ids = Common.read_txt("kuaishou_videoid.txt")
+                    if photo_id in [p_id.strip() for p_id in photo_ids]:
+                        Common.crawler_log().info("该视频已下载:{}".format(video_title))
+                    else:
+                        Common.crawler_log().info("该视频未下载:{}".format(video_title))
+
+                        # Deduplicate against kuaishou_feeds.txt
+                        contents = Common.read_txt("kuaishou_feeds.txt")
+                        # If kuaishou_feeds.txt is empty, save directly
+                        if len(contents) == 0 and head_url != "0" \
+                                and cover_url != "0" and video_url != "0" \
+                                and video_duration != "0" and photo_id != "0":
+                            # Sensitive-word check
+                            if any(word in kuaishou_title
+                                   for word in cls.kuaishou_sensitive_words()):
+                                Common.crawler_log().info("视频已中敏感词:{}".format(kuaishou_title))
+                            else:
+                                basic_time = int(time.time())
+                                Common.crawler_log().info("添加视频信息至kuaishou_feeds.txt:{}".format(video_title))
+                                with open("./txt/kuaishou_feeds.txt", "a", encoding="utf8") as f_a:
+                                    f_a.write(str(basic_time) + " + " +
+                                              str(photo_id) + " + " +
+                                              str(video_play_cnt) + " + " +
+                                              str(video_title) + " + " +
+                                              str(video_duration) + " + " +
+                                              str(video_comment_cnt) + " + " +
+                                              str(video_like_cnt) + " + " +
+                                              str(video_share_cnt) + " + " +
+                                              str(video_resolution) + " + " +
+                                              str(video_send_time) + " + " +
+                                              str(user_name) + " + " +
+                                              str(head_url) + " + " +
+                                              str(cover_url) + " + " +
+                                              str(video_url) + " + " +
+                                              str(user_id) + " + " +
+                                              str("wxo_b07ba02ad4340205d89b47c76030bb090977") + "\n")
+                        else:
+                            if photo_id in [content.split(" + ")[1] for content in contents]:
+                                Common.crawler_log().info("该视频已在 kuaishou_feeds.txt 中:{}".format(video_title))
+                            elif head_url == "0" or cover_url == "0" \
+                                    or video_url == "0" or video_duration == "0" or photo_id == "0":
+                                Common.crawler_log().info("视频封面/播放地址/播放时长/用户头像不存在")
+                            else:
+                                # Sensitive-word check
+                                if any(word in kuaishou_title
+                                       for word in cls.kuaishou_sensitive_words()):
+                                    Common.crawler_log().info("视频已中敏感词:{}".format(kuaishou_title))
+                                else:
+                                    basic_time = int(time.time())
+                                    Common.crawler_log().info("添加视频信息至kuaishou_feeds.txt:{}".format(video_title))
+                                    with open("./txt/kuaishou_feeds.txt", "a", encoding="utf8") as f_a:
+                                        f_a.write(str(basic_time) + " + " +
+                                                  str(photo_id) + " + " +
+                                                  str(video_play_cnt) + " + " +
+                                                  str(video_title) + " + " +
+                                                  str(video_duration) + " + " +
+                                                  str(video_comment_cnt) + " + " +
+                                                  str(video_like_cnt) + " + " +
+                                                  str(video_share_cnt) + " + " +
+                                                  str(video_resolution) + " + " +
+                                                  str(video_send_time) + " + " +
+                                                  str(user_name) + " + " +
+                                                  str(head_url) + " + " +
+                                                  str(cover_url) + " + " +
+                                                  str(video_url) + " + " +
+                                                  str(user_id) + " + " +
+                                                  str("wxo_b07ba02ad4340205d89b47c76030bb090977") + "\n")
+        except Exception as e:
+            Common.crawler_log().error("获取视频 list 异常:{}".format(e))
+
+    @classmethod
+    def kuaishou_download_play_video(cls, env):
+        """
+        Download videos that satisfy the play-count rule.
+        Test environment: env == "dev"
+        Production environment: env == "prod"
+        """
+        videos = Common.read_txt("kuaishou_feeds.txt")
+        for video in videos:
+            download_photo_id = video.strip().split(" + ")[1]
+            download_video_title = video.strip().split(" + ")[3]
+            download_video_duration = video.strip().split(" + ")[4]
+            download_video_play_cnt = video.strip().split(" + ")[2]
+            download_video_comment_cnt = video.strip().split(" + ")[5]
+            download_video_like_cnt = video.strip().split(" + ")[6]
+            download_video_share_cnt = video.strip().split(" + ")[7]
+            download_video_resolution = video.strip().split(" + ")[8]
+            download_video_width = download_video_resolution.split("*")[0]
+            download_video_height = download_video_resolution.split("*")[-1]
+            download_video_send_time = video.strip().split(" + ")[9]
+            download_user_name = video.strip().split(" + ")[10]
+            download_head_url = video.strip().split(" + ")[11]
+            download_cover_url = video.strip().split(" + ")[12]
+            download_video_url = video.strip().split(" + ")[13]
+            download_video_session = video.strip().split(" + ")[-1]
+
+            if cls.kuaishou_download_rule(download_video_duration,
+                                          download_video_width,
+                                          download_video_height,
+                                          download_video_play_cnt,
+                                          download_video_like_cnt,
+                                          download_video_share_cnt):
+                Common.crawler_log().info("开始下载快手视频:{}".format(download_video_title))
+                # 下载封面
+                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
+                # 下载视频
+                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
+
+                # 保存视频信息至 kuaishou_videoid.txt
+                with open("./txt/kuaishou_videoid.txt", "a", encoding="utf8") as fa:
+                    fa.write(download_photo_id + "\n")
+
+                # 添加视频 ID 到 list,用于统计当次下载总数
+                cls.download_video_list.append(download_photo_id)
+
+                # # 保存视频信息至 {today}_kuaishou_videoid.txt
+                # with open("./txt/" + str(Common.today) + "_kuaishou_videoid.txt", "a", encoding="utf8") as fc:
+                #     fc.write(download_photo_id + "\n")
+
+                # 保存视频信息至 "./videos/{download_video_title}/info.txt"
+                with open("./videos/" + download_video_title + "/info.txt", "a", encoding="utf8") as f_a:
+                    f_a.write(str(download_photo_id) + "\n" +
+                              str(download_video_title) + "\n" +
+                              str(download_video_duration) + "\n" +
+                              str(download_video_play_cnt) + "\n" +
+                              str(download_video_comment_cnt) + "\n" +
+                              str(download_video_like_cnt) + "\n" +
+                              str(download_video_share_cnt) + "\n" +
+                              str(download_video_resolution) + "\n" +
+                              str(download_video_send_time) + "\n" +
+                              str(download_user_name) + "\n" +
+                              str(download_head_url) + "\n" +
+                              str(download_video_url) + "\n" +
+                              str(download_cover_url) + "\n" +
+                              str(download_video_session))
+
+                # Upload the video
+                if env == "dev":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("dev", "play")
+                elif env == "prod":
+                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
+                    Publish.upload_and_publish("prod", "play")
+
+                # Remove this video's record from kuaishou_feeds.txt
+                Common.crawler_log().info("删除该视频在kuaishou_feeds.txt中的信息:{}".format(download_video_title))
+                with open("./txt/kuaishou_feeds.txt", "r", encoding="utf8") as f_r:
+                    lines = f_r.readlines()
+                with open("./txt/kuaishou_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_photo_id == line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+            else:
+                # The video fails the rule; remove its record from kuaishou_feeds.txt
+                Common.crawler_log().info("该视频不满足下载规则,删除在kuaishou_feeds.txt中的信息:{}".format(download_video_title))
+                with open("./txt/kuaishou_feeds.txt", "r", encoding="utf8") as f_r:
+                    lines = f_r.readlines()
+                with open("./txt/kuaishou_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_photo_id == line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+
+
+if __name__ == "__main__":
+    kuaishou = KuaiShou()
+    kuaishou.kuaishou_get_recommend()

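Note: kuaishou_feeds.txt / weishi_feeds.txt store one record per line as 16 fields joined by " + " (crawl time, video id, play count, title, duration, comment/like/share counts, resolution, send time, user name, avatar/cover/video URLs, user id, session). A sketch of parsing a record into named fields instead of positional indices (FeedRecord and parse_feed_line are hypothetical names, not part of this commit):

    from typing import NamedTuple

    class FeedRecord(NamedTuple):
        crawl_time: str
        video_id: str
        play_cnt: str
        title: str
        duration: str
        comment_cnt: str
        like_cnt: str
        share_cnt: str
        resolution: str
        send_time: str
        user_name: str
        head_url: str
        cover_url: str
        video_url: str
        user_id: str
        session: str

    def parse_feed_line(line: str) -> FeedRecord:
        # One record = 16 " + "-separated fields, newline-terminated.
        return FeedRecord(*line.strip().split(" + "))

Named access (record.play_cnt) would replace the video.strip().split(" + ")[2] indexing used in both download modules.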
+ 344 - 0
main/download_weishi.py

@@ -0,0 +1,344 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/8
+import json
+import time
+
+import requests
+import urllib3
+from main.common import Common
+from main.publish import Publish
+
+
+class Weishi:
+    @staticmethod
+    def weishi_download_rule(d_duration, d_width, d_height, d_play_cnt):
+        """
+        下载视频的基本规则
+        :param d_duration: 时长
+        :param d_width: 宽
+        :param d_height: 高
+        :param d_play_cnt: 播放量
+        :return: 满足规则,返回 True;反之,返回 False
+        """
+        return (60 <= int(float(d_duration)) <= 600
+                and (int(d_width) >= 720 or int(d_height) >= 720)
+                and int(d_play_cnt) >= 100000)
+
+    @classmethod
+    def get_weishi_recommend(cls):
+        """
+        Fetch the video list from the Weishi mini-program home-page feed:
+            1. Deduplicate against weishi_videoid.txt
+            2. Deduplicate against weishi_feeds.txt
+            3. Append the video info to weishi_feeds.txt
+        """
+        url = "https://api.weishi.qq.com/trpc.weishi.weishi_h5_proxy.weishi_h5_proxy/WxminiGetFeedList"
+        cookies = {
+            "wesee_authtype": "3",
+            "wesee_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
+            "wesee_openkey": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf189e2a5c1d532eeff172bc21cf2"
+                             "6230941ccbc10243a7879e8165ca608c17060de606a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
+            "wesee_personid": "1593522421826902",
+            "wesee_refresh_token": "",
+            "wesee_access_token": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf18"
+                                  "9e2a5c1d532eeff172bc21cf26230941ccbc10243a7879e8165ca608c17060de6"
+                                  "06a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
+            "wesee_thr_appid": "wx75ee9f19b93e5c46",
+            "wesee_ichid": "8"
+        }
+        json_data = {
+            "req_body": {
+                "requestType": 16,
+                "isrefresh": 0,
+                "isfirst": 0,
+                "attachInfo": "",
+                "scene_id": 22,
+                "requestExt": {
+                    "mini_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
+                    "notLogin-personid": "1593522421826902"
+                }
+            },
+            "req_header": {
+                "mapExt": "{\"imageSize\":\"480\",\"adaptScene\":\"PicHDWebpLimitScene\"}"
+            }
+        }
+
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=url, cookies=cookies, json=json_data, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            if "rsp_body" not in response:
+                Common.crawler_log().info("获取微视视频 list 出错:{},休眠 10s".format(response))
+                time.sleep(10)
+            else:
+                feeds = response["rsp_body"]["feeds"]
+                for i in range(len(feeds)):
+                    if "video" not in feeds[i]:
+                        Common.crawler_log().info("无视频信息")
+                    else:
+                        # Video ID
+                        if "id" not in feeds[i]["video"]:
+                            video_id = "0"
+                            Common.crawler_log().info("video_id:{}".format(video_id))
+                        else:
+                            video_id = feeds[i]["video"]["id"]
+                            Common.crawler_log().info("video_id:{}".format(video_id))
+
+                        # Video title
+                        video_title = feeds[i]["desc"].strip().replace("\n", "") \
+                            .replace("/", "").replace("快手", "").replace(" ", "") \
+                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
+                        Common.crawler_log().info("video_title:{}".format(video_title))
+
+                        # Video publish time
+                        if "createTime" not in feeds[i]:
+                            video_send_time = "0"
+                            Common.crawler_log().info("video_send_time:不存在")
+                        else:
+                            video_send_time = int(feeds[i]["createTime"])*1000
+                            Common.crawler_log().info(
+                                "video_send_time:{}".format(time.strftime(
+                                    "%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)/1000))))
+
+                        # Cover image URL
+                        if len(feeds[i]["images"]) == 0:
+                            cover_url = "0"
+                            Common.crawler_log().info("cover_url:不存在")
+                        else:
+                            cover_url = feeds[i]["images"][0]["url"]
+                            Common.crawler_log().info("cover_url:{}".format(cover_url))
+
+                        # Video playback URL
+                        if "url" not in feeds[i]["video"]:
+                            video_url = "0"
+                            Common.crawler_log().info("video_url:不存在")
+                        else:
+                            video_url = feeds[i]["video"]["url"]
+                            Common.crawler_log().info("video_url:{}".format(video_url))
+
+                        # Video resolution
+                        if "width" not in feeds[i]["video"] or "height" not in feeds[i]["video"]:
+                            video_width = "0"
+                            video_height = "0"
+                            video_resolution = str(video_width) + "*" + str(video_height)
+                            Common.crawler_log().info("无分辨率")
+                        else:
+                            video_width = feeds[i]["video"]["width"]
+                            video_height = feeds[i]["video"]["height"]
+                            video_resolution = str(video_width) + "*" + str(video_height)
+                            Common.crawler_log().info("video_resolution:{}".format(video_resolution))
+
+                        # Video duration
+                        if "duration" not in feeds[i]["video"]:
+                            video_duration = "0"
+                            Common.crawler_log().info("video_duration:不存在")
+                        else:
+                            video_duration = int(int(feeds[i]["video"]["duration"]) / 1000)
+                            Common.crawler_log().info("video_duration:{}秒".format(video_duration))
+
+                        # Play count
+                        if "playNum" not in feeds[i]["ugcData"]:
+                            video_play_cnt = "0"
+                            Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
+                        else:
+                            video_play_cnt = feeds[i]["ugcData"]["playNum"]
+                            Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
+
+                        # Like count
+                        if "dingCount" not in feeds[i]["ugcData"]:
+                            video_like_cnt = "0"
+                            Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
+                        else:
+                            video_like_cnt = feeds[i]["ugcData"]["dingCount"]
+                            Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
+
+                        # Share count
+                        if "shareNum" not in feeds[i]["ugcData"]:
+                            video_share_cnt = "0"
+                            Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
+                        else:
+                            video_share_cnt = feeds[i]["ugcData"]["shareNum"]
+                            Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
+
+                        # Comment count
+                        if "totalCommentNum" not in feeds[i]["ugcData"]:
+                            video_comment_cnt = "0"
+                            Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
+                        else:
+                            video_comment_cnt = feeds[i]["ugcData"]["totalCommentNum"]
+                            Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
+
+                        # User ID
+                        user_id = feeds[i]["poster"]["id"]
+                        Common.crawler_log().info("user_id:{}".format(user_id))
+
+                        # User nickname
+                        user_name = feeds[i]["poster"]["nick"].strip().replace("\n", "") \
+                            .replace("/", "").replace("快手", "").replace(" ", "") \
+                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
+                        Common.crawler_log().info("user_name:{}".format(user_name))
+
+                        # User avatar URL
+                        if "thumbURL" not in feeds[i]["material"] and "avatar" not in feeds[i]["poster"]:
+                            head_url = "0"
+                            Common.crawler_log().info("head_url:不存在")
+                        elif "thumbURL" in feeds[i]["material"]:
+                            head_url = feeds[i]["material"]["thumbURL"]
+                            Common.crawler_log().info("head_url:{}".format(head_url))
+                        else:
+                            head_url = feeds[i]["poster"]["avatar"]
+                            Common.crawler_log().info("head_url:{}".format(head_url))
+
+                        # Deduplicate against weishi_videoid.txt
+                        videos_ids = Common.read_txt("weishi_videoid.txt")
+                        if video_id in [v_id.strip() for v_id in videos_ids]:
+                            Common.crawler_log().info("该视频已下载:{}".format(video_title))
+                        else:
+                            Common.crawler_log().info("该视频未下载:{}".format(video_title))
+
+                            # Deduplicate against weishi_feeds.txt
+                            contents = Common.read_txt("weishi_feeds.txt")
+                            # If weishi_feeds.txt is empty, save directly
+                            if len(contents) == 0 and head_url != "0" \
+                                    and cover_url != "0" and video_url != "0" \
+                                    and video_duration != "0" and video_id != "0":
+                                basic_time = int(time.time())
+                                Common.crawler_log().info("添加视频信息至weishi_feeds.txt:{}".format(video_title))
+                                with open("./txt/weishi_feeds.txt", "a", encoding="utf8") as f_a:
+                                    f_a.write(str(basic_time) + " + " +
+                                              str(video_id) + " + " +
+                                              str(video_play_cnt) + " + " +
+                                              str(video_title) + " + " +
+                                              str(video_duration) + " + " +
+                                              str(video_comment_cnt) + " + " +
+                                              str(video_like_cnt) + " + " +
+                                              str(video_share_cnt) + " + " +
+                                              str(video_resolution) + " + " +
+                                              str(video_send_time) + " + " +
+                                              str(user_name) + " + " +
+                                              str(head_url) + " + " +
+                                              str(cover_url) + " + " +
+                                              str(video_url) + " + " +
+                                              str(user_id) + " + " +
+                                              str("oWGa05FrwkuUvT-4n1qGeQuhVsc8") + "\n")
+                            else:
+                                if video_id in [content.split(" + ")[1] for content in contents]:
+                                    Common.crawler_log().info("该视频已在 weishi_feeds.txt 中:{}".format(video_title))
+                                elif head_url == "0" or cover_url == "0" \
+                                        or video_url == "0" or video_duration == "0" or video_id == "0":
+                                    Common.crawler_log().info("视频封面/播放地址/播放时长/用户头像不存在")
+                                else:
+                                    basic_time = int(time.time())
+                                    Common.crawler_log().info("添加视频信息至weishi_feeds.txt:{}".format(video_title))
+                                    with open("./txt/weishi_feeds.txt", "a", encoding="utf8") as f_a:
+                                        f_a.write(str(basic_time) + " + " +
+                                                  str(video_id) + " + " +
+                                                  str(video_play_cnt) + " + " +
+                                                  str(video_title) + " + " +
+                                                  str(video_duration) + " + " +
+                                                  str(video_comment_cnt) + " + " +
+                                                  str(video_like_cnt) + " + " +
+                                                  str(video_share_cnt) + " + " +
+                                                  str(video_resolution) + " + " +
+                                                  str(video_send_time) + " + " +
+                                                  str(user_name) + " + " +
+                                                  str(head_url) + " + " +
+                                                  str(cover_url) + " + " +
+                                                  str(video_url) + " + " +
+                                                  str(user_id) + " + " +
+                                                  str("oWGa05FrwkuUvT-4n1qGeQuhVsc8") + "\n")
+        except Exception as e:
+            Common.crawler_log().error("获取微视视频 list 异常:{}".format(e))
+
+    @classmethod
+    def download_weishi_play_video(cls, env):
+        """
+        Download videos that satisfy the play-count rule.
+        Test environment: env == "dev"
+        Production environment: env == "prod"
+        """
+        videos = Common.read_txt("weishi_feeds.txt")
+        for video in videos:
+            download_video_id = video.strip().split(" + ")[1]
+            download_video_title = video.strip().split(" + ")[3]
+            download_video_duration = video.strip().split(" + ")[4]
+            download_video_play_cnt = video.strip().split(" + ")[2]
+            download_video_comment_cnt = video.strip().split(" + ")[5]
+            download_video_like_cnt = video.strip().split(" + ")[6]
+            download_video_share_cnt = video.strip().split(" + ")[7]
+            download_video_resolution = video.strip().split(" + ")[8]
+            download_video_width = download_video_resolution.split("*")[0]
+            download_video_height = download_video_resolution.split("*")[-1]
+            download_video_send_time = video.strip().split(" + ")[9]
+            download_user_name = video.strip().split(" + ")[10]
+            download_head_url = video.strip().split(" + ")[11]
+            download_cover_url = video.strip().split(" + ")[12]
+            download_video_url = video.strip().split(" + ")[13]
+            download_video_session = video.strip().split(" + ")[-1]
+
+            if cls.weishi_download_rule(download_video_duration, download_video_width,
+                                        download_video_height, download_video_play_cnt) is True:
+                Common.crawler_log().info("Start downloading video: {}".format(download_video_title))
+                # download the cover image
+                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
+                # download the video file
+                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
+                # record the video ID in weishi_videoid.txt (dedup list)
+                with open("./txt/weishi_videoid.txt", "a", encoding="utf8") as fa:
+                    fa.write(download_video_id + "\n")
+                # save video info to "./videos/{download_video_title}/info.txt"
+                with open("./videos/" + download_video_title + "/info.txt", "a", encoding="utf8") as f_a:
+                    f_a.write(str(download_video_id) + "\n" +
+                              str(download_video_title) + "\n" +
+                              str(download_video_duration) + "\n" +
+                              str(download_video_play_cnt) + "\n" +
+                              str(download_video_comment_cnt) + "\n" +
+                              str(download_video_like_cnt) + "\n" +
+                              str(download_video_share_cnt) + "\n" +
+                              str(download_video_resolution) + "\n" +
+                              str(download_video_send_time) + "\n" +
+                              str(download_user_name) + "\n" +
+                              str(download_head_url) + "\n" +
+                              str(download_video_url) + "\n" +
+                              str(download_cover_url) + "\n" +
+                              str(download_video_session))
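+                # publish.upload_and_publish later reads these 14 lines back by index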
+
+                # upload the video
+                if env == "dev":
+                    Common.crawler_log().info("Start uploading video: {}".format(download_video_title))
+                    Publish.upload_and_publish("dev", "play")
+                elif env == "prod":
+                    Common.crawler_log().info("Start uploading video: {}".format(download_video_title))
+                    Publish.upload_and_publish("prod", "play")
+
+                # remove this video's record from weishi_feeds.txt
+                Common.crawler_log().info("Removing record from weishi_feeds.txt: {}".format(download_video_title))
+                with open("./txt/weishi_feeds.txt", "r", encoding="utf8") as f_r:
+                    lines = f_r.readlines()
+                with open("./txt/weishi_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+            else:
+                # remove this video's record from weishi_feeds.txt
+                Common.crawler_log().info("Video fails the download rules; removing its record from weishi_feeds.txt: {}".format(download_video_title))
+                with open("./txt/weishi_feeds.txt", "r", encoding="utf8") as f_r:
+                    lines = f_r.readlines()
+                with open("./txt/weishi_feeds.txt", "w", encoding="utf-8") as f_w:
+                    for line in lines:
+                        if download_video_id in line.split(" + ")[1]:
+                            continue
+                        f_w.write(line)
+
+
+if __name__ == "__main__":
+    weishi = Weishi()
+    weishi.get_weishi_recommend()

+ 248 - 0
main/publish.py

@@ -0,0 +1,248 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30
+"""
+上传视频到阿里云 OSS
+上传视频到管理后台
+"""
+import json
+import os
+import random
+import time
+
+import oss2
+import requests
+import urllib3
+from main.common import Common
+
+
+class Publish:
+    @classmethod
+    def publish_video_dev(cls, request_data):
+        """
+        loginUid  on-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  off-site video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (current time is fine)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  video validity status, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = {}'.format(request_data['crawlerSrcId']))
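+    # illustrative request_data (placeholder values, not real IDs):
+    # {'loginUid': '6267141', 'appType': '888888',
+    #  'crawlerSrcId': '<video-id>', 'crawlerSrcCode': 'KANYIKAN',
+    #  'crawlerSrcPublishTimestamp': '1650000000000', 'crawlerTaskTimestamp': '1650000000000',
+    #  'videoPath': 'longvideo/crawler_local/video/dev/20220415/<video-id>',
+    #  'coverImgPath': 'longvideo/crawler_local/image/dev/20220415/<video-id>',
+    #  'title': '<title>', 'totalTime': '55', 'viewStatus': '1', 'versionCode': '1'}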
+
+    @classmethod
+    def publish_video_prod(cls, request_data):
+        """
+        loginUid  on-site uid (picked at random)
+        appType  default: 888888
+        crawlerSrcId  off-site video ID
+        crawlerSrcCode  channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp  crawler task creation time (current time is fine)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  video validity status, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        # Common.crawler_log().info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
+        Common.crawler_log().info('publish result: {}'.format(result))
+        if result['code'] != 0:
+            Common.crawler_log().error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.crawler_log().info('publish success video_id = {}'.format(request_data['crawlerSrcId']))
+
+    @classmethod
+    def request_post(cls, request_url, request_data):
+        """
+        POST to an HTTP API
+        :param request_url: API URL
+        :param request_data: request parameters
+        :return: res_data as parsed JSON (None when the status code is not 200)
+        """
+        urllib3.disable_warnings()
+        response = requests.post(url=request_url, data=request_data, verify=False)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
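+    # usage sketch (URL and payload are illustrative):
+    #   res = Publish.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', data)
+    #   if res is not None and res['code'] == 0: ...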
+
+    # The code below demonstrates basic file upload, download, listing and deletion usage.
+
+    # First initialize AccessKeyId, AccessKeySecret, Endpoint and so on.
+    # They are read from environment variables; alternatively, replace placeholders such as "<your AccessKeyId>" with real values.
+    #
+    # Taking the Hangzhou region as an example, the Endpoint can be:
+    #   http://oss-cn-hangzhou.aliyuncs.com
+    #   https://oss-cn-hangzhou.aliyuncs.com
+    # accessed over HTTP and HTTPS respectively.
+    access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
+    access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
+    bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
+    # endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
+    endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
+
+    # make sure the parameters above are filled in correctly
+    for param in (access_key_id, access_key_secret, bucket_name, endpoint):
+        assert '<' not in param, 'Please set parameter: ' + param
+
+    # create the Bucket object; all object-related operations go through it
+    bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
+
+    """
+    Processing flow:
+    1. On a schedule (once a day at 01:00), iterate over the local videos directory; layout: videos -> per-video folder -> video file + cover image + basic info
+    2. Upload the video file and cover to OSS
+    - video OSS path   longvideo/crawler_local/video/prod/<file name>
+    - cover OSS path   longvideo/crawler_local/image/prod/<file name>
+    3. Publish the video
+    - read the basic info and call the publish API
+    """
+    # template slots: {env} / {date, e.g. 20220225} / {file name}
+    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
+    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'
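+    # e.g. oss_file_path_video.format("prod", "20220415", "abc123")
+    #   -> 'longvideo/crawler_local/video/prod/20220415/abc123'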
+
+    @classmethod
+    def put_file(cls, oss_file, local_file):
+        Common.crawler_log().info("put oss file = {}, local file = {}".format(oss_file, local_file))
+        cls.bucket.put_object_from_file(oss_file, local_file)
+        Common.crawler_log().info("put oss file = {}, local file = {} success".format(oss_file, local_file))
+
+    # remove a local file
+    @classmethod
+    def remove_local_file(cls, local_file):
+        Common.crawler_log().info("remove local file = {}".format(local_file))
+        os.remove(local_file)
+        Common.crawler_log().info("remove local file = {} success".format(local_file))
+
+    # remove a local directory (expected to be empty; its files are removed first)
+    @classmethod
+    def remove_local_file_dir(cls, local_file):
+        Common.crawler_log().info("remove local file dir = {}".format(local_file))
+        os.rmdir(local_file)
+        Common.crawler_log().info("remove local file dir = {} success".format(local_file))
+
+    local_file_path = './videos'
+    video_file = 'video'
+    image_file = 'image'
+    info_file = 'info'
+    uids_dev_up = [6267140]
+    uids_dev_play = [6267141]
+    uids_prod_up = [20631208, 20631209, 20631210, 20631211, 20631212,
+                    20631213, 20631214, 20631215, 20631216, 20631217]
+    uids_prod_play = [20631228, 20631229, 20631230, 20631231, 20631232,
+                      20631233, 20631234, 20631235, 20631236, 20631237]
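+    # virtual account pools: each published video gets a random uid
+    # from the pool matching the env/job combination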
+
+    @classmethod
+    def upload_and_publish(cls, env, job):
+        """
+        Upload videos to OSS and publish them
+        :param env: test environment: dev; production environment: prod
+        :param job: rising-rank list: up; play-count list: play
+        """
+        Common.crawler_log().info("upload_and_publish starting...")
+        today = time.strftime("%Y%m%d", time.localtime())
+        # all per-video folders under the videos directory
+        files = os.listdir(cls.local_file_path)
+        for f in files:
+            try:
+                # one video folder
+                fi_d = os.path.join(cls.local_file_path, f)
+                # make sure it is a directory
+                if os.path.isdir(fi_d):
+                    Common.crawler_log().info('dir = {}'.format(fi_d))
+                    # list all files inside this video folder
+                    dir_files = os.listdir(fi_d)
+                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
+                    # crawler task timestamp in milliseconds
+                    now_timestamp = int(round(time.time() * 1000))
+                    data['crawlerTaskTimestamp'] = str(now_timestamp)
+                    if env == "dev" and job == "up":
+                        uid = str(random.choice(cls.uids_dev_up))
+                    elif env == "dev" and job == "play":
+                        uid = str(random.choice(cls.uids_dev_play))
+                    elif env == "prod" and job == "up":
+                        uid = str(random.choice(cls.uids_prod_up))
+                    else:
+                        uid = str(random.choice(cls.uids_prod_play))
+                    data['loginUid'] = uid
+                    # iterate over the files in this video folder
+                    for fi in dir_files:
+                        # full path of each file
+                        fi_path = fi_d + '/' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
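+                        # info.txt line order (as written by the download step):
+                        # 0 id, 1 title, 2 duration, 3 play_cnt, 4 comment_cnt,
+                        # 5 like_cnt, 6 share_cnt, 7 resolution, 8 send_time,
+                        # 9 user_name, 10 head_url, 11 video_url, 12 cover_url, 13 session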
+                        # read info.txt and populate the publish payload
+                        if cls.info_file in fi:
+                            # use a handle name distinct from the outer loop variable `f`
+                            with open(fi_path) as info_fp:
+                                # values were validated when written, so read them as-is
+                                for i in range(14):
+                                    line = info_fp.readline().replace('\n', '')
+                                    if line and not line.isspace():
+                                        Common.crawler_log().info("line = {}".format(line))
+                                        if i == 0:
+                                            data['crawlerSrcId'] = line
+                                        elif i == 1:
+                                            data['title'] = line
+                                        elif i == 2:
+                                            data['totalTime'] = line
+                                        elif i == 8:
+                                            data['crawlerSrcPublishTimestamp'] = line
+                                    else:
+                                        Common.crawler_log().warning("{} line is empty".format(fi_path))
+                            # remove info.txt once parsed
+                            cls.remove_local_file(fi_path)
+                    # refresh the file list (info.txt is gone now)
+                    dir_files = os.listdir(fi_d)
+                    for fi in dir_files:
+                        fi_path = fi_d + '/' + fi
+                        Common.crawler_log().info('dir fi_path = {}'.format(fi_path))
+                        # upload to OSS
+                        if cls.video_file in fi:
+                            if env == "dev":
+                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
+                            else:
+                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_video_file = {}".format(oss_video_file))
+                            cls.put_file(oss_video_file, fi_path)
+                            data['videoPath'] = oss_video_file
+                            Common.crawler_log().info("videoPath = {}".format(oss_video_file))
+                        elif cls.image_file in fi:
+                            if env == "dev":
+                                oss_image_file = cls.oss_file_path_image.format("dev", today, data['crawlerSrcId'])
+                            else:
+                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
+                            Common.crawler_log().info("oss_image_file = {}".format(oss_image_file))
+                            cls.put_file(oss_image_file, fi_path)
+                            data['coverImgPath'] = oss_image_file
+                            Common.crawler_log().info("coverImgPath = {}".format(oss_image_file))
+                        # remove the local file after upload
+                        cls.remove_local_file(fi_path)
+
+                    # publish the video
+                    if env == "dev":
+                        cls.publish_video_dev(data)
+                    elif env == "prod":
+                        cls.publish_video_prod(data)
+                    cls.remove_local_file_dir(fi_d)
+
+                else:
+                    Common.crawler_log().error('file not a dir = {}'.format(fi_d))
+            except Exception as e:
+                Common.crawler_log().exception('upload_and_publish error: {}'.format(e))

+ 135 - 0
main/run.py

@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30
+import datetime
+import os
+import random
+import sys
+import time
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.download_weishi import Weishi
+from main.download_kuaishou import KuaiShou
+
+
+def kuaishou_dev_job():
+    """
+    Run the Kuaishou script against the test environment
+    """
+    while True:
+        # daily download-and-upload cap: 20 videos
+        if len(KuaiShou.download_video_list) >= 20:
+            time.sleep(60)
+            break
+        else:
+            Common.crawler_log().info("Start crawling Kuaishou videos")
+            time.sleep(1)
+
+            # crawl videos that match the rules and write them to kuaishou_feeds.txt
+            KuaiShou.kuaishou_get_recommend()
+            # download matching videos and upload them
+            KuaiShou.kuaishou_download_play_video("dev")
+            # sleep 1-3 s at random
+            time.sleep(random.randint(1, 3))
+
+    # delete stale logs
+    Common.del_logs()
+    # count downloaded videos
+    Common.kuaishou_download_count()
+
+
+def weishi_dev_job():
+    """
+    Run the Weishi script against the test environment
+    """
+    while True:
+        if 14 >= datetime.datetime.now().hour >= 5:
+            Common.crawler_log().info("Stop the crawl-and-upload task")
+            break
+        else:
+            # crawl videos that match the rules and write them to weishi_feeds.txt
+            Weishi.get_weishi_recommend()
+            # download matching videos and upload them
+            Weishi.download_weishi_play_video("dev")
+            # sleep 1-3 s at random
+            time.sleep(random.randint(1, 3))
+
+    # delete stale logs
+    Common.del_logs()
+    # count downloaded videos
+    Common.weishi_download_count()
+
+
+def main_dev():
+    """
+    Test-environment entry point
+    """
+    scheduler = BlockingScheduler(timezone="Asia/Shanghai")
+    # cron trigger: run the crawl job once a day at 19:10
+    scheduler.add_job(kuaishou_dev_job, 'cron', hour=19, minute=10, misfire_grace_time=60)
+    # start the scheduler
+    scheduler.start()
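+    # note: BlockingScheduler.start() blocks the calling thread; the process
+    # stays alive and fires the cron trigger at the configured local time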
+
+
+def kuaishou_prod_job():
+    """
+    Run the Kuaishou script against the production environment
+    """
+    while True:
+        # daily download-and-upload cap: 200 videos
+        if len(KuaiShou.download_video_list) >= 200:
+            time.sleep(60)
+            break
+        else:
+            Common.crawler_log().info("Start crawling Kuaishou videos")
+            time.sleep(1)
+
+            # crawl videos that match the rules and write them to kuaishou_feeds.txt
+            KuaiShou.kuaishou_get_recommend()
+            # download matching videos and upload them
+            KuaiShou.kuaishou_download_play_video("prod")
+            # sleep 1-3 s at random
+            time.sleep(random.randint(1, 3))
+
+    # delete stale logs
+    Common.del_logs()
+    # count downloaded videos
+    Common.kuaishou_download_count()
+
+
+def weishi_prod_job():
+    """
+    Run the Weishi script against the production environment
+    """
+    while True:
+        if 20 >= datetime.datetime.now().hour >= 5:
+            Common.crawler_log().info("Stop the Weishi crawl task")
+            break
+        else:
+            # crawl videos that match the rules and write them to weishi_feeds.txt
+            Weishi.get_weishi_recommend()
+            # download matching videos and upload them
+            Weishi.download_weishi_play_video("prod")
+            # sleep 1-3 s at random
+            time.sleep(random.randint(1, 3))
+
+    # delete stale logs
+    Common.del_logs()
+    # count downloaded videos
+    Common.weishi_download_count()
+
+
+def main_prod():
+    """
+    Production entry point
+    """
+    scheduler = BlockingScheduler(timezone="Asia/Shanghai")
+    # cron trigger: run the crawl job once a day at 08:00
+    scheduler.add_job(kuaishou_prod_job, 'cron', hour=8, minute=0, misfire_grace_time=60)
+    # start the scheduler
+    scheduler.start()
+
+
+if __name__ == "__main__":
+    # main_dev()
+    main_prod()

+ 3 - 0
txt/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30

+ 0 - 0
txt/kuaishou_feeds.txt


+ 0 - 0
txt/kuaishou_videoid.txt


+ 0 - 0
txt/weishi_feeds.txt


+ 0 - 0
txt/weishi_videoid.txt


+ 3 - 0
videos/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30

+ 19 - 0
抓取规则.txt

@@ -0,0 +1,19 @@
+==========2022/4/15===========
+I. Crawl by data metrics
+1. Task window:
+- every day from 08:00 to 21:00
+2. Crawl rules:
+  - play count and like count 50,000+, share count 2,000+
+  - duration longer than 1 minute and shorter than 10 minutes
+  - resolution 720 or higher
+  - on-site title = original Kuaishou title (hashtag topics "#...#" must be stripped from the title)
+  - on-site cover = original Kuaishou cover image
+3. On-site intake:
+- 200 videos stored per day
+- videos are spread at random across 10 virtual accounts.
+4. Special notes:
+- deduplicate: videos that have already been crawled must not be crawled again
+- keep rescanning the video pool: e.g. if a video was below 50,000 likes last week but reaches 50,000 this week, crawl it.
+5. Added crawler title filter words
+-  集结吧光合创作者、电影解说、快来露两手、分享家常美食教程、光合作者助手、创作者中心、创作者学院、娱乐星熠计划、解说电影、电影剪辑、放映室、老剧、影视剪辑、精彩片段、冬日影娱大作战、春日追剧计划单、影视解说、中视频影视混剪计划、众志成城共抗疫情、我在追好剧、娱乐星灿计划、电影、电视剧、毛泽东、毛主席、周恩来、林彪、习近平、习大大、彭丽媛、怀旧经典影视
+==============================
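
A minimal sketch of the metric rule above (illustrative only; the actual check
lives in main/download_kuaishou.py, and the name and signature of
kuaishou_download_rule below are assumptions):

    def kuaishou_download_rule(duration, width, height, play_cnt, like_cnt, share_cnt):
        """Return True when a video meets the crawl rules listed above."""
        # play count and like count 50,000+, share count 2,000+
        if int(play_cnt) >= 50000 and int(like_cnt) >= 50000 and int(share_cnt) >= 2000:
            # duration between 1 and 10 minutes (in seconds)
            if 60 <= int(duration) <= 600:
                # resolution 720 or higher on either dimension
                if int(width) >= 720 or int(height) >= 720:
                    return True
        return False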