瀏覽代碼

push readme

wangkun 2 年之前
父節點
當前提交
609eda4c26
共有 12 個文件被更改,包括 534 次插入 和 969 次刪除
  1. 23 1
      README.md
  2. 0 462
      main/download_kuaishou.py
  3. 0 346
      main/download_weishi.py
  4. 432 0
      main/recommend.py
  5. 0 138
      main/run.py
  6. 79 0
      main/run_recommend.py
  7. 0 3
      txt/__init__.py
  8. 0 0
      txt/kuaishou_feeds.txt
  9. 0 0
      txt/kuaishou_videoid.txt
  10. 0 0
      txt/weishi_feeds.txt
  11. 0 0
      txt/weishi_videoid.txt
  12. 0 19
      抓取规则.txt

+ 23 - 1
README.md

@@ -1 +1,23 @@
-快手和微视小程序的爬虫
+# 微视小程序爬虫
+
+https://git.yishihui.com/Server/crawler_weishi.git
+
+
+# 依赖库文件
+python==3.10.0
+loguru==0.6.0
+oss2==2.15.0
+requests==2.27.1
+urllib3==1.26.9
+
+
+# 执行入口
+
+cd ./crawler_weishi
+
+python3 main/run_xxx.py
+
+
+# 需求
+
+2022/8/18 需求链接 https://w42nne6hzg.feishu.cn/docs/doccnbLbxAzkzwJJigo1ii5c3ih

+ 0 - 462
main/download_kuaishou.py

@@ -1,462 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/3/29
-"""
-从 微信小程序-快手短视频 中,下载符合规则的视频
-"""
-import json
-import time
-
-import requests
-import urllib3
-from main.common import Common
-from main.publish import Publish
-
-proxies = {"http": None, "https": None}
-
-
-class KuaiShou:
-    # 已下载视频列表
-    download_video_list = []
-
-    @classmethod
-    def kuaishou_sensitive_words(cls):
-        sensitive_words = [
-            "集结吧光合创作者",
-            "电影解说",
-            "快来露两手",
-            "分享家常美食教程",
-            "光合作者助手",
-            "创作者中心",
-            "创作者学院",
-            "娱乐星熠计划",
-            "解说电影",
-            "电影剪辑",
-            "放映室",
-            "老剧",
-            "影视剪辑",
-            "精彩片段",
-            "冬日影娱大作战",
-            "春日追剧计划单",
-            "影视解说",
-            "中视频影视混剪计划",
-            "众志成城共抗疫情",
-            "我在追好剧",
-            "娱乐星灿计划",
-            "电影",
-            "电视剧",
-            "毛泽东",
-            "毛主席",
-            "周恩来",
-            "林彪",
-            "习近平",
-            "习大大",
-            "彭丽媛",
-            "怀旧经典影视",
-                           ]
-        return sensitive_words
-
-    @staticmethod
-    def kuaishou_download_rule(d_duration, d_width, d_height,
-                               d_play_cnt, d_like_cnt, d_share_cnt):
-        """
-        下载视频的基本规则
-        :param d_duration: 时长
-        :param d_width: 宽
-        :param d_height: 高
-        :param d_play_cnt: 播放量
-        :param d_like_cnt: 点赞量
-        :param d_share_cnt: 分享量
-        :return: 满足规则,返回 True;反之,返回 False
-        """
-        if 600 >= int(float(d_duration)) >= 60:
-            if int(d_width) >= 720 or int(d_height) >= 720:
-                if int(d_play_cnt) >= 50000:
-                    if int(d_like_cnt) >= 50000:
-                        if int(d_share_cnt) >= 2000:
-                            return True
-                        else:
-                            return False
-                    else:
-                        return False
-                else:
-                    return False
-            return False
-        return False
-
-    @classmethod
-    def kuaishou_get_recommend(cls):
-        """
-        从快手小程序首页推荐获取视频list:
-            1.在 kuaishou_videoid.txt 中去重
-            2.在 kuaishou_feeds.txt 中去重
-            3.添加视频信息到 kuaishou_feeds.txt
-        """
-        url = "https://wxmini-api.uyouqu.com/rest/wd/wechatApp/feed/recommend"
-        params = {
-            "__NS_sig3": "e6f6b281ea31e3d7d1bbb8b91f662576fc25f7c3a7a7a5a5aaaba8b2",
-            "__NS_sig3_origin": "3sCt3iAAAAAAAAAAAAAAAwEQBv2b8ewCwkZKaiAAAAAPg0soi"
-                                "e7GiOlU vF4zPrG1Nl6xvaoBgFd3MwTzOed9w=="
-        }
-        cookies = {
-            "did": "wxo_05f915ac6b1deca87db36cea1a0fd18fae6c",
-            "preMinaVersion": "v3.109.0",
-            "sid": "kuaishou.wechat.app",
-            "appId": "ks_wechat_small_app_2",
-            "clientid": "13",
-            "client_key": "f60ac815",
-            "kpn": "WECHAT_SMALL_APP",
-            "kpf": "OUTSIDE_ANDROID_H5",
-            "language": "zh_CN",
-            "smallAppVersion": "v3.109.0",
-            "session_key": "123005bcc551a92aac29cdb96190251c9f492c29d4ba6c502dc"
-                           "0d2f8b8d18df356a2f7a22d6924d1dd34b8554a64af49b1bb1a"
-                           "1236cd2f69c25d4ac2a2531ebcd28c179da14b222023f9e111c"
-                           "c4d3b064ac7b0915d8c9fdaccb59e4048e96a5c38a32b2ce9f4abf628053001",
-            "unionid": "V2:1230b56c8337908c3eecba63142a58daca05535c1f14bf67d3d8"
-                       "85cace91a7db335c5572d204762d075f24aa84412e2955711a12bb9"
-                       "2bd9c2290489ba7a733708a4a446de83822205ab727650489dda0db"
-                       "9d2a226c5ddb66d88a1f1373283a3d3b959611d816660028053001",
-            "eUserStableOpenId": "12303325e8710eb802137c70fd1fb65997a4e5e33d82"
-                                 "cddd409d335d096e20873e07ee472090133bc7a67e5c"
-                                 "749da045d9a31a12da4c4c26181d432b873ec39432f4"
-                                 "10196c6c2220323d0e6b562d1b3786aefb352b4e509c"
-                                 "d96f3466b7b2e5e74b904a94c40792d928053001",
-            "openId": "o5otV45DcV1EUsWw4fAUk_iq0YSA",
-            "eOpenUserId": "124074b7726c996283f25044a42e2c7427e929cd6d968c5342"
-                           "330e61fc8939e57b0da4ffe21887f3abc8784175f73e1a267d"
-                           "671247273806f293f64c9c8c2adc00a21a12bb92bd9c229048"
-                           "9ba7a733708a4a446de8382220534aa79c69b74866bb09187e"
-                           "eceec880fa1e0fa421b7df8b3289dab603b17c4828053001",
-            "kuaishou.wechat.app_st": "ChZrdWFpc2hvdS53ZWNoYXQuYXBwLnN0ErAB8aO"
-                                      "EcB6jh4CMSJ-p_4BJFCId0PKNa_5IeFfeV_tj7q"
-                                      "CjdXK0y13CSte6-KHbNK9BPo6Rjy3OGny0sh4Zb"
-                                      "5AUl3Q_zqVXe2TunW7_F3nlTdJOdZ6iVIhPrHa1"
-                                      "CM0Y-cG9gS4FDDzTvejfWaTI0CbjfNN0RZXzYVE"
-                                      "AUVT_BNgUVDtYBbEY792gPylMfXxwxKMSzkhaDe"
-                                      "eaHkGCWUj62FGCFYQ9Fw2W3d7suCXFsNylqT4aE"
-                                      "s8oNwmycUiygfvfKuoXlHkbeSIgOhEFMZ3ArImS"
-                                      "vFY_OwLJDHak1iXRO8g5TwzHTvBT3WcoBTAB",
-            "passToken": "ChNwYXNzcG9ydC5wYXNzLXRva2VuEpABI42IhPCJHfFngXC3i-vF"
-                         "3daRTB-EtnAYyE6HpfWcPoZ6VSRDvKrom_RvltQ2zKk1T3_FJteb"
-                         "mv7ZzQLD7IicnTypaGoeflb7KQVrAv50Mp_JL4ObfBu_xTiwI53t"
-                         "bTlM6iML0G7DFd16K5z0jZZ1xECKVQQbk_vIqnseUujFIWAsKcDz"
-                         "BqqfnQNbUU5DzDUkGhKgKyzmNjRDxLfpDU5SPFhJmG0iIGBZ_Vd-"
-                         "7eT8i_Xit9ZPM-zdFpnRZFveFE9iplMg8Z06KAUwAQ",
-            "userId": "2845397958"
-        }
-        json_data = {
-            "thirdPartyUserId": 2845397958,
-            "photoId": "5250352807040393911",
-            "forwardUserId": 2845397958,
-            "count": 10,
-            "portal": 2,
-            "pageType": 2,
-            "needLivestream": "true",
-            "extraRequestInfo": "{\"scene\":1074,\"fid\":\"2845397958\","
-                                "\"sharerUserId\":\"2845397958\",\"curPhotoIndex\":0,"
-                                "\"adShow\":true,\"weChatAd\":{},\"page\":0}",
-            "pcursor": 0,
-            "sourceFrom": 2,
-        }
-
-        try:
-            urllib3.disable_warnings()
-            r = requests.post(url=url, params=params, cookies=cookies, json=json_data, proxies=proxies, verify=False)
-            response = json.loads(r.content.decode("utf8"))
-            if "feeds" not in response:
-                Common.crawler_log().info("获取快手视频 list 出错:{},休眠 10s".format(response))
-                time.sleep(10)
-            else:
-                feeds = response["feeds"]
-                for i in range(len(feeds)):
-                    if "photoId" not in feeds[i]:
-                        photo_id = "0"
-                        Common.crawler_log().info("photo_id:{}".format(photo_id))
-                    else:
-                        photo_id = feeds[i]["photoId"]
-                        Common.crawler_log().info("photo_id:{}".format(photo_id))
-
-                    if "viewCount" not in feeds[i]:
-                        video_play_cnt = "0"
-                        Common.crawler_log().info("video_play_cnt:0")
-                    else:
-                        video_play_cnt = feeds[i]["viewCount"]
-                        Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
-
-                    if "likeCount" not in feeds[i]:
-                        video_like_cnt = "0"
-                        Common.crawler_log().info("video_like_cnt:0")
-                    else:
-                        video_like_cnt = feeds[i]["likeCount"]
-                        Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
-
-                    if "headUrl" not in feeds[i]:
-                        head_url = "0"
-                        Common.crawler_log().info("head_url:不存在")
-                    else:
-                        head_url = feeds[i]["headUrl"]
-                        Common.crawler_log().info("head_url:{}".format(head_url))
-
-                    if len(feeds[i]["coverUrls"]) == 0:
-                        cover_url = "0"
-                        Common.crawler_log().info("cover_url:不存在")
-                    else:
-                        cover_url = feeds[i]["coverUrls"][0]["url"]
-                        Common.crawler_log().info("cover_url:{}".format(cover_url))
-
-                    if len(feeds[i]["mainMvUrls"]) == 0:
-                        video_url = "0"
-                        Common.crawler_log().info("video_url:不存在")
-                    else:
-                        video_url = feeds[i]["mainMvUrls"][0]["url"]
-                        Common.crawler_log().info("video_url:{}".format(video_url))
-
-                    if "shareCount" not in feeds[i]:
-                        video_share_cnt = "0"
-                        Common.crawler_log().info("video_share_cnt:0")
-                    else:
-                        video_share_cnt = feeds[i]["shareCount"]
-                        Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
-
-                    if "width" not in feeds[i] or "height"not in feeds[i]:
-                        video_width = "0"
-                        video_height = "0"
-                        video_resolution = str(video_width) + "*" + str(video_height)
-                        Common.crawler_log().info("无分辨率")
-                    else:
-                        video_width = feeds[i]["width"]
-                        video_height = feeds[i]["height"]
-                        video_resolution = str(video_width) + "*" + str(video_height)
-                        Common.crawler_log().info("video_resolution:{}".format(video_resolution))
-
-                    if "commentCount" not in feeds[i]:
-                        video_comment_cnt = "0"
-                        Common.crawler_log().info("video_comment_cnt:0")
-                    else:
-                        video_comment_cnt = feeds[i]["commentCount"]
-                        Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
-
-                    if "duration" not in feeds[i]:
-                        video_duration = "0"
-                        Common.crawler_log().info("video_duration:不存在")
-                    else:
-                        video_duration = int(int(feeds[i]["duration"])/1000)
-                        Common.crawler_log().info("video_duration:{}秒".format(video_duration))
-
-                    if "timestamp" not in feeds[i]:
-                        video_send_time = "0"
-                        Common.crawler_log().info("video_send_time:不存在")
-                    else:
-                        video_send_time = feeds[i]["timestamp"]
-                        Common.crawler_log().info("video_send_time:{}".format(
-                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)/1000))))
-
-                    user_name = feeds[i]["userName"].strip().replace("\n", "")\
-                        .replace("/", "").replace("快手", "").replace(" ", "")\
-                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")
-                    Common.crawler_log().info("user_name:{}".format(user_name))
-
-                    user_id = feeds[i]["userId"]
-                    Common.crawler_log().info("user_id:{}".format(user_id))
-
-                    # 视频标题过滤话题及处理特殊字符
-                    kuaishou_title = feeds[i]["caption"]
-                    title_split1 = kuaishou_title.split(" #")
-                    if title_split1[0] != "":
-                        title1 = title_split1[0]
-                    else:
-                        title1 = title_split1[-1]
-
-                    title_split2 = title1.split(" #")
-                    if title_split2[0] != "":
-                        title2 = title_split2[0]
-                    else:
-                        title2 = title_split2[-1]
-
-                    title_split3 = title2.split("@")
-                    if title_split3[0] != "":
-                        title3 = title_split3[0]
-                    else:
-                        title3 = title_split3[-1]
-
-                    video_title = title3.strip().replace("\n", "")\
-                        .replace("/", "").replace("快手", "").replace(" ", "")\
-                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")\
-                        .replace("#", "").replace(".", "。").replace("\\", "")\
-                        .replace(":", "").replace("*", "").replace("?", "")\
-                        .replace("?", "").replace('"', "").replace("<", "")\
-                        .replace(">", "").replace("|", "")
-
-                    Common.crawler_log().info("video_title:{}".format(video_title))
-
-                    # 从 kuaishou_videoid.txt 中去重
-                    photo_ids = Common.read_txt("kuaishou_videoid.txt")
-                    if photo_id in [p_id.strip() for p_id in photo_ids]:
-                        Common.crawler_log().info("该视频已下载:{}".format(video_title))
-                        pass
-                    else:
-                        Common.crawler_log().info("该视频未下载:{}".format(video_title))
-
-                        # 从 kuaishou_feeds.txt 中去重
-                        contents = Common.read_txt("kuaishou_feeds.txt")
-                        # kuaishou_feeds.txt 为空时,直接保存
-                        if len(contents) == 0 and head_url != "0" \
-                                and cover_url != "0" and video_url != "0" \
-                                and video_duration != "0" and photo_id != "0":
-                            # 判断敏感词
-                            if any(word if word in kuaishou_title else False
-                                   for word in cls.kuaishou_sensitive_words()) is True:
-                                Common.crawler_log().info("视频已中敏感词:{}".format(kuaishou_title))
-                            else:
-                                basic_time = int(time.time())
-                                Common.crawler_log().info("添加视频信息至kuaishou_feeds.txt:{}".format(video_title))
-                                with open(r"./txt/kuaishou_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                    f_a.write(str(basic_time) + " + " +
-                                              str(photo_id) + " + " +
-                                              str(video_play_cnt) + " + " +
-                                              str(video_title) + " + " +
-                                              str(video_duration) + " + " +
-                                              str(video_comment_cnt) + " + " +
-                                              str(video_like_cnt) + " + " +
-                                              str(video_share_cnt) + " + " +
-                                              str(video_resolution) + " + " +
-                                              str(video_send_time) + " + " +
-                                              str(user_name) + " + " +
-                                              str(head_url) + " + " +
-                                              str(cover_url) + " + " +
-                                              str(video_url) + " + " +
-                                              str(user_id) + " + " +
-                                              str("wxo_b07ba02ad4340205d89b47c76030bb090977") + "\n")
-                        else:
-                            if photo_id in [content.split(" + ")[1] for content in contents]:
-                                Common.crawler_log().info("该视频已在 kuaishou_feeds.txt 中:{}".format(video_title))
-                            elif head_url == "0" or cover_url == "0" \
-                                    or video_url == "0" or video_duration == "0" or photo_id == "0":
-                                Common.crawler_log().info("视频封面/播放地址/播放时长/用户头像不存在")
-                            else:
-                                # 判断敏感词
-                                if any(word if word in kuaishou_title else False
-                                       for word in cls.kuaishou_sensitive_words()) is True:
-                                    Common.crawler_log().info("视频已中敏感词:{}".format(kuaishou_title))
-                                else:
-                                    basic_time = int(time.time())
-                                    Common.crawler_log().info("添加视频信息至kuaishou_feeds.txt:{}".format(video_title))
-                                    with open(r"./txt/kuaishou_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                        f_a.write(str(basic_time) + " + " +
-                                                  str(photo_id) + " + " +
-                                                  str(video_play_cnt) + " + " +
-                                                  str(video_title) + " + " +
-                                                  str(video_duration) + " + " +
-                                                  str(video_comment_cnt) + " + " +
-                                                  str(video_like_cnt) + " + " +
-                                                  str(video_share_cnt) + " + " +
-                                                  str(video_resolution) + " + " +
-                                                  str(video_send_time) + " + " +
-                                                  str(user_name) + " + " +
-                                                  str(head_url) + " + " +
-                                                  str(cover_url) + " + " +
-                                                  str(video_url) + " + " +
-                                                  str(user_id) + " + " +
-                                                  str("wxo_b07ba02ad4340205d89b47c76030bb090977") + "\n")
-        except Exception as e:
-            Common.crawler_log().error("获取视频 list 异常:{}".format(e))
-
-    @classmethod
-    def kuaishou_download_play_video(cls, env):
-        """
-        下载播放量视频
-        测试环境:env == dev
-        正式环境:env == prod
-        """
-        videos = Common.read_txt("kuaishou_feeds.txt")
-        for video in videos:
-            download_photo_id = video.strip().split(" + ")[1]
-            download_video_title = video.strip().split(" + ")[3]
-            download_video_duration = video.strip().split(" + ")[4]
-            download_video_play_cnt = video.strip().split(" + ")[2]
-            download_video_comment_cnt = video.strip().split(" + ")[5]
-            download_video_like_cnt = video.strip().split(" + ")[6]
-            download_video_share_cnt = video.strip().split(" + ")[7]
-            download_video_resolution = video.strip().split(" + ")[8]
-            download_video_width = download_video_resolution.split("*")[0]
-            download_video_height = download_video_resolution.split("*")[-1]
-            download_video_send_time = video.strip().split(" + ")[9]
-            download_user_name = video.strip().split(" + ")[10]
-            download_head_url = video.strip().split(" + ")[11]
-            download_cover_url = video.strip().split(" + ")[12]
-            download_video_url = video.strip().split(" + ")[13]
-            download_video_session = video.strip().split(" + ")[-1]
-
-            if cls.kuaishou_download_rule(download_video_duration,
-                                          download_video_width,
-                                          download_video_height,
-                                          download_video_play_cnt,
-                                          download_video_like_cnt,
-                                          download_video_share_cnt) is True:
-                Common.crawler_log().info("开始下载快手视频:{}".format(download_video_title))
-                # 下载封面
-                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
-                # 下载视频
-                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
-
-                # 保存视频信息至 kuaishou_videoid.txt
-                with open(r"./txt/kuaishou_videoid.txt", "a", encoding="UTF-8") as fa:
-                    fa.write(download_photo_id + "\n")
-
-                # 添加视频 ID 到 list,用于统计当次下载总数
-                cls.download_video_list.append(download_photo_id)
-
-                # # 保存视频信息至 {today}_kuaishou_videoid.txt
-                # with open("./txt/" + str(Common.today) + "_kuaishou_videoid.txt", "a", encoding="UTF-8") as fc:
-                #     fc.write(download_photo_id + "\n")
-
-                # 保存视频信息至 "./videos/{download_video_title}/info.txt"
-                with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
-                    f_a.write(str(download_photo_id) + "\n" +
-                              str(download_video_title) + "\n" +
-                              str(download_video_duration) + "\n" +
-                              str(download_video_play_cnt) + "\n" +
-                              str(download_video_comment_cnt) + "\n" +
-                              str(download_video_like_cnt) + "\n" +
-                              str(download_video_share_cnt) + "\n" +
-                              str(download_video_resolution) + "\n" +
-                              str(download_video_send_time) + "\n" +
-                              str(download_user_name) + "\n" +
-                              str(download_head_url) + "\n" +
-                              str(download_video_url) + "\n" +
-                              str(download_cover_url) + "\n" +
-                              str(download_video_session))
-
-                # 上传视频
-                if env == "dev":
-                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                    Publish.upload_and_publish("dev", "play")
-                elif env == "prod":
-                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                    Publish.upload_and_publish("prod", "play")
-
-                # 删除该视频在kuaishou_feeds.txt中的信息
-                Common.crawler_log().info("删除该视频在kuaishou_feeds.txt中的信息:{}".format(download_video_title))
-                with open(r"./txt/kuaishou_feeds.txt", "r", encoding="UTF-8") as f_r:
-                    lines = f_r.readlines()
-                with open(r"./txt/kuaishou_feeds.txt", "w", encoding="utf-8") as f_w:
-                    for line in lines:
-                        if download_photo_id in line.split(" + ")[1]:
-                            continue
-                        f_w.write(line)
-            else:
-                # 删除该视频在 recommend.txt中的信息
-                Common.crawler_log().info("该视频不满足下载规则,删除在kuaishou_feeds.txt中的信息:{}".format(download_video_title))
-                with open(r"./txt/kuaishou_feeds.txt", "r", encoding="UTF-8") as f_r:
-                    lines = f_r.readlines()
-                with open(r"./txt/kuaishou_feeds.txt", "w", encoding="utf-8") as f_w:
-                    for line in lines:
-                        if download_photo_id in line.split(" + ")[1]:
-                            continue
-                        f_w.write(line)
-
-
-if __name__ == "__main__":
-    kuaishou = KuaiShou()
-    kuaishou.kuaishou_get_recommend()

+ 0 - 346
main/download_weishi.py

@@ -1,346 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/8
-import json
-import time
-
-import requests
-import urllib3
-from main.common import Common
-from main.publish import Publish
-
-proxies = {"http": None, "https": None}
-
-
-class Weishi:
-    @staticmethod
-    def weishi_download_rule(d_duration, d_width, d_height, d_play_cnt):
-        """
-        下载视频的基本规则
-        :param d_duration: 时长
-        :param d_width: 宽
-        :param d_height: 高
-        :param d_play_cnt: 播放量
-        :return: 满足规则,返回 True;反之,返回 False
-        """
-        if 600 >= int(float(d_duration)) >= 60:
-            if int(d_width) >= 720 or int(d_height) >= 720:
-                if int(d_play_cnt) >= 100000:
-                    return True
-                else:
-                    return False
-            return False
-        return False
-
-    @classmethod
-    def get_weishi_recommend(cls):
-        """
-        从微视小程序首页推荐获取视频list:
-            1.在 weishi_videoid.txt 中去重
-            2.在 weishi_feeds.txt 中去重
-            3.添加视频信息到 weishi_feeds.txt
-        """
-        url = "https://api.weishi.qq.com/trpc.weishi.weishi_h5_proxy.weishi_h5_proxy/WxminiGetFeedList"
-        cookies = {
-            "wesee_authtype": "3",
-            "wesee_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
-            "wesee_openkey": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf189e2a5c1d532eeff172bc21cf2"
-                             "6230941ccbc10243a7879e8165ca608c17060de606a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
-            "wesee_personid": "1593522421826902",
-            "wesee_refresh_token": "",
-            "wesee_access_token": "8c3ec202f5d679fb5ee6d9f643640d9a2580ba504612e2d979a881d3169caf18"
-                                  "9e2a5c1d532eeff172bc21cf26230941ccbc10243a7879e8165ca608c17060de6"
-                                  "06a6d08afe0a3abd5250629314f9a99e9d1003b201bf5ec",
-            "wesee_thr_appid": "wx75ee9f19b93e5c46",
-            "wesee_ichid": "8"
-        }
-        json_data = {
-            "req_body": {
-                "requestType": 16,
-                "isrefresh": 0,
-                "isfirst": 0,
-                "attachInfo": "",
-                "scene_id": 22,
-                "requestExt": {
-                    "mini_openid": "oWGa05FrwkuUvT-4n1qGeQuhVsc8",
-                    "notLogin-personid": "1593522421826902"
-                }
-            },
-            "req_header": {
-                "mapExt": "{\"imageSize\":\"480\",\"adaptScene\":\"PicHDWebpLimitScene\"}"
-            }
-        }
-
-        try:
-            urllib3.disable_warnings()
-            r = requests.post(url=url, cookies=cookies, json=json_data, proxies=proxies, verify=False)
-            response = json.loads(r.content.decode("utf8"))
-            if "rsp_body" not in response:
-                Common.crawler_log().info("获取微视视频 list 出错:{},休眠 10s".format(response))
-                time.sleep(10)
-            else:
-                feeds = response["rsp_body"]["feeds"]
-                for i in range(len(feeds)):
-                    if "video" not in feeds[i]:
-                        Common.crawler_log().info("无视频信息")
-                    else:
-                        # 视频 ID
-                        if "id" not in feeds[i]["video"]:
-                            video_id = "0"
-                            Common.crawler_log().info("video_id:{}".format(video_id))
-                        else:
-                            video_id = feeds[i]["video"]["id"]
-                            Common.crawler_log().info("video_id:{}".format(video_id))
-
-                        # 视频标题
-                        video_title = feeds[i]["desc"].strip().replace("\n", "") \
-                            .replace("/", "").replace("快手", "").replace(" ", "") \
-                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
-                        Common.crawler_log().info("video_title:{}".format(video_title))
-
-                        # 视频发布时间
-                        if "createTime" not in feeds[i]:
-                            video_send_time = "0"
-                            Common.crawler_log().info("video_send_time:不存在")
-                        else:
-                            video_send_time = int(feeds[i]["createTime"])*1000
-                            Common.crawler_log().info(
-                                "video_send_time:{}".format(time.strftime(
-                                    "%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time)/1000))))
-
-                        # 视频封面地址
-                        if len(feeds[i]["images"]) == 0:
-                            cover_url = "0"
-                            Common.crawler_log().info("cover_url:不存在")
-                        else:
-                            cover_url = feeds[i]["images"][0]["url"]
-                            Common.crawler_log().info("cover_url:{}".format(cover_url))
-
-                        # 视频播放地址
-                        if "url" not in feeds[i]["video"]:
-                            video_url = "0"
-                            Common.crawler_log().info("video_url:不存在")
-                        else:
-                            video_url = feeds[i]["video"]["url"]
-                            Common.crawler_log().info("video_url:{}".format(video_url))
-
-                        # 视频分辨率
-                        if "width" not in feeds[i]["video"] or "height" not in feeds[i]["video"]:
-                            video_width = "0"
-                            video_height = "0"
-                            video_resolution = str(video_width) + "*" + str(video_height)
-                            Common.crawler_log().info("无分辨率")
-                        else:
-                            video_width = feeds[i]["video"]["width"]
-                            video_height = feeds[i]["video"]["height"]
-                            video_resolution = str(video_width) + "*" + str(video_height)
-                            Common.crawler_log().info("video_resolution:{}".format(video_resolution))
-
-                        # 视频时长
-                        if "duration" not in feeds[i]["video"]:
-                            video_duration = "0"
-                            Common.crawler_log().info("video_duration:不存在")
-                        else:
-                            video_duration = int(int(feeds[i]["video"]["duration"]) / 1000)
-                            Common.crawler_log().info("video_duration:{}秒".format(video_duration))
-
-                        # 播放数
-                        if "playNum" not in feeds[i]["ugcData"]:
-                            video_play_cnt = "0"
-                            Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
-                        else:
-                            video_play_cnt = feeds[i]["ugcData"]["playNum"]
-                            Common.crawler_log().info("video_play_cnt:{}".format(video_play_cnt))
-
-                        # 点赞数
-                        if "dingCount" not in feeds[i]["ugcData"]:
-                            video_like_cnt = "0"
-                            Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
-                        else:
-                            video_like_cnt = feeds[i]["ugcData"]["dingCount"]
-                            Common.crawler_log().info("video_like_cnt:{}".format(video_like_cnt))
-
-                        # 分享数
-                        if "shareNum" not in feeds[i]["ugcData"]:
-                            video_share_cnt = "0"
-                            Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
-                        else:
-                            video_share_cnt = feeds[i]["ugcData"]["shareNum"]
-                            Common.crawler_log().info("video_share_cnt:{}".format(video_share_cnt))
-
-                        # 评论数
-                        if "totalCommentNum" not in feeds[i]["ugcData"]:
-                            video_comment_cnt = "0"
-                            Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
-                        else:
-                            video_comment_cnt = feeds[i]["ugcData"]["totalCommentNum"]
-                            Common.crawler_log().info("video_comment_cnt:{}".format(video_comment_cnt))
-
-                        # 用户 ID
-                        user_id = feeds[i]["poster"]["id"]
-                        Common.crawler_log().info("user_id:{}".format(user_id))
-
-                        # 用户昵称
-                        user_name = feeds[i]["poster"]["nick"].strip().replace("\n", "") \
-                            .replace("/", "").replace("快手", "").replace(" ", "") \
-                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
-                        Common.crawler_log().info("user_name:{}".format(user_name))
-
-                        # 用户头像地址
-                        if "thumbURL" not in feeds[i]["material"] and "avatar" not in feeds[i]["poster"]:
-                            head_url = "0"
-                            Common.crawler_log().info("head_url:不存在")
-                        elif "thumbURL" in feeds[i]["material"]:
-                            head_url = feeds[i]["material"]["thumbURL"]
-                            Common.crawler_log().info("head_url:{}".format(head_url))
-                        else:
-                            head_url = feeds[i]["poster"]["avatar"]
-                            Common.crawler_log().info("head_url:{}".format(head_url))
-
-                        # 从 weishi_videoid.txt 中去重
-                        videos_ids = Common.read_txt("weishi_videoid.txt")
-                        if video_id in [v_id.strip() for v_id in videos_ids]:
-                            Common.crawler_log().info("该视频已下载:{}".format(video_title))
-                            pass
-                        else:
-                            Common.crawler_log().info("该视频未下载:{}".format(video_title))
-
-                            # 从 weishi_feeds.txt 中去重
-                            contents = Common.read_txt("weishi_feeds.txt")
-                            # 若 weishi_feeds.txt 为空时,直接保存
-                            if len(contents) == 0 and head_url != "0" \
-                                    and cover_url != "0" and video_url != "0" \
-                                    and video_duration != "0" and video_id != "0":
-                                basic_time = int(time.time())
-                                Common.crawler_log().info("添加视频信息至weishi_feeds.txt:{}".format(video_title))
-                                with open(r"./txt/weishi_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                    f_a.write(str(basic_time) + " + " +
-                                              str(video_id) + " + " +
-                                              str(video_play_cnt) + " + " +
-                                              str(video_title) + " + " +
-                                              str(video_duration) + " + " +
-                                              str(video_comment_cnt) + " + " +
-                                              str(video_like_cnt) + " + " +
-                                              str(video_share_cnt) + " + " +
-                                              str(video_resolution) + " + " +
-                                              str(video_send_time) + " + " +
-                                              str(user_name) + " + " +
-                                              str(head_url) + " + " +
-                                              str(cover_url) + " + " +
-                                              str(video_url) + " + " +
-                                              str(user_id) + " + " +
-                                              str("oWGa05FrwkuUvT-4n1qGeQuhVsc8") + "\n")
-                            else:
-                                if video_id in [content.split(" + ")[1] for content in contents]:
-                                    Common.crawler_log().info("该视频已在 weishi_feeds.txt 中:{}".format(video_title))
-                                elif head_url == "0" or cover_url == "0" \
-                                        or video_url == "0" or video_duration == "0" or video_id == "0":
-                                    Common.crawler_log().info("视频封面/播放地址/播放时长/用户头像不存在")
-                                else:
-                                    basic_time = int(time.time())
-                                    Common.crawler_log().info("添加视频信息至weishi_feeds.txt:{}".format(video_title))
-                                    with open(r"./txt/weishi_feeds.txt", "a", encoding="UTF-8") as f_a:
-                                        f_a.write(str(basic_time) + " + " +
-                                                  str(video_id) + " + " +
-                                                  str(video_play_cnt) + " + " +
-                                                  str(video_title) + " + " +
-                                                  str(video_duration) + " + " +
-                                                  str(video_comment_cnt) + " + " +
-                                                  str(video_like_cnt) + " + " +
-                                                  str(video_share_cnt) + " + " +
-                                                  str(video_resolution) + " + " +
-                                                  str(video_send_time) + " + " +
-                                                  str(user_name) + " + " +
-                                                  str(head_url) + " + " +
-                                                  str(cover_url) + " + " +
-                                                  str(video_url) + " + " +
-                                                  str(user_id) + " + " +
-                                                  str("oWGa05FrwkuUvT-4n1qGeQuhVsc8") + "\n")
-        except Exception as e:
-            Common.crawler_log().error("获取微视视频 list 异常:{}".format(e))
-
-    @classmethod
-    def download_weishi_play_video(cls, env):
-        """
-        下载播放量视频
-        测试环境:env == dev
-        正式环境:env == prod
-        """
-        videos = Common.read_txt("weishi_feeds.txt")
-        for video in videos:
-            download_video_id = video.strip().split(" + ")[1]
-            download_video_title = video.strip().split(" + ")[3]
-            download_video_duration = video.strip().split(" + ")[4]
-            download_video_play_cnt = video.strip().split(" + ")[2]
-            download_video_comment_cnt = video.strip().split(" + ")[5]
-            download_video_like_cnt = video.strip().split(" + ")[6]
-            download_video_share_cnt = video.strip().split(" + ")[7]
-            download_video_resolution = video.strip().split(" + ")[8]
-            download_video_width = download_video_resolution.split("*")[0]
-            download_video_height = download_video_resolution.split("*")[-1]
-            download_video_send_time = video.strip().split(" + ")[9]
-            download_user_name = video.strip().split(" + ")[10]
-            download_head_url = video.strip().split(" + ")[11]
-            download_cover_url = video.strip().split(" + ")[12]
-            download_video_url = video.strip().split(" + ")[13]
-            download_video_session = video.strip().split(" + ")[-1]
-
-            if cls.weishi_download_rule(download_video_duration, download_video_width,
-                                        download_video_height, download_video_play_cnt) is True:
-                Common.crawler_log().info("开始下载视频:{}".format(download_video_title))
-                # 下载封面
-                Common.download_method(text="cover", d_name=download_video_title, d_url=download_cover_url)
-                # 下载视频
-                Common.download_method(text="video", d_name=download_video_title, d_url=download_video_url)
-                # 保存视频信息至 weishi_videoid.txt
-                with open(r"./txt/weishi_videoid.txt", "a", encoding="UTF-8") as fa:
-                    fa.write(download_video_id + "\n")
-                # 保存视频信息至 "./videos/{download_video_title}/info.txt"
-                with open(r"./videos/" + download_video_title + "/info.txt", "a", encoding="UTF-8") as f_a:
-                    f_a.write(str(download_video_id) + "\n" +
-                              str(download_video_title) + "\n" +
-                              str(download_video_duration) + "\n" +
-                              str(download_video_play_cnt) + "\n" +
-                              str(download_video_comment_cnt) + "\n" +
-                              str(download_video_like_cnt) + "\n" +
-                              str(download_video_share_cnt) + "\n" +
-                              str(download_video_resolution) + "\n" +
-                              str(download_video_send_time) + "\n" +
-                              str(download_user_name) + "\n" +
-                              str(download_head_url) + "\n" +
-                              str(download_video_url) + "\n" +
-                              str(download_cover_url) + "\n" +
-                              str(download_video_session))
-
-                # 上传视频
-                if env == "dev":
-                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                    Publish.upload_and_publish("dev", "play")
-                elif env == "prod":
-                    Common.crawler_log().info("开始上传视频:{}".format(download_video_title))
-                    Publish.upload_and_publish("prod", "play")
-
-                # 删除该视频在weishi_feeds.txt中的信息
-                Common.crawler_log().info("删除该视频在weishi_feeds.txt中的信息:{}".format(download_video_title))
-                with open(r"./txt/weishi_feeds.txt", "r", encoding="UTF-8") as f_r:
-                    lines = f_r.readlines()
-                with open(r"./txt/weishi_feeds.txt", "w", encoding="utf-8") as f_w:
-                    for line in lines:
-                        if download_video_id in line.split(" + ")[1]:
-                            continue
-                        f_w.write(line)
-            else:
-                # 删除该视频在weishi_feeds.txt中的信息
-                Common.crawler_log().info("该视频不满足下载规则,删除在weishi_feeds.txt中的信息:{}".format(download_video_title))
-                with open(r"./txt/weishi_feeds.txt", "r", encoding="UTF-8") as f_r:
-                    lines = f_r.readlines()
-                with open(r"./txt/weishi_feeds.txt", "w", encoding="utf-8") as f_w:
-                    for line in lines:
-                        if download_video_id in line.split(" + ")[1]:
-                            continue
-                        f_w.write(line)
-
-
-if __name__ == "__main__":
-    weishi = Weishi()
-    weishi.get_weishi_recommend()

+ 432 - 0
main/recommend.py

@@ -0,0 +1,432 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/4/8
+import json
+import os
+import sys
+import time
+import requests
+import urllib3
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.feishu_lib import Feishu
+from main.publish import Publish
+proxies = {"http": None, "https": None}
+
+
class DownloadRecommend:
    """Crawler for the Weishi mini-program "recommend" feed.

    Workflow:
      1. ``get_feeds`` pulls one page of the recommend feed and appends
         qualifying videos to the Feishu "feeds" sheet (O7fCzr).
      2. ``download_publish`` takes one row from that sheet, downloads and
         uploads the video, then archives its ID in the "downloaded"
         sheet (caa3fa) for de-duplication.

    All credentials/config and crawl state live in Feishu spreadsheets.
    """

    # Mini-program session credentials, read from the Feishu config sheet.
    # NOTE(review): these calls run at import time; if Feishu is unreachable
    # the module fails to import — consider loading lazily.
    Referer = Feishu.get_range_value("recommend", "9fTK1f", "C3:C3")[0]
    wesee_openid = Feishu.get_range_value("recommend", "9fTK1f", "C4:C4")[0]
    wesee_openkey = Feishu.get_range_value("recommend", "9fTK1f", "C5:C5")[0]
    wesee_personid = Feishu.get_range_value("recommend", "9fTK1f", "C6:C6")[0]
    wesee_access_token = Feishu.get_range_value("recommend", "9fTK1f", "C7:C7")[0]
    wesee_thr_appid = Feishu.get_range_value("recommend", "9fTK1f", "C8:C8")[0]

    # 过滤词库
    @classmethod
    def sensitive_words(cls):
        """Return all sensitive words from the Feishu word sheet (2Oxf8C).

        Empty cells are skipped. Each call performs one HTTP request, so
        hoist the result out of loops.
        """
        rows = Feishu.get_values_batch("recommend", "2Oxf8C")
        return [cell for row in rows for cell in row if cell is not None]

    # 抓取基础规则
    @staticmethod
    def download_rule(d_duration, d_width, d_height, d_play_cnt, d_like_cnt, d_share_cnt):
        """Basic qualification rule for downloading a video.

        :param d_duration: duration in seconds
        :param d_width: width in pixels
        :param d_height: height in pixels
        :param d_play_cnt: play count
        :param d_like_cnt: like count
        :param d_share_cnt: share count
        :return: True when the video qualifies, otherwise False
        """
        # Flattened form of the original nested ifs: at least 30s long and
        # at least one side >= 720px. The count thresholds are currently 0,
        # i.e. effectively disabled, but kept for easy tuning.
        return (
            int(float(d_duration)) >= 30
            and (int(d_width) >= 720 or int(d_height) >= 720)
            and int(d_play_cnt) >= 0
            and int(d_like_cnt) >= 0
            and int(d_share_cnt) >= 0
        )

    @staticmethod
    def _before_sep(text, sep):
        """Return the part of *text* before the first *sep*.

        When *text* starts with *sep* (empty head), return the LAST
        fragment instead — this mirrors the original topic-stripping
        behaviour exactly.
        """
        parts = text.split(sep)
        return parts[0] if parts[0] != "" else parts[-1]

    # 抓取列表
    @classmethod
    def get_feeds(cls):
        """Fetch one page of the recommend feed and stage qualifying videos.

        1. Request the Weishi mini-program recommend feed.
        2. De-duplicate against the "downloaded" sheet:
           https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?sheet=caa3fa
        3. De-duplicate against the "feeds" sheet:
           https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?sheet=O7fCzr
        4. Insert the remaining videos at the top of the feeds sheet.
        """
        url = "https://api.weishi.qq.com/trpc.weishi.weishi_h5_proxy.weishi_h5_proxy/WxminiGetFeedList"
        headers = {
            "content-type": "application/json",
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)"
                          " AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
                          " MicroMessenger/8.0.20(0x18001442) NetType/WIFI Language/zh_CN",
            "Referer": str(cls.Referer)
        }
        cookies = {
            "wesee_authtype": "3",
            "wesee_openid": str(cls.wesee_openid),
            "wesee_openkey": str(cls.wesee_openkey),
            "wesee_personid": str(cls.wesee_personid),
            "wesee_refresh_token": "",
            "wesee_access_token": str(cls.wesee_access_token),
            "wesee_thr_appid": str(cls.wesee_thr_appid),
            "wesee_ichid": "8"
        }
        json_data = {
            "req_body": {
                "requestType": 16,
                "isrefresh": 1,
                "isfirst": 1,
                "attachInfo": "",
                "scene_id": 22,
                "requestExt": {
                    "mini_openid": str(cls.wesee_openid),
                    "notLogin-personid": str(cls.wesee_personid)
                }
            },
            "req_header": {
                "mapExt": "{\"imageSize\":\"480\",\"adaptScene\":\"PicHDWebpLimitScene\"}"
            }
        }

        try:
            urllib3.disable_warnings()
            r = requests.post(headers=headers, url=url, cookies=cookies, json=json_data,
                              proxies=proxies, verify=False)
            response = json.loads(r.content.decode("utf8"))
            feeds = response["rsp_body"]["feeds"]
            # One word-list fetch per page instead of one per video.
            word_list = cls.sensitive_words()
            for feed in feeds:
                video_info = feed["video"]
                ugc_data = feed["ugcData"]

                # Strip topic tags ("#...") and @-mentions from the title.
                weishi_title = feed["desc"]
                title1 = cls._before_sep(weishi_title, " #")
                # NOTE(review): the second pass also splits on " #", identical
                # to the first — possibly intended to be "#". Kept as-is to
                # preserve behaviour.
                title2 = cls._before_sep(title1, " #")
                title3 = cls._before_sep(title2, "@")
                # Remove characters that break file paths / downstream systems.
                video_title = title3.strip().replace("\n", "") \
                    .replace("/", "").replace("快手", "").replace(" ", "") \
                    .replace(" ", "").replace("&NBSP", "").replace("\r", "") \
                    .replace("#", "").replace(".", "。").replace("\\", "") \
                    .replace(":", "").replace("*", "").replace("?", "") \
                    .replace("?", "").replace('"', "").replace("<", "") \
                    .replace(">", "").replace("|", "").replace("微视", "")

                # Missing fields default to 0 and are filtered out below.
                video_id = video_info.get("id", 0)
                video_play_cnt = ugc_data.get("playNum", 0)
                video_like_cnt = ugc_data.get("dingCount", 0)
                video_share_cnt = ugc_data.get("shareNum", 0)
                video_comment_cnt = ugc_data.get("totalCommentNum", 0)

                # Duration is reported in milliseconds.
                if "duration" in video_info:
                    video_duration = int(int(video_info["duration"]) / 1000)
                else:
                    video_duration = 0

                if "width" in video_info and "height" in video_info:
                    video_width = video_info["width"]
                    video_height = video_info["height"]
                else:
                    video_width = 0
                    video_height = 0
                video_resolution = str(video_width) + "*" + str(video_height)

                # Publish time, converted from seconds to milliseconds.
                if "createTime" in feed:
                    video_send_time = int(feed["createTime"]) * 1000
                else:
                    video_send_time = 0

                user_name = feed["poster"]["nick"].strip().replace("\n", "") \
                    .replace("/", "").replace("快手", "").replace(" ", "") \
                    .replace(" ", "").replace("&NBSP", "").replace("\r", "").replace("微视", "")
                user_id = feed["poster"]["id"]

                # Prefer the material thumbnail, fall back to the avatar.
                if "thumbURL" in feed["material"]:
                    head_url = feed["material"]["thumbURL"]
                elif "avatar" in feed["poster"]:
                    head_url = feed["poster"]["avatar"]
                else:
                    head_url = 0

                cover_url = feed["images"][0]["url"] if len(feed["images"]) != 0 else 0
                video_url = video_info.get("url", 0)

                Common.logger("recommend").info("video_title:{}".format(video_title))
                Common.logger("recommend").info("video_id:{}".format(video_id))
                Common.logger("recommend").info("video_play_cnt:{}".format(video_play_cnt))
                Common.logger("recommend").info("video_like_cnt:{}".format(video_like_cnt))
                Common.logger("recommend").info("video_share_cnt:{}".format(video_share_cnt))
                Common.logger("recommend").info("video_duration:{}秒".format(video_duration))
                Common.logger("recommend").info(
                    "video_send_time:{}".format(time.strftime(
                        "%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time) / 1000))))
                Common.logger("recommend").info("user_name:{}".format(user_name))
                Common.logger("recommend").info("video_url:{}".format(video_url))

                # Skip videos missing any essential field.
                if video_id == 0 or video_duration == 0 or video_send_time == 0 \
                        or head_url == 0 or cover_url == 0 or video_url == 0:
                    Common.logger("recommend").info("无效视频")
                # Basic qualification rule.
                elif cls.download_rule(video_duration, video_width, video_height,
                                       video_play_cnt, video_like_cnt, video_share_cnt) is False:
                    Common.logger("recommend").info("不满足基础规则")
                # Sensitive-word filter; empty words are ignored, matching the
                # original `any(word if word in ... else False ...)` check.
                elif any(word and word in weishi_title for word in word_list):
                    Common.logger("recommend").info("视频已中敏感词:{}".format(weishi_title))
                # De-dup against the "downloaded" sheet (caa3fa).
                elif video_id in [cell for row in Feishu.get_values_batch("recommend", "caa3fa") for cell in row]:
                    Common.logger("recommend").info("该视频已下载:{}", video_title)
                # De-dup against the "feeds" sheet (O7fCzr).
                elif video_id in [cell for row in Feishu.get_values_batch("recommend", "O7fCzr") for cell in row]:
                    Common.logger("recommend").info("该视频已在feeds中:{}", video_title)
                else:
                    Common.logger("recommend").info("该视频未下载,添加至feeds中:{}".format(video_title))
                    # Insert a fresh first row, then write into it; the sleeps
                    # keep us under the Feishu API rate limit.
                    time.sleep(1)
                    Feishu.insert_columns("recommend", "O7fCzr", "ROWS", 1, 2)
                    get_feeds_time = int(time.time())
                    values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(get_feeds_time))),
                               "推荐榜",
                               video_id,
                               video_title,
                               video_play_cnt,
                               video_comment_cnt,
                               video_like_cnt,
                               video_share_cnt,
                               video_duration,
                               video_resolution,
                               time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time / 1000))),
                               user_name,
                               user_id,
                               head_url,
                               cover_url,
                               video_url]]
                    time.sleep(1)
                    Feishu.update_values("recommend", "O7fCzr", "A2:P2", values)
        except Exception as e:
            Common.logger("recommend").error("获取微视视频list异常:{}".format(e))

    # 下载/上传视频
    @classmethod
    def download_publish(cls):
        """Download and publish at most ONE video from the feeds sheet.

        Walks the data rows of the feeds sheet (O7fCzr) and, for the first
        actionable row, either drops it (blank row / share count < 1000 /
        already downloaded) or downloads + uploads the video, archives it
        in the "downloaded" sheet (caa3fa) and removes it from feeds.
        Returns after handling a single row because sheet row indices shift
        once a row is deleted.
        """
        try:
            # Fetch the sheet ONCE. The original re-fetched it for every
            # cell (16+ HTTP calls per row) and iterated up to
            # len(...) + 1, which raised IndexError past the last row and
            # tripped the destructive except handler below.
            feeds_sheet = Feishu.get_values_batch("recommend", "O7fCzr")
            for i in range(1, len(feeds_sheet)):  # row 0 is the header
                time.sleep(1)
                row = feeds_sheet[i]
                download_video_id = row[2]
                download_video_title = row[3]
                download_video_play_cnt = row[4]
                download_video_comment_cnt = row[5]
                download_video_like_cnt = row[6]
                download_video_share_cnt = row[7]
                download_video_duration = row[8]
                download_video_resolution = row[9]
                download_video_send_time = row[10]
                download_user_name = row[11]
                download_user_id = row[12]
                download_head_url = row[13]
                download_cover_url = row[14]
                download_video_url = row[15]

                Common.logger("recommend").info("正在判断第{}行,视频:{}", i, download_video_title)

                # Blank row: delete it and stop this run.
                if download_video_id is None \
                        or download_video_id == "" \
                        or download_video_title is None \
                        or download_video_title == "":
                    Common.logger("recommend").warning("空行,删除")
                    # Sheet rows are 1-based and row 1 is the header, hence i + 1.
                    Feishu.dimension_range("recommend", "O7fCzr", "ROWS", i + 1, i + 1)
                    return
                # Require share count >= 1000.
                elif int(download_video_share_cnt) < 1000:
                    Common.logger("recommend").info("分享量:{} < 1000", download_video_share_cnt)
                    Feishu.dimension_range("recommend", "O7fCzr", "ROWS", i + 1, i + 1)
                    return
                # Already downloaded: drop the duplicate row.
                elif download_video_id in [cell for r in Feishu.get_values_batch("recommend", "caa3fa") for cell in r]:
                    Common.logger("recommend").info("该视频已下载:{}", download_video_title)
                    Feishu.dimension_range("recommend", "O7fCzr", "ROWS", i + 1, i + 1)
                    return
                else:
                    Common.logger("recommend").info("开始下载视频:{}", download_video_title)
                    # Download cover image, then the video itself.
                    Common.download_method(job="recommend", text="cover",
                                           d_name=str(download_video_title), d_url=str(download_cover_url))
                    Common.download_method(job="recommend", text="video",
                                           d_name=str(download_video_title), d_url=str(download_video_url))
                    # Persist metadata next to the downloaded files for Publish.
                    with open("./videos/" + download_video_title
                              + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
                        f_a.write(str(download_video_id) + "\n" +
                                  str(download_video_title) + "\n" +
                                  str(download_video_duration) + "\n" +
                                  str(download_video_play_cnt) + "\n" +
                                  str(download_video_comment_cnt) + "\n" +
                                  str(download_video_like_cnt) + "\n" +
                                  str(download_video_share_cnt) + "\n" +
                                  str(download_video_resolution) + "\n" +
                                  str(int(time.mktime(
                                      time.strptime(download_video_send_time, "%Y/%m/%d %H:%M:%S")))) + "\n" +
                                  str(download_user_name) + "\n" +
                                  str(download_head_url) + "\n" +
                                  str(download_video_url) + "\n" +
                                  str(download_cover_url) + "\n" +
                                  str(cls.wesee_access_token))
                    Common.logger("recommend").info("==========视频信息已保存至info.txt==========")

                    Common.logger("recommend").info("开始上传视频:{}".format(download_video_title))
                    Publish.upload_and_publish("recommend", "prod", "play")

                    # Archive the video ID in the "downloaded" sheet for dedup:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?sheet=caa3fa
                    Common.logger("recommend").info("保存视频ID至云文档:{}", download_video_title)
                    Feishu.insert_columns("recommend", "caa3fa", "ROWS", 1, 2)
                    upload_time = int(time.time())
                    values = [[str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time))),
                               "推荐榜",
                               str(download_video_id),
                               str(download_video_title),
                               download_video_play_cnt,
                               download_video_comment_cnt,
                               download_video_like_cnt,
                               download_video_share_cnt,
                               download_video_duration,
                               str(download_video_resolution),
                               str(download_video_send_time),
                               str(download_user_name),
                               str(download_user_id),
                               str(download_head_url),
                               str(download_cover_url),
                               str(download_video_url)]]
                    time.sleep(1)
                    # NOTE(review): 16 values written into the 17-column range
                    # A2:Q2 — confirm the Feishu API tolerates the mismatch.
                    Feishu.update_values("recommend", "caa3fa", "A2:Q2", values)

                    # Remove the processed row from the feeds sheet.
                    Feishu.dimension_range("recommend", "O7fCzr", "ROWS", i + 1, i + 1)
                    return
        except Exception as e:
            Common.logger("recommend").error("下载/上传视频异常:{}", e)
            # Drop the top data row so one poisoned row cannot wedge the job
            # forever. NOTE(review): this also discards rows on transient
            # errors (network, rate limit) — consider retrying instead.
            Feishu.dimension_range("recommend", "O7fCzr", "ROWS", 2, 2)
+
+
+if __name__ == "__main__":
+    weishi = DownloadRecommend()
+    for n in range(2):
+        Common.logger("recommend").info("正在抓取第{}页视频", n + 1)
+        weishi.get_feeds()
+
+    # print(weishi.Referer)
+    # print(weishi.wesee_openid)
+    # print(weishi.wesee_openkey)
+    # print(weishi.wesee_personid)
+    # print(weishi.wesee_access_token)
+    # print(weishi.wesee_thr_appid)
+    # print(weishi.json_text)

+ 0 - 138
main/run.py

@@ -1,138 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/3/30
-import os
-import random
-import sys
-import time
-from apscheduler.schedulers.blocking import BlockingScheduler
-sys.path.append(os.getcwd())
-from main.common import Common
-from main.download_weishi import Weishi
-from main.download_kuaishou import KuaiShou
-
-
-def kuaishou_dev_job():
-    """
-    执行测试环境快手脚本
-    """
-    while True:
-        # 当天下载及上传的视频数:20 条
-        if len(KuaiShou.download_video_list) >= 20:
-            time.sleep(60)
-            break
-        else:
-            Common.crawler_log().info("开始抓取快手视频")
-            time.sleep(1)
-
-            # 抓取符合规则的视频,写入 kuaishou_feeds.txt
-            KuaiShou.kuaishou_get_recommend()
-            # 下载视频,并上传
-            KuaiShou.kuaishou_download_play_video("dev")
-            # 随机睡眠1-3s
-            time.sleep(random.randint(1, 3))
-
-    # 删除冗余日志
-    Common.del_logs()
-    # 统计下载视频数
-    Common.kuaishou_download_count()
-
-
-def weishi_dev_job():
-    """
-    执行测试环境微视脚本
-    """
-    while True:
-        if 14 >= Common.now.hour >= 5:
-            Common.crawler_log().info("结束抓取及上传任务")
-            break
-        else:
-            # 抓取符合规则的视频,写入 weishi_feeds.txt
-            Weishi.get_weishi_recommend()
-            # 下载视频,并上传
-            Weishi.download_weishi_play_video("dev")
-            # 随机睡眠1-3s
-            time.sleep(random.randint(1, 3))
-
-    # 删除冗余日志
-    Common.del_logs()
-    # 统计下载视频数
-    Common.weishi_download_count()
-
-
-def main_dev():
-    """
-    测试环境主函数
-    """
-    scheduler = BlockingScheduler(timezone="Asia/Shanghai")
-    # 抓取视频的定时任务,在每天10点的40分,运行一次 job 方法
-    scheduler.add_job(kuaishou_dev_job, 'cron', hour=19, minute=10, misfire_grace_time=60)
-    # 开始运行脚本
-    scheduler.start()
-
-
-def weishi_prod_job():
-    """
-    执行正式环境微视脚本
-    """
-    while True:
-        if 20 >= Common.now.hour >= 5:
-            Common.crawler_log().info("结束抓取微视视频任务")
-            break
-        else:
-            # 抓取符合规则的视频,写入 weishi_feeds.txt
-            Weishi.get_weishi_recommend()
-            # 下载视频,并上传
-            Weishi.download_weishi_play_video("prod")
-            # 随机睡眠1-3s
-            time.sleep(random.randint(1, 3))
-
-    # 删除冗余日志
-    Common.del_logs()
-    # 统计下载视频数
-    Common.weishi_download_count()
-
-
-def kuaishou_prod_job():
-    """
-    执行正式环境快手脚本
-    """
-    # while True:
-    #     # 当天下载及上传的视频数:200 条
-    #     if len(KuaiShou.download_video_list) >= 200:
-    #         time.sleep(60)
-    #         break
-    #     else:
-
-    Common.crawler_log().info("开始抓取快手视频")
-
-    # 抓取符合规则的视频,写入 kuaishou_feeds.txt
-    KuaiShou.kuaishou_get_recommend()
-    # 下载视频,并上传
-    KuaiShou.kuaishou_download_play_video("prod")
-    # 随机睡眠1-3s
-    time.sleep(random.randint(1, 3))
-
-    # 删除冗余日志
-    Common.del_logs()
-    # 统计下载视频数
-    Common.kuaishou_download_count()
-
-
-def main_prod():
-    """
-    正式环境主函数
-    """
-    while True:
-        kuaishou_prod_job()
-
-    # scheduler = BlockingScheduler(timezone="Asia/Shanghai")
-    # # 抓取视频的定时任务,在每天10点的40分,运行一次 job 方法
-    # scheduler.add_job(kuaishou_prod_job, 'cron', hour=8, minute=00, misfire_grace_time=60)
-    # # 开始运行脚本
-    # scheduler.start()
-
-
-if __name__ == "__main__":
-    # main_dev()
-    main_prod()

+ 79 - 0
main/run_recommend.py

@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/3/30
+import datetime
+import os
+import random
+import sys
+import time
+from apscheduler.schedulers.blocking import BlockingScheduler
+sys.path.append(os.getcwd())
+from main.common import Common
+from main.download_recommend import DownloadRecommend
+
+
+def weishi_dev_job():
+    """
+    执行测试环境微视脚本
+    """
+    while True:
+        weishi_dev_time = datetime.datetime.now()
+        if weishi_dev_time.hour >= 20 or weishi_dev_time.hour <= 10:
+            # 抓取符合规则的视频,写入 weishi_feeds.txt
+            DownloadRecommend.get_weishi_recommend()
+            # 下载视频,并上传
+            DownloadRecommend.download_weishi_play_video("dev")
+            # 随机睡眠1-3s
+            time.sleep(random.randint(1, 3))
+        else:
+            Common.crawler_log().info("结束抓取及上传任务")
+            break
+
+    # 删除冗余日志
+    Common.del_logs()
+    # 统计下载视频数
+    Common.weishi_download_count()
+
+
+def main_dev():
+    """
+    测试环境主函数
+    """
+    while True:
+        # Common.crawler_log().info("开始抓取微视视频")
+        weishi_dev_job()
+
+
+def weishi_prod_job():
+    """
+    执行正式环境微视脚本
+    """
+    while True:
+        if 20 >= Common.now.hour >= 5:
+            Common.crawler_log().info("结束抓取微视视频任务")
+            break
+        else:
+            # 抓取符合规则的视频,写入 weishi_feeds.txt
+            DownloadRecommend.get_weishi_recommend()
+            # 下载视频,并上传
+            DownloadRecommend.download_weishi_play_video("prod")
+            # 随机睡眠1-3s
+            time.sleep(random.randint(1, 3))
+
+    # 删除冗余日志
+    Common.del_logs()
+    # 统计下载视频数
+    Common.weishi_download_count()
+
+
+def main_prod():
+    """
+    正式环境主函数
+    """
+    while True:
+        weishi_prod_job()
+
+
+if __name__ == "__main__":
+    main_dev()
+    # main_prod()

+ 0 - 3
txt/__init__.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/3/30

+ 0 - 0
txt/kuaishou_feeds.txt


+ 0 - 0
txt/kuaishou_videoid.txt


+ 0 - 0
txt/weishi_feeds.txt


+ 0 - 0
txt/weishi_videoid.txt


+ 0 - 19
抓取规则.txt

@@ -1,19 +0,0 @@
-==========2022/4/15===========
-一、按照数据指标抓取
-1、任务开始时间:
-- 每天早上8点-晚上21点
-2、抓取规则:
-  - 视频播放量点赞量5万+ ,分享量2000+
-  - 视频时长1分钟以上,10分钟以下
-  - 视频分辨率720以上
-  - 站内标题=快手视频原标题 (需要过滤掉标题中的话题#  #)
-  - 站内封面图=快手视频原封面图
-3、站内承接:
-- 每日入库200条视频
-- 视频随机分配到10个虚拟账号。
-4、特别注意:
-- 视频需要排重,已经抓取过得视频,不要重复抓取
-- 需要对视频库进行持续扫描:如1条视频上周未达到5万+点赞,本周达到了5万点赞,则进行抓取。
-5、新增爬虫视频标题过滤词
--  集结吧光合创作者、电影解说、快来露两手、分享家常美食教程、光合作者助手、创作者中心、创作者学院、娱乐星熠计划、解说电影、电影剪辑、放映室、老剧、影视剪辑、精彩片段、冬日影娱大作战、春日追剧计划单、影视解说、中视频影视混剪计划、众志成城共抗疫情、我在追好剧、娱乐星灿计划、电影、电视剧、毛泽东、毛主席、周恩来、林彪、习近平、习大大、彭丽媛、怀旧经典影视
-==============================