# -*- coding: utf-8 -*-
# Source path: Server/crawler_xiaoniangao
# @Author: wangkun
# @Time: 2022/7/1
import os
import sys
import time
import requests
import urllib3
sys.path.append(os.getcwd())
from main.common import Common
from main.feishu_lib import Feishu
# Proxy mapping handed to requests.post in this module; both schemes are None.
# NOTE(review): presumably meant to bypass any configured system proxy — confirm.
proxies = {"http": None, "https": None}


class PlayList:
    # WeChat request configuration.
    # The values are read once, when the class body executes, from sheet
    # "dzcWHw" via Feishu.get_values_batch; column index 1 of rows 2-6
    # (0-based) supplies each setting used for the request headers/payload.
    # NOTE(review): this triggers a Feishu call at import time and raises
    # IndexError if the sheet has fewer rows/columns — confirm that is intended.
    wechat_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "dzcWHw")
    hour_x_b3_traceid = wechat_sheet[2][1]
    hour_x_token_id = wechat_sheet[3][1]
    hour_referer = wechat_sheet[4][1]
    hour_uid = wechat_sheet[5][1]
    hour_token = wechat_sheet[6][1]

    # Sensitive-word filter
    @classmethod
    def sensitive_words(cls, log_type):
        """
        Load the sensitive-word list from the cloud document.

        Reads sheet "DRAnZh" through ``Feishu.get_values_batch`` and flattens
        every row into a single list, dropping cells whose value is ``None``.

        :param log_type: forwarded unchanged to ``Feishu.get_values_batch``
        :return: flat list of all non-``None`` cell values, in sheet order
        """
        # Wait one second before reading the cloud document.
        time.sleep(1)
        sheet_rows = Feishu.get_values_batch(log_type, "xiaoniangao", "DRAnZh")
        # Flatten row by row, skipping empty (None) cells.
        return [cell for row in sheet_rows for cell in row if cell is not None]

    # 视频ID过滤字母
    @classmethod
    def sensitive_videoid_words(cls):
        # 字母列表
        words_list = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
                      "t", "u", "v", "w", "x", "y", "z",
                      "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S",
                      "T", "U", "V", "W", "X", "Y", "Z"]
        return words_list

    # 基础门槛规则
    @staticmethod
    def download_rule(d_duration, d_width, d_height, d_play_cnt, d_like_cnt, d_share_cnt, d_send_time):
        """
        下载视频的基本规则
        :param d_duration: 时长
        :param d_width: 宽
        :param d_height: 高
        :param d_play_cnt: 播放量
        :param d_like_cnt: 点赞量
        :param d_share_cnt: 分享量
        :param d_send_time: 发布时间
        :return: 满足规则，返回 True；反之，返回 False
        """
        # 视频时长
        if int(float(d_duration)) >= 40:
            # 宽或高
            if int(d_width) >= 0 or int(d_height) >= 0:
                # 播放量
                if int(d_play_cnt) >= 100000:
                    # 点赞量
                    if int(d_like_cnt) >= 0:
                        # 分享量
                        if int(d_share_cnt) >= 0:
                            # 发布时间 <= 7 天
                            if int(time.time()) - int(d_send_time) / 1000 <= 604800:
                                return True
                            else:
                                return False
                        else:
                            return False
                    else:
                        return False
                else:
                    return False
            return False
        return False

    # Fetch the recommended feed list
    @classmethod
    def get_hour_list_feeds(cls, log_type):
        """
        Fetch the recommended-trends feed and record new qualifying videos in Feishu.

        Every item returned by the ``get_recommend_trends`` endpoint is parsed,
        logged, and then passed through these filters in order:

        1. items missing a required field are skipped as invalid;
        2. items rejected by ``download_rule`` are skipped (the numeric
           thresholds live in that method);
        3. items whose title contains a word from ``sensitive_words`` are skipped;
        4. items whose video id already appears in sheet "yatRv2" are skipped.

        Each remaining item is written to range ``A3:K3`` of sheet "ba0da4"
        after ``Feishu.insert_columns`` is called for that sheet:
        https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=ba0da4

        NOTE(review): the original author's note says this runs daily at
        10:00, 15:00 and 20:00; the schedule is not part of this method.

        :param log_type: forwarded to ``Common.logger`` and the Feishu helpers
        :return: None. Exceptions raised while fetching or processing are
                 logged and not propagated.
        """
        url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends"
        headers = {
            "x-b3-traceid": cls.hour_x_b3_traceid,
            "X-Token-Id": cls.hour_x_token_id,
            "uid": cls.hour_uid,
            "content-type": "application/json",
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)'
                          ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 '
                          'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN',
            "Referer": cls.hour_referer
        }
        data = {
            "log_params": {
                "page": "discover_rec",
                "common": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.2",
                    "net": "wifi",
                    "scene": 1089
                }
            },
            "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg",
            "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg",
            "share_width": 625,
            "share_height": 500,
            "ext": {
                "fmid": 0,
                "items": {}
            },
            "app": "xng",
            "rec_scene": "discover_rec",
            "log_common_params": {
                "e": [{
                    "data": {
                        "page": "discoverIndexPage",
                        "topic": "recommend"
                    },
                    "ab": {}
                }],
                "ext": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.3",
                    "net": "wifi",
                    "scene": "1089"
                },
                "pj": "1",
                "pf": "2",
                "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
            },
            "refresh": False,
            "token": cls.hour_token,
            "uid": cls.hour_uid,
            "proj": "ma",
            "wx_ver": "8.0.20",
            "code_ver": "3.62.0"
        }
        try:
            urllib3.disable_warnings()
            r = requests.post(url=url, headers=headers, json=data, proxies=proxies, verify=False)
            # Decode the body once instead of re-parsing it for every check.
            response = r.json()
            if "data" not in response:
                Common.logger(log_type).warning("获取视频feeds错误:{}", r.text)
            elif "list" not in response["data"]:
                Common.logger(log_type).warning("获取视频feeds无数据，休眠10s:{}", response["data"])
            else:
                # Video list data. A missing field is represented by 0 so the
                # validity filter below can recognise it.
                for feed in response["data"]["list"]:
                    # Title, with line breaks, path-unsafe characters and spaces removed
                    if "title" in feed:
                        video_title = feed["title"].strip().replace("\n", "") \
                            .replace("/", "").replace("\r", "").replace("#", "") \
                            .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
                            .replace(":", "").replace("*", "").replace("？", "") \
                            .replace("?", "").replace('"', "").replace("<", "") \
                            .replace(">", "").replace("|", "").replace(" ", "")
                    else:
                        video_title = 0

                    # Video ID
                    video_id = feed.get("vid", 0)

                    # Play count
                    video_play_cnt = feed.get("play_pv", 0)

                    # Like count
                    if "favor" in feed:
                        video_like_cnt = feed["favor"]["total"]
                    else:
                        video_like_cnt = 0

                    # Share count
                    video_share_cnt = feed.get("share", 0)

                    # Duration: "du" is divided by 1000 and truncated to an int
                    if "du" in feed:
                        video_duration = int(feed["du"] / 1000)
                    else:
                        video_duration = 0

                    # Width and height. Each key is looked up on its own: the
                    # former `"w" or "h" in feed` test was always true, so a
                    # missing key raised KeyError and aborted the whole batch.
                    video_width = feed.get("w", 0)
                    video_height = feed.get("h", 0)

                    # Publish time (divided by 1000 before being used as epoch seconds)
                    video_send_time = feed.get("t", 0)

                    # User name / avatar / user mid
                    if "user" in feed:
                        user_name = feed["user"]["nick"].strip().replace("\n", "") \
                            .replace("/", "").replace("快手", "").replace(" ", "") \
                            .replace(" ", "").replace("&NBSP", "").replace("\r", "")
                        head_url = feed["user"]["hurl"]
                        profile_mid = feed["user"].get("mid", 0)
                    else:
                        user_name = 0
                        head_url = 0
                        # Previously read unconditionally, which raised KeyError
                        # for items without a "user" entry.
                        profile_mid = 0

                    # User ID
                    profile_id = feed.get("id", 0)

                    # Video cover
                    cover_url = feed.get("url", 0)

                    # Video playback URL
                    video_url = feed.get("v_url", 0)

                    # Formatted once; used for both the log line and the sheet row.
                    send_time_text = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(int(video_send_time) / 1000))

                    Common.logger(log_type).info("标题:{}", video_title)
                    Common.logger(log_type).info("视频ID:{}", video_id)
                    Common.logger(log_type).info("播放量:{}", video_play_cnt)
                    Common.logger(log_type).info("时长:{}秒", video_duration)
                    Common.logger(log_type).info("视频发布时间:{}", send_time_text)
                    Common.logger(log_type).info("用户名:{}", user_name)
                    Common.logger(log_type).info("播放地址:{}", video_url)

                    # Skip invalid videos (any required field missing)
                    if video_title == 0 or video_id == 0 or video_duration == 0 \
                            or video_send_time == 0 or user_name == 0 or head_url == 0 \
                            or cover_url == 0 or video_url == 0 \
                            or profile_id == 0 or profile_mid == 0:
                        Common.logger(log_type).warning("无效视频")

                    elif cls.download_rule(video_duration, video_width, video_height, video_play_cnt,
                                           video_like_cnt, video_share_cnt, video_send_time) is False:
                        Common.logger(log_type).info("不满足基础门槛规则")

                    # Sensitive-word filter; empty words never count as a match
                    elif any(word and word in video_title for word in cls.sensitive_words(log_type)):
                        Common.logger(log_type).info("视频已中敏感词:{}".format(video_title))
                        time.sleep(1)

                    # De-duplicate against the cloud document:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=yatRv2
                    elif video_id in [cell for row in Feishu.get_values_batch(log_type, "xiaoniangao", "yatRv2")
                                      for cell in row]:
                        Common.logger(log_type).info("该视频已下载:{}", video_title)
                        time.sleep(1)

                    else:
                        Common.logger(log_type).info("该视频未下载，添加至feeds中:{}".format(video_title))
                        # Insert an empty row into the feeds worksheet
                        time.sleep(1)
                        Feishu.insert_columns(log_type, "xiaoniangao", "ba0da4", "ROWS", 2, 3)

                        # Current time, recorded as the fetch time
                        get_feeds_time = int(time.time())
                        # Row written to the worksheet
                        values = [[profile_id,
                                   profile_mid,
                                   video_id,
                                   video_title,
                                   user_name,
                                   video_duration,
                                   cover_url,
                                   video_url,
                                   send_time_text,
                                   str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(get_feeds_time))),
                                   video_play_cnt]]
                        # Wait 1s so the cloud document is not hit too frequently
                        time.sleep(1)
                        Feishu.update_values(log_type, "xiaoniangao", "ba0da4", "A3:K3", values)

        except Exception as e:
            Common.logger(log_type).error("获取小时榜视频列表异常:{}", e)