# get_feeds.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/4/18
  4. """
  5. 获取看一看+小程序,首页推荐视频列表
  6. """
  7. import json
  8. import os
  9. import random
  10. import sys
  11. import time
  12. import requests
  13. import urllib3
  14. from main.feishu_lib import Feishu
  15. sys.path.append(os.getcwd())
  16. from main.common import Common
  17. proxies = {"http": None, "https": None}
  18. # 敏感词库
  19. def kanyikan_sensitive_words(log_type):
  20. # 敏感词库列表
  21. word_list = []
  22. # 从云文档读取所有敏感词,添加到词库列表
  23. lists = Feishu.get_values_batch(log_type, "kanyikan", "rofdM5")
  24. for i in lists:
  25. for j in i:
  26. # 过滤空的单元格内容
  27. if j is None:
  28. pass
  29. else:
  30. word_list.append(j)
  31. return word_list
def get_feeds(log_type: str) -> None:
    """
    Crawl one page of the Kanyikan+ mini-program home-page recommendation
    feed and record new videos into Feishu.

    1. Fetch the recommended-video list from the mini-program home page.
    2. De-duplicate first against
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
    3. Then de-duplicate against
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
    4. Append surviving video info to
       https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM

    :param log_type: logger channel name (e.g. "recommend"); also forwarded
                     to every Feishu helper call.
    :return: None — all results are side effects (logging and sheet writes).
    """
    Common.logger(log_type).info("开始从推荐页获取视频列表")
    host = "https://search.weixin.qq.com"
    url = '/cgi-bin/recwxa/recwxavideolist?'
    # Session token for the recommendation API, supplied by the Common helper.
    video_list_session = Common.get_session(log_type)
    # Common.logger(log_type).info("获取视频list时,session:{}", video_list_session)
    header = {
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
                      "NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
    }
    params = {
        'session': video_list_session,
        "offset": 0,
        "wxaVersion": "3.9.2",
        "count": "10",
        "channelid": "208",
        "scene": '310',
        "subscene": '1089',
        "clientVersion": '8.0.18',
        "sharesearchid": '0',
        "nettype": 'wifi',
        "switchprofile": "0",
        "switchnewuser": "0",
    }
    try:
        urllib3.disable_warnings()
        r = requests.get(host + url, headers=header, params=params, proxies=proxies, verify=False)
        response = json.loads(r.content.decode("utf8"))
        if "data" not in response:
            # Session expired: back off, then retry recursively.
            # NOTE(review): the log message says 31-50s but the sleep is
            # 31-40s, and the retry recursion has no depth bound — confirm
            # both are intended.
            Common.logger(log_type).info("获取视频list时,session过期,随机睡眠 31-50 秒")
            time.sleep(random.randint(31, 40))
            get_feeds(log_type)
        elif "items" not in response["data"]:
            # Empty payload: back off 1-3 minutes, then retry recursively.
            Common.logger(log_type).info("获取视频list时,response:{},随机睡眠 1-3 分钟", response)
            time.sleep(random.randint(60, 180))
            get_feeds(log_type)
        else:
            items = response["data"]["items"]
            for i in range(len(items)):
                # Skip entries that carry no video payload.
                if "videoInfo" not in items[i]:
                    Common.logger(log_type).info("无视频信息")
                else:
                    # Video title, scrubbed of characters that would break
                    # file names / sheet cells, plus some hard-coded
                    # blocked words ("小年糕", "#", "Merge").
                    video_title = items[i]["title"].strip().replace("\n", "")\
                        .replace("/", "").replace("\\", "").replace("\r", "")\
                        .replace(":", "").replace("*", "").replace("?", "")\
                        .replace("?", "").replace('"', "").replace("<", "")\
                        .replace(">", "").replace("|", "").replace(" ", "")\
                        .replace("&NBSP", "").replace(".", "。").replace(" ", "")\
                        .replace("小年糕", "").replace("#", "").replace("Merge", "")
                    Common.logger(log_type).info('视频标题:{}', video_title)
                    # Video id
                    video_id = items[i]["videoId"]
                    Common.logger(log_type).info('视频ID:{}', video_id)
                    # Play count
                    video_play_cnt = items[i]["playCount"]
                    Common.logger(log_type).info('视频播放次数:{}', video_play_cnt)
                    # Like count
                    video_liked_cnt = items[i]["liked_cnt"]
                    Common.logger(log_type).info('视频点赞数:{}', video_liked_cnt)
                    # Comment count
                    video_comment_cnt = items[i]["comment_cnt"]
                    Common.logger(log_type).info('视频评论数:{}', video_comment_cnt)
                    # Share count
                    video_shared_cnt = items[i]["shared_cnt"]
                    Common.logger(log_type).info('视频分享数:{}', video_shared_cnt)
                    # Duration (seconds, per the log message)
                    video_duration = items[i]["mediaDuration"]
                    Common.logger(log_type).info('视频时长:{}秒', video_duration)
                    # Width/height; "0*0" when no resolution info is present.
                    if "short_video_info" not in items[i]:
                        video_width = "0"
                        video_height = "0"
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info("无分辨率:{}", video_resolution)
                    elif len(items[i]["short_video_info"]) == 0:
                        video_width = "0"
                        video_height = "0"
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info("无分辨率:{}", video_resolution)
                    else:
                        # Video width
                        video_width = items[i]["short_video_info"]["width"]
                        # Video height
                        video_height = items[i]["short_video_info"]["height"]
                        video_resolution = str(video_width) + "*" + str(video_height)
                        Common.logger(log_type).info('视频宽高:{}', video_resolution)
                    # Publish time — presumably unix seconds, formatted
                    # below only for logging; TODO confirm against the API.
                    video_send_date = items[i]["date"]
                    Common.logger(log_type).info("视频发布时间:{}",
                                                 time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_send_date)))
                    # Uploader name
                    video_user = items[i]["source"].strip().replace("\n", "")
                    Common.logger(log_type).info('视频用户名:{}', video_user)
                    # Uploader openid; 0 when absent
                    if "openid" not in items[i]:
                        user_id = 0
                    else:
                        user_id = items[i]["openid"]
                    # Uploader avatar
                    video_user_cover = items[i]["bizIcon"]
                    Common.logger(log_type).info('视频用户头像:{}', video_user_cover)
                    # Cover image: prefer the smart cover when available
                    if "smartCoverUrl" in items[i]:
                        video_cover = items[i]["smartCoverUrl"]
                        Common.logger(log_type).info('视频封面:{}', video_cover)
                    else:
                        video_cover = items[i]["thumbUrl"]
                        Common.logger(log_type).info('视频封面:{}', video_cover)
                    # Playback URL, resolved from one of three CDN layouts.
                    # NOTE(review): this rebinds `url` (the request path above)
                    # — harmless since the request already happened, but worth
                    # renaming in a future change.
                    if "mpInfo" in items[i]["videoInfo"]["videoCdnInfo"].keys():
                        if len(items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
                            Common.logger(log_type).info('视频播放地址:{}', url)
                        else:
                            url = items[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
                            Common.logger(log_type).info('视频播放地址:{}', url)
                    elif "ctnInfo" in items[i]["videoInfo"]["videoCdnInfo"]:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
                        Common.logger(log_type).info('视频播放地址:{}', url)
                    else:
                        url = items[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
                        Common.logger(log_type).info('视频播放地址:{}', url)
                    # Drop records missing any required field.
                    if video_id == "" \
                            or video_send_date == "" \
                            or video_title.strip() == "" \
                            or video_play_cnt == "" \
                            or video_liked_cnt == "" \
                            or video_duration == "" \
                            or video_comment_cnt == "" \
                            or video_shared_cnt == "" \
                            or video_user == "" \
                            or video_user_cover == "" \
                            or video_cover == "" \
                            or url == "":
                        Common.logger(log_type).info("无效视频")
                    # Baseline threshold: play count >= 20000
                    elif int(video_play_cnt) < 20000:
                        Common.logger(log_type).info("播放量{} < 20000", video_play_cnt)
                    # Sensitive-word filter (lexicon re-fetched per video)
                    elif any(word if word in video_title else False
                             for word in kanyikan_sensitive_words(log_type)) is True:
                        Common.logger(log_type).info("视频已中敏感词:{}".format(video_title))
                    # De-dup against the downloaded sheet:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
                    elif video_id in [j for i in Feishu.get_values_batch(log_type, "kanyikan", "20ce0c") for j in i]:
                        Common.logger(log_type).info("该视频已下载:{}", video_title)
                    # De-dup against the feeds sheet:
                    # https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=SdCHOM
                    elif video_id in [j for i in Feishu.get_values_batch(log_type, "kanyikan", "SdCHOM") for j in i]:
                        Common.logger(log_type).info("该视频已在kanyikan_feeds中:{}", video_title)
                    else:
                        Common.logger(log_type).info("该视频未下载,添加至kanyikan_feeds:{}", video_title)
                        # Kanyikan+ worksheet: insert a fresh first row.
                        Feishu.insert_columns(log_type, "kanyikan", "SdCHOM", "ROWS", 1, 2)
                        # Crawl timestamp
                        get_feeds_time = int(time.time())
                        # Row payload — column order must match sheet columns A:P.
                        values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time)),
                                   "推荐榜",
                                   video_id,
                                   video_title,
                                   video_play_cnt,
                                   video_comment_cnt,
                                   video_liked_cnt,
                                   video_shared_cnt,
                                   video_duration,
                                   str(video_width) + "*" + str(video_height),
                                   time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_send_date)),
                                   video_user,
                                   user_id,
                                   video_user_cover,
                                   video_cover,
                                   url]]
                        time.sleep(1)
                        # Write the data into the freshly inserted row.
                        Feishu.update_values(log_type, "kanyikan", "SdCHOM", "A2:P2", values)
    except Exception as e:
        Common.logger(log_type).error("获取视频 list 时异常:{}", e)
if __name__ == "__main__":
    # Manual entry point: run one fetch pass on the "recommend" log channel.
    get_feeds("recommend")