wangkun 1 year ago
Commit
1c3f111d12

+ 3 - 0
kanyikan.sh

@@ -39,6 +39,9 @@ time=$(date +%H:%M:%S)
 #echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量..." >> ${log_path}
 #cd ~ && source ${profile_path}
 #echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量完成!" >> ${log_path}
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在更新代码" >> ${log_path}
+cd /Users/lieyunye/Desktop/crawler/crawler_kanyikan/ && git pull origin master --force
+echo "$(date "+%Y-%m-%d %H:%M:%S") 代码更新完成" >> ${log_path}
 
 echo "$(date "+%Y-%m-%d %H:%M:%S") 正在检测看一看推荐爬虫服务状态" >> ${log_path}
 ps -ef | grep "run_kanyikan_recommend" | grep -v "grep"

+ 7 - 2
main/kanyikan_moment.py

@@ -10,7 +10,7 @@ import urllib3
 sys.path.append(os.getcwd())
 from main.feishu_lib import Feishu
 from main.common import Common
-from main.kanyikan_moment_publish import Publish
+from main.publish import Publish
 proxies = {"http": None, "https": None}
 
 
@@ -364,7 +364,12 @@ class Moment:
 
                     # upload the video
                     Common.logger("moment").info("开始上传视频:{}".format(download_video_title))
-                    our_video_id = Publish.upload_and_publish("moment", env, "play")
+                    our_video_id = Publish.upload_and_publish(log_type="moment",
+                                                              crawler="kanyikan",
+                                                              strategy="朋友圈抓取策略",
+                                                              our_uid="moment",
+                                                              env=env,
+                                                              oss_endpoint="out")
                     our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
                     Common.logger("moment").info("视频上传完成:{}", download_video_title)
 

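The hunk above switches the Publish.upload_and_publish call to keyword arguments and to the shared main/publish module. That module is not part of this commit's diff, so the following is only a minimal sketch of the signature the new call site implies; the body and return value are placeholders, not the real implementation.

class Publish:
    @classmethod
    def upload_and_publish(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint):
        # placeholder body: upload the downloaded files to OSS, call the
        # publish API, and return our platform's video id
        return 0

our_video_id = Publish.upload_and_publish(log_type="moment",
                                          crawler="kanyikan",
                                          strategy="朋友圈抓取策略",
                                          our_uid="moment",
                                          env="prod",
                                          oss_endpoint="out")
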
+ 0 - 259
main/kanyikan_moment_publish.py

@@ -1,259 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/6/10
-"""
-Upload videos to Aliyun OSS
-Upload videos to the admin backend
-"""
-import json
-import os
-import random
-import shutil
-import time
-
-import oss2
-import requests
-import urllib3
-from main.common import Common
-
-proxies = {"http": None, "https": None}
-
-
-class Publish:
-    @classmethod
-    def publish_video_dev(cls, log_type, request_data):
-        """
-        loginUid  on-site uid (random)
-        appType  default: 888888
-        crawlerSrcId   off-site video ID
-        crawlerSrcCode   channel (custom, KYK)
-        crawlerSrcPublishTimestamp  original publish time of the video
-        crawlerTaskTimestamp   crawler task creation time (can be the current time)
-        videoPath  OSS path of the video
-        coverImgPath  OSS path of the video cover
-        title  title
-        totalTime  video duration
-        viewStatus  validity status of the video, default 1
-        versionCode  version, default 1
-        :return:
-        """
-        # Common.logger(log_type).info('publish request data: {}'.format(request_data))
-        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
-        # Common.logger(log_type).info('publish result: {}'.format(result))
-        video_id = result["data"]["id"]
-        if result['code'] != 0:
-            Common.logger(log_type).error('pushlish failure msg = {}'.format(result['msg']))
-        else:
-            Common.logger(log_type).info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
-        return video_id
-
-    @classmethod
-    def publish_video_prod(cls, log_type, request_data):
-        """
-        loginUid  on-site uid (random)
-        appType  default: 888888
-        crawlerSrcId   off-site video ID
-        crawlerSrcCode   channel (custom, KYK)
-        crawlerSrcPublishTimestamp  original publish time of the video
-        crawlerTaskTimestamp   crawler task creation time (can be the current time)
-        videoPath  OSS path of the video
-        coverImgPath  OSS path of the video cover
-        title  title
-        totalTime  video duration
-        viewStatus  validity status of the video, default 1
-        versionCode  version, default 1
-        :return:
-        """
-        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
-        # Common.logger(log_type).info('publish result: {}'.format(result))
-        video_id = result["data"]["id"]
-        if result['code'] != 0:
-            Common.logger(log_type).error('pushlish failure msg = {}'.format(result['msg']))
-        else:
-            Common.logger(log_type).info('publish success video_id = : {}'.format(request_data['crawlerSrcId']))
-        return video_id
-
-    @classmethod
-    def request_post(cls, request_url, request_data):
-        """
-        POST request to an HTTP interface
-        :param request_url: interface URL
-        :param request_data: request parameters
-        :return: res_data in JSON format
-        """
-        urllib3.disable_warnings()
-        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
-        if response.status_code == 200:
-            res_data = json.loads(response.text)
-            return res_data
-
-    # The code below demonstrates basic file upload, download, listing, and deletion usage.
-
-    # First initialize AccessKeyId, AccessKeySecret, Endpoint, etc.
-    # Obtain them from environment variables, or replace placeholders such as "<your AccessKeyId>" with real values.
-    #
-    # Taking the Hangzhou region as an example, the Endpoint can be:
-    #   http://oss-cn-hangzhou.aliyuncs.com
-    #   https://oss-cn-hangzhou.aliyuncs.com
-    # for access over HTTP and HTTPS respectively.
-    access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
-    access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
-    bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
-    # endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
-    endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
-
-    # Confirm the parameters above are all filled in correctly
-    for param in (access_key_id, access_key_secret, bucket_name, endpoint):
-        assert '<' not in param, '请设置参数:' + param
-
-    # Create a Bucket object; all Object-related operations go through it
-    bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
-
-    """
-    Processing flow:
-    1. On a schedule (once a day at 1 AM), iterate over the contents of the files directory. Structure: files -> video folder -> video file + cover image + basic info
-    2. Upload the video file and cover image to OSS
-    - video OSS directory  longvideo/crawler_local/video/prod/<filename>
-    - cover OSS directory  longvideo/crawler_local/image/prod/<filename>
-    3. Publish the video
-    - read the basic info and call the publish interface
-    """
-    # env / date e.g. 20220225 / filename
-    oss_file_path_video = r'longvideo/crawler_local/video/{}/{}/{}'
-    oss_file_path_image = r'longvideo/crawler_local/image/{}/{}/{}'
-
-    @classmethod
-    def put_file(cls, log_type, oss_file, local_file):
-        cls.bucket.put_object_from_file(oss_file, local_file)
-        Common.logger(log_type).info("put oss file = {}, local file = {} success".format(oss_file, local_file))
-
-    # Remove a local file
-    @classmethod
-    def remove_local_file(cls, log_type, local_file):
-        os.remove(local_file)
-        Common.logger(log_type).info("remove local file = {} success".format(local_file))
-
-    # Remove a local directory
-    @classmethod
-    def remove_local_file_dir(cls, log_type, local_file):
-        os.rmdir(local_file)
-        Common.logger(log_type).info("remove local file dir = {} success".format(local_file))
-
-    local_file_path = './videos'
-    video_file = 'video'
-    image_file = 'image'
-    info_file = 'info'
-    uids_dev_up = [6267140]
-    uids_dev_play = [6267141]
-    uids_prod_up = [20631208, 20631209, 20631210, 20631211, 20631212,
-                    20631213, 20631214, 20631215, 20631216, 20631217]
-    uids_prod_play = [20631208, 20631209, 20631210, 20631211, 20631212,
-                      20631213, 20631214, 20631215, 20631216, 20631217,
-                      20631223, 20631224, 20631225, 20631226, 20631227]
-
-    @classmethod
-    def upload_and_publish(cls, log_type, env, job):
-        """
-        Upload videos to OSS
-        :param log_type: which log to write to
-        :param env: test environment: dev; production: prod
-        :param job: rising chart: up; play count: play
-        """
-        Common.logger(log_type).info("upload_and_publish starting...")
-        today = time.strftime("%Y%m%d", time.localtime())
-        # all video folders under the videos directory
-        files = os.listdir(cls.local_file_path)
-        for f in files:
-            try:
-                # a single video folder
-                fi_d = os.path.join(cls.local_file_path, f)
-                # confirm it is a video folder
-                if os.path.isdir(fi_d):
-                    Common.logger(log_type).info('dir = {}'.format(fi_d))
-                    # list everything in the video folder
-                    dir_files = os.listdir(fi_d)
-                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
-                    now_timestamp = int(round(time.time() * 1000))
-                    data['crawlerTaskTimestamp'] = str(now_timestamp)
-                    global uid
-                    if env == "dev" and job == "up":
-                        uid = str(random.choice(cls.uids_dev_up))
-                    elif env == "dev" and job == "play":
-                        uid = str(random.choice(cls.uids_dev_play))
-                    elif env == "prod" and job == "up":
-                        uid = str(random.choice(cls.uids_prod_up))
-                    elif env == "prod" and job == "play":
-                        uid = str(random.choice(cls.uids_prod_play))
-                    data['loginUid'] = uid
-                    # all files under this video folder
-                    for fi in dir_files:
-                        # path of each file in the video folder
-                        fi_path = fi_d + '/' + fi
-                        Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
-                        # read info.txt and populate data
-                        if cls.info_file in fi:
-                            f = open(fi_path, "r", encoding="UTF-8")
-                            # read the data; accuracy is guaranteed at write time, so no extra validation on read
-                            for i in range(14):
-                                line = f.readline()
-                                line = line.replace('\n', '')
-                                if line is not None and len(line) != 0 and not line.isspace():
-                                    Common.logger(log_type).info("line = {}".format(line))
-                                    if i == 0:
-                                        data['crawlerSrcId'] = line
-                                    elif i == 1:
-                                        data['title'] = line
-                                    elif i == 2:
-                                        data['totalTime'] = line
-                                    elif i == 8:
-                                        data['crawlerSrcPublishTimestamp'] = line
-                                else:
-                                    Common.logger(log_type).warning("{} line is None".format(fi_path))
-                            f.close()
-                            # remove info.txt
-                            cls.remove_local_file(log_type, fi_path)
-                    # refresh the file list
-                    dir_files = os.listdir(fi_d)
-                    for fi in dir_files:
-                        fi_path = fi_d + '/' + fi
-                        Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
-                        # upload to OSS
-                        if cls.video_file in fi:
-                            global oss_video_file
-                            if env == "dev":
-                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
-                            elif env == "prod":
-                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
-                            Common.logger(log_type).info("oss_video_file = {}".format(oss_video_file))
-                            cls.put_file(log_type, oss_video_file, fi_path)
-                            data['videoPath'] = oss_video_file
-                            Common.logger(log_type).info("videoPath = {}".format(oss_video_file))
-                        elif cls.image_file in fi:
-                            global oss_image_file
-                            if env == "dev":
-                                oss_image_file = cls.oss_file_path_image.format("env", today, data['crawlerSrcId'])
-                            elif env == "prod":
-                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
-                            Common.logger(log_type).info("oss_image_file = {}".format(oss_image_file))
-                            cls.put_file(log_type, oss_image_file, fi_path)
-                            data['coverImgPath'] = oss_image_file
-                            Common.logger(log_type).info("coverImgPath = {}".format(oss_image_file))
-                        # remove everything
-                        cls.remove_local_file(log_type, fi_path)
-
-                    # publish
-                    if env == "dev":
-                        video_id = cls.publish_video_dev(log_type, data)
-                    elif env == "prod":
-                        video_id = cls.publish_video_prod(log_type, data)
-                    else:
-                        video_id = cls.publish_video_dev(log_type, data)
-                    cls.remove_local_file_dir(log_type, fi_d)
-                    return video_id
-
-                else:
-                    Common.logger(log_type).error('file not a dir = {}'.format(fi_d))
-            except Exception as e:
-                # delete the video folder
-                shutil.rmtree("./videos/" + f + "/")
-                Common.logger(log_type).exception('upload_and_publish error', e)

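For reference, the deleted module's publish step boils down to building the payload described in its docstrings and POSTing it to the crawler/video/send endpoint. A condensed, runnable sketch under those assumptions (field values are illustrative; the dev URL and uid are the ones from the file above):

import json
import time

import requests

def publish_video(request_data: dict) -> dict:
    # POST the crawler payload and return the parsed JSON response
    response = requests.post(
        url="https://videotest.yishihui.com/longvideoapi/crawler/video/send",
        data=request_data,
        verify=False,
    )
    return json.loads(response.text)

payload = {
    "appType": "888888",                    # fixed app type
    "crawlerSrcCode": "KANYIKAN",           # channel code
    "crawlerSrcId": "demo-video-id",        # off-site video id (illustrative)
    "crawlerTaskTimestamp": str(int(round(time.time() * 1000))),
    "loginUid": "6267140",                  # one of the dev uids listed above
    "title": "demo",
    "totalTime": "30",
    "viewStatus": "1",
    "versionCode": "1",
}
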
+ 107 - 107
main/kanyikan_recommend.py

@@ -50,115 +50,115 @@ class Kanyikanrecommend:
         while True:
             for page in range(1, 101):
                 Common.logger(log_type).info(f"正在抓取第{page}页")
-                # try:
-                session = Common.get_session(log_type)
-                if session is None:
-                    time.sleep(1)
-                    continue
-                url = 'https://search.weixin.qq.com/cgi-bin/recwxa/recwxavideolist?'
-                header = {
-                    "Connection": "keep-alive",
-                    "content-type": "application/json",
-                    "Accept-Encoding": "gzip,compress,br,deflate",
-                    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
-                                  "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
-                                  "NetType/WIFI Language/zh_CN",
-                    "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
-                }
-                params = {
-                    'session': session,
-                    "offset": 0,
-                    "wxaVersion": "3.9.2",
-                    "count": "10",
-                    "channelid": "208",
-                    "scene": '310',
-                    "subscene": '1089',
-                    "clientVersion": '8.0.18',
-                    "sharesearchid": '0',
-                    "nettype": 'wifi',
-                    "switchprofile": "0",
-                    "switchnewuser": "0",
-                }
-                urllib3.disable_warnings()
-                response = requests.get(url=url, headers=header, params=params, proxies=proxies, verify=False)
-                if "data" not in response.text:
-                    Common.logger(log_type).info("获取视频list时,session过期,随机睡眠 31-50 秒")
-                    # if the response is empty, sleep a random 31-40 seconds
-                    time.sleep(random.randint(31, 40))
-                    continue
-                elif "items" not in response.json()["data"]:
-                    Common.logger(log_type).info(f"get_feeds:{response.json()},随机睡眠 1-3 分钟")
-                    # if the response is empty, sleep a random 1-3 minutes
-                    time.sleep(random.randint(60, 180))
-                    continue
-                feeds = response.json().get("data", {}).get("items", "")
-                if feeds == "":
-                    Common.logger(log_type).info(f"feeds:{feeds}")
-                    time.sleep(random.randint(31, 40))
-                    continue
-                for i in range(len(feeds)):
-                    # try:
-                    video_title = feeds[i].get("title", "").strip().replace("\n", "") \
-                        .replace("/", "").replace("\\", "").replace("\r", "") \
-                        .replace(":", "").replace("*", "").replace("?", "") \
-                        .replace("?", "").replace('"', "").replace("<", "") \
-                        .replace(">", "").replace("|", "").replace(" ", "") \
-                        .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
-                        .replace("'", "").replace("#", "").replace("Merge", "")
-                    publish_time_stamp = feeds[i].get("date", 0)
-                    publish_time_str = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time_stamp))
-                    # get the playback URL
-                    if "videoInfo" not in feeds[i]:
-                        video_url = ""
-                    elif "mpInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
-                        if len(feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
-                            video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
-                        else:
-                            video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
-                    elif "ctnInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
-                        video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
-                    else:
-                        video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
-                    video_dict = {
-                        "video_title": video_title,
-                        "video_id":  feeds[i].get("videoId", ""),
-                        "play_cnt":  feeds[i].get("playCount", 0),
-                        "like_cnt":  feeds[i].get("liked_cnt", 0),
-                        "comment_cnt":  feeds[i].get("comment_cnt", 0),
-                        "share_cnt":  feeds[i].get("shared_cnt", 0),
-                        "duration":  feeds[i].get("mediaDuration", 0),
-                        "video_width":  feeds[i].get("short_video_info", {}).get("width", 0),
-                        "video_height":  feeds[i].get("short_video_info", {}).get("height", 0),
-                        "publish_time_stamp":  publish_time_stamp,
-                        "publish_time_str":  publish_time_str,
-                        "user_name": feeds[i].get("source", "").strip().replace("\n", ""),
-                        "user_id": feeds[i].get("openid", ""),
-                        "avatar_url": feeds[i].get("bizIcon", ""),
-                        "cover_url": feeds[i].get("thumbUrl", ""),
-                        "video_url": video_url,
-                        "session": session,
+                try:
+                    session = Common.get_session(log_type)
+                    if session is None:
+                        time.sleep(1)
+                        continue
+                    url = 'https://search.weixin.qq.com/cgi-bin/recwxa/recwxavideolist?'
+                    header = {
+                        "Connection": "keep-alive",
+                        "content-type": "application/json",
+                        "Accept-Encoding": "gzip,compress,br,deflate",
+                        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) "
+                                      "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x18001236) "
+                                      "NetType/WIFI Language/zh_CN",
+                        "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/234/page-frame.html",
+                    }
+                    params = {
+                        'session': session,
+                        "offset": 0,
+                        "wxaVersion": "3.9.2",
+                        "count": "10",
+                        "channelid": "208",
+                        "scene": '310',
+                        "subscene": '1089',
+                        "clientVersion": '8.0.18',
+                        "sharesearchid": '0',
+                        "nettype": 'wifi',
+                        "switchprofile": "0",
+                        "switchnewuser": "0",
                     }
-                    for k, v in video_dict.items():
-                        Common.logger(log_type).info(f"{k}:{v}")
+                    urllib3.disable_warnings()
+                    response = requests.get(url=url, headers=header, params=params, proxies=proxies, verify=False)
+                    if "data" not in response.text:
+                        Common.logger(log_type).info("获取视频list时,session过期,随机睡眠 31-50 秒")
+                        # if the response is empty, sleep a random 31-40 seconds
+                        time.sleep(random.randint(31, 40))
+                        continue
+                    elif "items" not in response.json()["data"]:
+                        Common.logger(log_type).info(f"get_feeds:{response.json()},随机睡眠 1-3 分钟")
+                        # if the response is empty, sleep a random 1-3 minutes
+                        time.sleep(random.randint(60, 180))
+                        continue
+                    feeds = response.json().get("data", {}).get("items", "")
+                    if feeds == "":
+                        Common.logger(log_type).info(f"feeds:{feeds}")
+                        time.sleep(random.randint(31, 40))
+                        continue
+                    for i in range(len(feeds)):
+                        try:
+                            video_title = feeds[i].get("title", "").strip().replace("\n", "") \
+                                .replace("/", "").replace("\\", "").replace("\r", "") \
+                                .replace(":", "").replace("*", "").replace("?", "") \
+                                .replace("?", "").replace('"', "").replace("<", "") \
+                                .replace(">", "").replace("|", "").replace(" ", "") \
+                                .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
+                                .replace("'", "").replace("#", "").replace("Merge", "")
+                            publish_time_stamp = feeds[i].get("date", 0)
+                            publish_time_str = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time_stamp))
+                            # get the playback URL
+                            if "videoInfo" not in feeds[i]:
+                                video_url = ""
+                            elif "mpInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
+                                if len(feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
+                                    video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
+                                else:
+                                    video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
+                            elif "ctnInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
+                                video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
+                            else:
+                                video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
+                            video_dict = {
+                                "video_title": video_title,
+                                "video_id":  feeds[i].get("videoId", ""),
+                                "play_cnt":  feeds[i].get("playCount", 0),
+                                "like_cnt":  feeds[i].get("liked_cnt", 0),
+                                "comment_cnt":  feeds[i].get("comment_cnt", 0),
+                                "share_cnt":  feeds[i].get("shared_cnt", 0),
+                                "duration":  feeds[i].get("mediaDuration", 0),
+                                "video_width":  feeds[i].get("short_video_info", {}).get("width", 0),
+                                "video_height":  feeds[i].get("short_video_info", {}).get("height", 0),
+                                "publish_time_stamp":  publish_time_stamp,
+                                "publish_time_str":  publish_time_str,
+                                "user_name": feeds[i].get("source", "").strip().replace("\n", ""),
+                                "user_id": feeds[i].get("openid", ""),
+                                "avatar_url": feeds[i].get("bizIcon", ""),
+                                "cover_url": feeds[i].get("thumbUrl", ""),
+                                "video_url": video_url,
+                                "session": session,
+                            }
+                            for k, v in video_dict.items():
+                                Common.logger(log_type).info(f"{k}:{v}")
 
-                    if video_dict["video_id"] == "" \
-                            or video_dict["video_title"] == ""\
-                            or video_dict["video_url"] == "":
-                        Common.logger(log_type).info("无效视频\n")
-                    elif cls.download_rule(video_dict) is False:
-                        Common.logger(log_type).info("不满足抓取规则\n")
-                    elif any(str(word) if str(word) in video_title else False for word in cls.get_filter_word(log_type, crawler)) is True:
-                        Common.logger(log_type).info("视频已中过滤词\n")
-                    elif video_dict["video_id"] in [j for i in Feishu.get_values_batch(log_type, crawler, "ho98Ov") for j in i]:
-                        Common.logger(log_type).info("视频已下载\n")
-                    elif video_dict["video_id"] in [j for i in Feishu.get_values_batch(log_type, crawler, "20ce0c") for j in i]:
-                        Common.logger(log_type).info("视频已下载\n")
-                    else:
-                        cls.download_publish(log_type, crawler, video_dict, env)
-                #         except Exception as e:
-                #             Common.logger(log_type).error(f"抓取单条视频异常:{e}\n")
-                # except Exception as e:
-                #     Common.logger(log_type).error(f"抓取第{page}页时异常:{e}\n")
+                            if video_dict["video_id"] == "" \
+                                    or video_dict["video_title"] == ""\
+                                    or video_dict["video_url"] == "":
+                                Common.logger(log_type).info("无效视频\n")
+                            elif cls.download_rule(video_dict) is False:
+                                Common.logger(log_type).info("不满足抓取规则\n")
+                            elif any(str(word) if str(word) in video_title else False for word in cls.get_filter_word(log_type, crawler)) is True:
+                                Common.logger(log_type).info("视频已中过滤词\n")
+                            elif video_dict["video_id"] in [j for i in Feishu.get_values_batch(log_type, crawler, "ho98Ov") for j in i]:
+                                Common.logger(log_type).info("视频已下载\n")
+                            elif video_dict["video_id"] in [j for i in Feishu.get_values_batch(log_type, crawler, "20ce0c") for j in i]:
+                                Common.logger(log_type).info("视频已下载\n")
+                            else:
+                                cls.download_publish(log_type, crawler, video_dict, env)
+                        except Exception as e:
+                            Common.logger(log_type).error(f"抓取单条视频异常:{e}\n")
+                except Exception as e:
+                    Common.logger(log_type).error(f"抓取第{page}页时异常:{e}\n")
 
     @classmethod
     def download_publish(cls, log_type, crawler, video_dict, env):

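The playback-URL selection in the hunk above cascades through several CDN fields. Restated as a standalone helper (the field names come straight from the diff; the function itself is illustrative):

def get_video_url(feed: dict) -> str:
    # mirror the fallback chain: mpInfo -> ctnInfo -> top-level urlInfo
    if "videoInfo" not in feed:
        return ""
    cdn_info = feed["videoInfo"]["videoCdnInfo"]
    if "mpInfo" in cdn_info:
        url_info = cdn_info["mpInfo"]["urlInfo"]
        # prefer the third entry when more than two are present, else the first
        return url_info[2]["url"] if len(url_info) > 2 else url_info[0]["url"]
    if "ctnInfo" in cdn_info:
        return cdn_info["ctnInfo"]["urlInfo"][0]["url"]
    return cdn_info["urlInfo"][0]["url"]
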
+ 0 - 262
main/kanyikan_recommend_publish.py

@@ -1,262 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2022/4/18
-"""
-Upload videos to Aliyun OSS
-Upload videos to the admin backend
-"""
-import json
-import os
-import random
-import time
-import oss2
-import requests
-import urllib3
-from main.common import Common
-proxies = {"http": None, "https": None}
-
-
-class Publish:
-    @classmethod
-    def publish_video_dev(cls, log_type, request_data):
-        """
-        loginUid  on-site uid (random)
-        appType  default: 888888
-        crawlerSrcId   off-site video ID
-        crawlerSrcCode   channel (custom, KYK)
-        crawlerSrcPublishTimestamp  original publish time of the video
-        crawlerTaskTimestamp   crawler task creation time (can be the current time)
-        videoPath  OSS path of the video
-        coverImgPath  OSS path of the video cover
-        title  title
-        totalTime  video duration
-        viewStatus  validity status of the video, default 1
-        versionCode  version, default 1
-        :return:
-        """
-        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
-        # Common.logger(log_type).info('publish result: {}', result)
-        video_id = result["data"]["id"]
-        if result['code'] != 0:
-            Common.logger(log_type).error('pushlish failure msg = {}', result['msg'])
-        else:
-            Common.logger(log_type).info('publish success video_id = : {}', request_data['crawlerSrcId'])
-        return video_id
-
-    @classmethod
-    def publish_video_prod(cls, log_type, request_data):
-        """
-        loginUid  on-site uid (random)
-        appType  default: 888888
-        crawlerSrcId   off-site video ID
-        crawlerSrcCode   channel (custom, KYK)
-        crawlerSrcPublishTimestamp  original publish time of the video
-        crawlerTaskTimestamp   crawler task creation time (can be the current time)
-        videoPath  OSS path of the video
-        coverImgPath  OSS path of the video cover
-        title  title
-        totalTime  video duration
-        viewStatus  validity status of the video, default 1
-        versionCode  version, default 1
-        :return:
-        """
-        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
-        # Common.logger(log_type).info('publish result: {}', result)
-        video_id = result["data"]["id"]
-        if result['code'] != 0:
-            Common.logger(log_type).error('pushlish failure msg = {}', result['msg'])
-        else:
-            Common.logger(log_type).info('publish success video_id = : {}', request_data['crawlerSrcId'])
-        return video_id
-
-    @classmethod
-    def request_post(cls, request_url, request_data):
-        """
-        POST request to an HTTP interface
-        :param request_url: interface URL
-        :param request_data: request parameters
-        :return: res_data in JSON format
-        """
-        urllib3.disable_warnings()
-        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
-        if response.status_code == 200:
-            res_data = json.loads(response.text)
-            return res_data
-
-    # The code below demonstrates basic file upload, download, listing, and deletion usage.
-
-    # First initialize AccessKeyId, AccessKeySecret, Endpoint, etc.
-    # Obtain them from environment variables, or replace placeholders such as "<your AccessKeyId>" with real values.
-    #
-    # Taking the Hangzhou region as an example, the Endpoint can be:
-    #   http://oss-cn-hangzhou.aliyuncs.com
-    #   https://oss-cn-hangzhou.aliyuncs.com
-    # for access over HTTP and HTTPS respectively.
-    access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
-    access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
-    bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
-    # endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
-    endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
-
-    # Confirm the parameters above are all filled in correctly
-    for param in (access_key_id, access_key_secret, bucket_name, endpoint):
-        assert '<' not in param, '请设置参数:' + param
-
-    # Create a Bucket object; all Object-related operations go through it
-    bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
-
-    """
-    Processing flow:
-    1. On a schedule (once a day at 1 AM), iterate over the contents of the files directory. Structure: files -> video folder -> video file + cover image + basic info
-    2. Upload the video file and cover image to OSS
-    - video OSS directory  longvideo/crawler_local/video/prod/<filename>
-    - cover OSS directory  longvideo/crawler_local/image/prod/<filename>
-    3. Publish the video
-    - read the basic info and call the publish interface
-    """
-    # env / date e.g. 20220225 / filename
-    oss_file_path_video = r'longvideo/crawler_local/video/{}/{}/{}'
-    oss_file_path_image = r'longvideo/crawler_local/image/{}/{}/{}'
-
-    @classmethod
-    def put_file(cls, log_type, oss_file, local_file):
-        cls.bucket.put_object_from_file(oss_file, local_file)
-        Common.logger(log_type).info("put oss file = {}, local file = {} success", oss_file, local_file)
-
-    # Remove a local file
-    @classmethod
-    def remove_local_file(cls, log_type, local_file):
-        os.remove(local_file)
-        Common.logger(log_type).info("remove local file = {} success", local_file)
-
-    # Remove a local directory
-    @classmethod
-    def remove_local_file_dir(cls, log_type, local_file):
-        os.rmdir(local_file)
-        Common.logger(log_type).info("remove local file dir = {} success", local_file)
-
-    local_file_path = './videos'
-    video_file = 'video'
-    image_file = 'image'
-    info_file = 'info'
-    uids_dev_up = [6267140]
-    uids_dev_play = [6267141]
-    uids_dev_recommend = [6267140, 6267141, 6267824]
-    uids_prod_up = [20631208, 20631209, 20631210, 20631211, 20631212,
-                    20631213, 20631214, 20631215, 20631216, 20631217]
-    uids_prod_play = [20631208, 20631209, 20631210, 20631211, 20631212,
-                      20631213, 20631214, 20631215, 20631216, 20631217,
-                      20631223, 20631224, 20631225, 20631226, 20631227]
-    uids_prod_recommend = [20631208, 20631209, 20631210, 20631211, 20631212,
-                           20631213, 20631214, 20631215, 20631216, 20631217,
-                           20631223, 20631224, 20631225, 20631226, 20631227]
-
-    @classmethod
-    def upload_and_publish(cls, log_type, env, job):
-        """
-        Upload videos to OSS
-        :param log_type: the selected log
-        :param env: test environment: dev; production: prod
-        :param job: rising chart: up; play count: play; send_time: publish-time chart
-        """
-        Common.logger(log_type).info("upload_and_publish starting...")
-        today = time.strftime("%Y%m%d", time.localtime())
-        # all video folders under the videos directory
-        files = os.listdir(cls.local_file_path)
-        for f in files:
-            try:
-                # a single video folder
-                fi_d = os.path.join(cls.local_file_path, f)
-                # confirm it is a video folder
-                if os.path.isdir(fi_d):
-                    Common.logger(log_type).info('dir = {}', fi_d)
-                    # list everything in the video folder
-                    dir_files = os.listdir(fi_d)
-                    data = {'appType': '888888', 'crawlerSrcCode': 'KANYIKAN', 'viewStatus': '1', 'versionCode': '1'}
-                    now_timestamp = int(round(time.time() * 1000))
-                    data['crawlerTaskTimestamp'] = str(now_timestamp)
-                    global uid
-                    if env == "dev" and job == "up":
-                        uid = str(random.choice(cls.uids_dev_up))
-                    elif env == "dev" and job == "play":
-                        uid = str(random.choice(cls.uids_dev_play))
-                    elif env == "dev" and job == "recommend":
-                        uid = str(random.choice(cls.uids_dev_recommend))
-                    elif env == "prod" and job == "up":
-                        uid = str(random.choice(cls.uids_prod_up))
-                    elif env == "prod" and job == "play":
-                        uid = str(random.choice(cls.uids_prod_play))
-                    elif env == "prod" and job == "recommend":
-                        uid = str(random.choice(cls.uids_prod_recommend))
-                    data['loginUid'] = uid
-                    # all files under this video folder
-                    for fi in dir_files:
-                        # path of each file in the video folder
-                        fi_path = fi_d + '/' + fi
-                        Common.logger(log_type).info('dir fi_path = {}', fi_path)
-                        # read info.txt and populate data
-                        if cls.info_file in fi:
-                            f = open(fi_path, "r", encoding="UTF-8")
-                            # read the data; accuracy is guaranteed at write time, so no extra validation on read
-                            for i in range(14):
-                                line = f.readline()
-                                line = line.replace('\n', '')
-                                if line is not None and len(line) != 0 and not line.isspace():
-                                    Common.logger(log_type).info("line = {}", line)
-                                    if i == 0:
-                                        data['crawlerSrcId'] = line
-                                    elif i == 1:
-                                        data['title'] = line
-                                    elif i == 2:
-                                        data['totalTime'] = line
-                                    elif i == 8:
-                                        data['crawlerSrcPublishTimestamp'] = line
-                                else:
-                                    Common.logger(log_type).warning("{} line is None", fi_path)
-                            f.close()
-                            # remove info.txt
-                            cls.remove_local_file(log_type, fi_path)
-                    # refresh the file list
-                    dir_files = os.listdir(fi_d)
-                    for fi in dir_files:
-                        fi_path = fi_d + '/' + fi
-                        Common.logger(log_type).info('dir fi_path = {}', fi_path)
-                        # upload to OSS
-                        if cls.video_file in fi:
-                            global oss_video_file
-                            if env == "dev":
-                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
-                            elif env == "prod":
-                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
-                            Common.logger(log_type).info("oss_video_file = {}", oss_video_file)
-                            cls.put_file(log_type, oss_video_file, fi_path)
-                            data['videoPath'] = oss_video_file
-                            Common.logger(log_type).info("videoPath = {}", oss_video_file)
-                        elif cls.image_file in fi:
-                            global oss_image_file
-                            if env == "dev":
-                                oss_image_file = cls.oss_file_path_image.format("env", today, data['crawlerSrcId'])
-                            elif env == "prod":
-                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
-                            Common.logger(log_type).info("oss_image_file = {}", oss_image_file)
-                            cls.put_file(log_type, oss_image_file, fi_path)
-                            data['coverImgPath'] = oss_image_file
-                            Common.logger(log_type).info("coverImgPath = {}", oss_image_file)
-                        # remove everything
-                        cls.remove_local_file(log_type, fi_path)
-
-                    # publish
-                    if env == "dev":
-                        video_id = cls.publish_video_dev(log_type, data)
-                    elif env == "prod":
-                        video_id = cls.publish_video_prod(log_type, data)
-                    else:
-                        video_id = cls.publish_video_dev(log_type, data)
-                    cls.remove_local_file_dir(log_type, fi_d)
-                    return video_id
-
-                else:
-                    Common.logger(log_type).error('file not a dir = {}', fi_d)
-            except Exception as e:
-                # delete the video folder
-                Common.logger(log_type).exception('upload_and_publish error', e)
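
Both deleted publish modules shared the same OSS upload step. A minimal sketch of that step, assuming the credentials are supplied via environment variables instead of the hard-coded defaults above:

import os

import oss2

auth = oss2.Auth(os.environ["OSS_ACCESS_KEY_ID"], os.environ["OSS_ACCESS_KEY_SECRET"])
bucket = oss2.Bucket(auth, "oss-cn-hangzhou.aliyuncs.com", os.environ["OSS_BUCKET"])

def put_file(oss_key: str, local_file: str) -> None:
    # upload a local file to OSS under the given object key
    bucket.put_object_from_file(oss_key, local_file)

# e.g. put_file("longvideo/crawler_local/video/prod/20220225/<video_id>", "./videos/<dir>/video.mp4")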