wangkun, 2 years ago
Parent commit: 335f91b193
3 changed files with 418 additions and 3 deletions
  1. README.md (+1, -1)
  2. main/kanyikan_recommend.py (+7, -2)
  3. main/publish.py (+410, -0)

+ 1 - 1
README.md

@@ -15,7 +15,7 @@ sh kanyikan.sh --log_type="recommend" --crawler="kanyikan" --env="dev"
 Production environment:
 * * * * * /bin/sh /Users/lieyunye/Desktop/crawler/crawler_kanyikan/kanyikan.sh --log_type="recommend" --crawler="kanyikan" --env="prod"
 Kill the process:
-ps aux | grep run_kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep run_kanyikan_recommend | grep -v grep | awk '{print $2}' | xargs kill -9
 
 videoid.txt stores the video info:
    - video ID

+ 7 - 2
main/kanyikan_recommend.py

@@ -11,7 +11,7 @@ import urllib3
 sys.path.append(os.getcwd())
 from main.common import Common
 from main.feishu_lib import Feishu
-from main.kanyikan_recommend_publish import Publish
+from main.publish import Publish
 proxies = {"http": None, "https": None}
 
 
@@ -193,7 +193,12 @@ class Kanyikanrecommend:
         Common.logger("recommend").info("==========视频信息已保存至info.txt==========")
 
         # Upload the video
-        our_video_id = Publish.upload_and_publish(log_type, env, "recommend")
+        our_video_id = Publish.upload_and_publish(log_type=log_type,
+                                                  crawler=crawler,
+                                                  strategy="推荐抓取策略",
+                                                  our_uid="recommend",
+                                                  env=env,
+                                                  oss_endpoint="out")
         if env == "dev":
             our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
         else:
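
For context, the strategy and our_uid keyword arguments in the call above are consumed by Publish.uids() in the new main/publish.py: in prod, with strategy "推荐抓取策略", the loginUid is drawn at random from a fixed kanyikan pool; otherwise uids() falls back to the our_uid value. A minimal standalone sketch of that selection (the UID pool is copied from main/publish.py; pick_uid is an illustrative name, not part of the module):

import random

# UID pool copied from Publish.uids() for crawler == 'kanyikan', strategy == '推荐抓取策略', env == 'prod'
KANYIKAN_RECOMMEND_UIDS = [20631208, 20631209, 20631210, 20631211, 20631212,
                           20631213, 20631214, 20631215, 20631216, 20631217,
                           20631223, 20631224, 20631225, 20631226, 20631227]

def pick_uid(crawler, strategy, our_uid, env):
    if crawler == "kanyikan" and env == "prod" and strategy == "推荐抓取策略":
        return random.choice(KANYIKAN_RECOMMEND_UIDS)
    return our_uid  # e.g. the literal "recommend" passed above when env is dev

print(pick_uid("kanyikan", "推荐抓取策略", "recommend", "dev"))   # -> recommend
print(pick_uid("kanyikan", "推荐抓取策略", "recommend", "prod"))  # -> one of the pooled UIDs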

+ 410 - 0
main/publish.py

@@ -0,0 +1,410 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/1
+import json
+import os
+import random
+import shutil
+import sys
+import time
+import oss2
+import requests
+import urllib3
+sys.path.append(os.getcwd())
+from main.common import Common
+proxies = {"http": None, "https": None}
+
+
+class Publish:
+    @classmethod
+    def publish_video_dev(cls, log_type, request_data):
+        """
+        loginUid  站内uid (随机)
+        appType  默认:888888
+        crawlerSrcId   站外视频ID
+        crawlerSrcCode   渠道(自定义 KYK)
+        crawlerSrcPublishTimestamp  视频原发布时间
+        crawlerTaskTimestamp   爬虫创建时间(可以是当前时间)
+        videoPath  视频oss地址
+        coverImgPath  视频封面oss地址
+        title  标题
+        totalTime  视频时长
+        viewStatus  视频的有效状态 默认1
+        versionCode  版本 默认1
+        :return:
+        """
+        Common.logger(log_type).info('publish request data: {}'.format(request_data))
+        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
+        Common.logger(log_type).info('publish result: {}'.format(result))
+        video_id = result["data"]["id"]
+        Common.logger(log_type).info('video_id: {}'.format(video_id))
+        if result['code'] != 0:
+            Common.logger(log_type).error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.logger(log_type).info(
+                'publish success, crawlerSrcId = {}'.format(request_data['crawlerSrcId']))
+        return video_id
+
+    @classmethod
+    def publish_video_prod(cls, log_type, request_data):
+        """
+        loginUid  on-platform uid (random)
+        appType  default: 888888
+        crawlerSrcId   video ID on the source site
+        crawlerSrcCode   channel (custom, e.g. KYK)
+        crawlerSrcPublishTimestamp  original publish time of the video
+        crawlerTaskTimestamp   crawler task creation time (the current time is fine)
+        videoPath  OSS path of the video
+        coverImgPath  OSS path of the video cover
+        title  title
+        totalTime  video duration
+        viewStatus  validity status of the video, default 1
+        versionCode  version, default 1
+        :return:
+        """
+        Common.logger(log_type).info(f'publish request data: {request_data}')
+        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
+        Common.logger(log_type).info(f'publish result: {result}')
+        video_id = result["data"]["id"]
+        Common.logger(log_type).info(f'video_id: {video_id}')
+        if result['code'] != 0:
+            Common.logger(log_type).error('publish failure msg = {}'.format(result['msg']))
+        else:
+            Common.logger(log_type).info(
+                'publish success, crawlerSrcId = {}'.format(request_data['crawlerSrcId']))
+        return video_id
+
+    @classmethod
+    def request_post(cls, request_url, request_data):
+        """
+        POST request to the HTTP API
+        :param request_url: API URL
+        :param request_data: request parameters
+        :return: res_data in JSON format
+        """
+        urllib3.disable_warnings()
+        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
+        if response.status_code == 200:
+            res_data = json.loads(response.text)
+            return res_data
+
+    @classmethod
+    def bucket(cls, oss_endpoint):
+        """
+        Create the bucket client
+        :param oss_endpoint: inner: internal network; out: public network; hk: Hong Kong (accelerated endpoint)
+        :return: bucket
+        """
+        # The code below shows basic file upload, download, list and delete usage.
+
+        # First initialize AccessKeyId, AccessKeySecret, Endpoint, etc.
+        # They are read from environment variables, or placeholders such as "<your AccessKeyId>" can be replaced with real values.
+        #
+        # Taking the Hangzhou region as an example, the Endpoint can be:
+        #   http://oss-cn-hangzhou.aliyuncs.com
+        #   https://oss-cn-hangzhou.aliyuncs.com
+        # accessed over HTTP or HTTPS respectively.
+        access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
+        access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
+        bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
+        # OSS internal endpoint
+        if oss_endpoint == 'inner':
+            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
+        # OSS public endpoint
+        elif oss_endpoint == 'out':
+            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
+        elif oss_endpoint == 'hk':
+            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-accelerate.aliyuncs.com')
+        # default to the public endpoint
+        else:
+            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
+
+        # Make sure all of the parameters above are filled in correctly
+        for param in (access_key_id, access_key_secret, bucket_name, endpoint):
+            assert '<' not in param, 'Please set the parameter: ' + param
+
+        # Create the Bucket object; all Object-related operations go through it
+        bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
+        return bucket
+
+    """
+    Processing flow:
+    1. On a schedule (once a day at 1 AM), iterate over the contents of the files directory. Structure: files -> per-video folder -> video file + cover image + basic info
+    2. Upload the video file and cover image to OSS
+    - video file OSS path:   longvideo/crawler_local/video/prod/<file name>
+    - video cover OSS path:  longvideo/crawler_local/image/prod/<file name>
+    3. Publish the video
+    - read the basic info and call the publish API
+    """
+    # env / date (e.g. 20220225) / file name
+    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
+    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'
+
+    @classmethod
+    def put_file(cls, log_type, oss_endpoint, oss_file, local_file):
+        # cls.bucket.put_object_from_file(oss_file, local_file)
+        cls.bucket(oss_endpoint).put_object_from_file(oss_file, local_file)
+        Common.logger(log_type).info("put oss file = {}, local file = {} success".format(oss_file, local_file))
+
+    # Remove a local file
+    @classmethod
+    def remove_local_file(cls, log_type, local_file):
+        os.remove(local_file)
+        Common.logger(log_type).info("remove local file = {} success".format(local_file))
+
+    # Remove a local directory
+    @classmethod
+    def remove_local_file_dir(cls, log_type, local_file):
+        os.rmdir(local_file)
+        Common.logger(log_type).info("remove local file dir = {} success".format(local_file))
+
+    # On-platform UID
+    @classmethod
+    def uids(cls, crawler, strategy, our_uid, env):
+        """
+        On-platform UID
+        :param crawler: which crawler
+        :param env: which environment
+        :param strategy: strategy / ranking type; can also be a specific on-platform UID
+        :param our_uid: upload to this specific on-platform UID
+        :return: uid
+        """
+        # if env == 'dev':
+        #     uids_dev = [6267140, 6267141]
+        #     return random.choice(uids_dev)
+
+        # xiaoniangao
+        if crawler == 'xiaoniangao' and env == 'prod' and strategy == '定向爬虫策略':
+            uids_prod_xiaoniangao_follow = [50322210, 50322211, 50322212, 50322213, 50322214, 50322215,
+                                            50322216, 50322217, 50322218, 50322219, 50322220, 50322221, 50322236, 50322237]
+            return random.choice(uids_prod_xiaoniangao_follow)
+        elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '小时榜爬虫策略':
+            uids_prod_xiaoniangao_hour = [50322226, 50322227, 50322228, 50322229]
+            return random.choice(uids_prod_xiaoniangao_hour)
+        elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '播放量榜爬虫策略':
+            uids_prod_xiaoniangao_play = [50322222, 50322223, 50322224, 50322225]
+            return random.choice(uids_prod_xiaoniangao_play)
+
+        elif crawler == 'kanyikan' and env == 'prod' and strategy == '推荐抓取策略':
+            uids_prod_kanyikan_recommend = [20631208, 20631209, 20631210, 20631211, 20631212,
+                                            20631213, 20631214, 20631215, 20631216, 20631217,
+                                            20631223, 20631224, 20631225, 20631226, 20631227]
+            return random.choice(uids_prod_kanyikan_recommend)
+        elif crawler == 'kanyikan' and env == 'prod' and strategy == '朋友圈抓取策略':
+            uids_prod_kanyikan_moment = [20631208, 20631209, 20631210, 20631211, 20631212,
+                                         20631213, 20631214, 20631215, 20631216, 20631217,
+                                         20631223, 20631224, 20631225, 20631226, 20631227]
+            return random.choice(uids_prod_kanyikan_moment)
+
+        # elif crawler == 'gongzhonghao' and env == 'prod' and strategy == '定向爬虫策略':
+        #     uids_prod_gongzhonghao_follow = [26117675, 26117676, 26117677, 26117678, 26117679, 26117680]
+        #     return random.choice(uids_prod_gongzhonghao_follow)
+        #
+        # elif crawler == 'xigua' and env == 'prod' and strategy == '推荐榜爬虫策略':
+        #     uids_prod_gongzhonghao_follow = [50322238]
+        #     return random.choice(uids_prod_gongzhonghao_follow)
+
+        # elif crawler == 'benshanzhufu' and env == 'prod' and strategy == '推荐榜爬虫策略':
+        #     uids_prod_benshanzhufu_recommend = [20631262, 20631263, 20631264, 20631265, 20631266, 20631267, 20631268, 20631269, 20631271, 20631272]
+        #     return random.choice(uids_prod_benshanzhufu_recommend)
+
+        # elif crawler == 'suisuiniannianyingfuqi' and env == 'prod' and strategy == '推荐榜爬虫策略':
+        #     uids_prod_suisuiniannianyingfuqi_recommend = [26117547, 26117548, 26117549, 26117550, 26117551]
+        #     return random.choice(uids_prod_suisuiniannianyingfuqi_recommend)
+
+        elif crawler == 'ganggangdouchuan' and env == 'prod' and strategy == '推荐榜爬虫策略':
+            uids_prod_ganggangdouchuan_recommend = [26117661, 26117662, 26117663]
+            return random.choice(uids_prod_ganggangdouchuan_recommend)
+
+        elif crawler == 'jixiangxingfu' and env == 'prod' and strategy == '推荐榜爬虫策略':
+            uids_prod_jixiangxingfu_recommend = [26117478, 26117479, 26117480, 26117471, 26117473, 26117474, 26117475, 26117476, 26117477]
+            return random.choice(uids_prod_jixiangxingfu_recommend)
+
+        elif crawler == 'zhongmiaoyinxin' and env == 'prod' and strategy == '推荐榜爬虫策略':
+            uids_prod_zhongmiaoyinxin_recommend = [26117493, 26117494, 26117495, 26117496, 26117497, 26117498]
+            return random.choice(uids_prod_zhongmiaoyinxin_recommend)
+
+        elif crawler == 'zhiqingtiantiankan' and env == 'prod' and strategy == '推荐榜爬虫策略':
+            uids_prod_zhiqingtiantiankan_recommend = [20631253, 20631254, 20631255, 20631256, 20631257, 20631258, 20631259, 20631260, 20631261]
+            return random.choice(uids_prod_zhiqingtiantiankan_recommend)
+
+        else:
+            return our_uid
+
+    # Crawler channel code
+    @classmethod
+    def crawlersrccode(cls, crawler):
+        if crawler == 'youtube':
+            return 'YOUTUBE'
+        elif crawler == "kuaishou":
+            return "KUAISHOU_XCX"
+        elif crawler == "xiaoniangao":
+            return "XIAONIANGAO_XCX"
+        elif crawler == "gongzhonghao":
+            return "GONGZHONGHAO_XINXIN"
+        elif crawler == 'xigua':
+            return 'XIGUA'
+        elif crawler == 'weixinzhishu':
+            return 'WEIXINZHISHU'
+        elif crawler == "douyin":
+            return "DOUYIN"
+        elif crawler == "benshanzhufu":
+            return "BENSHANZHUFU"
+        elif crawler == 'suisuiniannianyingfuqi':
+            return 'SUISUINIANNIANYINGFUQI'
+        elif crawler == 'jixiangxingfu':
+            return 'JIXIANGXINGFU'
+        elif crawler == 'ganggangdouchuan':
+            return 'GANGGANGDOUCHUAN'
+        elif crawler == 'zhongmiaoyinxin':
+            return 'ZHONGMIAOYINXIN'
+        elif crawler == 'zhiqingzongqun':
+            return 'ZHIQINGZONGQUN'
+        elif crawler == 'zhiqingtiantiankan':
+            return 'ZHIQINGZONGQUN'
+
+        elif crawler == 'kanyikan':
+            return 'KANYIKAN'
+        elif crawler == "weishi":
+            return "WEISHI"
+        elif crawler == 'shipinhao':
+            return 'SHIPINHAO_XCX'
+        elif crawler == 'zhihu':
+            return 'ZHIHU'
+        elif crawler == 'zhufumao':
+            return 'ZHUFUMAO'
+        elif crawler == 'zongjiao':
+            return 'ZONGJIAO'
+        elif crawler == 'haokan':
+            return 'HAOKAN'
+        elif crawler == 'kandaojiushifuqi':
+            return 'KANDAOJIUSHIFUQI'
+        elif crawler == 'shengshengyingyin':
+            return 'SHENGSHENGYINGYIN'
+        else:
+            return "CRAWLER"
+
+    @classmethod
+    def local_file_path(cls, crawler):
+        local_file_path = f'./{crawler}/videos'
+        video_file = 'video'
+        image_file = 'image'
+        info_file = 'info'
+
+        local_file_dict = {
+            'local_file_path': local_file_path,
+            'video_file': video_file,
+            'image_file': image_file,
+            'info_file': info_file}
+        return local_file_dict
+
+    @classmethod
+    def upload_and_publish(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint):
+        """
+        Upload the video to OSS and publish it
+        :param log_type: which log to use
+        :param crawler: which crawler
+        :param env: dev for the test environment, prod for production
+        :param our_uid: on-platform UID
+        :param strategy: strategy / ranking type
+        :param oss_endpoint: inner: internal network; out: public network
+        """
+        Common.logger(log_type).info("upload_and_publish starting...")
+        today = time.strftime("%Y%m%d", time.localtime())
+        # all video folders under the videos directory
+        files = os.listdir(cls.local_file_path(crawler)["local_file_path"])
+        for fv in files:
+            try:
+                # a single video folder
+                fi_d = os.path.join(cls.local_file_path(crawler)["local_file_path"], fv)
+                # make sure it is a directory
+                if os.path.isdir(fi_d):
+                    Common.logger(log_type).info('dir = {}'.format(fi_d))
+                    # list the files inside the video folder
+                    dir_files = os.listdir(fi_d)
+                    data = {'appType': '888888',
+                            'crawlerSrcCode': cls.crawlersrccode(crawler),
+                            'viewStatus': '1',
+                            'versionCode': '1'}
+                    now_timestamp = int(round(time.time() * 1000))
+                    data['crawlerTaskTimestamp'] = str(now_timestamp)
+                    data['loginUid'] = cls.uids(crawler, strategy, our_uid, env)
+                    # all files inside this video folder
+                    for fi in dir_files:
+                        # full path of each file in the folder
+                        fi_path = fi_d + '/' + fi
+                        Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
+                        # read info.txt and populate data
+                        if cls.local_file_path(crawler)["info_file"] in fi:
+                            f = open(fi_path, "r", encoding="UTF-8")
+                            # read the data; correctness is guaranteed when it is written, so no extra validation here
+                            for i in range(14):
+                                line = f.readline()
+                                line = line.replace('\n', '')
+                                if line is not None and len(line) != 0 and not line.isspace():
+                                    # Common.logger(log_type).info("line = {}".format(line))
+                                    if i == 0:
+                                        data['crawlerSrcId'] = line
+                                    elif i == 1:
+                                        data['title'] = line
+                                    elif i == 2:
+                                        data['totalTime'] = line
+                                    elif i == 8:
+                                        data['crawlerSrcPublishTimestamp'] = line
+                                else:
+                                    Common.logger(log_type).warning("{} line is None".format(fi_path))
+                            f.close()
+                            # remove info.txt
+                            cls.remove_local_file(log_type, fi_path)
+                    # refresh the file list
+                    dir_files = os.listdir(fi_d)
+                    for fi in dir_files:
+                        fi_path = fi_d + '/' + fi
+                        # Common.logger(log_type).info('dir fi_path = {}'.format(fi_path))
+                        # upload to OSS
+                        if cls.local_file_path(crawler)["video_file"] in fi:
+                            global oss_video_file
+                            if env == "dev":
+                                oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
+                            elif env == "hk":
+                                oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
+                            Common.logger(log_type).info("oss_video_file = {}".format(oss_video_file))
+                            cls.put_file(log_type, oss_endpoint, oss_video_file, fi_path)
+                            data['videoPath'] = oss_video_file
+                            Common.logger(log_type).info("videoPath = {}".format(oss_video_file))
+                        elif cls.local_file_path(crawler)["image_file"] in fi:
+                            global oss_image_file
+                            if env == "dev":
+                                oss_image_file = cls.oss_file_path_image.format("dev", today, data['crawlerSrcId'])
+                            elif env == "prod":
+                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
+                            elif env == "hk":
+                                oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
+                            Common.logger(log_type).info("oss_image_file = {}".format(oss_image_file))
+                            cls.put_file(log_type, oss_endpoint, oss_image_file, fi_path)
+                            data['coverImgPath'] = oss_image_file
+                            Common.logger(log_type).info("coverImgPath = {}".format(oss_image_file))
+                        # remove all local files after upload
+                        cls.remove_local_file(log_type, fi_path)
+
+                    # publish
+                    if env == "dev":
+                        video_id = cls.publish_video_dev(log_type, data)
+                    elif env == "prod":
+                        video_id = cls.publish_video_prod(log_type, data)
+                    elif env == "hk":
+                        video_id = cls.publish_video_prod(log_type, data)
+                    else:
+                        video_id = cls.publish_video_dev(log_type, data)
+                    cls.remove_local_file_dir(log_type, fi_d)
+                    Common.logger(log_type).info('video_id:{}'.format(video_id))
+                    return video_id
+
+                else:
+                    Common.logger(log_type).error('file not a dir = {}'.format(fi_d))
+            except Exception as e:
+                # delete the video folder
+                shutil.rmtree(f"./{crawler}/videos/{fv}/")
+                Common.logger(log_type).exception('upload_and_publish error: {}'.format(e))
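
For reference, a minimal usage sketch of the new module. upload_and_publish scans ./{crawler}/videos for per-video folders, each holding a video file, a cover image and an info file (of the 14 lines read from it, only line 1 = external video ID, line 2 = title, line 3 = duration and line 9 = original publish timestamp are used); after upload, the local folder is removed. The folder layout and file names below are illustrative assumptions, and the snippet assumes the repo's main.common module is importable from the working directory:

# Expected layout (assumed example), prepared by the crawler beforehand:
# ./kanyikan/videos/
# └── <one folder per video>/
#     ├── video.mp4   -> longvideo/crawler_local/video/<env>/<date>/<crawlerSrcId>
#     ├── image.jpg   -> longvideo/crawler_local/image/<env>/<date>/<crawlerSrcId>
#     └── info.txt    -> crawlerSrcId, title, totalTime, ..., crawlerSrcPublishTimestamp
from main.publish import Publish

our_video_id = Publish.upload_and_publish(log_type="recommend",
                                          crawler="kanyikan",
                                          strategy="推荐抓取策略",
                                          our_uid="recommend",
                                          env="dev",           # dev publishes via videotest.yishihui.com
                                          oss_endpoint="out")  # inner / out / hk select the OSS endpoint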