# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/1/31
"""
In-app UID configuration / environment configuration / video upload
"""
import json
import os
import random
import shutil
import sys
import time

import oss2
import requests
import urllib3

sys.path.append(os.getcwd())
from common.common import Common

# Disable system proxies for all outbound requests
proxies = {"http": None, "https": None}

class Publish:
    @classmethod
    def publish_video_dev(cls, log_type, crawler, request_data):
        """
        loginUid                    in-app UID (randomly chosen)
        appType                     default: 888888
        crawlerSrcId                off-site video ID
        crawlerSrcCode              channel (custom, e.g. KYK)
        crawlerSrcPublishTimestamp  original publish time of the video
        crawlerTaskTimestamp        crawler task creation time (the current time is fine)
        videoPath                   OSS path of the video file
        coverImgPath                OSS path of the cover image
        title                       title
        totalTime                   video duration
        viewStatus                  video validity status, default 1
        versionCode                 version, default 1
        :return: video_id
        """
        Common.logger(log_type, crawler).info('publish request data: {}'.format(request_data))
        result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
        Common.logger(log_type, crawler).info('publish result: {}'.format(result))
        # Check the response code before touching result["data"]; it may be missing on failure
        if result is None or result['code'] != 0:
            Common.logger(log_type, crawler).error('publish failure, result = {}'.format(result))
            return None
        video_id = result["data"]["id"]
        Common.logger(log_type, crawler).info('video_id: {}'.format(video_id))
        Common.logger(log_type, crawler).info('publish success, crawlerSrcId = {}'.format(request_data['crawlerSrcId']))
        return video_id
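
    # A minimal request_data sketch for the publish endpoints (hypothetical
    # values; the OSS paths assume the video and cover were already uploaded):
    #
    #     request_data = {
    #         'loginUid': 6267140,
    #         'appType': '888888',
    #         'crawlerSrcId': 'demo123',
    #         'crawlerSrcCode': 'KYK',
    #         'crawlerSrcPublishTimestamp': '1675123200000',
    #         'crawlerTaskTimestamp': str(int(round(time.time() * 1000))),
    #         'videoPath': 'longvideo/crawler_local/video/dev/20230131/demo123',
    #         'coverImgPath': 'longvideo/crawler_local/image/dev/20230131/demo123',
    #         'title': 'demo title',
    #         'totalTime': '60',
    #         'viewStatus': '1',
    #         'versionCode': '1',
    #     }
    #     video_id = Publish.publish_video_dev('demo', 'kanyikan', request_data)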

    @classmethod
    def publish_video_prod(cls, log_type, crawler, request_data):
        """
        loginUid                    in-app UID (randomly chosen)
        appType                     default: 888888
        crawlerSrcId                off-site video ID
        crawlerSrcCode              channel (custom, e.g. KYK)
        crawlerSrcPublishTimestamp  original publish time of the video
        crawlerTaskTimestamp        crawler task creation time (the current time is fine)
        videoPath                   OSS path of the video file
        coverImgPath                OSS path of the cover image
        title                       title
        totalTime                   video duration
        viewStatus                  video validity status, default 1
        versionCode                 version, default 1
        :return: video_id
        """
        Common.logger(log_type, crawler).info(f'publish request data: {request_data}')
        result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
        Common.logger(log_type, crawler).info(f'publish result: {result}')
        # Check the response code before touching result["data"]; it may be missing on failure
        if result is None or result['code'] != 0:
            Common.logger(log_type, crawler).error(f'publish failure, result = {result}')
            return None
        video_id = result["data"]["id"]
        Common.logger(log_type, crawler).info(f'video_id: {video_id}')
        Common.logger(log_type, crawler).info(f'publish success, crawlerSrcId = {request_data["crawlerSrcId"]}')
        return video_id

    @classmethod
    def request_post(cls, request_url, request_data):
        """
        POST request to an HTTP API
        :param request_url: API URL
        :param request_data: request parameters
        :return: response body parsed as JSON, or None on a non-200 status
        """
        urllib3.disable_warnings()
        response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
        if response.status_code == 200:
            res_data = json.loads(response.text)
            return res_data
        return None
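
    # Usage sketch (hypothetical URL and payload): request_post returns None on
    # a non-200 status, so callers should guard before indexing into the result:
    #
    #     result = Publish.request_post('https://example.com/api', {'key': 'value'})
    #     if result is not None and result.get('code') == 0:
    #         print(result['data'])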

    @classmethod
    def bucket(cls, oss_endpoint):
        """
        Create an OSS bucket handle
        :param oss_endpoint: inner: internal network; out: public network; hk: Hong Kong (accelerated)
        :return: bucket
        """
        # Basic upload/download/list/delete usage needs AccessKeyId,
        # AccessKeySecret and Endpoint, read from environment variables with
        # the fallbacks below.
        #
        # Taking the Hangzhou region as an example, the endpoint can be
        # http://oss-cn-hangzhou.aliyuncs.com (HTTP) or
        # https://oss-cn-hangzhou.aliyuncs.com (HTTPS).
        access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
        access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
        bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
        # OSS internal network
        if oss_endpoint == 'inner':
            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
        # OSS public network
        elif oss_endpoint == 'out':
            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
        # OSS transfer acceleration (used for Hong Kong)
        elif oss_endpoint == 'hk':
            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-accelerate.aliyuncs.com')
        # Default to the public network
        else:
            endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
        # Make sure the parameters above are all filled in correctly
        for param in (access_key_id, access_key_secret, bucket_name, endpoint):
            assert '<' not in param, 'please set the parameter: ' + param
        # Create a Bucket object; all object-related operations go through it
        bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
        return bucket
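
    # Usage sketch: pick an endpoint by network location and upload a file
    # ('./demo.mp4' and the OSS key are hypothetical):
    #
    #     bucket = Publish.bucket('out')
    #     bucket.put_object_from_file('longvideo/crawler_local/video/dev/20230131/demo123', './demo.mp4')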
- """
- 处理流程:
- 1. 定时(每天凌晨1点执行一次)循环files文件下的内容 结构:files -> 视频文件夹 -> 视频文件 + 封面图 + 基本信息
- 2. 视频文件和封面上传到oss
- - 视频文件oss目录 longvideo/crawler_local/video/prod/文件名
- - 视频封面oss目录 longvideo/crawler_local/image/prod/文件名
- 3. 发布视频
- - 读取 基本信息 调用发布接口
- """
    # {env} / {date, e.g. 20220225} / {file name}
    oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
    oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'
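    # For example, oss_file_path_video.format('prod', '20220225', 'demo123')
    # yields 'longvideo/crawler_local/video/prod/20220225/demo123'.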

    @classmethod
    def put_file(cls, log_type, crawler, oss_endpoint, oss_file, local_file):
        # Upload a local file to OSS
        cls.bucket(oss_endpoint).put_object_from_file(oss_file, local_file)
        Common.logger(log_type, crawler).info("put oss file = {}, local file = {} success".format(oss_file, local_file))

    # Remove a local file
    @classmethod
    def remove_local_file(cls, log_type, crawler, local_file):
        os.remove(local_file)
        Common.logger(log_type, crawler).info("remove local file = {} success".format(local_file))

    # Remove a local directory
    @classmethod
    def remove_local_file_dir(cls, log_type, crawler, local_file):
        os.rmdir(local_file)
        Common.logger(log_type, crawler).info("remove local file dir = {} success".format(local_file))
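
    # Note: os.rmdir only removes empty directories; upload_and_publish removes
    # every file first. For a non-empty directory, shutil.rmtree(local_file)
    # would be the alternative.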

    # In-app UIDs
    @classmethod
    def uids(cls, crawler, strategy, our_uid, env):
        """
        Pick the in-app UID to publish under
        :param crawler: which crawler
        :param env: which environment
        :param strategy: ranking strategy; can also be a specific in-app UID
        :param our_uid: upload under this specific in-app UID
        :return: uid
        """
        if env == 'dev':
            uids_dev = [6267140, 6267141]
            return random.choice(uids_dev)
        # xiaoniangao
        elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '定向爬虫策略':
            uids_prod_xiaoniangao_follow = [50322210, 50322211, 50322212, 50322213, 50322214, 50322215,
                                            50322216, 50322217, 50322218, 50322219, 50322220, 50322221]
            return random.choice(uids_prod_xiaoniangao_follow)
        elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '小时榜爬虫策略':
            uids_prod_xiaoniangao_hour = [50322226, 50322227, 50322228, 50322229]
            return random.choice(uids_prod_xiaoniangao_hour)
        elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '播放量榜爬虫策略':
            uids_prod_xiaoniangao_play = [50322222, 50322223, 50322224, 50322225]
            return random.choice(uids_prod_xiaoniangao_play)
        elif crawler == 'kanyikan':
            uids_prod_kanyikan_moment = [20631208, 20631209, 20631210, 20631211, 20631212,
                                         20631213, 20631214, 20631215, 20631216, 20631217,
                                         20631223, 20631224, 20631225, 20631226, 20631227]
            return random.choice(uids_prod_kanyikan_moment)
        elif crawler == 'ggdc' and env == 'prod' and strategy == 'kanyikan_recommend':
            uids_ggdc_prod_recommend = [26117661, 26117662, 26117663]
            return random.choice(uids_ggdc_prod_recommend)
        elif crawler == 'ggdc' and env == 'prod' and strategy == 'follow':
            uids_ggdc_prod_follow = [26117661, 26117662, 26117663]
            return random.choice(uids_ggdc_prod_follow)
        else:
            return our_uid
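
    # For example, uids('xiaoniangao', '小时榜爬虫策略', 0, 'prod') returns one of
    # the four hour-ranking UIDs at random; any unmatched combination falls
    # through to the caller-supplied our_uid.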

    # Crawler channel codes
    @classmethod
    def crawlersrccode(cls, crawler):
        # Map each crawler name to its channel code; unknown crawlers fall back to "CRAWLER"
        src_codes = {
            'youtube': 'YOUTUBE',
            'kanyikan': 'KANYIKAN',
            'kuaishou': 'KUAISHOU_XCX',
            'weishi': 'WEISHI',
            'xiaoniangao': 'XIAONIANGAO_XCX',
            'benshanzhufu': 'BENSHANZHUFU',
            'gongzhonghao_xinxin': 'GONGZHONGHAO_XINXIN',
            'shipinhao': 'SHIPINHAO_XCX',
            'xigua': 'XIGUA',
            'zhihu': 'ZHIHU',
            'jixiangxingfu': 'JIXIANGXINGFU',
            'zhongmiaoyinxin': 'ZHONGMIAOYINXIN',
            'suisuiniannianyingfuqi': 'SUISUINIANNIANYINGFUQI',
            'zhufumao': 'ZHUFUMAO',
            'zongjiao': 'ZONGJIAO',
            'haokan': 'HAOKAN',
            'kandaojiushifuqi': 'KANDAOJIUSHIFUQI',
            'shengshengyingyin': 'SHENGSHENGYINGYIN',
            'ganggangdouchuan': 'GANGGANGDOUCHUAN',
            'weixinzhishu': 'WEIXINZHISHU',
        }
        return src_codes.get(crawler, 'CRAWLER')

    @classmethod
    def local_file_path(cls, crawler):
        local_file_path = f'./{crawler}/videos'
        video_file = 'video'
        image_file = 'image'
        info_file = 'info'
        local_file_dict = {
            'local_file_path': local_file_path,
            'video_file': video_file,
            'image_file': image_file,
            'info_file': info_file}
        return local_file_dict
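
    # For example, local_file_path('kanyikan')['local_file_path'] is
    # './kanyikan/videos'; the 'video' / 'image' / 'info' values are substrings
    # matched against the file names inside each video folder.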

    @classmethod
    def upload_and_publish(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint):
        """
        Upload videos to OSS and publish them
        :param log_type: which log to use
        :param crawler: which crawler
        :param env: test environment: dev; production environment: prod
        :param our_uid: in-app UID
        :param strategy: ranking strategy
        :param oss_endpoint: internal network: inner; public network: out
        """
        Common.logger(log_type, crawler).info("upload_and_publish starting...")
        today = time.strftime("%Y%m%d", time.localtime())
        # All video folders under the videos directory
        files = os.listdir(cls.local_file_path(crawler)["local_file_path"])
        for fv in files:
            try:
                # A single video folder
                fi_d = os.path.join(cls.local_file_path(crawler)["local_file_path"], fv)
                # Make sure it really is a directory
                if os.path.isdir(fi_d):
                    Common.logger(log_type, crawler).info('dir = {}'.format(fi_d))
                    # All files inside the video folder
                    dir_files = os.listdir(fi_d)
                    data = {'appType': '888888',
                            'crawlerSrcCode': cls.crawlersrccode(crawler),
                            'viewStatus': '1',
                            'versionCode': '1'}
                    now_timestamp = int(round(time.time() * 1000))
                    data['crawlerTaskTimestamp'] = str(now_timestamp)
                    data['loginUid'] = cls.uids(crawler, strategy, our_uid, env)
                    for fi in dir_files:
                        # Full path of each file inside the video folder
                        fi_path = fi_d + '/' + fi
                        Common.logger(log_type, crawler).info('dir fi_path = {}'.format(fi_path))
                        # Read info.txt and copy its fields into data
                        if cls.local_file_path(crawler)["info_file"] in fi:
                            # Field correctness is guaranteed at write time,
                            # so the lines are read positionally without validation
                            with open(fi_path, "r", encoding="UTF-8") as f:
                                for i in range(14):
                                    line = f.readline().replace('\n', '')
                                    if line is not None and len(line) != 0 and not line.isspace():
                                        if i == 0:
                                            data['crawlerSrcId'] = line
                                        elif i == 1:
                                            data['title'] = line
                                        elif i == 2:
                                            data['totalTime'] = line
                                        elif i == 8:
                                            data['crawlerSrcPublishTimestamp'] = line
                                    else:
                                        Common.logger(log_type, crawler).warning("{} line is None".format(fi_path))
                            # Remove info.txt
                            cls.remove_local_file(log_type, crawler, fi_path)
                    # Re-list the folder now that info.txt is gone
                    dir_files = os.listdir(fi_d)
                    for fi in dir_files:
                        fi_path = fi_d + '/' + fi
                        # Upload to OSS; dev uses its own directory, prod and hk share the prod directory
                        if cls.local_file_path(crawler)["video_file"] in fi:
                            oss_video_file = cls.oss_file_path_video.format(
                                "dev" if env == "dev" else "prod", today, data['crawlerSrcId'])
                            Common.logger(log_type, crawler).info("oss_video_file = {}".format(oss_video_file))
                            cls.put_file(log_type, crawler, oss_endpoint, oss_video_file, fi_path)
                            data['videoPath'] = oss_video_file
                            Common.logger(log_type, crawler).info("videoPath = {}".format(oss_video_file))
                        elif cls.local_file_path(crawler)["image_file"] in fi:
                            oss_image_file = cls.oss_file_path_image.format(
                                "dev" if env == "dev" else "prod", today, data['crawlerSrcId'])
                            Common.logger(log_type, crawler).info("oss_image_file = {}".format(oss_image_file))
                            cls.put_file(log_type, crawler, oss_endpoint, oss_image_file, fi_path)
                            data['coverImgPath'] = oss_image_file
                            Common.logger(log_type, crawler).info("coverImgPath = {}".format(oss_image_file))
                        # Remove every local file once it has been handled
                        cls.remove_local_file(log_type, crawler, fi_path)
                    # Publish: prod and hk go to the production API, everything else to dev
                    if env in ("prod", "hk"):
                        video_id = cls.publish_video_prod(log_type, crawler, data)
                    else:
                        video_id = cls.publish_video_dev(log_type, crawler, data)
                    cls.remove_local_file_dir(log_type, crawler, fi_d)
                    Common.logger(log_type, crawler).info('video_id: {}'.format(video_id))
                    # Publishes a single folder per call and returns its video_id
                    return video_id
                else:
                    Common.logger(log_type, crawler).error('file not a dir = {}'.format(fi_d))
            except Exception as e:
                # Remove the video folder on failure
                shutil.rmtree(f"./{crawler}/videos/{fv}/")
                Common.logger(log_type, crawler).error('upload_and_publish error: {}'.format(e))
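

if __name__ == '__main__':
    # Minimal smoke-test sketch (hypothetical arguments): walk ./kanyikan/videos,
    # upload anything found there over the public OSS endpoint, and publish to
    # the dev environment. our_uid=0 is a placeholder; the dev branch of uids()
    # ignores it and picks a random dev UID.
    Publish.upload_and_publish(
        log_type='demo',
        crawler='kanyikan',
        strategy='kanyikan_recommend',
        our_uid=0,
        env='dev',
        oss_endpoint='out',
    )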