publish.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/1/31
  4. """
  5. 站内UID配置 / 环境配置 / 视频上传
  6. """
  7. import json
  8. import os
  9. import random
  10. import shutil
  11. import sys
  12. import time
  13. import oss2
  14. import requests
  15. import urllib3
  16. sys.path.append(os.getcwd())
  17. from common.common import Common
  18. proxies = {"http": None, "https": None}
  19. class Publish:
  20. @classmethod
  21. def publish_video_dev(cls, log_type, crawler, request_data):
  22. """
  23. loginUid 站内uid (随机)
  24. appType 默认:888888
  25. crawlerSrcId 站外视频ID
  26. crawlerSrcCode 渠道(自定义 KYK)
  27. crawlerSrcPublishTimestamp 视频原发布时间
  28. crawlerTaskTimestamp 爬虫创建时间(可以是当前时间)
  29. videoPath 视频oss地址
  30. coverImgPath 视频封面oss地址
  31. title 标题
  32. totalTime 视频时长
  33. viewStatus 视频的有效状态 默认1
  34. versionCode 版本 默认1
  35. :return:
  36. """
  37. Common.logger(log_type, crawler).info('publish request data: {}'.format(request_data))
  38. result = cls.request_post('https://videotest.yishihui.com/longvideoapi/crawler/video/send', request_data)
  39. Common.logger(log_type, crawler).info('publish result: {}'.format(result))
  40. video_id = result["data"]["id"]
  41. Common.logger(log_type, crawler).info('video_id: {}'.format(video_id))
  42. if result['code'] != 0:
  43. Common.logger(log_type, crawler).error('pushlish failure msg = {}'.format(result['msg']))
  44. else:
  45. Common.logger(log_type, crawler).info(
  46. 'publish success video_id = : {}'.format(request_data['crawlerSrcId']))
  47. return video_id
  48. @classmethod
  49. def publish_video_prod(cls, log_type, crawler, request_data):
  50. """
  51. loginUid 站内uid (随机)
  52. appType 默认:888888
  53. crawlerSrcId 站外视频ID
  54. crawlerSrcCode 渠道(自定义 KYK)
  55. crawlerSrcPublishTimestamp 视频原发布时间
  56. crawlerTaskTimestamp 爬虫创建时间(可以是当前时间)
  57. videoPath 视频oss地址
  58. coverImgPath 视频封面oss地址
  59. title 标题
  60. totalTime 视频时长
  61. viewStatus 视频的有效状态 默认1
  62. versionCode 版本 默认1
  63. :return:
  64. """
  65. Common.logger(log_type, crawler).info(f'publish request data: {request_data}')
  66. result = cls.request_post('https://longvideoapi.piaoquantv.com/longvideoapi/crawler/video/send', request_data)
  67. Common.logger(log_type, crawler).info(f'publish result: {result}')
  68. video_id = result["data"]["id"]
  69. Common.logger(log_type, crawler).info(f'video_id: {video_id}')
  70. if result['code'] != 0:
  71. Common.logger(log_type, crawler).error('pushlish failure msg = {}'.format(result['msg']))
  72. else:
  73. Common.logger(log_type, crawler).info(
  74. 'publish success video_id = : {}'.format(request_data['crawlerSrcId']))
  75. return video_id
  76. @classmethod
  77. def request_post(cls, request_url, request_data):
  78. """
  79. post 请求 HTTP接口
  80. :param request_url: 接口URL
  81. :param request_data: 请求参数
  82. :return: res_data json格式
  83. """
  84. urllib3.disable_warnings()
  85. response = requests.post(url=request_url, data=request_data, proxies=proxies, verify=False)
  86. if response.status_code == 200:
  87. res_data = json.loads(response.text)
  88. return res_data
  89. @classmethod
  90. def bucket(cls, oss_endpoint):
  91. """
  92. 创建 bucket
  93. :param oss_endpoint: inner:内网;out:外网;hk:香港
  94. :return: bucket
  95. """
  96. # 以下代码展示了基本的文件上传、下载、罗列、删除用法。
  97. # 首先初始化AccessKeyId、AccessKeySecret、Endpoint等信息。
  98. # 通过环境变量获取,或者把诸如“<你的AccessKeyId>”替换成真实的AccessKeyId等。
  99. #
  100. # 以杭州区域为例,Endpoint可以是:
  101. # http://oss-cn-hangzhou.aliyuncs.com
  102. # https://oss-cn-hangzhou.aliyuncs.com
  103. # 分别以HTTP、HTTPS协议访问。
  104. access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', 'LTAIP6x1l3DXfSxm')
  105. access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', 'KbTaM9ars4OX3PMS6Xm7rtxGr1FLon')
  106. bucket_name = os.getenv('OSS_TEST_BUCKET', 'art-pubbucket')
  107. # OSS 内网
  108. if oss_endpoint == 'inner':
  109. endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou-internal.aliyuncs.com')
  110. # OSS 外网
  111. elif oss_endpoint == 'out':
  112. endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
  113. elif oss_endpoint == 'hk':
  114. endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-accelerate.aliyuncs.com')
  115. # 默认走外网
  116. else:
  117. endpoint = os.getenv('OSS_TEST_ENDPOINT', 'oss-cn-hangzhou.aliyuncs.com')
  118. # 确认上面的参数都填写正确了
  119. for param in (access_key_id, access_key_secret, bucket_name, endpoint):
  120. assert '<' not in param, '请设置参数:' + param
  121. # 创建Bucket对象,所有Object相关的接口都可以通过Bucket对象来进行
  122. bucket = oss2.Bucket(oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name)
  123. return bucket
  124. """
  125. 处理流程:
  126. 1. 定时(每天凌晨1点执行一次)循环files文件下的内容 结构:files -> 视频文件夹 -> 视频文件 + 封面图 + 基本信息
  127. 2. 视频文件和封面上传到oss
  128. - 视频文件oss目录 longvideo/crawler_local/video/prod/文件名
  129. - 视频封面oss目录 longvideo/crawler_local/image/prod/文件名
  130. 3. 发布视频
  131. - 读取 基本信息 调用发布接口
  132. """
  133. # env 日期20220225 文件名
  134. oss_file_path_video = 'longvideo/crawler_local/video/{}/{}/{}'
  135. oss_file_path_image = 'longvideo/crawler_local/image/{}/{}/{}'
  136. @classmethod
  137. def put_file(cls, log_type, crawler, oss_endpoint, oss_file, local_file):
  138. # cls.bucket.put_object_from_file(oss_file, local_file)
  139. cls.bucket(oss_endpoint).put_object_from_file(oss_file, local_file)
  140. Common.logger(log_type, crawler).info("put oss file = {}, local file = {} success".format(oss_file, local_file))
  141. # 清除本地文件
  142. @classmethod
  143. def remove_local_file(cls, log_type, crawler, local_file):
  144. os.remove(local_file)
  145. Common.logger(log_type, crawler).info("remove local file = {} success".format(local_file))
  146. # 清除本地文件夹
  147. @classmethod
  148. def remove_local_file_dir(cls, log_type, crawler, local_file):
  149. os.rmdir(local_file)
  150. Common.logger(log_type, crawler).info("remove local file dir = {} success".format(local_file))
  151. # 站内 UID
  152. @classmethod
  153. def uids(cls, crawler, strategy, our_uid, env):
  154. """
  155. 站内 ID
  156. :param crawler: 哪款爬虫
  157. :param env: 什么环境
  158. :param strategy: 榜单类型,也可以是指定的站内 UID
  159. :param our_uid: 上传到指定站内 UID
  160. :return: uid
  161. """
  162. if env == 'dev':
  163. uids_dev = [6267140, 6267141]
  164. return random.choice(uids_dev)
  165. # 小年糕
  166. elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '定向爬虫策略':
  167. uids_prod_xiaoniangao_follow = [50322210, 50322211, 50322212, 50322213, 50322214, 50322215,
  168. 50322216, 50322217, 50322218, 50322219, 50322220, 50322221, 50322236, 50322237]
  169. return random.choice(uids_prod_xiaoniangao_follow)
  170. elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '小时榜爬虫策略':
  171. uids_prod_xiaoniangao_hour = [50322226, 50322227, 50322228, 50322229]
  172. return random.choice(uids_prod_xiaoniangao_hour)
  173. elif crawler == 'xiaoniangao' and env == 'prod' and strategy == '播放量榜爬虫策略':
  174. uids_prod_xiaoniangao_play = [50322222, 50322223, 50322224, 50322225]
  175. return random.choice(uids_prod_xiaoniangao_play)
  176. elif crawler == 'gongzhonghao' and env == 'prod' and strategy == '定向爬虫策略':
  177. uids_prod_gongzhonghao_follow = [26117675, 26117676, 26117677, 26117678, 26117679, 26117680]
  178. return random.choice(uids_prod_gongzhonghao_follow)
  179. elif crawler == 'kanyikan':
  180. uids_prod_kanyikan_moment = [20631208, 20631209, 20631210, 20631211, 20631212,
  181. 20631213, 20631214, 20631215, 20631216, 20631217,
  182. 20631223, 20631224, 20631225, 20631226, 20631227]
  183. return random.choice(uids_prod_kanyikan_moment)
  184. elif crawler == 'ggdc' and env == 'prod' and strategy == 'kanyikan_recommend':
  185. uids_ggdc_prod_recommend = [26117661, 26117662, 26117663]
  186. return random.choice(uids_ggdc_prod_recommend)
  187. elif crawler == 'ggdc' and env == 'prod' and strategy == 'follow':
  188. uids_ggdc_prod_follow = [26117661, 26117662, 26117663]
  189. return random.choice(uids_ggdc_prod_follow)
  190. else:
  191. return our_uid
  192. # 爬虫渠道号
  193. @classmethod
  194. def crawlersrccode(cls, crawler):
  195. if crawler == 'youtube':
  196. return 'YOUTUBE'
  197. elif crawler == "kuaishou":
  198. return "KUAISHOU_XCX"
  199. elif crawler == "xiaoniangao":
  200. return "XIAONIANGAO_XCX"
  201. elif crawler == "gongzhonghao":
  202. return "GONGZHONGHAO_XINXIN"
  203. elif crawler == 'xigua':
  204. return 'XIGUA'
  205. elif crawler == 'weixinzhishu':
  206. return 'WEIXINZHISHU'
  207. elif crawler == "douyin":
  208. return "DOUYIN"
  209. elif crawler == 'kanyikan':
  210. return 'KANYIKAN'
  211. elif crawler == "weishi":
  212. return "WEISHI"
  213. elif crawler == "benshanzhufu":
  214. return "BENSHANZHUFU"
  215. elif crawler == 'shipinhao':
  216. return 'SHIPINHAO_XCX'
  217. elif crawler == 'zhihu':
  218. return 'ZHIHU'
  219. elif crawler == 'jixiangxingfu':
  220. return 'JIXIANGXINGFU'
  221. elif crawler == 'zhongmiaoyinxin':
  222. return 'ZHONGMIAOYINXIN'
  223. elif crawler == 'suisuiniannianyingfuqi':
  224. return 'SUISUINIANNIANYINGFUQI'
  225. elif crawler == 'zhufumao':
  226. return 'ZHUFUMAO'
  227. elif crawler == 'zongjiao':
  228. return 'ZONGJIAO'
  229. elif crawler == 'haokan':
  230. return 'HAOKAN'
  231. elif crawler == 'kandaojiushifuqi':
  232. return 'KANDAOJIUSHIFUQI'
  233. elif crawler == 'shengshengyingyin':
  234. return 'SHENGSHENGYINGYIN'
  235. elif crawler == 'ganggangdouchuan':
  236. return 'GANGGANGDOUCHUAN'
  237. else:
  238. return "CRAWLER"
  239. @classmethod
  240. def local_file_path(cls, crawler):
  241. local_file_path = f'./{crawler}/videos'
  242. video_file = 'video'
  243. image_file = 'image'
  244. info_file = 'info'
  245. loacl_file_dict = {
  246. 'local_file_path': local_file_path,
  247. 'video_file': video_file,
  248. 'image_file': image_file,
  249. 'info_file': info_file}
  250. return loacl_file_dict
  251. @classmethod
  252. def upload_and_publish(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint):
  253. """
  254. 上传视频到 oss
  255. :param log_type: 选择的 log
  256. :param crawler: 哪款爬虫
  257. :param env: 测试环境:dev,正式环境:prod
  258. :param our_uid: 站内 UID
  259. :param strategy: 榜单类型
  260. :param oss_endpoint: 内网:inner;外网:out
  261. """
  262. Common.logger(log_type, crawler).info("upload_and_publish starting...")
  263. today = time.strftime("%Y%m%d", time.localtime())
  264. # videos 目录下的所有视频文件夹
  265. files = os.listdir(cls.local_file_path(crawler)["local_file_path"])
  266. for fv in files:
  267. try:
  268. # 单个视频文件夹
  269. fi_d = os.path.join(cls.local_file_path(crawler)["local_file_path"], fv)
  270. # 确认为视频文件夹
  271. if os.path.isdir(fi_d):
  272. Common.logger(log_type, crawler).info('dir = {}'.format(fi_d))
  273. # 列出所有视频文件夹
  274. dir_files = os.listdir(fi_d)
  275. data = {'appType': '888888',
  276. 'crawlerSrcCode': cls.crawlersrccode(crawler),
  277. 'viewStatus': '1',
  278. 'versionCode': '1'}
  279. now_timestamp = int(round(time.time() * 1000))
  280. data['crawlerTaskTimestamp'] = str(now_timestamp)
  281. data['loginUid'] = cls.uids(crawler, strategy, our_uid, env)
  282. # 单个视频文件夹下的所有视频文件
  283. for fi in dir_files:
  284. # 视频文件夹下的所有文件路径
  285. fi_path = fi_d + '/' + fi
  286. Common.logger(log_type, crawler).info('dir fi_path = {}'.format(fi_path))
  287. # 读取 info.txt,赋值给 data
  288. if cls.local_file_path(crawler)["info_file"] in fi:
  289. f = open(fi_path, "r", encoding="UTF-8")
  290. # 读取数据 数据准确性写入的时候保证 读取暂不处理
  291. for i in range(14):
  292. line = f.readline()
  293. line = line.replace('\n', '')
  294. if line is not None and len(line) != 0 and not line.isspace():
  295. # Common.logger(log_type, crawler).info("line = {}".format(line))
  296. if i == 0:
  297. data['crawlerSrcId'] = line
  298. elif i == 1:
  299. data['title'] = line
  300. elif i == 2:
  301. data['totalTime'] = line
  302. elif i == 8:
  303. data['crawlerSrcPublishTimestamp'] = line
  304. else:
  305. Common.logger(log_type, crawler).warning("{} line is None".format(fi_path))
  306. f.close()
  307. # remove info.txt
  308. cls.remove_local_file(log_type, crawler, fi_path)
  309. # 刷新数据
  310. dir_files = os.listdir(fi_d)
  311. for fi in dir_files:
  312. fi_path = fi_d + '/' + fi
  313. # Common.logger(log_type, crawler).info('dir fi_path = {}'.format(fi_path))
  314. # 上传oss
  315. if cls.local_file_path(crawler)["video_file"] in fi:
  316. global oss_video_file
  317. if env == "dev":
  318. oss_video_file = cls.oss_file_path_video.format("dev", today, data['crawlerSrcId'])
  319. elif env == "prod":
  320. oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
  321. elif env == "hk":
  322. oss_video_file = cls.oss_file_path_video.format("prod", today, data['crawlerSrcId'])
  323. Common.logger(log_type, crawler).info("oss_video_file = {}".format(oss_video_file))
  324. cls.put_file(log_type, crawler, oss_endpoint, oss_video_file, fi_path)
  325. data['videoPath'] = oss_video_file
  326. Common.logger(log_type, crawler).info("videoPath = {}".format(oss_video_file))
  327. elif cls.local_file_path(crawler)["image_file"] in fi:
  328. global oss_image_file
  329. if env == "dev":
  330. oss_image_file = cls.oss_file_path_image.format("env", today, data['crawlerSrcId'])
  331. elif env == "prod":
  332. oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
  333. elif env == "hk":
  334. oss_image_file = cls.oss_file_path_image.format("prod", today, data['crawlerSrcId'])
  335. Common.logger(log_type, crawler).info("oss_image_file = {}".format(oss_image_file))
  336. cls.put_file(log_type, crawler, oss_endpoint, oss_image_file, fi_path)
  337. data['coverImgPath'] = oss_image_file
  338. Common.logger(log_type, crawler).info("coverImgPath = {}".format(oss_image_file))
  339. # 全部remove
  340. cls.remove_local_file(log_type, crawler, fi_path)
  341. # 发布
  342. if env == "dev":
  343. video_id = cls.publish_video_dev(log_type, crawler, data)
  344. elif env == "prod":
  345. video_id = cls.publish_video_prod(log_type, crawler, data)
  346. elif env == "hk":
  347. video_id = cls.publish_video_prod(log_type, crawler, data)
  348. else:
  349. video_id = cls.publish_video_dev(log_type, crawler, data)
  350. cls.remove_local_file_dir(log_type, crawler, fi_d)
  351. Common.logger(log_type, crawler).info('video_id:{}', video_id)
  352. return video_id
  353. else:
  354. Common.logger(log_type, crawler).error('file not a dir = {}'.format(fi_d))
  355. except Exception as e:
  356. # 删除视频文件夹
  357. shutil.rmtree(f"./{crawler}/videos/{fv}/")
  358. Common.logger(log_type, crawler).exception('upload_and_publish error', e)