xiaoniangao_hour.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/15
import datetime
import json
import os
import random
import shutil
import sys
import time

import requests
import urllib3

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.publish import Publish
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql

proxies = {"http": None, "https": None}

class XiaoniangaoHour:
    platform = "小年糕"
    words = "abcdefghijklmnopqrstuvwxyz0123456789"
    uid = f"""{"".join(random.sample(words, 8))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 12))}"""
    token = "".join(random.sample(words, 32))
    uid_token_dict = {
        "uid": uid,
        "token": token
    }
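
    # NOTE: the attributes above are generated once, at import time, and are then
    # shared by every request this class makes; get_uid_token() below returns a
    # fresh pair instead. The uid merely mimics a UUID-like 8-4-4-4-12 shape.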

    # Generate a fresh uid / token pair
    @classmethod
    def get_uid_token(cls):
        words = "abcdefghijklmnopqrstuvwxyz0123456789"
        uid = f"""{"".join(random.sample(words, 8))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 12))}"""
        token = "".join(random.sample(words, 32))
        uid_token_dict = {
            "uid": uid,
            "token": token
        }
        return uid_token_dict
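
    # Example shape (values are random on every call; this is illustrative only):
    #   {"uid": "q3k9x2v1-a7b0-4c5d-8e2f-1m6n0p4r9s3t", "token": "<32 random chars>"}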

    # Basic download-threshold rules
    @staticmethod
    def download_rule(video_dict):
        """
        Basic rules a video must satisfy before it is downloaded.
        :param video_dict: video info, dict
        :return: True if every rule passes, otherwise False
        """
        return (int(float(video_dict["duration"])) >= 40  # duration >= 40s
                and (int(video_dict["video_width"]) >= 0 or int(video_dict["video_height"]) >= 0)  # width / height
                and int(video_dict["play_cnt"]) >= 4000  # play count
                and int(video_dict["like_cnt"]) >= 0  # like count
                and int(video_dict["share_cnt"]) >= 0  # share count
                and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * 10)  # published <= 10 days ago
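
    # Usage sketch (hypothetical values):
    #   XiaoniangaoHour.download_rule({"duration": 60, "video_width": 720, "video_height": 1280,
    #                                  "play_cnt": 5000, "like_cnt": 10, "share_cnt": 2,
    #                                  "publish_time_stamp": int(time.time()) - 86400})  # -> True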

    # Emoji and symbol pools used to decorate titles
    @classmethod
    def get_expression(cls):
        # Emoji list
        expression_list = ['📍', '⭕️', '🔥', '📣', '🎈', '⚡', '🔔', '🚩', '💢', '💎', '👉', '💓', '❗️', '🔴', '🔺', '♦️', '♥️', '👉',
                           '👈', '🏆', '❤️\u200d🔥']
        # Symbol list
        char_list = ['...', '~~']
        return expression_list, char_list

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where platform="小年糕" and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)

    @classmethod
    def repeat_hour(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_xiaoniangao_hour where platform="小年糕" and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)
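
    # Both helpers return the number of rows already stored for this
    # out_video_id; any value > 0 therefore means "seen before".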

    # Fetch the recommendation feed list
    @classmethod
    def get_videoList(cls, log_type, crawler, env):
        uid_token_dict = cls.uid_token_dict
        url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends"
        headers = {
            "x-b3-traceid": '1c403a4aa72e3c',
            "X-Token-Id": 'ab619e96d801f1567388629260aa68ec-1202200806',
            "uid": uid_token_dict['uid'],
            "content-type": "application/json",
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)'
                          ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 '
                          'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN',
            "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html'
        }
        data = {
            "log_params": {
                "page": "discover_rec",
                "common": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.2",
                    "net": "wifi",
                    "scene": 1089
                }
            },
            "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg",
            "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg",
            "share_width": 625,
            "share_height": 500,
            "ext": {
                "fmid": 0,
                "items": {}
            },
            "app": "xng",
            "rec_scene": "discover_rec",
            "log_common_params": {
                "e": [{
                    "data": {
                        "page": "discoverIndexPage",
                        "topic": "recommend"
                    },
                    "ab": {}
                }],
                "ext": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.3",
                    "net": "wifi",
                    "scene": "1089"
                },
                "pj": "1",
                "pf": "2",
                "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
            },
            "refresh": False,
            "token": uid_token_dict["token"],
            "uid": uid_token_dict["uid"],
            "proj": "ma",
            "wx_ver": "8.0.20",
            "code_ver": "3.62.0"
        }
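        # Hedged note: verify=False skips TLS certificate validation and the
        # warning is silenced below; acceptable for this crawler behind trusted
        # proxies, but consider verify=True elsewhere.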
        urllib3.disable_warnings()
        r = requests.post(url=url, headers=headers, json=data, proxies=proxies, verify=False)
        if 'data' not in r.text or r.status_code != 200:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}\n")
            return
        elif "data" not in r.json():
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()}\n")
            return
        elif "list" not in r.json()["data"]:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']}\n")
            return
        elif len(r.json()['data']['list']) == 0:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']['list']}\n")
            return
        else:
            # Video feed list
            feeds = r.json()["data"]["list"]
            for i in range(len(feeds)):
                # Title: an emoji is randomly prepended / appended, or a symbol
                # is appended; "#表情" / "#符号" tags must be stripped before "#"
                if "title" in feeds[i]:
                    before_video_title = feeds[i]["title"].strip().replace("\n", "") \
                        .replace("#表情", "").replace("#符号", "").replace("#", "") \
                        .replace("/", "").replace("\r", "").replace("\\", "") \
                        .replace(".", "。").replace("&NBSP", "").replace(":", "") \
                        .replace("*", "").replace("?", "").replace('"', "") \
                        .replace("<", "").replace(">", "").replace("|", "") \
                        .replace(" ", "").replace("'", "")
                    expression_list, char_list = cls.get_expression()
                    # Pick one random emoji
                    expression = random.choice(expression_list)
                    # Candidate titles: emoji + title, title + emoji
                    expression_title_list = [expression + before_video_title, before_video_title + expression]
                    emoji_title = random.choice(expression_title_list)
                    # Candidate title: title + symbol
                    char_title = before_video_title + random.choice(char_list)
                    # Final title: symbol variant or emoji variant, 50/50
                    video_title = random.choice([char_title, emoji_title])
                else:
                    video_title = 0
                # Video ID
                video_id = feeds[i].get("vid", 0)
                # Play count
                video_play_cnt = feeds[i].get("play_pv", 0)
                # Like count
                video_like_cnt = feeds[i].get("favor", {}).get("total", 0)
                # Comment count
                video_comment_cnt = feeds[i].get("comment_count", 0)
                # Share count
                video_share_cnt = feeds[i].get("share", 0)
                # Duration (the API reports milliseconds)
                video_duration = int(feeds[i].get("du", 0) / 1000)
                # Width and height
                video_width = feeds[i].get("w", 0)
                video_height = feeds[i].get("h", 0)
                # Publish time (milliseconds)
                video_send_time = feeds[i].get("t", 0)
                publish_time_stamp = int(int(video_send_time) / 1000)
                publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                # User name / avatar
                if "user" in feeds[i]:
                    user_name = feeds[i]["user"]["nick"].strip().replace("\n", "") \
                        .replace("/", "").replace("快手", "").replace(" ", "") \
                        .replace("&NBSP", "").replace("\r", "")
                    head_url = feeds[i]["user"]["hurl"]
                else:
                    user_name = 0
                    head_url = 0
                # User ID
                profile_id = feeds[i].get("id", 0)
                # User mid
                profile_mid = feeds[i].get("user", {}).get("mid", 0)
                # Cover
                cover_url = feeds[i].get("url", 0)
                # Playback URL
                video_url = feeds[i].get("v_url", 0)
                video_dict = {
                    "video_title": video_title,
                    "video_id": video_id,
                    "duration": video_duration,
                    "play_cnt": video_play_cnt,
                    "like_cnt": video_like_cnt,
                    "comment_cnt": video_comment_cnt,
                    "share_cnt": video_share_cnt,
                    "user_name": user_name,
                    "publish_time_stamp": publish_time_stamp,
                    "publish_time_str": publish_time_str,
                    "video_width": video_width,
                    "video_height": video_height,
                    "avatar_url": head_url,
                    "profile_id": profile_id,
                    "profile_mid": profile_mid,
                    "cover_url": cover_url,
                    "video_url": video_url,
                    "session": f"xiaoniangao-hour-{int(time.time())}"
                }
                for k, v in video_dict.items():
                    Common.logger(log_type, crawler).info(f"{k}:{v}")
                # Drop invalid videos (any required field missing)
                if video_title == 0 or video_id == 0 or video_duration == 0 \
                        or video_send_time == 0 or user_name == 0 or head_url == 0 \
                        or cover_url == 0 or video_url == 0:
                    Common.logger(log_type, crawler).warning("无效视频\n")
                # Basic threshold rules
                elif cls.download_rule(video_dict) is False:
                    Common.logger(log_type, crawler).info("不满足基础门槛规则\n")
                # Already downloaded
                elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
                    Common.logger(log_type, crawler).info('视频已下载\n')
                # Filter-word hit
                elif any(str(word) in video_dict['video_title']
                         for word in get_config_from_mysql(log_type=log_type,
                                                           source=crawler,
                                                           env=env,
                                                           text="filter",
                                                           action="")):
                    Common.logger(log_type, crawler).info("视频已中过滤词\n")
                    time.sleep(1)
                else:
                    # Insert into the hour-level MySQL table
                    insert_sql = f""" insert into crawler_xiaoniangao_hour(profile_id,
                                    profile_mid,
                                    platform,
                                    out_video_id,
                                    video_title,
                                    user_name,
                                    cover_url,
                                    video_url,
                                    duration,
                                    publish_time,
                                    play_cnt,
                                    crawler_time_stamp,
                                    crawler_time)
                                    values({profile_id},
                                    {profile_mid},
                                    "{cls.platform}",
                                    "{video_id}",
                                    "{video_title}",
                                    "{user_name}",
                                    "{cover_url}",
                                    "{video_url}",
                                    {video_duration},
                                    "{publish_time_str}",
                                    {video_play_cnt},
                                    {int(time.time())},
                                    "{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))}"
                                    )"""
                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                    MysqlHelper.update_values(log_type, crawler, insert_sql, env)
                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')

    @classmethod
    def get_video_info(cls, log_type, crawler, p_id, p_mid, v_title, v_id):
        uid_token_dict = cls.uid_token_dict
        url = "https://kapi.xiaoniangao.cn/profile/get_profile_by_id"
        headers = {
            "x-b3-traceid": '1c403a4aa72e3c',
            "X-Token-Id": 'ab619e96d801f1567388629260aa68ec-1202200806',
            "uid": uid_token_dict['uid'],
            "content-type": "application/json",
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)'
                          ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 '
                          'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN',
            "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html'
        }
        data = {
            "play_src": "1",
            "profile_id": int(p_id),
            "profile_mid": int(p_mid),
            "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/"
                  "!400x400r/crop/400x400/interlace/1/format/jpg",
            "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail"
                    "/!80x80r/crop/80x80/interlace/1/format/jpg",
            "share_width": 625,
            "share_height": 500,
            "no_comments": True,
            "no_follow": True,
            "vid": v_id,
            "hot_l1_comment": True,
            "token": uid_token_dict['token'],
            "uid": uid_token_dict['uid'],
            "proj": "ma",
            "wx_ver": "8.0.20",
            "code_ver": "3.62.0",
            "log_common_params": {
                "e": [{
                    "data": {
                        "page": "dynamicSharePage"
                    }
                }],
                "ext": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.3",
                    "net": "wifi",
                    "scene": "1089"
                },
                "pj": "1",
                "pf": "2",
                "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
            }
        }
        urllib3.disable_warnings()
        r = requests.post(headers=headers, url=url, json=data, proxies=proxies, verify=False)
        if r.status_code != 200 or 'data' not in r.text:
            # Falls through and implicitly returns None; callers should guard for that
            Common.logger(log_type, crawler).warning(f"get_videoInfo:{r.text}\n")
        else:
            resp_data = r.json()["data"]
            hour_play_cnt = resp_data["play_pv"]
            hour_cover_url = resp_data["url"]
            hour_video_url = resp_data["v_url"]
            hour_video_duration = resp_data["du"]
            hour_video_comment_cnt = resp_data["comment_count"]
            hour_video_like_cnt = resp_data["favor"]["total"]
            hour_video_share_cnt = resp_data["share"]
            hour_video_width = resp_data["w"]
            hour_video_height = resp_data["h"]
            hour_video_send_time = resp_data["t"]
            publish_time_stamp = int(int(hour_video_send_time) / 1000)
            publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
            hour_user_name = resp_data["user"]["nick"]
            hour_head_url = resp_data["user"]["hurl"]
            video_info_dict = {
                "video_id": v_id,
                "video_title": v_title,
                "duration": hour_video_duration,
                "play_cnt": hour_play_cnt,
                "like_cnt": hour_video_like_cnt,
                "comment_cnt": hour_video_comment_cnt,
                "share_cnt": hour_video_share_cnt,
                "user_name": hour_user_name,
                "publish_time_stamp": publish_time_stamp,
                "publish_time_str": publish_time_str,
                "video_width": hour_video_width,
                "video_height": hour_video_height,
                "avatar_url": hour_head_url,
                "profile_id": p_id,
                "profile_mid": p_mid,
                "cover_url": hour_cover_url,
                "video_url": hour_video_url,
                "session": f"xiaoniangao-hour-{int(time.time())}"
            }
            return video_info_dict

    # Update the hour-chart data
    @classmethod
    def update_videoList(cls, log_type, crawler, strategy, oss_endpoint, env):
        """
        Update the hour-chart data.
        """
        # Only rows crawled within the last three days are refreshed
        three_days_ago = (datetime.date.today() + datetime.timedelta(days=-3)).strftime("%Y-%m-%d %H:%M:%S")
        update_time_stamp = int(time.mktime(time.strptime(three_days_ago, "%Y-%m-%d %H:%M:%S")))
        select_sql = f""" select * from crawler_xiaoniangao_hour where crawler_time_stamp >= {update_time_stamp} GROUP BY out_video_id """
        update_video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env)
        if len(update_video_list) == 0:
            Common.logger(log_type, crawler).info("暂无需要更新的小时榜数据\n")
            return
        for update_video_info in update_video_list:
            profile_id = update_video_info["profile_id"]
            profile_mid = update_video_info["profile_mid"]
            video_title = update_video_info["video_title"]
            video_id = update_video_info["out_video_id"]
            # Three measurement windows; each refreshes its own play-count column
            hour_column_map = {10: "ten_play_cnt", 15: "fifteen_play_cnt", 20: "twenty_play_cnt"}
            now = datetime.datetime.now()
            if now.hour in hour_column_map and now.minute <= 10:
                column = hour_column_map[now.hour]
                video_info_dict = cls.get_video_info(log_type=log_type,
                                                     crawler=crawler,
                                                     p_id=profile_id,
                                                     p_mid=profile_mid,
                                                     v_title=video_title,
                                                     v_id=video_id)
                if video_info_dict is None:
                    # get_video_info logged the failure; skip this video
                    continue
                play_cnt = video_info_dict['play_cnt']
                Common.logger(log_type, crawler).info(f"{column}:{play_cnt}")
                update_sql = f""" update crawler_xiaoniangao_hour set {column}={play_cnt} WHERE out_video_id="{video_id}"; """
                MysqlHelper.update_values(log_type, crawler, update_sql, env)
                cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint,
                                     env)
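
    # Scheduling assumption (hypothetical): a cron entry such as
    #   5 10,15,20 * * *  python3 xiaoniangao/xiaoniangao_hour.py
    # runs this shortly after each window opens; outside the 10:00 / 15:00 /
    # 20:00 windows update_videoList() is a no-op.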

    @classmethod
    def download(cls, log_type, crawler, video_info_dict, strategy, oss_endpoint, env):
        # Download the cover
        Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_info_dict["video_title"],
                               url=video_info_dict["cover_url"])
        # Download the video
        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_info_dict["video_title"],
                               url=video_info_dict["video_url"])
        # Save the video info to "./videos/{download_video_title}/info.txt"
        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_info_dict)
        # Upload the video
        Common.logger(log_type, crawler).info("开始上传视频...")
        our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                  crawler=crawler,
                                                  strategy=strategy,
                                                  our_uid="hour",
                                                  env=env,
                                                  oss_endpoint=oss_endpoint)
        if env == "dev":
            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
        else:
            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
        Common.logger(log_type, crawler).info("视频上传完成")
        if our_video_id is None:
            # Publishing failed: clean up the local video folder
            shutil.rmtree(f"./{crawler}/videos/{video_info_dict['video_title']}")
            return
        # Persist the video info to MySQL
        rule_dict = {
            "duration": {"min": 40},
            "play_cnt": {"min": 4000},
            "publish_day": {"min": 10}
        }
        insert_sql = f""" insert into crawler_video(video_id,
                        out_user_id,
                        platform,
                        strategy,
                        out_video_id,
                        video_title,
                        cover_url,
                        video_url,
                        duration,
                        publish_time,
                        play_cnt,
                        crawler_rule,
                        width,
                        height)
                        values({our_video_id},
                        "{video_info_dict['profile_id']}",
                        "{cls.platform}",
                        "小时榜爬虫策略",
                        "{video_info_dict['video_id']}",
                        "{video_info_dict['video_title']}",
                        "{video_info_dict['cover_url']}",
                        "{video_info_dict['video_url']}",
                        {int(video_info_dict['duration'])},
                        "{video_info_dict['publish_time_str']}",
                        {int(video_info_dict['play_cnt'])},
                        '{json.dumps(rule_dict)}',
                        {int(video_info_dict['video_width'])},
                        {int(video_info_dict['video_height'])}) """
        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
        Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
        # Write the video to Feishu
        Feishu.insert_columns(log_type, crawler, "yatRv2", "ROWS", 1, 2)
        # Video-ID sheet: write the data into the first row
        upload_time = int(time.time())
        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                   "小时级上升榜",
                   str(video_info_dict['video_id']),
                   str(video_info_dict['video_title']),
                   our_video_link,
                   video_info_dict['play_cnt'],
                   video_info_dict['comment_cnt'],
                   video_info_dict['like_cnt'],
                   video_info_dict['share_cnt'],
                   video_info_dict['duration'],
                   f"{video_info_dict['video_width']}*{video_info_dict['video_height']}",
                   str(video_info_dict['publish_time_str'].replace("-", "/")),
                   str(video_info_dict['user_name']),
                   str(video_info_dict['profile_id']),
                   str(video_info_dict['profile_mid']),
                   str(video_info_dict['avatar_url']),
                   str(video_info_dict['cover_url']),
                   str(video_info_dict['video_url'])]]
        time.sleep(1)
        Feishu.update_values(log_type, crawler, "yatRv2", "F2:Z2", values)
        Common.logger(log_type, crawler).info('视频信息写入飞书成功\n')

    # Download / upload
    @classmethod
    def download_publish(cls, log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint, env):
        # The *_play_cnt columns may still be NULL before their window has run,
        # so fall back to 0 before converting to int
        ten_play_cnt = int(update_video_info['ten_play_cnt'] or 0)
        fifteen_play_cnt = int(update_video_info['fifteen_play_cnt'] or 0)
        twenty_play_cnt = int(update_video_info['twenty_play_cnt'] or 0)
        if cls.repeat_video(log_type, crawler, video_info_dict["video_id"], env) != 0:
            Common.logger(log_type, crawler).info('视频已下载\n')
        # Play count >= 30000: download directly
        elif int(video_info_dict["play_cnt"]) >= 30000:
            Common.logger(log_type, crawler).info(
                f"播放量:{video_info_dict['play_cnt']} >= 30000,满足下载规则,开始下载视频")
            cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
        # Rising-chart logic: any single window gained >= 3000 plays,
        # or two windows each gained >= 1000
        elif ten_play_cnt >= 3000 or fifteen_play_cnt >= 3000 or twenty_play_cnt >= 3000:
            Common.logger(log_type, crawler).info(
                f"10:00 or 15:00 or 20:00 数据上升量:{ten_play_cnt} or {fifteen_play_cnt} or {twenty_play_cnt} >= 3000")
            Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
            cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
        elif ten_play_cnt >= 1000 and fifteen_play_cnt >= 1000:
            Common.logger(log_type, crawler).info(
                f"10:00 and 15:00 数据上升量:{ten_play_cnt} and {fifteen_play_cnt} >= 1000")
            Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
            cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
        elif fifteen_play_cnt >= 1000 and twenty_play_cnt >= 1000:
            Common.logger(log_type, crawler).info(
                f"15:00 and 20:00 数据上升量:{fifteen_play_cnt} and {twenty_play_cnt} >= 1000")
            Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
            cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
        elif ten_play_cnt >= 1000 and twenty_play_cnt >= 1000:
            Common.logger(log_type, crawler).info(
                f"今日10:00 / 20:00数据上升量:{ten_play_cnt} and {twenty_play_cnt} >= 1000")
            Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
            cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
        else:
            Common.logger(log_type, crawler).info("上升量不满足下载规则")
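
    # Threshold summary (mirrors download_publish above):
    #   play_cnt >= 30000                  -> download immediately
    #   any single window gain >= 3000     -> download
    #   two window gains each  >= 1000     -> download
    #   otherwise                          -> skip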


if __name__ == "__main__":
    # print(XiaoniangaoHour.get_expression())
    # print(XiaoniangaoHour.get_uid_token())
    # XiaoniangaoHour.get_videoList("test", "xiaoniangao", "dev")
    # XiaoniangaoHour.update_videoList("test", "xiaoniangao", "小时榜爬虫策略", "out", "dev")
    # three_days_ago = (datetime.date.today() + datetime.timedelta(days=-3)).strftime("%Y-%m-%d %H:%M:%S")
    # update_time_stamp = int(time.mktime(time.strptime(three_days_ago, "%Y-%m-%d %H:%M:%S")))
    # print(update_time_stamp)
    # print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))))
    print(XiaoniangaoHour.uid_token_dict)