# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/16
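# Xiaoniangao "play count" crawler: pulls the recommended feed, filters videos
# against basic rules, then downloads, publishes and records each one.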
import json
import os
import random
import shutil
import sys
import time

import requests
import urllib3

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.publish import Publish
from common.public import get_config_from_mysql
from common.scheduling_db import MysqlHelper

proxies = {"http": None, "https": None}


class XiaoniangaoPlay:
    platform = "小年糕"
    words = "abcdefghijklmnopqrstuvwxyz0123456789"
    uid = f"""{"".join(random.sample(words, 8))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 12))}"""
    token = "".join(random.sample(words, 32))
    uid_token_dict = {
        "uid": uid,
        "token": token
    }
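    # Note: these class attributes are generated once at import time, so every
    # request made by this process reuses the same random uid/token pair.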

    # Generate uid & token
    @classmethod
    def get_uid_token(cls):
        words = "abcdefghijklmnopqrstuvwxyz0123456789"
        uid = f"""{"".join(random.sample(words, 8))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 12))}"""
        token = "".join(random.sample(words, 32))
        uid_token_dict = {
            "uid": uid,
            "token": token
        }
        return uid_token_dict
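    # Example of the returned shape (values are random on each call):
    # {"uid": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", "token": "<32 random chars>"}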

    # Basic threshold rules
    @classmethod
    def download_rule(cls, video_dict):
        """
        Basic rules for deciding whether to download a video
        :param video_dict: video info, dict
        :return: True if the rules are satisfied, otherwise False
        """
        # Duration
        if int(float(video_dict['duration'])) >= 40:
            # Width or height
            if int(video_dict['video_width']) >= 0 or int(video_dict['video_height']) >= 0:
                # Play count
                if int(video_dict['play_cnt']) >= 20000:
                    # Like count
                    if int(video_dict['like_cnt']) >= 0:
                        # Share count
                        if int(video_dict['share_cnt']) >= 0:
                            # Published within the last 60 days
                            if int(time.time()) - int(video_dict['publish_time_stamp']) <= 3600 * 24 * 60:
                                return True
                            else:
                                return False
                        else:
                            return False
                    else:
                        return False
                else:
                    return False
            return False
        return False
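    # Illustrative check with hypothetical values: a 50s video with 25k plays
    # published 10 days ago passes; a 30s video would fail on duration.
    # sample = {"duration": 50, "video_width": 720, "video_height": 1280,
    #           "play_cnt": 25000, "like_cnt": 100, "share_cnt": 5,
    #           "publish_time_stamp": int(time.time()) - 10 * 86400}
    # assert XiaoniangaoPlay.download_rule(sample) is True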

    # Emojis and separator strings
    @classmethod
    def get_expression(cls):
        # Emoji list
        expression_list = ['📍', '⭕️', '🔥', '📣', '🎈', '⚡', '🔔', '🚩', '💢', '💎', '👉', '💓', '❗️', '🔴', '🔺', '♦️', '♥️', '👉', '👈', '🏆', '❤️\u200d🔥']
        # Separator list
        char_list = ['...', '~~']
        return expression_list, char_list

    # Fetch the recommended video list
    @classmethod
    def get_videoList(cls, log_type, crawler, strategy, oss_endpoint, env):
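        """
        Request the recommend feed, build a video_dict for each item and hand
        it to download_publish.
        """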
        uid_token_dict = cls.uid_token_dict
        url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends"
        headers = {
            "x-b3-traceid": '1dc0a6d0929a2b',
            "X-Token-Id": 'ae99a4953804085ebb0ae36fa138031d-1146052582',
            "uid": uid_token_dict['uid'],
            "content-type": "application/json",
            "Accept-Encoding": "gzip,compress,br,deflate",
            "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)'
                          ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 '
                          'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN',
            "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/620/page-frame.html'
        }
        data = {
            "log_params": {
                "page": "discover_rec",
                "common": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.2",
                    "net": "wifi",
                    "scene": 1089
                }
            },
            "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg",
            "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg",
            "share_width": 625,
            "share_height": 500,
            "ext": {
                "fmid": 0,
                "items": {}
            },
            "app": "xng",
            "rec_scene": "discover_rec",
            "log_common_params": {
                "e": [{
                    "data": {
                        "page": "discoverIndexPage",
                        "topic": "recommend"
                    },
                    "ab": {}
                }],
                "ext": {
                    "brand": "iPhone",
                    "device": "iPhone 11",
                    "os": "iOS 14.7.1",
                    "weixinver": "8.0.20",
                    "srcver": "2.24.3",
                    "net": "wifi",
                    "scene": "1089"
                },
                "pj": "1",
                "pf": "2",
                "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
            },
            "refresh": False,
            "token": uid_token_dict['token'],
            "uid": uid_token_dict['uid'],
            "proj": "ma",
            "wx_ver": "8.0.20",
            "code_ver": "3.62.0"
        }
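        # Note: uid/token are the randomly generated class attributes; judging
        # from this code alone the endpoint does not appear to validate them
        # against a real account (assumption, not confirmed by the source).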
        urllib3.disable_warnings()
        r = requests.post(url=url, headers=headers, json=data, proxies=proxies, verify=False)
        if "data" not in r.text or r.status_code != 200:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}")
            return
        elif "data" not in r.json():
            Common.logger(log_type, crawler).info(f"get_videoList:{r.json()}")
            return
        elif "list" not in r.json()["data"]:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']}")
            return
        elif len(r.json()["data"]["list"]) == 0:
            Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']['list']}")
            return
        else:
            # Video list data
            feeds = r.json()["data"]["list"]
            for i in range(len(feeds)):
                # Title: an emoji is randomly added to the start or end, or a
                # separator is appended in place of mid-sentence punctuation
                if "title" in feeds[i]:
                    before_video_title = feeds[i]["title"].strip().replace("\n", "") \
                        .replace("/", "").replace("\r", "").replace("#", "") \
                        .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
                        .replace(":", "").replace("*", "").replace("?", "") \
                        .replace("?", "").replace('"', "").replace("<", "") \
                        .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号", "").replace('"', '').replace("'", '')
                    expression = cls.get_expression()
                    expression_list = expression[0]
                    char_list = expression[1]
                    # Pick one emoji at random
                    expression = random.choice(expression_list)
                    # Candidate titles: [emoji + title, title + emoji]
                    expression_title_list = [expression + before_video_title, before_video_title + expression]
                    # Pick one of the two at random
                    title_list1 = random.choice(expression_title_list)
                    # Candidate title: original title + separator
                    title_list2 = before_video_title + random.choice(char_list)
                    # Pool the emoji variant and the separator variant
                    title_list4 = [title_list2, title_list1]
                    # Final title
                    video_title = random.choice(title_list4)
                else:
                    video_title = 0
                # Video ID
                if "vid" in feeds[i]:
                    video_id = feeds[i]["vid"]
                else:
                    video_id = 0
                # Play count
                if "play_pv" in feeds[i]:
                    video_play_cnt = feeds[i]["play_pv"]
                else:
                    video_play_cnt = 0
                # Comment count
                if "comment_count" in feeds[i]:
                    video_comment_cnt = feeds[i]["comment_count"]
                else:
                    video_comment_cnt = 0
                # Like count
                if "favor" in feeds[i]:
                    video_like_cnt = feeds[i]["favor"]["total"]
                else:
                    video_like_cnt = 0
                # Share count
                if "share" in feeds[i]:
                    video_share_cnt = feeds[i]["share"]
                else:
                    video_share_cnt = 0
                # Duration (ms -> s)
                if "du" in feeds[i]:
                    video_duration = int(feeds[i]["du"] / 1000)
                else:
                    video_duration = 0
                # Width and height (both keys must be present)
                if "w" in feeds[i] and "h" in feeds[i]:
                    video_width = feeds[i]["w"]
                    video_height = feeds[i]["h"]
                else:
                    video_width = 0
                    video_height = 0
                # Publish time
                if "t" in feeds[i]:
                    video_send_time = feeds[i]["t"]
                else:
                    video_send_time = 0
                publish_time_stamp = int(int(video_send_time) / 1000)
                publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                # User name / avatar
                if "user" in feeds[i]:
                    user_name = feeds[i]["user"]["nick"].strip().replace("\n", "") \
                        .replace("/", "").replace("快手", "").replace(" ", "") \
                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")
                    head_url = feeds[i]["user"]["hurl"]
                else:
                    user_name = 0
                    head_url = 0
                # User ID
                profile_id = feeds[i]["id"]
                # User mid
                profile_mid = feeds[i]["user"]["mid"]
                # Video cover
                if "url" in feeds[i]:
                    cover_url = feeds[i]["url"]
                else:
                    cover_url = 0
                # Video playback URL
                if "v_url" in feeds[i]:
                    video_url = feeds[i]["v_url"]
                else:
                    video_url = 0
                video_dict = {
                    "video_title": video_title,
                    "video_id": video_id,
                    "duration": video_duration,
                    "play_cnt": video_play_cnt,
                    "like_cnt": video_like_cnt,
                    "comment_cnt": video_comment_cnt,
                    "share_cnt": video_share_cnt,
                    "user_name": user_name,
                    "publish_time_stamp": publish_time_stamp,
                    "publish_time_str": publish_time_str,
                    "video_width": video_width,
                    "video_height": video_height,
                    "avatar_url": head_url,
                    "profile_id": profile_id,
                    "profile_mid": profile_mid,
                    "cover_url": cover_url,
                    "video_url": video_url,
                    "session": f"xiaoniangao-play-{int(time.time())}"
                }
                for k, v in video_dict.items():
                    Common.logger(log_type, crawler).info(f"{k}:{v}")
                cls.download_publish(log_type=log_type,
                                     crawler=crawler,
                                     video_dict=video_dict,
                                     strategy=strategy,
                                     oss_endpoint=oss_endpoint,
                                     env=env)

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
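        """Return how many rows already exist for this out_video_id (0 means new)."""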
        sql = f""" select * from crawler_video where platform="小年糕" and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)

    @classmethod
    def download_publish(cls, log_type, crawler, video_dict, strategy, oss_endpoint, env):
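        """Filter one video, then download, upload and persist it."""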
        # Skip invalid videos
        if video_dict["video_id"] == 0 \
                or video_dict["video_url"] == 0 \
                or video_dict["cover_url"] == 0:
            Common.logger(log_type, crawler).warning("Invalid video\n")
        # Threshold rules
        elif cls.download_rule(video_dict) is False:
            Common.logger(log_type, crawler).info("Video does not satisfy the download rules\n")
        # Deduplication
        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
            Common.logger(log_type, crawler).info("Video already downloaded\n")
        # Filter words
        elif any(str(word) in video_dict['video_title'] for word in
                 get_config_from_mysql(log_type=log_type,
                                       source=crawler,
                                       env=env,
                                       text="filter",
                                       action="")):
            Common.logger(log_type, crawler).info("Video title hit a filter word\n")
        else:
            # Download the cover
            Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_dict["video_title"], url=video_dict["cover_url"])
            # Download the video
            Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
            # Save video info to "./videos/{download_video_title}/info.txt"
            Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
            # Upload the video
            Common.logger(log_type, crawler).info("Start uploading video...")
            our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                      crawler=crawler,
                                                      strategy=strategy,
                                                      our_uid="play",
                                                      env=env,
                                                      oss_endpoint=oss_endpoint)
            if env == "dev":
                our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            else:
                our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            Common.logger(log_type, crawler).info("Video upload finished")
            if our_video_id is None:
                # Remove the video folder
                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
                return
            # Save video info to the database
            rule_dict = {
                "duration": {"min": 40},
                "play_cnt": {"min": 80000},
                "min_publish_day": {"min": 60}
            }
            insert_sql = f""" insert into crawler_video(video_id,
                            out_user_id,
                            platform,
                            strategy,
                            out_video_id,
                            video_title,
                            cover_url,
                            video_url,
                            duration,
                            publish_time,
                            play_cnt,
                            crawler_rule,
                            width,
                            height)
                            values({our_video_id},
                            "{video_dict['profile_id']}",
                            "{cls.platform}",
                            "播放量榜爬虫策略",
                            "{video_dict['video_id']}",
                            "{video_dict['video_title']}",
                            "{video_dict['cover_url']}",
                            "{video_dict['video_url']}",
                            {int(video_dict['duration'])},
                            "{video_dict['publish_time_str']}",
                            {int(video_dict['play_cnt'])},
                            '{json.dumps(rule_dict)}',
                            {int(video_dict['video_width'])},
                            {int(video_dict['video_height'])}) """
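            # Values are interpolated into the SQL string directly; this relies
            # on the title scrubbing above having stripped quote characters.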
            Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
            MysqlHelper.update_values(log_type, crawler, insert_sql, env)
            Common.logger(log_type, crawler).info('Video info inserted into the database!')
            # Write the video to Feishu
            Feishu.insert_columns(log_type, crawler, "c85k1C", "ROWS", 1, 2)
            # Write the data into the first row of the video-ID sheet
            upload_time = int(time.time())
            values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                       "播放量榜爬虫策略",
                       str(video_dict['video_id']),
                       str(video_dict['video_title']),
                       our_video_link,
                       video_dict['play_cnt'],
                       video_dict['comment_cnt'],
                       video_dict['like_cnt'],
                       video_dict['share_cnt'],
                       video_dict['duration'],
                       f"{video_dict['video_width']}*{video_dict['video_height']}",
                       str(video_dict['publish_time_str']),
                       str(video_dict['user_name']),
                       str(video_dict['profile_id']),
                       str(video_dict['profile_mid']),
                       str(video_dict['avatar_url']),
                       str(video_dict['cover_url']),
                       str(video_dict['video_url'])]]
            time.sleep(1)
            Feishu.update_values(log_type, crawler, "c85k1C", "F2:Z2", values)
            Common.logger(log_type, crawler).info('Video info written to Feishu\n')


if __name__ == '__main__':
    XiaoniangaoPlay.get_videoList("play", "xiaoniangao", "播放量榜爬虫策略", "out", "dev")