# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/10/31
import os
import sys
import time

import requests
import urllib3

sys.path.append(os.getcwd())
from main.common import Common
from main.feishu_lib import Feishu
from main.zhihu_hot_publish import Publish
  13. proxies = {'http': None, 'https': None}
  14. class ZhihuHot:
  15. @classmethod
  16. def download_rule(cls, publish_time, play_cnt, duration):
  17. """
  18. 热榜内容抓取
  19. - 发布时间<30天
  20. - 视频播放量>1w
  21. - 视频时长1分钟以上
  22. - 站内标题=知乎 视频原标题
  23. - 站内封面图=知乎 视频原封面图
  24. """
  25. if int(time.time()) - publish_time <= 3600*24*180:
  26. if int(play_cnt) >= 30000:
  27. if int(duration) >= 60:
  28. return True
  29. else:
  30. return False
  31. else:
  32. return False
  33. else:
  34. return False
  35. @classmethod
  36. def get_hot_feeds(cls, log_type, env):
  37. try:
  38. url = "https://www.zhihu.com/api/v4/zvideo-tabs/tabs/choice/feeds/recommend?include=creation_relationship&limit=12&offset=0&trans="
  39. payload = {}
  40. headers = {
  41. 'authority': 'www.zhihu.com',
  42. 'accept': '*/*',
  43. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  44. 'cache-control': 'no-cache',
  45. 'cookie': '_zap=246102fb-af66-40c3-a5a5-9901921d5a71; d_c0=AHCWVw5U5hWPTqifPR-jYwskMnmcUFEgHzQ=|1669014326; q_c1=40c865e7cbed4099b5d090229d3096f5|1669983925000|1669983925000; _xsrf=05151c3d-2d05-47fe-98bc-7b01dae731ba; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1675166229; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1675650146; arialoadData=false; unlock_ticket=AGBWwkpQuhUmAAAAYAJVTWpr4GPx5OU2pJq5pGyrjlK8kt9ctaE_kQ==; z_c0=2|1:0|10:1675650146|4:z_c0|80:MS4xVFdsTlB3QUFBQUFtQUFBQVlBSlZUVDFJeG1TWWVaZEJSVWdsSWdRalloaWlGaVlqYlFrVmpRPT0=|c0945918804e3c623699052665e50a2452bceb732d25e544df0ca9419e50fa6d; SESSIONID=AgocnyI3witm93R7LqTR2y59rcyJQ9p0QONL8jD8laf; JOID=V1sUA0OVH-cf-lXUM5BktSnhBDki0nqJdJkqnwHvcJFzy2uABvN4DnnxUdc6mEsEpn2HejuruTnxTM_CAaxxrJQ=; osd=UVodBEiTHu4Y8VPVOpdvsyjoAzIk03OOf58rlgbkdpB6zGCGB_p_BX_wWNAxnkoNoXaBezKssj_wRcjJB614q58=; KLBRSID=b5ffb4aa1a842930a6f64d0a8f93e9bf|1675650208|1675650143; tst=v; KLBRSID=b5ffb4aa1a842930a6f64d0a8f93e9bf|1675650577|1675650143',
  46. 'pragma': 'no-cache',
  47. 'referer': 'https://www.zhihu.com/zvideo',
  48. 'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
  49. 'sec-ch-ua-mobile': '?0',
  50. 'sec-ch-ua-platform': '"macOS"',
  51. 'sec-fetch-dest': 'empty',
  52. 'sec-fetch-mode': 'cors',
  53. 'sec-fetch-site': 'same-origin',
  54. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.52',
  55. 'x-ab-param': '',
  56. 'x-ab-pb': 'CmQIABsAPwBHALQAaQFqAXQBOwLMAtcC2AK3A9YEEQVRBYsFjAWeBTAGMQbrBicHdAh2CHkIPwlgCfQJBApJCmUKawq+Cv4KQwtxC4cLjQvXC+AL5QvmCzgMcQyPDKwMwwzJDPgMEjIBAAAAAAAAAAADAAAABAAAAAABAAABAAAABgABAwAAAAAAAAEAAAUCAQAAAgYAAAIAAA==',
  57. 'x-requested-with': 'fetch',
  58. 'x-zse-93': '101_3_3.0',
  59. 'x-zse-96': '2.0_9hhW+XP7k+dDB9BskcIiHxVJqhacSZWPOmtKRnK6sLS+A=09Pdu+xMIJRDD2=6RR',
  60. 'x-zst-81': '3_2.0aR_sn77yn6O92wOB8hPZn79qE72xcXFZ16fyQArZ39Sm7820XM20cL_1kwxYUqwT16P0EiUZbR2x-LOmwhp1tD_I-JOfgGXTzJO1ADRZ0cHsTJXII820Eer0c4nVDJH8zGCBADwMuukRo4Cqm4w0riRO70CB70O83uPmgbgmhufXiqomKbO1FJLYiRnxEL2ZZrxmDucmqhPXnXFMTAoTF6RhRuLPF8O8xUt9ahuYyveKQRVydDLKFUCycHpxf9CMNGFL209megOV2HLKXqSm3JSTv72pr9Y0MTHfagHLovw_2qof_CN_UhomyvO_EUV_HGcGsh3_suVGPGe_JD3qPDwBWhOYAhpys8SYzgu0WgXLuvN1Yq3GHDcYOuws6HwqsUXCDvN_HvxsCqfzKiLMricfPgXVqqc0_DoMSGR9NqeYEvO_1LgmoGxBOqtLThXyyuC_xcN0jqFY19xmkGVm2cwMNbXpTwFm39L_ACo_2CpKc6eLEBXynGOBpwc9QTLL1QOC'
  61. }
  62. urllib3.disable_warnings()
  63. response = requests.get(url=url, headers=headers, data=payload, verify=False, proxies=proxies)
  64. if 'data' not in response.json():
  65. Common.logger(log_type).error('response:{}', response.text)
  66. elif len(response.json()['data']) == 0:
  67. Common.logger(log_type).error('response:{}', response.text)
  68. else:
  69. feeds = response.json()['data']
  70. for i in range(len(feeds)):
  71. # video_title
  72. if 'title' not in feeds[i]:
  73. video_title = 0
  74. else:
  75. video_title = feeds[i]['title']
  76. # video_id
  77. if 'video' not in feeds[i]:
  78. video_id = 0
  79. elif 'video_id' not in feeds[i]['video']:
  80. video_id = 0
  81. else:
  82. video_id = feeds[i]['video']['video_id']
  83. # play_cnt
  84. if 'play_count' not in feeds[i]:
  85. play_cnt = 0
  86. else:
  87. play_cnt = feeds[i]['play_count']
  88. # comment_cnt
  89. if 'comment_count' not in feeds[i]:
  90. comment_cnt = 0
  91. else:
  92. comment_cnt = feeds[i]['comment_count']
  93. # like_cnt
  94. if 'liked_count' not in feeds[i]:
  95. like_cnt = 0
  96. else:
  97. like_cnt = feeds[i]['liked_count']
  98. # share_cnt
  99. if 'share_count' not in feeds[i]:
  100. share_cnt = 0
  101. else:
  102. share_cnt = feeds[i]['share_count']
  103. # voteup_cnt 赞同数
  104. if 'voteup_count' not in feeds[i]:
  105. voteup_cnt = 0
  106. else:
  107. voteup_cnt = feeds[i]['voteup_count']
  108. # publish_time
  109. if 'published_at' not in feeds[i]:
  110. publish_time = 0
  111. else:
  112. publish_time = feeds[i]['published_at']
  113. # duration
  114. if 'video' not in feeds[i]:
  115. duration = 0
  116. elif 'duration' not in feeds[i]['video']:
  117. duration = 0
  118. else:
  119. duration = feeds[i]['video']['duration']
  120. # width / height / video_url
  121. if 'video' not in feeds[i]:
  122. video_width = 0
  123. video_height = 0
  124. video_url = 0
  125. elif 'playlist' in feeds[i]['video'] and 'fhd' in feeds[i]['video']['playlist'] \
  126. and 'width' in feeds[i]['video']['playlist']['fhd'] \
  127. and 'height' in feeds[i]['video']['playlist']['fhd'] \
  128. and 'play_url' in feeds[i]['video']['playlist']['fhd']:
  129. video_width = feeds[i]['video']['playlist']['fhd']['width']
  130. video_height = feeds[i]['video']['playlist']['fhd']['height']
  131. video_url = feeds[i]['video']['playlist']['fhd']['play_url']
  132. elif 'playlist' in feeds[i]['video'] and 'fhd' in feeds[i]['video']['playlist'] \
  133. and 'width' in feeds[i]['video']['playlist']['fhd'] \
  134. and 'height' in feeds[i]['video']['playlist']['fhd'] \
  135. and 'url' in feeds[i]['video']['playlist']['fhd']:
  136. video_width = feeds[i]['video']['playlist']['fhd']['width']
  137. video_height = feeds[i]['video']['playlist']['fhd']['height']
  138. video_url = feeds[i]['video']['playlist']['fhd']['url']
  139. elif 'playlist' in feeds[i]['video'] and 'hd' in feeds[i]['video']['playlist'] \
  140. and 'width' in feeds[i]['video']['playlist']['hd'] \
  141. and 'height' in feeds[i]['video']['playlist']['hd'] \
  142. and 'play_url' in feeds[i]['video']['playlist']['hd']:
  143. video_width = feeds[i]['video']['playlist']['hd']['width']
  144. video_height = feeds[i]['video']['playlist']['hd']['height']
  145. video_url = feeds[i]['video']['playlist']['hd']['play_url']
  146. elif 'playlist' in feeds[i]['video'] and 'hd' in feeds[i]['video']['playlist'] \
  147. and 'width' in feeds[i]['video']['playlist']['hd'] \
  148. and 'height' in feeds[i]['video']['playlist']['hd'] \
  149. and 'url' in feeds[i]['video']['playlist']['hd']:
  150. video_width = feeds[i]['video']['playlist']['hd']['width']
  151. video_height = feeds[i]['video']['playlist']['hd']['height']
  152. video_url = feeds[i]['video']['playlist']['hd']['url']
  153. elif 'playlist' in feeds[i]['video'] and 'ld' in feeds[i]['video']['playlist'] \
  154. and 'width' in feeds[i]['video']['playlist']['ld'] \
  155. and 'height' in feeds[i]['video']['playlist']['ld'] \
  156. and 'play_url' in feeds[i]['video']['playlist']['ld']:
  157. video_width = feeds[i]['video']['playlist']['ld']['width']
  158. video_height = feeds[i]['video']['playlist']['ld']['height']
  159. video_url = feeds[i]['video']['playlist']['ld']['play_url']
  160. elif 'playlist' in feeds[i]['video'] and 'ld' in feeds[i]['video']['playlist'] \
  161. and 'width' in feeds[i]['video']['playlist']['ld'] \
  162. and 'height' in feeds[i]['video']['playlist']['ld'] \
  163. and 'url' in feeds[i]['video']['playlist']['ld']:
  164. video_width = feeds[i]['video']['playlist']['ld']['width']
  165. video_height = feeds[i]['video']['playlist']['ld']['height']
  166. video_url = feeds[i]['video']['playlist']['ld']['url']
  167. elif 'playlist' in feeds[i]['video'] and 'sd' in feeds[i]['video']['playlist'] \
  168. and 'width' in feeds[i]['video']['playlist']['sd'] \
  169. and 'height' in feeds[i]['video']['playlist']['sd'] \
  170. and 'play_url' in feeds[i]['video']['playlist']['sd']:
  171. video_width = feeds[i]['video']['playlist']['sd']['width']
  172. video_height = feeds[i]['video']['playlist']['sd']['height']
  173. video_url = feeds[i]['video']['playlist']['sd']['play_url']
  174. elif 'playlist' in feeds[i]['video'] and 'sd' in feeds[i]['video']['playlist'] \
  175. and 'width' in feeds[i]['video']['playlist']['sd'] \
  176. and 'height' in feeds[i]['video']['playlist']['sd'] \
  177. and 'url' in feeds[i]['video']['playlist']['sd']:
  178. video_width = feeds[i]['video']['playlist']['sd']['width']
  179. video_height = feeds[i]['video']['playlist']['sd']['height']
  180. video_url = feeds[i]['video']['playlist']['sd']['url']
  181. elif 'playlist_v2' in feeds[i]['video'] and 'fhd' in feeds[i]['video']['playlist_v2'] \
  182. and 'width' in feeds[i]['video']['playlist_v2']['fhd'] \
  183. and 'height' in feeds[i]['video']['playlist_v2']['fhd'] \
  184. and 'play_url' in feeds[i]['video']['playlist_v2']['fhd']:
  185. video_width = feeds[i]['video']['playlist_v2']['fhd']['width']
  186. video_height = feeds[i]['video']['playlist_v2']['fhd']['height']
  187. video_url = feeds[i]['video']['playlist_v2']['fhd']['play_url']
  188. elif 'playlist_v2' in feeds[i]['video'] and 'fhd' in feeds[i]['video']['playlist_v2'] \
  189. and 'width' in feeds[i]['video']['playlist_v2']['fhd'] \
  190. and 'height' in feeds[i]['video']['playlist_v2']['fhd'] \
  191. and 'url' in feeds[i]['video']['playlist_v2']['fhd']:
  192. video_width = feeds[i]['video']['playlist_v2']['fhd']['width']
  193. video_height = feeds[i]['video']['playlist_v2']['fhd']['height']
  194. video_url = feeds[i]['video']['playlist_v2']['fhd']['url']
  195. elif 'playlist_v2' in feeds[i]['video'] and 'hd' in feeds[i]['video']['playlist_v2'] \
  196. and 'width' in feeds[i]['video']['playlist_v2']['hd'] \
  197. and 'height' in feeds[i]['video']['playlist_v2']['hd'] \
  198. and 'play_url' in feeds[i]['video']['playlist_v2']['hd']:
  199. video_width = feeds[i]['video']['playlist_v2']['hd']['width']
  200. video_height = feeds[i]['video']['playlist_v2']['hd']['height']
  201. video_url = feeds[i]['video']['playlist_v2']['hd']['play_url']
  202. elif 'playlist_v2' in feeds[i]['video'] and 'hd' in feeds[i]['video']['playlist_v2'] \
  203. and 'width' in feeds[i]['video']['playlist_v2']['hd'] \
  204. and 'height' in feeds[i]['video']['playlist_v2']['hd'] \
  205. and 'url' in feeds[i]['video']['playlist_v2']['hd']:
  206. video_width = feeds[i]['video']['playlist_v2']['hd']['width']
  207. video_height = feeds[i]['video']['playlist_v2']['hd']['height']
  208. video_url = feeds[i]['video']['playlist_v2']['hd']['url']
  209. elif 'playlist_v2' in feeds[i]['video'] and 'ld' in feeds[i]['video']['playlist_v2'] \
  210. and 'width' in feeds[i]['video']['playlist_v2']['ld'] \
  211. and 'height' in feeds[i]['video']['playlist_v2']['ld'] \
  212. and 'play_url' in feeds[i]['video']['playlist_v2']['ld']:
  213. video_width = feeds[i]['video']['playlist_v2']['ld']['width']
  214. video_height = feeds[i]['video']['playlist_v2']['ld']['height']
  215. video_url = feeds[i]['video']['playlist_v2']['ld']['play_url']
  216. elif 'playlist_v2' in feeds[i]['video'] and 'ld' in feeds[i]['video']['playlist_v2'] \
  217. and 'width' in feeds[i]['video']['playlist_v2']['ld'] \
  218. and 'height' in feeds[i]['video']['playlist_v2']['ld'] \
  219. and 'url' in feeds[i]['video']['playlist_v2']['ld']:
  220. video_width = feeds[i]['video']['playlist_v2']['ld']['width']
  221. video_height = feeds[i]['video']['playlist_v2']['ld']['height']
  222. video_url = feeds[i]['video']['playlist_v2']['ld']['url']
  223. elif 'playlist_v2' in feeds[i]['video'] and 'sd' in feeds[i]['video']['playlist_v2'] \
  224. and 'width' in feeds[i]['video']['playlist_v2']['sd'] \
  225. and 'height' in feeds[i]['video']['playlist_v2']['sd'] \
  226. and 'play_url' in feeds[i]['video']['playlist_v2']['sd']:
  227. video_width = feeds[i]['video']['playlist_v2']['sd']['width']
  228. video_height = feeds[i]['video']['playlist_v2']['sd']['height']
  229. video_url = feeds[i]['video']['playlist_v2']['sd']['play_url']
  230. elif 'playlist_v2' in feeds[i]['video'] and 'sd' in feeds[i]['video']['playlist_v2'] \
  231. and 'width' in feeds[i]['video']['playlist_v2']['sd'] \
  232. and 'height' in feeds[i]['video']['playlist_v2']['sd'] \
  233. and 'url' in feeds[i]['video']['playlist_v2']['sd']:
  234. video_width = feeds[i]['video']['playlist_v2']['sd']['width']
  235. video_height = feeds[i]['video']['playlist_v2']['sd']['height']
  236. video_url = feeds[i]['video']['playlist_v2']['sd']['url']
  237. else:
  238. video_width = 0
  239. video_height = 0
  240. video_url = 0
  241. # cover_url
  242. if 'video' not in feeds[i]:
  243. cover_url = 0
  244. elif 'thumbnail' not in feeds[i]['video']:
  245. cover_url = 0
  246. else:
  247. cover_url = feeds[i]['video']['thumbnail']
  248. # user_name / uid / user_type / url_token / avatar_url
  249. if 'author' not in feeds[i]:
  250. user_name = 0
  251. uid = 0
  252. user_type = 0
  253. url_token = 0
  254. avatar_url = 0
  255. elif 'author' in feeds[i] \
  256. and 'name' in feeds[i]['author']\
  257. and 'uid' in feeds[i]['author']\
  258. and 'user_type' in feeds[i]['author']\
  259. and 'url_token' in feeds[i]['author']\
  260. and 'avatar_url_template' in feeds[i]['author']:
  261. user_name = feeds[i]['author']['name']
  262. uid = feeds[i]['author']['uid']
  263. user_type = feeds[i]['author']['user_type']
  264. url_token = feeds[i]['author']['url_token']
  265. avatar_url = feeds[i]['author']['avatar_url_template']
  266. elif 'author' in feeds[i] \
  267. and 'name' in feeds[i]['author']\
  268. and 'uid' in feeds[i]['author']\
  269. and 'user_type' in feeds[i]['author']\
  270. and 'url_token' in feeds[i]['author']\
  271. and 'avatar_url' in feeds[i]['author']:
  272. user_name = feeds[i]['author']['name']
  273. uid = feeds[i]['author']['uid']
  274. user_type = feeds[i]['author']['user_type']
  275. url_token = feeds[i]['author']['url_token']
  276. avatar_url = feeds[i]['author']['avatar_url']
  277. else:
  278. user_name = 0
  279. uid = 0
  280. user_type = 0
  281. url_token = 0
  282. avatar_url = 0
  283. Common.logger(log_type).info('video_title:{}', video_title)
  284. Common.logger(log_type).info('play_cnt:{}', play_cnt)
  285. Common.logger(log_type).info('duration:{}', int(duration))
  286. Common.logger(log_type).info(
  287. 'publish_time:{}', time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time)))
  288. if video_title == 0 or video_id == 0 or avatar_url == 0 or video_url == 0:
  289. Common.logger(log_type).info('无效视频\n')
  290. elif cls.download_rule(publish_time, play_cnt, duration) is False:
  291. Common.logger(log_type).info('不满足下载规则\n')
  292. elif str(video_id) in [x for y in Feishu.get_values_batch(log_type, 'zhihu', '8871e3') for x in y]:
  293. Common.logger(log_type).info('视频已下载\n')
  294. elif str(video_id) in [x for y in Feishu.get_values_batch(log_type, 'zhihu', '4MGuux') for x in y]:
  295. Common.logger(log_type).info('视频已下载\n')
  296. else:
  297. Common.download_method(log_type, 'cover', video_title, cover_url)
  298. Common.download_method(log_type, 'video', video_title, video_url)
  299. # 保存视频信息至 "./videos/{download_video_title}/info.txt"
  300. with open("./videos/" + video_title + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
  301. f_a.write(str(video_id) + "\n" +
  302. str(video_title) + "\n" +
  303. str(int(duration)) + "\n" +
  304. str(play_cnt) + "\n" +
  305. str(comment_cnt) + "\n" +
  306. str(like_cnt) + "\n" +
  307. str(share_cnt) + "\n" +
  308. str(video_width)+'*'+str(video_height) + "\n" +
  309. str(publish_time) + "\n" +
  310. str(user_name) + "\n" +
  311. str(avatar_url) + "\n" +
  312. str(video_url) + "\n" +
  313. str(cover_url) + "\n" +
  314. "zhihu" + str(int(time.time())))
  315. Common.logger(log_type).info("==========视频信息已保存至info.txt==========")
  316. our_video_id = Publish.upload_and_publish(log_type, env, 'hot')
  317. if env == 'dev':
  318. our_video_link = "https://testadmin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
  319. else:
  320. our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
  321. Common.logger(log_type).info("视频上传完成:{}\n", video_title)
  322. Feishu.insert_columns(log_type, 'zhihu', '8871e3', 'ROWS', 1, 2)
  323. time.sleep(1)
  324. upload_time = int(time.time())
  325. values = [[
  326. time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
  327. "热榜",
  328. video_title,
  329. str(video_id),
  330. our_video_link,
  331. play_cnt,
  332. comment_cnt,
  333. like_cnt,
  334. share_cnt,
  335. voteup_cnt,
  336. int(duration),
  337. str(video_width)+'*'+str(video_height),
  338. time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(publish_time))),
  339. user_name,
  340. uid,
  341. 'https://www.zhihu.com/'+user_type+'/'+url_token,
  342. user_type,
  343. url_token,
  344. avatar_url,
  345. cover_url,
  346. video_url]]
  347. Feishu.update_values(log_type, 'zhihu', "8871e3", "F2:Z2", values)
  348. Common.logger(log_type).info("视频已保存至云文档:{}\n", video_title)
  349. except Exception as e:
  350. Common.logger(log_type).error('get_hot_feeds异常:{}\n', e)
  351. if __name__ == '__main__':
  352. ZhihuHot.get_hot_feeds('hot', 'dev')