# kanyikan_moment.py (22 KB)
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/6/21
  4. import os
  5. import random
  6. import sys
  7. import time
  8. import requests
  9. import urllib3
  # Put the current working directory on the import path (presumably so the
  # project-local helper packages resolve when the script is launched from the
  # repository root -- TODO confirm).
  sys.path.append(os.getcwd())
  # Proxy mapping handed to ``requests.get`` in ``Moment.get_recommend``;
  # both schemes are explicitly mapped to ``None``.
  proxies = {"http": None, "https": None}
  class Moment:
      """Collect and publish videos recommended from WeChat Moments seed videos.

      ``get_recommend`` queries the recommendation endpoint for a random seed
      video and appends acceptable candidates to the Feishu feeds sheet
      ("tGqZMX").  ``download_publish`` takes one row of that sheet, downloads,
      uploads and archives it; ``run_download_publish`` repeats that until the
      sheet has no data rows left.

      ``Feishu``, ``Common`` and ``Publish`` are project helpers that must be
      importable in the enclosing module; their behaviour is not visible here.
      """

      # Seed video ID used when the seed-ID sheet is empty or cannot be read.
      DEFAULT_VIDEO_ID = "t3256lo1cmk"
      # Seconds to wait for the recommendation API before giving up.
      REQUEST_TIMEOUT = 30
      # Substitutions applied, in order, to a stripped raw title.
      TITLE_REPLACEMENTS = (
          ("\n", ""), ("/", ""), ("\\", ""), ("\r", ""),
          (":", ""), ("*", ""), ("?", ""), ('"', ""),
          ("<", ""), (">", ""), ("|", ""), (" ", ""),
          ("&NBSP", ""), (".", "。"), ("小年糕", ""), ("#", ""),
          ("Merge", ""),
      )

      @staticmethod
      def _non_empty_cells(rows):
          """Flatten a sheet (iterable of rows) into a list of its non-None cells."""
          return [cell for row in rows for cell in row if cell is not None]

      @staticmethod
      def _sheet_cells(sheet_id):
          """Return every cell (including empty ones) of a "kanyikan" sheet as a flat list."""
          rows = Feishu.get_values_batch("moment", "kanyikan", sheet_id)
          return [cell for row in rows for cell in row]

      @staticmethod
      def _drop_feed_row(sheet_row):
          """Delete one row (1-based sheet row number) from the feeds sheet."""
          Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", sheet_row, sheet_row)

      # Filter-word list
      @classmethod
      def sensitive_words(cls):
          """Return every non-empty cell of the filter-word sheet ("rofdM5")."""
          rows = Feishu.get_values_batch("moment", "kanyikan", "rofdM5")
          return cls._non_empty_cells(rows)

      # Moments seed video IDs
      @classmethod
      def moment_videoids(cls):
          """Return the seed video IDs read from sheet "iK58HX".

          :return: a non-empty list of IDs.  When the sheet cannot be read or
              holds no IDs, a one-element list with ``DEFAULT_VIDEO_ID`` is
              returned so that ``random.choice`` on the result always yields a
              complete ID (a bare string would yield a single character).
          """
          try:
              rows = Feishu.get_values_batch("moment", "kanyikan", "iK58HX")
              videoid_list = cls._non_empty_cells(rows)
          except Exception as e:
              Common.logger("moment").error("获取朋友圈视频ID异常:{}", e)
              videoid_list = []
          return videoid_list or [cls.DEFAULT_VIDEO_ID]

      # Basic crawl rule
      @staticmethod
      def download_rule(d_duration, d_width, d_height, d_play_cnt, d_like_cnt, d_share_cnt):
          """
          Basic crawl rule.
          :param d_duration: duration (must be at least 40 after truncation to int)
          :param d_width: width
          :param d_height: height (width or height must be >= 0)
          :param d_play_cnt: play count (must be at least 50000)
          :param d_like_cnt: like count (must be >= 0)
          :param d_share_cnt: share count (must be >= 0)
          :return: True when every rule is satisfied, otherwise False
          """
          # Checks short-circuit left to right, so later arguments are only
          # converted when all earlier rules have passed.
          return (
              int(float(d_duration)) >= 40
              and (int(d_width) >= 0 or int(d_height) >= 0)
              and int(d_play_cnt) >= 50000
              and int(d_like_cnt) >= 0
              and int(d_share_cnt) >= 0
          )

      @classmethod
      def _clean_title(cls, raw_title):
          """Strip a raw title and apply ``TITLE_REPLACEMENTS`` in order."""
          title = raw_title.strip()
          for old, new in cls.TITLE_REPLACEMENTS:
              title = title.replace(old, new)
          return title

      @classmethod
      def _parse_feed(cls, feed):
          """Extract the fields of one recommended-video entry.

          :param feed: one element of the API's ``rec_video_list``
          :return: dict of fields; any field missing from ``feed`` is ``0``.
              Width and height are looked up independently, so an entry that
              carries only one of them no longer raises ``KeyError``.
          """
          user_info = feed.get("user_info") or {}
          play_items = (feed.get("play_info") or {}).get("items") or []
          if "nickname" in user_info:
              user_name = user_info["nickname"].strip().replace("\n", "")
          else:
              user_name = 0
          return {
              "video_id": feed.get("vid", 0),
              "video_title": cls._clean_title(feed["title"]) if "title" in feed else 0,
              "video_play_cnt": feed.get("played_cnt", 0),
              "video_comment_cnt": feed.get("comment_cnt", 0),
              "video_liked_cnt": feed.get("liked_cnt", 0),
              "video_share_cnt": feed.get("shared_cnt", 0),
              "video_duration": feed.get("duration", 0),
              "video_width": feed.get("width", 0),
              "video_height": feed.get("height", 0),
              "video_send_time": feed.get("upload_time", 0),
              "user_name": user_name,
              "user_id": user_info.get("openid", 0),
              "head_url": user_info.get("headimg_url", 0),
              "cover_url": feed.get("cover_url", 0),
              # The last play item is used; an empty list marks the video invalid.
              "video_url": play_items[-1].get("play_url", 0) if play_items else 0,
          }

      @classmethod
      def _collect_feed(cls, video):
          """Log one parsed video, filter it, and append it to the feeds sheet.

          :param video: dict produced by ``_parse_feed``
          """
          video_id = video["video_id"]
          video_title = video["video_title"]
          video_send_time = video["video_send_time"]

          Common.logger("moment").info("video_id:{}", video_id)
          Common.logger("moment").info("video_title:{}", video_title)
          Common.logger("moment").info("user_name:{}", video["user_name"])
          Common.logger("moment").info("video_play_cnt:{}", video["video_play_cnt"])
          Common.logger("moment").info("video_liked_cnt:{}", video["video_liked_cnt"])
          Common.logger("moment").info("video_share_cnt:{}", video["video_share_cnt"])
          Common.logger("moment").info("video_duration:{}", video["video_duration"])
          Common.logger("moment").info(
              "video_width * video_height:{}*{}", video["video_width"], video["video_height"])
          Common.logger("moment").info("video_url:{}", video["video_url"])

          # Drop entries that lack any mandatory field.
          required = ("video_id", "video_title", "video_duration", "video_send_time",
                      "user_id", "head_url", "cover_url", "video_url")
          if any(video[key] == 0 for key in required):
              Common.logger("moment").warning("无效视频")
              return
          # Basic crawl rule
          if not cls.download_rule(
                  d_duration=video["video_duration"], d_width=video["video_width"],
                  d_height=video["video_height"], d_play_cnt=video["video_play_cnt"],
                  d_like_cnt=video["video_liked_cnt"], d_share_cnt=video["video_share_cnt"]):
              Common.logger("moment").info("不满足基础规则:{}", video_title)
              return
          # 1659283200 is the cut-off the log message names as 2022-08-01.
          if int(video_send_time) < 1659283200:
              Common.logger("moment").info('发布时间{}<2022-08-01', video_send_time)
              return
          # Filter words; empty entries are ignored.
          if any(word and str(word) in video_title for word in cls.sensitive_words()):
              Common.logger("moment").info("视频已中过滤词:{}".format(video_title))
              return
          # De-duplicate against the downloaded-videos sheet.
          if video_id in cls._sheet_cells("20ce0c"):
              Common.logger("moment").info("该视频已下载:{}", video_title)
              return
          # De-duplicate against the feeds sheet itself.
          if video_id in cls._sheet_cells("tGqZMX"):
              Common.logger("moment").info("该视频已在moment_feeds中:{}", video_title)
              return

          Common.logger("moment").info("该视频未下载,添加至moment_feeds中:{}", video_title)
          # Insert a fresh row at the top of the feeds sheet, then fill it.
          Feishu.insert_columns("moment", "kanyikan", "tGqZMX", "ROWS", 1, 2)
          get_feeds_time = int(time.time())
          values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time)),
                     "朋友圈",
                     video_id,
                     video_title,
                     video["video_play_cnt"],
                     video["video_comment_cnt"],
                     video["video_liked_cnt"],
                     video["video_share_cnt"],
                     video["video_duration"],
                     str(video["video_width"]) + "*" + str(video["video_height"]),
                     # int() matches the cut-off check above, which already
                     # accepts numeric strings.
                     time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(video_send_time))),
                     video["user_name"],
                     video["user_id"],
                     video["head_url"],
                     video["cover_url"],
                     video["video_url"]]]
          time.sleep(1)
          Feishu.update_values("moment", "kanyikan", "tGqZMX", "A2:P2", values)

      # Fetch the recommended-video list
      @classmethod
      def get_recommend(cls):
          """Fetch videos recommended for a random seed video and collect new ones.

          Any exception raised while requesting or processing the list is logged
          and swallowed, so the caller never sees it.
          """
          url = "https://search.weixin.qq.com/cgi-bin/recwxa/snsgetvideoinfo?"
          headers = {
              "content-type": "application/json",
              "Accept-Encoding": "gzip,compress,br,deflate",
              "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)"
                            " AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
                            " MicroMessenger/8.0.20(0x18001442) NetType/WIFI Language/zh_CN",
              "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/236/page-frame.html"
          }
          time.sleep(1)
          videoid = random.choice(cls.moment_videoids())
          params = {
              "vid": videoid,
              "openid": "1924336296754305",
              "model": "iPhone 11<iPhone12,1>14.7.1",
              "sharesearchid": "8406805193800900989",
              "shareOpenid": "oh_m45YffSEGxvDH--6s6g9ZkPxg",
          }
          try:
              # NOTE(review): TLS verification is disabled (verify=False) and the
              # resulting warnings are silenced -- confirm this is intentional.
              urllib3.disable_warnings()
              # A timeout keeps a stalled connection from blocking the crawler forever.
              response = requests.get(url=url, headers=headers, params=params, proxies=proxies,
                                      verify=False, timeout=cls.REQUEST_TIMEOUT)
              data = response.json()["data"]
              if "rec_video_list" not in data:
                  Common.logger("moment").warning("该视频无推荐视频列表:{}", videoid)
                  return
              for feed in data["rec_video_list"]:
                  cls._collect_feed(cls._parse_feed(feed))
          except Exception as e:
              Common.logger("moment").error("获取视频列表异常:{}", e)

      # Download / upload a video
      @classmethod
      def download_publish(cls, env):
          """Process the first data row of the feeds sheet, then remove it.

          Exactly one row is handled per call because deleting a row shifts the
          ones below it; ``run_download_publish`` calls this repeatedly.  Index 0
          of the sheet is skipped (presumably the header row -- TODO confirm).

          :param env: environment name forwarded to ``Publish.upload_and_publish``
          """
          try:
              moment_feeds = Feishu.get_values_batch("moment", "kanyikan", "tGqZMX")
              # Nothing to do when the sheet holds no data rows.
              if len(moment_feeds) < 2:
                  return
              row_index = 1
              sheet_row = row_index + 1  # 1-based row number used by the sheet API
              time.sleep(1)
              # Columns C..P; column A (crawl time) and B (source) are not needed.
              (download_video_id,
               download_video_title,
               download_video_play_cnt,
               download_video_comment_cnt,
               download_video_like_cnt,
               download_video_share_cnt,
               download_video_duration,
               download_video_resolution,
               download_video_send_time,
               download_user_name,
               download_user_id,
               download_head_url,
               download_cover_url,
               download_video_url) = moment_feeds[row_index][2:16]
              Common.logger("moment").info("正在判断第{}行,视频:{}", row_index, download_video_title)

              # Drop blank rows and rows without a title before parsing anything.
              if download_video_id in (None, "") or download_video_title in (None, ""):
                  Common.logger("moment").warning("标题为空或空行,删除")
                  cls._drop_feed_row(sheet_row)
                  return

              # Publish time as a Unix timestamp in seconds (local time).
              v_send_time = int(time.mktime(time.strptime(download_video_send_time, "%Y/%m/%d %H:%M:%S")))
              # 1622476800 is the cut-off the log message names as June 2021.
              if v_send_time < 1622476800:
                  Common.logger("moment").info(
                      "发布时间小于2021年6月:{},{}", download_video_title, download_video_send_time)
                  cls._drop_feed_row(sheet_row)
                  return
              # De-duplicate against both downloaded-video sheets.
              if download_video_id in cls._sheet_cells("20ce0c"):
                  Common.logger("moment").info("视频已下载:{}", download_video_title)
                  cls._drop_feed_row(sheet_row)
                  return
              if download_video_id in cls._sheet_cells("ho98Ov"):
                  Common.logger("moment").info("视频已下载:{}", download_video_title)
                  cls._drop_feed_row(sheet_row)
                  return

              Common.logger("moment").info("开始下载视频:{}", download_video_title)
              # Download the cover, then the video.
              Common.download_method(log_type="moment", text="cover",
                                     d_name=str(download_video_title), d_url=str(download_cover_url))
              Common.download_method(log_type="moment", text="video",
                                     d_name=str(download_video_title), d_url=str(download_video_url))
              # Append the video metadata to "./videos/{title}/info.txt", one value per line.
              info_lines = [
                  download_video_id,
                  download_video_title,
                  download_video_duration,
                  download_video_play_cnt,
                  download_video_comment_cnt,
                  download_video_like_cnt,
                  download_video_share_cnt,
                  download_video_resolution,
                  v_send_time,
                  download_user_name,
                  download_head_url,
                  download_video_url,
                  download_cover_url,
                  "KANYIKAN_MOMENT",
              ]
              # str() guards against a non-string title cell.
              with open("./videos/" + str(download_video_title) + "/" + "info.txt",
                        "a", encoding="UTF-8") as f_a:
                  f_a.write("\n".join(str(item) for item in info_lines))
              Common.logger("moment").info("==========视频信息已保存至info.txt==========")

              # Upload the video.
              Common.logger("moment").info("开始上传视频:{}".format(download_video_title))
              our_video_id = Publish.upload_and_publish(log_type="moment",
                                                        crawler="kanyikan",
                                                        strategy="朋友圈抓取策略",
                                                        our_uid="moment",
                                                        env=env,
                                                        oss_endpoint="out")
              our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
              Common.logger("moment").info("视频上传完成:{}", download_video_title)

              # Record the video in the downloaded-videos sheet ("20ce0c"), new row on top.
              Common.logger("moment").info("保存视频ID至云文档:{}", download_video_title)
              Feishu.insert_columns("moment", "kanyikan", "20ce0c", "ROWS", 1, 2)
              upload_time = int(time.time())
              upload_time_text = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time))
              # NOTE(review): 17 values are written to the 18-column range F2:W2 -- confirm.
              values = [[upload_time_text,
                         "朋友圈",
                         str(download_video_id),
                         str(download_video_title),
                         our_video_link,
                         download_video_play_cnt,
                         download_video_comment_cnt,
                         download_video_like_cnt,
                         download_video_share_cnt,
                         download_video_duration,
                         str(download_video_resolution),
                         str(download_video_send_time),
                         str(download_user_name),
                         str(download_user_id),
                         str(download_head_url),
                         str(download_cover_url),
                         str(download_video_url)]]
              time.sleep(1)
              Feishu.update_values("moment", "kanyikan", "20ce0c", "F2:W2", values)

              # Record the video in the monitor sheet, new row on top.
              Common.logger("moment").info("添加视频到监控表:{}", download_video_title)
              time.sleep(1)
              Feishu.insert_columns("moment", "monitor", "6fed97", "ROWS", 1, 2)
              values = [[upload_time_text,
                         str(download_video_id),
                         download_video_title,
                         our_video_link,
                         download_video_duration,
                         str(download_video_send_time),
                         download_video_play_cnt]]
              time.sleep(1)
              Feishu.update_values("moment", "monitor", "6fed97", "F2:L2", values)

              # The row is done; remove it from the feeds sheet.
              cls._drop_feed_row(sheet_row)
          except Exception as e:
              Common.logger("moment").error("下载视频异常:{}", e)
              # Remove the offending row so the caller's loop cannot get stuck on it.
              cls._drop_feed_row(2)

      # Run download / upload
      @classmethod
      def run_download_publish(cls, env):
          """Call ``download_publish`` until the feeds sheet has no data rows.

          :param env: environment name forwarded to ``download_publish``
          """
          try:
              # "<= 1" also stops on a completely empty sheet, which "== 1"
              # would loop on forever.
              while len(Feishu.get_values_batch("moment", "kanyikan", "tGqZMX")) > 1:
                  cls.download_publish(env)
          except Exception as e:
              Common.logger("moment").error("执行下载/上传异常:{}", e)
  if __name__ == "__main__":
      # Manual run: drain the pending feeds sheet against the "dev" environment.
      # run_download_publish is a classmethod, so no instance is needed.
      Moment.run_download_publish("dev")