gongzhonghao5_author.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/3/28
  4. import datetime
  5. import difflib
  6. import json
  7. import os
  8. import shutil
  9. import sys
  10. import time
  11. from hashlib import md5
  12. import requests
  13. import urllib3
  14. from selenium.webdriver import DesiredCapabilities
  15. from selenium.webdriver.chrome.service import Service
  16. from selenium.webdriver.common.by import By
  17. from selenium import webdriver
  18. sys.path.append(os.getcwd())
  19. from common.common import Common
  20. from common.feishu import Feishu
  21. from common.publish import Publish
  22. from common.scheduling_db import MysqlHelper
  23. from common.public import get_config_from_mysql
class GongzhonghaoAuthor5:
    """Crawler for videos embedded in WeChat official-account ("公众号") articles.

    The methods of this class look up an account on mp.weixin.qq.com, page
    through its article list, resolve each article's video URL, and hand
    qualifying videos to the download/publish pipeline.
    """
    # Pagination offset sent as the "begin" request parameter; get_videoList
    # advances it by 5 for every fetched page.
    begin = 0
    platform = "公众号"
  28. # 基础门槛规则
  29. @staticmethod
  30. def download_rule(log_type, crawler, video_dict, rule_dict):
  31. """
  32. 下载视频的基本规则
  33. :param log_type: 日志
  34. :param crawler: 哪款爬虫
  35. :param video_dict: 视频信息,字典格式
  36. :param rule_dict: 规则信息,字典格式
  37. :return: 满足规则,返回 True;反之,返回 False
  38. """
  39. rule_play_cnt_min = rule_dict.get('play_cnt', {}).get('min', 0)
  40. rule_play_cnt_max = rule_dict.get('play_cnt', {}).get('max', 100000000)
  41. if rule_play_cnt_max == 0:
  42. rule_play_cnt_max = 100000000
  43. rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
  44. rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
  45. if rule_duration_max == 0:
  46. rule_duration_max = 100000000
  47. rule_period_min = rule_dict.get('period', {}).get('min', 0)
  48. # rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
  49. # if rule_period_max == 0:
  50. # rule_period_max = 100000000
  51. rule_fans_cnt_min = rule_dict.get('fans_cnt', {}).get('min', 0)
  52. rule_fans_cnt_max = rule_dict.get('fans_cnt', {}).get('max', 100000000)
  53. if rule_fans_cnt_max == 0:
  54. rule_fans_cnt_max = 100000000
  55. rule_videos_cnt_min = rule_dict.get('videos_cnt', {}).get('min', 0)
  56. rule_videos_cnt_max = rule_dict.get('videos_cnt', {}).get('max', 100000000)
  57. if rule_videos_cnt_max == 0:
  58. rule_videos_cnt_max = 100000000
  59. rule_like_cnt_min = rule_dict.get('like_cnt', {}).get('min', 0)
  60. rule_like_cnt_max = rule_dict.get('like_cnt', {}).get('max', 100000000)
  61. if rule_like_cnt_max == 0:
  62. rule_like_cnt_max = 100000000
  63. rule_width_min = rule_dict.get('width', {}).get('min', 0)
  64. rule_width_max = rule_dict.get('width', {}).get('max', 100000000)
  65. if rule_width_max == 0:
  66. rule_width_max = 100000000
  67. rule_height_min = rule_dict.get('height', {}).get('min', 0)
  68. rule_height_max = rule_dict.get('height', {}).get('max', 100000000)
  69. if rule_height_max == 0:
  70. rule_height_max = 100000000
  71. rule_share_cnt_min = rule_dict.get('share_cnt', {}).get('min', 0)
  72. rule_share_cnt_max = rule_dict.get('share_cnt', {}).get('max', 100000000)
  73. if rule_share_cnt_max == 0:
  74. rule_share_cnt_max = 100000000
  75. rule_comment_cnt_min = rule_dict.get('comment_cnt', {}).get('min', 0)
  76. rule_comment_cnt_max = rule_dict.get('comment_cnt', {}).get('max', 100000000)
  77. if rule_comment_cnt_max == 0:
  78. rule_comment_cnt_max = 100000000
  79. rule_publish_time_min = rule_dict.get('publish_time', {}).get('min', 0)
  80. rule_publish_time_max = rule_dict.get('publish_time', {}).get('max', 100000000)
  81. if rule_publish_time_max == 0:
  82. rule_publish_time_max = 4102415999000 # 2099-12-31 23:59:59
  83. Common.logger(log_type, crawler).info(
  84. f'rule_duration_max:{rule_duration_max} >= duration:{int(float(video_dict["duration"]))} >= rule_duration_min:{int(rule_duration_min)}')
  85. Common.logger(log_type, crawler).info(
  86. f'rule_play_cnt_max:{int(rule_play_cnt_max)} >= play_cnt:{int(video_dict["play_cnt"])} >= rule_play_cnt_min:{int(rule_play_cnt_min)}')
  87. Common.logger(log_type, crawler).info(
  88. f'now:{int(time.time())} - publish_time_stamp:{int(video_dict["publish_time_stamp"])} <= {3600 * 24 * int(rule_period_min)}')
  89. Common.logger(log_type, crawler).info(
  90. f'rule_like_cnt_max:{int(rule_like_cnt_max)} >= like_cnt:{int(video_dict["like_cnt"])} >= rule_like_cnt_min:{int(rule_like_cnt_min)}')
  91. Common.logger(log_type, crawler).info(
  92. f'rule_comment_cnt_max:{int(rule_comment_cnt_max)} >= comment_cnt:{int(video_dict["comment_cnt"])} >= rule_comment_cnt_min:{int(rule_comment_cnt_min)}')
  93. Common.logger(log_type, crawler).info(
  94. f'rule_share_cnt_max:{int(rule_share_cnt_max)} >= share_cnt:{int(video_dict["share_cnt"])} >= rule_share_cnt_min:{int(rule_share_cnt_min)}')
  95. Common.logger(log_type, crawler).info(
  96. f'rule_width_max:{int(rule_width_max)} >= video_width:{int(video_dict["video_width"])} >= rule_width_min:{int(rule_width_min)}')
  97. Common.logger(log_type, crawler).info(
  98. f'rule_height_max:{int(rule_height_max)} >= video_height:{int(video_dict["video_height"])} >= rule_height_min:{int(rule_height_min)}')
  99. Common.logger(log_type, crawler).info(
  100. f'rule_publish_time_max:{int(rule_publish_time_max)} >= publish_time_stamp:{int(video_dict["publish_time_stamp"])} >= rule_publish_time_min:{int(rule_publish_time_min)}')
  101. if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
  102. and int(rule_play_cnt_max) >= int(video_dict['play_cnt']) >= int(rule_play_cnt_min) \
  103. and int(rule_like_cnt_max) >= int(video_dict['like_cnt']) >= int(rule_like_cnt_min) \
  104. and int(rule_comment_cnt_max) >= int(video_dict['comment_cnt']) >= int(rule_comment_cnt_min) \
  105. and int(rule_share_cnt_max) >= int(video_dict['share_cnt']) >= int(rule_share_cnt_min) \
  106. and int(rule_width_max) >= int(video_dict['video_width']) >= int(rule_width_min) \
  107. and int(rule_height_max) >= int(video_dict['video_height']) >= int(rule_height_min) \
  108. and int(rule_publish_time_max) >= int(video_dict['publish_time_stamp']) >= int(rule_publish_time_min) \
  109. and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * int(rule_period_min):
  110. return True
  111. else:
  112. return False
  113. @classmethod
  114. def title_like(cls, log_type, crawler, title, env):
  115. select_sql = f""" select * from crawler_video where platform="公众号" """
  116. video_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
  117. if len(video_list) == 0:
  118. return None
  119. for video_dict in video_list:
  120. video_title = video_dict["video_title"]
  121. if difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
  122. return True
  123. else:
  124. pass
  125. # 获取 token
  126. @classmethod
  127. def get_token(cls, log_type, crawler, env):
  128. select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_5%";"""
  129. configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
  130. if len(configs) == 0:
  131. # Common.logger(log_type, crawler).warning(f"公众号_3未配置token")
  132. Feishu.bot(log_type, crawler, "公众号_5:未配置token")
  133. time.sleep(60)
  134. return None
  135. token_dict = {
  136. "token_id": configs[0]["id"],
  137. "title": configs[0]["title"],
  138. "token": dict(eval(configs[0]["config"]))["token"],
  139. "cookie": dict(eval(configs[0]["config"]))["cookie"],
  140. "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
  141. "operator": configs[0]["operator"]
  142. }
  143. for k, v in token_dict.items():
  144. print(f"{k}:{v}")
  145. return token_dict
  146. # 获取用户 fakeid
  147. @classmethod
  148. def get_fakeid(cls, log_type, crawler, wechat_name, env):
  149. while True:
  150. token_dict = cls.get_token(log_type, crawler, env)
  151. url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
  152. headers = {
  153. "accept": "*/*",
  154. "accept-encoding": "gzip, deflate, br",
  155. "accept-language": "zh-CN,zh;q=0.9",
  156. "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  157. "t=media/appmsg_edit_v2&action=edit&isNew=1"
  158. "&type=77&createType=5&token=1011071554&lang=zh_CN",
  159. 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
  160. "sec-ch-ua-mobile": "?0",
  161. "sec-ch-ua-platform": '"Windows"',
  162. "sec-fetch-dest": "empty",
  163. "sec-fetch-mode": "cors",
  164. "sec-fetch-site": "same-origin",
  165. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  166. " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
  167. "x-requested-with": "XMLHttpRequest",
  168. 'cookie': token_dict['cookie'],
  169. }
  170. params = {
  171. "action": "search_biz",
  172. "begin": "0",
  173. "count": "5",
  174. "query": str(wechat_name),
  175. "token": token_dict['token'],
  176. "lang": "zh_CN",
  177. "f": "json",
  178. "ajax": "1",
  179. }
  180. urllib3.disable_warnings()
  181. r = requests.get(url=url, headers=headers, params=params, verify=False)
  182. r.close()
  183. if r.json()["base_resp"]["err_msg"] == "invalid session":
  184. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  185. Common.logger(log_type, crawler).warning(f"get_fakeid:{r.text}\n")
  186. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} 过期啦\n")
  187. if 20 >= datetime.datetime.now().hour >= 10:
  188. Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
  189. time.sleep(60 * 10)
  190. continue
  191. if r.json()["base_resp"]["err_msg"] == "freq control":
  192. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  193. Common.logger(log_type, crawler).warning(f"get_fakeid:{r.text}\n")
  194. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} 频控啦\n")
  195. if 20 >= datetime.datetime.now().hour >= 10:
  196. Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
  197. time.sleep(60 * 10)
  198. continue
  199. if "list" not in r.json() or len(r.json()["list"]) == 0:
  200. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  201. Common.logger(log_type, crawler).warning(f"get_fakeid:{r.text}\n")
  202. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} 频控啦\n")
  203. if 20 >= datetime.datetime.now().hour >= 10:
  204. Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
  205. time.sleep(60 * 10)
  206. continue
  207. fakeid = r.json()["list"][0]["fakeid"]
  208. head_url = r.json()["list"][0]["round_head_img"]
  209. fakeid_dict = {'fakeid': fakeid, 'head_url': head_url}
  210. return fakeid_dict
  211. # 获取腾讯视频下载链接
  212. @classmethod
  213. def get_tencent_video_url(cls, video_id):
  214. url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
  215. response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
  216. response = json.loads(response)
  217. url = response['vl']['vi'][0]['ul']['ui'][0]['url']
  218. fvkey = response['vl']['vi'][0]['fvkey']
  219. video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
  220. return video_url
  221. @classmethod
  222. def get_video_url(cls, article_url, env):
  223. # 打印请求配置
  224. ca = DesiredCapabilities.CHROME
  225. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  226. # 不打开浏览器运行
  227. chrome_options = webdriver.ChromeOptions()
  228. chrome_options.add_argument("headless")
  229. chrome_options.add_argument(
  230. f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  231. chrome_options.add_argument("--no-sandbox")
  232. # driver初始化
  233. if env == "prod":
  234. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  235. else:
  236. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
  237. '/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
  238. driver.implicitly_wait(10)
  239. # Common.logger(log_type, crawler).info('打开文章链接')
  240. driver.get(article_url)
  241. time.sleep(1)
  242. if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
  243. video_url = driver.find_element(
  244. By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
  245. elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
  246. iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
  247. 'src')
  248. video_id = iframe.split('vid=')[-1].split('&')[0]
  249. video_url = cls.get_tencent_video_url(video_id)
  250. else:
  251. video_url = 0
  252. driver.quit()
  253. return video_url
  254. # 获取文章列表
  255. @classmethod
  256. def get_videoList(cls, log_type, crawler, wechat_name, rule_dict, user_name, uid, oss_endpoint, env):
  257. # try:
  258. while True:
  259. token_dict = cls.get_token(log_type, crawler, env)
  260. fakeid_dict = cls.get_fakeid(log_type=log_type,
  261. crawler=crawler,
  262. wechat_name=wechat_name,
  263. env=env)
  264. url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  265. headers = {
  266. "accept": "*/*",
  267. "accept-encoding": "gzip, deflate, br",
  268. "accept-language": "zh-CN,zh;q=0.9",
  269. "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  270. "t=media/appmsg_edit_v2&action=edit&isNew=1"
  271. "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
  272. 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
  273. "sec-ch-ua-mobile": "?0",
  274. "sec-ch-ua-platform": '"Windows"',
  275. "sec-fetch-dest": "empty",
  276. "sec-fetch-mode": "cors",
  277. "sec-fetch-site": "same-origin",
  278. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  279. " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
  280. "x-requested-with": "XMLHttpRequest",
  281. 'cookie': token_dict['cookie'],
  282. }
  283. params = {
  284. "action": "list_ex",
  285. "begin": str(cls.begin),
  286. "count": "5",
  287. "fakeid": fakeid_dict['fakeid'],
  288. "type": "9",
  289. "query": "",
  290. "token": str(token_dict['token']),
  291. "lang": "zh_CN",
  292. "f": "json",
  293. "ajax": "1",
  294. }
  295. urllib3.disable_warnings()
  296. r = requests.get(url=url, headers=headers, params=params, verify=False)
  297. r.close()
  298. if r.json()["base_resp"]["err_msg"] == "invalid session":
  299. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  300. Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}\n")
  301. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} 过期啦\n")
  302. if 20 >= datetime.datetime.now().hour >= 10:
  303. Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
  304. time.sleep(60 * 10)
  305. continue
  306. if r.json()["base_resp"]["err_msg"] == "freq control":
  307. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  308. Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}\n")
  309. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}, 操作人:{token_dict['operator']}, 更换日期:{token_dict['update_time']} 频控啦\n")
  310. if 20 >= datetime.datetime.now().hour >= 10:
  311. Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
  312. time.sleep(60 * 10)
  313. continue
  314. if 'app_msg_list' not in r.json():
  315. Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}")
  316. Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}\n")
  317. # Common.logger(log_type, crawler).warning(f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} 频控啦\n")
  318. if 20 >= datetime.datetime.now().hour >= 10:
  319. Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
  320. time.sleep(60 * 10)
  321. continue
  322. if len(r.json()['app_msg_list']) == 0:
  323. Common.logger(log_type, crawler).info('没有更多视频了\n')
  324. return
  325. else:
  326. cls.begin += 5
  327. app_msg_list = r.json()['app_msg_list']
  328. for article_url in app_msg_list:
  329. # title
  330. video_title = article_url.get("title", "").replace('/', '').replace('\n', '') \
  331. .replace('.', '').replace('“', '').replace('”', '').replace(' ', '')\
  332. .replace('"', '').replace("'", "")
  333. # aid
  334. aid = article_url.get('aid', '')
  335. # create_time
  336. create_time = article_url.get('create_time', 0)
  337. publish_time_stamp = int(create_time)
  338. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  339. avatar_url = fakeid_dict['head_url']
  340. # cover_url
  341. cover_url = article_url.get('cover', '')
  342. # article_url
  343. article_url = article_url.get('link', '')
  344. video_url = cls.get_video_url(article_url, env)
  345. video_dict = {
  346. 'video_id': aid,
  347. 'video_title': video_title,
  348. 'publish_time_stamp': publish_time_stamp,
  349. 'publish_time_str': publish_time_str,
  350. 'user_name': user_name,
  351. 'play_cnt': 0,
  352. 'comment_cnt': 0,
  353. 'like_cnt': 0,
  354. 'share_cnt': 0,
  355. 'user_id': fakeid_dict['fakeid'],
  356. 'avatar_url': avatar_url,
  357. 'cover_url': cover_url,
  358. 'article_url': article_url,
  359. 'video_url': video_url,
  360. 'session': f'gongzhonghao-author1-{int(time.time())}'
  361. }
  362. for k, v in video_dict.items():
  363. Common.logger(log_type, crawler).info(f"{k}:{v}")
  364. if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('min', 1000)):
  365. Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('min', 1000))}天\n")
  366. cls.begin = 0
  367. return
  368. if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
  369. Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
  370. # 标题敏感词过滤
  371. elif any(str(word) if str(word) in video_dict['video_title'] else False
  372. for word in get_config_from_mysql(log_type=log_type,
  373. source=crawler,
  374. env=env,
  375. text="filter",
  376. action="")) is True:
  377. Common.logger(log_type, crawler).info("标题已中过滤词\n")
  378. # 已下载判断
  379. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
  380. Common.logger(log_type, crawler).info("视频已下载\n")
  381. # 标题相似度
  382. elif cls.title_like(log_type, crawler, video_dict['video_title'], env) is True:
  383. Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
  384. else:
  385. cls.download_publish(log_type=log_type,
  386. crawler=crawler,
  387. video_dict=video_dict,
  388. rule_dict=rule_dict,
  389. uid=uid,
  390. oss_endpoint=oss_endpoint,
  391. env=env)
  392. Common.logger(log_type, crawler).info('休眠 60 秒\n')
  393. time.sleep(60)
  394. @classmethod
  395. def repeat_video(cls, log_type, crawler, video_id, env):
  396. sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
  397. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  398. return len(repeat_video)
  399. # 下载/上传
  400. @classmethod
  401. def download_publish(cls, log_type, crawler, video_dict, rule_dict, uid, oss_endpoint, env):
  402. # 下载视频
  403. Common.download_method(log_type=log_type, crawler=crawler, text="video",
  404. title=video_dict["video_title"], url=video_dict["video_url"])
  405. md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
  406. # 获取视频时长
  407. ffmpeg_dict = Common.ffmpeg(log_type, crawler,
  408. f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
  409. if ffmpeg_dict is None:
  410. # 删除视频文件夹
  411. shutil.rmtree(f"./{crawler}/videos/{md_title}")
  412. Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
  413. return
  414. video_dict["video_width"] = ffmpeg_dict["width"]
  415. video_dict["video_height"] = ffmpeg_dict["height"]
  416. video_dict["duration"] = ffmpeg_dict["duration"]
  417. video_size = ffmpeg_dict["size"]
  418. Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
  419. Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
  420. Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
  421. Common.logger(log_type, crawler).info(f'video_size:{video_size}')
  422. # 视频size=0,直接删除
  423. if int(video_size) == 0 or cls.download_rule(log_type, crawler, video_dict, rule_dict) is False:
  424. # 删除视频文件夹
  425. shutil.rmtree(f"./{crawler}/videos/{md_title}")
  426. Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
  427. return
  428. if cls.download_rule(log_type, crawler, video_dict, rule_dict) is False:
  429. shutil.rmtree(f"./{crawler}/videos/{md_title}")
  430. Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
  431. return
  432. # 下载封面
  433. Common.download_method(log_type=log_type, crawler=crawler, text="cover",
  434. title=video_dict["video_title"], url=video_dict["cover_url"])
  435. # 保存视频信息至 "./videos/{video_title}/info.txt"
  436. Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
  437. # 上传视频
  438. Common.logger(log_type, crawler).info("开始上传视频...")
  439. strategy = "定向榜爬虫策略"
  440. our_video_id = Publish.upload_and_publish(log_type=log_type,
  441. crawler=crawler,
  442. strategy=strategy,
  443. our_uid=uid,
  444. oss_endpoint=oss_endpoint,
  445. env=env)
  446. if env == 'prod':
  447. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
  448. else:
  449. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
  450. Common.logger(log_type, crawler).info("视频上传完成")
  451. if our_video_id is None:
  452. # 删除视频文件夹
  453. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
  454. return
  455. insert_sql = f""" insert into crawler_video(video_id,
  456. out_user_id,
  457. platform,
  458. strategy,
  459. out_video_id,
  460. video_title,
  461. cover_url,
  462. video_url,
  463. duration,
  464. publish_time,
  465. play_cnt,
  466. crawler_rule,
  467. width,
  468. height)
  469. values({our_video_id},
  470. "{video_dict['user_id']}",
  471. "{cls.platform}",
  472. "定向爬虫策略",
  473. "{video_dict['video_id']}",
  474. "{video_dict['video_title']}",
  475. "{video_dict['cover_url']}",
  476. "{video_dict['video_url']}",
  477. {int(video_dict['duration'])},
  478. "{video_dict['publish_time_str']}",
  479. {int(video_dict['play_cnt'])},
  480. '{json.dumps(rule_dict)}',
  481. {int(video_dict['video_width'])},
  482. {int(video_dict['video_height'])}) """
  483. Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
  484. MysqlHelper.update_values(log_type, crawler, insert_sql, env)
  485. Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
  486. # 视频写入飞书
  487. Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
  488. # 视频ID工作表,首行写入数据
  489. upload_time = int(time.time())
  490. values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
  491. "用户主页",
  492. video_dict['video_title'],
  493. video_dict['video_id'],
  494. our_video_link,
  495. int(video_dict['duration']),
  496. f"{video_dict['video_width']}*{video_dict['video_height']}",
  497. video_dict['publish_time_str'],
  498. video_dict['user_name'],
  499. video_dict['user_id'],
  500. video_dict['avatar_url'],
  501. video_dict['cover_url'],
  502. video_dict['article_url'],
  503. video_dict['video_url']]]
  504. time.sleep(0.5)
  505. Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
  506. Common.logger(log_type, crawler).info('视频下载/上传成功\n')
  507. @classmethod
  508. def get_all_videos(cls, log_type, crawler, user_list, rule_dict, oss_endpoint, env):
  509. if len(user_list) == 0:
  510. Common.logger(log_type, crawler).warning(f"抓取用户列表为空\n")
  511. return
  512. for user in user_list:
  513. # try:
  514. user_name = user['nick_name']
  515. wechat_name = user['link']
  516. uid = user['uid']
  517. Common.logger(log_type, crawler).info(f'获取 {user_name} 公众号视频\n')
  518. cls.get_videoList(log_type=log_type,
  519. crawler=crawler,
  520. wechat_name=wechat_name,
  521. rule_dict=rule_dict,
  522. user_name=user_name,
  523. uid=uid,
  524. oss_endpoint=oss_endpoint,
  525. env=env)
  526. cls.begin = 0
  527. Common.logger(log_type, crawler).info('休眠 60 秒\n')
  528. time.sleep(60)
  529. # except Exception as e:
  530. # Common.logger(log_type, crawler).info(f'get_all_videos异常:{e}\n')
if __name__ == "__main__":
    # Manual check: load the token configuration for the dev environment;
    # get_token prints every field of the result to stdout.
    GongzhonghaoAuthor5.get_token("author", "gongzhonghao", "dev")
    # print(get_config_from_mysql("author", "gongzhonghao", "dev", "filter", action=""))
    pass