# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/1/16
# File: gongzhonghao.py
import difflib
import json
import random
# import shutil
import time
from itertools import chain

import ffmpeg
import requests
import urllib3
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from main.common import Common
from main.feishu_lib import Feishu
from main.publish import Publish


  19. class GongZongHao:
  20. # 翻页参数
  21. begin = 0
  22. # 获取已下载视频宽高、时长等信息
  23. @classmethod
  24. def get_video_info_from_ffmpeg(cls, log_type, video_path):
  25. probe = ffmpeg.probe(video_path)
  26. video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
  27. if video_stream is None:
  28. Common.logger(log_type).info('No video stream found!')
  29. return
  30. width = int(video_stream['width'])
  31. height = int(video_stream['height'])
  32. duration = float(video_stream['duration'])
  33. return width, height, duration
  34. # 过滤词库
  35. @classmethod
  36. def filter_words(cls, log_type):
  37. try:
  38. filter_word_list = []
  39. filter_sheet = Feishu.get_values_batch(log_type, 'gongzhonghao', 'BwN8mo')
  40. for x in filter_sheet:
  41. for y in x:
  42. if y is None:
  43. pass
  44. else:
  45. filter_word_list.append(y)
  46. return filter_word_list
  47. except Exception as e:
  48. Common.logger(log_type).info(f'filter_words异常:{e}\n')
  49. @classmethod
  50. def title_like(cls, log_type, title):
  51. sheet = Feishu.get_values_batch(log_type, 'gongzhonghao', '47e39d')
  52. for i in range(1, len(sheet)):
  53. video_title = sheet[i][7]
  54. if video_title is None:
  55. pass
  56. elif difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
  57. return True
  58. else:
  59. pass
  60. # 获取 token
  61. @classmethod
  62. def get_token(cls, log_type):
  63. try:
  64. sheet = Feishu.get_values_batch(log_type, "gongzhonghao", "OjyJqs")
  65. token = sheet[0][1]
  66. cookie = sheet[1][1]
  67. token_dict = {'token': token, 'cookie': cookie}
  68. return token_dict
  69. except Exception as e:
  70. Common.logger(log_type).error(f"get_cookie_token异常:{e}\n")
  71. # 获取用户 fakeid
  72. @classmethod
  73. def get_fakeid(cls, log_type, user, index):
  74. try:
  75. url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
  76. headers = {
  77. "accept": "*/*",
  78. "accept-encoding": "gzip, deflate, br",
  79. "accept-language": "zh-CN,zh;q=0.9",
  80. "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  81. "t=media/appmsg_edit_v2&action=edit&isNew=1"
  82. "&type=77&createType=5&token=1011071554&lang=zh_CN",
  83. 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
  84. "sec-ch-ua-mobile": "?0",
  85. "sec-ch-ua-platform": '"Windows"',
  86. "sec-fetch-dest": "empty",
  87. "sec-fetch-mode": "cors",
  88. "sec-fetch-site": "same-origin",
  89. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  90. " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
  91. "x-requested-with": "XMLHttpRequest",
  92. 'cookie': cls.get_token(log_type)['cookie'],
  93. }
  94. params = {
  95. "action": "search_biz",
  96. "begin": "0",
  97. "count": "5",
  98. "query": str(user),
  99. "token": cls.get_token(log_type)['token'],
  100. "lang": "zh_CN",
  101. "f": "json",
  102. "ajax": "1",
  103. }
  104. urllib3.disable_warnings()
  105. r = requests.get(url=url, headers=headers, params=params, verify=False)
  106. if "list" not in r.json() or len(r.json()["list"]) == 0:
  107. Common.logger(log_type).warning(f"get_fakeid:{r.text},随机休眠 3-5 分钟\n")
  108. time.sleep(random.randint(60 * 3, 60 * 5))
  109. else:
  110. fakeid = r.json()["list"][int(index) - 1]["fakeid"]
  111. head_url = r.json()["list"][int(index) - 1]["round_head_img"]
  112. fakeid_dict = {'fakeid': fakeid, 'head_url': head_url}
  113. return fakeid_dict
  114. except Exception as e:
  115. Common.logger(log_type).error(f"get_fakeid异常:{e}\n")
  116. # 获取腾讯视频下载链接
  117. @classmethod
  118. def get_tencent_video_url(cls, log_type, video_id):
  119. try:
  120. url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
  121. response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
  122. response = json.loads(response)
  123. url = response['vl']['vi'][0]['ul']['ui'][0]['url']
  124. fvkey = response['vl']['vi'][0]['fvkey']
  125. video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
  126. return video_url
  127. except Exception as e:
  128. Common.logger(log_type).error(f"get_tencent_video_url异常:{e}\n")
  129. @classmethod
  130. def get_video_url(cls, log_type, article_url):
  131. try:
  132. # 打印请求配置
  133. ca = DesiredCapabilities.CHROME
  134. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  135. # 不打开浏览器运行
  136. chrome_options = webdriver.ChromeOptions()
  137. chrome_options.add_argument("headless")
  138. chrome_options.add_argument(
  139. f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  140. chrome_options.add_argument("--no-sandbox")
  141. # driver初始化
  142. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
  143. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v108/chromedriver'))
  144. driver.implicitly_wait(10)
  145. # Common.logger(log_type).info('打开文章链接')
  146. driver.get(article_url)
  147. time.sleep(5)
  148. if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
  149. video_url = driver.find_element(
  150. By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
  151. elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
  152. iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
  153. 'src')
  154. video_id = iframe.split('vid=')[-1].split('&')[0]
  155. video_url = cls.get_tencent_video_url(log_type, video_id)
  156. else:
  157. video_url = 0
  158. return video_url
  159. except Exception as e:
  160. Common.logger(log_type).info(f'get_video_url异常:{e}\n')
  161. # 获取文章列表
  162. @classmethod
  163. def get_articles(cls, log_type, user, index, env):
  164. fakeid_dict = cls.get_fakeid(log_type, user, index)
  165. while True:
  166. try:
  167. url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  168. headers = {
  169. "accept": "*/*",
  170. "accept-encoding": "gzip, deflate, br",
  171. "accept-language": "zh-CN,zh;q=0.9",
  172. "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
  173. "t=media/appmsg_edit_v2&action=edit&isNew=1"
  174. "&type=77&createType=5&token=" + str(cls.get_token(log_type)['token']) + "&lang=zh_CN",
  175. 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
  176. "sec-ch-ua-mobile": "?0",
  177. "sec-ch-ua-platform": '"Windows"',
  178. "sec-fetch-dest": "empty",
  179. "sec-fetch-mode": "cors",
  180. "sec-fetch-site": "same-origin",
  181. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  182. " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
  183. "x-requested-with": "XMLHttpRequest",
  184. 'cookie': cls.get_token(log_type)['cookie'],
  185. }
  186. params = {
  187. "action": "list_ex",
  188. "begin": str(cls.begin),
  189. "count": "5",
  190. "fakeid": fakeid_dict['fakeid'],
  191. "type": "9",
  192. "query": "",
  193. "token": str(cls.get_token(log_type)['token']),
  194. "lang": "zh_CN",
  195. "f": "json",
  196. "ajax": "1",
  197. }
  198. urllib3.disable_warnings()
  199. r = requests.get(url=url, headers=headers, params=params, verify=False)
  200. cls.begin += 5
  201. if 'app_msg_list' not in r.json():
  202. Common.logger(log_type).warning(f"get_gzh_url:{r.text}\n")
  203. break
  204. elif len(r.json()['app_msg_list']) == 0:
  205. Common.logger(log_type).info('没有更多视频了\n')
  206. else:
  207. app_msg_list = r.json()['app_msg_list']
  208. for article_url in app_msg_list:
  209. # title
  210. if 'title' in article_url:
  211. title = article_url['title'].replace('/', '').replace('\n', '') \
  212. .replace('.', '').replace('“', '').replace('”', '').replace(' ', '')
  213. else:
  214. title = 0
  215. # aid
  216. if 'aid' in article_url:
  217. aid = article_url['aid']
  218. else:
  219. aid = 0
  220. # create_time
  221. if 'create_time' in article_url:
  222. create_time = article_url['create_time']
  223. else:
  224. create_time = 0
  225. head_url = fakeid_dict['head_url']
  226. # cover_url
  227. if 'cover' in article_url:
  228. cover_url = article_url['cover']
  229. else:
  230. cover_url = 0
  231. # article_url
  232. if 'link' in article_url:
  233. article_url = article_url['link']
  234. else:
  235. article_url = 0
  236. video_url = cls.get_video_url(log_type, article_url)
  237. Common.logger(log_type).info(f"title:{title}")
  238. # Common.logger(log_type).info(f"aid:{aid}, type{type(aid)}")
  239. # Common.logger(log_type).info("create_time:{}", create_time)
  240. # Common.logger(log_type).info("head_url:{}", head_url)
  241. # Common.logger(log_type).info("cover_url:{}", cover_url)
  242. Common.logger(log_type).info(f"article_url:{article_url}")
  243. Common.logger(log_type).info(f"video_url:{video_url}")
  244. if int(time.time()) - create_time >= 3600*24*3:
  245. Common.logger(log_type).info(f'发布时间{time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(create_time))} > 3 天\n')
  246. cls.begin = 0
  247. return
  248. else:
  249. video_dict = {
  250. 'video_title': title,
  251. 'aid': aid,
  252. 'create_time': create_time,
  253. 'user_name': user,
  254. 'user_id': fakeid_dict['fakeid'],
  255. 'head_url': head_url,
  256. 'cover_url': cover_url,
  257. 'article_url': article_url,
  258. 'video_url': video_url
  259. }
  260. cls.download_publish(log_type, video_dict, env)
  261. Common.logger(log_type).info('休眠 10 秒\n')
  262. time.sleep(10)
  263. except Exception as e:
  264. Common.logger(log_type).error("get_gzh_url异常:{}\n", e)
  265. # 下载/上传
  266. @classmethod
  267. def download_publish(cls, log_type, video_dict, env):
  268. try:
  269. if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
  270. Common.logger(log_type).info("文章涉嫌违反相关法律法规和政策\n")
  271. # 标题敏感词过滤
  272. elif any(word if word in video_dict['video_title'] else False for word in cls.filter_words(log_type)) is True:
  273. Common.logger(log_type).info("标题已中过滤词\n")
  274. # 已下载判断
  275. elif video_dict['aid'] in [x for y in Feishu.get_values_batch(log_type, 'gongzhonghao', '47e39d') for x in y]:
  276. Common.logger(log_type).info("视频已下载\n")
  277. # 标题相似度
  278. elif cls.title_like(log_type, video_dict['video_title']) is True:
  279. Common.logger(log_type).info('标题相似度>=80%:{}\n', video_dict['video_title'])
  280. else:
  281. # 下载视频
  282. Common.download_method(log_type, "video", video_dict['video_title'], video_dict['video_url'])
  283. # 获取视频时长
  284. video_info = cls.get_video_info_from_ffmpeg(log_type, "./videos/" + video_dict['video_title'] + "/video.mp4")
  285. video_width = str(video_info[0])
  286. video_height = str(video_info[1])
  287. duration = video_info[2]
  288. # # 视频时长<50s,直接删除
  289. # if int(duration) < 50:
  290. # # 删除视频文件夹
  291. # shutil.rmtree("./videos/" + video_dict['video_title'] + "/")
  292. # Common.logger(log_type).info("时长:{}<50秒,删除成功\n")
  293. # return
  294. # else:
  295. # 下载封面
  296. Common.download_method(log_type, 'cover', video_dict['video_title'], video_dict['cover_url'])
  297. # 保存视频信息至 "./videos/{video_title}/info.txt"
  298. with open("./videos/" + video_dict['video_title'] + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
  299. f_a.write(str(video_dict['aid']) + "\n" +
  300. video_dict['video_title'] + "\n" +
  301. str(int(duration)) + "\n" +
  302. '100000' + "\n" +
  303. '100000' + "\n" +
  304. '100000' + "\n" +
  305. '100000' + "\n" +
  306. str(video_width) + '*' + str(video_height) + "\n" +
  307. str(video_dict['create_time']) + "\n" +
  308. video_dict['user_name'] + "\n" +
  309. video_dict['head_url'] + "\n" +
  310. video_dict['video_url'] + "\n" +
  311. video_dict['cover_url'] + "\n" +
  312. "gongzhonghao_xinxin" + str(time.time()) + "\n")
  313. Common.logger(log_type).info("==========视频信息已保存至info.txt==========")
  314. # 上传视频
  315. Common.logger(log_type).info("开始上传视频")
  316. our_video_id = Publish.upload_and_publish(log_type, env, "play")
  317. if env == 'prod':
  318. our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
  319. else:
  320. our_video_link = "https://testadmin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
  321. Common.logger(log_type).info("视频上传完成")
  322. # 保存视频 ID 到云文档
  323. Common.logger(log_type).info("保存视频信息至云文档")
  324. # 视频ID工作表,插入首行
  325. Feishu.insert_columns(log_type, "gongzhonghao", "47e39d", "ROWS", 1, 2)
  326. # 视频ID工作表,首行写入数据
  327. upload_time = int(time.time())
  328. values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
  329. "公众号_信欣",
  330. video_dict['video_title'],
  331. video_dict['aid'],
  332. our_video_link,
  333. int(duration),
  334. str(video_width) + '*' + str(video_height),
  335. time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_dict['create_time'])),
  336. video_dict['user_name'],
  337. video_dict['user_id'],
  338. video_dict['head_url'],
  339. video_dict['cover_url'],
  340. video_dict['article_url'],
  341. video_dict['video_url']]]
  342. time.sleep(1)
  343. Feishu.update_values(log_type, "gongzhonghao", "47e39d", "F2:Z2", values)
  344. Common.logger(log_type).info("视频下载/上传成功\n")
  345. except Exception as e:
  346. Common.logger(log_type).error(f"download_publish异常:{e}\n")
  347. @classmethod
  348. def get_all_videos(cls, log_type, env):
  349. try:
  350. user_sheet = Feishu.get_values_batch(log_type, 'gongzhonghao', 'Bzv72P')
  351. for i in range(1, len(user_sheet)):
  352. user_name = user_sheet[i][0]
  353. index = user_sheet[i][1]
  354. if user_name is None or index is None:
  355. Common.logger(log_type).info(f'第{i+1}行,空行\n')
  356. else:
  357. Common.logger(log_type).info(f'获取 {user_name} 公众号视频\n')
  358. cls.get_articles(log_type, user_name, index, env)
  359. cls.begin = 0
  360. Common.logger(log_type).info('休眠1分钟\n')
  361. time.sleep(60)
  362. except Exception as e:
  363. Common.logger(log_type).info(f'get_all_videos异常:{e}\n')
# Script entry point: intentionally a no-op when the module is run directly.
# NOTE(review): the crawl is presumably started elsewhere by calling
# GongZongHao.get_all_videos(log_type, env) — confirm against the caller.
if __name__ == "__main__":
    pass