shipinhao_follow.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/12/14
  4. import difflib
  5. import os
  6. import sys
  7. import time
  8. from appium import webdriver
  9. from appium.webdriver.extensions.android.nativekey import AndroidKey
  10. from selenium.common import NoSuchElementException
  11. from appium.webdriver.webdriver import WebDriver
  12. from selenium.webdriver.common.by import By
  13. sys.path.append(os.getcwd())
  14. from main.common import Common
  15. from main.feishu_lib import Feishu
  16. from shipinhao.shipinhao_publish import Publish
  17. class Follow:
  18. # 过滤词库
  19. @classmethod
  20. def filter_words(cls, log_type):
  21. try:
  22. filter_words_sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'gmeOgJ')
  23. filter_words_list = []
  24. for x in filter_words_sheet:
  25. for y in x:
  26. if y is None:
  27. pass
  28. else:
  29. filter_words_list.append(y)
  30. return filter_words_list
  31. except Exception as e:
  32. Common.logger(log_type).error('filter_words异常:{}\n', e)
  33. @classmethod
  34. def get_users_from_feishu(cls, log_type):
  35. try:
  36. users_sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'yVFqxa')
  37. user_list = []
  38. for i in range(1, len(users_sheet)):
  39. user_name = users_sheet[i][1]
  40. if user_name is not None:
  41. user_list.append(user_name)
  42. return user_list
  43. except Exception as e:
  44. Common.logger(log_type).error(f'get_users_from_feishu异常:{e}\n')
  45. @classmethod
  46. def start_follow_wechat(cls, log_type, user_name, env):
  47. try:
  48. Common.logger(log_type).info('启动微信')
  49. caps = {
  50. "platformName": "Android", # 手机操作系统 Android / iOS
  51. "deviceName": "Android", # 连接的设备名(模拟器或真机),安卓可以随便写
  52. "platforVersion": "11", # 手机对应的系统版本(Android 11)
  53. "appPackage": "com.tencent.mm", # 被测APP的包名,乐活圈 Android
  54. "appActivity": ".ui.LauncherUI", # 启动的Activity名
  55. "autoGrantPermissions": "true", # 让 appium 自动授权 base 权限,
  56. # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
  57. "unicodekeyboard": True, # 使用自带输入法,输入中文时填True
  58. "resetkeyboard": True, # 执行完程序恢复原来输入法
  59. "noReset": True, # 不重置APP
  60. "printPageSourceOnFailure": True, # 找不到元素时,appium log 会完整记录当前页面的 pagesource
  61. "newCommandTimeout": 6000, # 初始等待时间
  62. "automationName": "UiAutomator2", # 使用引擎,默认为 Appium,
  63. # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
  64. "showChromedriverLog": True,
  65. # "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  66. "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
  67. 'enableWebviewDetailsCollection': True,
  68. 'setWebContentsDebuggingEnabled': True,
  69. # 'chromedriverExecutable': '/Users/wangkun/Downloads/chromedriver_v86/chromedriver',
  70. 'chromedriverExecutable': '/Users/lieyunye/Downloads/chromedriver_v86/chromedriver',
  71. }
  72. driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  73. driver.implicitly_wait(10)
  74. cls.search_to_user_homepage(log_type, user_name, driver)
  75. cls.search_user_videos(log_type, driver, env)
  76. Common.logger(log_type).info('休眠 3s')
  77. time.sleep(3)
  78. cls.quit(log_type, driver)
  79. except Exception as e:
  80. Common.logger(log_type).error(f'start_follow_wechat异常:{e}\n')
  81. @classmethod
  82. def quit(cls, log_type, driver: WebDriver):
  83. driver.quit()
  84. Common.logger(log_type).info('退出 APP 成功\n')
  85. @classmethod
  86. def search_element(cls, log_type, driver: WebDriver, element):
  87. try:
  88. windowHandles = driver.window_handles
  89. # 遍历所有的handles,找到当前页面所在的handle:如果pageSource有包含你想要的元素,就是所要找的handle
  90. # 小程序的页面来回切换也需要:遍历所有的handles,切换到元素所在的handle
  91. for handle in windowHandles:
  92. driver.switch_to.window(handle)
  93. time.sleep(3)
  94. if len(driver.find_elements(By.XPATH, element)) != 0:
  95. return driver.find_element(By.XPATH, element)
  96. else:
  97. pass
  98. except Exception as e:
  99. Common.logger(log_type).warning('search_element异常:{}\n', e)
  100. @classmethod
  101. def search_to_user_homepage(cls, log_type, user_name, driver: WebDriver):
  102. Common.logger(log_type).info('点击搜索按钮')
  103. driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click()
  104. Common.logger(log_type).info(f'输入搜索词:{user_name}')
  105. driver.find_element(By.ID, 'com.tencent.mm:id/cd7').send_keys(user_name)
  106. driver.press_keycode(AndroidKey.ENTER)
  107. Common.logger(log_type).info('点击进入搜索结果页')
  108. driver.find_element(By.ID, 'com.tencent.mm:id/m94').click()
  109. Common.logger(log_type).info('切换到webview')
  110. webview = driver.contexts
  111. driver.switch_to.context(webview[1])
  112. time.sleep(3)
  113. Common.logger(log_type).info('点击"视频号"分类')
  114. cls.search_element(log_type, driver, '//div[@class="unit"]/*[2]').click()
  115. time.sleep(3)
  116. Common.logger(log_type).info(f'进入用户主页:{user_name}')
  117. user_element = cls.search_element(log_type, driver, '//div[@class="video-account__container search_item_inner"]')
  118. if user_element is None:
  119. Common.logger(log_type).info(f'未搜索到用户:{user_name}\n')
  120. return
  121. else:
  122. user_element.click()
  123. time.sleep(1)
  124. Common.logger(log_type).info(f'进入 {user_name} 主页成功\n')
  125. @classmethod
  126. def search_user_videos(cls, log_type, driver: WebDriver, env):
  127. Common.logger(log_type).info('切回NATIVE_APP')
  128. driver.switch_to.context('NATIVE_APP')
  129. # 判断置顶视频
  130. top_videos = driver.find_elements(By.ID, 'com.tencent.mm:id/i56')
  131. Common.logger(log_type).info(f'发现 {len(top_videos)} 个置顶视频\n')
  132. if len(top_videos) == 0:
  133. # Common.logger(log_type).info('当前用户没有置顶视频')
  134. pass
  135. else:
  136. for i in range(len(top_videos)):
  137. top_videos[i].click()
  138. cls.get_video_info(log_type, driver, env)
  139. driver.press_keycode(AndroidKey.BACK)
  140. time.sleep(1)
  141. # 判断非置顶视频
  142. not_top_videos = driver.find_elements(By.ID, 'com.tencent.mm:id/e5s')
  143. Common.logger(log_type).info(f'发现 {len(not_top_videos)} 个非置顶视频')
  144. not_top_first_video = not_top_videos[len(top_videos)]
  145. not_top_first_video.click()
  146. while True:
  147. cls.get_video_info(log_type, driver, env)
  148. driver.swipe(10, 1800, 10, 200, 300)
  149. if len(driver.find_elements(By.ID, 'com.tencent.mm:id/g2s')) > 0:
  150. Common.logger(log_type).info('到底啦 ~\n')
  151. return
  152. @classmethod
  153. def get_video_info(cls, log_type, driver: WebDriver, env):
  154. try:
  155. driver.implicitly_wait(10)
  156. # 视频标题
  157. try:
  158. title_id = driver.find_element(By.ID, 'com.tencent.mm:id/ki5')
  159. video_title = title_id.get_attribute('name').split('\n')[0].strip()
  160. except NoSuchElementException:
  161. video_title = ''
  162. # 点击播放器,获取视频时长
  163. # Common.logger(log_type).info('暂停播放')
  164. pause_btn = driver.find_element(By.ID, 'com.tencent.mm:id/eh4')
  165. pause_btn.click()
  166. start_time = driver.find_element(By.ID, 'com.tencent.mm:id/l59').get_attribute('name')
  167. start_time = int(start_time.split(':')[0]) * 60 + int(start_time.split(':')[-1])
  168. try:
  169. end_time = driver.find_element(By.ID, 'com.tencent.mm:id/l7i').get_attribute('name')
  170. except NoSuchElementException:
  171. end_time = driver.find_element(By.ID, 'com.tencent.mm:id/g73').get_attribute('name')
  172. end_time = int(end_time.split(':')[0]) * 60 + int(end_time.split(':')[-1])
  173. duration = start_time + end_time
  174. # 点赞
  175. like_id = driver.find_element(By.ID, 'com.tencent.mm:id/k04')
  176. like_cnt = like_id.get_attribute('name')
  177. if like_cnt == "" or like_cnt == "喜欢":
  178. like_cnt = 0
  179. elif '万' in like_cnt:
  180. like_cnt = float(like_cnt.split('万')[0]) * 10000
  181. elif '万+' in like_cnt:
  182. like_cnt = float(like_cnt.split('万+')[0]) * 10000
  183. else:
  184. like_cnt = float(like_cnt)
  185. # 分享
  186. share_id = driver.find_element(By.ID, 'com.tencent.mm:id/jhv')
  187. share_cnt = share_id.get_attribute('name')
  188. if share_cnt == "" or share_cnt == "转发":
  189. share_cnt = 0
  190. elif '万' in share_cnt:
  191. share_cnt = float(share_cnt.split('万')[0]) * 10000
  192. elif '万+' in share_cnt:
  193. share_cnt = float(share_cnt.split('万+')[0]) * 10000
  194. else:
  195. share_cnt = float(share_cnt)
  196. # 收藏
  197. favorite_id = driver.find_element(By.ID, 'com.tencent.mm:id/fnp')
  198. favorite_cnt = favorite_id.get_attribute('name')
  199. if favorite_cnt == "" or favorite_cnt == "收藏":
  200. favorite_cnt = 0
  201. elif '万' in favorite_cnt:
  202. favorite_cnt = float(favorite_cnt.split('万')[0]) * 10000
  203. elif '万+' in favorite_cnt:
  204. favorite_cnt = float(favorite_cnt.split('万+')[0]) * 10000
  205. else:
  206. favorite_cnt = float(favorite_cnt)
  207. # 评论
  208. comment_id = driver.find_element(By.ID, 'com.tencent.mm:id/bje')
  209. comment_cnt = comment_id.get_attribute('name')
  210. if comment_cnt == "" or comment_cnt == "评论":
  211. comment_cnt = 0
  212. elif '万' in comment_cnt:
  213. comment_cnt = float(comment_cnt.split('万')[0]) * 10000
  214. elif '万+' in comment_cnt:
  215. comment_cnt = float(comment_cnt.split('万+')[0]) * 10000
  216. else:
  217. comment_cnt = float(comment_cnt)
  218. # 用户名
  219. username_id = driver.find_element(By.ID, 'com.tencent.mm:id/hft')
  220. user_name = username_id.get_attribute('name')
  221. Common.logger(log_type).info('video_title:{}', video_title)
  222. Common.logger(log_type).info('duration:{}', duration)
  223. Common.logger(log_type).info('like_cnt:{}', like_cnt)
  224. Common.logger(log_type).info('share_cnt:{}', share_cnt)
  225. Common.logger(log_type).info('favorite_cnt:{}', favorite_cnt)
  226. Common.logger(log_type).info('comment_cnt:{}', comment_cnt)
  227. Common.logger(log_type).info('user_name:{}', user_name)
  228. if int(duration) < 50:
  229. Common.logger(log_type).info(f'时长:{int(duration)} < 50 秒\n')
  230. elif video_title == '':
  231. Common.logger(log_type).info('视频标题为空\n')
  232. # 过滤词库(视频标题)
  233. elif any(word if word in video_title else False for word in cls.filter_words(log_type)) is True:
  234. Common.logger(log_type).info(f'视频已中过滤词:{video_title}\n')
  235. # 视频号推荐_已下载表
  236. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'c77cf9') for x in y]:
  237. Common.logger(log_type).info('视频已下载\n')
  238. # 视频号定向_已下载表
  239. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'KsVtLe') for x in y]:
  240. Common.logger(log_type).info('视频已下载\n')
  241. elif cls.title_like(log_type, video_title) is True:
  242. Common.logger(log_type).info('标题相似度>=90%')
  243. # feeds 表去重
  244. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'FSDlBy') for x in y]:
  245. Common.logger(log_type).info('视频已存在\n')
  246. # feeds 表去重
  247. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'qzDljJ') for x in y]:
  248. Common.logger(log_type).info('视频已存在\n')
  249. # 分享给 windows 爬虫机
  250. else:
  251. video_dict = {
  252. 'video_title': video_title,
  253. 'duration': duration,
  254. 'like_cnt': like_cnt,
  255. 'share_cnt': share_cnt,
  256. 'share_id': share_id,
  257. 'favorite_cnt': favorite_cnt,
  258. 'comment_cnt': comment_cnt,
  259. 'user_name': user_name
  260. }
  261. cls.share_to_windows(log_type, driver, video_dict, env)
  262. except Exception as e:
  263. Common.logger(log_type).error(f'get_video_info异常:{e}\n')
  264. @classmethod
  265. def title_like(cls, log_type, title):
  266. sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'KsVtLe')
  267. for i in range(1, len(sheet)):
  268. video_title = sheet[i][7]
  269. if video_title is None:
  270. pass
  271. elif difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.9:
  272. return True
  273. else:
  274. pass
  275. @classmethod
  276. def share_to_windows(cls, log_type, driver: WebDriver, video_dict, env):
  277. Common.logger(log_type).info('分享给 windows 爬虫机器')
  278. video_dict['share_id'].click()
  279. driver.find_element(By.XPATH, '//*[@text="转发给朋友"]').click()
  280. driver.find_element(By.XPATH, '//*[@text="爬虫群"]').click()
  281. driver.find_element(By.ID, 'com.tencent.mm:id/guw').click()
  282. # 把视频信息写入飞书feeds文档
  283. Feishu.insert_columns(log_type, 'shipinhao', 'qzDljJ', 'ROWS', 1, 2)
  284. get_feeds_time = int(time.time())
  285. values = [[time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(get_feeds_time)),
  286. '定向榜',
  287. str(video_dict['video_title']),
  288. int(video_dict['duration']),
  289. int(video_dict['like_cnt']),
  290. int(video_dict['share_cnt']),
  291. int(video_dict['favorite_cnt']),
  292. int(video_dict['comment_cnt']),
  293. str(video_dict['user_name'])]]
  294. time.sleep(1)
  295. Feishu.update_values(log_type, 'shipinhao', 'qzDljJ', 'A2:Z2', values)
  296. Common.logger(log_type).info('视频信息写入飞书文档成功\n')
  297. while True:
  298. if Feishu.get_values_batch(log_type, 'shipinhao', 'qzDljJ')[1][11] is None:
  299. Common.logger(log_type).info('等待更新 URL 信息')
  300. time.sleep(10)
  301. else:
  302. Common.logger(log_type).info('URL 信息已更新\n')
  303. break
  304. cls.download_publish(log_type, env)
  305. # 下载 、上传
  306. @classmethod
  307. def download_publish(cls, log_type, env):
  308. try:
  309. follow_feeds_sheet = Feishu.get_values_batch(log_type, 'shipinhao', 'qzDljJ')
  310. for i in range(1, len(follow_feeds_sheet)):
  311. download_title = follow_feeds_sheet[i][2]
  312. download_duration = follow_feeds_sheet[i][3]
  313. download_like_cnt = follow_feeds_sheet[i][4]
  314. download_share_cnt = follow_feeds_sheet[i][5]
  315. download_favorite_cnt = follow_feeds_sheet[i][6]
  316. download_comment_cnt = follow_feeds_sheet[i][7]
  317. download_username = follow_feeds_sheet[i][8]
  318. download_head_url = follow_feeds_sheet[i][9]
  319. download_cover_url = follow_feeds_sheet[i][10]
  320. download_video_url = follow_feeds_sheet[i][11]
  321. Common.logger(log_type).info("download_title:{}", download_title)
  322. Common.logger(log_type).info("download_username:{}", download_username)
  323. Common.logger(log_type).info("download_video_url:{}", download_video_url)
  324. if download_title is None or download_duration is None or download_video_url is None:
  325. Feishu.dimension_range(log_type, 'shipinhao', 'qzDljJ', 'ROWS', i + 1, i + 1)
  326. Common.logger(log_type).info('空行,删除成功\n')
  327. return
  328. else:
  329. # 下载封面
  330. Common.download_method(log_type=log_type, text="cover",
  331. d_name=str(download_title), d_url=str(download_cover_url))
  332. # 下载视频
  333. Common.download_method(log_type=log_type, text="video",
  334. d_name=str(download_title), d_url=str(download_video_url))
  335. # 保存视频信息至 "./videos/{download_video_title}/info.txt"
  336. with open("./videos/" + download_title
  337. + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
  338. f_a.write('shipinhao' + str(int(time.time())) + "\n" +
  339. str(download_title) + "\n" +
  340. str(download_duration) + "\n" +
  341. str(download_favorite_cnt) + "\n" +
  342. str(download_comment_cnt) + "\n" +
  343. str(download_like_cnt) + "\n" +
  344. str(download_share_cnt) + "\n" +
  345. str(1920 * 1080) + "\n" +
  346. str(int(time.time())) + "\n" +
  347. str(download_username) + "\n" +
  348. str(download_head_url) + "\n" +
  349. str(download_video_url) + "\n" +
  350. str(download_cover_url) + "\n" +
  351. "SHIPINHAO" + str(int(time.time())))
  352. Common.logger(log_type).info("==========视频信息已保存至info.txt==========")
  353. Common.logger(log_type).info("开始上传视频:{}".format(download_title))
  354. our_video_id = Publish.upload_and_publish(log_type, env, "follow")
  355. if env == 'dev':
  356. our_video_link = "https://testadmin.piaoquantv.com/cms/post-detail/" + str(
  357. our_video_id) + "/info"
  358. else:
  359. our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(
  360. our_video_id) + "/info"
  361. Common.logger(log_type).info("视频上传完成:{}", our_video_link)
  362. # 视频ID工作表,插入首行
  363. Feishu.insert_columns(log_type, "shipinhao", "KsVtLe", "ROWS", 1, 2)
  364. # 视频ID工作表,首行写入数据
  365. upload_time = int(time.time())
  366. values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
  367. "定向榜",
  368. str(download_title),
  369. our_video_link,
  370. download_duration,
  371. download_like_cnt,
  372. download_share_cnt,
  373. download_favorite_cnt,
  374. download_comment_cnt,
  375. download_username,
  376. str(download_head_url),
  377. str(download_cover_url),
  378. str(download_video_url)]]
  379. time.sleep(1)
  380. Feishu.update_values(log_type, "shipinhao", "KsVtLe", "F2:V2", values)
  381. # 删除行或列,可选 ROWS、COLUMNS
  382. time.sleep(1)
  383. Feishu.dimension_range(log_type, "shipinhao", "qzDljJ", "ROWS", 2, 2)
  384. Common.logger(log_type).info("下载/上传成功\n")
  385. # return
  386. except Exception as e:
  387. Feishu.dimension_range(log_type, "shipinhao", "qzDljJ", "ROWS", 2, 2)
  388. Common.logger(log_type).error('download_publish异常,删除视频信息成功:{}\n', e)
  389. @classmethod
  390. def search_to_all_user_homepage(cls, log_type, env):
  391. try:
  392. user_list = cls.get_users_from_feishu(log_type)
  393. for user in user_list:
  394. cls.start_follow_wechat(log_type, user, env)
  395. Common.logger(log_type).info('所有用户已抓取完毕\n')
  396. except Exception as e:
  397. Common.logger(log_type).error(f'search_to_all_user_homepage异常:{e}\n')
  398. if __name__ == '__main__':
  399. print(Follow.get_users_from_feishu('follow'))
  400. pass