shipinhao_topic.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/9/19
  4. import time
  5. import os
  6. import sys
  7. from appium import webdriver
  8. from selenium.common import NoSuchElementException
  9. from selenium.webdriver.common.by import By
  10. from appium.webdriver.webdriver import WebDriver
  11. sys.path.append(os.getcwd())
  12. from main.common import Common
  13. from main.feishu_lib import Feishu
  14. class Topic:
  15. # 搜索词列表
  16. search_words = ['#正能量', '#老外真会玩', '#令人震惊的宠物', '#搞笑', '#娱乐',
  17. '#热门', '#我要上热门', '#微信娱乐时刻', '#搞笑视频', '#航拍']
  18. # 单个话题抓取视频数
  19. video_cnt = []
  20. # 下载规则
  21. @staticmethod
  22. def download_rule(duration, like_cnt, share_cnt, favorite_cnt, comment_cnt):
  23. if int(duration) >= 30:
  24. if int(like_cnt) >= 0:
  25. if int(share_cnt) >= 0:
  26. if int(favorite_cnt) >= 100000:
  27. if int(comment_cnt) >= 0:
  28. return True
  29. else:
  30. return False
  31. else:
  32. return False
  33. else:
  34. return False
  35. else:
  36. return False
  37. else:
  38. return False
  39. @classmethod
  40. def get_topics(cls, log_type):
  41. try:
  42. topic_sht = Feishu.get_values_batch(log_type, 'shipinhao', 'TZuDRX')
  43. topic_list = []
  44. for x in topic_sht:
  45. for y in x:
  46. if y is None:
  47. pass
  48. else:
  49. topic_list.append(y)
  50. return topic_list
  51. except Exception as e:
  52. Common.logger(log_type).error('get_topics异常:{}', e)
  53. @classmethod
  54. def start_topic(cls, log_type, topic):
  55. try:
  56. Common.logger(log_type).info('启动微信')
  57. caps = {
  58. "platformName": "Android", # 手机操作系统 Android / iOS
  59. "deviceName": "Android", # 连接的设备名(模拟器或真机),安卓可以随便写
  60. "platforVersion": "11", # 手机对应的系统版本(Android 11)
  61. "appPackage": "com.tencent.mm", # 被测APP的包名,乐活圈 Android
  62. "appActivity": ".ui.LauncherUI", # 启动的Activity名
  63. "autoGrantPermissions": "true", # 让 appium 自动授权 base 权限,
  64. # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
  65. "unicodekeyboard": True, # 使用自带输入法,输入中文时填True
  66. "resetkeyboard": True, # 执行完程序恢复原来输入法
  67. "noReset": True, # 不重置APP
  68. "recreateChromeDriverSessions": True, # 切换到非 chrome-Driver 会 kill 掉 session,就不需要手动 kill 了
  69. "printPageSourceOnFailure": True, # 找不到元素时,appium log 会完整记录当前页面的 pagesource
  70. "newCommandTimeout": 6000, # 初始等待时间
  71. "automationName": "UiAutomator2", # 使用引擎,默认为 Appium,
  72. # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
  73. "showChromedriverLog": True,
  74. # "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  75. "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
  76. 'enableWebviewDetailsCollection': True,
  77. 'setWebContentsDebuggingEnabled': True
  78. }
  79. driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  80. driver.implicitly_wait(10)
  81. Common.logger(log_type).info('点击发现TAB')
  82. driver.find_elements(By.ID, 'com.tencent.mm:id/f2s')[2].click()
  83. Common.logger(log_type).info('进入视频号')
  84. driver.find_elements(By.ID, 'com.tencent.mm:id/gv6')[1].click()
  85. time.sleep(1)
  86. Common.logger(log_type).info('进入搜索页\n')
  87. driver.find_element(By.ID, 'com.tencent.mm:id/j4j').click()
  88. cls.get_topic_feeds(log_type, topic, driver)
  89. Common.logger(log_type).info('休眠 3s,退出微信')
  90. time.sleep(3)
  91. cls.quit(log_type, driver)
  92. except Exception as e:
  93. Common.logger(log_type).error('start_wechat异常\n', e)
  94. # 退出 APP
  95. @classmethod
  96. def quit(cls, log_type, driver: WebDriver):
  97. driver.quit()
  98. Common.logger(log_type).info('退出 APP 成功\n')
  99. # 操作安卓手机,自己滑动首页视频,并获取视频信息
  100. @classmethod
  101. def get_feeds(cls, log_type, topic, driver: WebDriver):
  102. try:
  103. driver.implicitly_wait(10)
  104. while True:
  105. # 滑动到底部
  106. try:
  107. driver.find_element(By.XPATH, '//*[@text="没有更多了"]')
  108. Common.logger(log_type).info('已经滑动到底部\n')
  109. return
  110. except NoSuchElementException:
  111. pass
  112. # 视频标题
  113. try:
  114. title_id = driver.find_element(By.ID, 'com.tencent.mm:id/ki5')
  115. video_title = title_id.get_attribute('name').split('\n')[0].replace('#', '').strip()
  116. except NoSuchElementException:
  117. video_title = ''
  118. driver.swipe(10, 1600, 10, 300, 200)
  119. # 点击播放器,获取视频时长
  120. # Common.logger(log_type).info('暂停播放')
  121. pause_btn = driver.find_element(By.ID, 'com.tencent.mm:id/eh4')
  122. pause_btn.click()
  123. try:
  124. start_time = driver.find_element(By.ID, 'com.tencent.mm:id/l59').get_attribute('name')
  125. start_time = int(start_time.split(':')[0]) * 60 + int(start_time.split(':')[-1])
  126. end_time = driver.find_element(By.ID, 'com.tencent.mm:id/l7i').get_attribute('name')
  127. except NoSuchElementException:
  128. # driver.find_element(By.ID, 'com.tencent.mm:id/eh4').click()
  129. start_time = 0
  130. end_time = driver.find_element(By.ID, 'com.tencent.mm:id/g73').get_attribute('name')
  131. end_time = int(end_time.split(':')[0]) * 60 + int(end_time.split(':')[-1])
  132. duration = start_time + end_time
  133. # 点赞
  134. like_id = driver.find_element(By.ID, 'com.tencent.mm:id/k04')
  135. like_cnt = like_id.get_attribute('name')
  136. if like_cnt == "" or like_cnt == "喜欢":
  137. like_cnt = 0
  138. elif '万' in like_cnt:
  139. like_cnt = float(like_cnt.split('万')[0]) * 10000
  140. elif '万+' in like_cnt:
  141. like_cnt = float(like_cnt.split('万+')[0]) * 10000
  142. else:
  143. like_cnt = float(like_cnt)
  144. # 分享
  145. share_id = driver.find_element(By.ID, 'com.tencent.mm:id/jhv')
  146. share_cnt = share_id.get_attribute('name')
  147. if share_cnt == "" or share_cnt == "转发":
  148. share_cnt = 0
  149. elif '万' in share_cnt:
  150. share_cnt = float(share_cnt.split('万')[0]) * 10000
  151. elif '万+' in share_cnt:
  152. share_cnt = float(share_cnt.split('万+')[0]) * 10000
  153. else:
  154. share_cnt = float(share_cnt)
  155. # 收藏
  156. favorite_id = driver.find_element(By.ID, 'com.tencent.mm:id/fnp')
  157. favorite_cnt = favorite_id.get_attribute('name')
  158. if favorite_cnt == "" or favorite_cnt == "收藏":
  159. favorite_cnt = 0
  160. elif '万' in favorite_cnt:
  161. favorite_cnt = float(favorite_cnt.split('万')[0]) * 10000
  162. elif '万+' in favorite_cnt:
  163. favorite_cnt = float(favorite_cnt.split('万+')[0]) * 10000
  164. else:
  165. favorite_cnt = float(favorite_cnt)
  166. # 评论
  167. comment_id = driver.find_element(By.ID, 'com.tencent.mm:id/bje')
  168. comment_cnt = comment_id.get_attribute('name')
  169. if comment_cnt == "" or comment_cnt == "评论":
  170. comment_cnt = 0
  171. elif '万' in comment_cnt:
  172. comment_cnt = float(comment_cnt.split('万')[0]) * 10000
  173. elif '万+' in comment_cnt:
  174. comment_cnt = float(comment_cnt.split('万+')[0]) * 10000
  175. else:
  176. comment_cnt = float(comment_cnt)
  177. # 用户名
  178. username_id = driver.find_element(By.ID, 'com.tencent.mm:id/hft')
  179. user_name = username_id.get_attribute('name')
  180. Common.logger(log_type).info('video_title:{}', video_title)
  181. Common.logger(log_type).info('user_name:{}', user_name)
  182. Common.logger(log_type).info('duration:{}', duration)
  183. Common.logger(log_type).info('favorite_cnt:{}', favorite_cnt)
  184. # 判断无效视频
  185. if video_title == '' or user_name == '':
  186. Common.logger(log_type).info('无效视频,滑动到下一个视频\n')
  187. driver.swipe(10, 1600, 10, 300, 200)
  188. # 判断下载规则
  189. elif cls.download_rule(duration, like_cnt, share_cnt, favorite_cnt, comment_cnt) is False:
  190. Common.logger(log_type).info('不满足抓取规则,滑动到下一个视频\n')
  191. driver.swipe(10, 1600, 10, 300, 200)
  192. # 已下载表去重
  193. elif str(video_title) in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'c77cf9') for x
  194. in y]:
  195. Common.logger(log_type).info('视频已下载,滑动到下一个视频\n')
  196. driver.swipe(10, 1600, 10, 300, 200)
  197. # feeds 表去重
  198. elif str(video_title) in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'FSDlBy') for x
  199. in y]:
  200. Common.logger(log_type).info('视频已存在,滑动到下一个视频\n')
  201. driver.swipe(10, 1600, 10, 300, 200)
  202. # 分享给 windows 爬虫机
  203. else:
  204. share_id.click()
  205. driver.find_element(By.XPATH, '//*[@text="转发给朋友"]').click()
  206. driver.find_element(By.XPATH, '//*[@text="爬虫群"]').click()
  207. driver.find_element(By.ID, 'com.tencent.mm:id/guw').click()
  208. # 把视频信息写入飞书feeds文档
  209. Feishu.insert_columns(log_type, 'shipinhao', 'FSDlBy', 'ROWS', 1, 2)
  210. get_feeds_time = int(time.time())
  211. values = [[time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(get_feeds_time)),
  212. '话题榜:'+str(topic),
  213. str(video_title),
  214. duration,
  215. like_cnt,
  216. share_cnt,
  217. favorite_cnt,
  218. comment_cnt,
  219. str(user_name)]]
  220. time.sleep(1)
  221. Feishu.update_values(log_type, 'shipinhao', 'FSDlBy', 'A2:Z2', values)
  222. Common.logger(log_type).info('视频信息写入飞书文档成功\n')
  223. while True:
  224. if Feishu.get_values_batch(log_type, 'shipinhao', 'FSDlBy')[1][11] is None:
  225. Common.logger(log_type).info('等待更新 URL 信息')
  226. time.sleep(10)
  227. else:
  228. Common.logger(log_type).info('URL 信息已更新,滑动到下一个视频\n')
  229. cls.video_cnt.append(video_title)
  230. driver.swipe(10, 1600, 10, 300, 200)
  231. break
  232. if len(cls.video_cnt) < 5:
  233. Common.logger(log_type).info('已抓取{}条视频\n', len(cls.video_cnt))
  234. else:
  235. Common.logger(log_type).info('已抓取{}条视频\n', len(cls.video_cnt))
  236. cls.video_cnt = []
  237. return
  238. except Exception as e:
  239. Common.logger(log_type).error('get_feeds异常,滑动到下一个视频\n', e)
  240. driver.swipe(10, 1600, 10, 300, 200)
  241. @classmethod
  242. def get_topic_feeds(cls, log_type, topic, driver: WebDriver):
  243. try:
  244. driver.implicitly_wait(15)
  245. webview = driver.contexts
  246. Common.logger(log_type).info('webview:{}', webview)
  247. Common.logger(log_type).info('切换到webview')
  248. driver.switch_to.context('WEBVIEW_com.tencent.mm:appbrand0')
  249. time.sleep(5)
  250. Common.logger(log_type).info('展开更多搜索历史')
  251. for i in range(3):
  252. try:
  253. search_history = driver.find_element(By.XPATH, '//*[@class="arrow-wrap"]')
  254. search_history.click()
  255. break
  256. except NoSuchElementException:
  257. time.sleep(3)
  258. Common.logger(log_type).info('未发现更多按钮')
  259. Common.logger(log_type).info('查找所有搜索历史')
  260. search_words = driver.find_elements(By.XPATH, '//*[@class="history__item__text"]')
  261. for search_word in search_words:
  262. if search_word.text == topic:
  263. search_word.click()
  264. time.sleep(3)
  265. Common.logger(log_type).info('点击搜索结果第一个视频')
  266. driver.find_elements(By.XPATH, '//*[@class="waterfall__item"]')[0].click()
  267. Common.logger(log_type).info('切回NATIVE_APP\n')
  268. driver.switch_to.context('NATIVE_APP')
  269. time.sleep(3)
  270. cls.get_feeds(log_type, topic, driver)
  271. return
  272. except Exception as e:
  273. Common.logger(log_type).error('get_topic_feeds异常:{}\n', e)
  274. @classmethod
  275. def search_topic(cls, log_type):
  276. for topic in cls.search_words:
  277. Common.logger(log_type).info('搜索话题:{}\n', topic)
  278. cls.start_topic(log_type, topic)
  279. time.sleep(3)
  280. if __name__ == '__main__':
  281. Topic.search_topic('topic')
  282. pass