# recommend_h5.py
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/7/26
import json
import os
import sys
import time
from hashlib import md5
from typing import List, Optional

from appium import webdriver
from appium.webdriver.webdriver import WebDriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By

# The project-local packages below are resolved relative to the working directory,
# so this must run before they are imported.
sys.path.append(os.getcwd())
from common.mq import MQ
from common.public import similarity
from common.common import Common
from shipinhao.shipinhao_recommend.shipinhao_recommend import ShipinhaoRecommend
  18. class RecommendH5:
  19. platform = "视频号"
  20. @classmethod
  21. def start_wechat(cls, log_type, crawler, env):
  22. Common.logger(log_type, crawler).info('启动微信')
  23. Common.logging(log_type, crawler, env, '启动微信')
  24. if env == "dev":
  25. chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver"
  26. else:
  27. chromedriverExecutable = '/Users/lieyunye/Downloads/chromedriver/chromedriver_v111/chromedriver'
  28. caps = {
  29. "platformName": "Android", # 手机操作系统 Android / iOS
  30. "deviceName": "Android", # 连接的设备名(模拟器或真机),安卓可以随便写
  31. "platforVersion": "13", # 手机对应的系统版本(Android 13)
  32. "appPackage": "com.tencent.mm", # 被测APP的包名,乐活圈 Android
  33. "appActivity": ".ui.LauncherUI", # 启动的Activity名
  34. "autoGrantPermissions": True, # 让 appium 自动授权 base 权限,
  35. # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
  36. "unicodekeyboard": True, # 使用自带输入法,输入中文时填True
  37. "resetkeyboard": True, # 执行完程序恢复原来输入法
  38. "noReset": True, # 不重置APP
  39. "recreateChromeDriverSessions": True, # 切换到非 chrome-Driver 会 kill 掉 session,就不需要手动 kill 了
  40. "printPageSourceOnFailure": True, # 找不到元素时,appium log 会完整记录当前页面的 pagesource
  41. "newCommandTimeout": 6000, # 初始等待时间
  42. "automationName": "UiAutomator2", # 使用引擎,默认为 Appium,
  43. # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
  44. "showChromedriverLog": True,
  45. # "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  46. "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
  47. # "chromeOptions": {"androidProcess": "com.tencent.mm:toolsmp"},
  48. # "chromeOptions": {"androidProcess": "com.tencent.mm"},
  49. 'enableWebviewDetailsCollection': True,
  50. 'setWebContentsDebuggingEnabled': True,
  51. 'chromedriverExecutable': chromedriverExecutable,
  52. }
  53. driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  54. driver.implicitly_wait(10)
  55. time.sleep(5)
  56. return driver
  57. # 查找元素
  58. @classmethod
  59. def search_elements(cls, driver: WebDriver, xpath):
  60. time.sleep(1)
  61. windowHandles = driver.window_handles
  62. for handle in windowHandles:
  63. driver.switch_to.window(handle)
  64. time.sleep(1)
  65. try:
  66. elements = driver.find_elements(By.XPATH, xpath)
  67. if elements:
  68. return elements
  69. except NoSuchElementException:
  70. pass
  71. # noinspection PyBroadException
  72. @classmethod
  73. def check_to_webview(cls, log_type, crawler, env, driver: WebDriver):
  74. webviews = driver.contexts
  75. Common.logger(log_type, crawler).info(f"webviews:{webviews}")
  76. Common.logging(log_type, crawler, env, f"webviews:{webviews}")
  77. driver.switch_to.context(webviews[1])
  78. Common.logger(log_type, crawler).info(driver.current_context)
  79. Common.logging(log_type, crawler, env, driver.current_context)
  80. time.sleep(1)
  81. windowHandles = driver.window_handles
  82. for handle in windowHandles:
  83. try:
  84. driver.switch_to.window(handle)
  85. time.sleep(1)
  86. driver.find_element(By.XPATH, '//div[@class="unit"]')
  87. Common.logger(log_type, crawler).info('切换 webview 成功')
  88. Common.logging(log_type, crawler, env, '切换 webview 成功')
  89. return "成功"
  90. except Exception:
  91. Common.logger(log_type, crawler).info("切换 webview 失败")
  92. Common.logging(log_type, crawler, env, "切换 webview 失败")
  93. @classmethod
  94. def search_video(cls, log_type, crawler, env, video_dict, rule_dict, our_uid):
  95. mq = MQ(topic_name="topic_crawler_etl_" + env)
  96. driver = cls.start_wechat(log_type, crawler, env)
  97. # 点击微信搜索框,并输入搜索词
  98. driver.implicitly_wait(10)
  99. Common.logger(log_type, crawler).info("点击搜索框")
  100. Common.logging(log_type, crawler, env, "点击搜索框")
  101. driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click() # 微信8.0.30版本
  102. time.sleep(0.5)
  103. driver.find_element(By.ID, 'com.tencent.mm:id/cd7').clear().send_keys(
  104. video_dict['video_title'].replace('"', "").replace('“', "").replace('”', "").replace('#', "")) # 微信8.0.30版本
  105. # driver.press_keycode(AndroidKey.ENTER)
  106. Common.logger(log_type, crawler).info("进入搜索词页面")
  107. Common.logging(log_type, crawler, env, "进入搜索词页面")
  108. driver.find_element(By.ID, 'com.tencent.mm:id/m94').click() # 微信8.0.30版本
  109. # 切换到微信搜索结果页 webview
  110. check_to_webview = cls.check_to_webview(log_type, crawler, env, driver)
  111. if check_to_webview is None:
  112. Common.logger(log_type, crawler).info("切换到视频号 webview 失败\n")
  113. Common.logging(log_type, crawler, env, "切换到视频号 webview 失败\n")
  114. return
  115. time.sleep(1)
  116. # 切换到"视频号"分类
  117. shipinhao_tags = cls.search_elements(driver, '//div[@class="unit"]/*[2]')
  118. Common.logger(log_type, crawler).info('点击"视频号"分类')
  119. Common.logging(log_type, crawler, env, '点击"视频号"分类')
  120. shipinhao_tags[0].click()
  121. time.sleep(5)
  122. global h5_page
  123. for i in range(3):
  124. h5_page = cls.search_elements(driver, '//*[@class="mixed-box__bd"]')
  125. if h5_page is None:
  126. Common.logger(log_type, crawler).info('未发现H5页面')
  127. Common.logging(log_type, crawler, env, '未发现H5页面')
  128. driver.refresh()
  129. else:
  130. break
  131. if h5_page is None:
  132. driver.quit()
  133. return
  134. Common.logger(log_type, crawler).info('获取视频列表\n')
  135. Common.logging(log_type, crawler, env, '获取视频列表\n')
  136. video_elements = cls.search_elements(driver, '//div[@class="rich-media active__absolute"]')
  137. if video_elements is None:
  138. Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
  139. Common.logging(log_type, crawler, env, f'video_elements:{video_elements}')
  140. return
  141. for i, video_element in enumerate(video_elements):
  142. try:
  143. if video_element is None:
  144. Common.logger(log_type, crawler).info('到底啦~\n')
  145. Common.logging(log_type, crawler, env, '到底啦~\n')
  146. return
  147. Common.logger(log_type, crawler).info(f'拖动"视频"列表第{i + 1}条至屏幕中间')
  148. Common.logging(log_type, crawler, env, f'拖动"视频"列表第{i + 1}条至屏幕中间')
  149. time.sleep(3)
  150. driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
  151. video_element)
  152. if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
  153. Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
  154. Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
  155. return
  156. h5_video_title = \
  157. video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[i].text[:40]
  158. h5_user_name = video_element.find_elements(By.XPATH, '//div[@class="rich-media__source__title"]')[
  159. i].text
  160. h5_video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[i].get_attribute(
  161. 'src')
  162. cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[i].get_attribute(
  163. 'style')
  164. h5_cover_url = cover_url.split('url("')[-1].split('")')[0]
  165. avatar_url = video_element.find_elements(By.XPATH,
  166. '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[
  167. i].get_attribute('style')
  168. h5_avatar_url = avatar_url.split('url("')[-1].split('")')[0]
  169. h5_out_video_id = md5(h5_video_title.encode('utf8')).hexdigest()
  170. h5_out_user_id = md5(h5_user_name.encode('utf8')).hexdigest()
  171. title_similarity = similarity(video_dict['video_title'], h5_video_title)
  172. user_name_similarity = similarity(video_dict['user_name'], h5_user_name)
  173. if title_similarity >= 0.5 and user_name_similarity >= 1.0:
  174. video_dict['cover_url'] = h5_cover_url
  175. video_dict['avatar_url'] = h5_avatar_url
  176. video_dict['out_video_id'] = h5_out_video_id
  177. video_dict['video_url'] = h5_video_url
  178. for k, v in video_dict.items():
  179. Common.logger(log_type, crawler).info(f"{k}:{v}")
  180. Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
  181. video_dict["out_user_id"] = h5_out_user_id
  182. video_dict["platform"] = crawler
  183. video_dict["strategy"] = log_type
  184. video_dict["strategyType"] = "recommend"
  185. video_dict["out_video_id"] = h5_out_video_id
  186. video_dict["width"] = 0
  187. video_dict["height"] = 0
  188. video_dict["crawler_rule"] = json.dumps(rule_dict)
  189. video_dict["user_id"] = our_uid
  190. video_dict["publish_time"] = video_dict["publish_time_str"]
  191. mq.send_msg(video_dict)
  192. Common.logger(log_type, crawler).info("已抓取到目标视频\n")
  193. Common.logging(log_type, crawler, env, "已抓取到目标视频\n")
  194. driver.quit()
  195. return
  196. else:
  197. Common.logger(log_type, crawler).info(f"video_dict['video_title']:{video_dict['video_title']}")
  198. Common.logging(log_type, crawler, env, f"video_dict['video_title']:{video_dict['video_title']}")
  199. Common.logger(log_type, crawler).info(f"h5_video_title:{h5_video_title}")
  200. Common.logging(log_type, crawler, env, f"h5_video_title:{h5_video_title}")
  201. Common.logger(log_type, crawler).info(f"title_similarity:{title_similarity}")
  202. Common.logging(log_type, crawler, env, f"title_similarity:{title_similarity}")
  203. Common.logger(log_type, crawler).info(f"video_dict['user_name']:{video_dict['user_name']}")
  204. Common.logging(log_type, crawler, env, f"video_dict['user_name']:{video_dict['user_name']}")
  205. Common.logger(log_type, crawler).info(f"h5_user_name:{h5_user_name}")
  206. Common.logging(log_type, crawler, env, f"h5_user_name:{h5_user_name}")
  207. Common.logger(log_type, crawler).info(f"user_name_similarity:{user_name_similarity}")
  208. Common.logging(log_type, crawler, env, f"user_name_similarity:{user_name_similarity}")
  209. except Exception as e:
  210. Common.logger(log_type, crawler).info(f"抓取单条H5视频时异常:{e}\n")
  211. Common.logging(log_type, crawler,env, f"抓取单条H5视频时异常:{e}\n")
  212. Common.logger(log_type, crawler).info("未找到目标视频\n")
  213. Common.logging(log_type, crawler, env, "未找到目标视频\n")
  214. @classmethod
  215. def download_videos(cls, log_type, crawler, env, rule_dict, our_uid):
  216. try:
  217. Common.logger(log_type, crawler).info(f'共{len(ShipinhaoRecommend.download_video_list)}条视频待抓取')
  218. Common.logging(log_type, crawler, env, f'共{len(ShipinhaoRecommend.download_video_list)}条视频待抓取')
  219. Common.logger(log_type, crawler).info(f'download_video_list:{ShipinhaoRecommend.download_video_list}\n')
  220. Common.logging(log_type, crawler, env, f'download_video_list:{ShipinhaoRecommend.download_video_list}\n')
  221. if len(ShipinhaoRecommend.download_video_list) == 0:
  222. Common.logger(log_type, crawler).info("没有待下载的视频\n")
  223. Common.logging(log_type, crawler, env, "没有待下载的视频\n")
  224. return
  225. for video_dict in ShipinhaoRecommend.download_video_list:
  226. try:
  227. cls.search_video(log_type, crawler, env, video_dict, rule_dict, our_uid)
  228. except Exception as e:
  229. Common.logger(log_type, crawler).info(f"抓取视频异常:{e}\n")
  230. Common.logging(log_type, crawler, env, f"抓取视频异常:{e}\n")
  231. except Exception as e:
  232. Common.logger(log_type, crawler).info(f"download_videos异常:{e}\n")
  233. Common.logging(log_type, crawler, env, f"download_videos异常:{e}\n")
  234. if __name__ == "__main__":
  235. ShipinhaoRecommend.download_video_list = [
  236. {'video_title': '网友:不知道此时此刻黑车司机在想什么', 'video_id': '96bfb8b86965df7365f02373ce37fe87', 'duration': 21, 'user_name': '沂蒙晚报', 'like_cnt': 9575, 'share_cnt': 11000, 'favorite_cnt': 25000, 'comment_cnt': 5026, 'publish_time_str': '2023-07-25', 'publish_time_stamp': 1690214400, 'publish_time': 1690214400000, 'period': 1},
  237. {'video_title': '女朋友这不就来了么', 'video_id': 'b1892886dca8c38dd6d72848ae4fd565', 'duration': 10, 'user_name': '向往的火焰蓝', 'like_cnt': 11000, 'share_cnt': 3701, 'favorite_cnt': 26000, 'comment_cnt': 1426, 'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800, 'publish_time': 1690300800000, 'period': 0},
  238. {'video_title': '近日,在韩国举办的2023世界跆拳道大赛上,中国选手出“奇招”,引网友点赞。关注', 'video_id': 'ebe8637a152c58bac2f1d875b257f9b5', 'duration': 10, 'user_name': '搜狐新闻', 'like_cnt': 9475, 'share_cnt': 9134, 'favorite_cnt': 18000, 'comment_cnt': 1770, 'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800, 'publish_time': 1690300800000, 'period': 0},
  239. {'video_title': '与愚者争论,自己就是愚者 #动画小故事 #哲理故事', 'video_id': '629abeb79f0de7a4dc45fadffc8ebc2b', 'duration': 32, 'user_name': '陈搞搞', 'like_cnt': 23000, 'share_cnt': 49000, 'favorite_cnt': 67000, 'comment_cnt': 1336, 'publish_time_str': '2023-07-24', 'publish_time_stamp': 1690128000, 'publish_time': 1690128000000, 'period': 2},
  240. {'video_title': '我看不懂这种行为的意义在哪里,所以我决定坚持反复观看试图参悟其中的深意,', 'video_id': 'd7e6e1eeb519183d5e8665c92a101378', 'duration': 15, 'user_name': '蜡笔小星丶', 'like_cnt': 20000, 'share_cnt': 100000, 'favorite_cnt': 51000, 'comment_cnt': 9836, 'publish_time_str': '2023-07-25', 'publish_time_stamp': 1690214400, 'publish_time': 1690214400000, 'period': 1},
  241. {'video_title': '女子一回家就开始脱衣服,不料老公的弟弟还在家里,女子下一秒的反应亮了!', 'video_id': 'c75472e887f2641acd34138b705cf8b9', 'duration': 11, 'user_name': '西米七七', 'like_cnt': 4335, 'share_cnt': 1107, 'favorite_cnt': 13000, 'comment_cnt': 1068, 'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800, 'publish_time': 1690300800000, 'period': 0}]
  242. RecommendH5.download_videos(log_type="recommend",
  243. crawler="shipinhao",
  244. env="prod",
  245. rule_dict={"period": {"min": 365, "max": 365},
  246. "duration": {"min": 10, "max": 1800},
  247. "favorite_cnt": {"min": 50000, "max": 0},
  248. "share_cnt": {"min": 10000, "max": 0}},
  249. our_uid=61333564
  250. )