# zhongmiaoyinxin_recommend_new.py
# Zhongmiao Yinxin (众妙音信) crawler: backend integration
# @Author: luojunhui
# @Time: 2023/10/26
  4. import json
  5. import os
  6. import sys
  7. import time
  8. from hashlib import md5
  9. from appium import webdriver
  10. from appium.webdriver.common.touch_action import TouchAction
  11. from appium.webdriver.extensions.android.nativekey import AndroidKey
  12. from appium.webdriver.webdriver import WebDriver
  13. from bs4 import BeautifulSoup
  14. from selenium.common.exceptions import NoSuchElementException
  15. from selenium.webdriver.common.by import By
  16. import multiprocessing
  17. sys.path.append(os.getcwd())
  18. from common.mq import MQ
  19. from common.aliyun_log import AliyunLogger
  20. from common.pipeline import PiaoQuanPipeline
  21. class ZMYXRecommend:
  22. env = None
  23. driver = None
  24. log_type = None
  25. def __init__(self, log_type, crawler, env, rule_dict, our_uid):
  26. self.mq = None
  27. self.platform = "众妙音信"
  28. self.download_cnt = 0
  29. self.element_list = []
  30. self.count = 0
  31. self.swipe_count = 0
  32. self.log_type = log_type
  33. self.crawler = crawler
  34. self.env = env
  35. self.rule_dict = rule_dict
  36. self.our_uid = our_uid
  37. if self.env == "dev":
  38. chromedriverExecutable = "/Users/luojunhui/Downloads/chromedriver_V111/chromedriver"
  39. else:
  40. chromedriverExecutable = '/Users/piaoquan/Downloads/chromedriver' # Mac 爬虫机器
  41. # 微信的配置文件
  42. caps = {
  43. "platformName": "Android", # 手机操作系统 Android / iOS
  44. "deviceName": "a0a65126", # 连接的设备名(模拟器或真机),安卓可以随便写
  45. # "udid": "emulator-5554", # 指定 adb devices 中的哪一台设备
  46. "platforVersion": "11", # 手机对应的系统版本
  47. "appPackage": "com.tencent.mm", # 被测APP的包名,乐活圈 Android
  48. "appActivity": ".ui.LauncherUI", # 启动的Activity名
  49. "autoGrantPermissions": "true", # 让 appium 自动授权 base 权限,
  50. # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
  51. "unicodekeyboard": True, # 使用自带输入法,输入中文时填True
  52. "resetkeyboard": True, # 执行完程序恢复原来输入法
  53. "noReset": True, # 不重置APP
  54. "printPageSourceOnFailure": True, # 找不到元素时,appium log 会完整记录当前页面的 pagesource
  55. "newCommandTimeout": 6000, # 初始等待时间
  56. "automationName": "UiAutomator2", # 使用引擎,默认为 Appium,
  57. # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
  58. "showChromedriverLog": True,
  59. 'enableWebviewDetailsCollection': True,
  60. 'setWebContentsDebuggingEnabled': True,
  61. 'recreateChromeDriverSessions': True,
  62. 'chromedriverExecutable': chromedriverExecutable,
  63. "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  64. # "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
  65. 'browserName': ''
  66. }
  67. self.driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  68. self.driver.implicitly_wait(30)
  69. for i in range(120):
  70. try:
  71. if self.driver.find_elements(By.ID, "com.tencent.mm:id/f2s"):
  72. print("启动微信成功")
  73. break
  74. elif self.driver.find_element(By.ID, "com.android.systemui:id/dismiss_view"):
  75. print("发现并关闭系统下拉菜单")
  76. self.driver.find_element(By.ID, "com.android.system:id/dismiss_view").click()
  77. else:
  78. pass
  79. except NoSuchElementException:
  80. time.sleep(1)
  81. size = self.driver.get_window_size()
  82. self.driver.swipe(int(size['width'] * 0.5), int(size['height'] * 0.2),
  83. int(size['width'] * 0.5), int(size['height'] * 0.8), 200)
  84. time.sleep(1)
  85. self.driver.find_elements(By.XPATH, '//*[@text="西瓜悦"]')[-1].click()
  86. print("打开小程序成功")
  87. time.sleep(5)
  88. self.get_videoList()
  89. time.sleep(100)
  90. self.driver.quit()
  91. def search_elements(self, xpath):
  92. time.sleep(1)
  93. windowHandles = self.driver.window_handles
  94. for handle in windowHandles:
  95. self.driver.switch_to.window(handle)
  96. time.sleep(1)
  97. try:
  98. elements = self.driver.find_elements(By.XPATH, xpath)
  99. if elements:
  100. return elements
  101. except NoSuchElementException:
  102. pass
  103. def check_to_applet(self):
  104. while True:
  105. webview = self.driver.contexts
  106. self.driver.switch_to.context(webview[1])
  107. windowHandles = self.driver.window_handles
  108. for handle in windowHandles:
  109. self.driver.switch_to.window(handle)
  110. time.sleep(1)
  111. try:
  112. video_list = self.driver.find_element(By.XPATH, '//*[@class="index--navbar-list"]/*[1]')
  113. video_list.click()
  114. print("切换 webview 成功")
  115. return
  116. except NoSuchElementException:
  117. time.sleep(1)
  118. print("切换 webview 失败")
  119. break
  120. def swipe_up(self):
  121. self.search_elements('//*[@class="list-list--list"]')
  122. size = self.driver.get_window_size()
  123. self.driver.swipe(int(size["width"] * 0.5), int(size["height"] * 0.8),
  124. int(size["width"] * 0.5), int(size["height"] * 0.442), 200)
  125. self.swipe_count += 1
  126. def close_ad(self):
  127. window_size = self.driver.get_window_size()
  128. TouchAction(self.driver).tap(x=int(window_size['width'] * 0.5), y=int(window_size['height'] * 0.1)).perform()
  129. def get_videoList(self):
  130. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  131. self.driver.implicitly_wait(20)
  132. print("关闭广告")
  133. self.close_ad()
  134. print("切换到 webview")
  135. self.check_to_applet()
  136. index = 0
  137. while True:
  138. if self.search_elements('//*[@id="scrollContainer"]') is None:
  139. print("窗口已销毁")
  140. return
  141. print("获取视频列表")
  142. video_elements = self.search_elements('//wx-view[@class="cover"]')
  143. if video_elements is None:
  144. print("视频列表为空列表")
  145. return
  146. video_element_temp = video_elements[index:]
  147. if len(video_element_temp) == 0:
  148. print("视频已经到底")
  149. return
  150. for i, video_element in enumerate(video_element_temp):
  151. if video_element is None:
  152. return
  153. self.download_cnt += 1
  154. self.search_elements('//wx-view[@class="cover"]')
  155. time.sleep(3)
  156. self.driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
  157. video_element)
  158. video_title = video_element.find_elements(By.XPATH, '//wx-view[@class="playImgs"]')[index + i].text
  159. cover_url = video_element.find_elements(By.XPATH, '//wx-image[@class="coverImg"]')[
  160. index + i].get_attribute('src')
  161. play_cnt = video_element.find_elements(By.XPATH, '//wx-image[@class="coverImg"]/span/*[2]')[
  162. index + i].text
  163. if "万" in play_cnt:
  164. play_cnt = int(play_cnt.split("万")[0]) * 10000
  165. out_video_id = md5(video_title.encode('utf8')).hexdigest()
  166. video_dict = {
  167. 'video_title': video_title,
  168. 'video_id': out_video_id,
  169. 'out_video_id': out_video_id,
  170. 'play_cnt': play_cnt,
  171. 'comment_cnt': 0,
  172. 'like_cnt': 0,
  173. 'share_cnt': 0,
  174. 'publish_time_stamp': int(time.time()),
  175. 'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
  176. 'update_time_stamp': int(time.time()),
  177. 'user_name': "zhongmiaoyinxin",
  178. 'user_id': "zhongmiaoyinxin",
  179. 'avatar_url': cover_url,
  180. 'cover_url': cover_url,
  181. 'session': f"zhongmiaoyinxin-{int(time.time())}"
  182. }
  183. pipeline = PiaoQuanPipeline(
  184. platform=self.crawler,
  185. mode=self.log_type,
  186. item=video_dict,
  187. rule_dict=self.rule_dict,
  188. env=self.env
  189. )
  190. flag = pipeline.process_item()
  191. if flag:
  192. print(video_dict)
  193. else:
  194. print("被规则过滤")
  195. # self.mq.send_msg(video_dict)
  196. # print(video_dict)
  197. def run():
  198. rule_dict1 = {}
  199. ZMYXRecommend("recommend", "zhongmiaoyinxin", "dev", rule_dict1, 6267141)
  200. if __name__ == "__main__":
  201. run()
  202. # process = multiprocessing.Process(
  203. # target=run
  204. # )
  205. # process.start()
  206. # while True:
  207. # if not process.is_alive():
  208. # print("正在重启")
  209. # process.terminate()
  210. # time.sleep(60)
  211. # os.system("adb forward --remove-all")
  212. # process = multiprocessing.Process(target=run)
  213. # process.start()
  214. # time.sleep(60)