zhufuquanzi_recommend.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. # -*- coding: utf-8 -*-
  2. # @Author: wang
  3. # @Time: 2023/9/6
  4. import json
  5. import os
  6. import sys
  7. import time
  8. from hashlib import md5
  9. from appium import webdriver
  10. from appium.webdriver.extensions.android.nativekey import AndroidKey
  11. from appium.webdriver.webdriver import WebDriver
  12. from selenium.common import NoSuchElementException
  13. from selenium.webdriver.common.by import By
  14. sys.path.append(os.getcwd())
  15. from common.common import Common
  16. from common.mq import MQ
  17. from common.public import download_rule, get_config_from_mysql
  18. from common.scheduling_db import MysqlHelper
class ZFQZRecommend:
    """Appium-driven crawler for the WeChat mini program "祝福圈子".

    All methods are classmethods; crawl progress is kept in the class
    attributes below.
    """

    # Platform label; interpolated into the duplicate-check SQL in repeat_video.
    platform = "祝福圈子"
    # Number of videos sent to MQ in the current round (updated in get_videoList).
    download_cnt = 0
    # Running ordinal of the video being processed; only used in log messages.
    i = 0
  23. @classmethod
  24. def start_wechat(cls, log_type, crawler, env, rule_dict, our_uid):
  25. if env == "dev":
  26. chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver"
  27. else:
  28. chromedriverExecutable = "/Users/piaoquan/Downloads/chromedriver"
  29. Common.logger(log_type, crawler).info("启动微信")
  30. Common.logging(log_type, crawler, env, '启动微信')
  31. caps = {
  32. "platformName": "Android",
  33. "devicesName": "Android",
  34. "platformVersion": "11",
  35. # "udid": "emulator-5554",
  36. "appPackage": "com.tencent.mm",
  37. "appActivity": ".ui.LauncherUI",
  38. "autoGrantPermissions": "true",
  39. "noReset": True,
  40. "resetkeyboard": True,
  41. "unicodekeyboard": True,
  42. "showChromedriverLog": True,
  43. "printPageSourceOnFailure": True,
  44. "recreateChromeDriverSessions": True,
  45. "enableWebviewDetailsCollection": True,
  46. "setWebContentsDebuggingEnabled": True,
  47. "newCommandTimeout": 6000,
  48. "automationName": "UiAutomator2",
  49. "chromedriverExecutable": chromedriverExecutable,
  50. "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  51. }
  52. driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  53. driver.implicitly_wait(30)
  54. for i in range(120):
  55. try:
  56. if driver.find_elements(By.ID, "com.tencent.mm:id/f2s"):
  57. Common.logger(log_type, crawler).info("微信启动成功")
  58. Common.logging(log_type, crawler, env, '微信启动成功')
  59. break
  60. elif driver.find_element(By.ID, "com.android.systemui:id/dismiss_view"):
  61. Common.logger(log_type, crawler).info("发现并关闭系统下拉菜单")
  62. Common.logging(log_type, crawler, env, '发现并关闭系统下拉菜单')
  63. driver.find_element(By.ID, "com.android.system:id/dismiss_view").click()
  64. else:
  65. pass
  66. except NoSuchElementException:
  67. time.sleep(1)
  68. Common.logger(log_type, crawler).info("下滑,展示小程序选择面板")
  69. Common.logging(log_type, crawler, env, '下滑,展示小程序选择面板')
  70. size = driver.get_window_size()
  71. driver.swipe(int(size['width'] * 0.5), int(size['height'] * 0.2),
  72. int(size['width'] * 0.5), int(size['height'] * 0.8), 200)
  73. time.sleep(5)
  74. Common.logger(log_type, crawler).info('打开小程序"祝福圈子"')
  75. Common.logging(log_type, crawler, env, '打开小程序"祝福圈子"')
  76. driver.find_elements(By.XPATH, '//*[@text="祝福圈子"]')[-1].click()
  77. time.sleep(10)
  78. cls.get_videoList(log_type, crawler, driver, env, rule_dict, our_uid)
  79. time.sleep(3)
  80. driver.quit()
  81. @classmethod
  82. def search_elements(cls, driver: WebDriver, xpath):
  83. time.sleep(1)
  84. windowHandles = driver.window_handles
  85. for handle in windowHandles:
  86. driver.switch_to.window(handle)
  87. time.sleep(1)
  88. try:
  89. elements = driver.find_elements(By.XPATH, xpath)
  90. if elements:
  91. return elements
  92. except NoSuchElementException:
  93. pass
  94. @classmethod
  95. def check_to_applet(cls, log_type, crawler, env, driver: WebDriver, xpath):
  96. time.sleep(1)
  97. webViews = driver.contexts
  98. Common.logger(log_type, crawler).info(f"webViews:{webViews}")
  99. Common.logging(log_type, crawler, env, f"webViews:{webViews}")
  100. driver.switch_to.context(webViews[1])
  101. windowHandles = driver.window_handles
  102. for handle in windowHandles:
  103. driver.switch_to.window(handle)
  104. time.sleep(1)
  105. try:
  106. driver.find_element(By.XPATH, xpath)
  107. Common.logger(log_type, crawler).info("切换到小程序成功\n")
  108. Common.logging(log_type, crawler, env, '切换到小程序成功\n')
  109. return
  110. except NoSuchElementException:
  111. time.sleep(1)
  112. @classmethod
  113. def repeat_video(cls, log_type, crawler, video_id, env):
  114. sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
  115. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  116. return len(repeat_video)
  117. @classmethod
  118. def get_videoList(cls, log_type, crawler, driver: WebDriver, env, rule_dict, our_uid):
  119. mq = MQ(topic_name="topic_crawler_etl_" + env)
  120. driver.implicitly_wait(20)
  121. cls.check_to_applet(log_type=log_type,
  122. crawler=crawler,
  123. env=env,
  124. driver=driver,
  125. xpath='//*[@class="tags--tag tags--tag-0 tags--checked"]')
  126. time.sleep(3)
  127. index = 0
  128. while True:
  129. if cls.search_elements(driver, '//*[@class="bless--list"]') is None:
  130. Common.logger(log_type, crawler).info("窗口已销毁\n")
  131. Common.logging(log_type, crawler, env, '窗口已销毁\n')
  132. return
  133. video_list_elements = cls.search_elements(driver,
  134. '//*[@is="pages/discover/components/bless/dynamic/dynamic"]')
  135. if video_list_elements is None:
  136. Common.logger(log_type, crawler).warning(f"当前视频列表为空:{video_list_elements}")
  137. Common.logging(log_type, crawler, env, f"当前视频列表为空:{video_list_elements}")
  138. return
  139. video_list = video_list_elements[index:]
  140. if len(video_list) == 0 or video_list is None:
  141. Common.logger(log_type, crawler).info("到底啦~~~~~~~~~~\n")
  142. Common.logging(log_type, crawler, env, "到底啦~~~~~~~~~~\n")
  143. return
  144. for i, video_element in enumerate(video_list):
  145. try:
  146. if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 10)):
  147. cls.download_cnt = 0
  148. cls.i = 0
  149. Common.logger(log_type, crawler).info(f"本轮已抓取视频数:{cls.download_cnt}")
  150. Common.logging(log_type, crawler, env, f"本轮已抓取视频数:{cls.download_cnt}")
  151. return
  152. if video_element is None:
  153. Common.logger(log_type, crawler).info("没有更多数据啦~\n")
  154. Common.logging(log_type, crawler, env, "没有更多数据啦~\n")
  155. return
  156. cls.i += 1
  157. cls.search_elements(driver, '//*[@is="pages/discover/components/bless/dynamic/dynamic"]')
  158. Common.logger(log_type, crawler).info(f"拖动第{cls.i}条视频至屏幕中间")
  159. Common.logging(log_type, crawler, env, f"拖动第{cls.i}条视频至屏幕中间")
  160. time.sleep(3)
  161. driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
  162. video_element)
  163. video_title = video_element.find_elements(By.XPATH, '//*[@class="dynamic--title"]')[index+i].text
  164. play_cnt_str = video_element.find_elements(By.XPATH, '//*[@class="dynamic--views"]')[index + i].text
  165. duration_str = video_element.find_elements(By.XPATH, '//*[@class="dynamic--duration"]')[index + i].text
  166. user_name = video_element.find_elements(By.XPATH, '//*[@class="dynamic--nick-top"]')[index + i].text
  167. like_cnt_str = video_element.find_elements(By.XPATH, '//*[@class="dynamic--commerce"]/*[1]/*[2]')[index + i].text
  168. comment_cnt_str = video_element.find_elements(By.XPATH, '//*[@class="dynamic--commerce"]/*[2]/*[2]')[index + i].text
  169. cover_url = video_element.find_elements(By.XPATH, '//*[@class="dynamic--bg-image"]')[index+i].get_attribute('src')
  170. avatar_url = video_element.find_elements(By.XPATH, '//*[@class="avatar--avatar"]')[index+i].get_attribute('src')
  171. play_cnt = int(play_cnt_str.replace("+", "").replace("次播放", ""))
  172. duration = int(duration_str.split(":")[0].strip())*60 + int(duration_str.split(":")[-1].strip())
  173. if "点赞" in like_cnt_str:
  174. like_cnt = 0
  175. elif "万" in like_cnt_str:
  176. like_cnt = int(like_cnt_str.split("万")[0])*10000
  177. else:
  178. like_cnt = int(like_cnt_str)
  179. if "评论" in comment_cnt_str:
  180. comment_cnt = 0
  181. elif "万" in comment_cnt_str:
  182. comment_cnt = int(comment_cnt_str.split("万")[0])*10000
  183. else:
  184. comment_cnt = int(comment_cnt_str)
  185. out_video_id = md5(video_title.encode('utf8')).hexdigest()
  186. out_user_id = md5(user_name.encode('utf8')).hexdigest()
  187. video_dict = {
  188. "video_title": video_title,
  189. "video_id": out_video_id,
  190. "duration": duration,
  191. "play_cnt": play_cnt,
  192. "like_cnt": like_cnt,
  193. "comment_cnt": comment_cnt,
  194. "share_cnt": 0,
  195. "user_name": user_name,
  196. "user_id": out_user_id,
  197. 'publish_time_stamp': int(time.time()),
  198. 'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
  199. "avatar_url": avatar_url,
  200. "cover_url": cover_url,
  201. "session": f"zhufuquanzi-{int(time.time())}"
  202. }
  203. for k, v in video_dict.items():
  204. Common.logger(log_type, crawler).info(f"{k}:{v}")
  205. Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
  206. if video_title is None or cover_url is None:
  207. Common.logger(log_type, crawler).info("无效视频\n")
  208. Common.logging(log_type, crawler, env, '无效视频\n')
  209. elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict,
  210. rule_dict=rule_dict) is False:
  211. Common.logger(log_type, crawler).info("不满足抓取规则\n")
  212. Common.logging(log_type, crawler, env, "不满足抓取规则\n")
  213. elif any(str(word) if str(word) in video_dict["video_title"] else False
  214. for word in get_config_from_mysql(log_type=log_type,
  215. source=crawler,
  216. env=env,
  217. text="filter",
  218. action="")) is True:
  219. Common.logger(log_type, crawler).info('已中过滤词\n')
  220. Common.logging(log_type, crawler, env, '已中过滤词\n')
  221. elif cls.repeat_video(log_type, crawler, out_video_id, env) != 0:
  222. Common.logger(log_type, crawler).info('视频已下载\n')
  223. Common.logging(log_type, crawler, env, '视频已下载\n')
  224. else:
  225. video_element.click()
  226. time.sleep(3)
  227. video_url_elements = cls.search_elements(driver, '//*[@class="index--video-item index--video"]')
  228. if video_url_elements is None or len(video_url_elements) == 0:
  229. Common.logger(log_type, crawler).info("未获取到视频播放地址\n")
  230. Common.logging(log_type, crawler, env, "未获取到视频播放地址\n")
  231. driver.press_keycode(AndroidKey.BACK)
  232. else:
  233. video_url = video_url_elements[0].get_attribute("src")
  234. video_dict["video_url"] = video_url
  235. Common.logger(log_type, crawler).info(f"video_url:{video_url}")
  236. video_dict["platform"] = crawler
  237. video_dict["strategy"] = log_type
  238. video_dict["out_video_id"] = video_dict["video_id"]
  239. video_dict["crawler_rule"] = json.dumps(rule_dict)
  240. video_dict["user_id"] = our_uid
  241. video_dict["publish_time"] = video_dict["publish_time_str"]
  242. mq.send_msg(video_dict)
  243. cls.download_cnt += 1
  244. driver.press_keycode(AndroidKey.BACK)
  245. Common.logger(log_type, crawler).info("符合抓取条件,mq send msg 成功\n")
  246. Common.logging(log_type, crawler, env, "符合抓取条件,ACK MQ 成功\n")
  247. except Exception as e:
  248. Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
  249. Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
  250. Common.logger(log_type, crawler).info("已抓取完一组,休眠 10 秒\n")
  251. Common.logging(log_type, crawler, env, "已抓取完一组,休眠 10 秒\n")
  252. time.sleep(10)
  253. index = index + len(video_list)
  254. if __name__ == "__main__":
  255. rule_dict1 = {"period": {"min": 365, "max": 365},
  256. "duration": {"min": 30, "max": 1800},
  257. "favorite_cnt": {"min": 5000, "max": 0},
  258. "videos_cnt": {"min": 10, "max": 20},
  259. "share_cnt": {"min": 1000, "max": 0}}
  260. ZFQZRecommend.start_wechat("recommend", "zhufuquanzi", "dev", rule_dict1, 6267141)