# xinshi_pc.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/10/25
  4. import os
  5. import sys
  6. import time
  7. from selenium.common import NoSuchElementException
  8. from selenium.webdriver import DesiredCapabilities, ActionChains
  9. from selenium.webdriver.chrome.service import Service
  10. from selenium.webdriver.common.by import By
  11. from selenium import webdriver
  12. sys.path.append(os.getcwd())
  13. from main.common import Common
  14. from main.feishu_lib import Feishu
  15. from xinshi.xinshi_app import XinshiAPP
  16. class XinshiPC:
  17. @classmethod
  18. def login(cls, log_type, env):
  19. # try:
  20. # 打印请求配置
  21. ca = DesiredCapabilities.CHROME
  22. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  23. # 不打开浏览器运行
  24. chrome_options = webdriver.ChromeOptions()
  25. chrome_options.add_argument("headless")
  26. chrome_options.add_argument(
  27. f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  28. chrome_options.add_argument("--no-sandbox")
  29. # driver初始化
  30. # Common.logger(log_type).info('初始化 webdriver')
  31. # driver = webdriver.Chrome(desired_capabilities=ca)
  32. # driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver_v106/chromedriver'))
  33. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/lieyunye/Downloads/chromedriver_v107/chromedriver'))
  34. driver.implicitly_wait(10)
  35. Common.logger(log_type).info('打开网页"新视-热门内容"')
  36. driver.get('https://xs.newrank.cn/Material/faddish/recentHot')
  37. driver.maximize_window()
  38. driver.implicitly_wait(10)
  39. time.sleep(1)
  40. Common.logger(log_type).info('点击"登录/按钮"')
  41. driver.find_element(By.XPATH, '//button[@class="ant-btn ant-btn-primary"]').click()
  42. time.sleep(1)
  43. Common.logger(log_type).info('点击"其他登录方式"')
  44. driver.find_element(By.XPATH, '//span[@class ="_2XRFN1F6"]').click()
  45. time.sleep(1)
  46. Common.logger(log_type).info('输入手机号')
  47. driver.find_element(By.XPATH, '//input[@class="_2DyE0cvF"]').send_keys('13426262515')
  48. Common.logger(log_type).info('输入密码')
  49. driver.find_element(By.XPATH, '//input[@placeholder="输入密码"]').send_keys('test111111')
  50. time.sleep(1)
  51. Common.logger(log_type).info('勾选"保持登录状态"')
  52. driver.find_element(By.XPATH, '//input[@class="nrd-login-checkbox-input"]').click()
  53. time.sleep(1)
  54. Common.logger(log_type).info('点击"登录"')
  55. driver.find_element(By.XPATH, '//button[@class="_3RtjFeM- _CH1sF8Xz _38DPDVRd"]').click()
  56. # 滑块
  57. try:
  58. slider = driver.find_element(By.XPATH, '//span[@class="nc_iconfont btn_slide"]')
  59. slider_full = driver.find_element(By.XPATH, '//div[@class="scale_text slidetounlock"]')
  60. Common.logger(log_type).info('拖动滑块')
  61. time.sleep(1)
  62. """
  63. 解决特征识别的代码
  64. script = 'Object.defineProperty(navigator, "webdriver", {get: () => false,});'
  65. driver.execute_script(script)
  66. 如果不采取去除特征识别,即以下两行代码。则页面的滑块验证码在滑动后,会显示如下图的出错,从而阻止登录进行。
  67. 因为服务器识别到的selenium的特征。使用该两行代码更改了特征,即可以顺利通过识别。
  68. 一般是反爬虫机制,用selenium打开的浏览器,就算手动去滑动都不行。
  69. """
  70. script = 'Object.defineProperty(navigator, "webdriver", {get: () => false,});'
  71. driver.execute_script(script)
  72. ActionChains(driver).drag_and_drop_by_offset(
  73. slider, slider_full.size['width'], -slider.size['height']).perform()
  74. except NoSuchElementException:
  75. Common.logger(log_type).info('没有滑块')
  76. pass
  77. # 登录成功,获取到头像
  78. time.sleep(3)
  79. try:
  80. driver.find_element(By.XPATH, '//img[@class="_J1BGEmMJ"]')
  81. Common.logger(log_type).info('登录成功\n')
  82. except NoSuchElementException:
  83. Common.logger(log_type).info('登录失败,重新登录\n')
  84. driver.quit()
  85. cls.login(log_type, env)
  86. # 获取热门内容
  87. cls.get_recenhot(log_type, driver, env)
  88. Common.logger(log_type).info('新视-热门内容抓取完毕\n')
  89. # 获取十万推荐内容
  90. cls.get_hundredthousand(log_type, driver, env)
  91. Common.logger(log_type).info('新视-十万推荐内容抓取完毕\n')
  92. time.sleep(5)
  93. Common.logger(log_type).info('退出浏览器\n')
  94. driver.close()
  95. driver.quit()
  96. # except Exception as e:
  97. # Common.logger(log_type).error('XinshiPC异常,重启浏览器:{}\n', e)
  98. # cls.login(log_type, env)
  99. @classmethod
  100. def get_recenhot(cls, log_type, driver, env):
  101. time.sleep(3)
  102. for i in range(1, 21):
  103. video_title = driver.find_element(
  104. By.XPATH, '//div[@class="_6gxA3h-x"]/*[' + str(i) + ']//div[@class="_hsgIoLGN"]'
  105. ).get_attribute('title').strip().replace('"', '') \
  106. .replace('“', '').replace('“', '…').replace("\n", "") \
  107. .replace("/", "").replace("\r", "").replace("#", "") \
  108. .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
  109. .replace(":", "").replace("*", "").replace("?", "") \
  110. .replace("?", "").replace('"', "").replace("<", "") \
  111. .replace(">", "").replace("|", "").replace(" ", "")
  112. user_name = driver.find_element(
  113. By.XPATH, '//div[@class="_6gxA3h-x"]/*[' + str(i) + ']//div[@class="_zoylmQ8m"]'
  114. ).get_attribute('title').replace('\n', '')
  115. Common.logger(log_type).info(video_title)
  116. Common.logger(log_type).info(user_name)
  117. if video_title == '':
  118. Common.logger(log_type).info('无标题\n')
  119. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'c77cf9') for x in y]:
  120. Common.logger(log_type).info('视频已下载\n')
  121. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'WAG7Dq') for x in y]:
  122. Common.logger(log_type).info('视频已下载\n')
  123. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', '0i4jmV') for x in y]:
  124. Common.logger(log_type).info('视频已下载\n')
  125. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'gO4Sn4') for x in y]:
  126. Common.logger(log_type).info('视频已存在\n')
  127. else:
  128. Feishu.insert_columns(log_type, 'shipinhao', 'gO4Sn4', 'ROWS', 1, 2)
  129. # 看一看云文档,工作表中写入数据
  130. values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(time.time()))),
  131. '新视-热门榜',
  132. video_title,
  133. user_name]]
  134. time.sleep(1)
  135. Feishu.update_values(log_type, 'shipinhao', 'gO4Sn4', 'A2:Z2', values)
  136. Common.logger(log_type).info('视频信息写入飞书成功\n')
  137. XinshiAPP.start_wechat(log_type, env)
  138. @classmethod
  139. def get_hundredthousand(cls, log_type, driver, env):
  140. time.sleep(3)
  141. Common.logger(log_type).info('点击"十万推荐"')
  142. driver.find_element(By.XPATH, '//div[@class="ant-tabs-nav-list"]/*[2]').click()
  143. time.sleep(3)
  144. Common.logger(log_type).info('滚动到页面底部')
  145. for i in range(5):
  146. Common.logger(log_type).info('向上滑动页面')
  147. driver.execute_script("window.scrollBy(0, 3000)")
  148. time.sleep(1)
  149. time.sleep(5)
  150. for i in range(1, 51):
  151. Common.logger(log_type).info('开始抓取第{}条', i)
  152. video_title = driver.find_element(
  153. By.XPATH, '//div[@class="_tCg-GF3J"]/*['+str(i)+']//div[@class="_EmoRHgxz"]'
  154. ).get_attribute('title').strip().replace('"', '') \
  155. .replace('“', '').replace('“', '…').replace("\n", "") \
  156. .replace("/", "").replace("\r", "").replace("#", "") \
  157. .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
  158. .replace(":", "").replace("*", "").replace("?", "") \
  159. .replace("?", "").replace('"', "").replace("<", "") \
  160. .replace(">", "").replace("|", "").replace(" ", "")
  161. user_name = driver.find_element(
  162. By.XPATH, '//div[@class="_tCg-GF3J"]/*['+str(i)+']//div[@class="_gD23uy8R"]'
  163. ).get_attribute('title').replace('\n', '')
  164. Common.logger(log_type).info(video_title)
  165. Common.logger(log_type).info(user_name)
  166. if video_title == '':
  167. Common.logger(log_type).info('无标题\n')
  168. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'c77cf9') for x in y]:
  169. Common.logger(log_type).info('视频已下载\n')
  170. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'WAG7Dq') for x in y]:
  171. Common.logger(log_type).info('视频已下载\n')
  172. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', '0i4jmV') for x in y]:
  173. Common.logger(log_type).info('视频已下载\n')
  174. elif video_title in [x for y in Feishu.get_values_batch(log_type, 'shipinhao', 'aOjaIU') for x in y]:
  175. Common.logger(log_type).info('视频已存在\n')
  176. else:
  177. Feishu.insert_columns(log_type, 'shipinhao', 'aOjaIU', 'ROWS', 1, 2)
  178. # 看一看云文档,工作表中写入数据
  179. values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(time.time()))),
  180. '新视-推荐榜',
  181. video_title,
  182. user_name]]
  183. time.sleep(1)
  184. Feishu.update_values(log_type, 'shipinhao', 'aOjaIU', 'A2:Z2', values)
  185. Common.logger(log_type).info('视频信息写入飞书成功\n')
  186. XinshiAPP.start_wechat(log_type, env)
  187. if __name__ == '__main__':
  188. XinshiPC.login('xinshi', 'dev')
  189. pass