# xiaoniangao_zhanghao.py
# -*- coding: utf-8 -*-
import json
import os
import random
import subprocess
import sys
import time
import uuid
from datetime import datetime, timedelta

import requests
from appium import webdriver
from appium.webdriver.common.touch_action import TouchAction
from appium.webdriver.extensions.android.nativekey import AndroidKey
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# The project package is resolved relative to the directory the script is launched from.
sys.path.append(os.getcwd())
from application.common import MysqlHelper, Feishu


  19. class XiaoNianGaoZH(object):
  20. """
  21. 小年糕+线下爬虫
  22. """
  23. def __init__(self):
  24. self.count = 0
  25. self.swipe_count = 0
  26. chromedriverExecutable = "/Users/a123456/Downloads/chromedriver-mac-x64/chromedriver"
  27. print("启动微信")
  28. # 微信的配置文件
  29. caps = {
  30. "platformName": "Android",
  31. "devicesName": "Android",
  32. "appPackage": "com.tencent.mm",
  33. "appActivity": ".ui.LauncherUI",
  34. "autoGrantPermissions": True,
  35. "noReset": True,
  36. "resetkeyboard": True,
  37. "unicodekeyboard": True,
  38. "showChromedriverLog": True,
  39. "printPageSourceOnFailure": True,
  40. "recreateChromeDriverSessions": True,
  41. "enableWebviewDetailsCollection": True,
  42. "setWebContentsDebuggingEnabled": True,
  43. "newCommandTimeout": 6000,
  44. "automationName": "UiAutomator2",
  45. "chromedriverExecutable": chromedriverExecutable,
  46. "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
  47. }
  48. try:
  49. self.driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
  50. except Exception as e:
  51. print(e)
  52. return
  53. self.driver.implicitly_wait(30)
  54. for i in range(10):
  55. try:
  56. if self.driver.find_elements(By.ID, "com.tencent.mm:id/f2s"):
  57. print("启动微信成功")
  58. break
  59. elif self.driver.find_element(
  60. By.ID, "com.android.systemui:id/dismiss_view"
  61. ):
  62. print("发现并关闭系统下拉菜单")
  63. size = self.driver.get_window_size()
  64. self.driver.swipe(
  65. int(size["width"] * 0.5),
  66. int(size["height"] * 0.8),
  67. int(size["width"] * 0.5),
  68. int(size["height"] * 0.2),
  69. 200,
  70. )
  71. else:
  72. pass
  73. except Exception as e:
  74. print(f"打开微信异常:{e}")
  75. time.sleep(1)
  76. size = self.driver.get_window_size()
  77. self.driver.swipe(
  78. int(size["width"] * 0.5),
  79. int(size["height"] * 0.2),
  80. int(size["width"] * 0.5),
  81. int(size["height"] * 0.8),
  82. 200,
  83. )
  84. time.sleep(1)
  85. command = 'adb shell service call statusbar 2'
  86. process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
  87. process.communicate()
  88. self.driver.find_elements(By.XPATH, '//*[@text="小年糕+"]')[-1].click()
  89. print("打开小程序小年糕+成功")
  90. time.sleep(5)
  91. self.get_videoList()
  92. time.sleep(1)
  93. self.driver.quit()
  94. def save_pq_uid(self, uid):
  95. current_time = datetime.now()
  96. time_after_10_minutes = current_time + timedelta(minutes=5)
  97. # 获取时间戳
  98. timestamp_seconds = time_after_10_minutes.timestamp()
  99. timestamp_milliseconds = int(timestamp_seconds * 1000)
  100. file_path = '/Users/tzld/Desktop/automatic_crawler/spider/crawler_offline/xng_zh.txt'
  101. with open(file_path, 'r') as file:
  102. content = file.read()
  103. if content and content[-1] != ',':
  104. uid = ',' + str(uid)
  105. with open(file_path, 'a') as file:
  106. file.write(uid)
  107. with open(file_path, 'r') as file:
  108. content = file.read()
  109. url = "https://admin.piaoquantv.com/manager/crawler/v3/task/save"
  110. payload = {
  111. "taskName": "小年糕账号",
  112. "source": "xiaoniangao",
  113. "mode": "author",
  114. "modeValue": "0",
  115. "modeBoard": "0",
  116. "spiderName": "run_xng_author",
  117. "startTime": timestamp_milliseconds,
  118. "interval": 4800,
  119. "uid": str(content),
  120. "machine": "aliyun",
  121. "rule": [{"period": {"min": 15, "max": 3}},
  122. {"duration": {"min": 50, "max": 0}},
  123. {"share_cnt": {"min": 2, "max": 0}},
  124. {"videos_cnt": {"min": 300, "max": 0}}],
  125. "id": 21
  126. }
  127. headers = {
  128. 'accept': 'application/json',
  129. 'content-type': 'application/json;',
  130. 'cookie': 'SESSION=ZmYwMzBmOWItM2M5YS00ZGMyLTk3MjctMzE0YzE4MmUxNThh',
  131. 'origin': 'https://admin.piaoquantv.com',
  132. 'pragma': 'no-cache',
  133. 'priority': 'u=1, i',
  134. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
  135. }
  136. requests.request("POST", url, headers=headers, json=payload)
  137. def search_elements(self, xpath):
  138. time.sleep(1)
  139. windowHandles = self.driver.window_handles
  140. for handle in windowHandles:
  141. self.driver.switch_to.window(handle)
  142. time.sleep(1)
  143. try:
  144. elements = self.driver.find_elements(By.XPATH, xpath)
  145. if elements:
  146. return elements
  147. except NoSuchElementException:
  148. pass
  149. def check_to_applet(self, xpath):
  150. time.sleep(1)
  151. webViews = self.driver.contexts
  152. self.driver.switch_to.context(webViews[-1])
  153. windowHandles = self.driver.window_handles
  154. for handle in windowHandles:
  155. self.driver.switch_to.window(handle)
  156. time.sleep(1)
  157. try:
  158. self.driver.find_element(By.XPATH, xpath)
  159. print("切换到WebView成功\n")
  160. return
  161. except NoSuchElementException:
  162. time.sleep(1)
  163. def swipe_up(self):
  164. self.search_elements('//*[@class="list-list--list"]')
  165. size = self.driver.get_window_size()
  166. action = TouchAction(self.driver)
  167. action.press(x=int(size["width"] * 0.5), y=int(size["height"] * 0.85))
  168. action.wait(ms=1300) # 可以调整等待时间
  169. action.move_to(x=int(size["width"] * 0.5), y=int(size["height"] * 0.2))
  170. action.release()
  171. action.perform()
  172. self.swipe_count += 1
  173. def get_video_url(self, video_title_element):
  174. for i in range(3):
  175. self.search_elements('//*[@class="list-list--list"]')
  176. time.sleep(1)
  177. self.driver.execute_script(
  178. "arguments[0].scrollIntoView({block:'center',inline:'center'});",
  179. video_title_element[0],
  180. )
  181. time.sleep(3)
  182. video_title_element[0].click()
  183. self.check_to_applet(
  184. xpath=r'//wx-video[@class="dynamic-index--video-item dynamic-index--video"]'
  185. )
  186. time.sleep(10)
  187. video_url_elements = self.search_elements(
  188. '//wx-video[@class="dynamic-index--video-item dynamic-index--video"]'
  189. )
  190. return video_url_elements[0].get_attribute("src")
  191. def parse_detail(self, index):
  192. page_source = self.driver.page_source
  193. soup = BeautifulSoup(page_source, "html.parser")
  194. soup.prettify()
  195. video_list = soup.findAll(
  196. name="wx-view", attrs={"class": "expose--adapt-parent"}
  197. )
  198. index = index + 1
  199. element_list = [i for i in video_list][index:]
  200. return element_list[0]
  201. def get_video_info_2(self, video_element):
  202. self.count += 1
  203. video_title = video_element.find("wx-view", class_="dynamic--title").text
  204. # 头像 URL
  205. avatar_url = video_element.find("wx-image", class_="avatar--avatar")["src"]
  206. # 用户名称
  207. user_name = video_element.find("wx-view", class_="dynamic--nick-top").text
  208. name_url = self.select_name_url(avatar_url, user_name)
  209. if name_url:
  210. video_title_element = self.search_elements(f'//*[contains(text(), "{video_title}")]')
  211. if video_title_element is None:
  212. return
  213. self.get_video_url(video_title_element)
  214. video_mid_elements = self.search_elements("//wx-view[@class='bar--navBar-content-capsule-wrap']")
  215. mid = int(video_mid_elements[0].get_attribute("data-mid"))
  216. self.driver.press_keycode(AndroidKey.BACK)
  217. time.sleep(5)
  218. uid = self.select_id(mid)
  219. if uid:
  220. self.update_name_url(mid, avatar_url, user_name)
  221. else:
  222. time.sleep(1)
  223. link = self.select_id_status(mid)
  224. if link:
  225. current_time = datetime.now()
  226. formatted_time = current_time.strftime("%Y%m%d")
  227. date_int = int(formatted_time)
  228. # 获取时间标签
  229. tag_id = self.get_tag_id(date_int)
  230. time.sleep(5)
  231. print(tag_id)
  232. # 新增账号
  233. pq_uid = self.insert_number(mid, tag_id)
  234. time.sleep(5)
  235. if pq_uid:
  236. self.insert_name_url(mid, avatar_url, user_name)
  237. time.sleep(2)
  238. self.save_pq_uid(pq_uid)
  239. # 获取当前时间
  240. current_time = datetime.now()
  241. formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
  242. values = [[
  243. str(mid),
  244. user_name,
  245. avatar_url,
  246. str(pq_uid),
  247. formatted_time,
  248. ]]
  249. Feishu.insert_columns('xiaoniangao', 'xiaoniangao', "8zlceR", "ROWS", 1, 2)
  250. time.sleep(0.5)
  251. Feishu.update_values('xiaoniangao', 'xiaoniangao', "8zlceR", "A2:Z2", values)
  252. print("写入飞书表格成功")
  253. def get_video_info(self, video_element):
  254. try:
  255. self.get_video_info_2(video_element)
  256. except Exception as e:
  257. print(f"抓取单条视频异常:{e}\n")
  258. def get_videoList(self):
  259. """
  260. 获取视频列表
  261. :return:
  262. """
  263. # while True:
  264. self.driver.implicitly_wait(20)
  265. # 切换到 web_view
  266. self.check_to_applet(xpath='//*[@class="tab-bar--tab tab-bar--tab-selected"]')
  267. print("切换到 webview 成功")
  268. time.sleep(1)
  269. if self.search_elements('//*[@class="list-list--list"]') is None:
  270. print("窗口已销毁")
  271. self.count = 0
  272. self.download_cnt = 0
  273. self.element_list = []
  274. return
  275. print("开始获取视频信息")
  276. for i in range(50):
  277. print("下滑{}次".format(i))
  278. element = self.parse_detail(i)
  279. self.get_video_info(element)
  280. self.swipe_up()
  281. time.sleep(random.randint(1, 5))
  282. def insert_number(self, mid, tag_id):
  283. for i in range(3):
  284. url = "https://admin.piaoquantv.com/manager/crawler/v3/user/save"
  285. payload = {
  286. "source": "xiaoniangao",
  287. "mode": "author",
  288. "modeValue": "",
  289. "modeBoard": "",
  290. "recomStatus": -6,
  291. "appRecomStatus": -6,
  292. "autoAuditStatus": 0,
  293. "tag": f"459,454,106,8240,{int(tag_id)}",
  294. "contentCategory": 0,
  295. "link": str(mid)
  296. }
  297. headers = {
  298. 'content-length': '0',
  299. 'cookie': 'SESSION=MWM4YzVlMTctNzdkNC00NjE3LWIxZTctOGQwYzgzYmVmN2Qw',
  300. 'origin': 'https://admin.piaoquantv.com',
  301. 'priority': 'u=1, i',
  302. 'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
  303. 'sec-ch-ua-mobile': '?0',
  304. 'sec-ch-ua-platform': '"macOS"'
  305. }
  306. response = requests.request("POST", url, headers=headers, json=payload)
  307. response = response.json()
  308. code = response["code"]
  309. if code == 0:
  310. print("添加账号成功")
  311. time.sleep(1)
  312. url = "https://admin.piaoquantv.com/manager/crawler/v3/user/list"
  313. payload = {
  314. "pageNum": 1,
  315. "pageSize": 20
  316. }
  317. response = requests.request("POST", url, headers=headers, json=payload)
  318. response = response.json()
  319. list = response["content"]['list']
  320. link = list[0]["link"]
  321. if link == str(mid):
  322. print("获取站内账号ID成功")
  323. return list[0]["uid"]
  324. """
  325. 查询用户名+头像是否存在
  326. """
  327. def select_name_url(self, avatar_url, user_name):
  328. sql = f""" select uid from xng_uid where avatar_url = "{avatar_url}" and user_name="{user_name}"; """
  329. db = MysqlHelper()
  330. repeat_video = db.select(sql=sql)
  331. if repeat_video:
  332. return False
  333. return True
  334. def get_tag_id(self, date_int):
  335. for i in range(3):
  336. url = f"https://admin.piaoquantv.com/manager/user/up/searchUserTypeTag?keyword={date_int}&muid=7"
  337. payload = {}
  338. headers = {
  339. 'content-length': '0',
  340. 'cookie': 'SESSION=MWM4YzVlMTctNzdkNC00NjE3LWIxZTctOGQwYzgzYmVmN2Qw',
  341. 'origin': 'https://admin.piaoquantv.com',
  342. 'priority': 'u=1, i',
  343. 'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
  344. 'sec-ch-ua-mobile': '?0',
  345. 'sec-ch-ua-platform': '"macOS"'
  346. }
  347. response = requests.request("POST", url, headers=headers, data=payload)
  348. response = response.json()
  349. content = response["content"]
  350. if content:
  351. tagId = content[0]['tagId']
  352. return tagId
  353. else:
  354. url = f"https://admin.piaoquantv.com/manager/user/up/createUserTypeTag?tagName={date_int}&muid=7"
  355. response = requests.request("POST", url, headers=headers, data=payload)
  356. response = response.json()
  357. content = response["content"]
  358. if content:
  359. tagId = content['tagId']
  360. return tagId
  361. """
  362. 修改用户名+头像
  363. """
  364. def update_name_url(self, mid, avatar_url, user_name):
  365. sql = f""" update xng_uid set avatar_url = "{avatar_url}", user_name="{user_name}" where uid = "{mid}"; """
  366. db = MysqlHelper()
  367. repeat_video = db.update(sql=sql)
  368. if repeat_video:
  369. return True
  370. return False
  371. """
  372. 插入 用户名 头像 用户id
  373. """
  374. def insert_name_url(self, uid, avatar_url, user_name):
  375. current_time = datetime.now()
  376. formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
  377. insert_sql = f"""INSERT INTO xng_uid (uid, avatar_url, user_name, data_time) values ('{uid}' ,'{avatar_url}','{user_name}', '{formatted_time}')"""
  378. db = MysqlHelper()
  379. repeat_video = db.update(sql=insert_sql)
  380. if repeat_video:
  381. return True
  382. return False
  383. """
  384. 查询用户id是否存在
  385. """
  386. def select_id(self, uid):
  387. sql = f""" select uid from xng_uid where uid = "{uid}"; """
  388. db = MysqlHelper()
  389. repeat_video = db.select(sql=sql)
  390. if repeat_video:
  391. return True
  392. return False
  393. """
  394. 查询用户id是否之前已添加过
  395. """
  396. def select_id_status(self, uid):
  397. sql = f""" select uid from crawler_user_v3 where link = "{uid}"; """
  398. db = MysqlHelper()
  399. repeat_video = db.select(sql=sql)
  400. if repeat_video:
  401. return False
  402. return True
  403. if __name__ == "__main__":
  404. XiaoNianGaoZH()