|
@@ -187,8 +187,8 @@ class ShipinhaoSearch:
|
|
|
# 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
|
|
|
"showChromedriverLog": True,
|
|
|
# "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
|
|
|
- # "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
|
|
|
- "chromeOptions": {"androidProcess": "com.tencent.mm:toolsmp"},
|
|
|
+ "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
|
|
|
+ # "chromeOptions": {"androidProcess": "com.tencent.mm:toolsmp"},
|
|
|
# "chromeOptions": {"androidProcess": "com.tencent.mm"},
|
|
|
'enableWebviewDetailsCollection': True,
|
|
|
'setWebContentsDebuggingEnabled': True,
|
|
@@ -391,7 +391,7 @@ class ShipinhaoSearch:
|
|
|
driver.switch_to.context('NATIVE_APP')
|
|
|
|
|
|
# 点赞
|
|
|
- like_id = driver.find_element(By.ID, 'com.tencent.mm:id/k04')
|
|
|
+ like_id = driver.find_element(By.ID, 'com.tencent.mm:id/k04') # 微信版本 8.0.30
|
|
|
like_cnt = like_id.get_attribute('name')
|
|
|
if like_cnt == "" or like_cnt == "喜欢" or like_cnt == "火" or cls.is_contain_chinese(like_cnt) is True:
|
|
|
like_cnt = 0
|
|
@@ -549,16 +549,16 @@ class ShipinhaoSearch:
|
|
|
driver.implicitly_wait(10)
|
|
|
Common.logger(log_type, crawler).info("点击搜索框")
|
|
|
Common.logging(log_type, crawler, env, "点击搜索框")
|
|
|
- # driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click() # 微信8.0.30版本
|
|
|
- driver.find_element(By.ID, 'com.tencent.mm:id/he6').click() # 微信8.0.16版本
|
|
|
+ driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click() # 微信8.0.30版本
|
|
|
+ # driver.find_element(By.ID, 'com.tencent.mm:id/he6').click() # 微信8.0.16版本
|
|
|
time.sleep(0.5)
|
|
|
- # driver.find_element(By.ID, 'com.tencent.mm:id/cd7').clear().send_keys(word) # 微信8.0.30版本
|
|
|
- driver.find_element(By.ID, 'com.tencent.mm:id/bxz').clear().send_keys(word) # 微信8.0.16版本
|
|
|
+ driver.find_element(By.ID, 'com.tencent.mm:id/cd7').clear().send_keys(word) # 微信8.0.30版本
|
|
|
+ # driver.find_element(By.ID, 'com.tencent.mm:id/bxz').clear().send_keys(word) # 微信8.0.16版本
|
|
|
driver.press_keycode(AndroidKey.ENTER)
|
|
|
Common.logger(log_type, crawler).info("进入搜索词页面")
|
|
|
Common.logging(log_type, crawler, env, "进入搜索词页面")
|
|
|
- # driver.find_elements(By.ID, 'com.tencent.mm:id/br8')[0].click() # 微信8.0.30版本
|
|
|
- driver.find_elements(By.ID, 'com.tencent.mm:id/jkg')[0].click() # 微信8.0.16版本
|
|
|
+ driver.find_elements(By.ID, 'com.tencent.mm:id/br8')[0].click() # 微信8.0.30版本
|
|
|
+ # driver.find_elements(By.ID, 'com.tencent.mm:id/jkg')[0].click() # 微信8.0.16版本
|
|
|
time.sleep(5)
|
|
|
|
|
|
# 切换到微信搜索结果页 webview
|
|
@@ -598,93 +598,93 @@ class ShipinhaoSearch:
|
|
|
return
|
|
|
|
|
|
for i, video_element in enumerate(video_element_temp):
|
|
|
- # try:
|
|
|
- Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
|
|
|
- Common.logging(log_type, crawler, env, f"download_cnt:{cls.download_cnt}")
|
|
|
- if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
|
|
|
- Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
|
|
|
- Common.logging(log_type, crawler, env, f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
|
|
|
- cls.download_cnt = 0
|
|
|
- return
|
|
|
-
|
|
|
- if video_element is None:
|
|
|
- Common.logger(log_type, crawler).info('到底啦~\n')
|
|
|
- Common.logging(log_type, crawler, env, '到底啦~\n')
|
|
|
- return
|
|
|
-
|
|
|
- cls.i += 1
|
|
|
- cls.search_elements(driver, '//*[@class="rich-media active__absolute"]')
|
|
|
-
|
|
|
- Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
|
|
|
- Common.logging(log_type, crawler, env, f'拖动"视频"列表第{cls.i}个至屏幕中间')
|
|
|
- time.sleep(3)
|
|
|
- driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
|
|
|
- video_element)
|
|
|
- if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
|
|
|
- Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
|
|
|
- Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
|
|
|
- return
|
|
|
- video_title = video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index + i].text[:40]
|
|
|
- video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
|
|
|
- cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
|
|
|
- cover_url = cover_url.split('url("')[-1].split('")')[0]
|
|
|
- duration = video_element.find_elements(By.XPATH, '//div[@class="video-player-mask__text"]')[index+i].text
|
|
|
- duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
|
|
|
- user_name = video_element.find_elements(By.XPATH, '//div[@class="rich-media__source__title"]')[index+i].text
|
|
|
- avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[index+i].get_attribute('style')
|
|
|
- avatar_url = avatar_url.split('url("')[-1].split('")')[0]
|
|
|
- out_video_id = md5(video_title.encode('utf8')).hexdigest()
|
|
|
- out_user_id = md5(user_name.encode('utf8')).hexdigest()
|
|
|
-
|
|
|
- video_dict = {
|
|
|
- "video_title": video_title,
|
|
|
- "video_id": out_video_id,
|
|
|
- "play_cnt": 0,
|
|
|
- "duration": duration,
|
|
|
- "user_name": user_name,
|
|
|
- "user_id": out_user_id,
|
|
|
- "avatar_url": avatar_url,
|
|
|
- "cover_url": cover_url,
|
|
|
- "video_url": video_url,
|
|
|
- "session": f"shipinhao-search-{int(time.time())}"
|
|
|
- }
|
|
|
- for k, v in video_dict.items():
|
|
|
- Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- Common.logging(log_type, crawler, env, f"{video_dict}")
|
|
|
- if video_title is None or video_url is None:
|
|
|
- Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
- Common.logging(log_type, crawler, env, "无效视频\n")
|
|
|
- elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
|
|
|
- Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
- Common.logging(log_type, crawler, env, '视频已下载\n')
|
|
|
- elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
|
|
|
- Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
- Common.logging(log_type, crawler, env, '视频已下载\n')
|
|
|
- else:
|
|
|
- video_element.click()
|
|
|
+ try:
|
|
|
+ Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
|
|
|
+ Common.logging(log_type, crawler, env, f"download_cnt:{cls.download_cnt}")
|
|
|
+ if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
|
|
|
+ Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
|
|
|
+ Common.logging(log_type, crawler, env, f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
|
|
|
+ cls.download_cnt = 0
|
|
|
+ return
|
|
|
+
|
|
|
+ if video_element is None:
|
|
|
+ Common.logger(log_type, crawler).info('到底啦~\n')
|
|
|
+ Common.logging(log_type, crawler, env, '到底啦~\n')
|
|
|
+ return
|
|
|
+
|
|
|
+ cls.i += 1
|
|
|
+ cls.search_elements(driver, '//*[@class="rich-media active__absolute"]')
|
|
|
+
|
|
|
+ Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
|
|
|
+ Common.logging(log_type, crawler, env, f'拖动"视频"列表第{cls.i}个至屏幕中间')
|
|
|
time.sleep(3)
|
|
|
- video_info_dict = cls.get_video_info(driver)
|
|
|
- video_dict["like_cnt"] = video_info_dict["like_cnt"]
|
|
|
- video_dict["share_cnt"] = video_info_dict["share_cnt"]
|
|
|
- video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
|
|
|
- video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
|
|
|
- video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
|
|
|
- video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
|
|
|
- Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
|
|
|
- Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
|
|
|
- if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
|
|
|
- Common.logger(log_type, crawler).info("不满足抓取规则\n")
|
|
|
- Common.logging(log_type, crawler, env, "不满足抓取规则\n")
|
|
|
+ driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
|
|
|
+ video_element)
|
|
|
+ if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
|
|
|
+ Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
|
|
|
+ Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
|
|
|
+ return
|
|
|
+ video_title = video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index + i].text[:40]
|
|
|
+ video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
|
|
|
+ cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
|
|
|
+ cover_url = cover_url.split('url("')[-1].split('")')[0]
|
|
|
+ duration = video_element.find_elements(By.XPATH, '//div[@class="video-player-mask__text"]')[index+i].text
|
|
|
+ duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
|
|
|
+ user_name = video_element.find_elements(By.XPATH, '//div[@class="rich-media__source__title"]')[index+i].text
|
|
|
+ avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[index+i].get_attribute('style')
|
|
|
+ avatar_url = avatar_url.split('url("')[-1].split('")')[0]
|
|
|
+ out_video_id = md5(video_title.encode('utf8')).hexdigest()
|
|
|
+ out_user_id = md5(user_name.encode('utf8')).hexdigest()
|
|
|
+
|
|
|
+ video_dict = {
|
|
|
+ "video_title": video_title,
|
|
|
+ "video_id": out_video_id,
|
|
|
+ "play_cnt": 0,
|
|
|
+ "duration": duration,
|
|
|
+ "user_name": user_name,
|
|
|
+ "user_id": out_user_id,
|
|
|
+ "avatar_url": avatar_url,
|
|
|
+ "cover_url": cover_url,
|
|
|
+ "video_url": video_url,
|
|
|
+ "session": f"shipinhao-search-{int(time.time())}"
|
|
|
+ }
|
|
|
+ for k, v in video_dict.items():
|
|
|
+ Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+ Common.logging(log_type, crawler, env, f"{video_dict}")
|
|
|
+ if video_title is None or video_url is None:
|
|
|
+ Common.logger(log_type, crawler).info("无效视频\n")
|
|
|
+ Common.logging(log_type, crawler, env, "无效视频\n")
|
|
|
+ elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
|
|
|
+ Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
+ Common.logging(log_type, crawler, env, '视频已下载\n')
|
|
|
+ elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
|
|
|
+ Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
+ Common.logging(log_type, crawler, env, '视频已下载\n')
|
|
|
else:
|
|
|
- cls.download_publish(log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- word=word,
|
|
|
- video_dict=video_dict,
|
|
|
- our_uid=our_uid,
|
|
|
- env=env)
|
|
|
- # except Exception as e:
|
|
|
- # Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
|
|
|
- # Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
|
|
|
+ video_element.click()
|
|
|
+ time.sleep(3)
|
|
|
+ video_info_dict = cls.get_video_info(driver)
|
|
|
+ video_dict["like_cnt"] = video_info_dict["like_cnt"]
|
|
|
+ video_dict["share_cnt"] = video_info_dict["share_cnt"]
|
|
|
+ video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
|
|
|
+ video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
|
|
|
+ video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
|
|
|
+ video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
|
|
|
+ Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
|
|
|
+ Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
|
|
|
+ if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
|
|
|
+ Common.logger(log_type, crawler).info("不满足抓取规则\n")
|
|
|
+ Common.logging(log_type, crawler, env, "不满足抓取规则\n")
|
|
|
+ else:
|
|
|
+ cls.download_publish(log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ word=word,
|
|
|
+ video_dict=video_dict,
|
|
|
+ our_uid=our_uid,
|
|
|
+ env=env)
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
|
|
|
+ Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
|
|
|
|
|
|
Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
|
|
|
Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠1秒\n')
|
|
@@ -695,22 +695,22 @@ class ShipinhaoSearch:
|
|
|
def get_search_videos(cls, log_type, crawler, env):
|
|
|
user_list = cls.get_users(log_type, crawler, "wNgi6Z", env)
|
|
|
for user in user_list:
|
|
|
- # try:
|
|
|
- cls.i = 0
|
|
|
- cls.download_cnt = 0
|
|
|
- search_word = user["search_word"]
|
|
|
- our_uid = user["our_uid"]
|
|
|
- Common.logger(log_type, crawler).info(f"开始抓取:{search_word}")
|
|
|
- Common.logging(log_type, crawler, env, f"开始抓取:{search_word}")
|
|
|
-
|
|
|
- cls.start_wechat(log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- word=search_word,
|
|
|
- our_uid=our_uid,
|
|
|
- env=env)
|
|
|
- # except Exception as e:
|
|
|
- # Common.logger(log_type, crawler).error(f"抓取{user['search_word']}时异常:{e}\n")
|
|
|
- # Common.logging(log_type, crawler, env, f"抓取{user['search_word']}时异常:{e}\n")
|
|
|
+ try:
|
|
|
+ cls.i = 0
|
|
|
+ cls.download_cnt = 0
|
|
|
+ search_word = user["search_word"]
|
|
|
+ our_uid = user["our_uid"]
|
|
|
+ Common.logger(log_type, crawler).info(f"开始抓取:{search_word}")
|
|
|
+ Common.logging(log_type, crawler, env, f"开始抓取:{search_word}")
|
|
|
+
|
|
|
+ cls.start_wechat(log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ word=search_word,
|
|
|
+ our_uid=our_uid,
|
|
|
+ env=env)
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f"抓取{user['search_word']}时异常:{e}\n")
|
|
|
+ Common.logging(log_type, crawler, env, f"抓取{user['search_word']}时异常:{e}\n")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|