123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270 |
- # -*- coding: utf-8 -*-
- # @Author: wangkun
- # @Time: 2023/7/26
- import json
- import os
- import sys
- import time
- from appium import webdriver
- from selenium.common import NoSuchElementException
- from appium.webdriver.webdriver import WebDriver
- from hashlib import md5
- from selenium.webdriver.common.by import By
- sys.path.append(os.getcwd())
- from common.mq import MQ
- from common.public import similarity
- from common.common import Common
- from shipinhao.shipinhao_recommend.shipinhao_recommend import ShipinhaoRecommend
class RecommendH5:
    """Find WeChat Channels videos through WeChat's in-app search page and publish matches.

    The class drives the WeChat Android app with Appium: it types a video
    title into the WeChat search box, switches into the search-result
    webview, reads the listed video cards and, when a card matches the wanted
    title and author, completes ``video_dict`` and sends it to the ETL topic
    through ``MQ``.
    """

    platform = "视频号"

    @classmethod
    def _log(cls, log_type, crawler, env, message, level="info"):
        """Write ``message`` to both log sinks used by this crawler.

        Args:
            log_type: Log/strategy name forwarded to ``Common``.
            crawler: Crawler name forwarded to ``Common``.
            env: Environment name forwarded to ``Common.logging``.
            message: Text to log.
            level: Name of the logger method to call on ``Common.logger``
                (``"info"`` or ``"warning"``).
        """
        getattr(Common.logger(log_type, crawler), level)(message)
        Common.logging(log_type, crawler, env, message)

    @classmethod
    def start_wechat(cls, log_type, crawler, env):
        """Open an Appium session that launches WeChat and return the driver.

        Args:
            log_type: Log/strategy name used for logging.
            crawler: Crawler name used for logging.
            env: ``"dev"`` selects the developer chromedriver path; any other
                value selects the other hard-coded path.

        Returns:
            The ``webdriver.Remote`` session, with a 10 second implicit wait
            set. The caller owns the session and must ``quit()`` it.
        """
        cls._log(log_type, crawler, env, '启动微信')
        if env == "dev":
            chromedriver_executable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver"
        else:
            chromedriver_executable = '/Users/lieyunye/Downloads/chromedriver/chromedriver_v111/chromedriver'
        caps = {
            "platformName": "Android",  # mobile operating system: Android / iOS
            "deviceName": "Android",  # connected device name (emulator or real device); arbitrary on Android
            # OS version of the device (Android 13).
            # NOTE(review): the standard capability is spelled "platformVersion";
            # this key is kept as-is to avoid changing session negotiation - confirm before fixing.
            "platforVersion": "13",
            "appPackage": "com.tencent.mm",  # package name of the app under test
            "appActivity": ".ui.LauncherUI",  # activity to launch
            # Let Appium grant base permissions automatically. Per the original
            # note this has no effect when noReset is True (Android only).
            "autoGrantPermissions": True,
            # NOTE(review): Appium documents these two as "unicodeKeyboard" /
            # "resetKeyboard"; spelling kept unchanged - confirm before fixing.
            "unicodekeyboard": True,  # use the built-in input method; set True when typing Chinese
            "resetkeyboard": True,  # restore the original input method when the run ends
            "noReset": True,  # do not reset the app
            # Kill the chromedriver session when switching to a non-chromedriver
            # context so it does not have to be killed by hand.
            "recreateChromeDriverSessions": True,
            "printPageSourceOnFailure": True,  # log the full page source when an element is not found
            "newCommandTimeout": 6000,  # original note: initial wait time
            # Automation engine. Appium, UiAutomator2, Selendroid and Espresso
            # target Android; XCUITest targets iOS.
            "automationName": "UiAutomator2",
            "showChromedriverLog": True,
            # Other WeChat processes that were tried for androidProcess:
            # "com.tencent.mm:appbrand0", "com.tencent.mm:toolsmp", "com.tencent.mm"
            "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
            'enableWebviewDetailsCollection': True,
            'setWebContentsDebuggingEnabled': True,
            'chromedriverExecutable': chromedriver_executable,
        }
        driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
        driver.implicitly_wait(10)
        time.sleep(5)
        return driver

    # Find elements
    @classmethod
    def search_elements(cls, driver: WebDriver, xpath):
        """Look for ``xpath`` in every window handle of the current context.

        Args:
            driver: Active driver, already switched to the wanted context.
            xpath: XPath expression to search for.

        Returns:
            The first non-empty list of matching elements, or ``None`` when no
            window handle contains a match.
        """
        time.sleep(1)
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            time.sleep(1)
            try:
                elements = driver.find_elements(By.XPATH, xpath)
            except NoSuchElementException:
                continue
            if elements:
                return elements
        return None

    # noinspection PyBroadException
    @classmethod
    def check_to_webview(cls, log_type, crawler, env, driver: WebDriver):
        """Switch to the second reported context and to the window showing the result page.

        Args:
            log_type: Log/strategy name used for logging.
            crawler: Crawler name used for logging.
            env: Environment name used for logging.
            driver: Active driver.

        Returns:
            ``"成功"`` once a window containing ``//div[@class="unit"]`` is
            selected; ``None`` when there is no second context or no window
            contains that element.
        """
        webviews = driver.contexts
        cls._log(log_type, crawler, env, f"webviews:{webviews}")
        if len(webviews) < 2:
            # Fewer than two contexts: there is nothing at index 1 to switch to.
            # Report failure instead of raising IndexError; the caller already
            # treats None as "switch failed".
            cls._log(log_type, crawler, env, "切换 webview 失败")
            return None
        driver.switch_to.context(webviews[1])
        cls._log(log_type, crawler, env, driver.current_context)
        time.sleep(1)
        for handle in driver.window_handles:
            try:
                driver.switch_to.window(handle)
                time.sleep(1)
                driver.find_element(By.XPATH, '//div[@class="unit"]')
            except Exception:
                cls._log(log_type, crawler, env, "切换 webview 失败")
                continue
            cls._log(log_type, crawler, env, '切换 webview 成功')
            return "成功"
        return None

    @staticmethod
    def _url_from_style(style):
        """Return the text between ``url("`` and ``")`` in an inline style string."""
        return style.split('url("')[-1].split('")')[0]

    @classmethod
    def _extract_h5_video(cls, video_element, index):
        """Read title, author and media URLs of the ``index``-th card on the page.

        NOTE(review): every XPath below starts with ``//``, so the lookups are
        not scoped to ``video_element``; the ``index``-th match is taken as the
        one belonging to the ``index``-th card.

        Returns:
            Dict with keys ``title`` (first 40 characters), ``user_name``,
            ``video_url``, ``cover_url`` and ``avatar_url``.
        """
        title = video_element.find_elements(
            By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index].text[:40]
        user_name = video_element.find_elements(
            By.XPATH, '//div[@class="rich-media__source__title"]')[index].text
        video_url = video_element.find_elements(
            By.XPATH, '//div[@class="video-player"]')[index].get_attribute('src')
        cover_style = video_element.find_elements(
            By.XPATH, '//div[@class="video-player__bd"]')[index].get_attribute('style')
        avatar_style = video_element.find_elements(
            By.XPATH,
            '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[index].get_attribute('style')
        return {
            "title": title,
            "user_name": user_name,
            "video_url": video_url,
            "cover_url": cls._url_from_style(cover_style),
            "avatar_url": cls._url_from_style(avatar_style),
        }

    @classmethod
    def search_video(cls, log_type, crawler, env, video_dict, rule_dict, our_uid):
        """Search WeChat for one video and send it to the ETL topic when found.

        A card matches when its title similarity to ``video_dict['video_title']``
        is at least 0.5 and its author similarity to ``video_dict['user_name']``
        is at least 1.0. On a match ``video_dict`` is updated in place and sent
        with ``MQ.send_msg``.

        The Appium session is always closed before returning, including when
        an exception propagates.

        Args:
            log_type: Log/strategy name; also stored as ``strategy``.
            crawler: Crawler name; also stored as ``platform``.
            env: Environment name; selects the MQ topic suffix.
            video_dict: Video record; must provide ``video_title``,
                ``user_name`` and ``publish_time_str``.
            rule_dict: Crawl rules, stored JSON-encoded as ``crawler_rule``.
            our_uid: Stored as ``user_id``.
        """
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        driver = cls.start_wechat(log_type, crawler, env)
        try:
            cls._search_video(log_type, crawler, env, driver, mq, video_dict, rule_dict, our_uid)
        finally:
            # Single cleanup point: previously most exit paths left the session open.
            driver.quit()

    @classmethod
    def _search_video(cls, log_type, crawler, env, driver, mq, video_dict, rule_dict, our_uid):
        """Run the search flow on an open session; the caller closes ``driver``."""
        # Tap the WeChat search box and type the search keyword.
        driver.implicitly_wait(10)
        cls._log(log_type, crawler, env, "点击搜索框")
        driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click()  # WeChat 8.0.30
        time.sleep(0.5)
        keyword = video_dict['video_title'].replace('"', "").replace('“', "").replace('”', "").replace('#', "")
        # clear() and send_keys() are issued separately so this does not depend
        # on clear() returning the element.
        search_box = driver.find_element(By.ID, 'com.tencent.mm:id/cd7')  # WeChat 8.0.30
        search_box.clear()
        search_box.send_keys(keyword)
        cls._log(log_type, crawler, env, "进入搜索词页面")
        driver.find_element(By.ID, 'com.tencent.mm:id/m94').click()  # WeChat 8.0.30

        # Switch to the webview of the WeChat search-result page.
        if cls.check_to_webview(log_type, crawler, env, driver) is None:
            cls._log(log_type, crawler, env, "切换到视频号 webview 失败\n")
            return
        time.sleep(1)

        # Switch to the "视频号" (Channels) category.
        shipinhao_tags = cls.search_elements(driver, '//div[@class="unit"]/*[2]')
        cls._log(log_type, crawler, env, '点击"视频号"分类')
        shipinhao_tags[0].click()
        time.sleep(5)

        # Up to three attempts to find the H5 result page, refreshing after each miss.
        h5_page = None
        for _ in range(3):
            h5_page = cls.search_elements(driver, '//*[@class="mixed-box__bd"]')
            if h5_page is not None:
                break
            cls._log(log_type, crawler, env, '未发现H5页面')
            driver.refresh()
        if h5_page is None:
            return

        cls._log(log_type, crawler, env, '获取视频列表\n')
        video_elements = cls.search_elements(driver, '//div[@class="rich-media active__absolute"]')
        if video_elements is None:
            cls._log(log_type, crawler, env, f'video_elements:{video_elements}', level="warning")
            return

        for i, video_element in enumerate(video_elements):
            # A failure on one card is logged and the next card is tried.
            try:
                if video_element is None:
                    cls._log(log_type, crawler, env, '到底啦~\n')
                    return
                cls._log(log_type, crawler, env, f'拖动"视频"列表第{i + 1}条至屏幕中间')
                time.sleep(3)
                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
                                      video_element)
                if video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']"):
                    cls._log(log_type, crawler, env, "没有更多的搜索结果\n")
                    return

                h5_video = cls._extract_h5_video(video_element, i)
                h5_video_title = h5_video["title"]
                h5_user_name = h5_video["user_name"]
                # The page exposes no ids here, so ids are derived from the text.
                h5_out_video_id = md5(h5_video_title.encode('utf8')).hexdigest()
                h5_out_user_id = md5(h5_user_name.encode('utf8')).hexdigest()
                title_similarity = similarity(video_dict['video_title'], h5_video_title)
                user_name_similarity = similarity(video_dict['user_name'], h5_user_name)

                if title_similarity >= 0.5 and user_name_similarity >= 1.0:
                    video_dict['cover_url'] = h5_video["cover_url"]
                    video_dict['avatar_url'] = h5_video["avatar_url"]
                    video_dict['out_video_id'] = h5_out_video_id
                    video_dict['video_url'] = h5_video["video_url"]
                    for k, v in video_dict.items():
                        Common.logger(log_type, crawler).info(f"{k}:{v}")
                    Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
                    video_dict["out_user_id"] = h5_out_user_id
                    video_dict["platform"] = crawler
                    video_dict["strategy"] = log_type
                    video_dict["strategyType"] = "recommend"
                    video_dict["width"] = 0
                    video_dict["height"] = 0
                    video_dict["crawler_rule"] = json.dumps(rule_dict)
                    video_dict["user_id"] = our_uid
                    # publish_time is replaced by the string form before sending.
                    video_dict["publish_time"] = video_dict["publish_time_str"]
                    mq.send_msg(video_dict)
                    cls._log(log_type, crawler, env, "已抓取到目标视频\n")
                    return

                # No match: log both sides of the comparison for diagnosis.
                for message in (
                    f"video_dict['video_title']:{video_dict['video_title']}",
                    f"h5_video_title:{h5_video_title}",
                    f"title_similarity:{title_similarity}",
                    f"video_dict['user_name']:{video_dict['user_name']}",
                    f"h5_user_name:{h5_user_name}",
                    f"user_name_similarity:{user_name_similarity}",
                ):
                    cls._log(log_type, crawler, env, message)
            except Exception as e:
                cls._log(log_type, crawler, env, f"抓取单条H5视频时异常:{e}\n")
        cls._log(log_type, crawler, env, "未找到目标视频\n")

    @classmethod
    def download_videos(cls, log_type, crawler, env, rule_dict, our_uid):
        """Run ``search_video`` for every entry of ``ShipinhaoRecommend.download_video_list``.

        Exceptions are logged and never propagate: a failure on one video does
        not stop the remaining ones.

        Args:
            log_type: Log/strategy name forwarded to ``search_video``.
            crawler: Crawler name forwarded to ``search_video``.
            env: Environment name forwarded to ``search_video``.
            rule_dict: Crawl rules forwarded to ``search_video``.
            our_uid: User id forwarded to ``search_video``.
        """
        try:
            pending = ShipinhaoRecommend.download_video_list
            cls._log(log_type, crawler, env, f'共{len(pending)}条视频待抓取')
            cls._log(log_type, crawler, env, f'download_video_list:{pending}\n')
            if len(pending) == 0:
                cls._log(log_type, crawler, env, "没有待下载的视频\n")
                return
            for video_dict in pending:
                try:
                    cls.search_video(log_type, crawler, env, video_dict, rule_dict, our_uid)
                except Exception as e:
                    cls._log(log_type, crawler, env, f"抓取视频异常:{e}\n")
        except Exception as e:
            cls._log(log_type, crawler, env, f"download_videos异常:{e}\n")
if __name__ == "__main__":
    # Manual run: seed the pending-video list with sample records, then run
    # the H5 search for each of them.
    ShipinhaoRecommend.download_video_list = [
        {
            'video_title': '网友:不知道此时此刻黑车司机在想什么',
            'video_id': '96bfb8b86965df7365f02373ce37fe87',
            'duration': 21,
            'user_name': '沂蒙晚报',
            'like_cnt': 9575,
            'share_cnt': 11000,
            'favorite_cnt': 25000,
            'comment_cnt': 5026,
            'publish_time_str': '2023-07-25',
            'publish_time_stamp': 1690214400,
            'publish_time': 1690214400000,
            'period': 1,
        },
        {
            'video_title': '女朋友这不就来了么',
            'video_id': 'b1892886dca8c38dd6d72848ae4fd565',
            'duration': 10,
            'user_name': '向往的火焰蓝',
            'like_cnt': 11000,
            'share_cnt': 3701,
            'favorite_cnt': 26000,
            'comment_cnt': 1426,
            'publish_time_str': '2023-07-26',
            'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000,
            'period': 0,
        },
        {
            'video_title': '近日,在韩国举办的2023世界跆拳道大赛上,中国选手出“奇招”,引网友点赞。关注',
            'video_id': 'ebe8637a152c58bac2f1d875b257f9b5',
            'duration': 10,
            'user_name': '搜狐新闻',
            'like_cnt': 9475,
            'share_cnt': 9134,
            'favorite_cnt': 18000,
            'comment_cnt': 1770,
            'publish_time_str': '2023-07-26',
            'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000,
            'period': 0,
        },
        {
            'video_title': '与愚者争论,自己就是愚者 #动画小故事 #哲理故事',
            'video_id': '629abeb79f0de7a4dc45fadffc8ebc2b',
            'duration': 32,
            'user_name': '陈搞搞',
            'like_cnt': 23000,
            'share_cnt': 49000,
            'favorite_cnt': 67000,
            'comment_cnt': 1336,
            'publish_time_str': '2023-07-24',
            'publish_time_stamp': 1690128000,
            'publish_time': 1690128000000,
            'period': 2,
        },
        {
            'video_title': '我看不懂这种行为的意义在哪里,所以我决定坚持反复观看试图参悟其中的深意,',
            'video_id': 'd7e6e1eeb519183d5e8665c92a101378',
            'duration': 15,
            'user_name': '蜡笔小星丶',
            'like_cnt': 20000,
            'share_cnt': 100000,
            'favorite_cnt': 51000,
            'comment_cnt': 9836,
            'publish_time_str': '2023-07-25',
            'publish_time_stamp': 1690214400,
            'publish_time': 1690214400000,
            'period': 1,
        },
        {
            'video_title': '女子一回家就开始脱衣服,不料老公的弟弟还在家里,女子下一秒的反应亮了!',
            'video_id': 'c75472e887f2641acd34138b705cf8b9',
            'duration': 11,
            'user_name': '西米七七',
            'like_cnt': 4335,
            'share_cnt': 1107,
            'favorite_cnt': 13000,
            'comment_cnt': 1068,
            'publish_time_str': '2023-07-26',
            'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000,
            'period': 0,
        },
    ]
    RecommendH5.download_videos(
        log_type="recommend",
        crawler="shipinhao",
        env="prod",
        rule_dict={
            "period": {"min": 365, "max": 365},
            "duration": {"min": 10, "max": 1800},
            "favorite_cnt": {"min": 50000, "max": 0},
            "share_cnt": {"min": 10000, "max": 0},
        },
        our_uid=61333564,
    )
|