# Server / piaoquan_crawler
							# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/7/26
import json
import os
import sys
import time
from appium import webdriver
from selenium.common import NoSuchElementException
from appium.webdriver.webdriver import WebDriver
from hashlib import md5
from selenium.webdriver.common.by import By
sys.path.append(os.getcwd())
from common.mq import MQ
from common.public import similarity
from common.common import Common
from shipinhao.shipinhao_recommend.shipinhao_recommend import ShipinhaoRecommend


class RecommendH5:
    platform = "视频号"

    @classmethod
    def start_wechat(cls, log_type, crawler, env):
        """Open an Appium session that launches WeChat and return its driver.

        Args:
            log_type: Forwarded to ``Common.logger`` / ``Common.logging``.
            crawler: Forwarded to ``Common.logger`` / ``Common.logging``.
            env: ``"dev"`` selects one chromedriver binary path; any other
                value selects the other path. Also forwarded to
                ``Common.logging``.

        Returns:
            The driver created by ``webdriver.Remote`` against the local
            Appium server, with a 10 second implicit wait already set.
        """
        Common.logger(log_type, crawler).info('启动微信')
        Common.logging(log_type, crawler, env, '启动微信')
        if env == "dev":
            chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver"
        else:
            chromedriverExecutable = '/Users/lieyunye/Downloads/chromedriver/chromedriver_v111/chromedriver'
        caps = {
            "platformName": "Android",  # Mobile OS: Android / iOS
            "deviceName": "Android",  # Connected device name (emulator or real device); any value works on Android
            # Fixed key spelling (was "platforVersion", which is not a documented capability).
            # NOTE(review): the attached device must really run this OS version — confirm.
            "platformVersion": "13",  # OS version of the device (Android 13)
            "appPackage": "com.tencent.mm",  # Package name of the app to launch
            "appActivity": ".ui.LauncherUI",  # Activity to launch
            "autoGrantPermissions": True,  # Let Appium grant base permissions automatically;
            # has no effect when noReset is True (Android-only capability), value is True or False
            # Fixed key casing (was "unicodekeyboard" / "resetkeyboard"); capability names are camelCase.
            "unicodeKeyboard": True,  # Use the bundled input method; set True when typing Chinese
            "resetKeyboard": True,  # Restore the original input method when the run finishes
            "noReset": True,  # Do not reset the app
            "recreateChromeDriverSessions": True,  # Kill the chromedriver session when leaving a chromedriver context, so no manual kill is needed
            "printPageSourceOnFailure": True,  # Dump the full page source to the Appium log when an element is not found
            "newCommandTimeout": 6000,  # Seconds Appium waits for a new command
            "automationName": "UiAutomator2",  # Automation engine (default is Appium);
            # Appium, UiAutomator2, Selendroid and Espresso target Android, XCUITest targets iOS
            "showChromedriverLog": True,
            # Alternative WeChat processes that can host the webview:
            # "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
            "chromeOptions": {"androidProcess": "com.tencent.mm:tools"},
            # "chromeOptions": {"androidProcess": "com.tencent.mm:toolsmp"},
            # "chromeOptions": {"androidProcess": "com.tencent.mm"},
            'enableWebviewDetailsCollection': True,
            'setWebContentsDebuggingEnabled': True,
            'chromedriverExecutable': chromedriverExecutable,
        }
        driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
        driver.implicitly_wait(10)
        # Fixed pause after session creation before handing the driver back.
        time.sleep(5)
        return driver

    # Look up elements across every window handle
    @classmethod
    def search_elements(cls, driver: WebDriver, xpath):
        """Return the first non-empty XPath match found in any window handle.

        Each window handle reported by the driver is switched to in turn and
        queried with ``xpath``. The first non-empty result list is returned;
        if no handle yields a match the result is ``None``.
        """
        time.sleep(1)
        for window in driver.window_handles:
            driver.switch_to.window(window)
            time.sleep(1)
            try:
                found = driver.find_elements(By.XPATH, xpath)
            except NoSuchElementException:
                # Treat this handle as having no match and try the next one.
                continue
            if found:
                return found
        return None

    # noinspection PyBroadException
    @classmethod
    def check_to_webview(cls, log_type, crawler, env, driver: WebDriver):
        """Switch the driver to the second reported context and find the result page.

        The available contexts are logged, the driver is switched to the
        context at index 1, and every window handle is tried until one
        contains a ``//div[@class="unit"]`` element.

        Args:
            log_type: Forwarded to ``Common.logger`` / ``Common.logging``.
            crawler: Forwarded to ``Common.logger`` / ``Common.logging``.
            env: Forwarded to ``Common.logging``.
            driver: Active Appium driver.

        Returns:
            ``"成功"`` when a window containing the marker element is found,
            otherwise ``None``.
        """
        webviews = driver.contexts
        Common.logger(log_type, crawler).info(f"webviews:{webviews}")
        Common.logging(log_type, crawler, env, f"webviews:{webviews}")
        if len(webviews) < 2:
            # There is no context at index 1 to switch to. Report this through the
            # normal failure path (None) rather than raising IndexError.
            Common.logger(log_type, crawler).info("切换 webview 失败")
            Common.logging(log_type, crawler, env, "切换 webview 失败")
            return None
        driver.switch_to.context(webviews[1])
        Common.logger(log_type, crawler).info(driver.current_context)
        Common.logging(log_type, crawler, env, driver.current_context)
        time.sleep(1)
        for handle in driver.window_handles:
            try:
                driver.switch_to.window(handle)
                time.sleep(1)
                # Presence of this element is the success signal; the call raises if it is absent.
                driver.find_element(By.XPATH, '//div[@class="unit"]')
                Common.logger(log_type, crawler).info('切换 webview 成功')
                Common.logging(log_type, crawler, env, '切换 webview 成功')
                return "成功"
            except Exception:
                # Best effort: log the miss and move on to the next handle.
                Common.logger(log_type, crawler).info("切换 webview 失败")
                Common.logging(log_type, crawler, env, "切换 webview 失败")
        return None

    @classmethod
    def search_video(cls, log_type, crawler, env, video_dict, rule_dict, our_uid):
        """Search WeChat for one video and send it to MQ when a match is found.

        A new WeChat session is started, ``video_dict['video_title']`` is
        typed into the search box, and the H5 result list is walked. A result
        counts as a match when its title similarity is at least 0.5 and its
        user-name similarity is at least 1.0. On a match, ``video_dict`` is
        enriched in place and sent through ``MQ.send_msg``.

        The driver session is always closed before returning, whether the
        search matched, found nothing, or raised.

        Args:
            log_type: Forwarded to the loggers; also stored as ``strategy``.
            crawler: Forwarded to the loggers; also stored as ``platform``.
            env: Forwarded to the loggers; also the suffix of the MQ topic name.
            video_dict: Video to look for. ``video_title``, ``user_name`` and
                ``publish_time_str`` are read; it is mutated on a match.
            rule_dict: Stored JSON-encoded under ``crawler_rule``.
            our_uid: Stored under ``user_id``.

        Returns:
            None.
        """

        def log(message):
            # Every informational message goes to both logging sinks, in this order.
            Common.logger(log_type, crawler).info(message)
            Common.logging(log_type, crawler, env, message)

        mq = MQ(topic_name="topic_crawler_etl_" + env)
        driver = cls.start_wechat(log_type, crawler, env)
        try:
            # Tap the WeChat search box and type the search keyword.
            driver.implicitly_wait(10)
            log("点击搜索框")
            driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click()  # WeChat 8.0.30
            time.sleep(0.5)
            # Drop straight/curly double quotes and '#' from the title before searching.
            keyword = video_dict['video_title'].translate(str.maketrans('', '', '"“”#'))
            search_input = driver.find_element(By.ID, 'com.tencent.mm:id/cd7')  # WeChat 8.0.30
            # Separate calls so this does not rely on clear() returning the element.
            search_input.clear()
            search_input.send_keys(keyword)
            # driver.press_keycode(AndroidKey.ENTER)
            log("进入搜索词页面")
            driver.find_element(By.ID, 'com.tencent.mm:id/m94').click()  # WeChat 8.0.30

            # Switch to the webview of the WeChat search result page.
            if cls.check_to_webview(log_type, crawler, env, driver) is None:
                log("切换到视频号 webview 失败\n")
                return
            time.sleep(1)

            # Switch to the "视频号" category tab.
            shipinhao_tags = cls.search_elements(driver, '//div[@class="unit"]/*[2]')
            if not shipinhao_tags:
                # search_elements returns None when nothing matched; stop instead of raising TypeError.
                log('未找到"视频号"分类\n')
                return
            log('点击"视频号"分类')
            shipinhao_tags[0].click()
            time.sleep(5)

            # Up to three attempts to find the H5 page, refreshing between attempts.
            h5_page = None
            for _ in range(3):
                h5_page = cls.search_elements(driver, '//*[@class="mixed-box__bd"]')
                if h5_page is not None:
                    break
                log('未发现H5页面')
                driver.refresh()
            if h5_page is None:
                return

            log('获取视频列表\n')
            video_elements = cls.search_elements(driver, '//div[@class="rich-media active__absolute"]')
            if video_elements is None:
                Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
                Common.logging(log_type, crawler, env, f'video_elements:{video_elements}')
                return

            for i, video_element in enumerate(video_elements):
                try:
                    if video_element is None:
                        log('到底啦~\n')
                        return

                    log(f'拖动"视频"列表第{i + 1}条至屏幕中间')
                    time.sleep(3)
                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
                                          video_element)
                    find_all = video_element.find_elements
                    if find_all(By.XPATH, "//*[@text='没有更多的搜索结果']"):
                        log("没有更多的搜索结果\n")
                        return

                    # NOTE(review): these XPaths start with '//', which in XPath selects from the
                    # document root rather than from video_element; the [i] index presumably
                    # relies on that — confirm before changing them to './/'.
                    h5_video_title = find_all(
                        By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[i].text[:40]
                    h5_user_name = find_all(By.XPATH, '//div[@class="rich-media__source__title"]')[i].text
                    h5_video_url = find_all(By.XPATH, '//div[@class="video-player"]')[i].get_attribute('src')
                    cover_url = find_all(By.XPATH, '//div[@class="video-player__bd"]')[i].get_attribute('style')
                    # Take the part of the style attribute between 'url("' and '")'.
                    h5_cover_url = cover_url.split('url("')[-1].split('")')[0]
                    avatar_url = find_all(
                        By.XPATH,
                        '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[i].get_attribute('style')
                    h5_avatar_url = avatar_url.split('url("')[-1].split('")')[0]
                    # IDs are the MD5 hex digests of the displayed title and user name.
                    h5_out_video_id = md5(h5_video_title.encode('utf8')).hexdigest()
                    h5_out_user_id = md5(h5_user_name.encode('utf8')).hexdigest()

                    title_similarity = similarity(video_dict['video_title'], h5_video_title)
                    user_name_similarity = similarity(video_dict['user_name'], h5_user_name)

                    if title_similarity >= 0.5 and user_name_similarity >= 1.0:
                        video_dict['cover_url'] = h5_cover_url
                        video_dict['avatar_url'] = h5_avatar_url
                        video_dict['out_video_id'] = h5_out_video_id
                        video_dict['video_url'] = h5_video_url

                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")

                        video_dict["out_user_id"] = h5_out_user_id
                        video_dict["platform"] = crawler
                        video_dict["strategy"] = log_type
                        video_dict["strategyType"] = "recommend"
                        video_dict["width"] = 0
                        video_dict["height"] = 0
                        video_dict["crawler_rule"] = json.dumps(rule_dict)
                        video_dict["user_id"] = our_uid
                        video_dict["publish_time"] = video_dict["publish_time_str"]
                        mq.send_msg(video_dict)
                        log("已抓取到目标视频\n")
                        return

                    # Not a match: log both sides of each comparison for diagnosis.
                    log(f"video_dict['video_title']:{video_dict['video_title']}")
                    log(f"h5_video_title:{h5_video_title}")
                    log(f"title_similarity:{title_similarity}")
                    log(f"video_dict['user_name']:{video_dict['user_name']}")
                    log(f"h5_user_name:{h5_user_name}")
                    log(f"user_name_similarity:{user_name_similarity}")
                except Exception as e:
                    # Best effort per item: log the failure and continue with the next result.
                    log(f"抓取单条H5视频时异常:{e}\n")
            log("未找到目标视频\n")
        finally:
            # Always end the session so repeated calls do not leave sessions open.
            driver.quit()

    @classmethod
    def download_videos(cls, log_type, crawler, env, rule_dict, our_uid):
        """Run ``search_video`` for every entry in ``ShipinhaoRecommend.download_video_list``.

        The queue size and contents are logged first. An empty queue is
        logged and ends the call. A failure while handling one video is
        logged and does not stop the remaining videos; any other failure is
        logged as well, so nothing propagates to the caller.
        """
        try:
            pending = ShipinhaoRecommend.download_video_list
            count_msg = f'共{len(pending)}条视频待抓取'
            Common.logger(log_type, crawler).info(count_msg)
            Common.logging(log_type, crawler, env, count_msg)
            list_msg = f'download_video_list:{pending}\n'
            Common.logger(log_type, crawler).info(list_msg)
            Common.logging(log_type, crawler, env, list_msg)

            if len(pending) == 0:
                Common.logger(log_type, crawler).info("没有待下载的视频\n")
                Common.logging(log_type, crawler, env, "没有待下载的视频\n")
                return

            for video_dict in pending:
                try:
                    cls.search_video(log_type, crawler, env, video_dict, rule_dict, our_uid)
                except Exception as err:
                    # One failing video must not stop the rest of the queue.
                    item_failure = f"抓取视频异常:{err}\n"
                    Common.logger(log_type, crawler).info(item_failure)
                    Common.logging(log_type, crawler, env, item_failure)
        except Exception as err:
            failure = f"download_videos异常:{err}\n"
            Common.logger(log_type, crawler).info(failure)
            Common.logging(log_type, crawler, env, failure)


if __name__ == "__main__":
    # Manual run: seed the shared download queue with sample videos, then crawl them.
    sample_videos = [
        {
            'video_title': '网友：不知道此时此刻黑车司机在想什么',
            'video_id': '96bfb8b86965df7365f02373ce37fe87',
            'duration': 21, 'user_name': '沂蒙晚报',
            'like_cnt': 9575, 'share_cnt': 11000, 'favorite_cnt': 25000, 'comment_cnt': 5026,
            'publish_time_str': '2023-07-25', 'publish_time_stamp': 1690214400,
            'publish_time': 1690214400000, 'period': 1,
        },
        {
            'video_title': '女朋友这不就来了么',
            'video_id': 'b1892886dca8c38dd6d72848ae4fd565',
            'duration': 10, 'user_name': '向往的火焰蓝',
            'like_cnt': 11000, 'share_cnt': 3701, 'favorite_cnt': 26000, 'comment_cnt': 1426,
            'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000, 'period': 0,
        },
        {
            'video_title': '近日，在韩国举办的2023世界跆拳道大赛上，中国选手出“奇招”，引网友点赞。关注',
            'video_id': 'ebe8637a152c58bac2f1d875b257f9b5',
            'duration': 10, 'user_name': '搜狐新闻',
            'like_cnt': 9475, 'share_cnt': 9134, 'favorite_cnt': 18000, 'comment_cnt': 1770,
            'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000, 'period': 0,
        },
        {
            'video_title': '与愚者争论，自己就是愚者 #动画小故事  #哲理故事',
            'video_id': '629abeb79f0de7a4dc45fadffc8ebc2b',
            'duration': 32, 'user_name': '陈搞搞',
            'like_cnt': 23000, 'share_cnt': 49000, 'favorite_cnt': 67000, 'comment_cnt': 1336,
            'publish_time_str': '2023-07-24', 'publish_time_stamp': 1690128000,
            'publish_time': 1690128000000, 'period': 2,
        },
        {
            'video_title': '我看不懂这种行为的意义在哪里，所以我决定坚持反复观看试图参悟其中的深意，',
            'video_id': 'd7e6e1eeb519183d5e8665c92a101378',
            'duration': 15, 'user_name': '蜡笔小星丶',
            'like_cnt': 20000, 'share_cnt': 100000, 'favorite_cnt': 51000, 'comment_cnt': 9836,
            'publish_time_str': '2023-07-25', 'publish_time_stamp': 1690214400,
            'publish_time': 1690214400000, 'period': 1,
        },
        {
            'video_title': '女子一回家就开始脱衣服，不料老公的弟弟还在家里，女子下一秒的反应亮了！',
            'video_id': 'c75472e887f2641acd34138b705cf8b9',
            'duration': 11, 'user_name': '西米七七',
            'like_cnt': 4335, 'share_cnt': 1107, 'favorite_cnt': 13000, 'comment_cnt': 1068,
            'publish_time_str': '2023-07-26', 'publish_time_stamp': 1690300800,
            'publish_time': 1690300800000, 'period': 0,
        },
    ]
    sample_rule = {
        "period": {"min": 365, "max": 365},
        "duration": {"min": 10, "max": 1800},
        "favorite_cnt": {"min": 50000, "max": 0},
        "share_cnt": {"min": 10000, "max": 0},
    }

    ShipinhaoRecommend.download_video_list = sample_videos
    RecommendH5.download_videos(
        log_type="recommend",
        crawler="shipinhao",
        env="prod",
        rule_dict=sample_rule,
        our_uid=61333564,
    )