wangkun · commit 7b95c087d9 · 2 years ago

+ 12 - 0
README.MD

@@ -194,4 +194,16 @@ sh ./main/scheduling_main.sh ./suisuiniannianyingfuqi/suisuiniannianyingfuqi_mai
 检测进程
 ps aux | grep run_suisuiniannianyingfuqi
 ps aux | grep run_suisuiniannianyingfuqi | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
+#### 刚刚都传 (Ganggangdouchuan) mini program
+```commandline
+MacAir device, crontab scheduled job
+* * * * * /bin/sh /Users/piaoquan/Desktop/crawler/piaoquan_crawler/main/process_offline.sh "offline"
+Local (offline) debugging
+cd /Users/wangkun/Desktop/crawler/piaoquan_crawler
+sh ./main/process_offline.sh "dev"
+Check the process
+ps aux | grep run_ganggangdouchuan
+ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
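
For reference, the same liveness check as the `ps aux | grep run_ganggangdouchuan` commands above can be done from Python. This is only a sketch: it assumes `pgrep` is available on the MacAir host, and the helper name `is_crawler_running` is hypothetical (not part of this repo).

```python
import subprocess

def is_crawler_running(keyword: str = "run_ganggangdouchuan") -> bool:
    """Return True if a process whose command line contains `keyword` is alive."""
    result = subprocess.run(["pgrep", "-f", keyword], capture_output=True, text=True)
    return bool(result.stdout.strip())

if __name__ == "__main__":
    print("running" if is_crawler_running() else "not running")
```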

+ 8 - 4
common/publish.py

@@ -197,12 +197,16 @@ class Publish:
             return random.choice(uids_prod_gongzhonghao_follow)
 
         elif crawler == 'benshanzhufu' and env == 'prod' and strategy == '推荐榜爬虫策略':
-            uids_prod_gongzhonghao_follow = [20631262, 20631263, 20631264, 20631265, 20631266, 20631267, 20631268, 20631269, 20631271, 20631272]
-            return random.choice(uids_prod_gongzhonghao_follow)
+            uids_prod_benshanzhufu_recommend = [20631262, 20631263, 20631264, 20631265, 20631266, 20631267, 20631268, 20631269, 20631271, 20631272]
+            return random.choice(uids_prod_benshanzhufu_recommend)
 
         elif crawler == 'suisuiniannianyingfuqi' and env == 'prod' and strategy == '推荐榜爬虫策略':
-            uids_prod_gongzhonghao_follow = [26117547, 26117548, 26117549, 26117550, 26117551]
-            return random.choice(uids_prod_gongzhonghao_follow)
+            uids_prod_suisuiniannianyingfuqi_recommend = [26117547, 26117548, 26117549, 26117550, 26117551]
+            return random.choice(uids_prod_suisuiniannianyingfuqi_recommend)
+
+        elif crawler == 'ganggangdouchuan' and env == 'prod' and strategy == '推荐榜爬虫策略':
+            uids_prod_ganggangdouchuan_recommend = [26117661, 26117662, 26117663]
+            return random.choice(uids_prod_ganggangdouchuan_recommend)
 
         else:
             return our_uid
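
The branches above map a (crawler, env, strategy) combination to a fixed uid pool and fall back to `our_uid` otherwise. A minimal table-driven sketch of the same lookup, using the pools from this commit (the `UID_POOLS` dict and `pick_uid` function are illustrative names, not part of `Publish`):

```python
import random

# (crawler, env, strategy) -> candidate uids, copied from the branches above
UID_POOLS = {
    ("benshanzhufu", "prod", "推荐榜爬虫策略"): [20631262, 20631263, 20631264, 20631265, 20631266,
                                                 20631267, 20631268, 20631269, 20631271, 20631272],
    ("suisuiniannianyingfuqi", "prod", "推荐榜爬虫策略"): [26117547, 26117548, 26117549, 26117550, 26117551],
    ("ganggangdouchuan", "prod", "推荐榜爬虫策略"): [26117661, 26117662, 26117663],
}

def pick_uid(crawler, env, strategy, our_uid):
    pool = UID_POOLS.get((crawler, env, strategy))
    return random.choice(pool) if pool else our_uid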

BIN
ganggangdouchuan/.DS_Store


+ 23 - 0
ganggangdouchuan/ganggangdouchuan_main/run_ganggangdouchuan_recommend.py

@@ -1,3 +1,26 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/4/13
+import argparse
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from ganggangdouchuan.ganggangdouchuan_recommend.ganggangdouchuan_recommend import GanggangdouchuanRecommend
+
+
+def main(log_type, crawler, env):
+    oss_endpoint = "out"
+    Common.logger(log_type, crawler).info('开始抓取 刚刚都传小程序\n')
+    GanggangdouchuanRecommend.start_wechat(log_type, crawler, oss_endpoint, env)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info('抓取完一轮\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # 新建参数解析器对象
+    parser.add_argument('--log_type', type=str)  # 添加参数,注明参数类型
+    parser.add_argument('--crawler')  # 添加参数
+    parser.add_argument('--env')  # 添加参数
+    args = parser.parse_args()  # 解析命令行传入的参数
+    main(log_type=args.log_type, crawler=args.crawler, env=args.env)
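
For reference, `process_offline.sh` (added later in this commit) launches this entry point with `--log_type="recommend" --crawler="ganggangdouchuan" --env="dev"` (or `"prod"`). A sketch of the equivalent direct call for local debugging, bypassing argparse; the `"dev"` value is assumed for a local run:

```python
# Direct call with the same arguments the scheduler script passes.
from ganggangdouchuan.ganggangdouchuan_main.run_ganggangdouchuan_recommend import main

if __name__ == "__main__":
    main(log_type="recommend", crawler="ganggangdouchuan", env="dev")
```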

+ 257 - 184
ganggangdouchuan/ganggangdouchuan_recommend/ganggangdouchuan_recommend.py

@@ -1,129 +1,170 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/4/13
-import difflib
+import json
 import os
+import shutil
 import sys
 import time
-import ffmpeg
+from hashlib import md5
 from appium import webdriver
 from appium.webdriver.extensions.android.nativekey import AndroidKey
 from appium.webdriver.webdriver import WebDriver
 from selenium.common import NoSuchElementException
 from selenium.webdriver.common.by import By
+
 sys.path.append(os.getcwd())
 from common.common import Common
+from common.feishu import Feishu
+from common.publish import Publish
+from common.scheduling_db import MysqlHelper
 
 
 class GanggangdouchuanRecommend:
+    platform = "刚刚都传"
     i = 0
 
     @classmethod
     def start_wechat(cls, log_type, crawler, oss_endpoint, env):
-        # try:
-        if env == "dev":
-            chromedriverExecutable = '/Users/wangkun/Downloads/chromedriver/chromedriver_v107/chromedriver'
-        else:
-            chromedriverExecutable = '/Users/piaoquan/Downloads/chromedriver'
+        try:
+            if env == "dev":
+                chromedriverExecutable = '/Users/wangkun/Downloads/chromedriver/chromedriver_v107/chromedriver'
+            else:
+                chromedriverExecutable = '/Users/piaoquan/Downloads/chromedriver'
 
-        Common.logger(log_type, crawler).info('启动微信')
-        caps = {
-            "platformName": "Android",  # 手机操作系统 Android / iOS
-            "deviceName": "a0a65126",  # 连接的设备名(模拟器或真机),安卓可以随便写
-            "platforVersion": "11",  # 手机对应的系统版本(Android 11)
-            "appPackage": "com.tencent.mm",  # 被测APP的包名,乐活圈 Android
-            "appActivity": ".ui.LauncherUI",  # 启动的Activity名
-            "autoGrantPermissions": "true",  # 让 appium 自动授权 base 权限,
-            # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
-            "unicodekeyboard": True,  # 使用自带输入法,输入中文时填True
-            "resetkeyboard": True,  # 执行完程序恢复原来输入法
-            "noReset": True,  # 不重置APP
-            "printPageSourceOnFailure": True,  # 找不到元素时,appium log 会完整记录当前页面的 pagesource
-            "newCommandTimeout": 6000,  # 初始等待时间
-            "automationName": "UiAutomator2",  # 使用引擎,默认为 Appium,
-            # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
-            "showChromedriverLog": True,
-            'enableWebviewDetailsCollection': True,
-            'setWebContentsDebuggingEnabled': True,
-            'recreateChromeDriverSessions': True,
-            'chromedriverExecutable': chromedriverExecutable,
-            "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
-            'browserName': ''
-        }
-        driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
-        driver.implicitly_wait(20)
-        # 向下滑动页面,展示出小程序选择面板
-        for i in range(120):
-            try:
-                # 发现微信消息 TAB,代表微信已启动成功
-                if driver.find_elements(By.ID, 'com.tencent.mm:id/f2s'):
-                    break
-                # 发现并关闭系统菜单栏
-                elif driver.find_element(By.ID, 'com.android.systemui:id/dismiss_view'):
-                    Common.logger(log_type, crawler).info('发现并关闭系统下拉菜单栏')
-                    driver.find_element(By.ID, 'com.android.systemui:id/dismiss_view').click()
-                else:
-                    pass
-            except NoSuchElementException:
-                time.sleep(1)
-        Common.logger(log_type, crawler).info('下滑,展示小程序选择面板')
-        size = driver.get_window_size()
-        driver.swipe(int(size['width'] * 0.5), int(size['height'] * 0.2),
-                     int(size['width'] * 0.5), int(size['height'] * 0.8), 200)
-        # 打开小程序"刚刚都传"
-        time.sleep(3)
-        Common.logger(log_type, crawler).info('打开小程序"刚刚都传"')
-        driver.find_elements(By.XPATH, '//*[@text="刚刚都传"]')[-1].click()
-        cls.get_videoList(log_type, crawler, oss_endpoint, env, driver)
-        driver.quit()
-        Common.logger(log_type, crawler).info('退出微信成功\n')
-        # except Exception as e:
-        #     Common.logger(log_type, crawler).error(f'start_wechat异常:{e}\n')
-        #     cmd = "cd ~ && source .bash_profile && adb kill-server && adb start-server"
-        #     os.system(cmd)
+            Common.logger(log_type, crawler).info('启动微信')
+            caps = {
+                "platformName": "Android",  # 手机操作系统 Android / iOS
+                "deviceName": "a0a65126",  # 连接的设备名(模拟器或真机),安卓可以随便写
+                "platforVersion": "11",  # 手机对应的系统版本(Android 11)
+                "appPackage": "com.tencent.mm",  # 被测APP的包名,乐活圈 Android
+                "appActivity": ".ui.LauncherUI",  # 启动的Activity名
+                "autoGrantPermissions": "true",  # 让 appium 自动授权 base 权限,
+                # 如果 noReset 为 True,则该条不生效(该参数为 Android 独有),对应的值为 True 或 False
+                "unicodekeyboard": True,  # 使用自带输入法,输入中文时填True
+                "resetkeyboard": True,  # 执行完程序恢复原来输入法
+                "noReset": True,  # 不重置APP
+                "printPageSourceOnFailure": True,  # 找不到元素时,appium log 会完整记录当前页面的 pagesource
+                "newCommandTimeout": 6000,  # 初始等待时间
+                "automationName": "UiAutomator2",  # 使用引擎,默认为 Appium,
+                # 其中 Appium、UiAutomator2、Selendroid、Espresso 用于 Android,XCUITest 用于 iOS
+                "showChromedriverLog": True,
+                'enableWebviewDetailsCollection': True,
+                'setWebContentsDebuggingEnabled': True,
+                'recreateChromeDriverSessions': True,
+                'chromedriverExecutable': chromedriverExecutable,
+                "chromeOptions": {"androidProcess": "com.tencent.mm:appbrand0"},
+                'browserName': ''
+            }
+            driver = webdriver.Remote("http://localhost:4723/wd/hub", caps)
+            driver.implicitly_wait(20)
+            # 向下滑动页面,展示出小程序选择面板
+            for i in range(120):
+                try:
+                    # 发现微信消息 TAB,代表微信已启动成功
+                    if driver.find_elements(By.ID, 'com.tencent.mm:id/f2s'):
+                        break
+                    # 发现并关闭系统菜单栏
+                    elif driver.find_element(By.ID, 'com.android.systemui:id/dismiss_view'):
+                        Common.logger(log_type, crawler).info('发现并关闭系统下拉菜单栏')
+                        driver.find_element(By.ID, 'com.android.systemui:id/dismiss_view').click()
+                    else:
+                        pass
+                except NoSuchElementException:
+                    time.sleep(1)
+            Common.logger(log_type, crawler).info('下滑,展示小程序选择面板')
+            size = driver.get_window_size()
+            driver.swipe(int(size['width'] * 0.5), int(size['height'] * 0.2),
+                         int(size['width'] * 0.5), int(size['height'] * 0.8), 200)
+            # 打开小程序"刚刚都传"
+            time.sleep(3)
+            Common.logger(log_type, crawler).info('打开小程序"刚刚都传"')
+            driver.find_elements(By.XPATH, '//*[@text="刚刚都传"]')[-1].click()
+            time.sleep(10)
+            cls.get_videoList(log_type, crawler, oss_endpoint, env, driver)
+            driver.quit()
+            Common.logger(log_type, crawler).info('退出微信成功\n')
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'start_wechat异常:{e}\n')
 
     @classmethod
-    def search_elements(cls, log_type, crawler, driver: WebDriver, element):
+    def get_video_url(cls, log_type, crawler, driver: WebDriver, video_element):
         try:
+            time.sleep(1)
+            # Common.logger(log_type, crawler).info('进入视频详情')
+            video_element.click()
+            time.sleep(5)
             windowHandles = driver.window_handles
+            # Common.logger(log_type, crawler).info('windowHandles:{}', windowHandles)
+            # 遍历所有的handles,找到当前页面所在的handle:如果pageSource有包含你想要的元素,就是所要找的handle
+            # 小程序的页面来回切换也需要:遍历所有的handles,切换到元素所在的handle
             for handle in windowHandles:
                 driver.switch_to.window(handle)
                 time.sleep(1)
-                if len(driver.find_elements(By.XPATH, element)) != 0:
-                    return driver.find_elements(By.XPATH, element)
-                else:
-                    return None
+                try:
+                    video_url_element = driver.find_element(By.XPATH, '//wx-video[@id="v_id"]')
+                    video_url = video_url_element.get_attribute("src")
+                    return video_url
+                except NoSuchElementException:
+                    time.sleep(1)
         except Exception as e:
-            Common.logger(log_type, crawler).error(f'search_element异常:{e}\n')
+            Common.logger(log_type, crawler).error(f'get_video_url异常:{e}\n')
 
+    # 切换 Handle
     @classmethod
-    def get_video_url(cls, log_type, crawler, driver: WebDriver, video_element):
+    def search_elements(cls, log_type, crawler, driver: WebDriver, xpath):
         try:
-            time.sleep(1)
-            # Common.logger(log_type, crawler).info('进入视频详情')
-            video_element.click()
-            time.sleep(3)
-            video_url_element = cls.search_elements(log_type, crawler, driver, '//wx-video[@id="v_id"]')
-            if video_url_element is None or len(video_url_element) == 0:
-                Common.logger(log_type, crawler).info('未获取到视频 URL')
-                return 0
-            else:
-                return video_url_element[0].get_attribute('src')
+            windowHandles = driver.window_handles
+            # Common.logger(log_type, crawler).info('windowHandles:{}', windowHandles)
+            # 遍历所有的handles,找到当前页面所在的handle:如果pageSource有包含你想要的元素,就是所要找的handle
+            # 小程序的页面来回切换也需要:遍历所有的handles,切换到元素所在的handle
+            for handle in windowHandles:
+                driver.switch_to.window(handle)
+                time.sleep(1)
+                try:
+                    elements = driver.find_elements(By.XPATH, xpath)
+                    return elements
+                except NoSuchElementException:
+                    pass
         except Exception as e:
-            Common.logger(log_type, crawler).error(f'get_video_url异常:{e}\n')
+            Common.logger(log_type, crawler).warning(f'search_elements异常:{e}\n')
 
     @classmethod
-    def get_videoList(cls, log_type, crawler, oss_endpoint, env, driver: WebDriver):
-        # try:
-        driver.implicitly_wait(15)
-        Common.logger(log_type, crawler).info('切换到小程序\n')
-        time.sleep(5)
-        webviews = driver.contexts
-        driver.switch_to.context(webviews[1])
+    def check_to_applet(cls, log_type, crawler, driver: WebDriver):
+        while True:
+            webview = driver.contexts
+            # Common.logger(log_type, crawler).info(f"webviews:{webview}")
+            driver.switch_to.context(webview[1])
+            windowHandles = driver.window_handles
+            for handle in windowHandles:
+                driver.switch_to.window(handle)
+                time.sleep(1)
+                try:
+                    video_list = driver.find_element(By.XPATH, '//wx-view[text()="视频"]')
+                    video_list.click()
+                    Common.logger(log_type, crawler).info('切换到小程序视频列表成功\n')
+                    return
+                except NoSuchElementException:
+                    time.sleep(1)
+            Common.logger(log_type, crawler).info("切换到小程序失败\n")
+            break
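
`check_to_applet` switches to `driver.contexts[1]` by position. A slightly more defensive sketch picks the WebView context by name instead, assuming the same Appium `driver` configured in `start_wechat`:

```python
# Pick the WebView context by name rather than by index; fall back to the last context.
webview = next((c for c in driver.contexts if "WEBVIEW" in c), driver.contexts[-1])
driver.switch_to.context(webview)
```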
 
-        time.sleep(1)
-        cls.search_elements(log_type, crawler, driver, '//wx-view[text()="视频"]')[0].click()
+    @classmethod
+    def repeat_out_video_id(cls, log_type, crawler, out_video_id, env):
+        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{out_video_id}"; """
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
+        return len(repeat_video)
+
+    @classmethod
+    def repeat_video_url(cls, log_type, crawler, video_url, env):
+        sql = f""" select * from crawler_video where platform="{cls.platform}" and video_url="{video_url}"; """
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
+        return len(repeat_video)
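
The dedup key checked by `repeat_out_video_id` is the hex md5 of the video title, derived the same way in `get_videoList` below. A standalone illustration (the function name `make_out_video_id` is hypothetical):

```python
from hashlib import md5

def make_out_video_id(video_title: str) -> str:
    """Same derivation as in get_videoList: 32-character hex md5 of the title."""
    return md5(video_title.encode("utf8")).hexdigest()

print(make_out_video_id("示例标题"))  # 32-character hex string
```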
 
+    @classmethod
+    def get_videoList(cls, log_type, crawler, oss_endpoint, env, driver: WebDriver):
+        # 切换到小程序
+        cls.check_to_applet(log_type, crawler, driver)
         time.sleep(1)
         index = 0
 
@@ -153,110 +194,142 @@ class GanggangdouchuanRecommend:
                 time.sleep(3)
                 driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
 
-                video_title = video_element.find_elements(By.XPATH, '//wx-view[@class="title"]//span[2]')[cls.i - 1].get_attribute('innerHTML')
+                # video_title = video_element.find_elements(By.XPATH, '//wx-view[@class="title"]//span[2]')[cls.i - 1].get_attribute('innerHTML')
+                video_title = video_element.find_elements(By.XPATH, '//wx-view[@class="title"]//span[2]')[cls.i - 1].text
                 cover_url = video_element.find_elements(By.XPATH, '//wx-image[@class="poster-img"]')[cls.i - 1].get_attribute('src')
+                out_video_id = md5(video_title.encode('utf8')).hexdigest()
+                video_dict = {
+                    'video_title': video_title,
+                    'video_id': out_video_id,
+                    'play_cnt': 0,
+                    'comment_cnt': 0,
+                    'like_cnt': 0,
+                    'share_cnt': 0,
+                    'publish_time_stamp': int(time.time()),
+                    'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
+                    'user_name': "ganggangdouchuan",
+                    'user_id': "ganggangdouchuan",
+                    'avatar_url': cover_url,
+                    'cover_url': cover_url,
+                    'session': f"ganggangdouchuan-{int(time.time())}"
+                }
+                for k, v in video_dict.items():
+                    Common.logger(log_type, crawler).info(f"{k}:{v}")
 
-                Common.logger(log_type, crawler).info(f'video_title:{video_title}')
-                Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
-
-                # cls.download_publish(log_type, crawler, oss_endpoint, env, job, driver, video_element, video_title, cover_url)
-                # time.sleep(3)
-
+                if video_title is None or cover_url is None:
+                    Common.logger(log_type, crawler).info("无效视频\n")
+                elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
+                    Common.logger(log_type, crawler).info('视频已下载\n')
+                else:
+                    video_url = cls.get_video_url(log_type, crawler, driver, video_element)
+                    if video_url is None:
+                        Common.logger(log_type, crawler).info("未获取到视频播放地址\n")
+                        driver.press_keycode(AndroidKey.BACK)
+                    elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
+                        driver.press_keycode(AndroidKey.BACK)
+                    else:
+                        video_dict["video_url"]=video_url
+                        cls.download_publish(log_type, crawler, video_dict, oss_endpoint, env, driver)
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠10秒\n')
             time.sleep(10)
             index = index + len(video_element_temp)
 
         # except Exception as e:
         #     Common.logger(log_type, crawler).error(f'get_recommend异常,重启APP:{e}\n')
-        #     cls.i = 0
-        #     cls.quit(log_type, driver)
-        #     cls.start_wechat(log_type, crawler, oss_endpoint, env, job)
 
-    # @classmethod
-    # def title_like(cls, log_type, title):
-    #     sheet = Feishu.get_values_batch(log_type, 'ggdc', '070a67')
-    #     for i in range(1, len(sheet)):
-    #         video_title = sheet[i][7]
-    #         if video_title is None:
-    #             pass
-    #         elif difflib.SequenceMatcher(None, title, video_title).quick_ratio() >= 0.8:
-    #             return True
-    #         else:
-    #             pass
-    #
-    # @classmethod
-    # def download_publish(cls, log_type, crawler, oss_endpoint, env, job, driver: WebDriver, video_element, video_title, cover_url):
-    #     try:
-    #         if video_title == 0 or cover_url == 0:
-    #             Common.logger(log_type, crawler).info('无效视频\n')
-    #         elif video_title in [x for y in Feishu.get_values_batch(log_type, 'ggdc', '070a67') for x in y]:
-    #             Common.logger(log_type, crawler).info('视频已下载\n')
-    #         elif any(word if word in video_title else False for word in cls.filter_words(log_type)) is True:
-    #             Common.logger(log_type, crawler).info('已中过滤词\n')
-    #         else:
-    #             video_url = cls.get_video_url(log_type, driver, video_element)
-    #             if video_url == 0:
-    #                 Common.logger(log_type, crawler).info('video_url:未获取到\n')
-    #             elif video_url in [x for y in Feishu.get_values_batch(log_type, 'ggdc', '070a67') for x in y]:
-    #                 Common.logger(log_type, crawler).info('视频已下载\n')
-    #             else:
-    #                 Common.logger(log_type, crawler).info(f'cover_url:{cover_url}')
-    #                 Common.logger(log_type, crawler).info(f'video_url:{video_url}')
-    #
-    #                 # 下载视频
-    #                 Common.download_method(log_type, 'video', video_title, video_url)
-    #                 # # 获取视频时长
-    #                 # video_info = cls.get_video_info_from_local(log_type, "./videos/" + video_title + "/video.mp4")
-    #                 # video_width = str(video_info[0])
-    #                 # video_height = str(video_info[1])
-    #                 # duration = video_info[2]
-    #                 # 下载封面
-    #                 Common.download_method(log_type, 'cover', video_title, cover_url)
-    #                 # 保存视频信息至 "./videos/{download_video_title}/info.txt"
-    #                 with open("./videos/" + video_title
-    #                           + "/" + "info.txt", "a", encoding="UTF-8") as f_a:
-    #                     f_a.write("ggdc" + str(int(time.time())) + "\n" +
-    #                               str(video_title) + "\n" +
-    #                               '100' + "\n" +
-    #                               '100000' + "\n" +
-    #                               '100000' + "\n" +
-    #                               '100000' + "\n" +
-    #                               '100000' + "\n" +
-    #                               '1920*1080' + "\n" +
-    #                               str(int(time.time())) + "\n" +
-    #                               '刚刚都传小程序' + "\n" +
-    #                               str(cover_url) + "\n" +
-    #                               str(video_url) + "\n" +
-    #                               str(cover_url) + "\n" +
-    #                               "ganggangdouchuan" + str(int(time.time())))
-    #                 Common.logger(log_type, crawler).info("==========视频信息已保存至info.txt==========")
-    #
-    #                 # 上传视频
-    #                 Common.logger(log_type, crawler).info(f"开始上传视频:{video_title}")
-    #                 if env == 'dev':
-    #                     our_video_id = Publish.upload_and_publish(log_type, crawler, oss_endpoint, env, job)
-    #                     our_video_link = "https://testadmin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
-    #                 else:
-    #                     our_video_id = Publish.upload_and_publish(log_type, crawler, oss_endpoint, env, job)
-    #                     our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
-    #                 Common.logger(log_type, crawler).info("视频上传完成")
-    #
-    #                 # 视频信息保存至飞书
-    #                 Feishu.insert_columns(log_type, "ggdc", "070a67", "ROWS", 1, 2)
-    #                 # 视频ID工作表,首行写入数据
-    #                 upload_time = int(time.time())
-    #                 values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
-    #                            "推荐榜",
-    #                            video_title,
-    #                            our_video_link,
-    #                            cover_url,
-    #                            video_url]]
-    #                 time.sleep(1)
-    #                 Feishu.update_values(log_type, "ggdc", "070a67", "F2:V2", values)
-    #                 driver.press_keycode(AndroidKey.BACK)
-    #                 Common.logger(log_type, crawler).info(f"视频:{video_title},下载/上传成功\n")
-    #     except Exception as e:
-    #         Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
+    @classmethod
+    def download_publish(cls, log_type, crawler, video_dict, oss_endpoint, env, driver: WebDriver):
+        try:
+            # 下载视频
+            Common.download_method(log_type=log_type, crawler=crawler, text='video', title=video_dict['video_title'], url=video_dict['video_url'])
+            ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
+            if ffmpeg_dict is None:
+                # 与上方下载/探测路径保持一致:目录按 video_title 命名
+                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                return
+            video_dict["duration"] = ffmpeg_dict["duration"]
+            video_dict["video_width"] = ffmpeg_dict["width"]
+            video_dict["video_height"] = ffmpeg_dict["height"]
+            # 下载封面
+            Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'],
+                                   url=video_dict['cover_url'])
+            # 保存视频信息至txt
+            Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
+
+            # 上传视频
+            Common.logger(log_type, crawler).info("开始上传视频...")
+            our_video_id = Publish.upload_and_publish(log_type=log_type,
+                                                      crawler=crawler,
+                                                      strategy="推荐榜爬虫策略",
+                                                      our_uid="recommend",
+                                                      env=env,
+                                                      oss_endpoint=oss_endpoint)
+            if env == 'dev':
+                our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+            else:
+                our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+            Common.logger(log_type, crawler).info("视频上传完成")
+
+            if our_video_id is None:
+                # 删除视频文件夹
+                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
+                return
+
+            # 视频信息保存至飞书
+            Feishu.insert_columns(log_type, crawler, "070a67", "ROWS", 1, 2)
+            # 视频ID工作表,首行写入数据
+            upload_time = int(time.time())
+            values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
+                       "推荐榜爬虫策略",
+                       video_dict["video_title"],
+                       video_dict["video_id"],
+                       video_dict["duration"],
+                       f'{video_dict["video_width"]}*{video_dict["video_height"]}',
+                       our_video_link,
+                       video_dict["cover_url"],
+                       video_dict["video_url"]]]
+            time.sleep(1)
+            Feishu.update_values(log_type, crawler, "070a67", "F2:V2", values)
+            Common.logger(log_type, crawler).info(f"视频已保存至飞书文档\n")
+
+            rule_dict = {}
+            # 视频信息保存数据库
+            insert_sql = f""" insert into crawler_video(video_id,
+                                                            out_user_id,
+                                                            platform,
+                                                            strategy,
+                                                            out_video_id,
+                                                            video_title,
+                                                            cover_url,
+                                                            video_url,
+                                                            duration,
+                                                            publish_time,
+                                                            play_cnt,
+                                                            crawler_rule,
+                                                            width,
+                                                            height)
+                                                            values({our_video_id},
+                                                            "{video_dict['user_id']}",
+                                                            "{cls.platform}",
+                                                            "推荐榜爬虫策略",
+                                                            "{video_dict['video_id']}",
+                                                            "{video_dict['video_title']}",
+                                                            "{video_dict['cover_url']}",
+                                                            "{video_dict['video_url']}",
+                                                            {int(video_dict['duration'])},
+                                                            "{video_dict['publish_time_str']}",
+                                                            {int(video_dict['play_cnt'])},
+                                                            '{json.dumps(rule_dict)}',
+                                                            {int(video_dict['video_width'])},
+                                                            {int(video_dict['video_height'])}) """
+            Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+            MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
+            Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
+            driver.press_keycode(AndroidKey.BACK)
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
 
 
 if __name__ == '__main__':

+ 18 - 51
ganggangdouchuan/ganggangdouchuan_recommend/insert.py

@@ -6,6 +6,8 @@ import os
 import sys
 import time
 from datetime import date, timedelta
+from hashlib import md5
+
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -75,57 +77,37 @@ class Insert:
 
     @classmethod
     def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env):
-        benshanzhufu_sheetid = ['440018']
-        for sheetid in benshanzhufu_sheetid:
+        ganggangdouchuan_sheetid = ['070a67']
+        for sheetid in ganggangdouchuan_sheetid:
             xiaoniangao_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
             for i in range(1, len(xiaoniangao_sheet)):
-            # for i in range(1, 3):
-                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][9] is None:
+            # for i in range(1, 5):
+                if xiaoniangao_sheet[i][5] is None or xiaoniangao_sheet[i][7] is None:
                     continue
-                video_id = xiaoniangao_sheet[i][8].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
+                video_id = xiaoniangao_sheet[i][11].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace(
                     "/info", "")
                 if video_id == "None":
                     continue
                 video_id = int(video_id)
-                out_user_id = str(xiaoniangao_sheet[i][17])
-                platform = "本山祝福"
+                out_user_id = "ganggangdouchuan"
+                platform = "刚刚都传"
                 strategy = "推荐榜爬虫策略"
-                out_video_id = str(xiaoniangao_sheet[i][6])
                 video_title = str(xiaoniangao_sheet[i][7])
-                cover_url = str(xiaoniangao_sheet[i][19])
-                video_url = str(xiaoniangao_sheet[i][20])
-                duration = int(xiaoniangao_sheet[i][13])
-                publish_time = str(xiaoniangao_sheet[i][15]).replace("/", "-")
-                play_cnt = int(xiaoniangao_sheet[i][9])
-                like_cnt = int(xiaoniangao_sheet[i][11])
-                share_cnt = int(xiaoniangao_sheet[i][12])
-                # collection_cnt = 0
-                comment_cnt = int(xiaoniangao_sheet[i][10])
-                user_id = str(xiaoniangao_sheet[i][17])
+                cover_url = str(xiaoniangao_sheet[i][12])
+                video_url = str(xiaoniangao_sheet[i][13])
                 crawler_rule = json.dumps({})
-                width = int(xiaoniangao_sheet[i][14].split("*")[0])
-                height = int(xiaoniangao_sheet[i][14].split("*")[1])
+                out_video_id = md5(video_title.encode('utf8')).hexdigest()
 
                 # print(f"video_id:{video_id}, type:{type(video_id)}")
-                # print(f"user_id:{user_id}, type:{type(user_id)}")
                 # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
                 # print(f"platform:{platform}, type:{type(platform)}")
                 # print(f"strategy:{strategy}, type:{type(strategy)}")
-                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
                 # print(f"video_title:{video_title}, type:{type(video_title)}")
                 # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
                 # print(f"video_url:{video_url}, type:{type(video_url)}")
-                # print(f"duration:{duration}, type:{type(duration)}")
-                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
-                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
-                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
-                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
-                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
                 # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
-                # print(f"width:{width}, type:{type(width)}")
-                # print(f"height:{height}, type:{type(height)}\n")
 
-                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
+                select_sql = f""" select * from crawler_video where platform="{platform}" and video_url="{video_url}" """
                 Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
                 repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env)
                 Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
@@ -142,15 +124,7 @@ class Insert:
                                         video_title,
                                         cover_url,
                                         video_url,
-                                        duration,
-                                        publish_time,
-                                        play_cnt,
-                                        like_cnt,
-                                        share_cnt,
-                                        comment_cnt,
-                                        crawler_rule,
-                                        width,
-                                        height)
+                                        crawler_rule)
                                         values({video_id},
                                         "{out_user_id}",
                                         "{platform}",
@@ -159,15 +133,7 @@ class Insert:
                                         "{video_title}",
                                         "{cover_url}",
                                         "{video_url}",
-                                        {duration},
-                                        "{publish_time}",
-                                        {play_cnt},
-                                        {like_cnt},
-                                        {share_cnt},
-                                        {comment_cnt},
-                                        '{crawler_rule}',
-                                        {width},
-                                        {height}) """
+                                        '{crawler_rule}') """
                     Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                     MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
                     Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
@@ -176,6 +142,7 @@ class Insert:
 
 if __name__ == "__main__":
     # Insert.insert_config("insert", "benshanzhufu", "dev")
-    # print(Insert.get_config("insert", "benshanzhufu", "filter", "dev"))
-    Insert.insert_video_from_feishu_to_mysql("insert-prod", "benshanzhufu", "prod")
+    # print(Insert.get_config("insert", "ganggangdouchuan", "filter", "dev"))
+    # Insert.insert_video_from_feishu_to_mysql("insert-dev", "ganggangdouchuan", "dev")
+    Insert.insert_video_from_feishu_to_mysql("insert-prod", "ganggangdouchuan", "prod")
     pass
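
The magic column indexes read above appear to line up with the row that `GanggangdouchuanRecommend.download_publish` writes to range "F2:V2" of sheet 070a67, assuming `Feishu.get_values_batch` returns rows starting at column A (index 0). A small illustrative map (not part of the repo):

```python
# 0-based column index (A=0) -> field, matching the values row written to "F2:V2".
FEISHU_COLUMNS = {
    5: "crawl time (column F)",
    7: "video_title (column H)",
    11: "our_video_link, parsed into video_id (column L)",
    12: "cover_url (column M)",
    13: "video_url (column N)",
}

for idx, meaning in FEISHU_COLUMNS.items():
    print(idx, meaning)
```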

+ 3 - 0
ganggangdouchuan/logs/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/4/14

+ 88 - 0
main/process_offline.sh

@@ -0,0 +1,88 @@
+#! /bin/bash
+# **********线下爬虫********** #
+
+env=$1  # 环境
+if [ ${env} = "dev" ];then
+  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
+  profile_path=/etc/profile
+  python=python3
+  log_path=${piaoquan_crawler_dir}main/main_logs/process-$(date +%Y-%m-%d).log
+else
+  piaoquan_crawler_dir=/Users/piaoquan/Desktop/crawler/piaoquan_crawler/
+#  profile_path=/etc/profile
+  profile_path=./base_profile
+  python=python3
+  log_path=${piaoquan_crawler_dir}main/main_logs/process-$(date +%Y-%m-%d).log
+fi
+
+time=$(date +%H:%M:%S)
+echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量..." >> ${log_path}
+cd ~ && source ${profile_path}
+echo "$(date "+%Y-%m-%d %H:%M:%S") 更新环境变量完成!" >> ${log_path}
+
+## 吉祥幸福
+#if [[ "$time" > "00:00:0" ]] && [[ "$time" < "08:59:59" ]]; then
+#  echo "开始启动 吉祥幸福 爬虫脚本任务" >> ${log_path}
+#  ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9
+#  ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9
+#  ps aux | grep run_ganggangdouchuan | grep -v grep | awk '{print $2}' | xargs kill -9
+#  ps -ef | grep "run_jixiangxingfu_recommend.py" | grep -v "grep"
+#  if [ "$?" -eq 1 ];then
+#    echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
+#    if [ ${env} = "dev" ];then
+#      cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./jixiangxingfu/jixiangxingfu_main/run_jixiangxingfu_recommend.py --log_type="recommend" --crawler="jixiangxingfu" --env="dev" jixiangxingfu/logs/nohup-recommend.log
+#    else
+#      cd ${piaoquan_crawler_dir} && /bin/sh main/scheduling_main.sh ./jixiangxingfu/jixiangxingfu_main/run_jixiangxingfu_recommend.py --log_type="recommend" --crawler="jixiangxingfu" --env="prod" jixiangxingfu/logs/nohup-recommend.log
+#    fi
+#    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+#  else
+#    echo "$(date "+%Y-%m-%d %H:%M:%S") 吉祥幸福 进程状态正常" >> ${log_path}
+#  fi
+#else
+#  echo "吉祥幸福 爬虫脚本任务结束" >> ${log_path}
+#fi
+
+## 知青天天看
+#if [[ "$time" > "09:00:0" ]] && [[ "$time" < "12:59:59" ]]; then
+#  echo "开始启动 知青天天看 爬虫脚本任务" >> ${log_path}
+#
+#else
+#  echo "知青天天看 爬虫脚本任务结束" >> ${log_path}
+#fi
+
+# 刚刚都传
+if [[ "$time" > "13:00:0" ]] && [[ "$time" < "22:59:59" ]]; then
+  echo "开始启动 刚刚都传 爬虫脚本任务" >> ${log_path}
+  ps aux | grep run_zhongmiaoyinxin | grep -v grep | awk '{print $2}' | xargs kill -9
+  ps aux | grep run_zhiqingtiantiankan | grep -v grep | awk '{print $2}' | xargs kill -9
+  ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
+  ps -ef | grep "run_ganggangdouchuan_recommend.py" | grep -v "grep"
+  if [ "$?" -eq 1 ];then
+    echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
+    if [ ${env} = "dev" ];then
+      cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./ganggangdouchuan/ganggangdouchuan_main/run_ganggangdouchuan_recommend.py --log_type="recommend" --crawler="ganggangdouchuan" --env="dev" ganggangdouchuan/logs/nohup-recommend.log
+    else
+      cd ${piaoquan_crawler_dir} && /bin/sh main/scheduling_main.sh ./ganggangdouchuan/ganggangdouchuan_main/run_ganggangdouchuan_recommend.py --log_type="recommend" --crawler="ganggangdouchuan" --env="prod" ganggangdouchuan/logs/nohup-recommend.log
+    fi
+    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+  else
+    echo "$(date "+%Y-%m-%d %H:%M:%S") 刚刚都传 进程状态正常" >> ${log_path}
+  fi
+else
+  echo "刚刚都传 爬虫脚本任务结束" >> ${log_path}
+fi
+
+## 众妙音信
+#if [[ "$time" > "17:00:0" ]] && [[ "$time" < "23:59:59" ]]; then
+#  echo "开始启动 众妙音信 爬虫脚本任务" >> ${log_path}
+#
+#else
+#  echo "众妙音信 爬虫脚本任务结束" >> ${log_path}
+#fi
+
+
+# 删除日志
+echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 5 天前的日志文件" >> ${log_path}
+find ${piaoquan_crawler_dir}main/main_logs/ -mtime +5 -name "*.log" -exec rm -rf {} \;
+echo "$(date "+%Y-%m-%d %H:%M:%S") 日志文件清理完毕" >> ${log_path}
+exit 0
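
The window gate for 刚刚都传 relies on string comparison of clock times, which works here because `HH:MM:SS` compares lexicographically in the right order. The same check expressed in Python, purely as an illustration of the shell `[[ "$time" > ... ]]` test:

```python
from datetime import datetime

now = datetime.now().strftime("%H:%M:%S")
# Same lexicographic comparison as the [[ "$time" > "13:00:0" ]] test above.
in_ganggangdouchuan_window = "13:00:0" < now < "22:59:59"
print(in_ganggangdouchuan_window)
```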

+ 5 - 0
main/scheduling_main.sh

@@ -17,6 +17,11 @@ elif [ ${env} = "--env=prod" ];then
   piaoquan_crawler_dir=/data5/piaoquan_crawler/
   profile_path=/etc/profile
   python=python
+elif [ ${env} = "--env=offline" ];then
+  piaoquan_crawler_dir=/Users/piaoquan/Desktop/crawler/piaoquan_crawler/
+  profile_path=./base_profile
+  node_path=/usr/local/bin/node
+  python=python3
 elif [ ${env} = "--env=dev" ];then
   piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
   profile_path=/etc/profile