wangkun 1 year ago
parent
commit
a252defe93

+ 3 - 2
README.MD

@@ -140,7 +140,7 @@ ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
 /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "play" "dev"
 /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "dev"
 /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "dev"
-/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "dev"
+/bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "dev"
 
 207 服务器
 # 调用 MQ 爬虫守护进程
@@ -157,8 +157,9 @@ ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
 * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "hour" "prod"
 * * * * * /usr/bin/sh /root/piaoquan_crawler/main/process_mq.sh "xng" "xiaoniangao" "author" "prod"
 * * * * * /bin/sh /Users/lieyunye/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kyk" "kanyikan" "recommend" "prod"
-* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykmv" "kanyikan" "recommend" "prod"
+* * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "kykjk" "kanyikan" "recommend" "prod"
 * * * * * /bin/sh /Users/kanyikan/Desktop/crawler/piaoquan_crawler/main/process_mq.sh "xg" "xigua" "recommend" "prod"
+# 启动Appium /bin/sh /Users/wangkun/Desktop/crawler/piaoquan_crawler/main/start_appium.sh "local" "local" "dev"
 
 线下服务器
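
Note: the README's monitor-management one-liner shown in the hunk headers above (`ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9`) can also be done from Python. A minimal sketch, not part of the commit; the keyword and SIGKILL mirror the shell pipeline:

```python
# Python equivalent of:
#   ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
import os
import signal
import subprocess

def kill_by_keyword(keyword: str = "run_monitor") -> None:
    out = subprocess.run(["ps", "aux"], capture_output=True, text=True).stdout
    for line in out.splitlines():
        if keyword in line:
            pid = int(line.split()[1])   # second `ps aux` column is the PID
            if pid != os.getpid():       # don't kill ourselves
                os.kill(pid, signal.SIGKILL)
```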
 

+ 0 - 0
kanyikan/kanyikan_main/run_kykmv_recommend.py → kanyikan/kanyikan_main/run_kykjk_recommend.py


+ 0 - 3
kanyikan/kanyikan_moment/__init__.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/21

+ 0 - 440
kanyikan/kanyikan_moment/kanyikan_moment.py

@@ -1,440 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/21
-import os
-import random
-import sys
-import time
-import requests
-import urllib3
-sys.path.append(os.getcwd())
-proxies = {"http": None, "https": None}
-
-
-class Moment:
-    # 过滤词库
-    @classmethod
-    def sensitive_words(cls):
-        word_list = []
-        # 从云文档读取所有敏感词,添加到词库列表
-        lists = Feishu.get_values_batch("moment", "kanyikan", "rofdM5")
-        for i in lists:
-            for j in i:
-                # 过滤空的单元格内容
-                if j is None:
-                    pass
-                else:
-                    word_list.append(j)
-        return word_list
-
-    # 朋友圈视频 ID
-    @classmethod
-    def moment_videoids(cls):
-        try:
-            videoid_list = []
-            # 从云文档读取所有敏感词,添加到词库列表
-            lists = Feishu.get_values_batch("moment", "kanyikan", "iK58HX")
-            for i in lists:
-                for j in i:
-                    # 过滤空的单元格内容
-                    if j is None:
-                        pass
-                    else:
-                        videoid_list.append(j)
-            return videoid_list
-        except Exception as e:
-            Common.logger("moment").error("获取朋友圈视频ID异常:{}", e)
-            return "t3256lo1cmk"
-
-    # 抓取基础规则
-    @staticmethod
-    def download_rule(d_duration, d_width, d_height, d_play_cnt, d_like_cnt, d_share_cnt):
-        """
-        抓取基础规则
-        :param d_duration: 时长
-        :param d_width: 宽
-        :param d_height: 高
-        :param d_play_cnt: 播放量
-        :param d_like_cnt: 点赞量
-        :param d_share_cnt: 分享量
-        :return: 满足规则,返回 True;反之,返回 False
-        """
-        if int(float(d_duration)) >= 40:
-            if int(d_width) >= 0 or int(d_height) >= 0:
-                if int(d_play_cnt) >= 50000:
-                    if int(d_like_cnt) >= 0:
-                        if int(d_share_cnt) >= 0:
-                            return True
-                        else:
-                            return False
-                    else:
-                        return False
-                else:
-                    return False
-            return False
-        return False
-
-    # 获取推荐视频列表
-    @classmethod
-    def get_recommend(cls):
-        url = "https://search.weixin.qq.com/cgi-bin/recwxa/snsgetvideoinfo?"
-        headers = {
-            "content-type": "application/json",
-            "Accept-Encoding": "gzip,compress,br,deflate",
-            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)"
-                          " AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
-                          " MicroMessenger/8.0.20(0x18001442) NetType/WIFI Language/zh_CN",
-            "Referer": "https://servicewechat.com/wxbb9a805eb4f9533c/236/page-frame.html"
-        }
-        time.sleep(1)
-        videoid = random.choice(cls.moment_videoids())
-        # Common.logger("moment").info("videoid:{}", videoid)
-        params = {
-            "vid": videoid,
-            "openid": "1924336296754305",
-            "model": "iPhone 11<iPhone12,1>14.7.1",
-            "sharesearchid": "8406805193800900989",
-            "shareOpenid": "oh_m45YffSEGxvDH--6s6g9ZkPxg",
-        }
-        try:
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
-            # Common.logger("moment").info("response:{}", r.json())
-            if "rec_video_list" not in r.json()["data"]:
-                Common.logger("moment").warning("该视频无推荐视频列表:{}", videoid)
-            else:
-                feeds = r.json()["data"]["rec_video_list"]
-                for i in range(len(feeds)):
-                    # video_id
-                    if "vid" in feeds[i]:
-                        video_id = feeds[i]["vid"]
-                    else:
-                        video_id = 0
-
-                    # video_title
-                    if "title" in feeds[i]:
-                        video_title = feeds[i]["title"].strip().replace("\n", "") \
-                                .replace("/", "").replace("\\", "").replace("\r", "") \
-                                .replace(":", "").replace("*", "").replace("?", "") \
-                                .replace("?", "").replace('"', "").replace("<", "") \
-                                .replace(">", "").replace("|", "").replace(" ", "") \
-                                .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
-                                .replace("小年糕", "").replace("#", "").replace("Merge", "")
-                    else:
-                        video_title = 0
-
-                    # video_play_cnt
-                    if "played_cnt" in feeds[i]:
-                        video_play_cnt = feeds[i]["played_cnt"]
-                    else:
-                        video_play_cnt = 0
-
-                    # video_comment_cnt
-                    if "comment_cnt" in feeds[i]:
-                        video_comment_cnt = feeds[i]["comment_cnt"]
-                    else:
-                        video_comment_cnt = 0
-
-                    # video_liked_cnt
-                    if "liked_cnt" in feeds[i]:
-                        video_liked_cnt = feeds[i]["liked_cnt"]
-                    else:
-                        video_liked_cnt = 0
-
-                    # video_share_cnt
-                    if "shared_cnt" in feeds[i]:
-                        video_share_cnt = feeds[i]["shared_cnt"]
-                    else:
-                        video_share_cnt = 0
-
-                    # video_duration
-                    if "duration" in feeds[i]:
-                        video_duration = feeds[i]["duration"]
-                    else:
-                        video_duration = 0
-
-                    # video_width / video_height
-                    if "width" in feeds[i] or "height" in feeds[i]:
-                        video_width = feeds[i]["width"]
-                        video_height = feeds[i]["height"]
-                    else:
-                        video_width = 0
-                        video_height = 0
-
-                    # video_send_time
-                    if "upload_time" in feeds[i]:
-                        video_send_time = feeds[i]["upload_time"]
-                    else:
-                        video_send_time = 0
-
-                    # user_name
-                    if "user_info" not in feeds[i]:
-                        user_name = 0
-                    elif "nickname" not in feeds[i]["user_info"]:
-                        user_name = 0
-                    else:
-                        user_name = feeds[i]["user_info"]["nickname"].strip().replace("\n", "")
-
-                    # user_id
-                    if "user_info" not in feeds[i]:
-                        user_id = 0
-                    elif "openid" not in feeds[i]["user_info"]:
-                        user_id = 0
-                    else:
-                        user_id = feeds[i]["user_info"]["openid"]
-
-                    # head_url
-                    if "user_info" not in feeds[i]:
-                        head_url = 0
-                    elif "headimg_url" not in feeds[i]["user_info"]:
-                        head_url = 0
-                    else:
-                        head_url = feeds[i]["user_info"]["headimg_url"]
-
-                    # cover_url
-                    if "cover_url" not in feeds[i]:
-                        cover_url = 0
-                    else:
-                        cover_url = feeds[i]["cover_url"]
-
-                    # video_url
-                    if "play_info" not in feeds[i]:
-                        video_url = 0
-                    elif "items" not in feeds[i]["play_info"]:
-                        video_url = 0
-                    else:
-                        video_url = feeds[i]["play_info"]["items"][-1]["play_url"]
-
-                    Common.logger("moment").info("video_id:{}", video_id)
-                    Common.logger("moment").info("video_title:{}", video_title)
-                    Common.logger("moment").info("user_name:{}", user_name)
-                    Common.logger("moment").info("video_play_cnt:{}", video_play_cnt)
-                    Common.logger("moment").info("video_liked_cnt:{}", video_liked_cnt)
-                    Common.logger("moment").info("video_share_cnt:{}", video_share_cnt)
-                    Common.logger("moment").info("video_duration:{}", video_duration)
-                    Common.logger("moment").info("video_width * video_height:{}*{}", video_width, video_height)
-                    Common.logger("moment").info("video_url:{}", video_url)
-
-                    # 过滤无效视频
-                    if video_id == 0 or video_title == 0 or video_duration == 0 or video_send_time == 0 or user_id == 0\
-                            or head_url == 0 or cover_url == 0 or video_url == 0:
-                        Common.logger("moment").warning("无效视频")
-                    # 抓取基础规则
-                    elif cls.download_rule(
-                            d_duration=video_duration, d_width=video_width, d_height=video_height,
-                            d_play_cnt=video_play_cnt, d_like_cnt=video_liked_cnt,
-                            d_share_cnt=video_share_cnt) is False:
-                        Common.logger("moment").info("不满足基础规则:{}", video_title)
-                    elif int(video_send_time) < 1659283200:
-                        Common.logger("moment").info('发布时间{}<2022-08-01', video_send_time)
-                    # 过滤词库
-                    elif any(word if word in video_title else False for word in cls.sensitive_words()) is True:
-                        Common.logger("moment").info("视频已中过滤词:{}".format(video_title))
-                    # 从已下载视频表去重:https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
-                    elif video_id in [j for m in Feishu.get_values_batch("moment", "kanyikan", "20ce0c") for j in m]:
-                        Common.logger("moment").info("该视频已下载:{}", video_title)
-                    # 从feeds视频表去重:https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=tGqZMX
-                    elif video_id in [j for n in Feishu.get_values_batch("moment", "kanyikan", "tGqZMX") for j in n]:
-                        Common.logger("moment").info("该视频已在moment_feeds中:{}", video_title)
-                    else:
-                        Common.logger("moment").info("该视频未下载,添加至moment_feeds中:{}", video_title)
-                        # 看一看+工作表,插入首行
-                        Feishu.insert_columns("moment", "kanyikan", "tGqZMX", "ROWS", 1, 2)
-                        # 获取当前时间
-                        get_feeds_time = int(time.time())
-                        # 准备写入云文档的数据
-                        values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time)),
-                                   "朋友圈",
-                                   video_id,
-                                   video_title,
-                                   video_play_cnt,
-                                   video_comment_cnt,
-                                   video_liked_cnt,
-                                   video_share_cnt,
-                                   video_duration,
-                                   str(video_width)+"*"+str(video_height),
-                                   time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(video_send_time)),
-                                   user_name,
-                                   user_id,
-                                   head_url,
-                                   cover_url,
-                                   video_url]]
-                        time.sleep(1)
-                        Feishu.update_values("moment", "kanyikan", "tGqZMX", "A2:P2", values)
-
-        except Exception as e:
-            Common.logger("moment").error("获取视频列表异常:{}", e)
-
-    # 下载/上传视频
-    @classmethod
-    def download_publish(cls, env):
-        try:
-            moment_feeds = Feishu.get_values_batch("moment", "kanyikan", "tGqZMX")
-            for i in range(1, len(moment_feeds) + 1):
-                time.sleep(1)
-                # download_push_time = moment_feeds[i][0]
-                download_video_id = moment_feeds[i][2]
-                download_video_title = moment_feeds[i][3]
-                download_video_play_cnt = moment_feeds[i][4]
-                download_video_comment_cnt = moment_feeds[i][5]
-                download_video_like_cnt = moment_feeds[i][6]
-                download_video_share_cnt = moment_feeds[i][7]
-                download_video_duration = moment_feeds[i][8]
-                download_video_resolution = moment_feeds[i][9]
-                download_video_send_time = moment_feeds[i][10]
-                download_user_name = moment_feeds[i][11]
-                download_user_id = moment_feeds[i][12]
-                download_head_url = moment_feeds[i][13]
-                download_cover_url = moment_feeds[i][14]
-                download_video_url = moment_feeds[i][15]
-
-                Common.logger("moment").info("正在判断第{}行,视频:{}", i, download_video_title)
-
-                # 发布时间的时间戳格式(秒为单位)
-                v_send_time = int(time.mktime(time.strptime(download_video_send_time, "%Y/%m/%d %H:%M:%S")))
-                # 抓取时间的时间戳格式(秒为单位)
-                # v_push_time = int(time.mktime(time.strptime(download_push_time, "%Y/%m/%d %H:%M:%S")))
-
-                # 过滤空行及空标题视频
-                if download_video_id is None\
-                        or download_video_id == ""\
-                        or download_video_title is None\
-                        or download_video_title == "":
-                    Common.logger("moment").warning("标题为空或空行,删除")
-                    # 删除行或列,可选 ROWS、COLUMNS
-                    Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)
-                    return
-                # # 视频的抓取时间小于 2 天
-                # elif int(time.time()) - v_push_time > 172800:
-                #     Common.logger("moment").info("抓取时间超过2天:{}", download_video_title)
-                #     # 删除行或列,可选 ROWS、COLUMNS
-                #     Feishu.dimension_range("tGqZMX", "ROWS", i + 1, i + 1)
-                #     return
-                # 视频发布时间不小于 2021-06-01 00:00:00
-                elif v_send_time < 1622476800:
-                    Common.logger("moment").info(
-                        "发布时间小于2021年6月:{},{}", download_video_title, download_video_send_time)
-                    # 删除行或列,可选 ROWS、COLUMNS
-                    Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)
-                    return
-                # 从已下载视频表中去重
-                elif download_video_id in [j for m in Feishu.get_values_batch(
-                        "moment", "kanyikan", "20ce0c") for j in m]:
-                    Common.logger("moment").info("视频已下载:{}", download_video_title)
-                    # 删除行或列,可选 ROWS、COLUMNS
-                    Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)
-                    return
-                # 从已下载视频表中去重
-                elif download_video_id in [j for m in Feishu.get_values_batch(
-                    "moment", "kanyikan", "ho98Ov") for j in m]:
-                    Common.logger("moment").info("视频已下载:{}", download_video_title)
-                    # 删除行或列,可选 ROWS、COLUMNS
-                    Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)
-                    return
-                else:
-                    Common.logger("moment").info("开始下载视频:{}", download_video_title)
-                    # 下载封面
-                    Common.download_method(log_type="moment", text="cover",
-                                           d_name=str(download_video_title), d_url=str(download_cover_url))
-                    # 下载视频
-                    Common.download_method(log_type="moment", text="video",
-                                           d_name=str(download_video_title), d_url=str(download_video_url))
-                    # 保存视频信息至 "./videos/{download_video_title}/info.txt"
-                    with open("./videos/" + download_video_title + "/" + "info.txt",
-                              "a", encoding="UTF-8") as f_a:
-                        f_a.write(str(download_video_id) + "\n" +
-                                  str(download_video_title) + "\n" +
-                                  str(download_video_duration) + "\n" +
-                                  str(download_video_play_cnt) + "\n" +
-                                  str(download_video_comment_cnt) + "\n" +
-                                  str(download_video_like_cnt) + "\n" +
-                                  str(download_video_share_cnt) + "\n" +
-                                  str(download_video_resolution) + "\n" +
-                                  str(int(time.mktime(
-                                      time.strptime(download_video_send_time, "%Y/%m/%d %H:%M:%S")))) + "\n" +
-                                  str(download_user_name) + "\n" +
-                                  str(download_head_url) + "\n" +
-                                  str(download_video_url) + "\n" +
-                                  str(download_cover_url) + "\n" +
-                                  "KANYIKAN_MOMENT")
-                    Common.logger("moment").info("==========视频信息已保存至info.txt==========")
-
-                    # 上传视频
-                    Common.logger("moment").info("开始上传视频:{}".format(download_video_title))
-                    our_video_id = Publish.upload_and_publish(log_type="moment",
-                                                              crawler="kanyikan",
-                                                              strategy="朋友圈抓取策略",
-                                                              our_uid="moment",
-                                                              env=env,
-                                                              oss_endpoint="out")
-                    our_video_link = "https://admin.piaoquantv.com/cms/post-detail/" + str(our_video_id) + "/info"
-                    Common.logger("moment").info("视频上传完成:{}", download_video_title)
-
-                    # 保存视频 ID 到云文档:https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=20ce0c
-                    Common.logger("moment").info("保存视频ID至云文档:{}", download_video_title)
-                    # 视频ID工作表,插入首行
-                    Feishu.insert_columns("moment", "kanyikan", "20ce0c", "ROWS", 1, 2)
-                    # 视频ID工作表,首行写入数据
-                    upload_time = int(time.time())
-                    values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(upload_time)),
-                               "朋友圈",
-                               str(download_video_id),
-                               str(download_video_title),
-                               our_video_link,
-                               download_video_play_cnt,
-                               download_video_comment_cnt,
-                               download_video_like_cnt,
-                               download_video_share_cnt,
-                               download_video_duration,
-                               str(download_video_resolution),
-                               str(download_video_send_time),
-                               str(download_user_name),
-                               str(download_user_id),
-                               str(download_head_url),
-                               str(download_cover_url),
-                               str(download_video_url)]]
-                    time.sleep(1)
-                    Feishu.update_values("moment", "kanyikan", "20ce0c", "F2:W2", values)
-
-                    # 保存视频信息到监控表
-                    Common.logger("moment").info("添加视频到监控表:{}", download_video_title)
-                    # 插入空行
-                    time.sleep(1)
-                    Feishu.insert_columns("moment", "monitor", "6fed97", "ROWS", 1, 2)
-                    # 视频信息写入监控表
-                    values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(upload_time))),
-                               str(download_video_id),
-                               download_video_title,
-                               our_video_link,
-                               download_video_duration,
-                               str(download_video_send_time),
-                               download_video_play_cnt]]
-                    time.sleep(1)
-                    Feishu.update_values("moment", "monitor", "6fed97", "F2:L2", values)
-
-                    # 删除行或列,可选 ROWS、COLUMNS
-                    Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)
-                    return
-        except Exception as e:
-            Common.logger("moment").error("下载视频异常:{}", e)
-            # 删除行或列,可选 ROWS、COLUMNS
-            Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", 2, 2)
-
-    # 执行下载/上传
-    @classmethod
-    def run_download_publish(cls, env):
-        try:
-            while True:
-                if len(Feishu.get_values_batch("moment", "kanyikan", "tGqZMX")) == 1:
-                    break
-                else:
-                    cls.download_publish(env)
-        except Exception as e:
-            Common.logger("moment").error("执行下载/上传异常:{}", e)
-
-
-if __name__ == "__main__":
-    kuaishou = Moment()
-    kuaishou.run_download_publish("dev")
-
-    pass
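
Note: the `download_rule` method deleted above encodes its thresholds as five nested if/else branches. The same check collapses to a single boolean expression; a minimal equivalent sketch with identical thresholds (including the always-true like/share bounds, kept for symmetry):

```python
def download_rule(d_duration, d_width, d_height, d_play_cnt, d_like_cnt, d_share_cnt):
    # Same thresholds as the removed nested version: duration >= 40s,
    # any resolution, play count >= 50000; like/share minimums of 0
    # are effectively always satisfied.
    return (int(float(d_duration)) >= 40
            and (int(d_width) >= 0 or int(d_height) >= 0)
            and int(d_play_cnt) >= 50000
            and int(d_like_cnt) >= 0
            and int(d_share_cnt) >= 0)
```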

+ 0 - 125
kanyikan/kanyikan_recommend/kanyikan_recommend0705.py

@@ -4,17 +4,13 @@
 import json
 import os
 import random
-import shutil
 import sys
 import time
-from hashlib import md5
 import requests
 import urllib3
 sys.path.append(os.getcwd())
 from common.mq import MQ
 from common.common import Common
-from common.feishu import Feishu
-from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 from common.public import get_config_from_mysql, download_rule
 proxies = {"http": None, "https": None}
@@ -139,12 +135,6 @@ class KanyikanRecommend:
                         Common.logger(log_type, crawler).info('视频已下载\n')
                         Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
-                        # cls.download_publish(log_type=log_type,
-                        #                      crawler=crawler,
-                        #                      our_uid=our_uid,
-                        #                      video_dict=video_dict,
-                        #                      rule_dict=rule_dict,
-                        #                      env=env)
                         video_dict["out_user_id"] = video_dict["user_id"]
                         video_dict["platform"] = crawler
                         video_dict["strategy"] = log_type
@@ -164,121 +154,6 @@ class KanyikanRecommend:
             Common.logger(log_type, crawler).error(f"抓取列表页时异常:{e}\n")
             Common.logging(log_type, crawler, env, f"抓取列表页时异常:{e}\n")
 
-    @classmethod
-    def download_publish(cls, log_type, crawler, our_uid, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text='video', title=video_dict['video_title'], url=video_dict['video_url'])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
-        # 保存视频信息至txt
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        if env == "dev":
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=cls.strategy,
-                                                      our_uid=our_uid,
-                                                      env=env,
-                                                      oss_endpoint=oss_endpoint)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-        else:
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=cls.strategy,
-                                                      our_uid=our_uid,
-                                                      env=env,
-                                                      oss_endpoint="out")
-
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        # 视频信息保存数据库
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                user_id,
-                                                out_user_id,
-                                                platform,
-                                                strategy,
-                                                out_video_id,
-                                                video_title,
-                                                cover_url,
-                                                video_url,
-                                                duration,
-                                                publish_time,
-                                                play_cnt,
-                                                crawler_rule,
-                                                width,
-                                                height)
-                                                values({our_video_id},
-                                                {our_uid},
-                                                "{video_dict['user_id']}",
-                                                "{cls.platform}",
-                                                "{cls.strategy}",
-                                                "{video_dict['video_id']}",
-                                                "{video_dict['video_title']}",
-                                                "{video_dict['cover_url']}",
-                                                "{video_dict['video_url']}",
-                                                {int(video_dict['duration'])},
-                                                "{video_dict['publish_time_str']}",
-                                                {int(video_dict['play_cnt'])},
-                                                '{json.dumps(rule_dict)}',
-                                                {int(video_dict['video_width'])},
-                                                {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
-        Common.logger(log_type, crawler).info('视频信息写入数据库成功')
-        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
-
-        # 保存视频信息到云文档:
-        Feishu.insert_columns(log_type, crawler, "20ce0c", "ROWS", 1, 2)
-        # 看一看+ ,视频ID工作表,首行写入数据
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
-                   "推荐榜",
-                   str(video_dict["video_id"]),
-                   str(video_dict["video_title"]),
-                   our_video_link,
-                   video_dict["play_cnt"],
-                   video_dict["comment_cnt"],
-                   video_dict["like_cnt"],
-                   video_dict["share_cnt"],
-                   video_dict["duration"],
-                   f'{video_dict["video_width"]}*{video_dict["video_height"]}',
-                   video_dict["publish_time_str"],
-                   video_dict["user_name"],
-                   video_dict["user_id"],
-                   video_dict["avatar_url"],
-                   video_dict["cover_url"],
-                   video_dict["video_url"]]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "20ce0c", "F2:Z2", values)
-        Common.logger(log_type, crawler).info("视频信息保存至云文档成功\n")
-        Common.logging(log_type, crawler, env, "视频信息保存至云文档成功\n")
-
 
 if __name__ == "__main__":
     print(get_config_from_mysql(log_type="recommend",
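
Note: the removed `download_publish` built its INSERT statement by interpolating video fields straight into an f-string, which breaks on titles containing quotes and invites SQL injection. A sketch of a parameterized alternative; a pymysql-style cursor is assumed here, whereas the project's `MysqlHelper.update_values` takes a raw SQL string, so this is an alternative pattern, not a drop-in replacement:

```python
import json

def insert_video_row(cursor, our_video_id, our_uid, platform, strategy,
                     rule_dict, video_dict):
    # 15 columns, 15 placeholders; values are bound by the driver,
    # so quotes in titles no longer break the statement.
    sql = ("insert into crawler_video(video_id, user_id, out_user_id, "
           "platform, strategy, out_video_id, video_title, cover_url, "
           "video_url, duration, publish_time, play_cnt, crawler_rule, "
           "width, height) values "
           "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, (
        our_video_id, our_uid, video_dict["user_id"], platform, strategy,
        video_dict["video_id"], video_dict["video_title"],
        video_dict["cover_url"], video_dict["video_url"],
        int(video_dict["duration"]), video_dict["publish_time_str"],
        int(video_dict["play_cnt"]), json.dumps(rule_dict),
        int(video_dict["video_width"]), int(video_dict["video_height"])))
```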

+ 47 - 0
main/start_appium.sh

@@ -0,0 +1,47 @@
+#! /bin/bash
+log_type=$1   # 爬虫策略
+crawler=$2    # 哪款爬虫
+env=$3        # 爬虫运行环境,正式环境: prod / 测试环境: dev
+
+if [ ${crawler} = "shipinhao" ] && [ ${log_type} = "recommend" ] && [ ${env} = "prod" ];then
+  piaoquan_crawler_dir=/Users/lieyunye/Desktop/crawler/piaoquan_crawler/
+  profile_path=~/.bash_profile
+  node_path=/usr/local/bin/node
+  log_path=${piaoquan_crawler_dir}main/main_logs/start-appium-$(date +%Y-%m-%d).log
+elif [ ${crawler} = "jixiangxingfu" ] || [ ${crawler} = "zhongmiaoyinxin" ] || [ ${crawler} = "zhiqingtiantiankan" ] || [ ${crawler} = "ganggangdouchuan" ];then
+  piaoquan_crawler_dir=/Users/piaoquan/Desktop/piaoquan_crawler/
+  profile_path=./base_profile
+  node_path=/usr/local/bin/node
+  log_path=${piaoquan_crawler_dir}main/main_logs/start-appium-$(date +%Y-%m-%d).log
+elif [ ${crawler} = "xigua" ] || [ ${log_type} = "recommend" ];then
+  piaoquan_crawler_dir=/Users/kanyikan/Desktop/crawler/piaoquan_crawler/
+  profile_path=/etc/profile
+  node_path=/usr/local/bin/node
+  log_path=${piaoquan_crawler_dir}main/main_logs/start-appium-$(date +%Y-%m-%d).log
+elif [ ${crawler} = "shipinhao" ] || [ ${log_type} = "search" ];then
+  piaoquan_crawler_dir=/Users/piaoquan/Desktop/piaoquan_crawler/
+  profile_path=/etc/profile
+  node_path=/usr/local/bin/node
+  log_path=${piaoquan_crawler_dir}main/main_logs/start-appium-$(date +%Y-%m-%d).log
+else
+  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
+  profile_path=/etc/profile
+  node_path=/opt/homebrew/bin/node
+  log_path=${piaoquan_crawler_dir}main/main_logs/start-appium-$(date +%Y-%m-%d).log
+fi
+
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在检测 Appium 运行状态 ..." >> ${log_path}
+ps -ef | grep "/Applications/Appium.app/Contents/Resources/app/node_modules/appium/build/lib/main.js" | grep -v "grep"
+if [ "$?" -eq 1 ];then
+  echo "$(date "+%Y-%m-%d %H:%M:%S") Appium异常停止,正在重启!" >> ${log_path}
+  nohup ${node_path} /Applications/Appium.app/Contents/Resources/app/node_modules/appium/build/lib/main.js >>./main/main_logs/Appium.log 2>&1 &
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启Appium完毕!" >> ${log_path}
+else
+  echo "$(date "+%Y-%m-%d %H:%M:%S") Appium 运行状态正常。" >> ${log_path}
+fi
+
+# 删除日志
+echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 10 天前的日志文件" >> ${log_path}
+find ${piaoquan_crawler_dir}main/main_logs/ -mtime +10 -name "*.log" -exec rm -rf {} \;
+echo "$(date "+%Y-%m-%d %H:%M:%S") 日志文件清理完毕" >> ${log_path}
+exit 0
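
Note: the new `start_appium.sh` checks whether Appium's `main.js` is alive via `ps | grep` and restarts it with `nohup` if not. The same check-and-restart step in Python, as a sketch under the script's own path assumptions (`node_path` varies per host, as the script's branches show):

```python
import subprocess

APPIUM_MAIN = ("/Applications/Appium.app/Contents/Resources/app/"
               "node_modules/appium/build/lib/main.js")

def ensure_appium_running(node_path: str = "/usr/local/bin/node") -> None:
    out = subprocess.run(["ps", "-ef"], capture_output=True, text=True).stdout
    if not any(APPIUM_MAIN in line and "grep" not in line
               for line in out.splitlines()):
        # Detach like `nohup ... &`: redirect output and don't wait.
        with open("./main/main_logs/Appium.log", "a") as log:
            subprocess.Popen([node_path, APPIUM_MAIN],
                             stdout=log, stderr=subprocess.STDOUT)
```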

+ 124 - 0
shipinhao/shipinhao_main/run_sph_recommend.py

@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/27
+import argparse
+import random
+from mq_http_sdk.mq_client import *
+from mq_http_sdk.mq_consumer import *
+from mq_http_sdk.mq_exception import MQExceptionBase
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.public import get_consumer, ack_message, task_fun_mq
+from common.scheduling_db import MysqlHelper
+from shipinhao.shipinhao_recommend.recommend_h5 import RecommendH5
+from shipinhao.shipinhao_recommend.shipinhao_recommend import ShipinhaoRecommend
+
+
+class ShipinhaoRecommendMain:
+    @classmethod
+    def shipinhao_recommend_main(cls, log_type, crawler, topic_name, group_id, env):
+        consumer = get_consumer(topic_name, group_id)
+        # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
+        # 长轮询时间3秒(最多可设置为30秒)。
+        wait_seconds = 30
+        # 一次最多消费3条(最多可设置为16条)。
+        batch = 1
+        Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                              f'WaitSeconds:{wait_seconds}\n'
+                                              f'TopicName:{topic_name}\n'
+                                              f'MQConsumer:{group_id}')
+        Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                               f'WaitSeconds:{wait_seconds}\n'
+                                               f'TopicName:{topic_name}\n'
+                                               f'MQConsumer:{group_id}')
+        while True:
+            try:
+                # 长轮询消费消息。
+                recv_msgs = consumer.consume_message(batch, wait_seconds)
+                for msg in recv_msgs:
+                    Common.logger(log_type, crawler).info(f"Receive\n"
+                                                          f"MessageId:{msg.message_id}\n"
+                                                          f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                          f"MessageTag:{msg.message_tag}\n"
+                                                          f"ConsumedTimes:{msg.consumed_times}\n"
+                                                          f"PublishTime:{msg.publish_time}\n"
+                                                          f"Body:{msg.message_body}\n"
+                                                          f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                          f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                          f"Properties:{msg.properties}")
+                    Common.logging(log_type, crawler, env, f"Receive\n"
+                                                           f"MessageId:{msg.message_id}\n"
+                                                           f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                           f"MessageTag:{msg.message_tag}\n"
+                                                           f"ConsumedTimes:{msg.consumed_times}\n"
+                                                           f"PublishTime:{msg.publish_time}\n"
+                                                           f"Body:{msg.message_body}\n"
+                                                           f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                           f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                           f"Properties:{msg.properties}")
+                    # ack_mq_message
+                    ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
+
+                    # 处理爬虫业务
+                    task_dict = task_fun_mq(msg.message_body)['task_dict']
+                    rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+                    task_id = task_dict['id']
+                    select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                    user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+                    our_uid_list = []
+                    for user in user_list:
+                        our_uid_list.append(user["uid"])
+                    our_uid = random.choice(our_uid_list)
+                    Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                    Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
+                    Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                    Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
+                    Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
+                    Common.logging(log_type, crawler, env, f"用户列表:{user_list}\n")
+                    Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+                    Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
+
+                    # 每轮扫描视频数
+                    scan_count = 20
+                    # 抓取符合规则的视频列表
+                    ShipinhaoRecommend.get_recommend_list(log_type=log_type,
+                                                          crawler=crawler,
+                                                          rule_dict=rule_dict,
+                                                          scan_count=scan_count,
+                                                          env=env)
+                    # 抓取符合规则视频的 URL,并发送 MQ 消息给 ETL
+                    RecommendH5.download_videos(log_type=log_type,
+                                                crawler=crawler,
+                                                env=env,
+                                                rule_dict=rule_dict,
+                                                our_uid=our_uid)
+                    ShipinhaoRecommend.download_video_list = []
+                    Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                    Common.logging(log_type, crawler, env, '抓取一轮结束\n')
+
+            except MQExceptionBase as err:
+                # Topic中没有消息可消费。
+                if err.type == "MessageNotExist":
+                    Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                    Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
+                    continue
+
+                Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+                Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
+                time.sleep(2)
+                continue
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
+    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
+    parser.add_argument('--crawler')  ## 添加参数
+    parser.add_argument('--topic_name')  ## 添加参数
+    parser.add_argument('--group_id')  ## 添加参数
+    parser.add_argument('--env')  ## 添加参数
+    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
+    ShipinhaoRecommendMain.shipinhao_recommend_main(log_type=args.log_type,
+                                                    crawler=args.crawler,
+                                                    topic_name=args.topic_name,
+                                                    group_id=args.group_id,
+                                                    env=args.env)
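
Note: `run_sph_recommend.py` acks each MQ message before the crawl runs (`ack_message` is called ahead of `task_fun_mq`), so a crash mid-crawl drops the task rather than redelivering it. A condensed sketch of the consume/ack skeleton, assuming the aliyun `mq_http_sdk` consumer interface used in this file; acking after handling, as below, gives at-least-once semantics instead:

```python
from mq_http_sdk.mq_exception import MQExceptionBase

def consume_forever(consumer, handle, batch=1, wait_seconds=30):
    while True:
        try:
            msgs = consumer.consume_message(batch, wait_seconds)  # long poll
            for msg in msgs:
                handle(msg)  # run the crawl task for this message
            consumer.ack_message([m.receipt_handle for m in msgs])
        except MQExceptionBase as err:
            if getattr(err, "type", "") == "MessageNotExist":
                continue  # topic empty; keep polling
            raise
```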

+ 24 - 15
shipinhao/shipinhao_main/run_sph_recommend_dev.py

@@ -3,6 +3,8 @@
 # @Time: 2023/7/26
 import os
 import sys
+import time
+
 sys.path.append(os.getcwd())
 from common.common import Common
 from shipinhao.shipinhao_recommend.recommend_h5 import RecommendH5
@@ -12,21 +14,28 @@ from shipinhao.shipinhao_recommend.shipinhao_recommend import ShipinhaoRecommend
 class ShipinhaoRecommendMain:
     @classmethod
     def shipinhao_recommend_main(cls, log_type, crawler, env):
-        Common.logger(log_type, crawler).info("开始抓取视频号推荐\n")
-        rule_dict = {"period": {"min": 365, "max": 365},
-                     "duration": {"min": 10, "max": 1800},
-                     "favorite_cnt": {"min": 5000, "max": 0},
-                     "share_cnt": {"min": 1000, "max": 0}}
-        ShipinhaoRecommend.get_recommend_list(log_type=log_type,
-                                              crawler=crawler,
-                                              rule_dict=rule_dict,
-                                              env=env)
-        RecommendH5.download_videos(log_type=log_type,
-                                    crawler=crawler,
-                                    env=env,
-                                    rule_dict=rule_dict,
-                                    our_uid="6267140")
-        Common.logger(log_type, crawler).info("抓取一轮结束\n")
+        while True:
+            Common.logger(log_type, crawler).info("开始抓取视频号推荐\n")
+            Common.logging(log_type, crawler, env, "开始抓取视频号推荐\n")
+            scan_count = 20
+            rule_dict = {"period": {"min": 365, "max": 365},
+                         "duration": {"min": 10, "max": 1800},
+                         "favorite_cnt": {"min": 5000, "max": 0},
+                         "share_cnt": {"min": 1000, "max": 0}}
+            ShipinhaoRecommend.get_recommend_list(log_type=log_type,
+                                                  crawler=crawler,
+                                                  rule_dict=rule_dict,
+                                                  scan_count = scan_count,
+                                                  env=env)
+            RecommendH5.download_videos(log_type=log_type,
+                                        crawler=crawler,
+                                        env=env,
+                                        rule_dict=rule_dict,
+                                        our_uid="6267140")
+            Common.logger(log_type, crawler).info("抓取一轮结束\n")
+            Common.logging(log_type, crawler, env, "抓取一轮结束\n")
+            ShipinhaoRecommend.download_video_list = []
+            time.sleep(5)
 
 
 if __name__ == "__main__":
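
Note: the `rule_dict` here sets `"max": 0` for `favorite_cnt` and `share_cnt` while their minimums are 5000 and 1000. Reading these as lower-bound-only rules, 0 appears to mean "no upper limit"; that interpretation is an assumption to verify against `common.public.download_rule`. A checker sketch under that assumption:

```python
# Assumption: "max": 0 means "no upper bound" in rule_dict entries.
def satisfies(value: int, bound: dict) -> bool:
    if value < bound.get("min", 0):
        return False
    maximum = bound.get("max", 0)
    return maximum == 0 or value <= maximum

rule = {"favorite_cnt": {"min": 5000, "max": 0}}
print(satisfies(6000, rule["favorite_cnt"]))  # True: above min, no max cap
```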

+ 45 - 14
shipinhao/shipinhao_recommend/recommend_h5.py

@@ -23,6 +23,7 @@ class RecommendH5:
     @classmethod
     def start_wechat(cls, log_type, crawler, env):
         Common.logger(log_type, crawler).info('启动微信')
+        Common.logging(log_type, crawler, env, '启动微信')
         if env == "dev":
             chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v107/chromedriver"
         else:
@@ -74,11 +75,13 @@ class RecommendH5:
 
     # noinspection PyBroadException
     @classmethod
-    def check_to_webview(cls, log_type, crawler, driver: WebDriver):
+    def check_to_webview(cls, log_type, crawler, env, driver: WebDriver):
         webviews = driver.contexts
         Common.logger(log_type, crawler).info(f"webviews:{webviews}")
+        Common.logging(log_type, crawler, env, f"webviews:{webviews}")
         driver.switch_to.context(webviews[1])
         Common.logger(log_type, crawler).info(driver.current_context)
+        Common.logging(log_type, crawler, env, driver.current_context)
         time.sleep(1)
         windowHandles = driver.window_handles
         for handle in windowHandles:
@@ -87,9 +90,11 @@ class RecommendH5:
                 time.sleep(1)
                 driver.find_element(By.XPATH, '//div[@class="unit"]')
                 Common.logger(log_type, crawler).info('切换 webview 成功')
+                Common.logging(log_type, crawler, env, '切换 webview 成功')
                 return "成功"
             except Exception:
                 Common.logger(log_type, crawler).info("切换 webview 失败")
+                Common.logging(log_type, crawler, env, "切换 webview 失败")
 
     @classmethod
     def search_video(cls, log_type, crawler, env, video_dict, rule_dict, our_uid):
@@ -98,17 +103,18 @@ class RecommendH5:
         # 点击微信搜索框,并输入搜索词
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info("点击搜索框")
+        Common.logging(log_type, crawler, env, "点击搜索框")
         driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click()  # 微信8.0.30版本
         time.sleep(0.5)
         driver.find_element(By.ID, 'com.tencent.mm:id/cd7').clear().send_keys(
             video_dict['video_title'].replace('"', "").replace('“', "").replace('”', "").replace('#', ""))  # 微信8.0.30版本
         # driver.press_keycode(AndroidKey.ENTER)
         Common.logger(log_type, crawler).info("进入搜索词页面")
+        Common.logging(log_type, crawler, env, "进入搜索词页面")
         driver.find_element(By.ID, 'com.tencent.mm:id/m94').click()  # 微信8.0.30版本
-        time.sleep(5)
 
         # 切换到微信搜索结果页 webview
-        check_to_webview = cls.check_to_webview(log_type, crawler, driver)
+        check_to_webview = cls.check_to_webview(log_type, crawler, env, driver)
         if check_to_webview is None:
             Common.logger(log_type, crawler).info("切换到视频号 webview 失败\n")
             Common.logging(log_type, crawler, env, "切换到视频号 webview 失败\n")
@@ -127,6 +133,7 @@ class RecommendH5:
             h5_page = cls.search_elements(driver, '//*[@class="mixed-box__bd"]')
             if h5_page is None:
                 Common.logger(log_type, crawler).info('未发现H5页面')
+                Common.logging(log_type, crawler, env, '未发现H5页面')
                 driver.refresh()
             else:
                 break
@@ -136,23 +143,28 @@ class RecommendH5:
             return
 
         Common.logger(log_type, crawler).info('获取视频列表\n')
+        Common.logging(log_type, crawler, env, '获取视频列表\n')
         video_elements = cls.search_elements(driver, '//div[@class="rich-media active__absolute"]')
         if video_elements is None:
             Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
+            Common.logging(log_type, crawler, env, f'video_elements:{video_elements}')
             return
 
         for i, video_element in enumerate(video_elements):
             try:
                 if video_element is None:
                     Common.logger(log_type, crawler).info('到底啦~\n')
+                    Common.logging(log_type, crawler, env, '到底啦~\n')
                     return
 
                 Common.logger(log_type, crawler).info(f'拖动"视频"列表第{i + 1}条至屏幕中间')
+                Common.logging(log_type, crawler, env, f'拖动"视频"列表第{i + 1}条至屏幕中间')
                 time.sleep(3)
                 driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
                                       video_element)
                 if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
                     Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
+                    Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
                     return
                 h5_video_title = \
                 video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[i].text[:40]
@@ -181,6 +193,7 @@ class RecommendH5:
 
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
 
                     video_dict["out_user_id"] = h5_out_user_id
                     video_dict["platform"] = crawler
@@ -193,30 +206,48 @@ class RecommendH5:
                     video_dict["publish_time"] = video_dict["publish_time_str"]
                     mq.send_msg(video_dict)
                     Common.logger(log_type, crawler).info("已抓取到目标视频\n")
+                    Common.logging(log_type, crawler, env, "已抓取到目标视频\n")
                     driver.quit()
                     return
                 else:
                     Common.logger(log_type, crawler).info(f"video_dict['video_title']:{video_dict['video_title']}")
+                    Common.logging(log_type, crawler, env, f"video_dict['video_title']:{video_dict['video_title']}")
                     Common.logger(log_type, crawler).info(f"h5_video_title:{h5_video_title}")
+                    Common.logging(log_type, crawler, env, f"h5_video_title:{h5_video_title}")
                     Common.logger(log_type, crawler).info(f"title_similarity:{title_similarity}")
+                    Common.logging(log_type, crawler, env, f"title_similarity:{title_similarity}")
                     Common.logger(log_type, crawler).info(f"video_dict['user_name']:{video_dict['user_name']}")
+                    Common.logging(log_type, crawler, env, f"video_dict['user_name']:{video_dict['user_name']}")
                     Common.logger(log_type, crawler).info(f"h5_user_name:{h5_user_name}")
-                    Common.logger(log_type, crawler).info(f"user_name_similarity:{user_name_similarity}\n")
+                    Common.logging(log_type, crawler, env, f"h5_user_name:{h5_user_name}")
+                    Common.logger(log_type, crawler).info(f"user_name_similarity:{user_name_similarity}")
+                    Common.logging(log_type, crawler, env, f"user_name_similarity:{user_name_similarity}")
             except Exception as e:
                 Common.logger(log_type, crawler).info(f"抓取单条H5视频时异常:{e}\n")
+                Common.logging(log_type, crawler,env, f"抓取单条H5视频时异常:{e}\n")
+        Common.logger(log_type, crawler).info("未找到目标视频\n")
+        Common.logging(log_type, crawler, env, "未找到目标视频\n")
 
     @classmethod
     def download_videos(cls, log_type, crawler, env, rule_dict, our_uid):
-        Common.logger(log_type, crawler).info(f'共{len(ShipinhaoRecommend.download_video_list)}条视频待抓取')
-        Common.logger(log_type, crawler).info(f'download_video_list:{ShipinhaoRecommend.download_video_list}\n')
-        if len(ShipinhaoRecommend.download_video_list) == 0:
-            Common.logger(log_type, crawler).info("没有待下载的视频\n")
-            return
-        for video_dict in ShipinhaoRecommend.download_video_list:
-            try:
-                cls.search_video(log_type, crawler, env, video_dict, rule_dict, our_uid)
-            except Exception as e:
-                Common.logger(log_type, crawler).info(f"抓取视频异常:{e}\n")
+        try:
+            Common.logger(log_type, crawler).info(f'共{len(ShipinhaoRecommend.download_video_list)}条视频待抓取')
+            Common.logging(log_type, crawler, env, f'共{len(ShipinhaoRecommend.download_video_list)}条视频待抓取')
+            Common.logger(log_type, crawler).info(f'download_video_list:{ShipinhaoRecommend.download_video_list}\n')
+            Common.logging(log_type, crawler, env, f'download_video_list:{ShipinhaoRecommend.download_video_list}\n')
+            if len(ShipinhaoRecommend.download_video_list) == 0:
+                Common.logger(log_type, crawler).info("没有待下载的视频\n")
+                Common.logging(log_type, crawler, env, "没有待下载的视频\n")
+                return
+            for video_dict in ShipinhaoRecommend.download_video_list:
+                try:
+                    cls.search_video(log_type, crawler, env, video_dict, rule_dict, our_uid)
+                except Exception as e:
+                    Common.logger(log_type, crawler).info(f"抓取视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取视频异常:{e}\n")
+        except Exception as e:
+            Common.logger(log_type, crawler).info(f"download_videos异常:{e}\n")
+            Common.logging(log_type, crawler, env, f"download_videos异常:{e}\n")
 
 
 if __name__ == "__main__":
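
Note: `search_video` compares the recommended video's title and author against what the H5 search results return (`title_similarity`, `user_name_similarity`) before sending the MQ message. The similarity helper itself is not part of this diff; a purely illustrative sketch of one plausible implementation using `difflib`:

```python
# Illustrative only: one way to compute the similarity scores logged above.
from difflib import SequenceMatcher

def similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()  # 0.0 .. 1.0

title_similarity = similarity("示例标题", "示例标题(完整版)")
print(f"title_similarity:{title_similarity:.2f}")
```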

+ 64 - 40
shipinhao/shipinhao_recommend/shipinhao_recommend.py

@@ -20,7 +20,6 @@ from common.scheduling_db import MysqlHelper
 class ShipinhaoRecommend:
     platform = "视频号"
     download_video_list = []
-    scan_count = 20
 
     @classmethod
     def repeat_out_video_id(cls, log_type, crawler, out_video_id, env):
@@ -31,6 +30,7 @@ class ShipinhaoRecommend:
     @classmethod
     def start_wechat(cls, log_type, crawler, env):
         Common.logger(log_type, crawler).info('启动微信')
+        Common.logging(log_type, crawler, env, '启动微信')
         if env == "dev":
             chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v107/chromedriver"
         else:
@@ -66,8 +66,9 @@ class ShipinhaoRecommend:
         return driver
 
     @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, env, driver: WebDriver):
+    def get_videoList(cls, log_type, crawler, rule_dict, env, scan_count, driver: WebDriver):
         Common.logger(log_type, crawler).info("进入发现页")
+        Common.logging(log_type, crawler, env, "进入发现页")
         tabs = driver.find_elements(By.ID, "com.tencent.mm:id/f2s")
         for tab in tabs:
             if tab.text == "发现":
@@ -76,6 +77,7 @@ class ShipinhaoRecommend:
                 break
 
         Common.logger(log_type, crawler).info('点击"视频号"')
+        Common.logging(log_type, crawler, env, '点击"视频号"')
         textviews = driver.find_elements(By.ID, "android:id/title")
         for textview in textviews:
             if textview.text == "视频号":
@@ -85,37 +87,48 @@ class ShipinhaoRecommend:
 
         # 关闭青少年模式弹框
         Common.logger(log_type, crawler).info("尝试关闭青少年模式弹框\n")
+        Common.logging(log_type, crawler, env, "尝试关闭青少年模式弹框\n")
         try:
             driver.find_element(By.ID, "com.tencent.mm:id/lqz").click()
         except NoSuchElementException:
             pass
 
-        for i in range(cls.scan_count):
-            Common.logger(log_type, crawler).info(f"第{i + 1}条视频")
-            if len(driver.find_elements(By.ID, "com.tencent.mm:id/dkf")) != 0:
-                Common.logger(log_type, crawler).info("这是一个直播间,滑动至下一个视频\n")
+        for i in range(scan_count):
+            try:
+                Common.logger(log_type, crawler).info(f"第{i + 1}条视频")
+                Common.logging(log_type, crawler, env, f"第{i + 1}条视频")
+                if len(driver.find_elements(By.ID, "com.tencent.mm:id/dkf")) != 0:
+                    Common.logger(log_type, crawler).info("这是一个直播间,滑动至下一个视频\n")
+                    Common.logging(log_type, crawler, env, "这是一个直播间,滑动至下一个视频\n")
+                    driver.swipe(10, 1600, 10, 300, 200)
+                    continue
+                video_dict = cls.get_video_info(driver)
+                for k, v in video_dict.items():
+                    Common.logger(log_type, crawler).info(f"{k}:{v}")
+                Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
+
+                if video_dict["video_title"] is None:
+                    Common.logger(log_type, crawler).info("无效视频")
+                    Common.logging(log_type, crawler, env, "无效视频")
+                elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                    Common.logger(log_type, crawler).info("不满足抓取规则")
+                    Common.logging(log_type, crawler, env, "不满足抓取规则\n")
+                elif cls.repeat_out_video_id(log_type, crawler, video_dict["video_id"], env) != 0:
+                    Common.logger(log_type, crawler).info('视频已下载')
+                    Common.logging(log_type, crawler, env, '视频已下载\n')
+                else:
+                    cls.download_video_list.append(video_dict)
+                if i+1 == scan_count:
+                    Common.logger(log_type, crawler).info("扫描一轮结束\n")
+                    Common.logging(log_type, crawler, env, "扫描一轮结束\n")
+                    return
+                Common.logger(log_type, crawler).info(f"已抓取符合规则视频{len(cls.download_video_list)}条,滑动至下一个视频\n")
+                Common.logging(log_type, crawler, env, f"已抓取符合规则视频{len(cls.download_video_list)}条,滑动至下一个视频\n")
                 driver.swipe(10, 1600, 10, 300, 200)
-                continue
-            video_dict = cls.get_video_info(driver)
-            for k, v in video_dict.items():
-                Common.logger(log_type, crawler).info(f"{k}:{v}")
+            except Exception as e:
+                Common.logger(log_type, crawler).info(f"扫描单条视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"扫描单条视频时异常:{e}\n")
 
-            if video_dict["video_title"] is None:
-                Common.logger(log_type, crawler).info("无效视频")
-            elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
-                Common.logger(log_type, crawler).info("不满足抓取规则")
-                # Common.logging(log_type, crawler, env, "不满足抓取规则\n")
-            elif cls.repeat_out_video_id(log_type, crawler, video_dict["video_id"], env) != 0:
-                Common.logger(log_type, crawler).info('视频已下载')
-                # Common.logging(log_type, crawler, env, '视频已下载\n')
-            else:
-                cls.download_video_list.append(video_dict)
-            if i+1 == cls.scan_count:
-                Common.logger(log_type, crawler).info("扫描一轮结束\n")
-                driver.quit()
-                return
-            Common.logger(log_type, crawler).info(f"已抓取符合规则视频{len(cls.download_video_list)}条,滑动至下一个视频\n")
-            driver.swipe(10, 1600, 10, 300, 200)
 
     @classmethod
     def is_contain_chinese(cls, strword):
@@ -130,12 +143,13 @@ class ShipinhaoRecommend:
         # 点击暂停
         global duration
         for i in range(3):
-            try:
-                driver.find_element(By.ID, "com.tencent.mm:id/gpx").click()
+            pause_elements = driver.find_elements(By.ID, "com.tencent.mm:id/gpx")
+            if len(pause_elements) != 0:
+                pause_elements[0].click()
                 duration_str = driver.find_element(By.ID, "com.tencent.mm:id/l7i").text
                 duration = int(duration_str.split(":")[0]) * 60 + int(duration_str.split(":")[-1])
                 break
-            except NoSuchElementException:
+            else:
                 duration = 0
 
         # user_name
@@ -190,7 +204,11 @@ class ShipinhaoRecommend:
         time.sleep(1)
 
         # title
-        title = driver.find_elements(By.ID, "com.tencent.mm:id/bga")[0].text.replace("\n", " ")[:40]
+        title_elements = driver.find_elements(By.ID, "com.tencent.mm:id/bga")
+        if len(title_elements) == 0:
+            title = ""
+        else:
+            title = title_elements[0].text.replace("\n", " ")[:40]
 
         # 发布时间
         publish_time = driver.find_element(By.ID, "com.tencent.mm:id/bre").get_attribute("name")
@@ -239,15 +257,21 @@ class ShipinhaoRecommend:
         return video_dict
 
     @classmethod
-    def get_recommend_list(cls, log_type, crawler, rule_dict, env):
-        driver = cls.start_wechat(log_type, crawler, env)
-        cls.get_videoList(log_type=log_type,
-                          crawler=crawler,
-                          rule_dict=rule_dict,
-                          env=env,
-                          driver=driver)
-        driver.quit()
-        Common.logger(log_type, crawler).info(f"微信退出成功\n")
+    def get_recommend_list(cls, log_type, crawler, rule_dict, scan_count, env):
+        try:
+            driver = cls.start_wechat(log_type, crawler, env)
+            cls.get_videoList(log_type=log_type,
+                              crawler=crawler,
+                              rule_dict=rule_dict,
+                              env=env,
+                              scan_count=scan_count,
+                              driver=driver)
+            driver.quit()
+            Common.logger(log_type, crawler).info(f"微信退出成功\n")
+            Common.logging(log_type, crawler, env, f"微信退出成功\n")
+        except Exception as e:
+            Common.logger(log_type, crawler).info(f"扫描视频列表异常:{e}\n")
+            Common.logging(log_type, crawler, env, f"扫描视频列表异常:{e}\n")
 
 
 if __name__ == "__main__":
@@ -255,6 +279,6 @@ if __name__ == "__main__":
                  "duration": {"min": 10, "max": 1800},
                  "favorite_cnt": {"min": 50000, "max": 0},
                  "share_cnt": {"min": 10000, "max": 0}}
-    ShipinhaoRecommend.get_recommend_list("recommend", "shipinhao", rule_dict1, "dev")
+    ShipinhaoRecommend.get_recommend_list("recommend", "shipinhao", rule_dict1, 5, "dev")
     print(ShipinhaoRecommend.download_video_list)
     pass
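
Note: `get_video_info` parses the player's duration label as `mm:ss` (`minutes * 60 + seconds`). If WeChat renders clips over an hour as `h:mm:ss`, that expression mis-parses them; a small generalized parser, assuming colon-separated time labels:

```python
def parse_duration(label: str) -> int:
    """Convert "ss", "mm:ss" or "hh:mm:ss" into total seconds."""
    seconds = 0
    for part in label.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds

assert parse_duration("02:35") == 155
assert parse_duration("1:02:35") == 3755
```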