add gongzhonghao_cookie

wangkun · 1 year ago
Parent commit: 32ac482af9

+ 1 - 1
common/feishu.py

@@ -86,7 +86,7 @@ class Feishu:
             return "shtcn5YSWg91JfVGzj0SFZIRRPh"
             return "shtcn5YSWg91JfVGzj0SFZIRRPh"
         elif crawler == "xiaoniangao":
         elif crawler == "xiaoniangao":
             return "shtcnYxiyQ1wLklo1W5Kdqc9cGh"
             return "shtcnYxiyQ1wLklo1W5Kdqc9cGh"
-        elif crawler == "monitor":
+        elif crawler == "control":
             return "shtcnlZWYazInhf7Z60jkbLRJyd"
             return "shtcnlZWYazInhf7Z60jkbLRJyd"
         elif crawler == "music_album":
         elif crawler == "music_album":
             return "shtcnT6zvmfsYe1g0iv4pt7855g"
             return "shtcnT6zvmfsYe1g0iv4pt7855g"

+ 3 - 0
control/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17

+ 3 - 0
control/control/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17

+ 110 - 0
control/control/control.py

@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17
+import os
+from crontab import CronTab
+
+
+class Control:
+    @classmethod
+    def crontab_add(cls, rule, command):
+        """
+        添加 crontab 任务
+        :param rule: 运行时间规则,如:* * * * *;00 00 * * *
+        :param command: crontab 任务,如:"python3 control/monitor/monitor_logs.py"
+        :return:None
+        """
+        # 创建CronTab对象
+        cron = CronTab(user=True)
+
+        # 新增一行
+        job = cron.new(command=command)
+        job.setall(rule)
+
+        # 保存更改
+        cron.write()
+
+    @classmethod
+    def crontab_edit(cls, enable, command):
+        """
+        注释 / 取消注释 crontab 任务
+        :param enable: True:取消注释 crontab 任务;False:注释 crontab 任务
+        :param command: 需要注释的 crontab 任务,如:"python3 control/monitor/monitor_logs.py"
+        :return: None
+        """
+        # 创建一个新的crontab对象
+        cron = CronTab(user=True)
+
+        # 选择要注释掉的cron任务
+        # find_command方法返回一个生成器对象,而不是单个任务对象。因此,需要使用next函数来获取生成器的下一个元素,然后对该元素调用enable方法来取消注释cron任务
+        job = next(cron.find_command(command))
+
+        # 取消注释cron任务
+        job.enable(enable)
+
+        # 将更改写入crontab
+        cron.write()
+
+    @classmethod
+    def crontab_remove(cls, command):
+        """
+        删除 crontab 任务
+        :param command: crontab 任务,如:"python3 control/monitor/monitor_logs.py"
+        :return:
+        """
+        # 创建一个新的crontab对象
+        cron = CronTab(user=True)
+
+        # 选择要删除的cron任务
+        job = cron.find_command(command)
+
+        # 删除cron任务
+        cron.remove(job)
+
+        # 将更改写入crontab
+        cron.write()
+
+    @classmethod
+    def crawler_stop(cls, command):
+        # 创建一个新的crontab对象
+        cron = CronTab(user=True)
+        if command in [job.command for job in cron]:
+            print("command已存在,开始停用\n")
+            cls.crontab_edit(False, command)
+            return
+        else:
+            print("command不存在,无需停用\n")
+            return
+
+    @classmethod
+    def crawler_start(cls, rule, command):
+        # 创建一个新的crontab对象
+        cron = CronTab(user=True)
+        if command in [job.command for job in cron]:
+            print("command 已存在,开始启用\n")
+            cls.crontab_edit(True, command)
+            return
+        else:
+            print("command 不存在,新增并启用\n")
+            cls.crontab_add(rule, command)
+            return
+
+    @classmethod
+    def crawler_restart(cls, rule, command):
+        cmd = f"ps aux | grep {command.split(' ')[-1].split('/')[-1]}" + " | grep -v grep | awk '{print $2}' | xargs kill -9"
+        print(cmd)
+        print("执行杀进程命令")
+        os.system(cmd)
+        print("正在启动 command")
+        cls.crawler_start(rule, command)
+
+
+if __name__ == "__main__":
+    # Control.crontab_add("* * * * *", "python3 control/monitor/monitor_logs.py")
+    # Control.crontab_edit(False, "python3 control/monitor/monitor_logs.py")
+    # Control.crontab_remove("python3 control/monitor/monitor_logs.py")
+    #
+    # Control.crawler_start("* * * * *", "python3 control/monitor/monitor_logs.py")
+    # Control.crawler_stop("python3 control/monitor/monitor_logs.py")
+    Control.crawler_restart("* * * * *", "python3 control/monitor/monitor_logs.py")
+    pass
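
For orientation: the new Control class manages entries in the invoking user's crontab through the python-crontab package: crawler_start adds or re-enables a job line, crawler_stop comments it out, and crawler_restart kills the running script before re-registering it. A minimal sketch of how the resulting crontab state could be verified from Python follows; the inspection code is not part of this commit, and it assumes the repository root is on sys.path so the control package resolves.

    from crontab import CronTab
    from control.control.control import Control

    # Register (or re-enable) the monitor job introduced in this commit.
    Control.crawler_start("* * * * *", "python3 control/monitor/monitor_logs.py")

    # Read back the current user's crontab to confirm what was written.
    cron = CronTab(user=True)
    for job in cron:
        # Jobs disabled via crawler_stop() render as commented-out lines.
        print(job, "->", "enabled" if job.is_enabled() else "disabled")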

+ 3 - 0
control/control_main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17

+ 3 - 0
control/logs/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17

+ 3 - 0
control/monitor/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17

+ 45 - 0
control/monitor/monitor_logs.py

@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/17
+import glob
+import datetime
+import os.path
+import re
+import time
+
+
+class MonitorLogs:
+
+    @classmethod
+    def monitor_logs(cls, log_type, crawler):
+        # 获取日志文件的路径
+        log_files = glob.glob(f"./{crawler}/logs/*-{datetime.datetime.now().date().strftime('%Y-%m-%d')}.log")
+        if len(log_files) == 0:
+            print("未发现最新日志")
+            return
+        for log_file in log_files:
+            # 检查文件名是否包含"author"且后面带有数字
+            if re.search(r"author\d", log_file):
+                continue
+            # 读取最新日志文件的内容
+            with open(log_file, "r") as file:
+                logs = file.readlines()
+            # 过滤空行日志
+            logs = [log.strip() for log in logs if log.strip()]
+            # 获取最新一条日志的时间和内容
+            latest_log = logs[-1]
+            log_time_str = datetime.datetime.strptime(latest_log[:19], "%Y-%m-%d %H:%M:%S")
+            log_time_stamp = int(log_time_str.timestamp())
+            print("日志文件:", log_file)
+            print("最新日志时间:", log_time_str)
+            print("最新日志时间戳:", log_time_stamp)
+            print("最新日志内容:\n", latest_log[24:])
+
+            if int(time.time()) - log_time_stamp > 60*5:
+                print(f"{int(time.time())} - {log_time_stamp} = {int(time.time())-log_time_stamp} > {60 * 5}")
+
+
+if __name__ == "__main__":
+    MonitorLogs.monitor_logs("author", "gongzhonghao")
+
+    pass
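
The staleness check above assumes every log line begins with a "YYYY-MM-DD HH:MM:SS" timestamp (its first 19 characters) and flags the file when the newest entry is more than five minutes old. A self-contained sketch of that core step, using a made-up log line rather than a real crawler log:

    import datetime
    import time

    # Hypothetical log line; only the leading 19-character timestamp matters here.
    latest_log = "2023-07-17 10:05:30.123 | INFO | fetched 5 articles"

    log_time = datetime.datetime.strptime(latest_log[:19], "%Y-%m-%d %H:%M:%S")
    age_seconds = int(time.time()) - int(log_time.timestamp())
    if age_seconds > 60 * 5:
        print(f"latest entry is {age_seconds}s old, the crawler may be stuck")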

+ 32 - 0
gongzhonghao/gongzhonghao_author/dev.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/14
+import random
+import string
+
+# 函数 1 生成随机 token
+def generate_token():
+    token = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
+    return token
+
+# 函数 2 和函数 3 使用同一个 token,当 token 过期时重新获取
+def function_2():
+    global token
+    if not token:
+        token = generate_token()
+    print("函数 2 使用 token:", token)
+
+def function_3():
+    global token
+    if not token:
+        token = generate_token()
+    print("函数 3 使用 token:", token)
+
+# 测试
+token = generate_token()
+print("初始 token:", token)
+function_2()
+# 模拟 token 过期
+token = None
+function_2()
+function_3()
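
dev.py above is a scratch demo of the token-sharing pattern: one token is generated lazily, reused by several functions via a module-level global, and regenerated once it is cleared. The same idea is often wrapped in a single accessor so callers never touch the global directly; a small sketch of that variant, with illustrative names that are not part of this commit:

    import random
    import string

    _token = None

    def get_token():
        """Return the cached token, generating a new one only when it is missing."""
        global _token
        if not _token:
            _token = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
        return _token

    def invalidate_token():
        """Simulate expiry so the next get_token() call regenerates the token."""
        global _token
        _token = None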

+ 0 - 525
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -1,525 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/28
-import datetime
-import json
-import os
-import random
-import shutil
-import sys
-import time
-from hashlib import md5
-import requests
-import urllib3
-from selenium.webdriver import DesiredCapabilities
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium import webdriver
-
-from common.mq import MQ
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.publish import Publish
-# from common.getuser import getUser
-from common.scheduling_db import MysqlHelper
-from common.public import get_config_from_mysql, download_rule, title_like
-
-
-class GongzhonghaoAuthor1:
-    platform = "公众号"
-
-    # 获取 token
-    @classmethod
-    def get_token(cls, log_type, crawler, env):
-        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_1%";"""
-        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
-        if len(configs) == 0:
-            Feishu.bot(log_type, crawler, "公众号_1:未配置token")
-            time.sleep(60)
-            return None
-        token_dict = {
-            "token_id": configs[0]["id"],
-            "title": configs[0]["title"].strip(),
-            "token": dict(eval(configs[0]["config"]))["token"].strip(),
-            "cookie": dict(eval(configs[0]["config"]))["cookie"].strip(),
-            "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"].strip()
-        }
-        # for k, v in token_dict.items():
-        #     print(f"{k}:{type(v)}, {v}")
-        return token_dict
-
-    @classmethod
-    def get_users(cls, log_type, crawler, user_sheet, sheetid, i, env):
-        user_name = user_sheet[i][0]
-        wechat_name = user_sheet[i][2]
-        if wechat_name is None or wechat_name.strip() == "" or wechat_name.replace(" ", "") == "":
-            wechat_name = user_name
-        out_uid = user_sheet[i][3]
-        avatar_url = user_sheet[i][4]
-        if out_uid is None or out_uid.strip() == "" or out_uid.replace(" ", "") == "":
-            user_info_dict = cls.get_user_info(log_type=log_type, crawler=crawler, wechat_name=wechat_name, env=env)
-            out_uid = user_info_dict["user_id"]
-            avatar_url = user_info_dict["avatar_url"]
-            Feishu.update_values(log_type, crawler, sheetid, f'D{i + 1}:E{i + 1}', [[out_uid, avatar_url]])
-
-        our_user_dict = {
-            'user_name': user_name,
-            'user_id': out_uid,
-            'wechat_name': wechat_name,
-            'avatar_url': avatar_url,
-        }
-        for k, v in our_user_dict.items():
-            Common.logger(log_type, crawler).info(f"{k}:{v}")
-        Common.logging(log_type, crawler, env, f'our_user_dict:{our_user_dict}')
-        return our_user_dict
-
-    # 获取用户 fakeid
-    @classmethod
-    def get_user_info(cls, log_type, crawler, wechat_name, env):
-        Common.logger(log_type, crawler).info(f"获取站外用户信息:{wechat_name}")
-        Common.logging(log_type, crawler, env, f"获取站外用户信息:{wechat_name}")
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=1011071554&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "search_biz",
-                "begin": "0",
-                "count": "5",
-                "query": str(wechat_name),
-                "token": token_dict['token'],
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if "list" not in r.json() or len(r.json()["list"]) == 0:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
-                              'user_id': r.json()["list"][0]["fakeid"],
-                              'avatar_url': r.json()["list"][0]["round_head_img"]}
-            return user_info_dict
-
-    # 获取腾讯视频下载链接
-    @classmethod
-    def get_tencent_video_url(cls, video_id):
-        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
-        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
-        response = json.loads(response)
-        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
-        fvkey = response['vl']['vi'][0]['fvkey']
-        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
-        return video_url
-
-    @classmethod
-    def get_video_url(cls, article_url, env):
-        # 打印请求配置
-        ca = DesiredCapabilities.CHROME
-        ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-        # 不打开浏览器运行
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("headless")
-        chrome_options.add_argument(
-            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-        chrome_options.add_argument("--no-sandbox")
-
-        # driver初始化
-        if env == "prod":
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-        else:
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
-
-        driver.implicitly_wait(10)
-        driver.get(article_url)
-        time.sleep(1)
-
-        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
-            video_url = driver.find_element(
-                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
-        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
-            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
-                'src')
-            video_id = iframe.split('vid=')[-1].split('&')[0]
-            video_url = cls.get_tencent_video_url(video_id)
-        else:
-            video_url = 0
-        driver.quit()
-        return video_url
-
-    # 获取文章列表
-    @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
-        mq = MQ(topic_name="topic_crawler_etl_" + env)
-        begin = 0
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "list_ex",
-                "begin": str(begin),
-                "count": "5",
-                "fakeid": user_dict['user_id'],
-                "type": "9",
-                "query": "",
-                "token": str(token_dict['token']),
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['user_name']}\n抓取异常, 请检查该公众号\n")
-                return
-            if 'app_msg_list' not in r.json():
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if len(r.json()['app_msg_list']) == 0:
-                Common.logger(log_type, crawler).info('没有更多视频了\n')
-                Common.logging(log_type, crawler, env, '没有更多视频了\n')
-                return
-            else:
-                begin += 5
-                app_msg_list = r.json()['app_msg_list']
-                for article in app_msg_list:
-                    try:
-                        create_time = article.get('create_time', 0)
-                        publish_time_stamp = int(create_time)
-                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                        article_url = article.get('link', '')
-                        video_dict = {
-                            'video_id': article.get('aid', ''),
-                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
-                            'publish_time_stamp': publish_time_stamp,
-                            'publish_time_str': publish_time_str,
-                            'user_name': user_dict["user_name"],
-                            'play_cnt': 0,
-                            'comment_cnt': 0,
-                            'like_cnt': 0,
-                            'share_cnt': 0,
-                            'user_id': user_dict['user_id'],
-                            'avatar_url': user_dict['avatar_url'],
-                            'cover_url': article.get('cover', ''),
-                            'article_url': article.get('link', ''),
-                            'video_url': cls.get_video_url(article_url, env),
-                            'session': f'gongzhonghao-author1-{int(time.time())}'
-                        }
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        Common.logging(log_type, crawler, env, f'video_dict:{video_dict}')
-
-                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
-                            Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            Common.logging(log_type, crawler, env, f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            return
-
-                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
-                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
-                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
-                        # 标题敏感词过滤
-                        elif any(str(word) if str(word) in video_dict['video_title'] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
-                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
-                        # 已下载判断
-                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                            Common.logger(log_type, crawler).info("视频已下载\n")
-                            Common.logging(log_type, crawler, env, "视频已下载\n")
-                        # 标题相似度
-                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
-                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                            Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                        else:
-                            # cls.download_publish(log_type=log_type,
-                            #                      crawler=crawler,
-                            #                      video_dict=video_dict,
-                            #                      rule_dict=rule_dict,
-                            #                      # user_dict=user_dict,
-                            #                      env=env)
-                            video_dict["out_user_id"] = video_dict["user_id"]
-                            video_dict["platform"] = crawler
-                            video_dict["strategy"] = log_type
-                            video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = 0
-                            video_dict["height"] = 0
-                            video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            # video_dict["user_id"] = user_dict["uid"]
-                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
-                            video_dict["publish_time"] = video_dict["publish_time_str"]
-
-                            mq.send_msg(video_dict)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
-                Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                time.sleep(60)
-
-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, env):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    # 下载/上传
-    @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 获取视频时长
-        ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-        video_dict["video_width"] = ffmpeg_dict["width"]
-        video_dict["video_height"] = ffmpeg_dict["height"]
-        video_dict["duration"] = ffmpeg_dict["duration"]
-        Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
-        Common.logging(log_type, crawler, env, f'video_width:{video_dict["video_width"]}')
-        Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
-        Common.logging(log_type, crawler, env, f'video_height:{video_dict["video_height"]}')
-        Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
-        Common.logging(log_type, crawler, env, f'duration:{video_dict["duration"]}')
-        if download_rule(log_type, crawler, video_dict, rule_dict) is False:
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
-            Common.logging(log_type, crawler, env, "不满足抓取规则,删除成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover",
-                               title=video_dict["video_title"], url=video_dict["cover_url"])
-        # 保存视频信息至 "./videos/{video_title}/info.txt"
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        strategy = "定向爬虫策略"
-        if env == 'prod':
-            oss_endpoint = "inner"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-        else:
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                    out_user_id,
-                                                    platform,
-                                                    strategy,
-                                                    out_video_id,
-                                                    video_title,
-                                                    cover_url,
-                                                    video_url,
-                                                    duration,
-                                                    publish_time,
-                                                    play_cnt,
-                                                    crawler_rule,
-                                                    width,
-                                                    height)
-                                                    values({our_video_id},
-                                                    "{video_dict['user_id']}",
-                                                    "{cls.platform}",
-                                                    "定向爬虫策略",
-                                                    "{video_dict['video_id']}",
-                                                    "{video_dict['video_title']}",
-                                                    "{video_dict['cover_url']}",
-                                                    "{video_dict['video_url']}",
-                                                    {int(video_dict['duration'])},
-                                                    "{video_dict['publish_time_str']}",
-                                                    {int(video_dict['play_cnt'])},
-                                                    '{json.dumps(rule_dict)}',
-                                                    {int(video_dict['video_width'])},
-                                                    {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-        Common.logger(log_type, crawler).info('视频信息写入数据库成功')
-        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
-
-        # 视频写入飞书
-        Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
-        # 视频ID工作表,首行写入数据
-        upload_time = int(time.time())
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                   "用户主页",
-                   video_dict['video_title'],
-                   video_dict['video_id'],
-                   our_video_link,
-                   int(video_dict['duration']),
-                   f"{video_dict['video_width']}*{video_dict['video_height']}",
-                   video_dict['publish_time_str'],
-                   video_dict['user_name'],
-                   video_dict['user_id'],
-                   video_dict['avatar_url'],
-                   video_dict['cover_url'],
-                   video_dict['article_url'],
-                   video_dict['video_url']]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
-        Common.logger(log_type, crawler).info('视频下载/上传成功\n')
-        Common.logging(log_type, crawler, env, '视频下载/上传成功\n')
-
-    @classmethod
-    def get_all_videos(cls, log_type, crawler, rule_dict, env):
-        while True:
-            sheetid = "Bzv72P"
-            # sheetid = "SHRnwl"
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            if user_sheet is None:
-                Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 2秒后重试")
-                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 2秒后重试")
-                time.sleep(2)
-                continue
-            len_sheet = len(user_sheet)
-            if len_sheet >= 141:
-                len_sheet = 141
-            for i in range(1, len_sheet):
-                user_dict = cls.get_users(log_type=log_type,
-                                          crawler=crawler,
-                                          user_sheet=user_sheet,
-                                          sheetid=sheetid,
-                                          i=i,
-                                          env=env)
-                Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                Common.logging(log_type, crawler, env, f'获取 {user_dict["user_name"]} 公众号视频\n')
-                try:
-                    cls.get_videoList(log_type=log_type,
-                                      crawler=crawler,
-                                      rule_dict=rule_dict,
-                                      user_dict=user_dict,
-                                      env=env)
-                    Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                    Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                    time.sleep(60)
-                except Exception as e:
-                    Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-                    Common.logging(log_type, crawler, env, f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-            break
-
-
-if __name__ == "__main__":
-    # GongzhonghaoAuthor1.get_token("author", "gongzhonghao", "prod")
-    # print(GongzhonghaoAuthor1.get_users("author", "gongzhonghao", "Bzv72P", "dev"))
-    # print(get_config_from_mysql("author", "gongzhonghao", "dev", "filter", action=""))
-    # print(title_like("author", "gongzhonghao", "公众号", "123", "dev"))
-    # print(GongzhonghaoAuthor1.get_user_info("author", "gongzhonghao", "幸福花朵", "dev"))
-    GongzhonghaoAuthor1.get_all_videos("author", "gongzhonghao", {}, "dev")
-    pass

+ 0 - 521
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -1,521 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/28
-import datetime
-import json
-import os
-import shutil
-import sys
-import time
-from hashlib import md5
-import requests
-import urllib3
-from selenium.webdriver import DesiredCapabilities
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium import webdriver
-
-from common.mq import MQ
-sys.path.append(os.getcwd())
-# from common.getuser import getUser
-from common.common import Common
-from common.feishu import Feishu
-from common.publish import Publish
-from common.scheduling_db import MysqlHelper
-from common.public import get_config_from_mysql, title_like, download_rule
-
-
-class GongzhonghaoAuthor2:
-    platform = "公众号"
-
-    # 获取 token
-    @classmethod
-    def get_token(cls, log_type, crawler, env):
-        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_2%";"""
-        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
-        if len(configs) == 0:
-            Feishu.bot(log_type, crawler, "公众号_2:未配置token")
-            time.sleep(60)
-            return None
-        token_dict = {
-            "token_id": configs[0]["id"],
-            "title": configs[0]["title"].strip(),
-            "token": dict(eval(configs[0]["config"]))["token"].strip(),
-            "cookie": dict(eval(configs[0]["config"]))["cookie"].strip(),
-            "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"].strip()
-        }
-        # for k, v in token_dict.items():
-        #     print(f"{k}:{v}")
-        return token_dict
-
-    @classmethod
-    def get_users(cls, log_type, crawler, user_sheet, sheetid, i, env):
-        user_name = user_sheet[i][0]
-        wechat_name = user_sheet[i][2]
-        if wechat_name is None or wechat_name.strip() == "" or wechat_name.replace(" ", "") == "":
-            wechat_name = user_name
-        out_uid = user_sheet[i][3]
-        avatar_url = user_sheet[i][4]
-        if out_uid is None or out_uid.strip() == "" or out_uid.replace(" ", "") == "":
-            user_info_dict = cls.get_user_info(log_type=log_type, crawler=crawler, wechat_name=wechat_name, env=env)
-            out_uid = user_info_dict["user_id"]
-            avatar_url = user_info_dict["avatar_url"]
-            Feishu.update_values(log_type, crawler, sheetid, f'D{i + 1}:E{i + 1}', [[out_uid, avatar_url]])
-
-        our_user_dict = {
-            'user_name': user_name,
-            'user_id': out_uid,
-            'wechat_name': wechat_name,
-            'avatar_url': avatar_url,
-        }
-        for k, v in our_user_dict.items():
-            Common.logger(log_type, crawler).info(f"{k}:{v}")
-        Common.logging(log_type, crawler, env, f'our_user_dict:{our_user_dict}')
-        return our_user_dict
-
-    # 获取用户 fakeid
-    @classmethod
-    def get_user_info(cls, log_type, crawler, wechat_name, env):
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=1011071554&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "search_biz",
-                "begin": "0",
-                "count": "5",
-                "query": str(wechat_name),
-                "token": token_dict['token'],
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if "list" not in r.json() or len(r.json()["list"]) == 0:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-
-            user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
-                              'user_id': r.json()["list"][0]["fakeid"],
-                              'avatar_url': r.json()["list"][0]["round_head_img"]}
-            return user_info_dict
-
-    # 获取腾讯视频下载链接
-    @classmethod
-    def get_tencent_video_url(cls, video_id):
-        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
-        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
-        response = json.loads(response)
-        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
-        fvkey = response['vl']['vi'][0]['fvkey']
-        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
-        return video_url
-
-    @classmethod
-    def get_video_url(cls, article_url, env):
-        # 打印请求配置
-        ca = DesiredCapabilities.CHROME
-        ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-        # 不打开浏览器运行
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("headless")
-        chrome_options.add_argument(
-            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-        chrome_options.add_argument("--no-sandbox")
-
-        # driver初始化
-        if env == "prod":
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-        else:
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
-
-        driver.implicitly_wait(10)
-        driver.get(article_url)
-        time.sleep(1)
-
-        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
-            video_url = driver.find_element(
-                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
-        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
-            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
-                'src')
-            video_id = iframe.split('vid=')[-1].split('&')[0]
-            video_url = cls.get_tencent_video_url(video_id)
-        else:
-            video_url = 0
-        driver.quit()
-        return video_url
-
-    # 获取文章列表
-    @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
-        mq = MQ(topic_name="topic_crawler_etl_" + env)
-        begin = 0
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "list_ex",
-                "begin": str(begin),
-                "count": "5",
-                "fakeid": user_dict['user_id'],
-                "type": "9",
-                "query": "",
-                "token": str(token_dict['token']),
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['user_name']}\n抓取异常, 请检查该公众号\n")
-                return
-            if 'app_msg_list' not in r.json():
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if len(r.json()['app_msg_list']) == 0:
-                Common.logger(log_type, crawler).info('没有更多视频了\n')
-                Common.logging(log_type, crawler, env, '没有更多视频了\n')
-                return
-            else:
-                begin += 5
-                app_msg_list = r.json()['app_msg_list']
-                for article in app_msg_list:
-                    try:
-                        create_time = article.get('create_time', 0)
-                        publish_time_stamp = int(create_time)
-                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                        article_url = article.get('link', '')
-                        video_dict = {
-                            'video_id': article.get('aid', ''),
-                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
-                            'publish_time_stamp': publish_time_stamp,
-                            'publish_time_str': publish_time_str,
-                            'user_name': user_dict["user_name"],
-                            'play_cnt': 0,
-                            'comment_cnt': 0,
-                            'like_cnt': 0,
-                            'share_cnt': 0,
-                            'user_id': user_dict['user_id'],
-                            'avatar_url': user_dict['avatar_url'],
-                            'cover_url': article.get('cover', ''),
-                            'article_url': article.get('link', ''),
-                            'video_url': cls.get_video_url(article_url, env),
-                            'session': f'gongzhonghao-author1-{int(time.time())}'
-                        }
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        Common.logging(log_type, crawler, env, f'video_dict:{video_dict}')
-
-                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
-                            Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            Common.logging(log_type, crawler, env, f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            return
-
-                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
-                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
-                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
-                        # 标题敏感词过滤
-                        elif any(str(word) if str(word) in video_dict['video_title'] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
-                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
-                        # 已下载判断
-                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                            Common.logger(log_type, crawler).info("视频已下载\n")
-                            Common.logging(log_type, crawler, env, "视频已下载\n")
-                        # 标题相似度
-                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
-                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                            Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                        else:
-                            # cls.download_publish(log_type=log_type,
-                            #                      crawler=crawler,
-                            #                      video_dict=video_dict,
-                            #                      rule_dict=rule_dict,
-                            #                      # user_dict=user_dict,
-                            #                      env=env)
-                            video_dict["out_user_id"] = video_dict["user_id"]
-                            video_dict["platform"] = crawler
-                            video_dict["strategy"] = log_type
-                            video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = 0
-                            video_dict["height"] = 0
-                            video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            # video_dict["user_id"] = user_dict["uid"]
-                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
-                            video_dict["publish_time"] = video_dict["publish_time_str"]
-
-                            mq.send_msg(video_dict)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
-
-                Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                time.sleep(60)
-
-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, env):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    # 下载/上传
-    @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 获取视频时长
-        ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-        video_dict["video_width"] = ffmpeg_dict["width"]
-        video_dict["video_height"] = ffmpeg_dict["height"]
-        video_dict["duration"] = ffmpeg_dict["duration"]
-        Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
-        Common.logging(log_type, crawler, env, f'video_width:{video_dict["video_width"]}')
-        Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
-        Common.logging(log_type, crawler, env, f'video_height:{video_dict["video_height"]}')
-        Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
-        Common.logging(log_type, crawler, env, f'duration:{video_dict["duration"]}')
-        if download_rule(log_type, crawler, video_dict, rule_dict) is False:
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
-            Common.logging(log_type, crawler, env, "不满足抓取规则,删除成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover",
-                               title=video_dict["video_title"], url=video_dict["cover_url"])
-        # 保存视频信息至 "./videos/{video_title}/info.txt"
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        strategy = "定向爬虫策略"
-        if env == 'prod':
-            oss_endpoint = "inner"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-        else:
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                    out_user_id,
-                                                    platform,
-                                                    strategy,
-                                                    out_video_id,
-                                                    video_title,
-                                                    cover_url,
-                                                    video_url,
-                                                    duration,
-                                                    publish_time,
-                                                    play_cnt,
-                                                    crawler_rule,
-                                                    width,
-                                                    height)
-                                                    values({our_video_id},
-                                                    "{video_dict['user_id']}",
-                                                    "{cls.platform}",
-                                                    "定向爬虫策略",
-                                                    "{video_dict['video_id']}",
-                                                    "{video_dict['video_title']}",
-                                                    "{video_dict['cover_url']}",
-                                                    "{video_dict['video_url']}",
-                                                    {int(video_dict['duration'])},
-                                                    "{video_dict['publish_time_str']}",
-                                                    {int(video_dict['play_cnt'])},
-                                                    '{json.dumps(rule_dict)}',
-                                                    {int(video_dict['video_width'])},
-                                                    {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-        Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
-        Common.logging(log_type, crawler, env, '视频信息插入数据库成功!')
-
-        # 视频写入飞书
-        Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
-        # 视频ID工作表,首行写入数据
-        upload_time = int(time.time())
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                   "用户主页",
-                   video_dict['video_title'],
-                   video_dict['video_id'],
-                   our_video_link,
-                   int(video_dict['duration']),
-                   f"{video_dict['video_width']}*{video_dict['video_height']}",
-                   video_dict['publish_time_str'],
-                   video_dict['user_name'],
-                   video_dict['user_id'],
-                   video_dict['avatar_url'],
-                   video_dict['cover_url'],
-                   video_dict['article_url'],
-                   video_dict['video_url']]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
-        Common.logger(log_type, crawler).info('视频下载/上传成功\n')
-        Common.logging(log_type, crawler, env, '视频下载/上传成功\n')
-
-    @classmethod
-    def get_all_videos(cls, log_type, crawler, rule_dict, env):
-        while True:
-            sheetid = "Bzv72P"
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            if user_sheet is None:
-                Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 2秒后重试")
-                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 2秒后重试")
-                time.sleep(2)
-                continue
-            len_sheet = len(user_sheet)
-            if len_sheet <= 141:
-                Common.logger(log_type, crawler).info("抓取用户数<=100,无需启动第二套抓取脚本\n")
-                Common.logging(log_type, crawler, env, "抓取用户数<=100,无需启动第二套抓取脚本\n")
-                return
-            if len_sheet >= 261:
-                len_sheet = 261
-            for i in range(141, len_sheet):
-                user_dict = cls.get_users(log_type=log_type,
-                                          crawler=crawler,
-                                          user_sheet=user_sheet,
-                                          sheetid=sheetid,
-                                          i=i,
-                                          env=env)
-                Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                Common.logging(log_type, crawler, env, f'获取 {user_dict["user_name"]} 公众号视频\n')
-                try:
-                    cls.get_videoList(log_type=log_type,
-                                      crawler=crawler,
-                                      rule_dict=rule_dict,
-                                      user_dict=user_dict,
-                                      env=env)
-                    Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                    Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                    time.sleep(60)
-                except Exception as e:
-                    Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-                    Common.logging(log_type, crawler, env, f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-            break
-
-
-if __name__ == "__main__":
-    GongzhonghaoAuthor2.get_token("author", "gongzhonghao", "dev")
-    pass

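The deleted author scripts differ mainly in which slice of the Feishu user sheet they scan and which crawler_config row supplies their token: the second script walks list indices 141-261, the third 261-361. Below is a minimal, hypothetical sketch of deriving those windows from a single script index; only the two windows visible in this commit are taken from the code.

def sheet_window(script_index: int) -> tuple:
    """Return the (start, end) user-sheet slice handled by one author script."""
    windows = {
        2: (141, 261),  # range used by gongzhonghao2_author.get_all_videos
        3: (261, 361),  # range used by gongzhonghao3_author.get_all_videos
    }
    if script_index not in windows:
        raise ValueError(f"no window recorded for script {script_index}")
    return windows[script_index]


start, end = sheet_window(3)
for i in range(start, end):
    pass  # get_users(...) / get_videoList(...) run here in the real scripts
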
+ 0 - 522
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -1,522 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/28
-import datetime
-import json
-import os
-import shutil
-import sys
-import time
-from hashlib import md5
-import requests
-import urllib3
-from selenium.webdriver import DesiredCapabilities
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium import webdriver
-
-from common.mq import MQ
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.publish import Publish
-from common.scheduling_db import MysqlHelper
-from common.public import get_config_from_mysql, title_like, download_rule
-
-
-class GongzhonghaoAuthor3:
-    platform = "公众号"
-
-    # 获取 token
-    @classmethod
-    def get_token(cls, log_type, crawler, env):
-        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_3%";"""
-        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
-        if len(configs) == 0:
-            Feishu.bot(log_type, crawler, "公众号_3:未配置token\n请登录后配置\nhttps://admin.piaoquantv.com/cms/spider-source-config")
-            time.sleep(60)
-            return None
-        token_dict = {
-            "token_id": configs[0]["id"],
-            "title": configs[0]["title"].strip(),
-            "token": dict(eval(configs[0]["config"]))["token"].strip(),
-            "cookie": dict(eval(configs[0]["config"]))["cookie"].strip(),
-            "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"].strip()
-        }
-        # for k, v in token_dict.items():
-        #     print(f"{k}:{v}")
-        return token_dict
-
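# Illustrative sketch, not part of the deleted file. get_token() above decodes the
# crawler_config "config" column with eval(); ast.literal_eval is the safer choice
# when the column holds a plain dict literal such as {"token": "...", "cookie": "..."}
# (assumed here -- the stored format is not visible in this commit).
import ast


def parse_config_column(raw: str) -> dict:
    """Parse the stored config string into a dict without executing arbitrary code."""
    cfg = ast.literal_eval(raw)
    return {"token": cfg["token"].strip(), "cookie": cfg["cookie"].strip()}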
-    @classmethod
-    def get_users(cls, log_type, crawler, user_sheet, sheetid, i, env):
-        user_name = user_sheet[i][0]
-        wechat_name = user_sheet[i][2]
-        if wechat_name is None or wechat_name.strip() == "" or wechat_name.replace(" ", "") == "":
-            wechat_name = user_name
-        out_uid = user_sheet[i][3]
-        avatar_url = user_sheet[i][4]
-        if out_uid is None or out_uid.strip() == "" or out_uid.replace(" ", "") == "":
-            user_info_dict = cls.get_user_info(log_type=log_type, crawler=crawler, wechat_name=wechat_name, env=env)
-            out_uid = user_info_dict["user_id"]
-            avatar_url = user_info_dict["avatar_url"]
-            Feishu.update_values(log_type, crawler, sheetid, f'D{i + 1}:E{i + 1}', [[out_uid, avatar_url]])
-
-        our_user_dict = {
-            'user_name': user_name,
-            'user_id': out_uid,
-            'wechat_name': wechat_name,
-            'avatar_url': avatar_url,
-        }
-        for k, v in our_user_dict.items():
-            Common.logger(log_type, crawler).info(f"{k}:{v}")
-        Common.logging(log_type, crawler, env, f"our_user_dict:{our_user_dict}")
-        return our_user_dict
-
-    # 获取用户 fakeid
-    @classmethod
-    def get_user_info(cls, log_type, crawler, wechat_name, env):
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=1011071554&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "search_biz",
-                "begin": "0",
-                "count": "5",
-                "query": str(wechat_name),
-                "token": token_dict['token'],
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if "list" not in r.json() or len(r.json()["list"]) == 0:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-
-            user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
-                              'user_id': r.json()["list"][0]["fakeid"],
-                              'avatar_url': r.json()["list"][0]["round_head_img"]}
-            return user_info_dict
-
-    # 获取腾讯视频下载链接
-    @classmethod
-    def get_tencent_video_url(cls, video_id):
-        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
-        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
-        response = json.loads(response)
-        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
-        fvkey = response['vl']['vi'][0]['fvkey']
-        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
-        return video_url
-
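# Standalone sketch of the get_tencent_video_url() helper above, not part of the
# deleted file. It mirrors the same steps: the getinfo endpoint answers with a
# JSONP-style "QZOutputJson=...;" wrapper that must be stripped before json.loads().
# Tencent may change this endpoint or its payload at any time.
import json

import requests


def tencent_video_url(video_id: str) -> str:
    resp = requests.get(
        "https://vv.video.qq.com/getinfo",
        params={"vids": video_id, "platform": "101001", "charge": "0", "otype": "json"},
        timeout=10,
    )
    payload = json.loads(resp.text.replace("QZOutputJson=", "").rstrip().rstrip(";"))
    vi = payload["vl"]["vi"][0]
    return vi["ul"]["ui"][0]["url"] + str(video_id) + ".mp4?vkey=" + vi["fvkey"]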
-    @classmethod
-    def get_video_url(cls, article_url, env):
-        # 打印请求配置
-        ca = DesiredCapabilities.CHROME
-        ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-        # 不打开浏览器运行
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("headless")
-        chrome_options.add_argument(
-            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-        chrome_options.add_argument("--no-sandbox")
-
-        # driver初始化
-        if env == "prod":
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-        else:
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
-
-        driver.implicitly_wait(10)
-        driver.get(article_url)
-        time.sleep(1)
-
-        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
-            video_url = driver.find_element(
-                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
-        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
-            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
-                'src')
-            video_id = iframe.split('vid=')[-1].split('&')[0]
-            video_url = cls.get_tencent_video_url(video_id)
-        else:
-            video_url = 0
-        driver.quit()
-        return video_url
-
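# Sketch only, not part of the deleted file. get_video_url() above passes
# desired_capabilities to webdriver.Chrome(), a Selenium 3 style argument that recent
# Selenium 4 releases no longer accept. A roughly equivalent headless setup on current
# Selenium (capabilities set on Options, driver path wrapped in Service) looks like:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


def article_video_src(article_url: str, chromedriver_path: str = None):
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    service = Service(chromedriver_path) if chromedriver_path else Service()
    driver = webdriver.Chrome(options=options, service=service)
    driver.implicitly_wait(10)
    driver.get(article_url)
    # same XPath probe as the deleted code for the native video poster
    posters = driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')
    src = posters[0].get_attribute("src") if posters else None
    driver.quit()
    return src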
-    # 获取文章列表
-    @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
-        mq = MQ(topic_name="topic_crawler_etl_" + env)
-        begin = 0
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "list_ex",
-                "begin": str(begin),
-                "count": "5",
-                "fakeid": user_dict['user_id'],
-                "type": "9",
-                "query": "",
-                "token": str(token_dict['token']),
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['user_name']}\n抓取异常, 请检查该公众号\n")
-                return
-            if 'app_msg_list' not in r.json():
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if len(r.json()['app_msg_list']) == 0:
-                Common.logger(log_type, crawler).info('没有更多视频了\n')
-                Common.logging(log_type, crawler, env, '没有更多视频了\n')
-                return
-            else:
-                begin += 5
-                app_msg_list = r.json()['app_msg_list']
-                for article in app_msg_list:
-                    try:
-                        create_time = article.get('create_time', 0)
-                        publish_time_stamp = int(create_time)
-                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                        article_url = article.get('link', '')
-                        video_dict = {
-                            'video_id': article.get('aid', ''),
-                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
-                            'publish_time_stamp': publish_time_stamp,
-                            'publish_time_str': publish_time_str,
-                            'user_name': user_dict["user_name"],
-                            'play_cnt': 0,
-                            'comment_cnt': 0,
-                            'like_cnt': 0,
-                            'share_cnt': 0,
-                            'user_id': user_dict['user_id'],
-                            'avatar_url': user_dict['avatar_url'],
-                            'cover_url': article.get('cover', ''),
-                            'article_url': article.get('link', ''),
-                            'video_url': cls.get_video_url(article_url, env),
-                            'session': f'gongzhonghao-author1-{int(time.time())}'
-                        }
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        Common.logging(log_type, crawler, env, f'video_dict:{video_dict}')
-
-                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(
-                                rule_dict.get('period', {}).get('max', 1000)):
-                            Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            Common.logging(log_type, crawler, env, f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            return
-
-                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
-                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
-                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
-                        # 标题敏感词过滤
-                        elif any(str(word) if str(word) in video_dict['video_title'] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
-                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
-                        # 已下载判断
-                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                            Common.logger(log_type, crawler).info("视频已下载\n")
-                            Common.logging(log_type, crawler, env, "视频已下载\n")
-                        # 标题相似度
-                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
-                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                            Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                        else:
-                            # cls.download_publish(log_type=log_type,
-                            #                      crawler=crawler,
-                            #                      video_dict=video_dict,
-                            #                      rule_dict=rule_dict,
-                            #                      # user_dict=user_dict,
-                            #                      env=env)
-                            video_dict["out_user_id"] = video_dict["user_id"]
-                            video_dict["platform"] = crawler
-                            video_dict["strategy"] = log_type
-                            video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = 0
-                            video_dict["height"] = 0
-                            video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            # video_dict["user_id"] = user_dict["uid"]
-                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
-                            video_dict["publish_time"] = video_dict["publish_time_str"]
-
-                            mq.send_msg(video_dict)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
-
-                Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                time.sleep(60)
-
-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, env):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    # 下载/上传
-    @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 获取视频时长
-        ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-        video_dict["video_width"] = ffmpeg_dict["width"]
-        video_dict["video_height"] = ffmpeg_dict["height"]
-        video_dict["duration"] = ffmpeg_dict["duration"]
-        Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
-        Common.logging(log_type, crawler, env, f'video_width:{video_dict["video_width"]}')
-        Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
-        Common.logging(log_type, crawler, env, f'video_height:{video_dict["video_height"]}')
-        Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
-        Common.logging(log_type, crawler, env, f'duration:{video_dict["duration"]}')
-        if download_rule(log_type, crawler, video_dict, rule_dict) is False:
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
-            Common.logging(log_type, crawler, env, "不满足抓取规则,删除成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover",
-                               title=video_dict["video_title"], url=video_dict["cover_url"])
-        # 保存视频信息至 "./videos/{video_title}/info.txt"
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        strategy = "定向爬虫策略"
-        if env == 'prod':
-            oss_endpoint = "inner"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-        else:
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                    out_user_id,
-                                                    platform,
-                                                    strategy,
-                                                    out_video_id,
-                                                    video_title,
-                                                    cover_url,
-                                                    video_url,
-                                                    duration,
-                                                    publish_time,
-                                                    play_cnt,
-                                                    crawler_rule,
-                                                    width,
-                                                    height)
-                                                    values({our_video_id},
-                                                    "{video_dict['user_id']}",
-                                                    "{cls.platform}",
-                                                    "定向爬虫策略",
-                                                    "{video_dict['video_id']}",
-                                                    "{video_dict['video_title']}",
-                                                    "{video_dict['cover_url']}",
-                                                    "{video_dict['video_url']}",
-                                                    {int(video_dict['duration'])},
-                                                    "{video_dict['publish_time_str']}",
-                                                    {int(video_dict['play_cnt'])},
-                                                    '{json.dumps(rule_dict)}',
-                                                    {int(video_dict['video_width'])},
-                                                    {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-        Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
-        Common.logging(log_type, crawler, env, '视频信息插入数据库成功!')
-
-        # 视频写入飞书
-        Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
-        # 视频ID工作表,首行写入数据
-        upload_time = int(time.time())
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                   "用户主页",
-                   video_dict['video_title'],
-                   video_dict['video_id'],
-                   our_video_link,
-                   int(video_dict['duration']),
-                   f"{video_dict['video_width']}*{video_dict['video_height']}",
-                   video_dict['publish_time_str'],
-                   video_dict['user_name'],
-                   video_dict['user_id'],
-                   video_dict['avatar_url'],
-                   video_dict['cover_url'],
-                   video_dict['article_url'],
-                   video_dict['video_url']]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
-        Common.logger(log_type, crawler).info('视频下载/上传成功\n')
-        Common.logging(log_type, crawler, env, '视频下载/上传成功\n')
-
-    @classmethod
-    def get_all_videos(cls, log_type, crawler, rule_dict, env):
-        while True:
-            sheetid = "Bzv72P"
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            if user_sheet is None:
-                Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 2秒后重试")
-                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 2秒后重试")
-                time.sleep(2)
-                continue
-            len_sheet = len(user_sheet)
-            if len_sheet <= 261:
-                Common.logger(log_type, crawler).info("抓取用户数<=200,无需启动第三套抓取脚本\n")
-                Common.logging(log_type, crawler, env, "抓取用户数<=200,无需启动第三套抓取脚本\n")
-                return
-            if len_sheet >= 361:
-                len_sheet = 361
-            for i in range(261, len_sheet):
-                user_dict = cls.get_users(log_type=log_type,
-                                          crawler=crawler,
-                                          user_sheet=user_sheet,
-                                          sheetid=sheetid,
-                                          i=i,
-                                          env=env)
-                Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                Common.logging(log_type, crawler, env, f'获取 {user_dict["user_name"]} 公众号视频\n')
-                try:
-                    cls.get_videoList(log_type=log_type,
-                                      crawler=crawler,
-                                      rule_dict=rule_dict,
-                                      user_dict=user_dict,
-                                      env=env)
-                    Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                    Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                    time.sleep(60)
-                except Exception as e:
-                    Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-                    Common.logging(log_type, crawler, env, f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-            break
-
-
-if __name__ == "__main__":
-    GongzhonghaoAuthor3.get_token("author", "gongzhonghao", "dev")
-    pass

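Each deleted script hands an accepted video to the MQ ETL topic only after stamping the same bookkeeping fields onto video_dict (out_user_id, platform, strategy, out_video_id, crawler_rule, and so on). A minimal sketch of that enrichment step, factored into one helper, follows; Publish.uids() is replaced by a plain our_uid parameter because its implementation is not part of this commit.

import json


def build_etl_message(video_dict: dict, rule_dict: dict, crawler: str,
                      log_type: str, our_uid: str) -> dict:
    """Mirror the field mapping done just before mq.send_msg() in the deleted scripts."""
    msg = dict(video_dict)
    msg["out_user_id"] = msg["user_id"]   # the 公众号 fakeid becomes the out-user id
    msg["user_id"] = our_uid              # in-house uid (Publish.uids in the original)
    msg["platform"] = crawler
    msg["strategy"] = log_type
    msg["out_video_id"] = msg["video_id"]
    msg["width"] = 0                      # filled in later, downstream of the MQ
    msg["height"] = 0
    msg["crawler_rule"] = json.dumps(rule_dict)
    msg["publish_time"] = msg["publish_time_str"]
    return msg
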
+ 0 - 525
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -1,525 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/3/28
-import datetime
-# import difflib
-import json
-import os
-import shutil
-import sys
-import time
-from hashlib import md5
-import requests
-import urllib3
-from selenium.webdriver import DesiredCapabilities
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium import webdriver
-
-from common.mq import MQ
-sys.path.append(os.getcwd())
-from common.common import Common
-from common.feishu import Feishu
-from common.publish import Publish
-from common.scheduling_db import MysqlHelper
-from common.public import get_config_from_mysql, title_like, download_rule
-
-
-class GongzhonghaoAuthor5:
-    platform = "公众号"
-
-    # 获取 token
-    @classmethod
-    def get_token(cls, log_type, crawler, env):
-        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_5%";"""
-        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
-        if len(configs) == 0:
-            Feishu.bot(log_type, crawler, "公众号_5:未配置token")
-            time.sleep(60)
-            return None
-        token_dict = {
-            "token_id": configs[0]["id"],
-            "title": configs[0]["title"],
-            "token": dict(eval(configs[0]["config"]))["token"],
-            "cookie": dict(eval(configs[0]["config"]))["cookie"],
-            "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"]
-        }
-        for k, v in token_dict.items():
-            print(f"{k}:{v}")
-        return token_dict
-
-    @classmethod
-    def get_users(cls, log_type, crawler, user_sheet, sheetid, i, env):
-        user_name = user_sheet[i][0]
-        wechat_name = user_sheet[i][2]
-        if wechat_name is None or wechat_name.strip() == "" or wechat_name.replace(" ", "") == "":
-            wechat_name = user_name
-        out_uid = user_sheet[i][3]
-        avatar_url = user_sheet[i][4]
-        if out_uid is None or out_uid.strip() == "" or out_uid.replace(" ", "") == "":
-            user_info_dict = cls.get_user_info(log_type=log_type, crawler=crawler, wechat_name=wechat_name, env=env)
-            out_uid = user_info_dict["user_id"]
-            avatar_url = user_info_dict["avatar_url"]
-            Feishu.update_values(log_type, crawler, sheetid, f'D{i + 1}:E{i + 1}', [[out_uid, avatar_url]])
-
-        our_user_dict = {
-            'user_name': user_name,
-            'user_id': out_uid,
-            'wechat_name': wechat_name,
-            'avatar_url': avatar_url,
-        }
-        for k, v in our_user_dict.items():
-            Common.logger(log_type, crawler).info(f"{k}:{v}")
-        Common.logging(log_type, crawler, env, f"our_user_dict:{our_user_dict}")
-        return our_user_dict
-
-    # 获取用户 fakeid
-    @classmethod
-    def get_user_info(cls, log_type, crawler, wechat_name, env):
-        Common.logger(log_type, crawler).info(f"获取站外用户信息:{wechat_name}")
-        Common.logging(log_type, crawler, env, f"获取站外用户信息:{wechat_name}")
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=1011071554&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "search_biz",
-                "begin": "0",
-                "count": "5",
-                "query": str(wechat_name),
-                "token": token_dict['token'],
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if "list" not in r.json() or len(r.json()["list"]) == 0:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
-                              'user_id': r.json()["list"][0]["fakeid"],
-                              'avatar_url': r.json()["list"][0]["round_head_img"]}
-            return user_info_dict
-
-    # 获取腾讯视频下载链接
-    @classmethod
-    def get_tencent_video_url(cls, video_id):
-        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
-        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
-        response = json.loads(response)
-        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
-        fvkey = response['vl']['vi'][0]['fvkey']
-        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
-        return video_url
-
-    @classmethod
-    def get_video_url(cls, article_url, env):
-        # 打印请求配置
-        ca = DesiredCapabilities.CHROME
-        ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-        # 不打开浏览器运行
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("headless")
-        chrome_options.add_argument(
-            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-        chrome_options.add_argument("--no-sandbox")
-
-        # driver初始化
-        if env == "prod":
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-        else:
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
-
-        driver.implicitly_wait(10)
-        driver.get(article_url)
-        time.sleep(1)
-
-        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
-            video_url = driver.find_element(
-                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
-        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
-            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
-                'src')
-            video_id = iframe.split('vid=')[-1].split('&')[0]
-            video_url = cls.get_tencent_video_url(video_id)
-        else:
-            video_url = 0
-        driver.quit()
-        return video_url
-
-    # 获取文章列表
-    @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
-        mq = MQ(topic_name="topic_crawler_etl_" + env)
-        begin = 0
-        while True:
-            token_dict = cls.get_token(log_type, crawler, env)
-            url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-            headers = {
-                "accept": "*/*",
-                "accept-encoding": "gzip, deflate, br",
-                "accept-language": "zh-CN,zh;q=0.9",
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
-                           "t=media/appmsg_edit_v2&action=edit&isNew=1"
-                           "&type=77&createType=5&token=" + str(token_dict['token']) + "&lang=zh_CN",
-                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "empty",
-                "sec-fetch-mode": "cors",
-                "sec-fetch-site": "same-origin",
-                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                              " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-                'cookie': token_dict['cookie'],
-            }
-            params = {
-                "action": "list_ex",
-                "begin": str(begin),
-                "count": "5",
-                "fakeid": user_dict['user_id'],
-                "type": "9",
-                "query": "",
-                "token": str(token_dict['token']),
-                "lang": "zh_CN",
-                "f": "json",
-                "ajax": "1",
-            }
-            urllib3.disable_warnings()
-            r = requests.get(url=url, headers=headers, params=params, verify=False)
-            r.close()
-            if r.json()["base_resp"]["err_msg"] == "invalid session":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "freq control":
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['user_name']}\n抓取异常, 请检查该公众号\n")
-                return
-            if 'app_msg_list' not in r.json():
-                Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
-            if len(r.json()['app_msg_list']) == 0:
-                Common.logger(log_type, crawler).info('没有更多视频了\n')
-                Common.logging(log_type, crawler, env, "没有更多视频了\n")
-                return
-            else:
-                begin += 5
-                app_msg_list = r.json()['app_msg_list']
-                for article in app_msg_list:
-                    try:
-                        create_time = article.get('create_time', 0)
-                        publish_time_stamp = int(create_time)
-                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                        article_url = article.get('link', '')
-                        video_dict = {
-                            'video_id': article.get('aid', ''),
-                            'video_title': article.get("title", "").replace(' ', '').replace('"', '').replace("'", ""),
-                            'publish_time_stamp': publish_time_stamp,
-                            'publish_time_str': publish_time_str,
-                            'user_name': user_dict["user_name"],
-                            'play_cnt': 0,
-                            'comment_cnt': 0,
-                            'like_cnt': 0,
-                            'share_cnt': 0,
-                            'user_id': user_dict['user_id'],
-                            'avatar_url': user_dict['avatar_url'],
-                            'cover_url': article.get('cover', ''),
-                            'article_url': article.get('link', ''),
-                            'video_url': cls.get_video_url(article_url, env),
-                            'session': f'gongzhonghao-author1-{int(time.time())}'
-                        }
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
-
-                        if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
-                            Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            Common.logging(log_type, crawler, env, f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            return
-
-                        if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
-                            Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
-                            Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
-                        # 标题敏感词过滤
-                        elif any(str(word) if str(word) in video_dict['video_title'] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info("标题已中过滤词\n")
-                            Common.logging(log_type, crawler, env, "标题已中过滤词\n")
-                        # 已下载判断
-                        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
-                            Common.logger(log_type, crawler).info("视频已下载\n")
-                            Common.logging(log_type, crawler, env, "视频已下载\n")
-                        # 标题相似度
-                        elif title_like(log_type, crawler, video_dict['video_title'], cls.platform, env) is True:
-                            Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                            Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
-                        else:
-                            # cls.download_publish(log_type=log_type,
-                            #                      crawler=crawler,
-                            #                      video_dict=video_dict,
-                            #                      rule_dict=rule_dict,
-                            #                      # user_dict=user_dict,
-                            #                      env=env)
-                            video_dict["out_user_id"] = video_dict["user_id"]
-                            video_dict["platform"] = crawler
-                            video_dict["strategy"] = log_type
-                            video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = 0
-                            video_dict["height"] = 0
-                            video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            # video_dict["user_id"] = user_dict["uid"]
-                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
-                            video_dict["publish_time"] = video_dict["publish_time_str"]
-
-                            mq.send_msg(video_dict)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                        Common.logging(log_type, crawler, env, f'抓取单条视频异常:{e}\n')
-                Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                time.sleep(60)
-
-
-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, env):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    # 下载/上传
-    @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 获取视频时长
-        ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-        video_dict["video_width"] = ffmpeg_dict["width"]
-        video_dict["video_height"] = ffmpeg_dict["height"]
-        video_dict["duration"] = ffmpeg_dict["duration"]
-        Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
-        Common.logging(log_type, crawler, env, f'video_width:{video_dict["video_width"]}')
-        Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
-        Common.logging(log_type, crawler, env, f'video_height:{video_dict["video_height"]}')
-        Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
-        Common.logging(log_type, crawler, env, f'duration:{video_dict["duration"]}')
-        if download_rule(log_type, crawler, video_dict, rule_dict) is False:
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
-            Common.logging(log_type, crawler, env, "不满足抓取规则,删除成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover",
-                               title=video_dict["video_title"], url=video_dict["cover_url"])
-        # 保存视频信息至 "./videos/{video_title}/info.txt"
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        strategy = "定向爬虫策略"
-        if env == 'prod':
-            oss_endpoint = "inner"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-        else:
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                    out_user_id,
-                                                    platform,
-                                                    strategy,
-                                                    out_video_id,
-                                                    video_title,
-                                                    cover_url,
-                                                    video_url,
-                                                    duration,
-                                                    publish_time,
-                                                    play_cnt,
-                                                    crawler_rule,
-                                                    width,
-                                                    height)
-                                                    values({our_video_id},
-                                                    "{video_dict['user_id']}",
-                                                    "{cls.platform}",
-                                                    "定向爬虫策略",
-                                                    "{video_dict['video_id']}",
-                                                    "{video_dict['video_title']}",
-                                                    "{video_dict['cover_url']}",
-                                                    "{video_dict['video_url']}",
-                                                    {int(video_dict['duration'])},
-                                                    "{video_dict['publish_time_str']}",
-                                                    {int(video_dict['play_cnt'])},
-                                                    '{json.dumps(rule_dict)}',
-                                                    {int(video_dict['video_width'])},
-                                                    {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-        Common.logger(log_type, crawler).info('视频信息写入数据库成功')
-        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
-
-        # 视频写入飞书
-        Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
-        # 视频ID工作表,首行写入数据
-        upload_time = int(time.time())
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                   "用户主页",
-                   video_dict['video_title'],
-                   video_dict['video_id'],
-                   our_video_link,
-                   int(video_dict['duration']),
-                   f"{video_dict['video_width']}*{video_dict['video_height']}",
-                   video_dict['publish_time_str'],
-                   video_dict['user_name'],
-                   video_dict['user_id'],
-                   video_dict['avatar_url'],
-                   video_dict['cover_url'],
-                   video_dict['article_url'],
-                   video_dict['video_url']]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
-        Common.logger(log_type, crawler).info('视频下载/上传成功\n')
-        Common.logging(log_type, crawler, env, '视频下载/上传成功\n')
-
-    @classmethod
-    def get_all_videos(cls, log_type, crawler, rule_dict, env):
-        while True:
-            sheetid = "Bzv72P"
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            if user_sheet is None:
-                Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 2秒后重试")
-                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 2秒后重试")
-                time.sleep(2)
-                continue
-            len_sheet = len(user_sheet)
-            if len_sheet <= 461:
-                Common.logger(log_type, crawler).info("抓取用户数<=400,无需启动第五套抓取脚本\n")
-                Common.logging(log_type, crawler, env, "抓取用户数<=400,无需启动第五套抓取脚本\n")
-                return
-            # if len_sheet >= 501:
-            #     len_sheet = 501
-            for i in range(461, len_sheet):
-                user_dict = cls.get_users(log_type=log_type,
-                                          crawler=crawler,
-                                          user_sheet=user_sheet,
-                                          sheetid=sheetid,
-                                          i=i,
-                                          env=env)
-                Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                Common.logging(log_type, crawler, env, f'获取 {user_dict["user_name"]} 公众号视频\n')
-                try:
-                    Common.logger(log_type, crawler).info(f'获取 {user_dict["user_name"]} 公众号视频\n')
-                    Common.logging(log_type, crawler, env, f'获取 {user_dict["user_name"]} 公众号视频\n')
-                    cls.get_videoList(log_type=log_type,
-                                      crawler=crawler,
-                                      rule_dict=rule_dict,
-                                      user_dict=user_dict,
-                                      env=env)
-                    Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                    Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                    time.sleep(60)
-                except Exception as e:
-                    Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-                    Common.logging(log_type, crawler, env, f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-            break
-
-
-if __name__ == "__main__":
-    GongzhonghaoAuthor5.get_token("author", "gongzhonghao", "dev")
-    pass
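
Both the removed GongzhonghaoAuthor5 module above and the renamed gongzhonghao_author_lock.py below resolve a playable link in two steps: a headless-Chrome pass over the article page, and, when the page only embeds a Tencent Video vid, a lookup against the vv.video.qq.com getinfo endpoint. A minimal standalone sketch of that second step, with the endpoint, parameters and response fields taken from the code in this diff; the function name, timeout and type hints are illustrative assumptions:

    import json
    import requests

    def resolve_tencent_video_url(video_id: str) -> str:
        # Same endpoint and query string as get_tencent_video_url in this diff.
        info_url = f"https://vv.video.qq.com/getinfo?vids={video_id}&platform=101001&charge=0&otype=json"
        raw = requests.get(url=info_url, timeout=10).text
        # The endpoint wraps its JSON as "QZOutputJson=...;", strip the wrapper before parsing.
        data = json.loads(raw.replace("QZOutputJson=", "").replace('"};', '"}'))
        play_host = data["vl"]["vi"][0]["ul"]["ui"][0]["url"]
        fvkey = data["vl"]["vi"][0]["fvkey"]
        # Playable mp4 = host + vid + ".mp4?vkey=" + fvkey, exactly as the crawler assembles it.
        return f"{play_host}{video_id}.mp4?vkey={fvkey}"
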

+ 0 - 0
gongzhonghao/gongzhonghao_author/gongzhonghao1_author_create_user.py → gongzhonghao/gongzhonghao_author/gongzhonghao_author_create_user.py


+ 202 - 303
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py → gongzhonghao/gongzhonghao_author/gongzhonghao_author_lock.py

@@ -2,85 +2,189 @@
 # @Author: wangkun
 # @Time: 2023/3/28
 import datetime
-# import difflib
 import json
 import os
-import shutil
+import random
 import sys
 import time
-from hashlib import md5
 import requests
 import urllib3
 from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
-
-from common.mq import MQ
 sys.path.append(os.getcwd())
+from common.mq import MQ
 from common.common import Common
 from common.feishu import Feishu
-from common.publish import Publish
 from common.scheduling_db import MysqlHelper
-from common.public import get_config_from_mysql, title_like, download_rule
+from common.public import get_config_from_mysql, title_like, task_unbind


-class GongzhonghaoAuthor4:
-    platform = "公众号"
+class GongzhonghaoAuthor:
+    platform = "gongzhonghao"

-    # 获取 token
     @classmethod
     def get_token(cls, log_type, crawler, env):
-        select_sql = f""" select * from crawler_config where source="{crawler}" and title LIKE "%公众号_4%";"""
-        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
-        if len(configs) == 0:
-            Feishu.bot(log_type, crawler, "公众号_4:未配置token")
-            time.sleep(60)
+        while True:
+            select_sql = f""" select * from crawler_config where source="gongzhonghao" and status=0;"""
+            sql_res_list = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
+            token_list = []
+            for sql_res in sql_res_list:
+                if "token" not in sql_res["config"]:
+                    pass
+                else:
+                    token_list.append(sql_res)
+            if len(token_list) == 0:
+                Common.logger(log_type, crawler).info("暂无可用的token\n")
+                Common.logging(log_type, crawler, env, "暂无可用的token\n")
+                if 20 >= datetime.datetime.now().hour >= 10:
+                    Feishu.bot(log_type, crawler, "暂无可用的token,请更新\n")
+                time.sleep(60*15)
+                continue
+            token_info = random.choice(token_list)
+            lock_time_stamp = cls.lock_token(log_type, crawler, env, token_info["id"])
+            if lock_time_stamp is None:
+                continue
+            token_info_dict = {
+                "token_id": token_info["id"],
+                "title": token_info["title"].strip(),
+                "status": token_info["status"],
+                "token": dict(eval(token_info["config"]))["token"].strip(),
+                "cookie": dict(eval(token_info["config"]))["cookie"].strip(),
+                "update_time_stamp": lock_time_stamp,
+                "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(lock_time_stamp / 1000))),
+                "operator": token_info["operator"].strip()
+            }
+            # for k, v in token_info_dict.items():
+            #     print(f"{k}:{type(v)}, {v}")
+
+            return token_info_dict
+
+    @classmethod
+    def lock_token(cls, log_type, crawler, env, token_id):
+        """
+        token 上锁。status=1,update_time=int(time.time()*1000)
+        :param log_type: log
+        :param crawler: crawler
+        :param env: env
+        :param token_id: token_id
+        :return: None
+        """
+        lock_time_stamp = int(time.time()*1000)
+        lock_sql = f""" update crawler_config set status={1}, update_time={lock_time_stamp} WHERE id ={token_id} and status={0} ; """
+        lock_token = MysqlHelper.update_values(log_type, crawler, lock_sql, env, action="")
+        # Common.logger(log_type, crawler).info(f"lock_token:{lock_token}")
+        if lock_token == 1:
+            return lock_time_stamp
+        else:
             return None
-        token_dict = {
-            "token_id": configs[0]["id"],
-            "title": configs[0]["title"],
-            "token": dict(eval(configs[0]["config"]))["token"],
-            "cookie": dict(eval(configs[0]["config"]))["cookie"],
-            "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(configs[0]["update_time"]/1000))),
-            "operator": configs[0]["operator"]
-        }
-        # for k, v in token_dict.items():
-        #     print(f"{k}:{v}")
-        return token_dict

     @classmethod
-    def get_users(cls, log_type, crawler, user_sheet, sheetid, i, env):
-        user_name = user_sheet[i][0]
-        wechat_name = user_sheet[i][2]
-        if wechat_name is None or wechat_name.strip() == "" or wechat_name.replace(" ", "") == "":
-            wechat_name = user_name
-        out_uid = user_sheet[i][3]
-        avatar_url = user_sheet[i][4]
-        if out_uid is None or out_uid.strip() == "" or out_uid.replace(" ", "") == "":
-            user_info_dict = cls.get_user_info(log_type=log_type, crawler=crawler, wechat_name=wechat_name, env=env)
-            out_uid = user_info_dict["user_id"]
-            avatar_url = user_info_dict["avatar_url"]
-            Feishu.update_values(log_type, crawler, sheetid, f'D{i + 1}:E{i + 1}', [[out_uid, avatar_url]])
+    def release_token(cls, log_type, crawler, env, token_id, status):
+        """
+        释放 token
+        :param log_type: 日志
+        :param crawler: 爬虫
+        :param env: 环境
+        :param token_id: token_id
+        :param status: 0,正常可用状态;1,被占用状态;-2,不可用状态(过期/频控)
+        :return: None
+        """
+        release_sql = f""" update crawler_config set status={status}, update_time={int(time.time()*1000)} WHERE id ={token_id} ; """
+        MysqlHelper.update_values(log_type, crawler, release_sql, env, action="")
+
+    # 获取腾讯视频下载链接
+    @classmethod
+    def get_tencent_video_url(cls, video_id):
+        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
+        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
+        response = json.loads(response)
+        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
+        fvkey = response['vl']['vi'][0]['fvkey']
+        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
+        return video_url
+
+    @classmethod
+    def get_video_url(cls, article_url, env):
+        # 打印请求配置
+        ca = DesiredCapabilities.CHROME
+        ca["goog:loggingPrefs"] = {"performance": "ALL"}
+
+        # 不打开浏览器运行
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument("headless")
+        chrome_options.add_argument(
+            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
+        chrome_options.add_argument("--no-sandbox")
+
+        # driver初始化
+        if env == "prod":
+            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
+        else:
+            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
+                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
+
+        driver.implicitly_wait(10)
+        driver.get(article_url)
+        time.sleep(1)
+
+        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
+            video_url = driver.find_element(
+                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
+        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
+            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
+                'src')
+            video_id = iframe.split('vid=')[-1].split('&')[0]
+            video_url = cls.get_tencent_video_url(video_id)
+        else:
+            video_url = 0
+        driver.quit()
+        return video_url
+
+    @classmethod
+    def repeat_video(cls, log_type, crawler, video_id, env):
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
+        return len(repeat_video)

-        our_user_dict = {
-            'user_name': user_name,
-            'user_id': out_uid,
-            'wechat_name': wechat_name,
-            'avatar_url': avatar_url,
-        }
-        for k, v in our_user_dict.items():
-            Common.logger(log_type, crawler).info(f"{k}:{v}")
-        Common.logging(log_type, crawler, env, f'our_user_dict:{our_user_dict}')
-        return our_user_dict
+    @classmethod
+    def get_all_videos(cls, log_type, crawler, task_dict, rule_dict, user_list, env):
+        for user_dict in user_list:
+            Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}\n')
+            Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}\n')
+            try:
+                cls.get_videoList(log_type=log_type,
+                                  crawler=crawler,
+                                  task_dict=task_dict,
+                                  rule_dict=rule_dict,
+                                  user_dict=user_dict,
+                                  env=env)
+                Common.logger(log_type, crawler).info('休眠 60 秒\n')
+                Common.logging(log_type, crawler, env, '休眠 60 秒\n')
+                time.sleep(60)
+            except Exception as e:
+                Common.logger(log_type, crawler).info(f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n')
+                Common.logging(log_type, crawler, env, f'抓取公众号:{user_dict["nick_name"]}时异常:{e}\n')
+        cls.release_token(log_type, crawler, env, token_dict["token_id"], 0)

     # 获取用户 fakeid
     @classmethod
-    def get_user_info(cls, log_type, crawler, wechat_name, env):
-        Common.logger(log_type, crawler).info(f"获取站外用户信息:{wechat_name}")
-        Common.logging(log_type, crawler, env, f"获取站外用户信息:{wechat_name}")
+    def get_user_info(cls, log_type, crawler, task_dict, user_dict, env):
+        Common.logger(log_type, crawler).info(f"获取站外用户信息:{user_dict['link']}")
+        Common.logging(log_type, crawler, env, f"获取站外用户信息:{user_dict['link']}")
         while True:
+            global token_dict
             token_dict = cls.get_token(log_type, crawler, env)
+            Common.logger(log_type, crawler).info(f"get_user_info_token:{token_dict}")
+
+            if int(time.time()*1000) - token_dict["update_time_stamp"] >= 3600*24*1000:
+            # if int(time.time()*1000) - token_dict["update_time_stamp"] >= 30000:
+                Common.logger(log_type, crawler).info(f"{int(time.time()*1000)}-{token_dict['update_time_stamp']}={(int(time.time()*1000)-token_dict['update_time_stamp'])}")
+                Common.logger(log_type, crawler).info("token使用时长>=24小时,申请释放")
+                Common.logging(log_type, crawler, env, "token使用时长>=24小时,申请释放")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], 0)
+
             url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
             url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
             headers = {
             headers = {
                 "accept": "*/*",
                 "accept": "*/*",
@@ -104,7 +208,7 @@ class GongzhonghaoAuthor4:
                 "action": "search_biz",
                 "action": "search_biz",
                 "begin": "0",
                 "begin": "0",
                 "count": "5",
                 "count": "5",
-                "query": str(wechat_name),
+                "query": str(user_dict['link']),
                 "token": token_dict['token'],
                 "token": token_dict['token'],
                 "lang": "zh_CN",
                 "lang": "zh_CN",
                 "f": "json",
                 "f": "json",
@@ -116,84 +220,58 @@ class GongzhonghaoAuthor4:
             if r.json()["base_resp"]["err_msg"] == "invalid session":
             if r.json()["base_resp"]["err_msg"] == "invalid session":
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                 if 20 >= datetime.datetime.now().hour >= 10:
                 if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
+                    Feishu.bot(log_type, crawler,
+                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
+                time.sleep(60*15)
                 continue
             if r.json()["base_resp"]["err_msg"] == "freq control":
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                 if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
+                    Feishu.bot(log_type, crawler,
+                               f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
+                time.sleep(60*15)
                 continue
-            if "list" not in r.json() or len(r.json()["list"]) == 0:
+            if r.json()["base_resp"]["err_msg"] == "ok" and len(r.json()["list"]) == 0:
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_fakeid:{r.text}\n")
-                if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
-                continue
+                unbind_msg = task_unbind(log_type=log_type, crawler=crawler, taskid=task_dict['id'],
+                                         uids=str(user_dict["uid"]), env=env)
+                if unbind_msg == "success":
+                    if 20 >= datetime.datetime.now().hour >= 10:
+                        Feishu.bot(log_type, crawler,
+                                   f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
+                    Common.logging(log_type, crawler, env,
+                                   f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
+                else:
+                    Common.logger(log_type, crawler).warning(f"unbind_msg:{unbind_msg}")
+                    Common.logging(log_type, crawler, env, f"unbind_msg:{unbind_msg}")
+                return None
             user_info_dict = {'user_name': r.json()["list"][0]["nickname"],
                               'user_id': r.json()["list"][0]["fakeid"],
                               'avatar_url': r.json()["list"][0]["round_head_img"]}
             return user_info_dict
-
-    # 获取腾讯视频下载链接
-    @classmethod
-    def get_tencent_video_url(cls, video_id):
-        url = 'https://vv.video.qq.com/getinfo?vids=' + str(video_id) + '&platform=101001&charge=0&otype=json'
-        response = requests.get(url=url).text.replace('QZOutputJson=', '').replace('"};', '"}')
-        response = json.loads(response)
-        url = response['vl']['vi'][0]['ul']['ui'][0]['url']
-        fvkey = response['vl']['vi'][0]['fvkey']
-        video_url = url + str(video_id) + '.mp4?vkey=' + fvkey
-        return video_url
-
-    @classmethod
-    def get_video_url(cls, article_url, env):
-        # 打印请求配置
-        ca = DesiredCapabilities.CHROME
-        ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-        # 不打开浏览器运行
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("headless")
-        chrome_options.add_argument(
-            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-        chrome_options.add_argument("--no-sandbox")
-
-        # driver初始化
-        if env == "prod":
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-        else:
-            driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
-
-        driver.implicitly_wait(10)
-        driver.get(article_url)
-        time.sleep(1)
-
-        if len(driver.find_elements(By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]')) != 0:
-            video_url = driver.find_element(
-                By.XPATH, '//div[@class="js_video_poster video_poster"]/*[2]').get_attribute('src')
-        elif len(driver.find_elements(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]')) != 0:
-            iframe = driver.find_element(By.XPATH, '//span[@class="js_tx_video_container"]/*[1]').get_attribute(
-                'src')
-            video_id = iframe.split('vid=')[-1].split('&')[0]
-            video_url = cls.get_tencent_video_url(video_id)
-        else:
-            video_url = 0
-        driver.quit()
-        return video_url
-
+        
     # 获取文章列表
     @classmethod
-    def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+    def get_videoList(cls, log_type, crawler, task_dict, rule_dict, user_dict, env):
         mq = MQ(topic_name="topic_crawler_etl_" + env)
+        user_info_dict = cls.get_user_info(log_type=log_type,
+                                           crawler=crawler,
+                                           task_dict=task_dict,
+                                           user_dict=user_dict,
+                                           env=env)
+        if user_info_dict is None:
+            return
+        user_dict["user_id"] = user_info_dict["user_id"]
+        user_dict["user_name"] = user_info_dict["user_name"]
+        user_dict["avatar_url"] = user_info_dict["avatar_url"]
         begin = 0
         while True:
-            token_dict = cls.get_token(log_type, crawler, env)
+            Common.logger(log_type, crawler).info(f"get_videoList_token:{token_dict}")
             url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
             headers = {
                 "accept": "*/*",
@@ -231,29 +309,33 @@ class GongzhonghaoAuthor4:
             if r.json()["base_resp"]["err_msg"] == "invalid session":
             if r.json()["base_resp"]["err_msg"] == "invalid session":
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                 if 20 >= datetime.datetime.now().hour >= 10:
                 if 20 >= datetime.datetime.now().hour >= 10:
                     Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
                     Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
+                time.sleep(60*15)
                 continue
                 continue
             if r.json()["base_resp"]["err_msg"] == "freq control":
             if r.json()["base_resp"]["err_msg"] == "freq control":
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                 if 20 >= datetime.datetime.now().hour >= 10:
                 if 20 >= datetime.datetime.now().hour >= 10:
                     Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
                     Feishu.bot(log_type, crawler,f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
+                time.sleep(60*15)
                 continue
                 continue
             if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
             if r.json()["base_resp"]["err_msg"] == "invalid args" and r.json()["base_resp"]["ret"] == 200002:
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
+                task_unbind(log_type=log_type, crawler=crawler, taskid=task_dict['id'], uids=str(user_dict["uid"]), env=env)
                 if 20 >= datetime.datetime.now().hour >= 10:
                 if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['user_name']}\n抓取异常, 请检查该公众号\n")
+                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n")
                 return
                 return
             if 'app_msg_list' not in r.json():
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
+                cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
                 if 20 >= datetime.datetime.now().hour >= 10:
                     Feishu.bot(log_type, crawler, f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']}\n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/")
-                time.sleep(60 * 15)
+                time.sleep(60*15)
                 continue
             if len(r.json()['app_msg_list']) == 0:
                 Common.logger(log_type, crawler).info('没有更多视频了\n')
@@ -291,7 +373,7 @@ class GongzhonghaoAuthor4:
 
 
                         if int(time.time()) - publish_time_stamp > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
                             Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
-                            Common.logging(log_type, crawler, env, "发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
+                            Common.logging(log_type, crawler, env, f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
                             return
 
 
                         if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
@@ -315,12 +397,6 @@ class GongzhonghaoAuthor4:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            # cls.download_publish(log_type=log_type,
-                            #                      crawler=crawler,
-                            #                      video_dict=video_dict,
-                            #                      rule_dict=rule_dict,
-                            #                      # user_dict=user_dict,
-                            #                      env=env)
                             video_dict["out_user_id"] = video_dict["user_id"]
                             video_dict["out_user_id"] = video_dict["user_id"]
                             video_dict["platform"] = crawler
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["strategy"] = log_type
@@ -328,10 +404,8 @@ class GongzhonghaoAuthor4:
                             video_dict["width"] = 0
                             video_dict["width"] = 0
                             video_dict["height"] = 0
                             video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            # video_dict["user_id"] = user_dict["uid"]
-                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
+                            video_dict["user_id"] = user_dict["uid"]
                             video_dict["publish_time"] = video_dict["publish_time_str"]
                             video_dict["publish_time"] = video_dict["publish_time_str"]
-
                             mq.send_msg(video_dict)
                             mq.send_msg(video_dict)
                     except Exception as e:
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
@@ -341,182 +415,7 @@ class GongzhonghaoAuthor4:
                 time.sleep(60)


-    @classmethod
-    def repeat_video(cls, log_type, crawler, video_id, env):
-        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
-        return len(repeat_video)
-
-    # 下载/上传
-    @classmethod
-    def download_publish(cls, log_type, crawler, video_dict, rule_dict, env):
-        # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
-        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
-        try:
-            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
-                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
-                return
-        except FileNotFoundError:
-            # 删除视频文件夹
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
-            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
-            return
-        # 获取视频时长
-        ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-        video_dict["video_width"] = ffmpeg_dict["width"]
-        video_dict["video_height"] = ffmpeg_dict["height"]
-        video_dict["duration"] = ffmpeg_dict["duration"]
-        Common.logger(log_type, crawler).info(f'video_width:{video_dict["video_width"]}')
-        Common.logging(log_type, crawler, env, f'video_width:{video_dict["video_width"]}')
-        Common.logger(log_type, crawler).info(f'video_height:{video_dict["video_height"]}')
-        Common.logging(log_type, crawler, env, f'video_height:{video_dict["video_height"]}')
-        Common.logger(log_type, crawler).info(f'duration:{video_dict["duration"]}')
-        Common.logging(log_type, crawler, env, f'duration:{video_dict["duration"]}')
-        if download_rule(log_type, crawler, video_dict, rule_dict) is False:
-            shutil.rmtree(f"./{crawler}/videos/{md_title}")
-            Common.logger(log_type, crawler).info("不满足抓取规则,删除成功\n")
-            Common.logging(log_type, crawler, env, "不满足抓取规则,删除成功\n")
-            return
-        # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover",
-                               title=video_dict["video_title"], url=video_dict["cover_url"])
-        # 保存视频信息至 "./videos/{video_title}/info.txt"
-        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
-
-        # 上传视频
-        Common.logger(log_type, crawler).info("开始上传视频...")
-        Common.logging(log_type, crawler, env, "开始上传视频...")
-        strategy = "定向爬虫策略"
-        if env == 'prod':
-            oss_endpoint = "inner"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-        else:
-            oss_endpoint = "out"
-            our_video_id = Publish.upload_and_publish(log_type=log_type,
-                                                      crawler=crawler,
-                                                      strategy=strategy,
-                                                      our_uid="follow",
-                                                      oss_endpoint=oss_endpoint,
-                                                      env=env)
-            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{str(our_video_id)}/info"
-
-        if our_video_id is None:
-            try:
-                # 删除视频文件夹
-                shutil.rmtree(f"./{crawler}/videos/{md_title}")
-                return
-            except FileNotFoundError:
-                return
-
-        insert_sql = f""" insert into crawler_video(video_id,
-                                                    out_user_id,
-                                                    platform,
-                                                    strategy,
-                                                    out_video_id,
-                                                    video_title,
-                                                    cover_url,
-                                                    video_url,
-                                                    duration,
-                                                    publish_time,
-                                                    play_cnt,
-                                                    crawler_rule,
-                                                    width,
-                                                    height)
-                                                    values({our_video_id},
-                                                    "{video_dict['user_id']}",
-                                                    "{cls.platform}",
-                                                    "定向爬虫策略",
-                                                    "{video_dict['video_id']}",
-                                                    "{video_dict['video_title']}",
-                                                    "{video_dict['cover_url']}",
-                                                    "{video_dict['video_url']}",
-                                                    {int(video_dict['duration'])},
-                                                    "{video_dict['publish_time_str']}",
-                                                    {int(video_dict['play_cnt'])},
-                                                    '{json.dumps(rule_dict)}',
-                                                    {int(video_dict['video_width'])},
-                                                    {int(video_dict['video_height'])}) """
-        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
-        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
-        MysqlHelper.update_values(log_type, crawler, insert_sql, env)
-        Common.logger(log_type, crawler).info('视频信息写入数据库成功')
-        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
-
-        # 视频写入飞书
-        Feishu.insert_columns(log_type, crawler, "47e39d", "ROWS", 1, 2)
-        # 视频ID工作表,首行写入数据
-        upload_time = int(time.time())
-        values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
-                   "用户主页",
-                   video_dict['video_title'],
-                   video_dict['video_id'],
-                   our_video_link,
-                   int(video_dict['duration']),
-                   f"{video_dict['video_width']}*{video_dict['video_height']}",
-                   video_dict['publish_time_str'],
-                   video_dict['user_name'],
-                   video_dict['user_id'],
-                   video_dict['avatar_url'],
-                   video_dict['cover_url'],
-                   video_dict['article_url'],
-                   video_dict['video_url']]]
-        time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "47e39d", "F2:Z2", values)
-        Common.logger(log_type, crawler).info('视频下载/上传成功\n')
-        Common.logging(log_type, crawler, env, '视频下载/上传成功\n')
-
-    @classmethod
-    def get_all_videos(cls, log_type, crawler, rule_dict, env):
-        while True:
-            sheetid = "Bzv72P"
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            if user_sheet is None:
-                Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 2秒后重试")
-                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 2秒后重试")
-                time.sleep(2)
-                continue
-            len_sheet = len(user_sheet)
-            if len_sheet <= 361:
-                Common.logger(log_type, crawler).info("抓取用户数<=300,无需启动第四套抓取脚本\n")
-                Common.logging(log_type, crawler, env, "抓取用户数<=300,无需启动第四套抓取脚本\n")
-                return
-            if len_sheet >= 461:
-                len_sheet = 461
-            for i in range(361, len_sheet):
-                user_dict = cls.get_users(log_type=log_type,
-                                          crawler=crawler,
-                                          user_sheet=user_sheet,
-                                          sheetid=sheetid,
-                                          i=i,
-                                          env=env)
-                Common.logger(log_type, crawler).info(f'获取:{user_dict["user_name"]} 公众号视频\n')
-                Common.logging(log_type, crawler, env, f'获取:{user_dict["user_name"]} 公众号视频\n')
-                try:
-                    cls.get_videoList(log_type=log_type,
-                                      crawler=crawler,
-                                      rule_dict=rule_dict,
-                                      user_dict=user_dict,
-                                      env=env)
-                    Common.logger(log_type, crawler).info('休眠 60 秒\n')
-                    Common.logging(log_type, crawler, env, '休眠 60 秒\n')
-                    time.sleep(60)
-                except Exception as e:
-                    Common.logger(log_type, crawler).info(f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-                    Common.logging(log_type, crawler, env, f'抓取{user_dict["user_name"]}公众号时异常:{e}\n')
-            break
-
 if __name__ == "__main__":
 if __name__ == "__main__":
-    GongzhonghaoAuthor4.get_token("author", "gongzhonghao", "dev")
+    GongzhonghaoAuthor.get_token("author", "gongzhonghao", "dev")
+    # print(int(time.time()*1000))
     pass
     pass
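
The renamed gongzhonghao_author_lock.py swaps the per-script "公众号_4" config row for a shared token pool in crawler_config: get_token picks a random row with status=0, lock_token claims it through a conditional UPDATE (the affected-row count decides who won the race), and release_token hands the row back with status 0, or parks it with -2 after an invalid session or freq control. A minimal sketch of that lock/release pair, assuming a plain pymysql connection (the project itself routes all SQL through MysqlHelper):

    import time
    import pymysql  # assumption: illustration only, the repo uses its own MysqlHelper wrapper

    def try_lock_token(conn, token_id: int):
        # Conditional UPDATE: only succeeds while the row is still free (status=0).
        now_ms = int(time.time() * 1000)
        with conn.cursor() as cur:
            affected = cur.execute(
                "UPDATE crawler_config SET status=1, update_time=%s WHERE id=%s AND status=0",
                (now_ms, token_id),
            )
        conn.commit()
        return now_ms if affected == 1 else None  # None -> another process holds the token

    def release_token(conn, token_id: int, status: int):
        # status: 0 = usable again, 1 = occupied, -2 = expired / frequency-controlled
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE crawler_config SET status=%s, update_time=%s WHERE id=%s",
                (status, int(time.time() * 1000), token_id),
            )
        conn.commit()

get_user_info additionally releases a token back to the pool once it has been held for more than 24 hours; the commented 30-second threshold in the diff is the debug variant of that check.
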

+ 0 - 108
gongzhonghao/gongzhonghao_main/run_gzh1_author.py

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/6
-import argparse
-from mq_http_sdk.mq_client import *
-from mq_http_sdk.mq_consumer import *
-from mq_http_sdk.mq_exception import MQExceptionBase
-sys.path.append(os.getcwd())
-from common.public import task_fun_mq, get_consumer, ack_message
-from common.common import Common
-from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-
-
-def main(log_type, crawler, topic_name, group_id, env):
-    if "gzh1" in topic_name:
-        log_type = "author1"
-    elif "gzh2" in topic_name:
-        log_type = "author2"
-    elif "gzh3" in topic_name:
-        log_type = "author3"
-    elif "gzh4" in topic_name:
-        log_type = "author4"
-    elif "gzh5" in topic_name:
-        log_type = "author5"
-    consumer = get_consumer(topic_name, group_id)
-    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
-    # 长轮询时间3秒(最多可设置为30秒)。
-    wait_seconds = 30
-    # 一次最多消费3条(最多可设置为16条)。
-    batch = 1
-    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                          f'WaitSeconds:{wait_seconds}\n'
-                                          f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}')
-    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                          f'WaitSeconds:{wait_seconds}\n'
-                                          f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}')
-    while True:
-        try:
-            # 长轮询消费消息。
-            recv_msgs = consumer.consume_message(batch, wait_seconds)
-            for msg in recv_msgs:
-                Common.logger(log_type, crawler).info(f"Receive\n"
-                                                      f"MessageId:{msg.message_id}\n"
-                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                      f"MessageTag:{msg.message_tag}\n"
-                                                      f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.publish_time}\n"
-                                                      f"Body:{msg.message_body}\n"
-                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}")
-                Common.logging(log_type, crawler, env, f"Receive\n"
-                                                      f"MessageId:{msg.message_id}\n"
-                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                      f"MessageTag:{msg.message_tag}\n"
-                                                      f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.publish_time}\n"
-                                                      f"Body:{msg.message_body}\n"
-                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}")
-                # ack_mq_message
-                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
-
-                # 处理爬虫业务
-                task_dict = task_fun_mq(msg.message_body)['task_dict']
-                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
-                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
-                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
-                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
-                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
-                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
-                Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
-                GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-                                                    crawler=crawler,
-                                                    rule_dict=rule_dict,
-                                                    env=env)
-                Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮结束\n')
-                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
-        except MQExceptionBase as err:
-            # Topic中没有消息可消费。
-            if err.type == "MessageNotExist":
-                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
-                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
-                continue
-
-            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
-            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
-            time.sleep(2)
-            continue
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler')  ## 添加参数
-    parser.add_argument('--topic_name')  ## 添加参数
-    parser.add_argument('--group_id')  ## 添加参数
-    parser.add_argument('--env')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         topic_name=args.topic_name,
-         group_id=args.group_id,
-         env=args.env)

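Each of the per-account runner scripts removed in this commit repeats the same long-poll consume-and-ack loop and differs only in which GongzhonghaoAuthorN class it dispatches to. For orientation, here is a minimal sketch of that shared loop, assuming the common.public helpers behave exactly as they are used in the diffs above; the topic/group values are placeholders and the dispatch call is elided:

# -*- coding: utf-8 -*-
# Sketch only: mirrors the consume/ack pattern of the deleted run_gzhN_author.py scripts.
import time

from mq_http_sdk.mq_exception import MQExceptionBase

from common.public import task_fun_mq, get_consumer, ack_message


def consume_loop(topic_name="gzh_topic_placeholder", group_id="gzh_gid_placeholder"):
    consumer = get_consumer(topic_name, group_id)
    wait_seconds = 30  # long-poll window; the broker caps it at 30s
    batch = 1          # messages pulled per request (the service allows up to 16)
    while True:
        try:
            for msg in consumer.consume_message(batch, wait_seconds):
                # ack first, then run the crawl described by the message body
                ack_message(log_type="author", crawler="gongzhonghao",
                            recv_msgs=[msg], consumer=consumer)
                task_dict = task_fun_mq(msg.message_body)["task_dict"]
                rule_dict = task_fun_mq(msg.message_body)["rule_dict"]
                # ...log task_dict["taskName"], then dispatch to GongzhonghaoAuthorN.get_all_videos(rule_dict=rule_dict, ...)
        except MQExceptionBase as err:
            if err.type == "MessageNotExist":  # topic is empty; poll again
                continue
            time.sleep(2)  # transient failure; back off briefly and retry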
+ 0 - 32
gongzhonghao/gongzhonghao_main/run_gzh1_author_dev.py

@@ -1,32 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/20
-import os
-import sys
-sys.path.append(os.getcwd())
-from common.common import Common
-# from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
-# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
-# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
-from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
-
-
-def gzh_main(log_type, crawler, env):
-    Common.logger(log_type, crawler).info("开始抓取:公众号")
-    Common.logging(log_type, crawler, env, "开始抓取:公众号")
-    # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
-    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
-                                       crawler=crawler,
-                                       rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
-                                       env=env)
-    Common.del_logs(log_type, crawler)
-    Common.logger(log_type, crawler).info('抓取一轮结束\n')
-    Common.logging(log_type, crawler, env, '抓取一轮结束\n')
-
-
-if __name__ == "__main__":
-    gzh_main(log_type="author", crawler="gongzhonghao", env="dev")

+ 0 - 108
gongzhonghao/gongzhonghao_main/run_gzh2_author.py

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/6
-import argparse
-from mq_http_sdk.mq_client import *
-from mq_http_sdk.mq_consumer import *
-from mq_http_sdk.mq_exception import MQExceptionBase
-sys.path.append(os.getcwd())
-from common.public import task_fun_mq, get_consumer, ack_message
-from common.common import Common
-from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
-
-
-def main(log_type, crawler, topic_name, group_id, env):
-    if "gzh1" in topic_name:
-        log_type = "author1"
-    elif "gzh2" in topic_name:
-        log_type = "author2"
-    elif "gzh3" in topic_name:
-        log_type = "author3"
-    elif "gzh4" in topic_name:
-        log_type = "author4"
-    elif "gzh5" in topic_name:
-        log_type = "author5"
-    consumer = get_consumer(topic_name, group_id)
-    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
-    # 长轮询时间3秒(最多可设置为30秒)。
-    wait_seconds = 30
-    # 一次最多消费3条(最多可设置为16条)。
-    batch = 1
-    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                          f'WaitSeconds:{wait_seconds}\n'
-                                          f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}')
-    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                           f'WaitSeconds:{wait_seconds}\n'
-                                           f'TopicName:{topic_name}\n'
-                                           f'MQConsumer:{group_id}')
-    while True:
-        try:
-            # 长轮询消费消息。
-            recv_msgs = consumer.consume_message(batch, wait_seconds)
-            for msg in recv_msgs:
-                Common.logger(log_type, crawler).info(f"Receive\n"
-                                                      f"MessageId:{msg.message_id}\n"
-                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                      f"MessageTag:{msg.message_tag}\n"
-                                                      f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.publish_time}\n"
-                                                      f"Body:{msg.message_body}\n"
-                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}")
-                Common.logging(log_type, crawler, env, f"Receive\n"
-                                                       f"MessageId:{msg.message_id}\n"
-                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                       f"MessageTag:{msg.message_tag}\n"
-                                                       f"ConsumedTimes:{msg.consumed_times}\n"
-                                                       f"PublishTime:{msg.publish_time}\n"
-                                                       f"Body:{msg.message_body}\n"
-                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                       f"Properties:{msg.properties}")
-                # ack_mq_message
-                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
-
-                # 处理爬虫业务
-                task_dict = task_fun_mq(msg.message_body)['task_dict']
-                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
-                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
-                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
-                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
-                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
-                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
-                Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
-                GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
-                                                    crawler=crawler,
-                                                    rule_dict=rule_dict,
-                                                    env=env)
-                Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮结束\n')
-                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
-        except MQExceptionBase as err:
-            # Topic中没有消息可消费。
-            if err.type == "MessageNotExist":
-                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
-                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
-                continue
-
-            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
-            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
-            time.sleep(2)
-            continue
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler')  ## 添加参数
-    parser.add_argument('--topic_name')  ## 添加参数
-    parser.add_argument('--group_id')  ## 添加参数
-    parser.add_argument('--env')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         topic_name=args.topic_name,
-         group_id=args.group_id,
-         env=args.env)

+ 0 - 110
gongzhonghao/gongzhonghao_main/run_gzh3_author.py

@@ -1,110 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/6
-import argparse
-from mq_http_sdk.mq_client import *
-from mq_http_sdk.mq_consumer import *
-from mq_http_sdk.mq_exception import MQExceptionBase
-sys.path.append(os.getcwd())
-from common.public import task_fun_mq, get_consumer, ack_message
-from common.common import Common
-from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
-
-
-def main(log_type, crawler, topic_name, group_id, env):
-    if "gzh1" in topic_name:
-        log_type = "author1"
-    elif "gzh2" in topic_name:
-        log_type = "author2"
-    elif "gzh3" in topic_name:
-        log_type = "author3"
-    elif "gzh4" in topic_name:
-        log_type = "author4"
-    elif "gzh5" in topic_name:
-        log_type = "author5"
-    consumer = get_consumer(topic_name, group_id)
-    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
-    # 长轮询时间3秒(最多可设置为30秒)。
-    wait_seconds = 30
-    # 一次最多消费3条(最多可设置为16条)。
-    batch = 1
-    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                          f'WaitSeconds:{wait_seconds}\n'
-                                          f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}')
-    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                           f'WaitSeconds:{wait_seconds}\n'
-                                           f'TopicName:{topic_name}\n'
-                                           f'MQConsumer:{group_id}')
-    while True:
-        try:
-            # 长轮询消费消息。
-            recv_msgs = consumer.consume_message(batch, wait_seconds)
-            for msg in recv_msgs:
-                Common.logger(log_type, crawler).info(f"Receive\n"
-                                                      f"MessageId:{msg.message_id}\n"
-                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                      f"MessageTag:{msg.message_tag}\n"
-                                                      f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.publish_time}\n"
-                                                      f"Body:{msg.message_body}\n"
-                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}")
-                Common.logging(log_type, crawler, env, f"Receive\n"
-                                                       f"MessageId:{msg.message_id}\n"
-                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                       f"MessageTag:{msg.message_tag}\n"
-                                                       f"ConsumedTimes:{msg.consumed_times}\n"
-                                                       f"PublishTime:{msg.publish_time}\n"
-                                                       f"Body:{msg.message_body}\n"
-                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                       f"Properties:{msg.properties}")
-                # ack_mq_message
-                Common.logger(log_type, crawler).info("回传 ack 消息")
-                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
-                time.sleep(2)
-
-                # 处理爬虫业务
-                task_dict = task_fun_mq(msg.message_body)['task_dict']
-                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
-                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
-                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
-                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
-                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
-                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
-                Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
-                GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
-                                                    crawler=crawler,
-                                                    rule_dict=rule_dict,
-                                                    env=env)
-                Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮结束\n')
-                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
-        except MQExceptionBase as err:
-            # Topic中没有消息可消费。
-            if err.type == "MessageNotExist":
-                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
-                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
-                continue
-
-            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
-            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
-            time.sleep(2)
-            continue
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler')  ## 添加参数
-    parser.add_argument('--topic_name')  ## 添加参数
-    parser.add_argument('--group_id')  ## 添加参数
-    parser.add_argument('--env')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         topic_name=args.topic_name,
-         group_id=args.group_id,
-         env=args.env)

+ 0 - 108
gongzhonghao/gongzhonghao_main/run_gzh4_author.py

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/6/6
-import argparse
-from mq_http_sdk.mq_client import *
-from mq_http_sdk.mq_consumer import *
-from mq_http_sdk.mq_exception import MQExceptionBase
-sys.path.append(os.getcwd())
-from common.public import task_fun_mq, get_consumer, ack_message
-from common.common import Common
-from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
-
-
-def main(log_type, crawler, topic_name, group_id, env):
-    if "gzh1" in topic_name:
-        log_type = "author1"
-    elif "gzh2" in topic_name:
-        log_type = "author2"
-    elif "gzh3" in topic_name:
-        log_type = "author3"
-    elif "gzh4" in topic_name:
-        log_type = "author4"
-    elif "gzh5" in topic_name:
-        log_type = "author5"
-    consumer = get_consumer(topic_name, group_id)
-    # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
-    # 长轮询时间3秒(最多可设置为30秒)。
-    wait_seconds = 30
-    # 一次最多消费3条(最多可设置为16条)。
-    batch = 1
-    Common.logger(log_type, crawler).info(f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                          f'WaitSeconds:{wait_seconds}\n'
-                                          f'TopicName:{topic_name}\n'
-                                          f'MQConsumer:{group_id}')
-    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-                                           f'WaitSeconds:{wait_seconds}\n'
-                                           f'TopicName:{topic_name}\n'
-                                           f'MQConsumer:{group_id}')
-    while True:
-        try:
-            # 长轮询消费消息。
-            recv_msgs = consumer.consume_message(batch, wait_seconds)
-            for msg in recv_msgs:
-                Common.logger(log_type, crawler).info(f"Receive\n"
-                                                      f"MessageId:{msg.message_id}\n"
-                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                      f"MessageTag:{msg.message_tag}\n"
-                                                      f"ConsumedTimes:{msg.consumed_times}\n"
-                                                      f"PublishTime:{msg.publish_time}\n"
-                                                      f"Body:{msg.message_body}\n"
-                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                      f"Properties:{msg.properties}")
-                Common.logging(log_type, crawler, env, f"Receive\n"
-                                                       f"MessageId:{msg.message_id}\n"
-                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
-                                                       f"MessageTag:{msg.message_tag}\n"
-                                                       f"ConsumedTimes:{msg.consumed_times}\n"
-                                                       f"PublishTime:{msg.publish_time}\n"
-                                                       f"Body:{msg.message_body}\n"
-                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
-                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
-                                                       f"Properties:{msg.properties}")
-                # ack_mq_message
-                ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
-
-                # 处理爬虫业务
-                task_dict = task_fun_mq(msg.message_body)['task_dict']
-                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
-                Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
-                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
-                Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
-                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
-                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
-                Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
-                GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
-                                                    crawler=crawler,
-                                                    rule_dict=rule_dict,
-                                                    env=env)
-                Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮结束\n')
-                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
-        except MQExceptionBase as err:
-            # Topic中没有消息可消费。
-            if err.type == "MessageNotExist":
-                Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
-                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
-                continue
-
-            Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
-            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
-            time.sleep(2)
-            continue
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## 新建参数解释器对象
-    parser.add_argument('--log_type', type=str)  ## 添加参数,注明参数类型
-    parser.add_argument('--crawler')  ## 添加参数
-    parser.add_argument('--topic_name')  ## 添加参数
-    parser.add_argument('--group_id')  ## 添加参数
-    parser.add_argument('--env')  ## 添加参数
-    args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
-    main(log_type=args.log_type,
-         crawler=args.crawler,
-         topic_name=args.topic_name,
-         group_id=args.group_id,
-         env=args.env)

+ 4 - 5
gongzhonghao/gongzhonghao_main/run_gzh_author.py

@@ -10,10 +10,10 @@ sys.path.append(os.getcwd())
 from common.public import task_fun_mq, get_consumer, ack_message
 from common.common import Common
 from common.scheduling_db import MysqlHelper
-from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor
+from gongzhonghao.gongzhonghao_author.gongzhonghao_author_lock import GongzhonghaoAuthor


-def get_author_videos(log_type, crawler, token_index, task_dict, rule_dict, user_list, env):
+def get_author_videos(log_type, crawler, task_dict, rule_dict, user_list, env):
     Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
     Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
     Common.logger(log_type, crawler).info(f"user_list:{user_list}")
@@ -21,11 +21,10 @@ def get_author_videos(log_type, crawler, token_index, task_dict, rule_dict, user
     GongzhonghaoAuthor.get_all_videos(log_type=log_type,
                                       crawler=crawler,
                                       task_dict=task_dict,
-                                      token_index=token_index,
                                       rule_dict=rule_dict,
                                       user_list=user_list,
                                       env=env)
-    Common.del_logs(log_type, crawler)
+    # Common.del_logs(log_type, crawler)
     Common.logger(log_type, crawler).info('抓取一轮结束\n')
     Common.logging(log_type, crawler, env, '抓取一轮结束\n')

@@ -103,7 +102,7 @@ def main(log_type, crawler, topic_name, group_id, env):
                     start = i * chunk_size
                     end = min((i + 1) * chunk_size, user_num + 1)
                     process = Process(target=get_author_videos, args=(
-                    f"{log_type}{i + 1}", crawler, i + 1, task_dict, rule_dict, user_list[start:end], env))
+                    f"{log_type}{i + 1}", crawler, task_dict, rule_dict, user_list[start:end], env))
                     process.start()
                     processes.append(process)


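The reworked run_gzh_author.py above drops the per-script token_index and instead fans the user list out across worker processes. A minimal, self-contained sketch of that chunking arithmetic follows; the chunk size and the stub worker are illustrative, not the project's real values:

# -*- coding: utf-8 -*-
# Sketch only: the chunk/fan-out logic used by run_gzh_author.py's main().
import math
from multiprocessing import Process


def crawl_chunk(log_type, crawler, task_dict, rule_dict, users, env):
    # stand-in for get_author_videos(); the real worker calls GongzhonghaoAuthor.get_all_videos()
    print(f"{log_type}: {len(users)} accounts")


def fan_out(user_list, task_dict, rule_dict, env, chunk_size=100):
    user_num = len(user_list)
    crawler_num = math.ceil(user_num / chunk_size)  # same result as floor-divide plus remainder bump
    processes = []
    for i in range(crawler_num):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, user_num)  # slicing clamps anyway; min() is only a guard
        p = Process(target=crawl_chunk,
                    args=(f"author{i + 1}", "gongzhonghao", task_dict, rule_dict,
                          user_list[start:end], env))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()


if __name__ == "__main__":
    fan_out(user_list=[{"uid": n} for n in range(250)], task_dict={}, rule_dict={}, env="dev")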
+ 5 - 6
gongzhonghao/gongzhonghao_main/run_gzh_author_dev.py

@@ -7,10 +7,10 @@ from multiprocessing import Process
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.scheduling_db import MysqlHelper
-from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor
+from gongzhonghao.gongzhonghao_author.gongzhonghao_author_lock import GongzhonghaoAuthor


-def get_author_videos(log_type, crawler, task_dict, token_index, rule_dict, user_list, env):
+def get_author_videos(log_type, crawler, task_dict, rule_dict, user_list, env):
     Common.logger(log_type, crawler).info(f'开始抓取:公众号账号\n')
     Common.logging(log_type, crawler, env, f'开始抓取:公众号账号\n')
     Common.logger(log_type, crawler).info(f"user_list:{user_list}")
@@ -18,11 +18,10 @@ def get_author_videos(log_type, crawler, task_dict, token_index, rule_dict, user
     GongzhonghaoAuthor.get_all_videos(log_type=log_type,
                                        crawler=crawler,
                                       task_dict=task_dict,
-                                       token_index = token_index,
                                        rule_dict=rule_dict,
                                        user_list = user_list,
                                        env=env)
-    Common.del_logs(log_type, crawler)
+    # Common.del_logs(log_type, crawler)
     Common.logger(log_type, crawler).info('抓取一轮结束\n')
     Common.logging(log_type, crawler, env, '抓取一轮结束\n')

@@ -41,7 +40,7 @@ def main(log_type, crawler, env):

             # 计算启动脚本数 crawler_num
             user_num = len(user_list)
-            chunk_size = 2  # 每个进程处理的用户数量
+            chunk_size = 1  # 每个进程处理的用户数量
             crawler_num = int(user_num // chunk_size)  # 向下取整
             if user_num % chunk_size != 0:
                 crawler_num += 1
@@ -53,7 +52,7 @@ def main(log_type, crawler, env):
             for i in range(crawler_num):
                 start = i * chunk_size
                 end = min((i + 1) * chunk_size, user_num + 1)
-                process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, task_dict, i+1, rule_dict, user_list[start:end], env))
+                process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, task_dict, rule_dict, user_list[start:end], env))
                 process.start()
                 processes.append(process)


+ 54 - 24
gongzhonghao/gongzhonghao_main/run_gzh5_author.py → gongzhonghao/gongzhonghao_main/run_gzh_author_old.py

@@ -1,27 +1,36 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
-# @Time: 2023/6/6
+# @Time: 2023/6/30
 import argparse
+from multiprocessing import Process
 from mq_http_sdk.mq_client import *
 from mq_http_sdk.mq_consumer import *
 from mq_http_sdk.mq_exception import MQExceptionBase
 sys.path.append(os.getcwd())
 from common.public import task_fun_mq, get_consumer, ack_message
 from common.common import Common
-from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+from common.scheduling_db import MysqlHelper
+from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor
+
+
+def get_author_videos(log_type, crawler, token_index, task_dict, rule_dict, user_list, env):
+    Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logger(log_type, crawler).info(f"user_list:{user_list}")
+    Common.logging(log_type, crawler, env, f"user_list:{user_list}")
+    GongzhonghaoAuthor.get_all_videos(log_type=log_type,
+                                      crawler=crawler,
+                                      task_dict=task_dict,
+                                      token_index=token_index,
+                                      rule_dict=rule_dict,
+                                      user_list=user_list,
+                                      env=env)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info('抓取一轮结束\n')
+    Common.logging(log_type, crawler, env, '抓取一轮结束\n')


 def main(log_type, crawler, topic_name, group_id, env):
-    if "gzh1" in topic_name:
-        log_type = "author1"
-    elif "gzh2" in topic_name:
-        log_type = "author2"
-    elif "gzh3" in topic_name:
-        log_type = "author3"
-    elif "gzh4" in topic_name:
-        log_type = "author4"
-    elif "gzh5" in topic_name:
-        log_type = "author5"
     consumer = get_consumer(topic_name, group_id)
     # 长轮询表示如果Topic没有消息,则客户端请求会在服务端挂起3秒,3秒内如果有消息可以消费则立即返回响应。
     # 长轮询时间3秒(最多可设置为30秒)。
@@ -64,22 +73,43 @@ def main(log_type, crawler, topic_name, group_id, env):
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)

-                # 处理爬虫业务
+                # 解析 task_dict
                 task_dict = task_fun_mq(msg.message_body)['task_dict']
-                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
                 Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
+
+                # 解析 rule_dict
+                rule_dict = task_fun_mq(msg.message_body)['rule_dict']
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
                 Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
-                Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
-                Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
-                GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
-                                                    crawler=crawler,
-                                                    rule_dict=rule_dict,
-                                                    env=env)
-                Common.del_logs(log_type, crawler)
-                Common.logger(log_type, crawler).info('抓取一轮结束\n')
-                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
+
+                # 解析 user_list
+                task_id = task_dict['id']
+                select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+                user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+
+                # 计算启动脚本数 crawler_num
+                user_num = len(user_list)
+                chunk_size = 100  # 每个进程处理的用户数量
+                crawler_num = int(user_num // chunk_size)  # 向下取整
+                if user_num % chunk_size != 0:
+                    crawler_num += 1
+                Common.logger(log_type, crawler).info(f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+                Common.logging(log_type, crawler, env, f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+
+                # 多进程并行抓取
+                processes = []
+                for i in range(crawler_num):
+                    start = i * chunk_size
+                    end = min((i + 1) * chunk_size, user_num + 1)
+                    process = Process(target=get_author_videos, args=(
+                    f"{log_type}{i + 1}", crawler, i + 1, task_dict, rule_dict, user_list[start:end], env))
+                    process.start()
+                    processes.append(process)
+
+                for process in processes:
+                    process.join()
+
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
@@ -105,4 +135,4 @@ if __name__ == "__main__":
          crawler=args.crawler,
          topic_name=args.topic_name,
          group_id=args.group_id,
-         env=args.env)
+         env=args.env)

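All of these runner entry points expose the same argparse flags, so a typical launch looks like the command below; the topic and group values are placeholders for whatever is configured in MQ, and --env would be "dev" rather than "prod" for local runs:

python3 gongzhonghao/gongzhonghao_main/run_gzh_author.py \
    --log_type="author" \
    --crawler="gongzhonghao" \
    --topic_name="<gzh_topic>" \
    --group_id="<gzh_group_id>" \
    --env="prod"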
+ 2 - 2
kanyikan/kanyikan_moment/kanyikan_moment.py

@@ -400,7 +400,7 @@ class Moment:
                     Common.logger("moment").info("添加视频到监控表:{}", download_video_title)
                     # 插入空行
                     time.sleep(1)
-                    Feishu.insert_columns("moment", "monitor", "6fed97", "ROWS", 1, 2)
+                    Feishu.insert_columns("moment", "control", "6fed97", "ROWS", 1, 2)
                     # 视频信息写入监控表
                     values = [[time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(upload_time))),
                                str(download_video_id),
@@ -410,7 +410,7 @@ class Moment:
                                str(download_video_send_time),
                                download_video_play_cnt]]
                     time.sleep(1)
-                    Feishu.update_values("moment", "monitor", "6fed97", "F2:L2", values)
+                    Feishu.update_values("moment", "control", "6fed97", "F2:L2", values)

                     # 删除行或列,可选 ROWS、COLUMNS
                     Feishu.dimension_range("moment", "kanyikan", "tGqZMX", "ROWS", i + 1, i + 1)

+ 2 - 1
requirements.txt

@@ -12,4 +12,5 @@ selenium~=4.2.0
 urllib3==1.26.9
 workalendar==17.0.0
 opencv-python~=4.8.0.74
-Appium-Python-Client~=2.8.1
+Appium-Python-Client~=2.8.1
+crontab~=1.0.1