
2023-12-20: General-purpose utility updates go live; added shared configuration for Feishu, item, pipeline, logging, Kuaidaili proxy, and MQ

罗俊辉 1 year ago
parent
commit
7c6aa27ab6

+ 1 - 0
application/common/feishu/__init__.py

@@ -0,0 +1 @@
+from .feishu import Feishu

+ 716 - 0
application/common/feishu/feishu.py

@@ -0,0 +1,716 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/1/31
+"""
+Feishu sheet helpers: token auth / CRUD / bot alerts
+"""
+import json
+import os
+import sys
+import requests
+import urllib3
+
+sys.path.append(os.getcwd())
+
+from application.common.log import Local
+proxies = {"http": None, "https": None}
+
+
+class Feishu:
+    """
+    Edit Feishu cloud documents
+    """
+    # Kanyikan crawler data sheet
+    kanyikan_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?"
+    # Kuaishou crawler data sheet
+    kuaishou_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?"
+    # Weishi crawler data sheet
+    weishi_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?"
+    # Xiaoniangao crawler data sheet
+    xiaoniangao_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?"
+    # Music album
+    music_album = "https://w42nne6hzg.feishu.cn/sheets/shtcnT6zvmfsYe1g0iv4pt7855g?"
+    # Benshanzhufu data sheet
+    crawler_benshanzhufu = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?"
+    # Official-accounts (gzh) crawler sheet
+    gzh_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnexNXnpDLHhARw0QdiwbYuA?"
+    # Data-monitoring sheet
+    crawler_monitor = "https://w42nne6hzg.feishu.cn/sheets/shtcnlZWYazInhf7Z60jkbLRJyd?"
+    # WeChat-group video crawler sheet
+    crawler_weiqun_video = "https://w42nne6hzg.feishu.cn/sheets/shtcnoKThNquYRweaylMFVyo9Hc?"
+    # Shipinhao (WeChat Channels) crawler sheet
+    crawler_shipinhao = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?"
+    # Xigua Video
+    crawler_xigua = "https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?"
+    # Zhihu PC
+    crawler_zhihu = "https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?"
+    # Jixiangxingfu
+    crawler_jixiangxingfu = "https://w42nne6hzg.feishu.cn/sheets/shtcnSx4nafMbLTq7xl7RHBwHBf?"
+    # Zhongmiaoyinxin
+    crawler_zmyx = "https://w42nne6hzg.feishu.cn/sheets/shtcnbZIxstPeM0xshW07b26sve?"
+    # Suisuiniannianyingfuqi
+    crawler_ssnnyfq = "https://w42nne6hzg.feishu.cn/sheets/shtcnyJmJSJynHDLLbLTkySfvZe?"
+    # Zhufumao video
+    crawler_zhufumao = "https://w42nne6hzg.feishu.cn/sheets/shtcnXfIJthvkjhI5zlEJq84i6g?"
+    # Religious official accounts
+    crawler_zongjiao = "https://w42nne6hzg.feishu.cn/sheets/shtcn73NW0CyoOeF21HWO15KBsb?"
+    # Haokan Video
+    crawler_haokan = "https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd"
+    # Kandaojiushifuqi
+    crawler_kandaojiushifuqi = "https://w42nne6hzg.feishu.cn/sheets/shtcnEokBkIjOUPAk8vbbPKnXgb"
+    # Shengshengyingyin
+    crawler_shengshengyingyin = "https://w42nne6hzg.feishu.cn/sheets/shtcnz1ymxHL1u8WHblfqfys7qe"
+    # Ganggangdouchuan
+    crawler_ganggangdouchuan = "https://w42nne6hzg.feishu.cn/sheets/shtcnTuJgeZU2bc7VaesAqk3QJx"
+    # Zhiqingtiantiankan
+    crawler_zhiqingtiantiankan = "https://w42nne6hzg.feishu.cn/sheets/shtcnjmhKdJOKdqnEzJcZb5xaHc?"
+    # Official account _ Xinxin
+    crawler_gongzhonghao = "https://w42nne6hzg.feishu.cn/sheets/shtcna98M2mX7TbivTj9Sb7WKBN?"
+    # YouTube
+    crawler_youtube = "https://w42nne6hzg.feishu.cn/sheets/shtcnrLyr1zbYbhhZyqpN7Xrd5f?"
+    # WeChat Index
+    weixinzhishu = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?"
+    # WeChat Index - search terms
+    weixinzhishu_search_word = "https://w42nne6hzg.feishu.cn/sheets/shtcnHxCj6dZBYMuK1Q3tIJVlqg?"
+    # Haitunzhufu
+    crawler_haitunzhufu = "https://w42nne6hzg.feishu.cn/sheets/VbyAsUGq3h9TQ7tG3GpczGjhn1M?"
+
+    # Feishu spreadsheet tokens, keyed by crawler name
+    @classmethod
+    def spreadsheettoken(cls, crawler):
+        """
+        :param crawler: which crawler
+        """
+        if crawler == "kanyikan":
+            return "shtcngRPoDYAi24x52j2nDuHMih"
+        elif crawler == "kuaishou":
+            return "shtcnICEfaw9llDNQkKgdymM1xf"
+        elif crawler == "weishi":
+            return "shtcn5YSWg91JfVGzj0SFZIRRPh"
+        elif crawler == "xiaoniangao":
+            return "shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+        elif crawler == "control":
+            return "shtcnlZWYazInhf7Z60jkbLRJyd"
+        elif crawler == "music_album":
+            return "shtcnT6zvmfsYe1g0iv4pt7855g"
+        elif crawler == "benshanzhufu":
+            return "shtcnGh2rrsPYM4iVNEBO7OqWrb"
+        elif crawler == "gzh":
+            return "shtcnexNXnpDLHhARw0QdiwbYuA"
+        elif crawler == "weiqun":
+            return "shtcnoKThNquYRweaylMFVyo9Hc"
+        elif crawler == 'shipinhao':
+            return 'shtcn9rOdZRAGFbRkWpn7hqEHGc'
+        elif crawler == 'xigua':
+            return 'shtcnvOpx2P8vBXiV91Ot1MKIw8'
+        elif crawler == 'zhihu':
+            return 'shtcnkGPBmGsjaqapgzouuj8MXe'
+        elif crawler == 'jixiangxingfu':
+            return 'shtcnSx4nafMbLTq7xl7RHBwHBf'
+        elif crawler == 'zhongmiaoyinxin':
+            return 'shtcnbZIxstPeM0xshW07b26sve'
+        elif crawler == 'suisuiniannianyingfuqi':
+            return 'shtcnyJmJSJynHDLLbLTkySfvZe'
+        elif crawler == 'zhufumao':
+            return 'shtcnXfIJthvkjhI5zlEJq84i6g'
+        elif crawler == 'zongjiao':
+            return 'shtcn73NW0CyoOeF21HWO15KBsb'
+        elif crawler == 'haokan':
+            return 'shtcnaYz8Nhv8q6DbWtlL6rMEBd'
+        elif crawler == 'kandaojiushifuqi':
+            return 'shtcnEokBkIjOUPAk8vbbPKnXgb'
+        elif crawler == 'shengshengyingyin':
+            return 'shtcnz1ymxHL1u8WHblfqfys7qe'
+        elif crawler == 'ganggangdouchuan':
+            return 'shtcnTuJgeZU2bc7VaesAqk3QJx'
+        elif crawler == 'youtube':
+            return 'shtcnrLyr1zbYbhhZyqpN7Xrd5f'
+        elif crawler == 'weixinzhishu':
+            return 'shtcnqhMRUGunIfGnGXMOBYiy4K'
+        elif crawler == 'weixinzhishu_search_word':
+            return 'shtcnHxCj6dZBYMuK1Q3tIJVlqg'
+        elif crawler == 'gongzhonghao':
+            return 'shtcna98M2mX7TbivTj9Sb7WKBN'
+        elif crawler == 'douyin':
+            return 'shtcnhq63MoXOpqbkuLuoapYIAh'
+        elif crawler == 'zhiqingtiantiankan':
+            return 'shtcnjmhKdJOKdqnEzJcZb5xaHc'
+        elif crawler == 'haitunzhufu':
+            return 'VbyAsUGq3h9TQ7tG3GpczGjhn1M'
+
+    # Fetch a Feishu API token
+    @classmethod
+    def get_token(cls, log_type, crawler):
+        """
+        Fetch a tenant_access_token for the Feishu open API
+        :return:
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+        post_data = {"app_id": "cli_a13ad2afa438d00b",  # credentials of the backend account used to publish the app
+                     "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
+
+        try:
+            urllib3.disable_warnings()
+            response = requests.post(url=url, data=post_data, proxies=proxies, verify=False)
+            tenant_access_token = response.json()["tenant_access_token"]
+            return tenant_access_token
+        except Exception as e:
+            Local.logger(log_type, crawler).error("获取飞书 api token 异常:{}", e)
+
+    # Fetch spreadsheet metadata
+    @classmethod
+    def get_metainfo(cls, log_type, crawler):
+        """
+        Fetch spreadsheet metadata
+        :return:
+        """
+        try:
+            get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/metainfo"
+
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
+                "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_metainfo_url, headers=headers, params=params, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            return response
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to fetch spreadsheet metadata: {}", e)
+
+    # Read all data from a worksheet
+    @classmethod
+    def get_values_batch(cls, log_type, crawler, sheetid):
+        """
+        Read all data from a worksheet
+        :param log_type: which log to use
+        :param crawler: which crawler
+        :param sheetid: which worksheet
+        :return: all rows
+        """
+        try:
+            get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                   + cls.spreadsheettoken(crawler) + "/values_batch_get"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # Multiple query ranges, e.g. url?ranges=range1,range2; each range is a sheetId plus a cell range
+                "ranges": sheetid,
+
+                # valueRenderOption=ToString returns plain-text values (except numeric types);
+                # valueRenderOption=FormattedValue computes and formats the cells;
+                # valueRenderOption=Formula returns the formula itself when the cell contains one;
+                # valueRenderOption=UnformattedValue computes but does not format the cells
+                "valueRenderOption": "ToString",
+
+                # dateTimeRenderOption=FormattedString computes and formats dates/times by their format, leaves numbers unformatted, and returns the formatted string.
+                "dateTimeRenderOption": "",
+
+                # type of user id to return: open_id or union_id
+                "user_id_type": "open_id"
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_values_batch_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            response = json.loads(r.content.decode("utf8"))
+            values = response["data"]["valueRanges"][0]["values"]
+            return values
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to read worksheet data: {}", e)
+
+    # Insert rows or columns into a worksheet
+    @classmethod
+    def insert_columns(cls, log_type, crawler, sheetid, majordimension, startindex, endindex):
+        """
+        Insert rows or columns into a worksheet
+        :param log_type: log path
+        :param crawler: which crawler's cloud doc
+        :param sheetid: which worksheet
+        :param majordimension: rows or columns, ROWS / COLUMNS
+        :param startindex: start position
+        :param endindex: end position
+        """
+        try:
+            insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                 + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
+                    "startIndex": startindex,  # 开始的位置
+                    "endIndex": endindex  # 结束的位置
+                },
+                "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
+            }
+
+            urllib3.disable_warnings()
+            r = requests.post(url=insert_columns_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("Inserted rows/columns: {}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to insert rows/columns: {}", e)
+
+    # Write data
+    @classmethod
+    def update_values(cls, log_type, crawler, sheetid, ranges, values):
+        """
+        Write data
+        :param log_type: log path
+        :param crawler: which crawler's cloud doc
+        :param sheetid: which worksheet
+        :param ranges: cell range
+        :param values: the concrete data to write, a list
+        """
+        try:
+            update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                + cls.spreadsheettoken(crawler) + "/values_batch_update"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "valueRanges": [
+                    {
+                        "range": sheetid + "!" + ranges,
+                        "values": values
+                    },
+                ],
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=update_values_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("Wrote data: {}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to write data: {}", e)
+
+    # Merge cells
+    @classmethod
+    def merge_cells(cls, log_type, crawler, sheetid, ranges):
+        """
+        Merge cells
+        :param log_type: log path
+        :param crawler: which crawler
+        :param sheetid: which worksheet
+        :param ranges: the cell range to merge
+        """
+        try:
+            merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/merge_cells"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+
+            body = {
+                "range": sheetid + "!" + ranges,
+                "mergeType": "MERGE_ROWS"
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=merge_cells_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("Merged cells: {}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to merge cells: {}", e)
+
+    # Read a cell
+    @classmethod
+    def get_range_value(cls, log_type, crawler, sheetid, cell):
+        """
+        Read the content of a cell
+        :param log_type: log path
+        :param crawler: which crawler
+        :param sheetid: which worksheet
+        :param cell: which cell
+        :return: cell content
+        """
+        try:
+            get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # valueRenderOption=ToString returns plain-text values (except numeric types);
+                # valueRenderOption=FormattedValue computes and formats the cells;
+                # valueRenderOption=Formula returns the formula itself when the cell contains one;
+                # valueRenderOption=UnformattedValue computes but does not format the cells.
+                "valueRenderOption": "FormattedValue",
+
+                # dateTimeRenderOption=FormattedString computes and formats dates/times by their format, leaves numbers unformatted, and returns the formatted string.
+                "dateTimeRenderOption": "",
+
+                # type of user id to return: open_id or union_id
+                "user_id_type": "open_id"
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_range_value_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            return r.json()["data"]["valueRange"]["values"][0]
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to read cell data: {}", e)
+
+    # Get the full content of a sheet as a flat list
+    @classmethod
+    def get_sheet_content(cls, log_type, crawler, sheet_id):
+        try:
+            sheet = Feishu.get_values_batch(log_type, crawler, sheet_id)
+            content_list = []
+            for row in sheet:
+                for cell in row:
+                    if cell is not None:
+                        content_list.append(cell)
+            return content_list
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f'get_sheet_content:{e}\n')
+
+    # Delete rows or columns; ROWS or COLUMNS
+    @classmethod
+    def dimension_range(cls, log_type, crawler, sheetid, major_dimension, startindex, endindex):
+        """
+        Delete rows or columns
+        :param log_type: log path
+        :param crawler: which crawler
+        :param sheetid: worksheet
+        :param major_dimension: defaults to ROWS; ROWS or COLUMNS
+        :param startindex: start position
+        :param endindex: end position
+        :return:
+        """
+        try:
+            dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": major_dimension,
+                    "startIndex": startindex,
+                    "endIndex": endindex
+                }
+            }
+            urllib3.disable_warnings()
+            r = requests.delete(url=dimension_range_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("Deleted video data: {}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("Failed to delete video data: {}", e)
+
+    # Get a user ID
+    @classmethod
+    def get_userid(cls, log_type, crawler, username):
+        try:
+            url = "https://open.feishu.cn/open-apis/user/v1/batch_get_id?"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            name_phone_dict = {
+                "xinxin": "15546206651",
+                "muxinyi": "13699208058",
+                "wangxueke": "13513479926",
+                "yuzhuoyi": "18624010360",
+                "luojunhui": "18801281360",
+                "fanjun": "15200827642",
+                "zhangyong": "17600025055"
+            }
+
+            # if username == "wangkun":
+            #     username = "13426262515"
+            # # elif username == "gaonannan":
+            # #     username = "18501180073"
+            # elif username == "xinxin":
+            #     username = "15546206651"
+            # # elif username == "huxinxue":
+            # #     username = "18832292015"
+            # # elif username == "wuchaoyue":
+            # #     username = "15712941385"
+            # elif username == "muxinyi":
+            #     username = '13699208058'
+            # elif username == "wangxueke":
+            #     username = '13513479926'
+            # elif username == "yuzhuoyi":
+            #     username = '18624010360'
+            # elif username == "luojunhui":
+            #     username = '18801281360'
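+            # NOTE: a username missing from name_phone_dict resolves to None; the request below then fails and is caught by the except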
+            username = name_phone_dict.get(username)
+
+            data = {"mobiles": [username]}
+            urllib3.disable_warnings()
+            r = requests.get(url=url, headers=headers, params=data, verify=False, proxies=proxies)
+            open_id = r.json()["data"]["mobile_users"][username][0]["open_id"]
+            # Common.logger(log_type, crawler).info(f"{username}:{open_id}")
+            # print(f"{username}:{open_id}")
+            return open_id
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f"get_userid exception: {e}\n")
+
+    # Feishu bot
+    @classmethod
+    def bot(cls, log_type, crawler, text):
+        try:
+            url = "https://open.feishu.cn/open-apis/bot/v2/hook/96989577-50e7-4653-9ec2-308fe3f2c5fe"
+            headers = {'Content-Type': 'application/json'}
+            if crawler == "kanyikan":
+                content = "看一看爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            # elif crawler == "weixinzhishu_out":
+            #     content = "微信指数_站外指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=YVuVgQ"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu_inner_sort":
+            #     content = "微信指数_站内短期指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=DrZHpa"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu_inner_long":
+            #     content = "微信指数_站内长期指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=JpgyAv"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            #
+            # elif crawler == "weixinzhishu" and text == "今日微信指数抓取完毕":
+            #     content = "微信指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "yuzhuoyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu":
+            #     content = "微信指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+
+            elif crawler == "xiaoniangao_hour":
+                content = "小年糕_小时级_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=yatRv2"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+            elif crawler == "xiaoniangao_person":
+                content = "小年糕_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=Wu0CeL"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+            elif crawler == "xiaoniangao_play":
+                content = "小年糕_播放量_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=c85k1C"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == 'xigua' and log_type == "recommend":
+                content = '西瓜视频_推荐_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=ZzsClu'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wangxueke")) + "></at>\n"
+            # elif crawler == 'xigua':
+            #     content = '西瓜视频_用户主页_已下载表'
+            #     sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=e075e9'
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            # elif crawler == 'xigua_little_video':
+            #     content = '西瓜视频_小视频_已下载表'
+            #     sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=hDSDnv'
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == 'zhihu_hot':
+                content = '知乎_热门_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=8871e3'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == 'zhihu_follow':
+                content = '知乎_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=4MGuux'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == 'haokan_hot':
+                content = '好看_热榜_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=5pWipX'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_channel':
+                content = '好看_频道_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=7f05d8'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_follow':
+                content = '好看_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=kVaSjf'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "music_album":
+                content = "音乐相册爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnT6zvmfsYe1g0iv4pt7855g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ssyy":
+                content = "胜胜影音爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnz1ymxHL1u8WHblfqfys7qe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ggdc":
+                content = "刚刚都传爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnTuJgeZU2bc7VaesAqk3QJx"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "bszf":
+                content = "本山祝福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "jxxf":
+                content = "吉祥幸福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnSx4nafMbLTq7xl7RHBwHBf"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "zmyx":
+                content = "众妙音信爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnbZIxstPeM0xshW07b26sve"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "zhufumao":
+                content = "祝福猫视频爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnXfIJthvkjhI5zlEJq84i6g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "kuaishou_follow":
+                content = "快手_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=fYdA8F"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+            elif crawler == "kuaishou_recommend":
+                content = "快手_推荐榜_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=3cd128"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "ssnnyfq":
+                content = "岁岁年年迎福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnyJmJSJynHDLLbLTkySfvZe?sheet=290bae"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "kdjsfq":
+                content = "看到就是福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnEokBkIjOUPAk8vbbPKnXgb?sheet=ad3b6d"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "gzh":
+                content = "公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnexNXnpDLHhARw0QdiwbYuA"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            elif crawler == "gongzhonghao":
+                content = "公众号_信欣_爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcna98M2mX7TbivTj9Sb7WKBN?"
+                users = f"\n<at id={str(cls.get_userid(log_type, crawler, 'fanjun'))}></at> <at id={str(cls.get_userid(log_type, crawler, 'wangxueke'))}></at> <at id={str(cls.get_userid(log_type, crawler, 'luojunhui'))}></at>\n"
+
+            elif crawler == "weiqun":
+                content = "微群爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnoKThNquYRweaylMFVyo9Hc"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "weishi":
+                content = "微视爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "shipinhao_recommend":
+                content = "视频号_推荐_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=c77cf9"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "shipinhao_follow":
+                content = "视频号_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=KsVtLe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "youtube":
+                content = "youtube_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnrLyr1zbYbhhZyqpN7Xrd5f?sheet=GVxlYk"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "zongjiao":
+                content = "宗教公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn73NW0CyoOeF21HWO15KBsb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            else:
+                content = "小年糕爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at>\n"
+
+            data = json.dumps({
+                "msg_type": "interactive",
+                "card": {
+                    "config": {
+                        "wide_screen_mode": True,
+                        "enable_forward": True
+                    },
+                    "elements": [{
+                        "tag": "div",
+                        "text": {
+                            "content": users + text,
+                            "tag": "lark_md"
+                        }
+                    }, {
+                        "actions": [{
+                            "tag": "button",
+                            "text": {
+                                "content": content,
+                                "tag": "lark_md"
+                            },
+                            "url": sheet_url,
+                            "type": "default",
+                            "value": {}
+                        }],
+                        "tag": "action"
+                    }],
+                    "header": {
+                        "title": {
+                            "content": "📣您有新的信息,请注意查收",
+                            "tag": "plain_text"
+                        }
+                    }
+                }
+            })
+            urllib3.disable_warnings()
+            r = requests.post(url, headers=headers, data=data, verify=False, proxies=proxies)
+            Local.logger(log_type, crawler).info(f'Bot message sent: {r.status_code}, {text}')
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f"bot exception: {e}\n")
+
+
+if __name__ == "__main__":
+    Feishu.bot('recommend', 'xigua', '测试: 西瓜推荐,登录失效')
+    # print(Feishu.get_userid('bot', 'weixinzhishu', 'wangkun'))
+    # print(Feishu.get_userid('bot', 'weixinzhishu', 'yuzhuoyi'))
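
For orientation, a minimal usage sketch of the helpers above; the sheet id "ZzsClu" and the log_type/crawler pair are illustrative placeholders, not values this commit prescribes:

```python
from application.common.feishu import Feishu

# Read every row of one worksheet
rows = Feishu.get_values_batch("recommend", "xigua", "ZzsClu")

# Insert blank rows at the top, then write a header into the new range
Feishu.insert_columns("recommend", "xigua", "ZzsClu", "ROWS", 1, 2)
Feishu.update_values("recommend", "xigua", "ZzsClu", "A2:B2", [["title", "url"]])

# Page the on-call group through the webhook bot
Feishu.bot("recommend", "xigua", "测试: 西瓜推荐,登录失效")
```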

+ 1 - 1
application/common/log/local_log.py

@@ -6,7 +6,7 @@ import os
 proxies = {"http": None, "https": None}
 
 
-class Local:
+class Local(object):
     # Fetch the current time uniformly <class 'datetime.datetime'>  2022-04-14 20:13:51.244472
     now = datetime.now()
     # Yesterday <class 'str'>  2022-04-13

+ 15 - 0
application/common/messageQueue/ack_message.py

@@ -0,0 +1,15 @@
+from application.common.log import Local
+from mq_http_sdk.mq_exception import MQExceptionBase
+
+
+def ack_message(log_type, crawler, recv_msgs, consumer):
+    # If a message is not acked as consumed before msg.next_consume_time, it will be redelivered.
+    # A receipt handle embeds a timestamp, so the same message gets a different handle on every delivery.
+    try:
+        receipt_handle_list = [msg.receipt_handle for msg in recv_msgs]
+        consumer.ack_message(receipt_handle_list)
+        Local.logger(log_type, crawler).info(
+            f"Ack {len(receipt_handle_list)} Message Succeed.\n"
+        )
+    except MQExceptionBase as err:
+        Local.logger(log_type, crawler).error(f"Ack Message Fail! Exception:{err}\n")
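
For context, a sketch of the consume/ack loop this helper is written for, using the Alibaba Cloud mq_http_sdk HTTP client; every endpoint, credential, and topic value below is a placeholder:

```python
from mq_http_sdk.mq_client import MQClient

from application.common.messageQueue.ack_message import ack_message

mq_client = MQClient("<http_endpoint>", "<access_key_id>", "<access_key_secret>")
consumer = mq_client.get_consumer("<instance_id>", "<topic_name>", "<group_id>")

recv_msgs = consumer.consume_message(3, 3)  # batch of up to 3 messages, 3 s long poll
for msg in recv_msgs:
    ...  # handle msg.message_body
# Ack before msg.next_consume_time, or the batch is redelivered
ack_message("recommend", "xigua", recv_msgs, consumer)
```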

+ 1 - 0
application/common/proxies/__init__.py

@@ -0,0 +1 @@
+from .fast_proxy import tunnel_proxies

+ 11 - 0
application/common/proxies/fast_proxy.py

@@ -0,0 +1,11 @@
+def tunnel_proxies():
+    # tunnel domain:port
+    tunnel = "q796.kdltps.com:15818"
+    # username/password authentication
+    username = "t17772369458618"
+    password = "5zqcjkmy"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
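
The return value drops straight into the requests proxies argument; a quick sketch (the target URL is illustrative):

```python
import requests

from application.common.proxies import tunnel_proxies

# The request exits through the Kuaidaili tunnel instead of the local network
response = requests.get("https://httpbin.org/ip", proxies=tunnel_proxies(), timeout=10)
print(response.text)
```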

+ 2 - 1
application/functions/__init__.py

@@ -1 +1,2 @@
-from .get_redirect_url import get_redirect_url
+from .get_redirect_url import get_redirect_url
+from .clean_title import clean_title

+ 22 - 0
application/functions/clean_title.py

@@ -0,0 +1,22 @@
+def clean_title(strings):
+    return (
+        strings.strip()
+        .replace("\n", "")
+        .replace("/", "")
+        .replace("\r", "")
+        .replace("#", "")
+        .replace(".", "。")
+        .replace("\\", "")
+        .replace("&NBSP", "")
+        .replace(":", "")
+        .replace("*", "")
+        .replace("?", "")
+        .replace("?", "")
+        .replace('"', "")
+        .replace("<", "")
+        .replace(">", "")
+        .replace("|", "")
+        .replace(" ", "")
+        .replace('"', "")
+        .replace("'", "")
+    )
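
A quick behavioral check with an illustrative input: separators and punctuation are stripped, and "." maps to the CJK full stop "。":

```python
from application.functions import clean_title

print(clean_title(' 今日|推荐: "最美乡村"? #1.\n'))
# -> 今日推荐最美乡村1。
```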

+ 38 - 0
application/functions/read_mysql_config.py

@@ -0,0 +1,38 @@
+from application.common.mysql import MysqlHelper
+
+
+def get_config_from_mysql(log_type, source, env, text, action=""):
+    select_sql = f"""select * from crawler_config where source="{source}" """
+    contents = MysqlHelper.get_values(log_type, source, select_sql, env, action=action)
+    title_list = []
+    filter_list = []
+    emoji_list = []
+    search_word_list = []
+    for content in contents:
+        config = content["config"]
+        config_dict = eval(config)
+        for k, v in config_dict.items():
+            if k == "title":
+                title_list_config = v.split(",")
+                for title in title_list_config:
+                    title_list.append(title)
+            if k == "filter":
+                filter_list_config = v.split(",")
+                for filter_word in filter_list_config:
+                    filter_list.append(filter_word)
+            if k == "emoji":
+                emoji_list_config = v.split(",")
+                for emoji in emoji_list_config:
+                    emoji_list.append(emoji)
+            if k == "search_word":
+                search_word_list_config = v.split(",")
+                for search_word in search_word_list_config:
+                    search_word_list.append(search_word)
+    if text == "title":
+        return title_list
+    elif text == "filter":
+        return filter_list
+    elif text == "emoji":
+        return emoji_list
+    elif text == "search_word":
+        return search_word_list
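
A hypothetical call, assuming a crawler_config row whose config column holds a dict literal such as {"title": "a,b", "filter": "x,y"}:

```python
# Pull the title whitelist for the "xigua" source in the prod environment
title_list = get_config_from_mysql("recommend", "xigua", "prod", "title")
```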

+ 1 - 0
application/items/__init__.py

@@ -0,0 +1 @@
+from .item import VideoItem

+ 94 - 0
application/items/item.py

@@ -0,0 +1,94 @@
+import time
+from application.functions import clean_title
+
+
+class VideoItem(object):
+    """
+    function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
+    __init__: 初始化空json 对象,用来存储视频信息
+    add_video_info: 把视频信息存储到 item 对象中
+    check_item: 检查 item 对象中的各个元素以及处理
+    """
+
+    def __init__(self):
+        self.item = {}
+
+    def add_video_info(self, key, value):
+        self.item[key] = value
+
+    def check_item(self):
+        """
+        判断item 里面的字段,是否符合要求
+        字段分为 3 类:
+        1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
+        2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
+        3. 需要后出理的字段: video_title, publish_time
+        """
+        if self.item.get("video_title"):
+            self.item["video_title"] = clean_title(self.item["video_title"])
+        else:
+            return False
+        if self.item.get("publish_time_stamp"):
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
+            )
+            self.add_video_info("publish_time_str", publish_time_str)
+        else:
+            publish_time_stamp = int(time.time())
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
+            )
+            self.add_video_info("publish_time_stamp", publish_time_stamp)
+            self.add_video_info("publish_time_str", publish_time_str)
+        self.add_video_info("publish_time", publish_time_str)
+        if not self.item.get("update_time_stamp"):
+            self.add_video_info("update_time_stamp", int(time.time()))
+
+        # default to 0 if absent
+        config_keys = [
+            "duration",
+            "play_cnt",
+            "like_cnt",
+            "comment_cnt",
+            "share_cnt",
+            "width",
+            "height",
+        ]
+        for config_key in config_keys:
+            if self.item.get(config_key):
+                continue
+            else:
+                self.add_video_info(config_key, 0)
+
+        # must-have elements; the check fails if any is missing
+        must_keys = [
+            "video_id",
+            "user_id",
+            "user_name",
+            "out_video_id",
+            "session",
+            "video_url",
+            "cover_url",
+            "platform",
+            "strategy",
+        ]
+        """
+        video_id, out_video_id 均为站外视频 id
+        usr_id: 站内用户 id
+        out_user_id: 站外用户 id
+        user_name: 站外用户名称
+        """
+        for m_key in must_keys:
+            if self.item.get(m_key):
+                continue
+            else:
+                # print(m_key)
+                return False
+        return True
+
+    def produce_item(self):
+        flag = self.check_item()
+        if flag:
+            return self.item
+        else:
+            return False
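
A sketch of the intended flow; every field value below is made up:

```python
from application.items import VideoItem

item = VideoItem()
item.add_video_info("video_title", "示例标题")
item.add_video_info("publish_time_stamp", 1703001600)
for key, value in {
    "video_id": "v123", "user_id": 10001, "user_name": "demo_user",
    "out_video_id": "out456", "session": "session-0001",
    "video_url": "https://example.com/v.mp4", "cover_url": "https://example.com/c.jpg",
    "platform": "xigua", "strategy": "recommend",
}.items():
    item.add_video_info(key, value)

video_dict = item.produce_item()  # the filled dict, or False if a must-have key is missing
```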

+ 2 - 1
application/pipeline/__init__.py

@@ -1 +1,2 @@
-from .pipeline_dev import PiaoQuanPipelineTest
+from .pipeline_dev import PiaoQuanPipelineTest
+from .pipeline import PiaoQuanPipeline

+ 135 - 0
application/pipeline/pipeline.py

@@ -0,0 +1,135 @@
+import re
+import time
+from application.common.log import AliyunLogger
+from application.common.mysql import MysqlHelper
+
+
+class PiaoQuanPipeline:
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+        self.mysql = MysqlHelper(env=env, mode=mode, platform=platform)
+        self.aliyun_log = AliyunLogger(platform=platform, mode=mode, env=env)
+
+    # Publish-time limit for a video; this is rule-based filtering
+    def publish_time_flag(self):
+        # Check the publish time
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        max_d = self.rule_dict.get("period", {}).get("max", 1000)
+        min_d = self.rule_dict.get("period", {}).get("min", 1000)
+        days = max_d if max_d > min_d else min_d
+        if self.platform == "gongzhonghao":
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * days
+            ) and (
+                int(time.time()) - update_time_stamp
+                > 3600 * 24 * days
+            ):
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message="发布时间超过{}天".format(days),
+                )
+                return False
+        else:
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * days
+            ):
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message="发布时间超过{}天".format(days),
+                )
+                return False
+        return True
+
+    # Does the video title meet the requirements?
+    def title_flag(self):
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        # Sensitive words
+        # fetch the sensitive-word list (currently an empty placeholder)
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            self.aliyun_log.logging(
+                code="2003",
+                trace_id=self.trace_id,
+                message="标题中包含敏感词",
+                data=self.item,
+            )
+            return False
+        return True
+
+    # Basic download rules for a video
+    def download_rule_flag(self):
+        for key in self.item:
+            if self.rule_dict.get(key):
+                max_value = (
+                    int(self.rule_dict[key]["max"])
+                    if int(self.rule_dict[key]["max"]) > 0
+                    else 999999999999999
+                )
+                if key == "peroid": # peroid是抓取周期天数
+                    continue
+                else:
+                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
+                    if not flag:
+                        self.aliyun_log.logging(
+                            code="2004",
+                            trace_id=self.trace_id,
+                            data=self.item,
+                            message="{}: {} <= {} <= {}, {}".format(
+                                key,
+                                self.rule_dict[key]["min"],
+                                self.item[key],
+                                max_value,
+                                flag,
+                            ),
+                        )
+                        return flag
+            else:
+                continue
+        return True
+
+    # Deduplicate within a specific platform
+    def repeat_video(self):
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        out_id = self.item["out_video_id"]
+        sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
+        repeat_video = self.mysql.select(sql=sql)
+        if repeat_video:
+            self.aliyun_log.logging(
+                code="2002",
+                trace_id=self.trace_id,
+                message="重复的视频",
+                data=self.item,
+            )
+            return False
+        return True
+
+    def process_item(self):
+        if not self.publish_time_flag():
+            # logging handled inside the check
+            return False
+        if not self.title_flag():
+            # logging handled inside the check
+            return False
+        if not self.repeat_video():
+            # logging handled inside the check
+            return False
+        if not self.download_rule_flag():
+            # logging handled inside the check
+            return False
+        return True
+
+
+
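
Finally, a sketch of how the pipeline is meant to be driven; the rule_dict keys and values are illustrative:

```python
pipeline = PiaoQuanPipeline(
    platform="xigua",
    mode="recommend",
    rule_dict={"period": {"min": 30, "max": 30}, "play_cnt": {"min": 5000, "max": 0}},
    env="prod",
    item=video_dict,        # as produced by VideoItem.produce_item()
    trace_id="trace-0001",  # illustrative trace id
)
if pipeline.process_item():
    ...  # hand the item off to ETL / download
```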