
first commit

zhangliang, 2 weeks ago
Current commit: 42e9e11fbd
67 files changed, 3803 insertions and 0 deletions
  1. .gitignore (+62, -0)
  2. README.md (+19, -0)
  3. application/__init__.py (+0, -0)
  4. application/common/__init__.py (+6, -0)
  5. application/common/feishu/__init__.py (+3, -0)
  6. application/common/feishu/feishu.py (+725, -0)
  7. application/common/feishu/feishu_data.py (+20, -0)
  8. application/common/feishu/feishu_insert.py (+56, -0)
  9. application/common/feishu/feishu_utils.py (+398, -0)
  10. application/common/ffmpeg/__init__.py (+0, -0)
  11. application/common/ffmpeg/ffmpeg_utils.py (+76, -0)
  12. application/common/gpt/__init__.py (+1, -0)
  13. application/common/gpt/gpt4o_mini_help.py (+61, -0)
  14. application/common/log/__init__.py (+2, -0)
  15. application/common/log/aliyun_log.py (+81, -0)
  16. application/common/log/local_log.py (+54, -0)
  17. application/common/messageQueue/__init__.py (+3, -0)
  18. application/common/messageQueue/ack_message.py (+15, -0)
  19. application/common/messageQueue/consumer.py (+25, -0)
  20. application/common/messageQueue/mq.py (+51, -0)
  21. application/common/mysql/__init__.py (+1, -0)
  22. application/common/mysql/mysql_helper.py (+122, -0)
  23. application/common/mysql/sql.py (+57, -0)
  24. application/common/proxies/__init__.py (+2, -0)
  25. application/common/proxies/fast_proxy.py (+23, -0)
  26. application/common/redis/__init__.py (+0, -0)
  27. application/common/redis/pyredis.py (+55, -0)
  28. application/common/redis/redis_helper.py (+67, -0)
  29. application/common/redis/xng_redis.py (+54, -0)
  30. application/config/__init__.py (+3, -0)
  31. application/config/aliyun_config.py (+0, -0)
  32. application/config/config.py (+7, -0)
  33. application/config/ipconfig.py (+30, -0)
  34. application/config/mysql_config.py (+36, -0)
  35. application/config/topic_group_queue.py (+24, -0)
  36. application/etl/__init__.py (+0, -0)
  37. application/etl/download.py (+134, -0)
  38. application/functions/__init__.py (+3, -0)
  39. application/functions/appium_tools.py (+26, -0)
  40. application/functions/clean_title.py (+22, -0)
  41. application/functions/crypt.py (+3, -0)
  42. application/functions/get_redirect_url.py (+9, -0)
  43. application/functions/read_mysql_config.py (+46, -0)
  44. application/functions/zqkd_db_redis.py (+240, -0)
  45. application/items/__init__.py (+1, -0)
  46. application/items/item.py (+94, -0)
  47. application/pipeline/__init__.py (+2, -0)
  48. application/pipeline/pipeline.py (+273, -0)
  49. application/pipeline/pipeline_dev.py (+112, -0)
  50. configs/__init__.py (+0, -0)
  51. configs/codes.py (+61, -0)
  52. configs/config.py (+1, -0)
  53. configs/messages.py (+52, -0)
  54. configs/spiders_config.yaml (+59, -0)
  55. configs/topic_map.yaml (+3, -0)
  56. crawler_worker/__init__.py (+0, -0)
  57. crawler_worker/rabbitmq_consumer.py (+90, -0)
  58. crawler_worker/universal_crawler.py (+205, -0)
  59. main.py (+76, -0)
  60. pipelines/__init__.py (+0, -0)
  61. scheduler/__init__.py (+0, -0)
  62. scheduler/scheduler_main.py (+45, -0)
  63. utils/__init__.py (+0, -0)
  64. utils/config_loader.py (+37, -0)
  65. utils/extractors.py (+29, -0)
  66. utils/path_utils.py (+10, -0)
  67. utils/project_paths.py (+31, -0)

+ 62 - 0
.gitignore

@@ -0,0 +1,62 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+.idea/
+*.DS_Store

+ 19 - 0
README.md

@@ -0,0 +1,19 @@
+Overall flow:
+An MQ message is pushed
+   ↓
+main.py consumes it and spawns a thread
+   ↓
+handle_message() → parses the message body (platform + mode)
+   ↓
+Load user_list / rule_dict (from the database)
+   ↓
+Run UniversalCrawler.run()
+   ↓
+1. Read the config
+2. Request the video API
+3. Parse the video data (field mapping)
+4. Build a VideoItem
+5. Push it to the ETL MQ
+   ↓
+All steps succeed → ack the message
+Any failure → no ack; the MQ retries automatically
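
As a rough illustration of the flow above, here is a minimal, self-contained Python sketch. The message shape, the loader helpers, and the crawler class are stand-ins; the project's real UniversalCrawler, main.py, and MySQL helpers live elsewhere in this commit and may take different arguments.

    import json

    # Stand-ins for the real DB loaders and crawler class (illustration only).
    def load_user_list(platform, mode):
        return [{"uid": 1}]

    def load_rule_dict(platform, mode):
        return {"duration": {"min": 30}}

    class DummyCrawler:
        def __init__(self, platform, mode, rule_dict, user_list):
            self.platform, self.mode = platform, mode
            self.rule_dict, self.user_list = rule_dict, user_list

        def run(self):
            # 1. read the config  2. request the video API  3. map fields
            # 4. build a VideoItem  5. push it to the ETL MQ
            return True  # report overall success

    def handle_message(body, ack):
        msg = json.loads(body)                       # e.g. {"platform": "xigua", "mode": "recommend"}
        platform, mode = msg["platform"], msg["mode"]
        crawler = DummyCrawler(platform, mode,
                               load_rule_dict(platform, mode),
                               load_user_list(platform, mode))
        if crawler.run():
            ack()  # ack only after every step succeeded
        # on failure: no ack, so the MQ redelivers the message automatically

    if __name__ == "__main__":
        handle_message(b'{"platform": "xigua", "mode": "recommend"}', lambda: print("acked"))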

+ 0 - 0
application/__init__.py


+ 6 - 0
application/common/__init__.py

@@ -0,0 +1,6 @@
+from .feishu import Feishu, FeishuInsert
+from .log import *
+from .messageQueue import *
+from .mysql import *
+from .proxies import *
+from .redis import redis_helper

+ 3 - 0
application/common/feishu/__init__.py

@@ -0,0 +1,3 @@
+from .feishu import Feishu
+from .feishu_insert import FeishuInsert
+from .feishu_data import FsData

+ 725 - 0
application/common/feishu/feishu.py

@@ -0,0 +1,725 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/1/31
+"""
+飞书表配置: token 鉴权 / 增删改查 / 机器人报警
+"""
+import json
+import os
+import sys
+import requests
+import urllib3
+
+sys.path.append(os.getcwd())
+
+from application.common.log import Local
+proxies = {"http": None, "https": None}
+
+
+class Feishu:
+    """
+    编辑飞书云文档
+    """
+    # 看一看爬虫数据表
+    kanyikan_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?"
+    # 快手爬虫数据表
+    kuaishou_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?"
+    # 微视爬虫数据表
+    weishi_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh?"
+    # 小年糕爬虫数据表
+    xiaoniangao_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?"
+    # 音乐相册
+    music_album = "https://w42nne6hzg.feishu.cn/sheets/shtcnT6zvmfsYe1g0iv4pt7855g?"
+    # 本山祝福数据表
+    crawler_benshanzhufu = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb?"
+    # 公众号爬虫表
+    gzh_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnexNXnpDLHhARw0QdiwbYuA?"
+    # 数据监控表
+    crawler_monitor = "https://w42nne6hzg.feishu.cn/sheets/shtcnlZWYazInhf7Z60jkbLRJyd?"
+    # 微群视频爬虫表
+    crawler_weiqun_video = "https://w42nne6hzg.feishu.cn/sheets/shtcnoKThNquYRweaylMFVyo9Hc?"
+    # 视频号爬虫表
+    crawler_shipinhao = 'https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?'
+    # 西瓜视频
+    crawler_xigua = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?'
+    # 知乎 PC 端
+    crawler_zhihu = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?'
+    # 吉祥幸福
+    crawler_jixiangxingfu = 'https://w42nne6hzg.feishu.cn/sheets/shtcnSx4nafMbLTq7xl7RHBwHBf?'
+    # 福小顺
+    crawler_fuxiaoshun = 'https://w42nne6hzg.feishu.cn/sheets/CoXEsl6MDhMaKKt6GUBcvLwsnWb?'
+    # 众妙音信
+    crawler_zmyx = 'https://w42nne6hzg.feishu.cn/sheets/shtcnbZIxstPeM0xshW07b26sve?'
+    # 岁岁年年迎福气
+    crawler_ssnnyfq = 'https://w42nne6hzg.feishu.cn/sheets/shtcnyJmJSJynHDLLbLTkySfvZe?'
+    # 祝福猫视频
+    crawler_zhufumao = 'https://w42nne6hzg.feishu.cn/sheets/shtcnXfIJthvkjhI5zlEJq84i6g?'
+    # 宗教公众号
+    crawler_zongjiao = 'https://w42nne6hzg.feishu.cn/sheets/shtcn73NW0CyoOeF21HWO15KBsb?'
+    # 好看视频
+    crawler_haokan = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd'
+    # 看到就是福气
+    crawler_kandaojiushifuqi = 'https://w42nne6hzg.feishu.cn/sheets/shtcnEokBkIjOUPAk8vbbPKnXgb'
+    # 胜胜影音
+    crawler_shengshengyingyin = 'https://w42nne6hzg.feishu.cn/sheets/shtcnz1ymxHL1u8WHblfqfys7qe'
+    # 刚刚都传
+    crawler_ganggangdouchuan = 'https://w42nne6hzg.feishu.cn/sheets/shtcnTuJgeZU2bc7VaesAqk3QJx'
+    # 知青天天看
+    crawler_zhiqingtiantiankan = 'https://w42nne6hzg.feishu.cn/sheets/shtcnjmhKdJOKdqnEzJcZb5xaHc?'
+    # 公众号_信欣
+    crawler_gongzhonghao = 'https://w42nne6hzg.feishu.cn/sheets/shtcna98M2mX7TbivTj9Sb7WKBN?'
+    # YouTube
+    crawler_youtube = 'https://w42nne6hzg.feishu.cn/sheets/shtcnrLyr1zbYbhhZyqpN7Xrd5f?'
+    # 微信指数
+    weixinzhishu = 'https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?'
+    # 微信指数_搜索词
+    weixinzhishu_search_word = 'https://w42nne6hzg.feishu.cn/sheets/shtcnHxCj6dZBYMuK1Q3tIJVlqg?'
+    # 海豚祝福
+    crawler_haitunzhufu = 'https://w42nne6hzg.feishu.cn/sheets/VbyAsUGq3h9TQ7tG3GpczGjhn1M?'
+
+    # 飞书路径token
+    @classmethod
+    def spreadsheettoken(cls, crawler):
+        """
+        :param crawler: 哪个爬虫
+        """
+        if crawler == "kanyikan":
+            return "shtcngRPoDYAi24x52j2nDuHMih"
+        elif crawler == "kuaishou":
+            return "shtcnICEfaw9llDNQkKgdymM1xf"
+        elif crawler == "weishi":
+            return "shtcn5YSWg91JfVGzj0SFZIRRPh"
+        elif crawler == "xiaoniangao":
+            return "shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+        elif crawler == "control":
+            return "shtcnlZWYazInhf7Z60jkbLRJyd"
+        elif crawler == "music_album":
+            return "shtcnT6zvmfsYe1g0iv4pt7855g"
+        elif crawler == "benshanzhufu":
+            return "shtcnGh2rrsPYM4iVNEBO7OqWrb"
+        elif crawler == "gzh":
+            return "shtcnexNXnpDLHhARw0QdiwbYuA"
+        elif crawler == "weiqun":
+            return "shtcnoKThNquYRweaylMFVyo9Hc"
+        elif crawler == 'shipinhao':
+            return 'shtcn9rOdZRAGFbRkWpn7hqEHGc'
+        elif crawler == 'xigua':
+            return 'shtcnvOpx2P8vBXiV91Ot1MKIw8'
+        elif crawler == 'zhihu':
+            return 'shtcnkGPBmGsjaqapgzouuj8MXe'
+        elif crawler == 'jixiangxingfu':
+            return 'shtcnSx4nafMbLTq7xl7RHBwHBf'
+        elif crawler == 'fuxiaoshun':
+            return 'CoXEsl6MDhMaKKt6GUBcvLwsnWb'
+        elif crawler == 'zhongmiaoyinxin':
+            return 'shtcnbZIxstPeM0xshW07b26sve'
+        elif crawler == 'suisuiniannianyingfuqi':
+            return 'shtcnyJmJSJynHDLLbLTkySfvZe'
+        elif crawler == 'zhufumao':
+            return 'shtcnXfIJthvkjhI5zlEJq84i6g'
+        elif crawler == 'zongjiao':
+            return 'shtcn73NW0CyoOeF21HWO15KBsb'
+        elif crawler == 'haokan':
+            return 'shtcnaYz8Nhv8q6DbWtlL6rMEBd'
+        elif crawler == 'kandaojiushifuqi':
+            return 'shtcnEokBkIjOUPAk8vbbPKnXgb'
+        elif crawler == 'shengshengyingyin':
+            return 'shtcnz1ymxHL1u8WHblfqfys7qe'
+        elif crawler == 'ganggangdouchuan':
+            return 'shtcnTuJgeZU2bc7VaesAqk3QJx'
+        elif crawler == 'youtube':
+            return 'shtcnrLyr1zbYbhhZyqpN7Xrd5f'
+        elif crawler == 'weixinzhishu':
+            return 'shtcnqhMRUGunIfGnGXMOBYiy4K'
+        elif crawler == 'weixinzhishu_search_word':
+            return 'shtcnHxCj6dZBYMuK1Q3tIJVlqg'
+        elif crawler == 'gongzhonghao':
+            return 'shtcna98M2mX7TbivTj9Sb7WKBN'
+        elif crawler == 'douyin':
+            return 'shtcnhq63MoXOpqbkuLuoapYIAh'
+        elif crawler == 'zhiqingtiantiankan':
+            return 'shtcnjmhKdJOKdqnEzJcZb5xaHc'
+        elif crawler == 'haitunzhufu':
+            return 'VbyAsUGq3h9TQ7tG3GpczGjhn1M'
+
+    # 获取飞书api token
+    @classmethod
+    def get_token(cls, log_type, crawler):
+        """
+        获取飞书api token
+        :return:
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+        post_data = {"app_id": "cli_a13ad2afa438d00b",  # 这里账号密码是发布应用的后台账号及密码
+                     "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
+
+        try:
+            urllib3.disable_warnings()
+            response = requests.post(url=url, data=post_data, proxies=proxies, verify=False)
+            tenant_access_token = response.json()["tenant_access_token"]
+            return tenant_access_token
+        except Exception as e:
+            Local.logger(log_type, crawler).error("获取飞书 api token 异常:{}", e)
+
+    # 获取表格元数据
+    @classmethod
+    def get_metainfo(cls, log_type, crawler):
+        """
+        获取表格元数据
+        :return:
+        """
+        try:
+            get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/metainfo"
+
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
+                "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_metainfo_url, headers=headers, params=params, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            return response
+        except Exception as e:
+            Local.logger(log_type, crawler).error("获取表格元数据异常:{}", e)
+
+    # 读取工作表中所有数据
+    @classmethod
+    def get_values_batch(cls, log_type, crawler, sheetid):
+        """
+        读取工作表中所有数据
+        :param log_type: 启用哪个 log
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张表
+        :return: 所有数据
+        """
+        try:
+            get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                   + cls.spreadsheettoken(crawler) + "/values_batch_get"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # 多个查询范围 如 url?ranges=range1,range2 ,其中 range 包含 sheetId 与单元格范围两部分
+                "ranges": sheetid,
+
+                # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+                # valueRenderOption=FormattedValue 计算并格式化单元格;
+                # valueRenderOption=Formula单元格中含有公式时返回公式本身;
+                # valueRenderOption=UnformattedValue计算但不对单元格进行格式化
+                "valueRenderOption": "ToString",
+
+                # dateTimeRenderOption=FormattedString 计算并将时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+                "dateTimeRenderOption": "",
+
+                # 返回的用户id类型,可选open_id,union_id
+                "user_id_type": "open_id"
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_values_batch_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            response = json.loads(r.content.decode("utf8"))
+            values = response["data"]["valueRanges"][0]["values"]
+            return values
+        except Exception as e:
+            Local.logger(log_type, crawler).error("读取工作表所有数据异常:{}", e)
+
+    # 工作表,插入行或列
+    @classmethod
+    def insert_columns(cls, log_type, crawler, sheetid, majordimension, startindex, endindex):
+        """
+        工作表插入行或列
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param majordimension:行或者列, ROWS、COLUMNS
+        :param startindex:开始位置
+        :param endindex:结束位置
+        """
+        try:
+            insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                 + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
+                    "startIndex": startindex,  # 开始的位置
+                    "endIndex": endindex  # 结束的位置
+                },
+                "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
+            }
+
+            urllib3.disable_warnings()
+            r = requests.post(url=insert_columns_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("插入行或列:{}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("插入行或列异常:{}", e)
+
+    # 写入数据
+    @classmethod
+    def update_values(cls, log_type, crawler, sheetid, ranges, values):
+        """
+        写入数据
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param ranges:单元格范围
+        :param values:写入的具体数据,list
+        """
+        try:
+            update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                + cls.spreadsheettoken(crawler) + "/values_batch_update"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "valueRanges": [
+                    {
+                        "range": sheetid + "!" + ranges,
+                        "values": values
+                    },
+                ],
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=update_values_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("写入数据:{}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("写入数据异常:{}", e)
+
+    # 合并单元格
+    @classmethod
+    def merge_cells(cls, log_type, crawler, sheetid, ranges):
+        """
+        合并单元格
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid:哪张工作表
+        :param ranges:需要合并的单元格范围
+        """
+        try:
+            merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/merge_cells"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+
+            body = {
+                "range": sheetid + "!" + ranges,
+                "mergeType": "MERGE_ROWS"
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=merge_cells_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("合并单元格:{}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("合并单元格异常:{}", e)
+
+    # 读取单元格数据
+    @classmethod
+    def get_range_value(cls, log_type, crawler, sheetid, cell):
+        """
+        读取单元格内容
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张工作表
+        :param cell: 哪个单元格
+        :return: 单元格内容
+        """
+        try:
+            get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+                # valueRenderOption=FormattedValue 计算并格式化单元格;
+                # valueRenderOption=Formula 单元格中含有公式时返回公式本身;
+                # valueRenderOption=UnformattedValue 计算但不对单元格进行格式化。
+                "valueRenderOption": "FormattedValue",
+
+                # dateTimeRenderOption=FormattedString 计算并对时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+                "dateTimeRenderOption": "",
+
+                # 返回的用户id类型,可选open_id,union_id
+                "user_id_type": "open_id"
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_range_value_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # print(r.text)
+            return r.json()["data"]["valueRange"]["values"][0]
+        except Exception as e:
+            Local.logger(log_type, crawler).error("读取单元格数据异常:{}", e)
+
+    # 获取表内容
+    @classmethod
+    def get_sheet_content(cls, log_type, crawler, sheet_id):
+        try:
+            sheet = Feishu.get_values_batch(log_type, crawler, sheet_id)
+            content_list = []
+            for x in sheet:
+                for y in x:
+                    if y is None:
+                        pass
+                    else:
+                        content_list.append(y)
+            return content_list
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f'get_sheet_content:{e}\n')
+
+    # 删除行或列,可选 ROWS、COLUMNS
+    @classmethod
+    def dimension_range(cls, log_type, crawler, sheetid, major_dimension, startindex, endindex):
+        """
+        删除行或列
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid:工作表
+        :param major_dimension:默认 ROWS ,可选 ROWS、COLUMNS
+        :param startindex:开始的位置
+        :param endindex:结束的位置
+        :return:
+        """
+        try:
+            dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": major_dimension,
+                    "startIndex": startindex,
+                    "endIndex": endindex
+                }
+            }
+            urllib3.disable_warnings()
+            r = requests.delete(url=dimension_range_url, headers=headers, json=body, proxies=proxies, verify=False)
+            Local.logger(log_type, crawler).info("删除视频数据:{}", r.json()["msg"])
+        except Exception as e:
+            Local.logger(log_type, crawler).error("删除视频数据异常:{}", e)
+
+    # 获取用户 ID
+    @classmethod
+    def get_userid(cls, log_type, crawler, username):
+        try:
+            url = "https://open.feishu.cn/open-apis/user/v1/batch_get_id?"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            name_phone_dict = {
+                "xinxin": "15546206651",
+                "muxinyi": "13699208058",
+                "wangxueke": "13513479926",
+                "yuzhuoyi": "18624010360",
+                "luojunhui": "18801281360",
+                "fanjun": "15200827642",
+                "zhangyong": "17600025055"
+            }
+
+            # if username == "wangkun":
+            #     username = "13426262515"
+            # # elif username == "gaonannan":
+            # #     username = "18501180073"
+            # elif username == "xinxin":
+            #     username = "15546206651"
+            # # elif username == "huxinxue":
+            # #     username = "18832292015"
+            # # elif username == "wuchaoyue":
+            # #     username = "15712941385"
+            # elif username == "muxinyi":
+            #     username = '13699208058'
+            # elif username == "wangxueke":
+            #     username = '13513479926'
+            # elif username == "yuzhuoyi":
+            #     username = '18624010360'
+            # elif username == "luojunhui":
+            #     username = '18801281360'
+            username = name_phone_dict.get(username)
+
+            data = {"mobiles": [username]}
+            urllib3.disable_warnings()
+            r = requests.get(url=url, headers=headers, params=data, verify=False, proxies=proxies)
+            open_id = r.json()["data"]["mobile_users"][username][0]["open_id"]
+            # Common.logger(log_type, crawler).info(f"{username}:{open_id}")
+            # print(f"{username}:{open_id}")
+            return open_id
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f"get_userid异常:{e}\n")
+
+    # 飞书机器人
+    @classmethod
+    def bot(cls, log_type, crawler, text):
+        try:
+            url = "https://open.feishu.cn/open-apis/bot/v2/hook/96989577-50e7-4653-9ec2-308fe3f2c5fe"
+            headers = {'Content-Type': 'application/json'}
+            if crawler == "kanyikan":
+                content = "看一看爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == "jixiangxingfu":
+                content = text
+                sheet_url = ""
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangxueke")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            # elif crawler == "weixinzhishu_out":
+            #     content = "微信指数_站外指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=YVuVgQ"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu_inner_sort":
+            #     content = "微信指数_站内短期指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=DrZHpa"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu_inner_long":
+            #     content = "微信指数_站内长期指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=JpgyAv"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+            #
+            # elif crawler == "weixinzhishu" and text == "今日微信指数抓取完毕":
+            #     content = "微信指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "yuzhuoyi")) + "></at>\n"
+            # elif crawler == "weixinzhishu":
+            #     content = "微信指数"
+            #     sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k"
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "muxinyi")) + "></at>\n"
+
+            elif crawler == "xiaoniangao_hour":
+                content = "小年糕_小时级_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=yatRv2"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+            elif crawler == "xiaoniangao_person":
+                content = "小年糕_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=Wu0CeL"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+            elif crawler == "xiaoniangao_play":
+                content = "小年糕_播放量_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=c85k1C"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == 'xigua' and log_type == "recommend":
+                content = '西瓜视频_推荐_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=ZzsClu'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wangxueke")) + "></at>\n"
+            # elif crawler == 'xigua':
+            #     content = '西瓜视频_用户主页_已下载表'
+            #     sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=e075e9'
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            # elif crawler == 'xigua_little_video':
+            #     content = '西瓜视频_小视频_已下载表'
+            #     sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=hDSDnv'
+            #     users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+            #         cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == 'zhihu_hot':
+                content = '知乎_热门_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=8871e3'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == 'zhihu_follow':
+                content = '知乎_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=4MGuux'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == 'haokan_hot':
+                content = '好看_热榜_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=5pWipX'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_channel':
+                content = '好看_频道_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=7f05d8'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_follow':
+                content = '好看_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=kVaSjf'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "music_album":
+                content = "音乐相册爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnT6zvmfsYe1g0iv4pt7855g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ssyy":
+                content = "胜胜影音爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnz1ymxHL1u8WHblfqfys7qe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ggdc":
+                content = "刚刚都传爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnTuJgeZU2bc7VaesAqk3QJx"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "bszf":
+                content = "本山祝福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "jxxf":
+                content = "吉祥幸福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnSx4nafMbLTq7xl7RHBwHBf"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "zmyx":
+                content = "众妙音信爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnbZIxstPeM0xshW07b26sve"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "zhangyong")) + "></at>\n"
+
+            elif crawler == "zhufumao":
+                content = "祝福猫视频爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnXfIJthvkjhI5zlEJq84i6g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "kuaishou_follow":
+                content = "快手_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=fYdA8F"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+            elif crawler == "kuaishou_recommend":
+                content = "快手_推荐榜_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=3cd128"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "ssnnyfq":
+                content = "岁岁年年迎福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnyJmJSJynHDLLbLTkySfvZe?sheet=290bae"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "luojunhui")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "kdjsfq":
+                content = "看到就是福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnEokBkIjOUPAk8vbbPKnXgb?sheet=ad3b6d"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "gzh":
+                content = "公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnexNXnpDLHhARw0QdiwbYuA"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            elif crawler == "gongzhonghao":
+                content = "公众号_信欣_爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcna98M2mX7TbivTj9Sb7WKBN?"
+                users = f"\n<at id={str(cls.get_userid(log_type, crawler, 'fanjun'))}></at> <at id={str(cls.get_userid(log_type, crawler, 'wangxueke'))}></at> <at id={str(cls.get_userid(log_type, crawler, 'luojunhui'))}></at>\n"
+
+            elif crawler == "weiqun":
+                content = "微群爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnoKThNquYRweaylMFVyo9Hc"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "weishi":
+                content = "微视爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "shipinhao_recommend":
+                content = "视频号_推荐_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=c77cf9"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "shipinhao_follow":
+                content = "视频号_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=KsVtLe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "youtube":
+                content = "youtube_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnrLyr1zbYbhhZyqpN7Xrd5f?sheet=GVxlYk"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "zongjiao":
+                content = "宗教公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn73NW0CyoOeF21HWO15KBsb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            else:
+                content = "小年糕爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at>\n"
+
+            data = json.dumps({
+                "msg_type": "interactive",
+                "card": {
+                    "configs": {
+                        "wide_screen_mode": True,
+                        "enable_forward": True
+                    },
+                    "elements": [{
+                        "tag": "div",
+                        "text": {
+                            "content": users + text,
+                            "tag": "lark_md"
+                        }
+                    }, {
+                        "actions": [{
+                            "tag": "button",
+                            "text": {
+                                "content": content,
+                                "tag": "lark_md"
+                            },
+                            "url": sheet_url,
+                            "type": "default",
+                            "value": {}
+                        }],
+                        "tag": "action"
+                    }],
+                    "header": {
+                        "title": {
+                            "content": "📣您有新的信息,请注意查收",
+                            "tag": "plain_text"
+                        }
+                    }
+                }
+            })
+            urllib3.disable_warnings()
+            r = requests.post(url, headers=headers, data=data, verify=False, proxies=proxies)
+            Local.logger(log_type, crawler).info(f'触发机器人消息:{r.status_code}, {text}')
+        except Exception as e:
+            Local.logger(log_type, crawler).error(f"bot异常:{e}\n")
+
+
+if __name__ == "__main__":
+    Feishu.bot('recommend', 'xigua', '测试: 西瓜推荐,登录失效')
+    # print(Feishu.get_userid('bot', 'weixinzhishu', 'wangkun'))
+    # print(Feishu.get_userid('bot', 'weixinzhishu', 'yuzhuoyi'))

+ 20 - 0
application/common/feishu/feishu_data.py

@@ -0,0 +1,20 @@
+from application.common.feishu.feishu_utils import FeishuUtils
+
+
+class FsData:
+
+    def get_title_rule(self):
+        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "BS9uyu")
+        # Skip empty cells and return the first non-empty title rule below the header row
+        for row in summary[1:]:
+            title_rule = row[0]
+            if title_rule:
+                return title_rule
+        return None
+
+
+if __name__ == '__main__':
+    data_rule = FsData()
+    title_rule = data_rule.get_title_rule()
+    print(title_rule)

+ 56 - 0
application/common/feishu/feishu_insert.py

@@ -0,0 +1,56 @@
+"""
+feishu python方法
+"""
+
+import requests
+
+
+def get_app_token():
+    """
+    获取飞书api token
+    :return:
+    """
+    url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+    post_data = {
+        "app_id": "cli_a51114cf8bf8d00c",  # 这里账号密码是发布应用的后台账号及密码
+        "app_secret": "cNoTAqMpsAm7mPBcpCAXFfvOzCNL27fe",
+    }
+    response = requests.request("POST", url=url, data=post_data)
+    tenant_access_token = response.json()["tenant_access_token"]
+    return tenant_access_token
+
+
+class FeishuInsert(object):
+    """
+    feishu Python Object
+    """
+
+    def __init__(self, document_token):
+        self.headers = {"Content-Type": "application/json"}
+        self.document_token = document_token
+
+    def insert_value(self, sheet_id, ranges, values):
+        """
+        在表的某一个sheet的ranges中插入数据,若该地方存在数据,会自动把已有的数据往下移动,再写如数据
+        :param sheet_id: 飞书表的唯一ID
+        :param ranges: 单元格位置的range, 从左上角到右下角, 两边都是闭区间
+        :param values: 二维数组, 用于填充ranges的空格数组
+        """
+        insert_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{}/values_prepend".format(
+            self.document_token)
+        # print(get_app_token())
+        headers = {
+            "Authorization": "Bearer " + get_app_token(),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        body = {
+            "valueRange": {
+                "range": "{}!{}".format(sheet_id, ranges),
+                "values": values
+            }
+        }
+        response = requests.request("POST", url=insert_value_url, headers=headers, json=body)
+        print(response.json())
+
+
+
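
For reference, a minimal usage sketch of the FeishuInsert class defined above; the document token, sheet id, and cell range below are placeholders for illustration, not values from this repository, and the request will only succeed against a real spreadsheet the app can access.

    from application.common.feishu.feishu_insert import FeishuInsert

    # Placeholder document token, sheet id, and range -- replace with real values.
    inserter = FeishuInsert(document_token="shtcnExampleDocToken")
    inserter.insert_value(
        sheet_id="0pgfbh",      # hypothetical sheet id
        ranges="A2:C2",         # the new row is prepended into columns A..C
        values=[["2024-01-01", "example title", "https://example.com/video"]],
    )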

+ 398 - 0
application/common/feishu/feishu_utils.py

@@ -0,0 +1,398 @@
+# -*- coding: utf-8 -*-
+# @Time: 2023/12/26
+"""
+飞书表配置: token 鉴权 / 增删改查 / 机器人报警
+"""
+import json
+import os
+import sys
+import requests
+import urllib3
+from loguru import logger
+
+sys.path.append(os.getcwd())
+
+proxies = {"http": None, "https": None}
+
+
+class FeishuUtils:
+    """
+    编辑飞书云文档
+    """
+    succinct_url = "https://w42nne6hzg.feishu.cn/sheets/"
+    # 飞书路径token
+    @classmethod
+    def spreadsheettoken(cls, crawler):
+        if crawler == "summary":
+            return "KsoMsyP2ghleM9tzBfmcEEXBnXg"
+        else:
+            return crawler
+
+
+
+    # 获取飞书api token
+    @classmethod
+    def get_token(cls):
+        """
+        获取飞书api token
+        :return:
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+        post_data = {"app_id": "cli_a13ad2afa438d00b",  # 这里账号密码是发布应用的后台账号及密码
+                     "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
+        urllib3.disable_warnings()
+        response = requests.post(url=url, data=post_data, proxies=proxies, verify=False)
+        tenant_access_token = response.json()["tenant_access_token"]
+        return tenant_access_token
+
+    # 获取表格元数据
+    @classmethod
+    def get_metainfo(cls, crawler):
+        """
+        获取表格元数据
+        :return:
+        """
+        try:
+            get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/metainfo"
+
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
+                "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_metainfo_url, headers=headers, params=params, proxies=proxies, verify=False)
+            response = json.loads(r.content.decode("utf8"))
+            return response
+        except Exception as e:
+            logger.error("获取表格元数据异常:{}", e)
+
+    # 读取工作表中所有数据
+    @classmethod
+    def get_values_batch(cls, crawler, sheetid):
+        """
+        读取工作表中所有数据
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张表
+        :return: 所有数据
+        """
+
+        get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/values_batch_get"
+        headers = {
+            "Authorization": "Bearer " + cls.get_token(),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        params = {
+            "ranges": sheetid,
+            "valueRenderOption": "ToString",
+            "dateTimeRenderOption": "",
+            "user_id_type": "open_id"
+        }
+        urllib3.disable_warnings()
+        r = requests.get(url=get_values_batch_url, headers=headers, params=params, proxies=proxies, verify=False)
+        response = json.loads(r.content.decode("utf8"))
+        values = response["data"]["valueRanges"][0]["values"]
+        return values
+
+
+    # 工作表,插入行或列
+    @classmethod
+    def insert_columns(cls, crawler, sheetid, majordimension, startindex, endindex):
+        """
+        工作表插入行或列
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param majordimension:行或者列, ROWS、COLUMNS
+        :param startindex:开始位置
+        :param endindex:结束位置
+        """
+        try:
+            insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                 + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
+                    "startIndex": startindex,  # 开始的位置
+                    "endIndex": endindex  # 结束的位置
+                },
+                "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
+            }
+
+            urllib3.disable_warnings()
+            r = requests.post(url=insert_columns_url, headers=headers, json=body, proxies=proxies, verify=False)
+        except Exception as e:
+            logger.error("插入行或列异常:{}", e)
+
+    # 写入数据
+    @classmethod
+    def update_values(cls, crawler, sheetid, ranges, values):
+        """
+        写入数据
+        :param crawler: 哪个爬虫的云文档
+        :param sheetid:哪张工作表
+        :param ranges:单元格范围
+        :param values:写入的具体数据,list
+        """
+        try:
+            update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                + cls.spreadsheettoken(crawler) + "/values_batch_update"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "valueRanges": [
+                    {
+                        "range": sheetid + "!" + ranges,
+                        "values": values
+                    },
+                ],
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=update_values_url, headers=headers, json=body, proxies=proxies, verify=False)
+        except Exception as e:
+            logger.error("写入数据异常:{}", e)
+
+    # 合并单元格
+    @classmethod
+    def merge_cells(cls, crawler, sheetid, ranges):
+        """
+        合并单元格
+        :param crawler: 哪个爬虫
+        :param sheetid:哪张工作表
+        :param ranges:需要合并的单元格范围
+        """
+        try:
+            merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/merge_cells"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+
+            body = {
+                "range": sheetid + "!" + ranges,
+                "mergeType": "MERGE_ROWS"
+            }
+            urllib3.disable_warnings()
+            r = requests.post(url=merge_cells_url, headers=headers, json=body, proxies=proxies, verify=False)
+        except Exception as e:
+            logger.error("合并单元格异常:{}", e)
+
+    # 读取单元格数据
+    @classmethod
+    def get_range_value(cls, crawler, sheetid, cell):
+        """
+        读取单元格内容
+        :param crawler: 哪个爬虫
+        :param sheetid: 哪张工作表
+        :param cell: 哪个单元格
+        :return: 单元格内容
+        """
+        try:
+            get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                "valueRenderOption": "FormattedValue",
+
+                # dateTimeRenderOption=FormattedString 计算并对时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+                "dateTimeRenderOption": "",
+
+                # 返回的用户id类型,可选open_id,union_id
+                "user_id_type": "open_id"
+            }
+            urllib3.disable_warnings()
+            r = requests.get(url=get_range_value_url, headers=headers, params=params, proxies=proxies, verify=False)
+            # logger.error(r.text)
+            return r.json()["data"]["valueRange"]["values"][0]
+        except Exception as e:
+            logger.error("读取单元格数据异常:{}", e)
+    # 获取表内容
+    @classmethod
+    def get_sheet_content(cls, crawler, sheet_id):
+        try:
+            sheet = cls.get_values_batch(crawler, sheet_id)
+            content_list = []
+            for x in sheet:
+                for y in x:
+                    if y is None:
+                        pass
+                    else:
+                        content_list.append(y)
+            return content_list
+        except Exception as e:
+            logger.error(f'get_sheet_content:{e}\n')
+
+    # 删除行或列,可选 ROWS、COLUMNS
+    @classmethod
+    def dimension_range(cls, log_type, crawler, sheetid, major_dimension, startindex, endindex):
+        """
+        删除行或列
+        :param log_type: 日志路径
+        :param crawler: 哪个爬虫
+        :param sheetid:工作表
+        :param major_dimension:默认 ROWS ,可选 ROWS、COLUMNS
+        :param startindex:开始的位置
+        :param endindex:结束的位置
+        :return:
+        """
+        try:
+            dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": major_dimension,
+                    "startIndex": startindex,
+                    "endIndex": endindex
+                }
+            }
+            urllib3.disable_warnings()
+            r = requests.delete(url=dimension_range_url, headers=headers, json=body, proxies=proxies, verify=False)
+        except Exception as e:
+            logger.error("删除视频数据异常:{}", e)
+
+    # 获取用户 ID
+    @classmethod
+    def get_userid(cls, username):
+        try:
+            url = "https://open.feishu.cn/open-apis/user/v1/batch_get_id?"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            name_phone_dict = {
+                "xinxin": "15546206651",
+                "muxinyi": "13699208058",
+                "wangxueke": "13513479926",
+                "yuzhuoyi": "18624010360",
+                "luojunhui": "18801281360",
+                "fanjun": "15200827642",
+                "zhangyong": "17600025055",
+                'liukunyu': "18810931977"
+            }
+            username = name_phone_dict.get(username)
+
+            data = {"mobiles": [username]}
+            urllib3.disable_warnings()
+            r = requests.get(url=url, headers=headers, params=data, verify=False, proxies=proxies)
+            open_id = r.json()["data"]["mobile_users"][username][0]["open_id"]
+
+            return open_id
+        except Exception as e:
+            pass
+            # logger.error(f"get_userid异常:{e}\n")
+
+    # 飞书机器人
+    @classmethod
+    def bot(cls, log_type, crawler, text, mark_name):
+        try:
+
+            headers = {'Content-Type': 'application/json'}
+            if crawler == "机器自动改造消息通知":
+                url = "https://open.feishu.cn/open-apis/bot/v2/hook/e7697dc6-5254-4411-8b59-3cd0742bf703"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/KsoMsyP2ghleM9tzBfmcEEXBnXg?sheet=bc154d"
+                users = f"<at id=" + str(cls.get_userid(log_type)) + f">{mark_name}</at>"
+            elif crawler == "快手关键词搜索":
+                url = "https://open.feishu.cn/open-apis/bot/v2/hook/e7697dc6-5254-4411-8b59-3cd0742bf703"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/KsoMsyP2ghleM9tzBfmcEEXBnXg?sheet=U1gySe"
+                users = "".join([f'<at id="{cls.get_userid(type)}">{name}</at>' for type, name in
+                                 zip(log_type, mark_name)])
+                # users = f"<at id=" + str(cls.get_userid(log_type)) + f">{mark_name}</at>"
+            else:
+                url = "https://open.feishu.cn/open-apis/bot/v2/hook/7928f182-08c1-4c4d-b2f7-82e10c93ca80"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/KsoMsyP2ghleM9tzBfmcEEXBnXg?sheet=bc154d"
+                users = f"<at id=" + str(cls.get_userid(log_type)) + f">{mark_name}</at>"
+            data = json.dumps({
+                "msg_type": "interactive",
+                "card": {
+                    "configs": {
+                        "wide_screen_mode": True,
+                        "enable_forward": True
+                    },
+                    "elements": [{
+                        "tag": "div",
+                        "text": {
+                            "content": users + text,
+                            "tag": "lark_md"
+                        }
+                    }, {
+                        "actions": [{
+                            "tag": "button",
+                            "text": {
+                                "content": "详情,点击~~~~~",
+                                "tag": "lark_md"
+                            },
+                            "url": sheet_url,
+                            "type": "default",
+                            "value": {}
+                        }],
+                        "tag": "action"
+                    }],
+                    "header": {
+                        "title": {
+                            "content": "📣消息提醒",
+                            "tag": "plain_text"
+                        }
+                    }
+                }
+            })
+            urllib3.disable_warnings()
+            r = requests.post(url, headers=headers, data=data, verify=False, proxies=proxies)
+        except Exception as e:
+            logger.error(f"bot异常:{e}\n")
+
+    # 飞书机器人-改造计划完成通知
+    @classmethod
+    def finish_bot(cls, text, url, content):
+        try:
+            headers = {'Content-Type': 'application/json'}
+            data = json.dumps({
+                "msg_type": "interactive",
+                "card": {
+                    "configs": {
+                        "wide_screen_mode": True,
+                        "enable_forward": True
+                    },
+                    "elements": [{
+                        "tag": "div",
+                        "text": {
+                            "content": text,
+                            "tag": "lark_md"
+                        }
+                    }],
+                    "header": {
+                        "title": {
+                            "content": content,
+                            "tag": "plain_text"
+                        }
+                    }
+                }
+            })
+            urllib3.disable_warnings()
+            r = requests.post(url, headers=headers, data=data, verify=False, proxies=proxies)
+        except Exception as e:
+            logger.error(f"bot异常:{e}\n")
+
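A quick usage sketch for the two notification helpers above. The enclosing class name sits outside this excerpt, so `Feishu` below is only an assumption, and the webhook URL passed to `finish_bot` is a placeholder; all message texts are illustrative:

from application.common.feishu.feishu import Feishu  # class name assumed, see the top of feishu.py

# mention one person and post to the "机器自动改造消息通知" bot webhook
Feishu.bot(log_type="wangxueke", crawler="机器自动改造消息通知",
           text="\n今日改造任务已完成,请查收。", mark_name="wangxueke")

# completion notice with a custom webhook and card title (url is a placeholder)
Feishu.finish_bot(text="今日视频改造 120 条,全部成功。",
                  url="https://open.feishu.cn/open-apis/bot/v2/hook/<your-hook-id>",
                  content="📣改造完成通知")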

+ 0 - 0
application/common/ffmpeg/__init__.py


+ 76 - 0
application/common/ffmpeg/ffmpeg_utils.py

@@ -0,0 +1,76 @@
+import requests
+import json
+
+class Ffmpeg:
+
+    def get_oss_link(self, oss_key):
+        url = "http://61.48.133.26:5555/api/v1/oss/get_object_link"
+
+        payload = json.dumps({
+            "oss_object_key": oss_key
+        })
+        headers = {
+            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.request("POST", url, headers=headers, data=payload)
+        response = response.json()
+        data = response['data']
+        return data
+
+    def merge_m3u8(self,url_link):
+        url = "http://101.37.24.17:5555/api/v1/ffmpeg/merge_m3u8"
+
+        data = {
+            "url": url_link,
+            "referer": ""
+        }
+        headers = {
+            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.request("POST", url, headers=headers, json=data, stream=True)
+        for item in response.content.split(b'\r\n\r\n'):
+            try:
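+                # each chunk is assumed to start with a 6-byte "data: " prefix (hence item[6:]) before the JSON payload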
+                item = json.loads(item[6:].decode())
+                if item['event'] == 'message':
+                    continue
+                elif item['event'] == 'ffmpeg code':
+                    code = int(item['data'])
+                    if code != 0:  # ffmpeg处理异常
+                        return
+                elif item['event'] == 'result':
+                    oss_object_key = item['data']['oss_object_key']
+                    if oss_object_key:
+                        oss_url = self.get_oss_link(oss_object_key)
+                        return oss_url
+            except json.decoder.JSONDecodeError:
+                continue
+
+    def webp2_jpg(self,webp2_url):
+        url = "http://101.37.24.17:5555/api/v1/ffmpeg/webp2jpg"
+
+        payload = json.dumps({
+            "url": webp2_url,
+            "referer": ""
+        })
+        headers = {
+            'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoiNGNhMTI4ZGYtYWMzMy00NWQ2LTg3MmEtMDAzOTk4MGVhM2ViIiwibmFtZSI6Inp5IiwiZXhwIjoyMDUwOTI3MjExfQ.k_rvuESjA62RgPDiLniVgJyLJn3Q8C1Y_AGq3CPRuKI',
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.request("POST", url, headers=headers, data=payload)
+        response = response.json()
+        oss_object_key = response['data']['oss_object_key']
+        if oss_object_key:
+            oss_url = self.get_oss_link(oss_object_key)
+            return oss_url
+        else:
+            return None
+
+
+if __name__ == '__main__':
+    ffmpeg = Ffmpeg()
+    print(ffmpeg.get_oss_link("jq_oss/video/20250103135417425230.mp4"))

+ 1 - 0
application/common/gpt/__init__.py

@@ -0,0 +1 @@
+from .gpt4o_mini_help import GPT4oMini

+ 61 - 0
application/common/gpt/gpt4o_mini_help.py

@@ -0,0 +1,61 @@
+import json
+
+import requests
+
+
+class GPT4oMini:
+
+    @classmethod
+    def get_ai_mini_title(cls, title):
+        url = "http://aigc-api.cybertogether.net//aigc/dev/test/gpt"
+        payload = json.dumps({
+            "imageList": [],
+            "model": "gpt-4o-mini-2024-07-18",
+            "prompt": (
+            "针对微信平台视频类小程序场景"
+            "面向人群是中国中老年人,在单聊、群聊场景。为视频生成一个吸引人的标题。每次生成我会提供一个原标题,你通过以下规则生成一个新的标题。"
+            "生成规则:"
+            "a.生成的新标题一定一定不能包含以下任何一个或多个风险词。"
+            "风险词:请注意, 分享, 听听, 看看, 全体, 一定, 所以人, 无数人, 值得一看, 值得一听, 99 %, 震撼, 必, 必看, 必听, 必读, 全场, 听听, 一起听听, 一起, 快看, 看看, 快来, 分享, 转发, 都看看吧, 都来, 注意, 最新, 紧急, 速看, 速转, 刚刚, 事关, 赶紧, 一定要, 千万不要, 震惊, 惊人, 亿万, 无数, 百分之, 自杀, 致死, 全体国民, 全体国人, 央视, 中央, 国务院, 人民日报, 卫生部, 官方, 气象局, 世卫, 联合国, 新闻, 内部, 内幕, 最新, 医生提醒, 爆炸性消息, 九胞胎, 天大的, 连看三遍, 务必看, 终于曝光, 神药, 危害太大, 不要吃了, 大事发生, 无数国人, 再忙也要, 出大事, 关系你我, 正式确认, 好消息, 突然传出, 新规出台, 重要的消息, 重要消息, 即将失传, 打死都, 惊天, 不要再吃, 格外留心, 太危险, 可怕一幕, 身亡, 后果很严重, 寿命长短, 错过别后悔, 必看, 早点知道就好了, 不得不信, 看一次少一次, 无数人, 老美, 新华社, 新规, 最新骗局, 新型骗局, 吃的是这些, 大老虎, 官员财产, 老中医, 预言, 致命, 救命, 保命, 非常难得, 太震撼了, 快来看, 一定要看, 来看看, 所有人都, 头一次见, 新型"
+            "b.新标题字符不小于15个字,同时不超过30个字。"
+            "c.新标题最前面或最后面必须加上emoij符号。如“🔴”、“⭕️”、“🚩”、“🔥”、“💖”"
+            "d.新标题只去掉原标题里的低质词,但语句、语意都和原标题保持不变。"
+            "e.去掉低质词后,根据语意适当加字句,使新标题整句读起来简洁、通顺、有吸引力、并准确反映视频核心内容。但一定不能包含任何一个或多个风险词。"
+
+            "视频的原标题:“哇!好美的一个视频,发给您也看看!”、“晚上好,这也太美啦,发给大家一起欣赏欣赏。”、“____这段话说得真好,一起听听!每句话都很有道快分享给群友看看吧!”、“👈这段话说的真好,值得一听”、“🔴世界顶尖雪雕❗ 太真实了,太美了!忍不住发给你看看!”、“💖《等》说得真好,看看吧...”、“🔴这样的萌娃你们喜欢吗,都看看吧!”、“🔴2025金蛇纳福,这首歌送给全体群友,祝大家财运亨通永不断!”、“🔴元旦青蛇遇双春,这三件事千万别做,都看看吧!”、“💕呵呵太搞笑了!老师和家长的对话!值得一看!绝了!”、“❤️《中国知识大全》太珍贵了!值得我们每个中国人都看看!”、“六岁小女孩一首《爸》全场泪奔”、“🔴酒店招牌菜,菠菜炒鸡蛋的家常做法,快来学学!”、“这个视频,分享给我的老友,祝愿您能幸福安康”"
+
+            "请务必严格遵守上述生成规则,为原标题生成对应的新标题。"
+            f"请分析该标题,标题为:{title},返回新的标题。"
+            ),
+            "responseFormat": {
+                "type": "json_schema",
+                "json_schema": {
+                    "strict": True,
+                    "name": "share_script_result",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "新标题": {
+                                "type": "string",
+                                "description": "生成新的标题"
+                            }
+                        },
+                        "required": ["新标题"],
+                        "additionalProperties": False
+                    }
+                }
+            }
+        })
+        headers = {'Content-Type': 'application/json'}
+        try:
+            response = requests.post(url, headers=headers, data=payload)
+            response_data = response.json()
+
+            data = json.loads(response_data.get('data', '{}'))
+            new_title = data["新标题"]
+            return new_title
+        except Exception as e:
+            return None
+
+if __name__ == '__main__':
+    title = GPT4oMini.get_ai_mini_title("🔴这位美女说的太好了!这就是我们的大中国")
+    print(title)

+ 2 - 0
application/common/log/__init__.py

@@ -0,0 +1,2 @@
+from .local_log import Local
+from .aliyun_log import AliyunLogger

+ 81 - 0
application/common/log/aliyun_log.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+# @Author: 罗俊辉
+# @Time: 2023/12/18
+"""
+公共方法,包含:生成log
+"""
+import json
+from aliyun.log import LogClient, PutLogsRequest, LogItem
+import time
+
+proxies = {"http": None, "https": None}
+
+
+class AliyunLogger(object):
+    """
+    阿里云日志方法
+    """
+    def __init__(self, platform, mode, env="prod"):
+        self.platform = platform
+        self.mode = mode
+        self.env = env
+
+    # 写入阿里云日志
+    def logging(
+            self, code, message, data=None, trace_id=None, account=None
+    ):
+        """
+        写入阿里云日志
+        测试库: https://sls.console.aliyun.com/lognext/project/crawler-log-dev/logsearch/crawler-log-dev
+        正式库: https://sls.console.aliyun.com/lognext/project/crawler-log-prod/logsearch/crawler-log-prod
+        """
+        # 设置阿里云日志服务的访问信息
+        if data is None:
+            data = {}
+        accessKeyId = "LTAIWYUujJAm7CbH"
+        accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+        if self.env == "dev":
+            project = "crawler-log-dev"
+            logstore = "crawler-log-dev"
+            endpoint = "cn-hangzhou.log.aliyuncs.com"
+        else:
+            project = "crawler-log-prod"
+            logstore = "crawler-fetch"
+            endpoint = "cn-hangzhou.log.aliyuncs.com"
+
+        # 创建 LogClient 实例
+        client = LogClient(endpoint, accessKeyId, accessKey)
+        log_group = []
+        log_item = LogItem()
+
+        """
+        生成日志消息体格式,例如
+        crawler:xigua
+        message:不满足抓取规则 
+        mode:search
+        timestamp:1686656143
+        """
+        message = message.replace("\r", " ").replace("\n", " ")
+        contents = [
+            (f"TraceId", str(trace_id)),
+            (f"code", str(code)),
+            (f"platform", str(self.platform)),
+            (f"mode", str(self.mode)),
+            (f"message", str(message)),
+            (f"data", json.dumps(data, ensure_ascii=False) if data else ""),
+            (f"account", str(account)),
+            ("timestamp", str(int(time.time()))),
+        ]
+
+        log_item.set_contents(contents)
+        log_group.append(log_item)
+        # 写入日志
+        request = PutLogsRequest(
+            project=project,
+            logstore=logstore,
+            topic="",
+            source="",
+            logitems=log_group,
+            compress=False,
+        )
+        client.put_logs(request)
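A minimal usage sketch of the logger above; the platform, mode and field values are illustrative:

from application.common.log import AliyunLogger

logger = AliyunLogger(platform="xiaoniangao", mode="recommend", env="prod")
logger.logging(
    code="1002",
    message="成功抓取到一条视频",
    data={"out_video_id": "123456"},
    trace_id="xiaoniangao-recommend-demo-0001",
    account="demo_account",
)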

+ 54 - 0
application/common/log/local_log.py

@@ -0,0 +1,54 @@
+import sys
+from datetime import date, timedelta, datetime
+from loguru import logger
+from pathlib import Path
+
+class Local:
+    # 日期常量
+    now = datetime.now()
+    today = date.today()
+    yesterday = (today - timedelta(days=1)).strftime("%Y-%m-%d")
+    tomorrow = (today + timedelta(days=1)).strftime("%Y-%m-%d")
+
+    @staticmethod
+    def init_logger(platform: str, mode: str = "prod", log_level: str = "INFO", log_to_console: bool = False,
+                    rotation: str = "00:00", retention: str = "10 days"):
+        """
+        初始化日志记录器
+        :param platform: 平台名称,用于区分日志目录
+        :param mode: 运行环境(如 prod/test/dev)
+        :param log_level: 日志级别(如 INFO、DEBUG)
+        :param log_to_console: 是否同时输出到控制台
+        :param rotation: 日志文件切分策略(默认每天 00:00)
+        :param retention: 日志保留时间(默认10天)
+        """
+        # 创建日志目录
+        log_dir = Path(f"./log_store/{platform}")
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        # 设置日志文件名
+        log_filename = f"{platform}-{mode}-{Local.today.strftime('%Y-%m-%d')}.log"
+        log_file_path = log_dir / log_filename
+
+        # 清除默认 handler
+        logger.remove()
+
+        # 添加文件日志 handler
+        logger.add(
+            str(log_file_path),
+            level=log_level.upper(),
+            rotation=rotation,
+            retention=retention,
+            encoding="utf-8",
+            enqueue=True
+        )
+
+        # 可选:输出到控制台
+        if log_to_console:
+            logger.add(
+                sink=sys.stdout,
+                level=log_level.upper(),
+                format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | {message}"
+            )
+
+        return logger
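Typical initialization, as a sketch (parameter values are illustrative):

from application.common.log import Local

logger = Local.init_logger(platform="xiaoniangao", mode="recommend",
                           log_level="INFO", log_to_console=True)
logger.info("crawler task started")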

+ 3 - 0
application/common/messageQueue/__init__.py

@@ -0,0 +1,3 @@
+from .mq import MQ
+from .ack_message import ack_message
+from .consumer import get_consumer

+ 15 - 0
application/common/messageQueue/ack_message.py

@@ -0,0 +1,15 @@
+from mq_http_sdk.mq_exception import MQExceptionBase
+
+from application.common.log import Local
+
+
+def ack_message(mode, platform, recv_msgs, consumer, trace_id=None):
+    """
+    消费成功后确认消息
+    """
+    try:
+        receipt_handle_list = [recv_msgs.receipt_handle]
+        consumer.ack_message(receipt_handle_list)
+        Local.init_logger(platform, mode).info(
+            f"[trace_id={trace_id}] Ack {len(receipt_handle_list)} Message Succeed."
+        )
+
+    except MQExceptionBase as err:
+        Local.init_logger(platform, mode).error(
+            f"[trace_id={trace_id}] Ack Message Fail! Exception:{err}"
+        )

+ 25 - 0
application/common/messageQueue/consumer.py

@@ -0,0 +1,25 @@
+from mq_http_sdk.mq_client import *
+
+
+def get_consumer(topic_name, group_id):
+    # 初始化client。
+    mq_client = MQClient(
+        # 设置HTTP协议客户端接入点,进入云消息队列 RocketMQ 版控制台实例详情页面的接入点区域查看。
+        "http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
+        # AccessKey ID,阿里云身份验证标识。获取方式,请参见创建AccessKey。
+        "LTAI4G7puhXtLyHzHQpD6H7A",
+        # AccessKey Secret,阿里云身份验证密钥。获取方式,请参见创建AccessKey。
+        "nEbq3xWNQd1qLpdy2u71qFweHkZjSG",
+    )
+    # 消息所属的Topic,在云消息队列 RocketMQ 版控制台创建。
+    # topic_name = "${TOPIC}"
+    topic_name = str(topic_name)
+    # 您在云消息队列 RocketMQ 版控制台创建的Group ID。
+    # group_id = "${GROUP_ID}"
+    group_id = str(group_id)
+    # Topic所属的实例ID,在云消息队列 RocketMQ 版控制台创建。
+    # 若实例有命名空间,则实例ID必须传入;若实例无命名空间,则实例ID传入空字符串。实例的命名空间可以在云消息队列 RocketMQ 版控制台的实例详情页面查看。
+    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
+
+    consumer = mq_client.get_consumer(instance_id, topic_name, group_id)
+    return consumer
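A sketch of a consume-and-ack loop built on the consumer above, assuming the Aliyun MQ HTTP SDK's `consume_message(batch_size, wait_seconds)` call; topic and group id values are placeholders. The SDK typically signals an empty poll by raising `MQExceptionBase`, which the loop simply skips:

from mq_http_sdk.mq_exception import MQExceptionBase

from application.common.messageQueue import get_consumer

consumer = get_consumer(topic_name="xiaoniangao_recommend", group_id="GID_xiaoniangao_recommend")
while True:
    try:
        recv_msgs = consumer.consume_message(1, 3)  # pop up to 1 message, long-poll for 3 seconds
    except MQExceptionBase:
        continue  # no message (or transient error): poll again
    for msg in recv_msgs:
        print(msg.message_body)
        consumer.ack_message([msg.receipt_handle])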

+ 51 - 0
application/common/messageQueue/mq.py

@@ -0,0 +1,51 @@
+import json
+from mq_http_sdk.mq_exception import MQExceptionBase
+from mq_http_sdk.mq_producer import TopicMessage
+from mq_http_sdk.mq_client import MQClient
+import traceback
+from application.common.log import Local
+from application.common.log import AliyunLogger
+
+
+class MQ(object):
+    """
+    MQ Class
+    """
+    instance_id = "MQ_INST_1894469520484605_BXhXuzkZ"
+
+    def __init__(self, topic_name) -> None:
+        self.mq_client = MQClient("http://1894469520484605.mqrest.cn-qingdao-public.aliyuncs.com",
+                                  "LTAI4G7puhXtLyHzHQpD6H7A",
+                                  "nEbq3xWNQd1qLpdy2u71qFweHkZjSG")
+        topic_name = topic_name+"_v2"
+        self.producer = self.mq_client.get_producer(self.instance_id, topic_name)
+
+    def send_msg(self, video_dict, max_retries = 3):
+        """
+        发送 mq,并且记录 redis
+        :param video_dict:
+        """
+        strategy = video_dict["strategy"]
+        platform = video_dict["platform"]
+        self.aliyun_log = AliyunLogger(mode=strategy, platform=platform)
+        for retry in range(max_retries):
+            try:
+                msg = TopicMessage(json.dumps(video_dict))
+                message_key = "{}-{}-{}".format(platform, strategy, video_dict['out_video_id'])
+                msg.set_message_key(message_key)
+                re_msg = self.producer.publish_message(msg)
+                Local.init_logger(platform,strategy).info("Publish Message Succeed. MessageID:%s, BodyMD5:%s\n" %
+                                                      (re_msg.message_id, re_msg.message_body_md5))
+                return
+            except MQExceptionBase as e:
+                tb = traceback.format_exc()
+                # 如果是最后一次重试失败,记录日志
+                if retry == max_retries - 1:
+                    Local.init_logger(platform, strategy).error(
+                        f"Publish Message Fail after {max_retries} attempts. Exception: {e}\n{tb}"
+                    )
+                    self.aliyun_log.logging(
+                        code="5005",
+                        message=f"Publish Message Fail after {max_retries} attempts. Exception: {e}",
+                        data= tb
+                    )

+ 1 - 0
application/common/mysql/__init__.py

@@ -0,0 +1 @@
+from .mysql_helper import MysqlHelper

+ 122 - 0
application/common/mysql/mysql_helper.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+# @Author: luojunhui
+# @Time: 2023/12/19
+"""
+数据库连接及操作
+"""
+import redis
+import pymysql
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from application.common.log import Local
+from application.config.mysql_config import env_dict
+
+
+class MysqlHelper(object):
+    """
+    MySQL工具, env默认prod版本
+    """
+    def __init__(self, env="prod", mode='', platform='', action=''):
+        mysql_config = env_dict[env]
+        self.connection = pymysql.connect(
+            host=mysql_config['host'],  # 数据库IP地址,内网地址
+            port=mysql_config['port'],  # 端口号
+            user=mysql_config['user'],  # mysql用户名
+            passwd=mysql_config['passwd'],  # mysql用户登录密码
+            db=mysql_config['db'],  # 数据库名
+            charset=mysql_config['charset']  # 如果数据库里面的文本是utf8编码的,charset指定是utf8
+        )
+        self.mode = mode
+        self.platform = platform
+        self.action = action
+
+    def select(self, sql):
+        """
+        查询
+        :param sql:
+        :return:
+        """
+        cursor = self.connection.cursor()
+        cursor.execute(sql)
+        data = cursor.fetchall()
+        return data
+
+    def select_params(self, sql, params=None):
+        cursor = self.connection.cursor()
+        cursor.execute(sql, params or ())  # 支持参数化查询
+        data = cursor.fetchall()
+        return data
+
+    def update(self, sql):
+        """
+        更新 / 插入(执行写操作并提交,失败时回滚)
+        :param sql:
+        :return:
+        """
+        cursor = self.connection.cursor()
+        try:
+            res = cursor.execute(sql)
+            self.connection.commit()
+            return res
+        except Exception as e:
+            Local.init_logger(self.platform, self.mode).error(f"update_values异常,进行回滚操作:{e}\n")
+            self.connection.rollback()
+
+    def close(self):
+        """
+        关闭连接
+        """
+        self.connection.close()
+
+
+
+class RedisHelper:
+    @classmethod
+    def connect_redis(cls, env):
+        if env == 'hk':
+            redis_pool = redis.ConnectionPool(
+                # host='r-bp154bpw97gptefiqk.redis.rds.aliyuncs.com',  # 内网地址
+                # host='r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com',  # 测试地址
+                host='r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com',  # 外网地址
+                port=6379,
+                db=2,
+                password='Wqsd@2019'
+            )
+            redis_conn = redis.Redis(connection_pool=redis_pool)
+        elif env == 'prod':
+            redis_pool = redis.ConnectionPool(
+                host='r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com',  # 内网地址
+                # host='r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com',  # 外网地址
+                port=6379,
+                db=2,
+                password='Wqsd@2019'
+            )
+            redis_conn = redis.Redis(connection_pool=redis_pool)
+        else:
+            redis_pool = redis.ConnectionPool(
+                # host='r-bp154bpw97gptefiqk.redis.rds.aliyuncs.com',  # 内网地址
+                host='r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com',  # 外网地址
+                port=6379,
+                db=2,
+                password='Qingqu2019'
+            )
+            redis_conn = redis.Redis(connection_pool=redis_pool)
+        return redis_conn
+
+    @classmethod
+    def redis_push(cls, env, task_key, data):
+        redis_conn = cls.connect_redis(env)
+        # print("开始写入数据")
+        redis_conn.lpush(task_key, data)
+        # print("数据写入完成")
+
+    @classmethod
+    def redis_pop(cls, env, task_key):
+        redis_conn = cls.connect_redis(env)
+        if redis_conn.llen(task_key) == 0:
+            return None
+        else:
+            return redis_conn.rpop(task_key)
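A short sketch of the parameterized-query helper above; the `crawler_video` table name comes from other modules in this commit, and the values are illustrative:

from application.common.mysql import MysqlHelper

db = MysqlHelper(env="prod", mode="recommend", platform="xiaoniangao")
sql = "select count(*) from crawler_video where platform = %s and create_time >= CURDATE()"
rows = db.select_params(sql, ("xiaoniangao",))
print(rows[0][0])
db.close()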

+ 57 - 0
application/common/mysql/sql.py

@@ -0,0 +1,57 @@
+from datetime import datetime
+
+from application.common.mysql import MysqlHelper
+
+
+class Sql:
+
+    def update_name_url(self, mid, avatar_url, user_name):
+        """修改用户名 + 头像"""
+        sql = f""" update xng_uid set avatar_url = "{avatar_url}", user_name="{user_name}" where uid = "{mid}"; """
+        db = MysqlHelper()
+        repeat_video = db.update(sql=sql)
+        if repeat_video:
+            return True
+        return False
+
+    def insert_name_url(self, uid, avatar_url, user_name):
+        """插入 用户名 头像 用户id"""
+        current_time = datetime.now()
+        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
+        insert_sql = f"""INSERT INTO xng_uid (uid, avatar_url, user_name, data_time) values ('{uid}' ,'{avatar_url}','{user_name}', '{formatted_time}')"""
+        db = MysqlHelper()
+        repeat_video = db.update(sql=insert_sql)
+        if repeat_video:
+            return True
+        return False
+
+    def select_id(self, uid):
+        """查询用户id是否存在"""
+        sql = f""" select uid from xng_uid where uid = "{uid}"; """
+        db = MysqlHelper()
+        repeat_video = db.select(sql=sql)
+        if repeat_video:
+            return True
+        return False
+
+    def select_id_status(self, uid):
+        """查询用户id是否之前已添加过"""
+        sql = f""" select uid from crawler_user_v3 where link = "{uid}"; """
+        db = MysqlHelper()
+        repeat_video = db.select(sql=sql)
+        if repeat_video:
+            return False
+        return True

+ 2 - 0
application/common/proxies/__init__.py

@@ -0,0 +1,2 @@
+from .fast_proxy import tunnel_proxies
+from .fast_proxy import haiwai_tunnel_proxies

+ 23 - 0
application/common/proxies/fast_proxy.py

@@ -0,0 +1,23 @@
+def tunnel_proxies():
+    # 隧道域名:端口号
+    tunnel = "q796.kdltps.com:15818"
+    # 用户名密码方式
+    username = "t17772369458618"
+    password = "5zqcjkmy"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
+
+
+def haiwai_tunnel_proxies():
+    tunnel = "c101.kdlfps.com:18866"
+    # 用户名密码方式
+    username = "f2801246645"
+    password = "q0i0ohnl"
+    proxies = {
+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+    }
+    return proxies
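A usage sketch: pass the returned mapping straight to `requests` (the target URL is only for demonstration):

import requests

from application.common.proxies import tunnel_proxies

resp = requests.get("https://httpbin.org/ip", proxies=tunnel_proxies(), timeout=10)
print(resp.text)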

+ 0 - 0
application/common/redis/__init__.py


+ 55 - 0
application/common/redis/pyredis.py

@@ -0,0 +1,55 @@
+"""
+Redis client Python
+@author luojunhui
+"""
+import redis
+
+
+class RedisClient(object):
+    """
+    Redis client by python
+    Todo 如果 Redis 服务挂了,怎么做能够不影响业务
+    思路, 每次使用 redis 接口前先判断是否连接成功,如果连接失败则跳过 redis ,不影响全局
+    """
+
+    def __init__(self):
+        self.pool = None
+        # self.host = 'r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com'
+        self.host="r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com",  # 外网地址
+        self.port = 6379
+        self.db = 2
+        self.password = 'Wqsd@2019'
+
+    def connect(self):
+        """
+        connect to redis server
+        :return: bool
+        """
+        try:
+            self.pool = redis.Redis(host=self.host, port=self.port, db=self.db, password=self.password)
+            return True
+        except Exception as e:
+            print("connect to redis fail, the reason is {}".format(e))
+            return False
+
+    def select(self, key):
+        """
+        read info from redis
+        :return:
+        """
+        return self.pool.get(key)
+
+    def insert(self, key, value, expire_time):
+        """
+        insert info from redis
+        :return:
+        """
+        self.pool.set(key, value, expire_time)
+
+    def delete(self, key):
+        """
+        delete key
+        :param key:
+        :return:
+        """
+        self.pool.delete(key)
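A sketch of the connect-before-use pattern suggested by the Todo in the docstring above: if `connect()` fails, skip the Redis logic instead of letting the crawler crash. The key name and expiry below are illustrative:

from application.common.redis.pyredis import RedisClient

client = RedisClient()
if client.connect():
    client.insert("crawler:demo:key", "1", 3600)   # value "1", 1-hour expiry
    print(client.select("crawler:demo:key"))
    client.delete("crawler:demo:key")
else:
    print("redis unavailable, skip dedup logic")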

+ 67 - 0
application/common/redis/redis_helper.py

@@ -0,0 +1,67 @@
+import redis
+from datetime import timedelta
+
+
+class SyncRedisHelper:
+    _pool: redis.ConnectionPool = None
+    _instance = None
+
+    def __init__(self):
+        if not self._instance:
+            self._pool = self._get_pool()
+            self._instance = self
+
+    def _get_pool(self) -> redis.ConnectionPool:
+        if self._pool is None:
+            self._pool = redis.ConnectionPool(
+                host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",  # 内网地址
+                # host="r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com",  # 外网地址
+                port=6379,
+                db=2,
+                password="Wqsd@2019",
+                # password="Qingqu2019",
+
+            )
+        return self._pool
+
+    def get_client(self) -> redis.Redis:
+        pool = self._get_pool()
+        client = redis.Redis(connection_pool=pool)
+        return client
+
+    def close(self):
+        if self._pool:
+            self._pool.disconnect(inuse_connections=True)
+
+
+def store_data(platform, out_video_id, condition, day_time):
+    key = f"crawler:duplicate:{platform}:{out_video_id}"
+    value = 1
+    if condition:
+        timeout = timedelta(days=int(day_time))
+    else:
+        timeout = timedelta(hours=int(day_time))
+    helper = SyncRedisHelper()
+    client = helper.get_client()
+
+    client.set(key, value)
+    client.expire(key, timeout)
+
+
+def get_data(platform, out_video_id):
+    key = f"crawler:duplicate:{platform}:{out_video_id}"
+    helper = SyncRedisHelper()
+    client = helper.get_client()
+    value = client.exists(key)
+    return value
+
+
+# 示例:存储一个数据
+# store_data('xiaoniangao', '123457', True, 60)
+
+# # 示例:获取一个数据
+# value = get_data('xiaoniangao', '1234857')
+# if value is None:
+#     print("Value does not exist")
+# else:
+#     print(f"Retrieved value: {value}")

+ 54 - 0
application/common/redis/xng_redis.py

@@ -0,0 +1,54 @@
+import json
+
+import redis
+
+
+
+class XNGSyncRedisHelper:
+    _pool: redis.ConnectionPool = None
+    _instance = None
+
+    def __init__(self):
+        if not self._instance:
+            self._pool = self._get_pool()
+            self._instance = self
+
+    def _get_pool(self) -> redis.ConnectionPool:
+        if self._pool is None:
+            self._pool = redis.ConnectionPool(
+                # host="r-bp1mb0v08fqi4hjffupd.redis.rds.aliyuncs.com",  # 外网地址
+                host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",  # 内网地址
+                port=6379,
+                db=0,
+                password="Wqsd@2019",
+                # password="Qingqu2019",
+
+            )
+        return self._pool
+
+    def get_client(self) -> redis.Redis:
+        pool = self._get_pool()
+        client = redis.Redis(connection_pool=pool)
+        return client
+
+    def close(self):
+        if self._pool:
+            self._pool.disconnect(inuse_connections=True)
+
+
+
+
+def xng_get_video_data():
+    """获取一条id"""
+    task = f"task:xng_video_id"
+    helper = XNGSyncRedisHelper()
+    client = helper.get_client()
+    ret = client.rpop(task)
+    return ret
+
+def xng_in_video_data(ret):
+    """写入"""
+    task = f"task:xng_video_id"
+    helper = XNGSyncRedisHelper()
+    client = helper.get_client()
+    client.rpush(task, ret)

+ 3 - 0
application/config/__init__.py

@@ -0,0 +1,3 @@
+from .ipconfig import ip_config
+from .mysql_config import env_dict
+from .topic_group_queue import TopicGroup

+ 0 - 0
application/config/aliyun_config.py


+ 7 - 0
application/config/config.py

@@ -0,0 +1,7 @@
+# api 配置
+crawler_api_domain = 'http://8.217.192.46:8889'
+zhufuquanzi_view_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/detail_exposure'
+zhufuquanzi_history_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/detail_history'
+xiaoniangao_view_api = crawler_api_domain + '/crawler/xiao_nian_gao_plus/detail_exposure'
+xiaoniangao_history_api = crawler_api_domain + '/crawler/xiao_nian_gao_plus/detail_history'
+zhufuquanzi_log_upload_api = crawler_api_domain + '/crawler/zhu_fu_quan_zi/log_upload'

+ 30 - 0
application/config/ipconfig.py

@@ -0,0 +1,30 @@
+"""
+ipconfig
+每一个容器和手机需要在同一个局域网,保证容器内appium和手机的网络通畅
+"""
+
+
+def ip_config():
+    ip_dict = {
+        "machine_01": "",
+        "machine_02": "",
+        "machine_03": "",
+        "machine_04": "",
+        "machine_05": "",
+        "machine_06": "",
+        "machine_07": "",
+        "machine_08": "",
+        "machine_09": "",
+        "machine_10": "",
+        "machine_11": "",
+        "machine_12": "",
+        "machine_13": "",
+        "machine_14": "",
+        "machine_15": "",
+        "machine_16": "",
+        "machine_17": "",
+        "machine_18": "",
+        "machine_19": "",
+        "machine_20": ""
+    }
+    return ip_dict

+ 36 - 0
application/config/mysql_config.py

@@ -0,0 +1,36 @@
+"""
+MySQL的配置任务
+"""
+
+
+# 香港服务器, 暂时不写
+mysql_hk = {}
+
+# prod环境服务器地址
+mysql_prod = {
+    "host": "rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址,内网地址
+    # host="rm-bp1159bu17li9hi94ro.mysql.rds.aliyuncs.com",# 数据库IP地址,外网地址
+    "port": 3306,  # 端口号
+    "user":"crawler",  # mysql用户名
+    "passwd": "crawler123456@",  # mysql用户登录密码
+    "db": "piaoquan-crawler",  # 数据库名
+    "charset": "utf8mb4"  # 如果数据库里面的文本是utf8编码的,charset指定是utf8
+}
+# 测试环境Mysql服务器地址
+mysql_dev = {
+    "host": "rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",  # 数据库IP地址,内网地址
+    # host="rm-bp1k5853td1r25g3ndo.mysql.rds.aliyuncs.com",  # 数据库IP地址,外网地址
+    "port": 3306,  # 端口号
+    "user":"crawler",  # mysql用户名
+    "passwd": "crawler123456@",  # mysql用户登录密码
+    "db": "piaoquan-crawler",  # 数据库名
+    "charset": "utf8mb4"  # 如果数据库里面的文本是utf8编码的,charset指定是utf8
+}
+
+env_dict = {
+    "hk": mysql_hk,
+    "prod": mysql_prod,
+    "dev": mysql_dev
+}

+ 24 - 0
application/config/topic_group_queue.py

@@ -0,0 +1,24 @@
+import yaml
+from utils.project_paths import config_dir
+
+class TopicGroup:
+    def __init__(self, config_path=f"{config_dir}/topic_map.yaml"):
+        with open(config_path, "r") as f:
+            data = yaml.safe_load(f)
+            self.topics = data.get("topics", [])  # 直接获取 topic 列表
+
+    def __iter__(self):
+        """支持迭代遍历 topics"""
+        return iter(self.topics)
+
+    def __len__(self):
+        return len(self.topics)
+
+    def __str__(self):
+        return str(self.topics)
+
+
+if __name__ == '__main__':
+    tg = TopicGroup()
+    print(tg)
+

+ 0 - 0
application/etl/__init__.py


+ 134 - 0
application/etl/download.py

@@ -0,0 +1,134 @@
+"""
+下载视频
+"""
+import os
+import json
+import time
+import asyncio
+from hashlib import md5
+import datetime
+
+import httpx
+import requests
+
+
+class VideoDownloader(object):
+    """
+    视频下载功能
+    """
+
+    def __init__(self, video_obj):
+        self.platform = video_obj['platform']
+        self.video_id = video_obj['video_id']
+        self.video_url = video_obj['video_url']
+        self.cover_url = video_obj['cover_url']
+        self.proxy = {
+            "http://": "http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/",
+            "https://": "http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/"
+        }
+        self.max_retry = 5
+
+    def generate_video_path(self):
+        """
+        通过视频信息生成唯一视频地址
+        :return:
+        """
+        index = "{}-{}".format(self.platform, self.video_id)
+        index = md5(index.encode()).hexdigest()
+        temp_dir = "/Users/luojunhui/cyber/automatic_crawler"
+        file_name = "{}.mp4".format(index)
+        date_info = datetime.datetime.today().strftime("%Y%m%d")
+        video_path = os.path.join(temp_dir, date_info, file_name)
+        if os.path.exists(video_path):
+            return
+        else:
+            os.makedirs(os.path.dirname(video_path), exist_ok=True)
+        return video_path
+
+    async def download_video(self):
+        """
+        download video from the web
+        :return:
+        """
+        if self.platform == "fuqiwang":
+            download_path = self.generate_video_path()
+            if download_path:
+                headers = {
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.6(0x13080610) XWEB/1156',
+                    'Accept-Encoding': 'identity;q=1, *;q=0',
+                    'Accept': '*/*',
+                    'Sec-Fetch-Site': 'cross-site',
+                    'Sec-Fetch-Mode': 'no-cors',
+                    'Sec-Fetch-Dest': 'video',
+                    'Referer': 'https://servicewechat.com/wxa1431c6e7acdd32d/2/page-frame.html',
+                    'Accept-Language': 'en-US,en;q=0.9',
+                    'Range': 'bytes=0-',
+                }
+                async with httpx.AsyncClient(http2=True, proxies=self.proxy, headers=headers) as client:
+                    try:
+                        response = await client.get(self.video_url, headers=headers)
+                        if response.status_code == 206:
+                            with open(download_path, "wb") as f:
+                                f.write(response.content)
+                        else:
+                            for _ in range(self.max_retry):
+                                response = await client.get(self.video_url, headers=headers, follow_redirects=True)
+                                if response.status_code == 206:
+                                    with open(download_path, "wb") as f:
+                                        f.write(response.content)
+                                    break
+                    except httpx.HTTPError as e:
+                        print(f"An error occurred while downloading: {e}")
+            else:
+                print("视频已经存在")
+
+    def get_by_request(self):
+        """
+        req
+        :return:
+        """
+        download_path = self.generate_video_path()
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.6(0x13080610) XWEB/1156',
+            'Accept-Encoding': 'identity;q=1, *;q=0',
+            'Accept': '*/*',
+            'Sec-Fetch-Site': 'cross-site',
+            'Sec-Fetch-Mode': 'no-cors',
+            'Sec-Fetch-Dest': 'video',
+            'Referer': 'https://servicewechat.com/wxa1431c6e7acdd32d/2/page-frame.html',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Range': 'bytes=0-',
+        }
+        # self.proxy uses httpx-style "http://"/"https://" keys; requests expects "http"/"https",
+        # so convert the keys here, otherwise the proxy would silently be ignored
+        request_proxies = {scheme.rstrip(":/"): value for scheme, value in self.proxy.items()}
+        r = requests.get(
+            url=self.video_url,
+            headers=headers,
+            proxies=request_proxies
+        )
+        print(r.status_code)
+        with open("test.mp4", "wb") as f:
+            f.write(r.content)
+
+
+async def main(video_obj):
+    """
+    异步执行函数
+    :param video_obj:
+    :return:
+    """
+    downloader = VideoDownloader(video_obj)
+    await downloader.download_video()
+
+
+if __name__ == '__main__':
+    video_o = {
+        "update_time": 1709784300,
+        "platform": "fuqiwang",
+        "video_id": 142599,
+        "title": "🔴3·8妇女节,最美的祝福,送给全天下的女神!",
+        "type": 1,
+        "video_type": 2,
+        "cover_url": "https://znl-video-bos.cdn.bcebos.com/c6f12b49992ef638342065439f55b444/65e93632/picture/20240306/b8b0c1cc262c2394f111650c9f82e35a_thumb.jpg",
+        "video_url": "https://znl-video-bos.cdn.bcebos.com/e368801a814c548e443835086d37caaf/65e93632/video/20240306/820ee1498e3ed2a59d37aed54d39ae95_1.mp4",
+    }
+    VideoDownloader(video_obj=video_o).get_by_request()
+    # asyncio.run(main(video_obj=video_o))

+ 3 - 0
application/functions/__init__.py

@@ -0,0 +1,3 @@
+from .get_redirect_url import get_redirect_url
+from .clean_title import clean_title
+from .read_mysql_config import get_config_from_mysql

+ 26 - 0
application/functions/appium_tools.py

@@ -0,0 +1,26 @@
+"""
+Appium 的一些公共方法
+"""
+import time
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import NoSuchElementException
+
+
+def search_elements(driver, xpath):
+    """
+    获取元素
+    :param driver:
+    :param xpath:
+    :return:
+    """
+    time.sleep(1)
+    windowHandles = driver.window_handles
+    for handle in windowHandles:
+        driver.switch_to.window(handle)
+        time.sleep(1)
+        try:
+            elements = driver.find_elements(By.XPATH, xpath)
+            if elements:
+                return elements
+        except NoSuchElementException:
+            pass

+ 22 - 0
application/functions/clean_title.py

@@ -0,0 +1,22 @@
+def clean_title(strings):
+    return (
+        strings.strip()
+        .replace("\n", "")
+        .replace("/", "")
+        .replace("\r", "")
+        .replace("#", "")
+        .replace(".", "。")
+        .replace("\\", "")
+        .replace("&NBSP", "")
+        .replace(":", "")
+        .replace("*", "")
+        .replace("?", "")
+        .replace("?", "")
+        .replace('"', "")
+        .replace("<", "")
+        .replace(">", "")
+        .replace("|", "")
+        .replace(" ", "")
+        .replace('"', "")
+        .replace("'", "")
+    )

+ 3 - 0
application/functions/crypt.py

@@ -0,0 +1,3 @@
+"""
+爬虫逆向加密算法
+"""

+ 9 - 0
application/functions/get_redirect_url.py

@@ -0,0 +1,9 @@
+import requests
+
+
+def get_redirect_url(url):
+    res = requests.get(url, allow_redirects=False)
+    if res.status_code == 302 or res.status_code == 301:
+        return res.headers['Location']
+    else:
+        return url

+ 46 - 0
application/functions/read_mysql_config.py

@@ -0,0 +1,46 @@
+import json
+
+from application.common.mysql import MysqlHelper
+
+
+def get_config_from_mysql(log_type, source, text):
+    """
+    :param log_type: mode
+    :param source: platform
+    :param text:
+    :return:
+    """
+    select_sql = f"""select config from crawler_config where source="{source}" """
+    MySQL = MysqlHelper(mode=log_type, platform=source)
+    configs = MySQL.select(select_sql)
+    title_list = []
+    filter_list = []
+    emoji_list = []
+    search_word_list = []
+    for config in configs:
+        config_dict = json.loads(config[0])
+        for k, v in config_dict.items():
+            if k == "title":
+                title_list_config = v.split(",")
+                for title in title_list_config:
+                    title_list.append(title)
+            if k == "filter":
+                filter_list_config = v.split(",")
+                for filter_word in filter_list_config:
+                    filter_list.append(filter_word)
+            if k == "emoji":
+                emoji_list_config = v.split(",")
+                for emoji in emoji_list_config:
+                    emoji_list.append(emoji)
+            if k == "search_word":
+                search_word_list_config = v.split(",")
+                for search_word in search_word_list_config:
+                    search_word_list.append(search_word)
+    if text == "title":
+        return title_list
+    elif text == "filter":
+        return filter_list
+    elif text == "emoji":
+        return emoji_list
+    elif text == "search_word":
+        return search_word_list
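A usage sketch; the `source` value is illustrative and must match a row in `crawler_config`:

from application.functions import get_config_from_mysql

filter_words = get_config_from_mysql(log_type="recommend", source="xiaoniangao", text="filter")
print(filter_words)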

+ 240 - 0
application/functions/zqkd_db_redis.py

@@ -0,0 +1,240 @@
+import os
+import sys
+import threading
+import traceback
+from datetime import datetime, timedelta
+
+import redis
+
+from application.common import Local
+
+sys.path.append(os.getcwd())
+
+from application.common.mysql import MysqlHelper
+
+
+class DatabaseOperations:
+    def __init__(self, mode, platform):
+        self.mysql = MysqlHelper(mode=mode, platform=platform)
+        self.LocalLog = Local.init_logger(platform, mode)
+        self.mode = mode
+        self.platform = platform
+
+    def check_user_id(self, uid):
+        """
+        检查指定用户ID是否存在于数据库的 zqkd_user 表中。
+
+        :param uid:要检查的用户ID
+        :return:如果用户ID存在于表中返回True,否则返回False
+        """
+        try:
+            query_sql = f""" SELECT uid FROM zqkd_user WHERE uid = "{uid}"; """
+            result = self.mysql.select(sql=query_sql)
+            return bool(result)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"检查用户ID失败: {e}\n{tb}")
+            return False
+
+    def update_user(self, uid, user_name, avatar_url):
+        """
+        更新数据库中指定用户的用户名和头像URL。
+
+        :param uid:要更新信息的用户ID
+        :param user_name:新的用户名
+        :param avatar_url:新的头像URL
+        :return:如果更新操作成功,返回更新操作的结果(通常是影响的行数),失败则返回None或抛出异常
+        """
+        try:
+            update_sql = f""" UPDATE zqkd_user SET avatar_url = "{avatar_url}", user_name = "{user_name}" WHERE uid = "{uid}"; """
+            return self.mysql.update(sql=update_sql)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"更新用户信息失败: {e}\n{tb}")
+            return None
+
+    def insert_user(self, uid, user_name, avatar_url):
+        """
+        向数据库的zqkd_user表中插入或更新用户信息
+
+        :param uid: 用户ID(数值类型)
+        :param user_name: 用户名
+        :param avatar_url: 头像URL
+        :return: 成功返回影响的行数,失败返回None
+        """
+        try:
+            # 直接拼接SQL(不推荐,有SQL注入风险)
+            insert_sql = f"""
+                INSERT INTO zqkd_user (uid, avatar_url, user_name) 
+                VALUES ({uid}, '{avatar_url.replace("'", "''")}', '{user_name.replace("'", "''")}') 
+                ON DUPLICATE KEY UPDATE 
+                user_name = '{user_name.replace("'", "''")}', 
+                avatar_url = '{avatar_url.replace("'", "''")}'
+            """
+            return self.mysql.update(sql=insert_sql)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"插入用户信息失败: {e}\n{tb}")
+            return None
+
+    def get_today_videos(self):
+        """统计当前平台、当前策略当天入库的视频数量"""
+        try:
+            sql = """
+                        SELECT count(*) as cnt
+                        FROM crawler_video 
+                        WHERE create_time >= CURDATE() 
+                          AND create_time < CURDATE() + INTERVAL 1 DAY 
+                          AND platform = %s 
+                          AND strategy = %s
+                    """
+            result = self.mysql.select_params(sql, (self.platform,self.mode))
+            if result and len(result) > 0:
+                return result[0][0]  # 返回第一行第一列的计数值
+            return 0  # 无结果时返回0
+        except Exception as e:
+            self.LocalLog.error(f"查询失败: {e}")
+            return 0
+
+    def select_user(self, last_scanned_id=0):
+        """
+        根据last_scanned_id查询用户数据
+        :param last_scanned_id: 上次扫描的ID,0表示从头开始
+        :return: 查询结果列表
+        """
+        try:
+            # 构建查询(根据last_scanned_id过滤)
+            query = "SELECT id, uid FROM zqkd_user"
+            if last_scanned_id > 0:
+                query += f" WHERE id > {last_scanned_id}"
+            query += " ORDER BY id ASC"
+
+            return self.mysql.select(query)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"查询用户列表失败: {e}\n{tb}")
+            return []
+
+
+class RedisOperations:
+    _pool: redis.ConnectionPool = None
+    _instance = None
+    _lock = threading.Lock()  # 用于线程安全的单例创建
+
+    @classmethod
+    def get_instance(cls, mode="", platform=""):
+        """线程安全的单例获取方法"""
+        if not cls._instance:
+            with cls._lock:
+                if not cls._instance:
+                    cls._instance = cls(mode, platform)
+        return cls._instance
+
+    def __init__(self, mode, platform):
+        # 私有构造函数,使用 get_instance() 获取实例
+        self.mode = mode
+        self.platform = platform
+        self.LocalLog = Local.init_logger(self.platform, self.mode)
+        if RedisOperations._instance is not None:
+            raise Exception("请使用 get_instance() 获取实例")
+
+        self._pool = self._get_pool()
+        self.client = redis.Redis(connection_pool=self._pool, decode_responses=True)  # 复用同一个客户端
+
+    def _get_pool(self) -> redis.ConnectionPool:
+        if self._pool is None:
+            try:
+                self._pool = redis.ConnectionPool(
+                    host="r-bp1mb0v08fqi4hjffu.redis.rds.aliyuncs.com",
+                    port=6379,
+                    db=0,
+                    password="Wqsd@2019",
+                    max_connections=50,  # 增加最大连接数
+                    socket_timeout=10,
+                    retry_on_timeout=True
+                )
+            except Exception as e:
+                tb = traceback.format_exc()
+                self.LocalLog.error(f"创建Redis连接池失败: {e}\n{tb}")
+                raise
+        return self._pool
+
+    def close(self):
+        """关闭连接池"""
+        try:
+            if self._pool:
+                self._pool.disconnect(inuse_connections=True)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"关闭Redis连接池失败: {e}\n{tb}")
+
+    def get_recommend_video(self, task="task:zqkd_video_id"):
+        """从Redis的指定列表中弹出并返回最左边的视频ID"""
+        try:
+            value_bytes = self.client.rpop(task)
+            value_str = value_bytes.decode('utf-8')
+            return value_str
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"获取推荐视频ID失败: {e}\n{tb}")
+            return None
+
+    def check_video_id_exists(self, videoID):
+        """检查指定的视频ID是否已经存在于Redis中"""
+        try:
+            key = f"crawler:zqkd:{videoID}"
+            return self.client.exists(key)
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"检查视频ID是否存在失败: {e}\n{tb}")
+            return False
+
+    def save_video_id(self, videoID):
+        """将视频ID存储到Redis中,并为其设置3天的过期时间"""
+        try:
+            key = f"crawler:zqkd:{videoID}"
+            expiration_time = int(timedelta(days=3).total_seconds())
+            self.client.setex(key, expiration_time, "1")
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"保存视频ID失败: {e}\n{tb}")
+
+    def save_recommend_video(self, videoID):
+        """将推荐视频ID添加到Redis的指定列表中,并为该列表设置2天的过期时间"""
+        try:
+            task = "task:zqkd_video_id"
+            pipe = self.client.pipeline()  # 使用管道执行多个命令
+            pipe.rpush(task, videoID)
+            pipe.expire(task, int(timedelta(days=2).total_seconds()))
+            pipe.execute()
+
+            # 检查数据是否写入成功
+            list_length = self.client.llen(task)
+            self.LocalLog.info(f"保存推荐视频ID成功,列表长度: {list_length}")
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"保存推荐视频ID失败: {e}\n{tb}")
+
+    def get_last_scanned_id(self):
+        """获取上次扫描的ID"""
+        try:
+            return self.client.get("zqkd_last_scanned_id").decode('utf-8')
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"获取上次扫描的ID失败: {e}\n{tb}")
+            return None
+
+    def set_last_scanned_id(self, last_scanned_id):
+        """设置上次扫描的ID"""
+        try:
+            result = self.client.set("zqkd_last_scanned_id", last_scanned_id)
+            if result:
+                self.LocalLog.info(f"成功设置上次扫描的ID: {last_scanned_id}")
+        except Exception as e:
+            tb = traceback.format_exc()
+            self.LocalLog.error(f"设置上次扫描的ID失败: {e}\n{tb}")
+            return False
+
+
+if __name__ == '__main__':
+    db = DatabaseOperations("author", "zhongqingkandianauthor")
+    print(db.get_today_videos())

+ 1 - 0
application/items/__init__.py

@@ -0,0 +1 @@
+from .item import VideoItem

+ 94 - 0
application/items/item.py

@@ -0,0 +1,94 @@
+import time
+from application.functions import clean_title
+
+
+class VideoItem(object):
+    """
+    function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
+    __init__: 初始化空json 对象,用来存储视频信息
+    add_video_info: 把视频信息存储到 item 对象中
+    check_item: 检查 item 对象中的各个元素以及处理
+    """
+
+    def __init__(self):
+        self.item = {}
+
+    def add_video_info(self, key, value):
+        self.item[key] = value
+
+    def check_item(self):
+        """
+        判断item 里面的字段,是否符合要求
+        字段分为 3 类:
+        1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
+        2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
+        3. 需要后处理的字段: video_title, publish_time
+        """
+        if self.item.get("video_title"):
+            self.item["video_title"] = clean_title(self.item["video_title"])
+        else:
+            return False
+        if self.item.get("publish_time_stamp"):
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
+            )
+            self.add_video_info("publish_time_str", publish_time_str)
+        else:
+            publish_time_stamp = int(time.time())
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
+            )
+            self.add_video_info("publish_time_stamp", publish_time_stamp)
+            self.add_video_info("publish_time_str", publish_time_str)
+        self.add_video_info("publish_time", publish_time_str)
+        if not self.item.get("update_time_stamp"):
+            self.add_video_info("update_time_stamp", int(time.time()))
+
+        # 如果不存在,默认值为 0
+        config_keys = [
+            "duration",
+            "play_cnt",
+            "like_cnt",
+            "comment_cnt",
+            "share_cnt",
+            "width",
+            "height",
+        ]
+        for config_key in config_keys:
+            if self.item.get(config_key):
+                continue
+            else:
+                self.add_video_info(config_key, 0)
+
+        # 必须存在的元素,若不存在则会报错
+        must_keys = [
+            "video_id",
+            "user_id",
+            "user_name",
+            "out_video_id",
+            "session",
+            "video_url",
+            "cover_url",
+            "platform",
+            "strategy",
+        ]
+        """
+        video_id, out_video_id 均为站外视频 id
+        user_id: 站内用户 id
+        out_user_id: 站外用户 id
+        user_name: 站外用户名称
+        """
+        for m_key in must_keys:
+            if self.item.get(m_key):
+                continue
+            else:
+                # print(m_key)
+                return False
+        return True
+
+    def produce_item(self):
+        flag = self.check_item()
+        if flag:
+            return self.item
+        else:
+            return False
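A sketch of how a spider would assemble an item with the mandatory fields listed in `check_item`; all values are illustrative. `produce_item()` returns the dict on success and `False` when a required field is missing:

from application.items import VideoItem

item = VideoItem()
item.add_video_info("video_title", "示例标题")
item.add_video_info("video_id", "123456")
item.add_video_info("out_video_id", "123456")
item.add_video_info("user_id", "660001")
item.add_video_info("user_name", "demo_user")
item.add_video_info("session", "recommend-1700000000")
item.add_video_info("video_url", "https://example.com/demo.mp4")
item.add_video_info("cover_url", "https://example.com/demo.jpg")
item.add_video_info("platform", "xiaoniangao")
item.add_video_info("strategy", "recommend")
video_dict = item.produce_item()
print(video_dict if video_dict else "missing mandatory fields")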

+ 2 - 0
application/pipeline/__init__.py

@@ -0,0 +1,2 @@
+from .pipeline_dev import PiaoQuanPipelineTest
+from .pipeline import PiaoQuanPipeline

+ 273 - 0
application/pipeline/pipeline.py

@@ -0,0 +1,273 @@
+import hashlib
+import re
+import sys
+import os
+import time
+
+from application.common.feishu.feishu_utils import FeishuUtils
+
+sys.path.append(os.getcwd())
+from datetime import datetime
+
+from application.common import MysqlHelper, AliyunLogger
+from application.common.redis.pyredis import RedisClient
+
+
+class PiaoQuanPipeline(object):
+    """
+    爬虫管道——爬虫规则判断
+    """
+
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id, account=None):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+        self.mysql = MysqlHelper(env=env, mode=mode, platform=platform)
+        self.aliyun_log = AliyunLogger(platform=platform, mode=mode, env=env)
+        self.account = account
+        self.red = RedisClient()
+
+    def feishu_time_list(self):
+        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "RuLK77")
+        for row in summary[1:]:
+            channel = row[0]
+            day_count = row[1]
+            if channel:
+                if channel == self.platform:
+                    return day_count
+            else:
+                return None
+        return None
+
+    def publish_time_flag(self):
+        """
+        判断发布时间是否过期
+        :return: True or False
+        """
+        # 判断发布时间
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        max_d = self.rule_dict.get("period", {}).get("max", 1000)
+        min_d = self.rule_dict.get("period", {}).get("min", 1000)
+        days = max_d if max_d > min_d else min_d
+        days_time = self.feishu_time_list()
+        if days_time:
+            days = int(days_time)
+        if self.platform == "gongzhonghao":
+            if (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24 * days
+            ) and (
+                    int(time.time()) - update_time_stamp
+                    > 3600 * 24 * days
+            ):
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message="发布时间超过{}天".format(days),
+                )
+                return False
+        else:
+            if days == 0:
+                # days 为 0 时只允许当天发布的视频通过
+                is_today = datetime.fromtimestamp(publish_time_stamp).date() == datetime.today().date()
+                if not is_today:
+                    return False
+
+            elif (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24 * days
+            ):
+                self.aliyun_log.logging(
+                    code="2004",
+                    trace_id=self.trace_id,
+                    data=self.item,
+                    message="发布时间超过{}天".format(days),
+                )
+                return False
+        return True
+
+    def title_flag(self):
+        """
+        视频标题是否满足需求
+        :return:
+        """
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        # 敏感词
+        # 获取敏感词列表
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            self.aliyun_log.logging(
+                code="2003",
+                trace_id=self.trace_id,
+                message="标题中包含敏感词",
+                data=self.item,
+                account=self.account
+            )
+            return False
+        return True
+
+    def download_rule_flag(self):
+        """
+        视频基础下载规则
+        :return:
+        """
+        for key in self.item:
+            if self.rule_dict.get(key):
+                max_value = (
+                    int(self.rule_dict[key]["max"])
+                    if int(self.rule_dict[key]["max"]) > 0
+                    else 999999999999999
+                )
+                if key == "peroid":  # peroid是抓取周期天数
+                    continue
+                else:
+                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
+                    if not flag:
+                        self.aliyun_log.logging(
+                            code="2004",
+                            trace_id=self.trace_id,
+                            data=self.item,
+                            message="{}: {} <= {} <= {}, {}".format(
+                                key,
+                                self.rule_dict[key]["min"],
+                                self.item[key],
+                                max_value,
+                                flag,
+                            ),
+                            account=self.account
+                        )
+                        return flag
+            else:
+                continue
+        return True
+
+    def feishu_list(self):
+        summary = FeishuUtils.get_values_batch("KsoMsyP2ghleM9tzBfmcEEXBnXg", "letS93")
+        for row in summary[1:]:
+            channel = row[0]
+            day_count = row[1]
+            if channel:
+                if channel == self.platform:
+                    return day_count
+            else:
+                return None
+        return None
+
+    # 按照某个具体平台来去重
+    def repeat_video(self):
+        """
+        视频是否重复
+        :return:
+        """
+        out_id = self.item["out_video_id"]
+        day_count = self.feishu_list()
+        if day_count:
+            sql_2 = f"""select create_time from crawler_video where platform = "{self.platform}" and  out_video_id="{out_id}" AND create_time >= DATE_SUB(NOW(), INTERVAL {int(day_count)} DAY);"""
+            repeat_video = self.mysql.select(sql=sql_2)
+            if repeat_video:
+                self.aliyun_log.logging(
+                    code="2002",
+                    trace_id=self.trace_id,
+                    message="重复的视频",
+                    data=self.item,
+                    account=self.account
+                )
+                return False
+            else:
+                return True
+
+        if self.platform == "zhufuniannianshunxinjixiang" or  self.platform == "weiquanshipin" or  self.platform == "piaoquangushi" or  self.platform == "lepaoledong" or  self.platform == "zhufukuaizhuan" or self.platform == "linglingkuailezhufu" or self.platform == "lepaoledongdijie":
+            return True
+        if self.platform == "jierizhufuhuakaifugui" or self.platform == "yuannifuqimanman" or self.platform == "haoyunzhufuduo" or self.platform == "quzhuan" or self.platform == "zhufudewenhou" or self.platform == "jierizhufuxingfujixiang" or self.platform == "haoyoushipin" or self.platform == "xinshiquan" or self.platform == "laonianshenghuokuaile" or self.platform == "laonianquan":
+            return True
+        if self.platform == "zhuwanwufusunew" and self.mode == "recommend":
+            return True
+        if self.platform == "jixiangxingfu" and self.mode == "recommend":
+            return True
+        if self.platform == "yuannifuqichangzai" and self.mode == "recommend":
+            return True
+        if self.platform == "benshanzhufu" and self.mode == "recommend":
+            return True
+        if self.platform == "zuihaodesongni" and self.mode == "recommend":
+            return True
+        if self.platform == "tiantianjufuqi" and self.mode == "recommend":
+            return True
+        # 判断加上标题去重
+        if self.mode == "recommend" and self.platform == "zhufuhaoyunbaofu":
+            title = self.item["video_title"]
+            sql = f""" select 1 from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}" and video_title="{title}"; """
+        else:
+            sql = f""" select 1 from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
+        repeat_video = self.mysql.select(sql=sql)
+        if repeat_video:
+            # 喜事多多平台 4 天去重一次
+            if self.platform == "xishiduoduo":
+                sql_2 = f"""select create_time from crawler_video where out_video_id="{out_id}";"""
+                video_time = self.mysql.select(sql=sql_2)[0][0].timestamp()
+                if int(time.time()) - video_time >= 86400 * 4:
+                    return True
+            # 小年糕推荐流和祝福圈子推荐流 3 天去重一次
+            elif self.platform == "xiaoniangaotuijianliu" or self.platform == "zhufuquanzituijianliu":
+                sql_2 = f"""select create_time from crawler_video where out_video_id="{out_id}";"""
+                video_time = self.mysql.select(sql=sql_2)[0][0].timestamp()
+                if int(time.time()) - video_time >= 86400 * 3:
+                    return True
+            self.aliyun_log.logging(
+                code="2002",
+                trace_id=self.trace_id,
+                message="重复的视频",
+                data=self.item,
+                account=self.account
+            )
+            return False
+        return True
+
+    # def mq_exists(self):
+    #     """
+    #     检测 mq 是否已经发送过了
+    #     :return:
+    #     """
+    #     if self.red.connect():
+    #         index_txt = "{}-{}".format(self.platform, self.item['video_id'])
+    #         index_md5 = hashlib.md5(index_txt.encode()).hexdigest()
+    #         if self.red.select(index_md5):
+    #             self.aliyun_log.logging(
+    #                 code="2007",
+    #                 trace_id=self.trace_id,
+    #                 message="该视频 mq 已经发送"
+    #             )
+    #             return False
+    #         else:
+    #             self.red.insert(index_md5, int(time.time()), 43200)
+    #             return True
+    #     else:
+    #         return True
+
+    def process_item(self):
+        """
+        全规则判断,符合规则的数据则return True
+        :return:
+        """
+        # 判断该 mq 是否已经发了
+        # if not self.mq_exists():
+        #     return False
+        if not self.publish_time_flag():
+            # 记录相关日志
+            return False
+        if not self.title_flag():
+            # 记录相关日志
+            return False
+        if not self.repeat_video():
+            # 记录相关日志
+            return False
+        if not self.download_rule_flag():
+            # 记录相关日志
+            return False
+        return True
+

+ 112 - 0
application/pipeline/pipeline_dev.py

@@ -0,0 +1,112 @@
+import re
+import time
+
+
+class PiaoQuanPipelineTest:
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+
+    # 视频的发布时间限制, 属于是规则过滤
+    def publish_time_flag(self):
+        # 判断发布时间
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        if self.platform == "gongzhonghao":
+            if (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ) and (
+                    int(time.time()) - update_time_stamp
+                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "发布时间超过{}天".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        else:
+            if (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "发布时间超过{}天".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        return True
+
+    # 视频标题是否满足需求
+    def title_flag(self):
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        # 敏感词
+        # 获取敏感词列表
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            message = "标题中包含敏感词"
+            print(message)
+            return False
+        return True
+
+    # 视频基础下载规则
+    def download_rule_flag(self):
+        for key in self.item:
+            if self.rule_dict.get(key):
+                max_value = (
+                    int(self.rule_dict[key]["max"])
+                    if int(self.rule_dict[key]["max"]) > 0
+                    else 999999999999999
+                )
+                if key == "peroid":  # peroid是抓取周期天数
+                    continue
+                else:
+                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
+                    if not flag:
+                        message = "{}: {} <= {} <= {}, {}".format(
+                            key,
+                            self.rule_dict[key]["min"],
+                            self.item[key],
+                            max_value,
+                            flag,
+                        )
+                        print(message)
+                        return flag
+            else:
+                continue
+        return True
+
+    # 按照某个具体平台来去重
+    def repeat_video(self):
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # out_id = self.item["out_video_id"]
+        # sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
+        # repeat_video = MysqlHelper.get_values(
+        #     log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
+        # )
+        # if repeat_video:
+        #     message = "重复的视频"
+        #     print(message)
+        #     return False
+        return True
+
+    def process_item(self):
+        if not self.publish_time_flag():
+            # 记录相关日志
+            return False
+        if not self.title_flag():
+            # 记录相关日志
+            return False
+        if not self.repeat_video():
+            # 记录相关日志
+            return False
+        if not self.download_rule_flag():
+            # 记录相关日志
+            return False
+        return True
+

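A minimal usage sketch of PiaoQuanPipelineTest above; the item fields and rule values below are made up for illustration:

    import time
    from application.pipeline.pipeline_dev import PiaoQuanPipelineTest

    item = {
        "publish_time_stamp": int(time.time()),
        "update_time_stamp": int(time.time()),
        "video_title": "示例标题",
        "play_cnt": 1000,
        "duration": 60,
    }
    rule_dict = {"duration": {"min": 30, "max": 1200}, "play_cnt": {"min": 500, "max": 0}}
    pipeline = PiaoQuanPipelineTest(
        platform="benshanzhufu", mode="recommend", rule_dict=rule_dict,
        env="dev", item=item, trace_id="benshanzhufu-test-1",
    )
    print(pipeline.process_item())  # True when every rule passes
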
+ 0 - 0
configs/__init__.py


+ 61 - 0
configs/codes.py

@@ -0,0 +1,61 @@
+# crawler_status/codes.py
+
+# 成功
+SUCCESS = "1000"
+
+# 参数配置错误
+CONFIG_MISSING = "2000"
+PARAM_REQUIRED = "2001"
+UNSUPPORTED_TYPE = "2002"
+URL_JOIN_FAILED = "2003"
+CUSTOM_CLASS_IMPORT_FAILED = "2004"
+CONFIG_LOAD_FAILED = "2005"
+
+# 抓取错误
+FETCH_EXCEPTION = "3000"
+FETCH_EMPTY = "3001"
+HTTP_ERROR = "3002"
+TIMEOUT = "3003"
+INVALID_FORMAT = "3004"
+BLOCKED = "3005"
+REDIRECT_ERROR = "3006"
+
+# 解析处理
+JSONPATH_FAIL = "3100"
+XPATH_FAIL = "3101"
+FIELD_MAP_ERROR = "3102"
+PARSE_EMPTY = "3103"
+FORMAT_INVALID = "3104"
+
+# 清洗转化
+CLEAN_MISMATCH = "3200"
+TRANSFORM_FAIL = "3201"
+MISSING_REQUIRED_FIELD = "3202"
+
+# 数据写入
+DB_WRITE_FAIL = "4000"
+DB_DUPLICATE = "4001"
+DB_CONN_FAIL = "4002"
+FILE_WRITE_FAIL = "4003"
+
+# ETL
+ETL_IMPORT_FAIL = "4100"
+ETL_RUN_FAIL = "4101"
+ETL_UNKNOWN_ERROR = "4102"
+
+# 系统
+UNKNOWN_ERROR = "5000"
+IMPORT_ERROR = "5001"
+DYNAMIC_LOAD_ERROR = "5002"
+FILE_NOT_FOUND = "5003"
+
+# 业务
+DATA_EXISTS = "6000"
+NO_UPDATE = "6001"
+FILTERED = "6002"
+
+# 重试
+RETRY = "7000"
+RETRY_MAX = "7001"
+
+

+ 1 - 0
configs/config.py

@@ -0,0 +1 @@
+base_url="http://8.217.192.46:8889"

+ 52 - 0
configs/messages.py

@@ -0,0 +1,52 @@
+# crawler_status/messages.py
+
+from .codes import *
+
+MESSAGES = {
+    SUCCESS: "成功",
+    CONFIG_MISSING: "配置缺失或无效",
+    PARAM_REQUIRED: "缺少必要参数",
+    UNSUPPORTED_TYPE: "不支持的爬虫类型",
+    URL_JOIN_FAILED: "URL 拼接失败",
+    CUSTOM_CLASS_IMPORT_FAILED: "自定义类加载失败",
+    CONFIG_LOAD_FAILED: "配置文件读取失败",
+
+    FETCH_EXCEPTION: "抓取单条视频失败,请求异常",
+    FETCH_EMPTY: "抓取返回空数据",
+    HTTP_ERROR: "HTTP 状态码异常",
+    TIMEOUT: "请求超时",
+    INVALID_FORMAT: "无效的响应格式",
+    BLOCKED: "被目标站封禁或滑块验证",
+    REDIRECT_ERROR: "请求被重定向异常",
+
+    JSONPATH_FAIL: "JSONPath 提取失败",
+    XPATH_FAIL: "HTML XPath 提取失败",
+    FIELD_MAP_ERROR: "字段映射缺失或类型错误",
+    PARSE_EMPTY: "解析后结果为空",
+    FORMAT_INVALID: "数据格式校验失败",
+
+    CLEAN_MISMATCH: "清洗规则不匹配",
+    TRANSFORM_FAIL: "数据转化失败",
+    MISSING_REQUIRED_FIELD: "字段缺失导致中断",
+
+    DB_WRITE_FAIL: "写入数据库失败",
+    DB_DUPLICATE: "主键冲突或重复数据",
+    DB_CONN_FAIL: "数据库连接失败",
+    FILE_WRITE_FAIL: "写入本地文件失败",
+
+    ETL_IMPORT_FAIL: "ETL 模块导入失败",
+    ETL_RUN_FAIL: "process_video_obj 执行失败",
+    ETL_UNKNOWN_ERROR: "ETL 处理逻辑异常",
+
+    UNKNOWN_ERROR: "未知系统错误",
+    IMPORT_ERROR: "模块导入错误",
+    DYNAMIC_LOAD_ERROR: "动态类加载失败",
+    FILE_NOT_FOUND: "路径错误或文件不存在",
+
+    DATA_EXISTS: "视频内容已存在,跳过",
+    NO_UPDATE: "当前无更新内容",
+    FILTERED: "需人工校验的内容被过滤",
+
+    RETRY: "触发重试机制",
+    RETRY_MAX: "最大重试次数已达,终止任务",
+}

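A short sketch of how the codes and messages above are meant to be paired when logging; the helper name is illustrative:

    from configs import codes
    from configs.messages import MESSAGES

    def status_message(code: str) -> str:
        # Fall back to the generic "unknown system error" text for unmapped codes
        return MESSAGES.get(code, MESSAGES[codes.UNKNOWN_ERROR])

    print(status_message(codes.FETCH_EXCEPTION))  # 抓取单条视频失败,请求异常
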
+ 59 - 0
configs/spiders_config.yaml

@@ -0,0 +1,59 @@
+default:
+  base_url: http://8.217.192.46:8889
+  request_timeout: 30
+  headers:
+    {"Content-Type": "application/json"}
+benshanzhufu:
+  mode: recommend
+  path: /crawler/ben_shan_zhu_fu/recommend
+  method: post
+  request_body:
+    cursor: "1"
+  paging: true
+  max_pages: 5
+  etl_hook: "process_video_obj"
+  response_parse:
+    next_cursor: "$.data.next_cursor"
+    data_path: "$.data.data"
+    fields:
+      video_id: "$.nid"
+      video_title: "$.title"
+      play_cnt: 0
+      publish_time_stamp: "$.update_time"
+      out_user_id: "$.nid"
+      cover_url: "$.video_cover"
+      like_cnt: 0
+      video_url: "$.video_url"
+      out_video_id: "$.nid"
+
+
+zhongqingkandian:
+  mode: recommend
+  path: "/zqkd"
+  paging: true
+  max_pages: 5
+  db_config:
+    table: "zhongqingkandian"
+  etl_hook: "process_video_obj"
+  parse:
+    data_path: "$.data[*]"
+    fields:
+      title: "$.title"
+      vid: "$.id"
+      cover: "$.cover"
+      url: "$.video_url"
+  custom_class: my_crawlers.ZhongqingKandianCrawler
+
+fuqihaoyundao:
+  url: "/fuqi"
+  method: "POST"
+  paging: false
+  retry_times: 2
+  etl_hook: "process_video_obj"
+  parse:
+    data_path: "$.videos[*]"
+    fields:
+      id: "$.id"
+      name: "$.name"
+      mp4: "$.url"
+

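In the fields blocks above, values beginning with "$." are JSONPath expressions while plain values (such as play_cnt: 0) are written through as constants. A sketch of that mapping, mirroring how crawler_worker/universal_crawler.py consumes the config:

    from utils.extractors import safe_extract

    def map_fields(video_data: dict, field_map: dict) -> dict:
        # JSONPath strings are extracted from the payload; anything else is used as a literal value
        return {
            name: safe_extract(video_data, path)
            if isinstance(path, str) and path.startswith("$.") else path
            for name, path in field_map.items()
        }
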
+ 3 - 0
configs/topic_map.yaml

@@ -0,0 +1,3 @@
+topics:
+  - bszf_recommend_prod
+  - zqkd_recommend_prod

+ 0 - 0
crawler_worker/__init__.py


+ 90 - 0
crawler_worker/rabbitmq_consumer.py

@@ -0,0 +1,90 @@
+import pika
+import asyncio
+import json
+import yaml
+from .universal_crawler import AsyncCrawler
+from .utils.log_config import setup_logger
+
+
+class RabbitMQConsumer:
+    def __init__(self, config_path: str):
+        self.config_path = config_path
+        self.aliyun_log = setup_logger("rabbitmq_consumer", "system")
+        self.consumer_tag = None
+
+    def connect(self):
+        """连接到RabbitMQ"""
+        try:
+            with open('config/rabbitmq_config.yaml', 'r', encoding='utf-8') as f:
+                rabbit_config = yaml.safe_load(f)
+
+            self.connection = pika.BlockingConnection(
+                pika.ConnectionParameters(
+                    host=rabbit_config.get('host', 'localhost'),
+                    port=rabbit_config.get('port', 5672),
+                    credentials=pika.PlainCredentials(
+                        rabbit_config.get('username', 'guest'),
+                        rabbit_config.get('password', 'guest')
+                    )
+                )
+            )
+            self.channel = self.connection.channel()
+            self.aliyun_log.info("成功连接到RabbitMQ")
+            return True
+        except Exception as e:
+            self.aliyun_log.error(f"连接RabbitMQ失败: {str(e)}")
+            return False
+
+    async def process_message(self, ch, method, properties, body):
+        """处理消息"""
+        task = json.loads(body)
+        self.aliyun_log.info(f"收到任务: {task.get('task_id', '未知ID')}")
+
+        platform = task.get('platform', 'unknown_platform')
+        mode = task.get('mode', 'recommend')
+
+        crawler = AsyncCrawler(platform, mode, self.config_path)
+        try:
+            await crawler.run()
+            ch.basic_ack(delivery_tag=method.delivery_tag)
+            self.aliyun_log.info(f"任务完成: {task.get('task_id', '未知ID')}")
+        except Exception as e:
+            self.aliyun_log.error(f"处理任务异常: {str(e)}")
+            # 重新排队
+            ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
+
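+    def setup_queue(self):
+        """声明消费队列并返回队列名。
+        最小示意实现:假设队列名配置在 config/rabbitmq_config.yaml 的 queue 字段,缺省为 crawler_tasks。
+        """
+        try:
+            with open('config/rabbitmq_config.yaml', 'r', encoding='utf-8') as f:
+                rabbit_config = yaml.safe_load(f)
+            queue_name = rabbit_config.get('queue', 'crawler_tasks')
+            self.channel.queue_declare(queue=queue_name, durable=True)
+            return queue_name
+        except Exception as e:
+            self.aliyun_log.error(f"声明队列失败: {str(e)}")
+            return None
+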
+    def start_consuming(self):
+        """开始消费消息"""
+        if not self.connect():
+            return
+
+        queue_name = self.setup_queue()
+        if not queue_name:
+            return
+
+        try:
+            self.channel.basic_consume(
+                queue=queue_name,
+                on_message_callback=self._sync_process_message,
+                auto_ack=False
+            )
+            self.aliyun_log.info(f"开始消费队列: {queue_name}")
+            self.channel.start_consuming()
+        except KeyboardInterrupt:
+            self.channel.stop_consuming()
+        except Exception as e:
+            self.aliyun_log.error(f"消费消息失败: {str(e)}")
+        finally:
+            self.connection.close()
+
+    def _sync_process_message(self, ch, method, properties, body):
+        """同步包装异步处理函数"""
+        asyncio.run(self.process_message(ch, method, properties, body))
+
+
+def main():
+    consumer = RabbitMQConsumer("config/platform_config.yaml")
+    consumer.start_consuming()
+
+
+if __name__ == "__main__":
+    main()

+ 205 - 0
crawler_worker/universal_crawler.py

@@ -0,0 +1,205 @@
+import os
+import sys
+import json
+import random
+import time
+import uuid
+import yaml
+import requests
+
+from datetime import datetime
+from typing import Dict, Any, List, Optional
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
+from utils.extractors import safe_extract,extract_multiple
+
+# 添加公共模块路径
+sys.path.append(os.getcwd())
+print(os.getcwd())
+
+from application.items import VideoItem
+from application.pipeline import PiaoQuanPipeline
+from application.common.messageQueue import MQ
+from application.common.log import AliyunLogger
+# from application.common.mysql import MysqlHelper
+from configs.messages import MESSAGES
+from configs import codes
+from utils.config_loader import ConfigLoader
+from application.common.log import Local
+from configs.config import base_url
+
+
+class UniversalCrawler:
+    """通用爬虫类,通过YAML配置驱动不同平台的爬取逻辑"""
+
+    def __init__(self, platform: str, mode: str, rule_dict: Dict, user_list: List, env: str = "prod"):
+        """
+        初始化爬虫
+        :param platform: 平台名称(对应YAML文件名)
+        :param env: 运行环境
+        """
+        self.platform = platform
+        self.mode = mode
+        self.rule_dict = rule_dict
+        self.user_list = user_list
+        self.env = env
+        self.config_path = "/Users/zhangliang/Documents/piaoquan/AutoScraperX/configs/spiders_config.yaml"
+        self.config = ConfigLoader().get_platform_config(self.platform)
+        self.aliyun_log = AliyunLogger(platform=platform, mode=self.config["mode"])
+        self.mq = MQ(topic_name=f"topic_crawler_etl_{env}")
+        # self.mysql = MysqlHelper(mode=self.config["mode"], platform=platform)
+        self.logger = Local.init_logger(platform=self.platform, mode=self.mode, log_level="INFO", log_to_console=True)
+        self.download_cnt = 0
+        self.limit_flag = False
+        self.base_api = base_url
+
+    @retry(
+        stop=stop_after_attempt(3),  # 最多重试 3 次
+        wait=wait_fixed(2),  # 每次重试间隔 2 秒
+        retry=retry_if_exception_type((requests.RequestException, ValueError)),
+        retry_error_callback=lambda retry_state: None,  # 重试耗尽后返回 None,而不是抛出 RetryError
+    )
+    def _send_request(self, method: str, url: str, headers, payload, timeout=30) -> Optional[Dict]:
+        """发送API请求,失败自动重试最多3次"""
+        try:
+            response = requests.request(
+                method=method,
+                url=url,
+                headers=headers,
+                json=payload,
+                timeout=timeout
+            )
+            response.raise_for_status()
+            resp = response.json()
+            if resp.get("code") == 0:
+                return resp
+            raise ValueError(f"接口响应非0:{resp}")
+        except Exception as e:
+            # 记录失败并重新抛出,交由 tenacity 判断是否重试
+            self.aliyun_log.logging(
+                code="3000",
+                message=f"请求失败: {url}",
+                data={"error": str(e)}
+            )
+            raise
+
+    def _process_video(self, video_data: Dict) -> bool:
+        """处理单个视频数据"""
+        # 从配置中获取字段映射
+        field_map = self.config["response_parse"]["fields"]
+
+        # 创建视频项
+        item = VideoItem()
+        for field_name, path in field_map.items():
+            if isinstance(path, str) and path.startswith("$."):
+
+                match = safe_extract(video_data,path)
+                item.add_video_info(field_name, match)
+            else:
+                # 如果是固定值(int、str等),直接使用
+                item.add_video_info(field_name,path)
+
+        # 添加固定字段
+        item.add_video_info("platform", self.platform)
+        item.add_video_info("strategy", self.config["mode"])
+        item.add_video_info("session", f"{self.platform}-{int(time.time())}")
+
+        # 随机选择一个用户
+        our_user = random.choice(self.user_list)
+        item.add_video_info("user_id", our_user["uid"])
+        item.add_video_info("user_name", our_user["nick_name"])
+
+        print(item)
+
+        # 处理管道
+        trace_id = f"{self.platform}-{uuid.uuid4()}"
+        pipeline = PiaoQuanPipeline(
+            platform=self.platform,
+            mode=self.config["mode"],
+            rule_dict=self.rule_dict,
+            env=self.env,
+            item=item.produce_item(),
+            trace_id=trace_id,
+        )
+
+        if pipeline.process_item():
+            self.download_cnt += 1
+            self.mq.send_msg(item.produce_item())
+            self.aliyun_log.logging(
+                code="1002",
+                message="成功发送至ETL",
+                data=item.produce_item()
+            )
+
+            # 检查下载限制
+            min_limit = self.config.get("download_limit", {}).get("min", 200)
+            if self.download_cnt >= min_limit:
+                self.limit_flag = True
+                self.aliyun_log.logging(
+                    code="2000",
+                    message=f"达到下载限制: {min_limit}",
+                )
+            return True
+        return False
+
+
+    # --------------------- 自定义处理函数 ---------------------
+    def _func_current_timestamp(self, _) -> int:
+        """获取当前时间戳"""
+        return int(time.time())
+
+    def _func_formatted_time(self, _) -> str:
+        """获取格式化时间"""
+        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    def _func_random_delay(self, _) -> None:
+        """随机延迟"""
+        min_delay = self.config.get("delay", {}).get("min", 3)
+        max_delay = self.config.get("delay", {}).get("max", 8)
+        time.sleep(random.randint(min_delay, max_delay))
+        return None
+
+    def run(self):
+        """执行爬取任务"""
+        self.logger.info(f"开始执行爬虫{self.platform}")
+
+        request_body = dict(self.config.get("request_body", {}))
+        max_pages = self.config.get("max_pages", 1) if self.config.get("paging") else 1
+        page = 0
+
+        while not self.limit_flag and page < max_pages:
+            # 获取当前页列表数据
+            response_data = self._send_request(
+                self.config["method"].upper(),
+                self.config["url"],
+                self.config.get("headers", {}),
+                request_body
+            )
+            if not response_data:
+                return
+
+            # 按配置的 JSONPath 提取视频列表
+            video_list = safe_extract(response_data, self.config["response_parse"]["data_path"])
+            self.logger.info(f"获取到的视频列表:{json.dumps(video_list, ensure_ascii=False)}")
+            if not video_list:
+                return
+
+            for video_data in video_list:
+                self.logger.info(f"视频对象{video_data}")
+                if self.limit_flag:
+                    break
+                self._process_video(video_data)
+
+                # 执行额外操作(如曝光上报);假设 endpoint 为完整 URL,默认使用 POST
+                for action in self.config.get("post_actions", []):
+                    if action["trigger"] == "after_video_processed":
+                        self._send_request(
+                            action.get("method", "POST").upper(),
+                            action["endpoint"],
+                            self.config.get("headers", {}),
+                            action.get("payload", {}),
+                        )
+
+            # 翻页:将 next_cursor 写回请求体的 cursor 字段(假设接口按 cursor 翻页)
+            next_cursor_path = self.config["response_parse"].get("next_cursor")
+            next_cursor = safe_extract(response_data, next_cursor_path) if next_cursor_path else None
+            if not next_cursor:
+                return
+            request_body["cursor"] = str(next_cursor)
+            page += 1
+
+
+if __name__ == '__main__':
+    cr = UniversalCrawler("benshanzhufu", "recommend",
+                          rule_dict={'videos_cnt': {'min': 500, 'max': 0}, 'duration': {'min': 30, 'max': 1200}},
+                          user_list=[{"uid": 20631262, "link": "recommend_2060", "nick_name": "人老心不老"}])
+
+    cr.run()

+ 76 - 0
main.py

@@ -0,0 +1,76 @@
+import threading
+import traceback
+import json
+import time
+import uuid
+
+from application.common import AliyunLogger, get_consumer, ack_message
+from application.common.log import Local
+from crawler_worker.universal_crawler import UniversalCrawler
+from application.config import TopicGroup
+from application.service.user_service import get_user_list
+from application.service.rule_service import get_rule_dict
+
+def generate_trace_id():
+    return f"{uuid.uuid4().hex}{int(time.time() * 1000)}"
+
+def handle_message(topic: str):
+    consumer = get_consumer(topic)
+    logger = AliyunLogger(platform=topic, mode="unknown")
+
+    while True:
+        try:
+            messages = consumer.consume_message(wait_seconds=10, batch_size=1)
+            if not messages:
+                continue
+
+            for message in messages:
+                trace_id = generate_trace_id()
+                body = message.message_body
+
+                try:
+                    payload = json.loads(body)
+                    platform = payload["platform"]
+                    mode = payload.get("mode", "recommend")
+                    logger = AliyunLogger(platform=platform, mode=mode)
+                    Local.logger(platform, mode).info(f"[trace_id={trace_id}] 收到任务: {body}")
+
+                    # 加载 user_list 与 rule_dict
+                    user_list = get_user_list(platform, mode)
+                    rule_dict = get_rule_dict(platform, mode)
+
+                    # 同步执行 UniversalCrawler
+                    crawler = UniversalCrawler(platform, mode, rule_dict, user_list)
+                    crawler.run()
+
+                    # 执行成功后 ack
+                    ack_message(mode, platform, message, consumer, trace_id=trace_id)
+                    logger.logging(code="1000", message="任务成功完成并确认消息", trace_id=trace_id)
+
+                except Exception as e:
+                    logger.logging(
+                        code="9001",
+                        message=f"处理消息失败(未确认 ack): {e}\n{traceback.format_exc()}",
+                        trace_id=trace_id,
+                        data=body,
+                    )
+                    # 不 ack,等待下次重试
+        except Exception as err:
+            logger.logging(code="9002", message=f"消费失败: {err}\n{traceback.format_exc()}")
+        time.sleep(2)
+
+def main():
+    topic_list = TopicGroup().topics
+    print(f"监听 Topics:{topic_list}")
+
+    threads = []
+    for topic in topic_list:
+        t = threading.Thread(target=handle_message, args=(topic,))
+        t.start()
+        threads.append(t)
+
+    for t in threads:
+        t.join()
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
pipelines/__init__.py


+ 0 - 0
scheduler/__init__.py


+ 45 - 0
scheduler/scheduler_main.py

@@ -0,0 +1,45 @@
+# scheduler_main.py - 爬虫调度主程序
+import asyncio
+import json
+import time
+import traceback
+import sys
+import os
+from crawler_controller import CrawlerController
+from application.common.log import Local
+from application.common import AliyunLogger
+from crawler_worker.universal_crawler import AsyncCrawler
+
+
+async def main():
+    """主函数"""
+    # 设置日志
+    logger = AliyunLogger(platform="system", mode="manager")
+
+    try:
+        # 从环境变量获取配置
+        config_topic = os.getenv("CONFIG_TOPIC", "crawler_config")
+        config_group = os.getenv("CONFIG_GROUP", "crawler_config_group")
+
+        # 创建爬虫控制器
+        # platform / mode 从环境变量读取(变量名与默认值为示意)
+        platform = os.getenv("CRAWLER_PLATFORM", "benshanzhufu")
+        mode = os.getenv("CRAWLER_MODE", "recommend")
+        controller = AsyncCrawler(
+            platform=platform,
+            mode=mode,
+        )
+        # 启动控制器
+        await controller.run()
+
+        # 保持主线程运行
+        while True:
+            await asyncio.sleep(60)
+
+    except Exception as e:
+        tb = traceback.format_exc()
+        message = f"主程序发生错误: {e}\n{tb}"
+        logger.logging(code="1006", message=message)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    # 运行主事件循环
+    asyncio.run(main())

+ 0 - 0
utils/__init__.py


+ 37 - 0
utils/config_loader.py

@@ -0,0 +1,37 @@
+import yaml
+import os
+from urllib.parse import urljoin
+from utils.project_paths import config_spiders_path
+
+
+class ConfigLoader:
+    def __init__(self, config_path=config_spiders_path):
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(f"[配置错误] 找不到配置文件: {config_path}")
+        self.config_path = config_path
+        self.config = self._load_yaml()
+
+    def _load_yaml(self):
+        with open(self.config_path, "r", encoding="utf-8") as f:
+            return yaml.safe_load(f)
+
+    def get_platform_config(self, platform: str) -> dict:
+        """获取平台配置,并拼接完整 URL"""
+        if platform not in self.config:
+            raise ValueError(f"[配置错误] 未找到平台配置: {platform}")
+
+        platform_config = self.config.get(platform, {})
+        base_config = self.config.get("default", {})
+
+        # 合并配置:平台配置覆盖默认配置
+        merged = {**base_config, **platform_config}
+
+        # 自动拼接完整 url(优先用完整 url)
+        if "url" not in merged and "base_url" in merged and "path" in merged:
+            merged["url"] = urljoin(merged["base_url"], merged["path"])
+
+        return merged
+
+if __name__ == '__main__':
+    config = ConfigLoader().get_platform_config("benshanzhufu")
+    print(config)

+ 29 - 0
utils/extractors.py

@@ -0,0 +1,29 @@
+from jsonpath_ng import parse
+
+def safe_extract(json_obj, path, default=None):
+    """
+    安全提取单个字段值,返回匹配到的第一个,否则返回默认值。
+
+    :param json_obj: 输入的 JSON 对象
+    :param path: JSONPath 表达式
+    :param default: 提取失败时返回的默认值
+    :return: 提取结果或默认值
+    """
+    try:
+        jsonpath_expr = parse(path)
+        match = jsonpath_expr.find(json_obj)
+        if match:
+            return match[0].value
+    except Exception as e:
+        print(f"[extractor] Error extracting {path}: {e}")
+    return default
+
+def extract_multiple(json_obj, fields: dict) -> dict:
+    """
+    根据字段配置提取多个字段。
+
+    :param json_obj: 输入的 JSON 对象
+    :param fields: 字段配置,如 {"title": "$.title", "id": "$.id"}
+    :return: 字段名 -> 提取值的字典
+    """
+    return {key: safe_extract(json_obj, path) for key, path in fields.items()}

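A quick usage sketch of the two helpers with a made-up payload shaped like the benshanzhufu response:

    from utils.extractors import safe_extract, extract_multiple

    payload = {"data": {"next_cursor": "2", "data": [{"nid": 101, "title": "示例视频"}]}}
    print(safe_extract(payload, "$.data.next_cursor"))  # "2"
    first = safe_extract(payload, "$.data.data[0]", default={})
    print(extract_multiple(first, {"out_video_id": "$.nid", "video_title": "$.title"}))
    # {'out_video_id': 101, 'video_title': '示例视频'}
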
+ 10 - 0
utils/path_utils.py

@@ -0,0 +1,10 @@
+import os
+
+def get_project_path() -> str:
+    """
+    获取 AutoScraperX 项目根路径
+    """
+    return os.path.dirname(os.path.abspath(__file__)).split("AutoScraperX")[0] + "AutoScraperX"
+
+if __name__ == '__main__':
+    print( get_project_path())

+ 31 - 0
utils/project_paths.py

@@ -0,0 +1,31 @@
+import os
+from utils.path_utils import get_project_path
+
+# 项目根目录
+project_root = get_project_path()
+
+# 配置文件路径
+config_dir = os.path.join(project_root, "configs")
+config_spiders_path = os.path.join(config_dir, "spiders_config.yaml")
+
+# 日志路径(根路径 + log_store)
+log_dir = os.path.join(project_root, "log_store")
+
+# 模型路径(如有)
+model_dir = os.path.join(project_root, "models")
+
+# 临时文件、缓存目录
+tmp_dir = os.path.join(project_root, "tmp")
+
+# 其他路径可按需添加
+# db_config_path = os.path.join(config_dir, "db.yaml")
+
+# 导出路径变量
+__all__ = [
+    "project_root",
+    "config_dir",
+    "config_spiders_path",
+    "log_dir",
+    "model_dir",
+    "tmp_dir",
+]