wangkun 1 рік тому
батько
коміт
7e8fa2af9c

+ 10 - 0
README.MD

@@ -112,6 +112,16 @@ ps aux | grep shipinhao_search
 ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
 ```
 
 
+#### CPU/MEMORY 监控
+```commandline
+正式环境
+* * * * * /bin/sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "prod"
+线下调试
+sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "dev"
+杀掉监控进程
+ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
 
 
 #### 调用MQ的爬虫进程守护: main/process_mq.sh
 #### 调用MQ的爬虫进程守护: main/process_mq.sh
 ```commandline
 ```commandline

+ 158 - 0
dev/dev_script/mask_watermark.py

@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/21
+import cv2
+
+
class MaskWatermark:
    """OpenCV helpers for locating, masking, and removing watermarks in images/videos."""

    @classmethod
    def find_watermark(cls, image_path):
        """
        Auto-detect a line-shaped watermark in an image and return its bounding area.

        :param image_path: path of the image to scan
        :return: watermark area as (x, y, width, height), or None when the image
                 cannot be read or no plausible watermark lines are found
        """
        import math  # stdlib only; needed for the Hough angle resolution below

        # Read the image; cv2.imread returns None (no exception) on failure.
        image = cv2.imread(image_path)
        if image is None:
            return None

        # Convert to grayscale for edge detection.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Detect edges with Canny.
        edges = cv2.Canny(gray, 100, 200)

        # Probabilistic Hough transform to find straight line segments.
        # BUG FIX: cv2 has no `pi` attribute (the original `cv2.pi / 180` raised
        # AttributeError); the 1-degree angle resolution is math.pi / 180.
        lines = cv2.HoughLinesP(edges, 1, math.pi / 180, threshold=100, minLineLength=100, maxLineGap=10)

        # Keep lines that look like a watermark: nearly horizontal (|dy| < 5)
        # and reasonably long (|dx| > 50). Tune these for the actual material.
        watermark_lines = []
        if lines is not None:
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if abs(y2 - y1) < 5 and abs(x2 - x1) > 50:
                    watermark_lines.append(line)

        # Bounding box over all candidate line endpoints.
        if len(watermark_lines) > 1:
            x_coords = [line[0][0] for line in watermark_lines] + [line[0][2] for line in watermark_lines]
            y_coords = [line[0][1] for line in watermark_lines] + [line[0][3] for line in watermark_lines]
            min_x = min(x_coords)
            max_x = max(x_coords)
            min_y = min(y_coords)
            max_y = max(y_coords)
            watermark_area = (min_x, min_y, max_x - min_x, max_y - min_y)  # (x, y, w, h)
        else:
            watermark_area = None

        return watermark_area

    @classmethod
    def mask_watermark(cls, input_path, output_path, watermark_area):
        """
        Black out a fixed rectangular region in every frame of a video.

        :param input_path: source video path
        :param output_path: destination video path
        :param watermark_area: region to mask, as (x, y, width, height)
        """
        # Open the source video.
        video = cv2.VideoCapture(input_path)

        # Output frames must match the source dimensions.
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # BUG FIX: the original hard-coded 30.0 fps, so output from non-30fps
        # sources played at the wrong speed; read the source fps, falling back
        # to 30.0 when the container does not report one.
        fps = video.get(cv2.CAP_PROP_FPS) or 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # swap the codec here if needed
        output = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = video.read()
            if not ret:
                break

            # Zero out the watermark region; replace with inpainting or other
            # pixel substitution if a plain black box is not acceptable.
            x, y, w, h = watermark_area
            frame[y:y + h, x:x + w] = 0

            output.write(frame)

        # Release handles so the output file is flushed/finalized.
        video.release()
        output.release()

        print("成功去除水印,并保存为", output_path)

    @classmethod
    def remove_watermark(cls, video_path, output_path, threshold_area=1000):
        """
        Remove moving-overlay watermarks by differencing each frame against the
        first frame and blacking out large changed contours.

        :param video_path: source video path
        :param output_path: destination video path
        :param threshold_area: minimum contour area (px) treated as watermark.
                               BUG FIX: the original read an undefined global
                               `threshold_area` (NameError outside the demo
                               script); it is now a parameter defaulting to the
                               demo's value of 1000.
        """
        video = cv2.VideoCapture(video_path)

        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Keep the source frame rate (fallback 30.0), matching mask_watermark.
        fps = video.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # swap the codec here if needed
        output = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # The first frame serves as the static background reference.
        ret, background = video.read()
        if not ret:
            print("无法读取背景帧")
            return

        while True:
            ret, frame = video.read()
            if not ret:
                break

            # Pixel-wise difference between the current frame and the background.
            diff = cv2.absdiff(frame, background)
            gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

            # Binarize the difference to isolate changed pixels.
            _, binary = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)

            # Dilate to fill small holes and smooth contour edges.
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
            dilated = cv2.dilate(binary, kernel, iterations=3)

            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Black out every sufficiently large changed region; refine the
            # filter (shape, aspect ratio, ...) for the actual material.
            for contour in contours:
                if cv2.contourArea(contour) > threshold_area:
                    cv2.drawContours(frame, [contour], -1, (0, 0, 0), cv2.FILLED)

            output.write(frame)

        video.release()
        output.release()
+
+
if __name__ == "__main__":
    # Demo 1: locate a watermark region in a still image.
    image_path = 'image.jpg'  # path of the image to scan
    watermark_area = MaskWatermark.find_watermark(image_path)
    print("水印区域的位置和大小:", watermark_area)

    # Demo 2: blank out a fixed watermark rectangle in a video.
    input_path = 'input.mp4'    # source video
    output_path = 'output.mp4'  # destination video
    watermark_area = (100, 100, 200, 200)  # (x, y, width, height)
    MaskWatermark.mask_watermark(input_path, output_path, watermark_area)

    # Demo 3: difference-based watermark removal.
    video_path = 'video.mp4'    # source video
    output_path = 'output.mp4'  # destination video
    # NOTE: remove_watermark reads this module-level name; keep it defined
    # before the call.
    threshold_area = 1000  # contour-area cutoff, tune per material
    MaskWatermark.remove_watermark(video_path, output_path)
    print("成功去除水印,并保存为", output_path)

+ 1 - 1
dev/logs/__init__.py → monitor/__init__.py

@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Author: wangkun
-# @Time: 2023/7/4
+# @Time: 2023/7/20

+ 1 - 1
shipinhao/logs/__init__.py → monitor/cpu_memory/__init__.py

@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Author: wangkun
-# @Time: 2023/5/9
+# @Time: 2023/7/20

+ 111 - 0
monitor/cpu_memory/cpu_memory.py

@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20
+import json
+import os
+import socket
+import sys
+import psutil
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+
+
class MonitorCpuMemory:
    """Collects per-script CPU/memory usage for the crawler processes and logs it."""

    @classmethod
    def get_script_list(cls, log_type, crawler, env):
        """
        Return the names of the crawler scripts to monitor.

        Names come from the `crawler_enum` table (keys containing "run"),
        plus this monitor's own entry point "run_cpu_memory".
        """
        select_sql = """ SELECT DISTINCT `key` FROM crawler_enum WHERE `key` LIKE "%run%";  """
        sql_response = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        script_list = [response["key"] for response in sql_response]
        script_list.append("run_cpu_memory")
        return script_list

    @classmethod
    def get_ip_address(cls):
        """Return the local outbound IP address, or None if it cannot be determined."""
        try:
            # A UDP "connect" sends no packets; it only asks the OS which local
            # interface would be used to reach 8.8.8.8.
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            try:
                sock.connect(("8.8.8.8", 80))
                return sock.getsockname()[0]
            finally:
                # BUG FIX: the original never closed the socket, leaking a file
                # descriptor on every call.
                sock.close()
        except socket.error:
            return None

    @classmethod
    def get_pid_path(cls, script_name):
        """
        Find a running process whose command line mentions `script_name`.

        :return: {"pid": int, "path": str(matching cmdline arg)} for the first
                 match, or None when no such process is running
        """
        for proc in psutil.process_iter():
            try:
                cmds = proc.cmdline()
                for cmd in cmds:
                    if script_name in cmd:
                        return {
                            "pid": proc.pid,
                            "path": cmd,
                        }
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # Process vanished or is inaccessible; skip it.
                pass
        return None

    @classmethod
    def get_cpu_memory(cls, script_name):
        """
        Return CPU/memory usage for the process running `script_name`.

        :return: {"pid", "path", "cpu", "memory"} or None when the script is
                 not running.
        NOTE(review): psutil's cpu_percent() with no interval returns 0.0 on
        the first call for a Process object; values are meaningful from the
        second sampling onwards.
        """
        pid_path = cls.get_pid_path(script_name)
        if pid_path is None:
            return None
        # Reuse one Process handle instead of constructing it twice.
        proc = psutil.Process(pid_path["pid"])
        cpu_percent = round(proc.cpu_percent(), 2)
        memory_percent = round(proc.memory_percent(), 2)
        return {
            "pid": pid_path["pid"],
            "path": pid_path["path"],
            "cpu": cpu_percent,
            "memory": memory_percent,
        }

    @classmethod
    def get_all_script_cpu_memory(cls, log_type, crawler, env):
        """Sample and log CPU/memory for every monitored script; never raises."""
        script_list = cls.get_script_list(log_type, crawler, env)
        for script_name in script_list:
            try:
                Common.logger(log_type, crawler).info(f"开始监控:{script_name}")
                Common.logging(log_type, crawler, env, f"开始监控:{script_name}")
                crawler_info = cls.get_cpu_memory(script_name)
                if crawler_info is None:
                    Common.logger(log_type, crawler).info("脚本未运行\n")
                    Common.logging(log_type, crawler, env, "脚本未运行\n")
                    continue

                script_info_dict = json.dumps({
                    "crawler_name": script_name,
                    "crawler_ip": cls.get_ip_address(),
                    "crawler_pid": crawler_info["pid"],
                    "crawler_path": crawler_info["path"],
                    "crawler_cpu": crawler_info["cpu"],
                    "crawler_memory": crawler_info["memory"]
                })
                Common.logger(log_type, crawler).info(f'script_info:{script_info_dict}\n')
                Common.logging(log_type, crawler, env, f'{script_info_dict}')
            except Exception as e:
                # One failing script must not abort the whole monitoring round.
                Common.logger(log_type, crawler).error(f"监控{script_name}时异常:{e}\n")
                Common.logging(log_type, crawler, env, f"监控{script_name}时异常:{e}\n")
+
+
if __name__ == "__main__":
    # Quick manual check: print the local outbound IP address.
    print(MonitorCpuMemory.get_ip_address())

+ 3 - 0
monitor/monitor_main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20

+ 28 - 0
monitor/monitor_main/run_cpu_memory.py

@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20
+import argparse
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from monitor.cpu_memory.cpu_memory import MonitorCpuMemory
+
+
class MonitorMain:
    """Entry-point wrapper that runs one round of CPU/memory monitoring."""

    @classmethod
    def monitor_main(cls, log_type, crawler, env):
        """
        Run a single monitoring pass over all crawler scripts.

        :param log_type: log category name
        :param crawler: crawler/project name used for logging
        :param env: environment name, e.g. "prod" or "dev"
        """
        # Idiom fix: these literals contain no placeholders, so the f-prefix
        # was redundant.
        Common.logger(log_type, crawler).info("开始监控脚本 cpu、memory 状态\n")
        Common.logging(log_type, crawler, env, "开始监控脚本 cpu、memory 状态\n")
        MonitorCpuMemory.get_all_script_cpu_memory(log_type, crawler, env)
        Common.logger(log_type, crawler).info("监控一轮结束\n")
        Common.logging(log_type, crawler, env, "监控一轮结束\n")
+
+
if __name__ == "__main__":
    # Command-line entry point: parse options, then run one monitoring round.
    arg_parser = argparse.ArgumentParser()
    for flag in ('--log_type', '--crawler', '--env'):
        arg_parser.add_argument(flag, type=str)
    cli_args = arg_parser.parse_args()
    MonitorMain.monitor_main(
        log_type=cli_args.log_type,
        crawler=cli_args.crawler,
        env=cli_args.env,
    )

+ 38 - 0
monitor/monitor_main/run_monitor.sh

@@ -0,0 +1,38 @@
#!/bin/bash
# Cron watchdog: ensure the CPU/memory monitor (run_cpu_memory.py) is running,
# restart it if not, and purge log files older than 10 days.
#
# Usage: run_monitor.sh <script path> <log_type> <crawler> <env>

path=$1     # crawler script to (re)start
log_type=$2 # log category
crawler=$3  # crawler/project name
env=$4      # environment: "dev" or "prod"

# BUG FIX: ${env} is now quoted — the unquoted test failed with
# "[: =: unary operator expected" when the 4th argument was missing.
# Only the install dir differs per environment; the rest is shared.
if [ "${env}" = "dev" ]; then
  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
else
  piaoquan_crawler_dir=/root/piaoquan_crawler/
fi
profile_path=/etc/profile
python=python3
log_path=${piaoquan_crawler_dir}monitor/logs/$(date +%Y-%m-%d)-shell.log

echo "开始..." >> ${log_path}

# ==================== process heartbeat check ====================
echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 run_cpu_memory 进程状态" >> ${log_path}
# Discard pgrep's PID output (it polluted cron mail); only the exit status matters.
if ! pgrep -f run_cpu_memory.py > /dev/null; then
  echo "$(date "+%Y-%m-%d_%H:%M:%S") 未运行, 正在启动..." >> ${log_path}
  cd ${piaoquan_crawler_dir} && nohup ${python} -u ${path} --log_type="${log_type}" --crawler="${crawler}" --env="${env}" >> ${log_path} 2>&1 &
  echo "$(date "+%Y-%m-%d %H:%M:%S") 启动完成!" >> ${log_path}
else
  echo "$(date "+%Y-%m-%d %H:%M:%S") run_cpu_memory 进程状态正常。" >> ${log_path}
fi
# =================================================================

# Purge shell logs older than 10 days.
echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 10 天前的日志文件" >> ${log_path}
find ${piaoquan_crawler_dir}monitor/logs/ -mtime +10 -name "*.log" -exec rm -rf {} \;
echo "$(date "+%Y-%m-%d %H:%M:%S") 日志文件清理完毕" >> ${log_path}
exit 0