wangkun 1 рік тому
батько
коміт
7e8fa2af9c

+ 10 - 0
README.MD

@@ -112,6 +112,16 @@ ps aux | grep shipinhao_search
 ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep shipinhao_search | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
 ```
 
 
+#### CPU/MEMORY 监控
+```commandline
+正式环境
+* * * * * /bin/sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "prod"
+线下调试
+sh monitor/monitor_main/run_monitor.sh monitor/monitor_main/run_cpu_memory.py "cpumemory" "monitor" "dev"
+杀掉监控进程
+ps aux | grep run_monitor | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
 
 
 #### 调用MQ的爬虫进程守护: main/process_mq.sh
 #### 调用MQ的爬虫进程守护: main/process_mq.sh
 ```commandline
 ```commandline

+ 158 - 0
dev/dev_script/mask_watermark.py

@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/21
+import cv2
+
+
class MaskWatermark:
    """OpenCV helpers for locating, masking, and removing watermarks in images/videos."""

    @classmethod
    def find_watermark(cls, image_path):
        """
        Auto-detect a line-shaped watermark in an image and return its bounding area.

        :param image_path: path of the image to scan
        :return: watermark area as (x, y, width, height), or None when the image
                 cannot be read or no plausible watermark lines are found
        """
        import math  # stdlib only; needed for the Hough angle resolution below

        # Read the image; cv2.imread returns None (no exception) on failure.
        image = cv2.imread(image_path)
        if image is None:
            return None

        # Convert to grayscale for edge detection.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Detect edges with Canny.
        edges = cv2.Canny(gray, 100, 200)

        # Probabilistic Hough transform to find straight line segments.
        # BUG FIX: cv2 has no `pi` attribute (the original `cv2.pi / 180` raised
        # AttributeError); the 1-degree angle resolution is math.pi / 180.
        lines = cv2.HoughLinesP(edges, 1, math.pi / 180, threshold=100, minLineLength=100, maxLineGap=10)

        # Keep lines that look like a watermark: nearly horizontal (|dy| < 5)
        # and reasonably long (|dx| > 50). Tune these for the actual material.
        watermark_lines = []
        if lines is not None:
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if abs(y2 - y1) < 5 and abs(x2 - x1) > 50:
                    watermark_lines.append(line)

        # Bounding box over all candidate line endpoints.
        if len(watermark_lines) > 1:
            x_coords = [line[0][0] for line in watermark_lines] + [line[0][2] for line in watermark_lines]
            y_coords = [line[0][1] for line in watermark_lines] + [line[0][3] for line in watermark_lines]
            min_x = min(x_coords)
            max_x = max(x_coords)
            min_y = min(y_coords)
            max_y = max(y_coords)
            watermark_area = (min_x, min_y, max_x - min_x, max_y - min_y)  # (x, y, w, h)
        else:
            watermark_area = None

        return watermark_area

    @classmethod
    def mask_watermark(cls, input_path, output_path, watermark_area):
        """
        Black out a fixed rectangular region in every frame of a video.

        :param input_path: source video path
        :param output_path: destination video path
        :param watermark_area: region to mask, as (x, y, width, height)
        """
        # Open the source video.
        video = cv2.VideoCapture(input_path)

        # Output frames must match the source dimensions.
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # BUG FIX: the original hard-coded 30.0 fps, so output from non-30fps
        # sources played at the wrong speed; read the source fps, falling back
        # to 30.0 when the container does not report one.
        fps = video.get(cv2.CAP_PROP_FPS) or 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # swap the codec here if needed
        output = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        while True:
            ret, frame = video.read()
            if not ret:
                break

            # Zero out the watermark region; replace with inpainting or other
            # pixel substitution if a plain black box is not acceptable.
            x, y, w, h = watermark_area
            frame[y:y + h, x:x + w] = 0

            output.write(frame)

        # Release handles so the output file is flushed/finalized.
        video.release()
        output.release()

        print("成功去除水印,并保存为", output_path)

    @classmethod
    def remove_watermark(cls, video_path, output_path, threshold_area=1000):
        """
        Remove moving-overlay watermarks by differencing each frame against the
        first frame and blacking out large changed contours.

        :param video_path: source video path
        :param output_path: destination video path
        :param threshold_area: minimum contour area (px) treated as watermark.
                               BUG FIX: the original read an undefined global
                               `threshold_area` (NameError outside the demo
                               script); it is now a parameter defaulting to the
                               demo's value of 1000.
        """
        video = cv2.VideoCapture(video_path)

        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Keep the source frame rate (fallback 30.0), matching mask_watermark.
        fps = video.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # swap the codec here if needed
        output = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # The first frame serves as the static background reference.
        ret, background = video.read()
        if not ret:
            print("无法读取背景帧")
            return

        while True:
            ret, frame = video.read()
            if not ret:
                break

            # Pixel-wise difference between the current frame and the background.
            diff = cv2.absdiff(frame, background)
            gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

            # Binarize the difference to isolate changed pixels.
            _, binary = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)

            # Dilate to fill small holes and smooth contour edges.
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
            dilated = cv2.dilate(binary, kernel, iterations=3)

            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Black out every sufficiently large changed region; refine the
            # filter (shape, aspect ratio, ...) for the actual material.
            for contour in contours:
                if cv2.contourArea(contour) > threshold_area:
                    cv2.drawContours(frame, [contour], -1, (0, 0, 0), cv2.FILLED)

            output.write(frame)

        video.release()
        output.release()
+
+
if __name__ == "__main__":
    # Demo 1: locate a watermark region in a still image.
    image_path = 'image.jpg'  # path of the image to scan
    watermark_area = MaskWatermark.find_watermark(image_path)
    print("水印区域的位置和大小:", watermark_area)

    # Demo 2: blank out a fixed watermark rectangle in a video.
    input_path = 'input.mp4'    # source video
    output_path = 'output.mp4'  # destination video
    watermark_area = (100, 100, 200, 200)  # (x, y, width, height)
    MaskWatermark.mask_watermark(input_path, output_path, watermark_area)

    # Demo 3: difference-based watermark removal.
    video_path = 'video.mp4'    # source video
    output_path = 'output.mp4'  # destination video
    # NOTE: remove_watermark reads this module-level name; keep it defined
    # before the call.
    threshold_area = 1000  # contour-area cutoff, tune per material
    MaskWatermark.remove_watermark(video_path, output_path)
    print("成功去除水印,并保存为", output_path)

+ 1 - 1
dev/logs/__init__.py → monitor/__init__.py

@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Author: wangkun
-# @Time: 2023/7/4
+# @Time: 2023/7/20

+ 1 - 1
shipinhao/logs/__init__.py → monitor/cpu_memory/__init__.py

@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Author: wangkun
-# @Time: 2023/5/9
+# @Time: 2023/7/20

+ 111 - 0
monitor/cpu_memory/cpu_memory.py

@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20
+import json
+import os
+import socket
+import sys
+import psutil
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.scheduling_db import MysqlHelper
+
+
class MonitorCpuMemory:
    """Collects per-script CPU/memory usage for the crawler processes and logs it."""

    @classmethod
    def get_script_list(cls, log_type, crawler, env):
        """
        Return the names of the crawler scripts to monitor.

        Names come from the `crawler_enum` table (keys containing "run"),
        plus this monitor's own entry point "run_cpu_memory".
        """
        select_sql = """ SELECT DISTINCT `key` FROM crawler_enum WHERE `key` LIKE "%run%";  """
        sql_response = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        script_list = [response["key"] for response in sql_response]
        script_list.append("run_cpu_memory")
        return script_list

    @classmethod
    def get_ip_address(cls):
        """Return the local outbound IP address, or None if it cannot be determined."""
        try:
            # A UDP "connect" sends no packets; it only asks the OS which local
            # interface would be used to reach 8.8.8.8.
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            try:
                sock.connect(("8.8.8.8", 80))
                return sock.getsockname()[0]
            finally:
                # BUG FIX: the original never closed the socket, leaking a file
                # descriptor on every call.
                sock.close()
        except socket.error:
            return None

    @classmethod
    def get_pid_path(cls, script_name):
        """
        Find a running process whose command line mentions `script_name`.

        :return: {"pid": int, "path": str(matching cmdline arg)} for the first
                 match, or None when no such process is running
        """
        for proc in psutil.process_iter():
            try:
                cmds = proc.cmdline()
                for cmd in cmds:
                    if script_name in cmd:
                        return {
                            "pid": proc.pid,
                            "path": cmd,
                        }
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # Process vanished or is inaccessible; skip it.
                pass
        return None

    @classmethod
    def get_cpu_memory(cls, script_name):
        """
        Return CPU/memory usage for the process running `script_name`.

        :return: {"pid", "path", "cpu", "memory"} or None when the script is
                 not running.
        NOTE(review): psutil's cpu_percent() with no interval returns 0.0 on
        the first call for a Process object; values are meaningful from the
        second sampling onwards.
        """
        pid_path = cls.get_pid_path(script_name)
        if pid_path is None:
            return None
        # Reuse one Process handle instead of constructing it twice.
        proc = psutil.Process(pid_path["pid"])
        cpu_percent = round(proc.cpu_percent(), 2)
        memory_percent = round(proc.memory_percent(), 2)
        return {
            "pid": pid_path["pid"],
            "path": pid_path["path"],
            "cpu": cpu_percent,
            "memory": memory_percent,
        }

    @classmethod
    def get_all_script_cpu_memory(cls, log_type, crawler, env):
        """Sample and log CPU/memory for every monitored script; never raises."""
        script_list = cls.get_script_list(log_type, crawler, env)
        for script_name in script_list:
            try:
                Common.logger(log_type, crawler).info(f"开始监控:{script_name}")
                Common.logging(log_type, crawler, env, f"开始监控:{script_name}")
                crawler_info = cls.get_cpu_memory(script_name)
                if crawler_info is None:
                    Common.logger(log_type, crawler).info("脚本未运行\n")
                    Common.logging(log_type, crawler, env, "脚本未运行\n")
                    continue

                script_info_dict = json.dumps({
                    "crawler_name": script_name,
                    "crawler_ip": cls.get_ip_address(),
                    "crawler_pid": crawler_info["pid"],
                    "crawler_path": crawler_info["path"],
                    "crawler_cpu": crawler_info["cpu"],
                    "crawler_memory": crawler_info["memory"]
                })
                Common.logger(log_type, crawler).info(f'script_info:{script_info_dict}\n')
                Common.logging(log_type, crawler, env, f'{script_info_dict}')
            except Exception as e:
                # One failing script must not abort the whole monitoring round.
                Common.logger(log_type, crawler).error(f"监控{script_name}时异常:{e}\n")
                Common.logging(log_type, crawler, env, f"监控{script_name}时异常:{e}\n")
+
+
if __name__ == "__main__":
    # Quick manual check: print the local outbound IP address.
    print(MonitorCpuMemory.get_ip_address())

+ 3 - 0
monitor/monitor_main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20

+ 28 - 0
monitor/monitor_main/run_cpu_memory.py

@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/7/20
+import argparse
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from monitor.cpu_memory.cpu_memory import MonitorCpuMemory
+
+
class MonitorMain:
    """Entry-point wrapper that runs one round of CPU/memory monitoring."""

    @classmethod
    def monitor_main(cls, log_type, crawler, env):
        """
        Run a single monitoring pass over all crawler scripts.

        :param log_type: log category name
        :param crawler: crawler/project name used for logging
        :param env: environment name, e.g. "prod" or "dev"
        """
        # Idiom fix: these literals contain no placeholders, so the f-prefix
        # was redundant.
        Common.logger(log_type, crawler).info("开始监控脚本 cpu、memory 状态\n")
        Common.logging(log_type, crawler, env, "开始监控脚本 cpu、memory 状态\n")
        MonitorCpuMemory.get_all_script_cpu_memory(log_type, crawler, env)
        Common.logger(log_type, crawler).info("监控一轮结束\n")
        Common.logging(log_type, crawler, env, "监控一轮结束\n")
+
+
if __name__ == "__main__":
    # Command-line entry point: parse options, then run one monitoring round.
    arg_parser = argparse.ArgumentParser()
    for flag in ('--log_type', '--crawler', '--env'):
        arg_parser.add_argument(flag, type=str)
    cli_args = arg_parser.parse_args()
    MonitorMain.monitor_main(
        log_type=cli_args.log_type,
        crawler=cli_args.crawler,
        env=cli_args.env,
    )

+ 38 - 0
monitor/monitor_main/run_monitor.sh

@@ -0,0 +1,38 @@
#!/bin/bash
# Cron watchdog: ensure the CPU/memory monitor (run_cpu_memory.py) is running,
# restart it if not, and purge log files older than 10 days.
#
# Usage: run_monitor.sh <script path> <log_type> <crawler> <env>

path=$1     # crawler script to (re)start
log_type=$2 # log category
crawler=$3  # crawler/project name
env=$4      # environment: "dev" or "prod"

# BUG FIX: ${env} is now quoted — the unquoted test failed with
# "[: =: unary operator expected" when the 4th argument was missing.
# Only the install dir differs per environment; the rest is shared.
if [ "${env}" = "dev" ]; then
  piaoquan_crawler_dir=/Users/wangkun/Desktop/crawler/piaoquan_crawler/
else
  piaoquan_crawler_dir=/root/piaoquan_crawler/
fi
profile_path=/etc/profile
python=python3
log_path=${piaoquan_crawler_dir}monitor/logs/$(date +%Y-%m-%d)-shell.log

echo "开始..." >> ${log_path}

# ==================== process heartbeat check ====================
echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 run_cpu_memory 进程状态" >> ${log_path}
# Discard pgrep's PID output (it polluted cron mail); only the exit status matters.
if ! pgrep -f run_cpu_memory.py > /dev/null; then
  echo "$(date "+%Y-%m-%d_%H:%M:%S") 未运行, 正在启动..." >> ${log_path}
  cd ${piaoquan_crawler_dir} && nohup ${python} -u ${path} --log_type="${log_type}" --crawler="${crawler}" --env="${env}" >> ${log_path} 2>&1 &
  echo "$(date "+%Y-%m-%d %H:%M:%S") 启动完成!" >> ${log_path}
else
  echo "$(date "+%Y-%m-%d %H:%M:%S") run_cpu_memory 进程状态正常。" >> ${log_path}
fi
# =================================================================

# Purge shell logs older than 10 days.
echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 10 天前的日志文件" >> ${log_path}
find ${piaoquan_crawler_dir}monitor/logs/ -mtime +10 -name "*.log" -exec rm -rf {} \;
echo "$(date "+%Y-%m-%d %H:%M:%S") 日志文件清理完毕" >> ${log_path}
exit 0