Browse Source

add 抓取规则判断方式封装

lierqiang 2 years ago
parent
commit
f744159d53
1 changed files with 93 additions and 1 deletions
  1. 93 1
      common/public.py

+ 93 - 1
common/public.py

@@ -2,16 +2,18 @@
 # @Author: wangkun
 # @Time: 2023/3/27
 import os, sys
+import time
 import random
 
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.scheduling_db import MysqlHelper
+
+
 # from common import Common
 # from scheduling_db import MysqlHelper
 
 
-
 # 过滤词库
 def filter_word(log_type, crawler, source, env):
     """
@@ -93,6 +95,96 @@ def task_fun(task_str):
     }
     return task_dict
 
+
+def download_rule(log_type, crawler, video_dict, rule_dict):
+    """
+    下载视频的基本规则
+    :param log_type: 日志
+    :param crawler: 哪款爬虫
+    :param video_dict: 视频信息,字典格式
+    :param rule_dict: 规则信息,字典格式
+    :return: 满足规则,返回 True;反之,返回 False
+    """
+    rule_playCnt_min = rule_dict.get('playCnt', {}).get('min', 0)
+    rule_playCnt_max = rule_dict.get('playCnt', {}).get('max', 100000000)
+    if rule_playCnt_max == 0:
+        rule_playCnt_max = 100000000
+
+    rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
+    rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
+    if rule_duration_max == 0:
+        rule_duration_max = 100000000
+
+    rule_period_min = rule_dict.get('period', {}).get('min', 0)
+    # rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
+    # if rule_period_max == 0:
+    #     rule_period_max = 100000000
+    #
+    # rule_fans_min = rule_dict.get('fans', {}).get('min', 0)
+    # rule_fans_max = rule_dict.get('fans', {}).get('max', 100000000)
+    # if rule_fans_max == 0:
+    #     rule_fans_max = 100000000
+    #
+    # rule_videos_min = rule_dict.get('videos', {}).get('min', 0)
+    # rule_videos_max = rule_dict.get('videos', {}).get('max', 100000000)
+    # if rule_videos_max == 0:
+    #     rule_videos_max = 100000000
+
+    rule_like_min = rule_dict.get('like', {}).get('min', 0)
+    rule_like_max = rule_dict.get('like', {}).get('max', 100000000)
+    if rule_like_max == 0:
+        rule_like_max = 100000000
+
+    rule_videoWidth_min = rule_dict.get('videoWidth', {}).get('min', 0)
+    rule_videoWidth_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
+    if rule_videoWidth_max == 0:
+        rule_videoWidth_max = 100000000
+
+    rule_videoHeight_min = rule_dict.get('videoHeight', {}).get('min', 0)
+    rule_videoHeight_max = rule_dict.get('videoHeight', {}).get('max', 100000000)
+    if rule_videoHeight_max == 0:
+        rule_videoHeight_max = 100000000
+
+    rule_shareCnt_min = rule_dict.get('shareCnt', {}).get('min', 0)
+    rule_shareCnt_max = rule_dict.get('shareCnt', {}).get('max', 100000000)
+    if rule_shareCnt_max == 0:
+        rule_shareCnt_max = 100000000
+
+    rule_commentCnt_min = rule_dict.get('commentCnt', {}).get('min', 0)
+    rule_commentCnt_max = rule_dict.get('commentCnt', {}).get('max', 100000000)
+    if rule_commentCnt_max == 0:
+        rule_commentCnt_max = 100000000
+
+    Common.logger(log_type, crawler).info(
+        f'rule_duration_max:{rule_duration_max} >= duration:{int(float(video_dict["duration"]))} >= rule_duration_min:{int(rule_duration_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_playCnt_max:{int(rule_playCnt_max)} >= play_cnt:{int(video_dict["play_cnt"])} >= rule_playCnt_min:{int(rule_playCnt_min)}')
+    Common.logger(log_type, crawler).info(
+        f'now:{int(time.time())} - publish_time_stamp:{int(video_dict["publish_time_stamp"])} <= {3600 * 24 * int(rule_period_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_like_max:{int(rule_like_max)} >= like_cnt:{int(video_dict["like_cnt"])} >= rule_like_min:{int(rule_like_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_commentCnt_max:{int(rule_commentCnt_max)} >= comment_cnt:{int(video_dict["comment_cnt"])} >= rule_commentCnt_min:{int(rule_commentCnt_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_shareCnt_max:{int(rule_shareCnt_max)} >= share_cnt:{int(video_dict["share_cnt"])} >= rule_shareCnt_min:{int(rule_shareCnt_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_videoWidth_max:{int(rule_videoWidth_max)} >= video_width:{int(video_dict["video_width"])} >= rule_videoWidth_min:{int(rule_videoWidth_min)}')
+    Common.logger(log_type, crawler).info(
+        f'rule_videoHeight_max:{int(rule_videoHeight_max)} >= video_height:{int(video_dict["video_height"])} >= rule_videoHeight_min:{int(rule_videoHeight_min)}')
+
+    if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
+            and int(rule_playCnt_max) >= int(video_dict['play_cnt']) >= int(rule_playCnt_min) \
+            and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * int(rule_period_min) \
+            and int(rule_like_max) >= int(video_dict['like_cnt']) >= int(rule_like_min) \
+            and int(rule_commentCnt_max) >= int(video_dict['comment_cnt']) >= int(rule_commentCnt_min) \
+            and int(rule_shareCnt_max) >= int(video_dict['share_cnt']) >= int(rule_shareCnt_min) \
+            and int(rule_videoWidth_max) >= int(video_dict['video_width']) >= int(rule_videoWidth_min) \
+            and int(rule_videoHeight_max) >= int(video_dict['video_height']) >= int(rule_videoHeight_min):
+        return True
+    else:
+        return False
+
+
 if __name__ == "__main__":
     # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
     print(get_config_from_mysql('hour', 'xiaoniangao', 'dev', 'emoji'))