wangkun 1 سال پیش
والد
کامیت
76e43985e2

+ 28 - 101
common/public.py

@@ -4,7 +4,6 @@
 import os, sys
 import time
 import random
-
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.scheduling_db import MysqlHelper
@@ -12,27 +11,6 @@ from common.scheduling_db import MysqlHelper
 # from scheduling_db import MysqlHelper
 
 
-# 过滤词库
-def filter_word(log_type, crawler, source, env):
-    """
-    过滤词库
-    :param log_type: 日志
-    :param crawler: 哪款爬虫,如:xiaoniangao
-    :param source: 哪款爬虫,如:小年糕
-    :param env: 环境
-    :return: word_list
-    """
-    select_sql = f""" select * from crawler_filter_word where source="{source}" """
-    words = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
-    word_list = []
-    if len(words) == 0:
-        return word_list
-    for word in words:
-        word_list.append(word['filter_word'])
-
-    return word_list
-
-
 def get_user_from_mysql(log_type, crawler, source, env, action=''):
     sql = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
     results = MysqlHelper.get_values(log_type, crawler, sql, env, action=action)
@@ -109,89 +87,38 @@ def download_rule(log_type, crawler, video_dict, rule_dict):
     :param rule_dict: 规则信息,字典格式
     :return: 满足规则,返回 True;反之,返回 False
     """
-    rule_playCnt_min = rule_dict.get('playCnt', {}).get('min', 0)
-    rule_playCnt_max = rule_dict.get('playCnt', {}).get('max', 100000000)
-    if rule_playCnt_max == 0:
-        rule_playCnt_max = 100000000
-
-    rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
-    rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
-    if rule_duration_max == 0:
-        rule_duration_max = 100000000
-
-    rule_period_min = rule_dict.get('period', {}).get('min', 0)
-    # rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
-    # if rule_period_max == 0:
-    #     rule_period_max = 100000000
-    #
-    # rule_fans_min = rule_dict.get('fans', {}).get('min', 0)
-    # rule_fans_max = rule_dict.get('fans', {}).get('max', 100000000)
-    # if rule_fans_max == 0:
-    #     rule_fans_max = 100000000
-    #
-    # rule_videos_min = rule_dict.get('videos', {}).get('min', 0)
-    # rule_videos_max = rule_dict.get('videos', {}).get('max', 100000000)
-    # if rule_videos_max == 0:
-    #     rule_videos_max = 100000000
-
-    rule_like_min = rule_dict.get('like', {}).get('min', 0)
-    rule_like_max = rule_dict.get('like', {}).get('max', 100000000)
-    if rule_like_max == 0:
-        rule_like_max = 100000000
-
-    rule_videoWidth_min = rule_dict.get('videoWidth', {}).get('min', 0)
-    rule_videoWidth_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
-    if rule_videoWidth_max == 0:
-        rule_videoWidth_max = 100000000
-
-    rule_videoHeight_min = rule_dict.get('videoHeight', {}).get('min', 0)
-    rule_videoHeight_max = rule_dict.get('videoHeight', {}).get('max', 100000000)
-    if rule_videoHeight_max == 0:
-        rule_videoHeight_max = 100000000
-
-    rule_shareCnt_min = rule_dict.get('shareCnt', {}).get('min', 0)
-    rule_shareCnt_max = rule_dict.get('shareCnt', {}).get('max', 100000000)
-    if rule_shareCnt_max == 0:
-        rule_shareCnt_max = 100000000
-
-    rule_commentCnt_min = rule_dict.get('commentCnt', {}).get('min', 0)
-    rule_commentCnt_max = rule_dict.get('commentCnt', {}).get('max', 100000000)
-    if rule_commentCnt_max == 0:
-        rule_commentCnt_max = 100000000
-
-    Common.logger(log_type, crawler).info(
-        f'rule_duration_max:{rule_duration_max} >= duration:{int(float(video_dict["duration"]))} >= rule_duration_min:{int(rule_duration_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_playCnt_max:{int(rule_playCnt_max)} >= play_cnt:{int(video_dict["play_cnt"])} >= rule_playCnt_min:{int(rule_playCnt_min)}')
-    Common.logger(log_type, crawler).info(
-        f'now:{int(time.time())} - publish_time_stamp:{int(video_dict["publish_time_stamp"])} <= {3600 * 24 * int(rule_period_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_like_max:{int(rule_like_max)} >= like_cnt:{int(video_dict["like_cnt"])} >= rule_like_min:{int(rule_like_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_commentCnt_max:{int(rule_commentCnt_max)} >= comment_cnt:{int(video_dict["comment_cnt"])} >= rule_commentCnt_min:{int(rule_commentCnt_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_shareCnt_max:{int(rule_shareCnt_max)} >= share_cnt:{int(video_dict["share_cnt"])} >= rule_shareCnt_min:{int(rule_shareCnt_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_videoWidth_max:{int(rule_videoWidth_max)} >= video_width:{int(video_dict["video_width"])} >= rule_videoWidth_min:{int(rule_videoWidth_min)}')
-    Common.logger(log_type, crawler).info(
-        f'rule_videoHeight_max:{int(rule_videoHeight_max)} >= video_height:{int(video_dict["video_height"])} >= rule_videoHeight_min:{int(rule_videoHeight_min)}')
-
-    if int(rule_duration_max) >= int(float(video_dict["duration"])) >= int(rule_duration_min) \
-            and int(rule_playCnt_max) >= int(video_dict['play_cnt']) >= int(rule_playCnt_min) \
-            and int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * int(rule_period_min) \
-            and int(rule_like_max) >= int(video_dict['like_cnt']) >= int(rule_like_min) \
-            and int(rule_commentCnt_max) >= int(video_dict['comment_cnt']) >= int(rule_commentCnt_min) \
-            and int(rule_shareCnt_max) >= int(video_dict['share_cnt']) >= int(rule_shareCnt_min) \
-            and int(rule_videoWidth_max) >= int(video_dict['video_width']) >= int(rule_videoWidth_min) \
-            and int(rule_videoHeight_max) >= int(video_dict['video_height']) >= int(rule_videoHeight_min):
-        return True
-    else:
-        return False
+    # 格式化 video_dict:publish_time_stamp
+    if "publish_time_stamp" in video_dict.keys():
+        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
+    # 格式化 video_dict:period
+    if "period" not in video_dict.keys() and "publish_time" in video_dict.keys():
+        video_dict["period"] = int((int(time.time() * 1000) - video_dict["publish_time"]) / (3600 * 24 * 1000))
+    # 格式化 rule_dict 最大值取值为 0 的问题
+    for rule_value in rule_dict.values():
+        if rule_value["max"] == 0:
+            rule_value["max"] = 999999999999999
+    # 格式化 rule_dict 有的 key,video_dict 中没有的问题
+    for rule_key in rule_dict.keys():
+        if rule_key not in video_dict.keys():
+            video_dict[rule_key] = int(rule_dict[rule_key]["max"] / 2)
+    # 比较结果,输出:True / False
+    for video_key, video_value in video_dict.items():
+        for rule_key, rule_value in rule_dict.items():
+            if video_key == rule_key:
+                result = rule_value["min"] <= video_value <= rule_value["max"]
+                # print(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
+                Common.logger(log_type, crawler).info(f'{video_key}: {rule_value["min"]} <= {video_value} <= {rule_value["max"]},{result}')
+                if result is False:
+                    return False
+                else:
+                    continue
+    return True
 
 
 if __name__ == "__main__":
     # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
-    print(get_config_from_mysql('hour', 'xiaoniangao', 'prod', 'emoji'))
+    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
+    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
     # task_str = "[('task_id','11')," \
     #            "('task_name','小年糕小时榜')," \
     #            "('source','xiaoniangao')," \

+ 13 - 4
gongzhonghao/gongzhonghao_follow/gongzhonghao_follow.py

@@ -20,7 +20,7 @@ from selenium import webdriver
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
-from common.public import filter_word
+from common.public import get_config_from_mysql
 from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 
@@ -372,8 +372,12 @@ class GongzhonghaoFollow:
         if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
             Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
         # 标题敏感词过滤
-        elif any(word if word in video_dict['video_title'] else False for word in
-                 filter_word(log_type, crawler, "公众号", env)) is True:
+        elif any(word if word in video_dict['video_title']
+                 else False for word in get_config_from_mysql(log_type=log_type,
+                                                              source=crawler,
+                                                              env=env,
+                                                              text="filter",
+                                                              action="")) is True:
             Common.logger(log_type, crawler).info("标题已中过滤词\n")
         # 已下载判断
         elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
@@ -534,11 +538,16 @@ class GongzhonghaoFollow:
 
 if __name__ == "__main__":
     # GongzhonghaoFollow.get_token(log_type="follow", crawler="gongzhonghao")
-    GongzhonghaoFollow.get_users()
+    # GongzhonghaoFollow.get_users()
     # GongzhonghaoFollow.get_videoList(log_type="follow",
     #                                  crawler="gongzhonghao",
     #                                  user="香音难忘",
     #                                  index=1,
     #                                  oss_endpoint="out",
     #                                  env="dev")
+    print(get_config_from_mysql(log_type="test",
+                              source="gongzhonghao",
+                              env="prod",
+                              text="filter",
+                              action=""))
     pass

+ 7 - 3
gongzhonghao/gongzhonghao_follow/gongzhonghao_follow_2.py

@@ -21,7 +21,7 @@ from selenium import webdriver
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
-from common.public import filter_word
+from common.public import get_config_from_mysql
 from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 
@@ -373,8 +373,12 @@ class GongzhonghaoFollow2:
         if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
             Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
         # 标题敏感词过滤
-        elif any(word if word in video_dict['video_title'] else False for word in
-                 filter_word(log_type, crawler, "公众号", env)) is True:
+        elif any(word if word in video_dict['video_title']
+                 else False for word in get_config_from_mysql(log_type=log_type,
+                                                              source=crawler,
+                                                              env=env,
+                                                              text="filter",
+                                                              action="")) is True:
             Common.logger(log_type, crawler).info("标题已中过滤词\n")
         # 已下载判断
         elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:

+ 7 - 3
gongzhonghao/gongzhonghao_follow/gongzhonghao_follow_3.py

@@ -21,7 +21,7 @@ from selenium import webdriver
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
-from common.public import filter_word
+from common.public import get_config_from_mysql
 from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 
@@ -373,8 +373,12 @@ class GongzhonghaoFollow3:
         if video_dict['article_url'] == 0 or video_dict['video_url'] == 0:
             Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
         # 标题敏感词过滤
-        elif any(word if word in video_dict['video_title'] else False for word in
-                 filter_word(log_type, crawler, "公众号", env)) is True:
+        elif any(word if word in video_dict['video_title']
+                 else False for word in get_config_from_mysql(log_type=log_type,
+                                                              source=crawler,
+                                                              env=env,
+                                                              text="filter",
+                                                              action="")) is True:
             Common.logger(log_type, crawler).info("标题已中过滤词\n")
         # 已下载判断
         elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:

+ 28 - 28
main/process.sh

@@ -68,35 +68,35 @@ else
   echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕定向爬虫策略 进程状态正常" >> ${log_path}
 fi
 
-## 小年糕小时榜爬虫策略
-#echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 小年糕小时榜爬虫策略 进程状态" >> ${log_path}
-#ps -ef | grep "run_xiaoniangao_hour.py" | grep -v "grep"
-#if [ "$?" -eq 1 ];then
-#  echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
-#  if [ ${env} = "dev" ];then
-#    cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-hour.log
-#  else
-#    cd ${piaoquan_crawler_dir} && /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="prod" xiaoniangao/logs/nohup-hour.log
-#  fi
-#  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-#else
-#  echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕小时榜爬虫策略 进程状态正常" >> ${log_path}
-#fi
+# 小年糕小时榜爬虫策略
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 小年糕小时榜爬虫策略 进程状态" >> ${log_path}
+ps -ef | grep "run_xiaoniangao_hour.py" | grep -v "grep"
+if [ "$?" -eq 1 ];then
+  echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
+  if [ ${env} = "dev" ];then
+    cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-hour.log
+  else
+    cd ${piaoquan_crawler_dir} && /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_hour.py --log_type="hour" --crawler="xiaoniangao" --env="prod" xiaoniangao/logs/nohup-hour.log
+  fi
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+else
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕小时榜爬虫策略 进程状态正常" >> ${log_path}
+fi
 
-## 小年糕播放量榜爬虫策略
-#echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 播放量榜爬虫策略 进程状态" >> ${log_path}
-#ps -ef | grep "run_xiaoniangao_play.py" | grep -v "grep"
-#if [ "$?" -eq 1 ];then
-#  echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
-#  if [ ${env} = "dev" ];then
-#    cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-play.log
-#  else
-#    cd ${piaoquan_crawler_dir} && /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="prod" xiaoniangao/logs/nohup-play.log
-#  fi
-#  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-#else
-#  echo "$(date "+%Y-%m-%d %H:%M:%S") 播放量榜爬虫策略 进程状态正常" >> ${log_path}
-#fi
+# 小年糕播放量榜爬虫策略
+echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 播放量榜爬虫策略 进程状态" >> ${log_path}
+ps -ef | grep "run_xiaoniangao_play.py" | grep -v "grep"
+if [ "$?" -eq 1 ];then
+  echo "$(date "+%Y-%m-%d_%H:%M:%S") 异常停止,正在重启!" >> ${log_path}
+  if [ ${env} = "dev" ];then
+    cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="dev" xiaoniangao/logs/nohup-play.log
+  else
+    cd ${piaoquan_crawler_dir} && /usr/bin/sh /data5/piaoquan_crawler/main/scheduling_main.sh ./xiaoniangao/xiaoniangao_main/run_xiaoniangao_play.py --log_type="play" --crawler="xiaoniangao" --env="prod" xiaoniangao/logs/nohup-play.log
+  fi
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+else
+  echo "$(date "+%Y-%m-%d %H:%M:%S") 播放量榜爬虫策略 进程状态正常" >> ${log_path}
+fi
 
 
 # 快手定向爬虫策略

+ 0 - 105
scheduling/scheduling_v3/demo.py

@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/4/19
-import time
-from datetime import date, timedelta
-
-# import ast
-# task_str = "[('task_id','11')," \
-#            "('task_name','小年糕小时榜')," \
-#            "('source','xiaoniangao')," \
-#            "('start_time','1681834560000')," \
-#            "('interval','1'),('mode','hour')," \
-#            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
-#            "('spider_name','')," \
-#            "('machine','')," \
-#            "('status','0')," \
-#            "('create_time','1681889875288')," \
-#            "('update_time','1681889904908')," \
-#            "('operator','王坤')]"
-# task_str = task_str.replace("'[{", '[{').replace("}}]'", '}}]')
-# print(task_str)
-# task_list = eval(task_str)
-# print(task_list)
-# print(type(task_list))
-# task_dict = dict(task_list)
-# print(task_dict)
-# print(type(task_dict))
-#
-#
-# rule = task_dict['rule']
-# print(type(rule))
-# print(rule)
-# print(task_dict)
-# task_dict['rule'] = dict()
-# for item in rule:
-#     for k, val in item.items():
-#         task_dict['rule'][k] = val
-# print('\n')
-# print(task_dict['rule'])
-# print('\n')
-# print(task_dict)
-
-# str1 = task_str.split(",('rule',")[0]+"]"
-# print(type(str1))
-# print(str1)
-# eval1 = eval(str1)
-# print(type(eval1), eval1)
-# dict1 = dict(eval1)
-# print(type(dict1), dict1)
-# print("============\n")
-#
-# "[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]"
-# print(task_str.split(",('rule',")[-1].split(",('spider_name'")[0].replace(")", ""))
-# # rule_str = task_str.split(",('rule',")[-1].split(",('spider_name'")[0].replace(")", "").replace("'[{", '[{').replace("}}]'", '}}]')
-# rule_str = task_str.split(",('rule',")[-1].split(",('spider_name'")[0].replace(")", "")[1:-1]
-# print(type(rule_str))
-# print(rule_str)
-#
-#
-# rule_list = eval(rule_str)
-# print(type(rule_list))
-# print(rule_list)
-
-# rule_dict = {'duration': {'min': 40, 'max': 0}, 'playCnt': {'min': 4000, 'max': 0}, 'period': {'min': 10, 'max': 0}, 'fans': {'min': 0, 'max': 0}, 'videos': {'min': 0, 'max': 0}, 'like': {'min': 0, 'max': 0}, 'videoWidth': {'min': 0, 'max': 0}, 'videoHeight': {'min': 0, 'max': 0}}
-# rule_dict = {}
-#
-# for k, v in rule_dict.items():
-#     print(f"{k}:{v}")
-#
-# rule_duration_min = rule_dict.get('duration', {}).get('min', 0)
-# rule_duration_max = rule_dict.get('duration', {}).get('max', 100000000)
-# rule_playCnt_min = rule_dict.get('playCnt', {}).get('min', 0)
-# rule_playCnt_max = rule_dict.get('playCnt', {}).get('max', 100000000)
-# rule_period_min = rule_dict.get('period', {}).get('min', 0)
-# rule_period_max = rule_dict.get('period', {}).get('max', 100000000)
-# rule_fans_min = rule_dict.get('fans', {}).get('min', 0)
-# rule_fans_max = rule_dict.get('fans', {}).get('max', 100000000)
-# rule_videos_min = rule_dict.get('videos', {}).get('min', 0)
-# rule_videos_max = rule_dict.get('videos', {}).get('max', 100000000)
-# rule_like_min = rule_dict.get('like', {}).get('min', 0)
-# rule_like_max = rule_dict.get('like', {}).get('max', 100000000)
-# rule_videoWidth_min = rule_dict.get('videoWidth', {}).get('min', 0)
-# rule_videoWidth_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
-# rule_videoHeight_min = rule_dict.get('videoWidth', {}).get('min', 0)
-# rule_videoHeight_max = rule_dict.get('videoWidth', {}).get('max', 100000000)
-#
-# print(f"rule_duration_min:{rule_duration_min}")
-# print(f"rule_duration_max:{rule_duration_max}")
-# print(f"rule_playCnt_min:{rule_playCnt_min}")
-# print(f"rule_playCnt_max:{rule_playCnt_max}")
-# print(f"rule_period_min:{rule_period_min}")
-# print(f"rule_period_max:{rule_period_max}")
-# print(f"rule_fans_min:{rule_fans_min}")
-# print(f"rule_fans_max:{rule_fans_max}")
-# print(f"rule_videos_min:{rule_videos_min}")
-# print(f"rule_videos_max:{rule_videos_max}")
-# print(f"rule_videoWidth_min:{rule_videoWidth_min}")
-# print(f"rule_videoWidth_max:{rule_videoWidth_max}")
-# print(f"rule_videoHeight_min:{rule_videoHeight_min}")
-# print(f"rule_videoHeight_max:{rule_videoHeight_max}")
-
-time_str = (date.today() + timedelta(days=-10)).strftime("%Y-%m-%d %H:%M:%S")
-time_stamp = int(time.mktime(time.strptime(time_str, "%Y-%m-%d %H:%M:%S")))
-print(time_str)
-print(time_stamp)

BIN
suisuiniannianyingfuqi/.DS_Store


+ 14 - 13
suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/demo.py

@@ -31,19 +31,20 @@ class Demo:
             "publish_time_stamp": 1683648000,  # 2023-05-10 00:00:00
             "video_url": "www.baidu.com"
         }
-        rule_dict = {"play_cnt": {"min": 0, "max": 0},
-                     "fans_cnt": {"min": 0, "max": 0},
-                     "videos_cnt": {"min": 0, "max": 0},
-                     "like_cnt": {"min": 0, "max": 0},
-                     "video_width": {"min": 0, "max": 0},
-                     "video_height": {"min": 0, "max": 0},
-                     "duration": {"min": 0, "max": 0},
-                     "share_cnt": {"min": 0, "max": 0},
-                     "comment_cnt": {"min": 0, "max": 0},
-                     "favorite_cnt": {"min": 0, "max": 0},
-                     # "period": {"min": 10, "max": 0},
-                     "publish_time": {"min": 1673734400000, "max": 0}
-                     }
+        rule_dict = {
+             # "play_cnt": {"min": 0, "max": 0},
+             # "fans_cnt": {"min": 0, "max": 0},
+             # "videos_cnt": {"min": 0, "max": 0},
+             # "like_cnt": {"min": 0, "max": 0},
+             # "video_width": {"min": 0, "max": 0},
+             # "video_height": {"min": 0, "max": 0},
+             # "duration": {"min": 0, "max": 0},
+             # "share_cnt": {"min": 0, "max": 0},
+             # "comment_cnt": {"min": 0, "max": 0},
+             # "favorite_cnt": {"min": 0, "max": 0},
+             # "period": {"min": 10, "max": 0},
+             # "publish_time": {"min": 1673734400000, "max": 0}
+        }
 
         # 格式化 video_dict:publish_time_stamp
         if "publish_time_stamp" in video_dict.keys():

+ 1 - 1
suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_suisuiniannianyingfuqi_recommend_scheduling.py

@@ -32,7 +32,7 @@ def main(log_type, crawler, task, env):
                                                             rule_dict=rule_dict,
                                                             env=env)
     Common.del_logs(log_type, crawler)
-    Common.logger(log_type, crawler).info('抓取完一轮,休眠 1 分钟\n')
+    Common.logger(log_type, crawler).info('抓取完一轮\n')
 
 
 if __name__ == "__main__":

+ 16 - 24
suisuiniannianyingfuqi/suisuiniannianyingfuqi_recommend/suisuiniannianyingfuqi_recommend_scheduling.py

@@ -13,6 +13,7 @@ sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
+from common.public import download_rule
 from common.scheduling_db import MysqlHelper
 
 
@@ -68,39 +69,30 @@ class SuisuiniannianyingfuqiRecommendScheduling:
                     feeds = response.json()['data']['video_list']['data']
                     for i in range(len(feeds)):
                         try:
-                            video_title = feeds[i].get('title', "").replace("'", "").replace('"', '')
-                            video_id = str(feeds[i].get('id', ''))
-                            play_cnt = feeds[i].get('browse', 0)
-                            comment_cnt = 0
-                            like_cnt = 0
-                            share_cnt = 0
                             publish_time_str = feeds[i].get('createtime', '')
                             publish_time_stamp = int(time.mktime(time.strptime(publish_time_str, "%Y-%m-%d")))
-                            user_name = "岁岁年年迎福气"
-                            user_id = "suisuiniannianyingfuqi"
-                            cover_url = feeds[i].get('thumb', '')
-                            video_url = feeds[i].get('url', '')
-
-                            video_dict = {'video_title': video_title,
-                                          'video_id': video_id,
-                                          'play_cnt': play_cnt,
-                                          'comment_cnt': comment_cnt,
-                                          'like_cnt': like_cnt,
-                                          'share_cnt': share_cnt,
+                            video_dict = {'video_title': feeds[i].get('title', "").replace("'", "").replace('"', ''),
+                                          'video_id': str(feeds[i].get('id', '')),
+                                          'play_cnt': feeds[i].get('browse', 0),
+                                          'comment_cnt': 0,
+                                          'like_cnt': 0,
+                                          'share_cnt': 0,
                                           'publish_time_stamp': publish_time_stamp,
                                           'publish_time_str': publish_time_str,
-                                          'user_name': user_name,
-                                          'user_id': user_id,
-                                          'avatar_url': cover_url,
-                                          'cover_url': cover_url,
-                                          'video_url': video_url,
+                                          'user_name': "岁岁年年迎福气",
+                                          'user_id': "suisuiniannianyingfuqi",
+                                          'avatar_url': feeds[i].get('thumb', ''),
+                                          'cover_url': feeds[i].get('thumb', ''),
+                                          'video_url': feeds[i].get('url', ''),
                                           'session': f"suisuiniannianyingfuqi-{int(time.time())}"}
                             for k, v in video_dict.items():
                                 Common.logger(log_type, crawler).info(f"{k}:{v}")
 
-                            if video_id == '' or video_title == '' or cover_url == '' or video_url == '':
+                            if video_dict["video_id"] == '' or video_dict["video_title"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
                                 Common.logger(log_type, crawler).info('无效视频\n')
-                            elif cls.repeat_video(log_type, crawler, video_id, env) != 0:
+                            elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                                Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                            elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                                 Common.logger(log_type, crawler).info('视频已下载\n')
                             else:
                                 cls.download_publish(log_type=log_type,

+ 6 - 2
xiaoniangao/xiaoniangao_follow/xiaoniangao_follow.py

@@ -14,7 +14,7 @@ from common.common import Common
 from common.scheduling_db import MysqlHelper
 from common.publish import Publish
 from common.feishu import Feishu
-from common.public import filter_word
+from common.public import get_config_from_mysql
 proxies = {"http": None, "https": None}
 
 
@@ -309,7 +309,11 @@ class XiaoniangaoFollow:
         elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
             Common.logger(log_type, crawler).info('视频已下载\n')
         elif any(str(word) if str(word) in video_dict['video_title'] else False for word in
-                 filter_word(log_type, crawler, "小年糕", env)) is True:
+                 get_config_from_mysql(log_type=log_type,
+                                       source=crawler,
+                                       env=env,
+                                       text="filter",
+                                       action="")) is True:
             Common.logger(log_type, crawler).info("视频已中过滤词\n")
         else:
             # 下载封面

+ 7 - 4
xiaoniangao/xiaoniangao_hour/xiaoniangao_hour.py

@@ -16,7 +16,7 @@ from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
 from common.scheduling_db import MysqlHelper
-from common.public import filter_word
+from common.public import get_config_from_mysql
 
 proxies = {"http": None, "https": None}
 
@@ -329,9 +329,12 @@ class XiaoniangaoHour:
                     Common.logger(log_type, crawler).info("不满足基础门槛规则\n")
                 elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
                     Common.logger(log_type, crawler).info('视频已下载\n')
-                # 过滤敏感词
-                elif any(str(word) if str(word) in video_title else False for word in
-                         filter_word(log_type, crawler, "小年糕", env)) is True:
+                elif any(str(word) if str(word) in video_dict['video_title'] else False for word in
+                         get_config_from_mysql(log_type=log_type,
+                                               source=crawler,
+                                               env=env,
+                                               text="filter",
+                                               action="")) is True:
                     Common.logger(log_type, crawler).info("视频已中过滤词\n")
                     time.sleep(1)
                 else:

+ 8 - 6
xiaoniangao/xiaoniangao_play/xiaoniangao_play.py

@@ -9,14 +9,12 @@ import sys
 import time
 import requests
 import urllib3
-
-from common.public import filter_word
-from common.scheduling_db import MysqlHelper
-
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
+from common.public import get_config_from_mysql
+from common.scheduling_db import MysqlHelper
 proxies = {"http": None, "https": None}
 
 
@@ -326,8 +324,12 @@ class XiaoniangaoPlay:
         # 去重
         elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
             Common.logger(log_type, crawler).info("视频已下载\n")
-        # 过滤词库
-        elif any(str(word) if str(word) in video_dict['video_title'] else False for word in filter_word(log_type, crawler, "小年糕", env)) is True:
+        elif any(str(word) if str(word) in video_dict['video_title'] else False for word in
+                 get_config_from_mysql(log_type=log_type,
+                                       source=crawler,
+                                       env=env,
+                                       text="filter",
+                                       action="")) is True:
             Common.logger(log_type, crawler).info("视频已中过滤词\n")
         else:
             # 下载封面