@@ -1,4 +1,5 @@
 import json
+import re
 import os
 import random
 import sys
@@ -8,13 +9,116 @@ import uuid
 import base64
 import requests
 from fake_useragent import FakeUserAgent
-from common.userAgent import get_random_user_agent
-
-from common.mq import MQ
 
 sys.path.append(os.getcwd())
 
-from common.pipeline import PiaoQuanPipelineTest
+
+class PiaoQuanPipelineTest:
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+
+    # Publish-time limit for the video; this is a rule-based filter
+    def publish_time_flag(self):
+        # Check the publish time
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        if self.platform == "gongzhonghao":
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ) and (
+                int(time.time()) - update_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "Publish time exceeds {} days".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        else:
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "Publish time exceeds {} days".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        return True
+
+    # Whether the video title meets the requirements
+    def title_flag(self):
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        # Sensitive words
+        # Fetch the sensitive-word list
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            message = "The title contains a sensitive word"
+            print(message)
+            return False
+        return True
+
+    # Basic download rules for the video
+    def download_rule_flag(self):
+        for key in self.item:
+            if self.rule_dict.get(key):
+                max_value = (
+                    int(self.rule_dict[key]["max"])
+                    if int(self.rule_dict[key]["max"]) > 0
+                    else 999999999999999
+                )
+                if key == "period":  # "period" is the crawl window in days
+                    continue
+                else:
+                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
+                    if not flag:
+                        message = "{}: {} <= {} <= {}, {}".format(
+                            key,
+                            self.rule_dict[key]["min"],
+                            self.item[key],
+                            max_value,
+                            flag,
+                        )
+                        print(message)
+                        return flag
+            else:
+                continue
+        return True
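+
+    # Illustrative note on the rule shape assumed above (the "duration" key is
+    # a hypothetical example, not taken from this change): an entry such as
+    #     rule_dict = {"duration": {"min": 30, "max": 0}}
+    # requires item["duration"] >= 30, while a "max" of 0 or less is treated as
+    # unbounded via the 999999999999999 fallback; "period" is skipped here since
+    # the publish-time window is enforced separately in publish_time_flag().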
+
+    # Deduplicate by a specific platform
+    # def repeat_video(self):
+    #     # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+    #     out_id = self.item["out_video_id"]
+    #     sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
+    #     repeat_video = MysqlHelper.get_values(
+    #         log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
+    #     )
+    #     if repeat_video:
+    #         message = "Duplicate video"
+    #         return False
+    #     return True
+
+    def process_item(self):
+        if not self.publish_time_flag():
+            # Log the relevant details
+            return False
+        if not self.title_flag():
+            # Log the relevant details
+            return False
+        # if not self.repeat_video():
+        #     # Log the relevant details
+        #     return False
+        if not self.download_rule_flag():
+            # Log the relevant details
+            return False
+        return True
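+
+    # Minimal usage sketch (illustrative values only; the platform, mode and
+    # item fields below are assumptions rather than part of this change):
+    #
+    #     pipeline = PiaoQuanPipelineTest(
+    #         platform="xigua",
+    #         mode="author",
+    #         rule_dict={"period": {"min": 0, "max": 30}},
+    #         env="dev",
+    #         item={
+    #             "publish_time_stamp": 1700000000,
+    #             "update_time_stamp": 1700000000,
+    #             "video_title": "demo",
+    #         },
+    #         trace_id="xigua-author-demo",
+    #     )
+    #     if pipeline.process_item():
+    #         pass  # item passed all filters and may be downloaded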
 
 
 def tunnel_proxies():
@@ -611,7 +715,7 @@ def get_comment_cnt(item_id):
         "aid": "1768",
         "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
         "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
-        "_signature":get_random_user_agent('pc'),
+        "_signature": FakeUserAgent().random,
     }
     headers = {
         "authority": "www.ixigua.com",
@@ -740,13 +844,13 @@ class XiGuaAuthor:
         self.rule_dict = rule_dict
         self.env = env
         self.user_list = user_list
-        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
+        # self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         self.download_count = 0
 
     def get_author_list(self):
         # Each round only crawls a fixed amount of data and exits once the quota is reached
         max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
-        for user_dict in self.user_list[2: 3]:
+        for user_dict in self.user_list:
             self.get_video_list(user_dict)
             if self.download_count <= max_count:
                 self.get_video_list(user_dict)
|