Browse Source

西瓜测试代码提交,
优乐搞笑小视频——修改 bug(返回结构发生改变)

罗俊辉 1 year ago
parent
commit
cea691a0ca

+ 111 - 7
xigua/xigua_author/xigua_author_test.py

@@ -1,4 +1,5 @@
 import json
+import re
 import os
 import random
 import sys
@@ -8,13 +9,116 @@ import uuid
 import base64
 import requests
 from fake_useragent import FakeUserAgent
-from common.userAgent import get_random_user_agent
-
-from common.mq import MQ
 
 sys.path.append(os.getcwd())
 
-from common.pipeline import PiaoQuanPipelineTest
+
+class PiaoQuanPipelineTest:
+    """Local test copy of the crawler pipeline: rule-based filtering of one video item.
+
+    Every check prints its rejection reason and returns False when the item
+    fails, or returns True when the item passes.
+
+    Args:
+        platform: Platform identifier; "gongzhonghao" gets a special
+            publish-time rule (see ``publish_time_flag``).
+        mode: Crawl mode; stored but not read by the checks in this class.
+        rule_dict: Mapping of rule name -> {"min": ..., "max": ...}.
+        env: Environment name; stored but not read by the checks in this class.
+        item: Mapping describing the video (e.g. "publish_time_stamp",
+            "video_title", and any counters referenced by ``rule_dict``).
+        trace_id: Trace identifier; stored but not read by the checks in this class.
+    """
+
+    # Used as the upper bound when a rule's "max" is not a positive number.
+    _NO_UPPER_LIMIT = 999999999999999
+
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+
+    def _max_period_days(self):
+        """Return the maximum allowed age in days (rule "period"/"max", default 1000)."""
+        return int(self.rule_dict.get("period", {}).get("max", 1000))
+
+    def publish_time_flag(self):
+        """Rule filter: reject items published longer ago than the period limit.
+
+        For platform "gongzhonghao" the item is rejected only when both its
+        publish time and its update time exceed the limit; for every other
+        platform only the publish time is considered.
+
+        Returns:
+            bool: False when the item is too old, True otherwise.
+
+        Raises:
+            KeyError: if "publish_time_stamp" is missing from the item, or
+                "update_time_stamp" is missing when it is needed.
+        """
+        max_days = self._max_period_days()
+        max_age_seconds = 3600 * 24 * max_days
+        now = int(time.time())
+        expired = now - self.item["publish_time_stamp"] > max_age_seconds
+        if expired and self.platform == "gongzhonghao":
+            # The update time is only relevant (and only read) for this
+            # platform, so other platforms need not provide it.
+            expired = now - self.item["update_time_stamp"] > max_age_seconds
+        if expired:
+            message = "发布时间超过{}天".format(max_days)
+            print(message)
+            return False
+        return True
+
+    def title_flag(self):
+        """Check the video title against the sensitive-word list.
+
+        The sensitive-word list is empty in this test copy, so any string
+        title passes.
+
+        Returns:
+            bool: False when the cleaned title contains a sensitive word.
+        """
+        title = self.item["video_title"]
+        # Replace every non-word character with a space before matching.
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            message = "标题中包含敏感词"
+            print(message)
+            return False
+        return True
+
+    def download_rule_flag(self):
+        """Basic download rules: every item field that has a rule must lie in [min, max].
+
+        A rule whose "max" is not positive is treated as having no upper limit.
+        The crawl-period rule is not a per-item range and is skipped.
+
+        Returns:
+            bool: False at the first field outside its range, True otherwise.
+        """
+        for key in self.item:
+            rule = self.rule_dict.get(key)
+            if not rule:
+                continue
+            # "period" is the crawl period in days, not a per-item range.
+            # The misspelling "peroid" used by the original check is still
+            # skipped for backward compatibility.
+            if key in ("period", "peroid"):
+                continue
+            upper = int(rule["max"])
+            max_value = upper if upper > 0 else self._NO_UPPER_LIMIT
+            flag = int(rule["min"]) <= int(self.item[key]) <= max_value
+            if not flag:
+                message = "{}: {} <= {} <= {}, {}".format(
+                    key,
+                    rule["min"],
+                    self.item[key],
+                    max_value,
+                    flag,
+                )
+                print(message)
+                return False
+        return True
+
+    # NOTE: per-platform de-duplication (a lookup of out_video_id in the
+    # crawler_video table through MysqlHelper) is disabled in this test copy.
+
+    def process_item(self):
+        """Run all enabled checks in order; return True only if every one passes."""
+        if not self.publish_time_flag():
+            return False
+        if not self.title_flag():
+            return False
+        if not self.download_rule_flag():
+            return False
+        return True
 
 
 def tunnel_proxies():
@@ -611,7 +715,7 @@ def get_comment_cnt(item_id):
         "aid": "1768",
         "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
         "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
-        "_signature":get_random_user_agent('pc'),
+        "_signature": FakeUserAgent().random,
     }
     headers = {
         "authority": "www.ixigua.com",
@@ -740,13 +844,13 @@ class XiGuaAuthor:
         self.rule_dict = rule_dict
         self.env = env
         self.user_list = user_list
-        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
+        # self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         self.download_count = 0
 
     def get_author_list(self):
         # 每轮只抓取定量的数据,到达数量后自己退出
         max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
-        for user_dict in self.user_list[2: 3]:
+        for user_dict in self.user_list:
             self.get_video_list(user_dict)
             if self.download_count <= max_count:
                 self.get_video_list(user_dict)

+ 1 - 0
youlegaoxiaoxiaoshipin/youlegaoxiaoxiaoshipin_recommend/youlegaoxiaoxiaoshipin_scheduling.py

@@ -180,6 +180,7 @@ class YLGXXSPScheduling:
             video_dict["publish_time"] = video_dict["publish_time_str"]
             video_dict["video_url"] = video_obj["data"]["url"]
             video_dict["avatar_url"] = "http:" + video_obj["data"]["avatar"]
+            video_dict["cover_url"] = "http:" + video_obj["data"]["thumbUrl"]
             self.download_count += 1
             self.mq.send_msg(video_dict)
             # print(video_dict)