@@ -1,4 +1,5 @@
 import json
+import re
 import os
 import random
 import sys
@@ -8,13 +9,116 @@ import uuid
 import base64
 import requests
 from fake_useragent import FakeUserAgent
-from common.userAgent import get_random_user_agent
-
-from common.mq import MQ
 
 sys.path.append(os.getcwd())
 
-from common.pipeline import PiaoQuanPipelineTest
+
+class PiaoQuanPipelineTest:
+    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
+        self.platform = platform
+        self.mode = mode
+        self.item = item
+        self.rule_dict = rule_dict
+        self.env = env
+        self.trace_id = trace_id
+
+    # Publish-time limit for the video; this is a rule-based filter
+    def publish_time_flag(self):
+        # Check the publish time
+        publish_time_stamp = self.item["publish_time_stamp"]
+        update_time_stamp = self.item["update_time_stamp"]
+        if self.platform == "gongzhonghao":
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ) and (
+                int(time.time()) - update_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "Publish time exceeds {} days".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        else:
+            if (
+                int(time.time()) - publish_time_stamp
+                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                message = "Publish time exceeds {} days".format(
+                    int(self.rule_dict.get("period", {}).get("max", 1000))
+                )
+                print(message)
+                return False
+        return True
+
+    # Whether the video title meets the requirements
+    def title_flag(self):
+        title = self.item["video_title"]
+        cleaned_title = re.sub(r"[^\w]", " ", title)
+        # Sensitive words
+        # Fetch the sensitive-word list
+        sensitive_words = []
+        if any(word in cleaned_title for word in sensitive_words):
+            message = "The title contains a sensitive word"
+            print(message)
+            return False
+        return True
+
+    # Basic download rules for the video
+    def download_rule_flag(self):
+        for key in self.item:
+            if self.rule_dict.get(key):
+                max_value = (
+                    int(self.rule_dict[key]["max"])
+                    if int(self.rule_dict[key]["max"]) > 0
+                    else 999999999999999
+                )
+                if key == "period":  # "period" is the crawl window in days
+                    continue
+                else:
+                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
+                    if not flag:
+                        message = "{}: {} <= {} <= {}, {}".format(
+                            key,
+                            self.rule_dict[key]["min"],
+                            self.item[key],
+                            max_value,
+                            flag,
+                        )
+                        print(message)
+                        return flag
+            else:
+                continue
+        return True
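+
+    # Illustrative note on the rule shape assumed above (the "duration" key is
+    # a hypothetical example, not taken from this change): an entry such as
+    #     rule_dict = {"duration": {"min": 30, "max": 0}}
+    # requires item["duration"] >= 30, while a "max" of 0 or less is treated as
+    # unbounded via the 999999999999999 fallback; "period" is skipped here since
+    # the publish-time window is enforced separately in publish_time_flag().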
+
+    # Deduplicate by a specific platform
+    # def repeat_video(self):
+    #     # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+    #     out_id = self.item["out_video_id"]
+    #     sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
+    #     repeat_video = MysqlHelper.get_values(
+    #         log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
+    #     )
+    #     if repeat_video:
+    #         message = "Duplicate video"
+    #         return False
+    #     return True
+
+    def process_item(self):
+        if not self.publish_time_flag():
+            # Log the relevant details
+            return False
+        if not self.title_flag():
+            # Log the relevant details
+            return False
+        # if not self.repeat_video():
+        #     # Log the relevant details
+        #     return False
+        if not self.download_rule_flag():
+            # Log the relevant details
+            return False
+        return True
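+
+    # Minimal usage sketch (illustrative values only; the platform, mode and
+    # item fields below are assumptions rather than part of this change):
+    #
+    #     pipeline = PiaoQuanPipelineTest(
+    #         platform="xigua",
+    #         mode="author",
+    #         rule_dict={"period": {"min": 0, "max": 30}},
+    #         env="dev",
+    #         item={
+    #             "publish_time_stamp": 1700000000,
+    #             "update_time_stamp": 1700000000,
+    #             "video_title": "demo",
+    #         },
+    #         trace_id="xigua-author-demo",
+    #     )
+    #     if pipeline.process_item():
+    #         pass  # item passed all filters and may be downloaded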
 
 
 def tunnel_proxies():
@@ -611,7 +715,7 @@ def get_comment_cnt(item_id):
         "aid": "1768",
         "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
         "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
-        "_signature":get_random_user_agent('pc'),
+        "_signature": FakeUserAgent().random,
     }
     headers = {
         "authority": "www.ixigua.com",
@@ -740,13 +844,13 @@ class XiGuaAuthor:
         self.rule_dict = rule_dict
         self.env = env
         self.user_list = user_list
-        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
+        # self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         self.download_count = 0
 
     def get_author_list(self):
         # Each round only crawls a fixed amount of data and exits once the quota is reached
         max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
-        for user_dict in self.user_list[2: 3]:
+        for user_dict in self.user_list:
             self.get_video_list(user_dict)
             if self.download_count <= max_count:
                 self.get_video_list(user_dict)
|