|
@@ -18,6 +18,7 @@ from selenium.webdriver.chrome.service import Service
|
|
|
from selenium import webdriver
|
|
|
from selenium.webdriver.common.by import By
|
|
|
sys.path.append(os.getcwd())
|
|
|
+from common.mq import MQ
|
|
|
from common.scheduling_db import MysqlHelper
|
|
|
from common.common import Common
|
|
|
from common.feishu import Feishu
|
|
@@ -548,6 +549,7 @@ class XiguasearchScheduling:
|
|
|
|
|
|
@classmethod
|
|
|
def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
|
|
|
+ mq = MQ(topic_name="topic_crawler_etl_" + env)
|
|
|
# 打印请求配置
|
|
|
ca = DesiredCapabilities.CHROME
|
|
|
ca["goog:loggingPrefs"] = {"performance": "ALL"}
|
|
@@ -658,20 +660,32 @@ class XiguasearchScheduling:
|
|
|
Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
Common.logging(log_type, crawler, env, '视频已下载\n')
|
|
|
else:
|
|
|
- title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
|
|
|
- if title_score <= 0.3:
|
|
|
- Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
|
|
|
- Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
|
|
|
- continue
|
|
|
- Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
|
|
|
- Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
|
|
|
- cls.download_publish(log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- user_dict=user_dict,
|
|
|
- video_dict=video_dict,
|
|
|
- rule_dict=rule_dict,
|
|
|
- title_score=title_score,
|
|
|
- env=env)
|
|
|
+ # title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
|
|
|
+ # if title_score <= 0.3:
|
|
|
+ # Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
|
|
|
+ # Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
|
|
|
+ # continue
|
|
|
+ # Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
|
|
|
+ # Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
|
|
|
+ # cls.download_publish(log_type=log_type,
|
|
|
+ # crawler=crawler,
|
|
|
+ # user_dict=user_dict,
|
|
|
+ # video_dict=video_dict,
|
|
|
+ # rule_dict=rule_dict,
|
|
|
+ # title_score=title_score,
|
|
|
+ # env=env)
|
|
|
+ video_dict["out_user_id"] = video_dict["user_id"]
|
|
|
+ video_dict["platform"] = crawler
|
|
|
+ video_dict["strategy"] = log_type
|
|
|
+ video_dict["out_video_id"] = video_dict["video_id"]
|
|
|
+ video_dict["width"] = video_dict["video_width"]
|
|
|
+ video_dict["height"] = video_dict["video_height"]
|
|
|
+ video_dict["crawler_rule"] = json.dumps(rule_dict)
|
|
|
+ video_dict["user_id"] = user_dict["uid"]
|
|
|
+ video_dict["publish_time"] = video_dict["publish_time_str"]
|
|
|
+ video_dict["strategy_type"] = log_type
|
|
|
+ mq.send_msg(video_dict)
|
|
|
+
|
|
|
except Exception as e:
|
|
|
Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
|
|
|
Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
|
|
@@ -683,7 +697,8 @@ class XiguasearchScheduling:
|
|
|
|
|
|
@classmethod
|
|
|
def repeat_video(cls, log_type, crawler, video_id, env):
    """Return how many stored records match the given video id.

    The lookup selects from ``crawler_video`` where ``out_video_id`` equals
    ``video_id`` and ``platform`` is either the ``crawler`` argument or the
    class-level ``cls.platform`` value, so a record saved under either
    platform label is treated as a repeat.

    :param log_type: forwarded unchanged to ``MysqlHelper.get_values``.
    :param crawler: crawler name; used as one of the two platform values
        and forwarded to ``MysqlHelper.get_values``.
    :param video_id: value compared against ``out_video_id``.
    :param env: forwarded unchanged to ``MysqlHelper.get_values``.
    :return: ``len()`` of whatever ``MysqlHelper.get_values`` returns
        (presumably the list of matching rows -- confirm against the helper).
    """
    # NOTE(review): the statement is built by string interpolation; if
    # video_id can contain quotes this is injectable -- confirm whether
    # MysqlHelper offers a parameterized variant.
    sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
    matched_rows = MysqlHelper.get_values(log_type, crawler, sql, env, action="")
    return len(matched_rows)
|
|
|
|