浏览代码

xigua search etl

ehlxr 1 年之前
父节点
当前提交
2d6fc5a093
共有 1 个文件被更改,包括 27 次插入和 14 次删除
  1. 27 14
      xigua/xigua_search/xigua_search_scheduling.py

+ 27 - 14
xigua/xigua_search/xigua_search_scheduling.py

@@ -17,6 +17,7 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium import webdriver
 from selenium.webdriver.common.by import By
+from common.mq import MQ
 sys.path.append(os.getcwd())
 from common.scheduling_db import MysqlHelper
 from common.common import Common
@@ -548,6 +549,7 @@ class XiguasearchScheduling:
 
     @classmethod
     def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         # 打印请求配置
         ca = DesiredCapabilities.CHROME
         ca["goog:loggingPrefs"] = {"performance": "ALL"}
@@ -645,20 +647,31 @@ class XiguasearchScheduling:
                         Common.logger(log_type, crawler).info('视频已下载\n')
                         Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
-                        title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
-                        if title_score <= 0.3:
-                            Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
-                            Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
-                            continue
-                        Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
-                        Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
-                        cls.download_publish(log_type=log_type,
-                                             crawler=crawler,
-                                             user_dict=user_dict,
-                                             video_dict=video_dict,
-                                             rule_dict=rule_dict,
-                                             title_score=title_score,
-                                             env=env)
+                        # title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
+                        # if title_score <= 0.3:
+                        #     Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
+                        #     Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
+                        #     continue
+                        # Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
+                        # Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
+                        # cls.download_publish(log_type=log_type,
+                        #                      crawler=crawler,
+                        #                      user_dict=user_dict,
+                        #                      video_dict=video_dict,
+                        #                      rule_dict=rule_dict,
+                        #                      title_score=title_score,
+                        #                      env=env)
+                        video_dict["out_user_id"] = video_dict["user_id"]
+                        video_dict["platform"] = crawler
+                        video_dict["strategy"] = log_type
+                        video_dict["out_video_id"] = video_dict["video_id"]
+                        video_dict["width"] = video_dict["video_width"]
+                        video_dict["height"] = video_dict["video_height"]
+                        video_dict["crawler_rule"] = json.dumps(rule_dict)
+                        video_dict["user_id"] = user_dict["uid"]
+                        video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                        mq.send_msg(video_dict)
                 except Exception as e:
                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")