1 tahun lalu · 737eb334fd
--- a/xigua/xigua_main/run_xg_author_dev.py
+++ b/xigua/xigua_main/run_xg_author_dev.py
@@ -0,0 +1,32 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# @Author: wangkun
			
 
				+# @Time: 2023/6/28
			
 
				+import os
			
 
				+import sys
			
 
				+sys.path.append(os.getcwd())
			
 
				+from common.common import Common
			
 
				+from xigua.xigua_author.xigua_author_scheduling import XiguaauthorScheduling
			
 
				+
			
 
				+
			
 
				+def xigua_author_main(log_type, crawler, env):
				+    """Run one crawl round over the hard-coded Xigua author accounts.
				+
				+    Emits a start message through both Common.logger and Common.logging,
				+    calls XiguaauthorScheduling.get_author_videos with a fixed rule set and
				+    two fixed accounts, calls Common.del_logs, then emits a finish message
				+    through the same two logging helpers.
				+
				+    :param log_type: forwarded to the logging helpers and to the crawler.
				+    :param crawler: forwarded to the logging helpers and to the crawler.
				+    :param env: forwarded to Common.logging and to the crawler.
				+    """
				+    start_msg = "开始抓取:西瓜账号\n"
				+    finish_msg = "抓取一轮结束\n"
				+
				+    # Both accounts below carry the same avatar URL.
				+    avatar = "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg"
				+
				+    # NOTE(review): "max": 0 presumably means "no upper bound" -- confirm
				+    # against the code that evaluates rule_dict.
				+    filter_rules = {
				+        "play_cnt": {"min": 1000, "max": 0},
				+        "duration": {"min": 60, "max": 0},
				+        "period": {"min": 2, "max": 2},
				+        "video_width": {"min": 720, "max": 0},
				+        "video_height": {"min": 720, "max": 0},
				+    }
				+
				+    authors = [
				+        {
				+            "uid": 6267140,
				+            "source": "xigua",
				+            "link": "https://www.ixigua.com/home/2273435016499629",
				+            "nick_name": "买两个橘子",
				+            "avatar_url": avatar,
				+            "mode": "author",
				+        },
				+        {
				+            "uid": 6267141,
				+            "source": "xigua",
				+            "link": "https://www.ixigua.com/home/694546480506782",
				+            "nick_name": "西二旗刘德华",
				+            "avatar_url": avatar,
				+            "mode": "author",
				+        },
				+    ]
				+
				+    Common.logger(log_type, crawler).info(start_msg)
				+    Common.logging(log_type, crawler, env, start_msg)
				+    XiguaauthorScheduling.get_author_videos(
				+        log_type=log_type,
				+        crawler=crawler,
				+        rule_dict=filter_rules,
				+        user_list=authors,
				+        env=env,
				+    )
				+    Common.del_logs(log_type, crawler)
				+    Common.logger(log_type, crawler).info(finish_msg)
				+    Common.logging(log_type, crawler, env, finish_msg)
			
 
				+
			
 
				+
			
 
				+# Script entry point: one dev run with log_type "author", crawler "xigua", env "dev".
				+if __name__ == "__main__":
			
 
				+    xigua_author_main("author", "xigua", "dev")
			
--- a/xigua/xigua_main/run_xg_recommend_dev.py
+++ b/xigua/xigua_main/run_xg_recommend_dev.py
@@ -8,12 +8,12 @@ from common.common import Common
 
				 from xigua.xigua_recommend.xigua_recommend_scheduling import XiguarecommendScheduling
			
 
				 
			
 
				 
			
 
				-def main(log_type, crawler, env):
			
 
				-    Common.logger(log_type, crawler).info(f'开始抓取 西瓜推荐\n')
			
 
				-    Common.logging(log_type, crawler, env, "开始抓取 西瓜推荐\n")
			
 
				+def xigua_recommend_main(log_type, crawler, env):
			
 
				+    Common.logger(log_type, crawler).info(f'开始抓取:西瓜推荐\n')
			
 
				+    Common.logging(log_type, crawler, env, "开始抓取:西瓜推荐\n")
			
 
				     XiguarecommendScheduling.get_videoList(log_type=log_type,
			
 
				                                            crawler=crawler,
			
 
				-                                           rule_dict={},
			
 
				+                                           rule_dict={"play_cnt":{"min":10000,"max":0},"duration":{"min":60,"max":1800},"period":{"min":90,"max":90}},
			
 
				                                            our_uid=6267140,
			
 
				                                            env=env)
			
 
				     Common.logger(log_type, crawler).info("抓取一轮结束\n")
			
@@ -21,4 +21,4 @@ def main(log_type, crawler, env):
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    main("recommend", "xigua", "dev")
			
 
				+    xigua_recommend_main("recommend", "xigua", "dev")
			
--- a/xigua/xigua_recommend/xigua_recommend_scheduling.py
+++ b/xigua/xigua_recommend/xigua_recommend_scheduling.py
@@ -724,6 +724,7 @@ class XiguarecommendScheduling:
 
				                                 video_dict["crawler_rule"] = json.dumps(rule_dict)
			
 
				                                 video_dict["user_id"] = our_uid
			
 
				                                 video_dict["publish_time"] = video_dict["publish_time_str"]
			
 
				+                                video_dict["strategy_type"] = log_type
			
 
				                                 mq.send_msg(video_dict)
			
 
				                         except Exception as e:
			
 
				                             Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
			
--- a/xigua/xigua_search/xigua_search_scheduling.py
+++ b/xigua/xigua_search/xigua_search_scheduling.py
@@ -683,6 +683,7 @@ class XiguasearchScheduling:
 
				                         video_dict["crawler_rule"] = json.dumps(rule_dict)
			
 
				                         video_dict["user_id"] = user_dict["uid"]
			
 
				                         video_dict["publish_time"] = video_dict["publish_time_str"]
			
 
				+                        video_dict["strategy_type"] = log_type
			
 
				 
			
 
				                         mq.send_msg(video_dict)
			
 
				                 except Exception as e:
			
--- a/xigua/xigua_search/xigua_search_scheduling0628.py
+++ b/xigua/xigua_search/xigua_search_scheduling0628.py
@@ -18,6 +18,7 @@ from selenium.webdriver.chrome.service import Service
 
				 from selenium import webdriver
			
 
				 from selenium.webdriver.common.by import By
			
 
				 sys.path.append(os.getcwd())
			
 
				+from common.mq import MQ
			
 
				 from common.scheduling_db import MysqlHelper
			
 
				 from common.common import Common
			
 
				 from common.feishu import Feishu
			
@@ -548,6 +549,7 @@ class XiguasearchScheduling:
 
				 
			
 
				     @classmethod
			
 
				     def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
			
 
				+        mq = MQ(topic_name="topic_crawler_etl_" + env)
			
 
				         # 打印请求配置
			
 
				         ca = DesiredCapabilities.CHROME
			
 
				         ca["goog:loggingPrefs"] = {"performance": "ALL"}
			
@@ -658,20 +660,32 @@ class XiguasearchScheduling:
 
				                         Common.logger(log_type, crawler).info('视频已下载\n')
			
 
				                         Common.logging(log_type, crawler, env, '视频已下载\n')
			
 
				                     else:
			
 
				-                        title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
			
 
				-                        if title_score <= 0.3:
			
 
				-                            Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
			
 
				-                            Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
			
 
				-                            continue
			
 
				-                        Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
			
 
				-                        Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
			
 
				-                        cls.download_publish(log_type=log_type,
			
 
				-                                             crawler=crawler,
			
 
				-                                             user_dict=user_dict,
			
 
				-                                             video_dict=video_dict,
			
 
				-                                             rule_dict=rule_dict,
			
 
				-                                             title_score=title_score,
			
 
				-                                             env=env)
			
 
				+                        # title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
			
 
				+                        # if title_score <= 0.3:
			
 
				+                        #     Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
			
 
				+                        #     Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
			
 
				+                        #     continue
			
 
				+                        # Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
			
 
				+                        # Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
			
 
				+                        # cls.download_publish(log_type=log_type,
			
 
				+                        #                      crawler=crawler,
			
 
				+                        #                      user_dict=user_dict,
			
 
				+                        #                      video_dict=video_dict,
			
 
				+                        #                      rule_dict=rule_dict,
			
 
				+                        #                      title_score=title_score,
			
 
				+                        #                      env=env)
			
 
				+                        video_dict["out_user_id"] = video_dict["user_id"]
			
 
				+                        video_dict["platform"] = crawler
			
 
				+                        video_dict["strategy"] = log_type
			
 
				+                        video_dict["out_video_id"] = video_dict["video_id"]
			
 
				+                        video_dict["width"] = video_dict["video_width"]
			
 
				+                        video_dict["height"] = video_dict["video_height"]
			
 
				+                        video_dict["crawler_rule"] = json.dumps(rule_dict)
			
 
				+                        video_dict["user_id"] = user_dict["uid"]
			
 
				+                        video_dict["publish_time"] = video_dict["publish_time_str"]
			
 
				+                        video_dict["strategy_type"] = log_type
			
 
				+                        mq.send_msg(video_dict)
			
 
				+
			
 
				                 except Exception as e:
			
 
				                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
			
 
				                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
			
@@ -683,7 +697,8 @@ class XiguasearchScheduling:
 
				 
			
 
				     @classmethod
			
 
				     def repeat_video(cls, log_type, crawler, video_id, env):
				+        """Return len() of the MysqlHelper.get_values result for a duplicate lookup.
				+
				+        The query selects crawler_video rows whose out_video_id equals
				+        ``video_id`` and whose platform is either ``crawler`` or
				+        ``cls.platform``; the result is presumably the matching rows, so a
				+        non-zero return presumably means the video is already stored.
				+
				+        NOTE(review): the three values are interpolated straight into the SQL
				+        text. If video_id can contain a double quote the query breaks or its
				+        meaning changes -- confirm where video_id comes from and prefer bound
				+        parameters if MysqlHelper supports them.
				+        """
			
 
				-        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
			
 
				+        # Previous query, which matched cls.platform only:
				+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
			
 
				+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
			
 
				         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, action="")
			
 
				         return len(repeat_video)