ehlxr committed 1 year ago
parent commit d8e332e932

+ 22 - 7
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -14,6 +14,8 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+
+from common.mq import MQ
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -188,6 +190,7 @@ class GongzhonghaoAuthor1:
     # 获取文章列表
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         begin = 0
         while True:
             token_dict = cls.get_token(log_type, crawler, env)
@@ -312,12 +315,23 @@ class GongzhonghaoAuthor1:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 # user_dict=user_dict,
-                                                 env=env)
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      # user_dict=user_dict,
+                            #                      env=env)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                            mq.send_msg(video_dict)
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
@@ -327,7 +341,8 @@ class GongzhonghaoAuthor1:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
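
The same replacement of cls.download_publish with a field mapping followed by mq.send_msg appears in every one of the five crawler variants in this commit. As a rough, standalone illustration of the payload handed to the topic_crawler_etl_<env> queue (a sketch only: build_etl_message is a hypothetical helper, not code from this commit, and the exact schema expected by the ETL consumer is inferred from the assignments in the hunk above):

import json


def build_etl_message(video_dict, user_dict, rule_dict, crawler, log_type):
    """Hypothetical helper mirroring the in-place field mapping added above.

    It renames the crawler-side fields to the names the ETL consumer is
    assumed to expect and returns the enriched dict, which the crawler then
    hands to MQ(topic_name="topic_crawler_etl_" + env).send_msg(...).
    """
    video_dict["out_user_id"] = video_dict["user_id"]        # platform-side author id
    video_dict["platform"] = crawler                         # e.g. "gongzhonghao"
    video_dict["strategy"] = log_type                        # e.g. "author"
    video_dict["out_video_id"] = video_dict["video_id"]
    video_dict["width"] = video_dict["video_width"]
    video_dict["height"] = video_dict["video_height"]
    video_dict["crawler_rule"] = json.dumps(rule_dict)       # rule dict serialized as a JSON string
    video_dict["user_id"] = user_dict["uid"]                 # internal uid replaces the platform id
    video_dict["publish_time"] = video_dict["publish_time_str"]
    return video_dict


if __name__ == "__main__":
    # Fabricated sample inputs, only to show the shape of the resulting message.
    sample_video = {
        "user_id": "wx_author_001", "video_id": "abc123",
        "video_width": 720, "video_height": 1280,
        "video_title": "demo", "publish_time_str": "2023-06-21 12:00:00",
    }
    msg = build_etl_message(sample_video, {"uid": 10001},
                            {"duration": {"min": 20}},
                            crawler="gongzhonghao", log_type="author")
    print(json.dumps(msg, ensure_ascii=False, indent=2))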
 

+ 22 - 7
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -14,6 +14,8 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+
+from common.mq import MQ
 sys.path.append(os.getcwd())
 # from common.getuser import getUser
 from common.common import Common
@@ -186,6 +188,7 @@ class GongzhonghaoAuthor2:
     # 获取文章列表
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         begin = 0
         while True:
             token_dict = cls.get_token(log_type, crawler, env)
@@ -310,12 +313,23 @@ class GongzhonghaoAuthor2:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 # user_dict=user_dict,
-                                                 env=env)
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      # user_dict=user_dict,
+                            #                      env=env)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                            mq.send_msg(video_dict)
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
@@ -326,7 +340,8 @@ class GongzhonghaoAuthor2:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 

+ 22 - 7
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -14,6 +14,8 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+
+from common.mq import MQ
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -186,6 +188,7 @@ class GongzhonghaoAuthor3:
     # 获取文章列表
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         begin = 0
         while True:
             token_dict = cls.get_token(log_type, crawler, env)
@@ -311,12 +314,23 @@ class GongzhonghaoAuthor3:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 # user_dict=user_dict,
-                                                 env=env)
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      # user_dict=user_dict,
+                            #                      env=env)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                            mq.send_msg(video_dict)
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
@@ -327,7 +341,8 @@ class GongzhonghaoAuthor3:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 

+ 22 - 7
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py

@@ -15,6 +15,8 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+
+from common.mq import MQ
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -188,6 +190,7 @@ class GongzhonghaoAuthor4:
     # 获取文章列表
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         begin = 0
         while True:
             token_dict = cls.get_token(log_type, crawler, env)
@@ -312,12 +315,23 @@ class GongzhonghaoAuthor4:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 # user_dict=user_dict,
-                                                 env=env)
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      # user_dict=user_dict,
+                            #                      env=env)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                            mq.send_msg(video_dict)
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
@@ -328,7 +342,8 @@ class GongzhonghaoAuthor4:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 

+ 22 - 7
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -15,6 +15,8 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+
+from common.mq import MQ
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
@@ -188,6 +190,7 @@ class GongzhonghaoAuthor5:
     # 获取文章列表
     @classmethod
     def get_videoList(cls, log_type, crawler, rule_dict, user_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         begin = 0
         while True:
             token_dict = cls.get_token(log_type, crawler, env)
@@ -312,12 +315,23 @@ class GongzhonghaoAuthor5:
                             Common.logger(log_type, crawler).info(f'标题相似度>=80%:{video_dict["video_title"]}\n')
                             Common.logging(log_type, crawler, env, f'标题相似度>=80%:{video_dict["video_title"]}\n')
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 # user_dict=user_dict,
-                                                 env=env)
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      # user_dict=user_dict,
+                            #                      env=env)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                            mq.send_msg(video_dict)
                     except Exception as e:
                         Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                         Common.logging(log_type, crawler, env, f'抓取单条视频异常:{e}\n')
@@ -328,7 +342,8 @@ class GongzhonghaoAuthor5:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
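
For reference, after this change the dedup query in repeat_video renders roughly as follows (a sketch: the concrete values of crawler and cls.platform are assumptions, chosen only to show that the hard-coded "公众号" match is replaced by an IN clause over both platform names):

# Assumed example values; in the crawlers above they come from the scheduler
# argument and from the class attribute cls.platform respectively.
crawler = "gongzhonghao"
platform = "gongzhonghao"
video_id = "abc123"

sql = f""" select * from crawler_video where platform in ("{crawler}","{platform}") and out_video_id="{video_id}"; """
print(sql)
# Prints the fully rendered statement with the values inlined, exactly as in the
# diff, so MysqlHelper.get_values is assumed to receive a complete SQL string.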