wangkun, 1 year ago
parent revision 250fda3c09

+ 1 - 0
README.MD

@@ -165,6 +165,7 @@ ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep Appium | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
 
 #### 生成 requirements.txt

+ 5 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -4,6 +4,7 @@
 import datetime
 import json
 import os
+import random
 import shutil
 import sys
 import time
@@ -325,10 +326,11 @@ class GongzhonghaoAuthor1:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 4 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -323,10 +323,11 @@ class GongzhonghaoAuthor2:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 4 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -324,10 +324,11 @@ class GongzhonghaoAuthor3:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 5 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py

@@ -168,7 +168,7 @@ class GongzhonghaoAuthor4:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
         else:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
+                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
 
         driver.implicitly_wait(10)
         driver.get(article_url)
@@ -325,10 +325,11 @@ class GongzhonghaoAuthor4:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 5 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -168,7 +168,7 @@ class GongzhonghaoAuthor5:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
         else:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
+                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
 
         driver.implicitly_wait(10)
         driver.get(article_url)
@@ -325,10 +325,11 @@ class GongzhonghaoAuthor5:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)
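
The three-line change above (hard-coded zero dimensions plus a `Publish.uids` lookup) is applied identically in all five `gongzhonghao*_author.py` files. A minimal sketch of a shared helper that would keep the five copies in sync — `fill_etl_fields` is a hypothetical name, and the `Publish` import is assumed to match the other crawler modules in this commit:

```python
# Sketch only, not part of this commit; the body mirrors the edited block verbatim.
import json

from common.publish import Publish  # assumed import, as in the other crawlers


def fill_etl_fields(video_dict, crawler, log_type, rule_dict, env):
    """Populate the ETL fields exactly as the five author crawlers now do."""
    video_dict["platform"] = crawler
    video_dict["strategy"] = log_type
    video_dict["out_video_id"] = video_dict["video_id"]
    video_dict["width"] = 0   # 公众号 articles expose no pixel dimensions
    video_dict["height"] = 0
    video_dict["crawler_rule"] = json.dumps(rule_dict)
    video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
    video_dict["publish_time"] = video_dict["publish_time_str"]
    return video_dict
```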

+ 8 - 2
gongzhonghao/gongzhonghao_main/run_gzh_author_dev.py

@@ -6,14 +6,20 @@ import sys
 sys.path.append(os.getcwd())
 from common.common import Common
 # from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
+# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
+from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
 
 
 def gzh_main(log_type, crawler, env):
     Common.logger(log_type, crawler).info("开始抓取:公众号")
     Common.logging(log_type, crawler, env, "开始抓取:公众号")
     # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-    GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
+    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
                                        crawler=crawler,
                                        rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
                                        env=env)
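
The dev runner selects which shard to exercise by commenting imports in and out. A sketch of name-based dispatch instead, assuming all five classes expose the same `get_all_videos` signature used above (`SHARDS` and the `shard` parameter are hypothetical):

```python
# Sketch only: dispatch on a shard key instead of toggling comments.
from common.common import Common
from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5

SHARDS = {"1": GongzhonghaoAuthor1, "5": GongzhonghaoAuthor5}  # extend with 2-4


def gzh_main(log_type, crawler, env, shard="5"):
    Common.logger(log_type, crawler).info("开始抓取:公众号")
    Common.logging(log_type, crawler, env, "开始抓取:公众号")
    SHARDS[shard].get_all_videos(
        log_type=log_type,
        crawler=crawler,
        rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
        env=env,
    )
```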

+ 22 - 8
kanyikan/kanyikan_recommend/kanyikan_recommend0627.py

@@ -11,6 +11,7 @@ from hashlib import md5
 import requests
 import urllib3
 sys.path.append(os.getcwd())
+from common.mq import MQ
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
@@ -25,12 +26,14 @@ class KanyikanRecommend:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and strategy="{cls.strategy}" and out_video_id="{video_id}" """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and strategy="{cls.strategy}" and out_video_id="{video_id}" """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
     def get_videoList(cls, log_type, crawler, our_uid, rule_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         try:
             Common.logger(log_type, crawler).info(f"正在抓取列表页")
             Common.logging(log_type, crawler, env, f"正在抓取列表页")
@@ -91,7 +94,7 @@ class KanyikanRecommend:
                         .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
                         .replace("'", "").replace("#", "").replace("Merge", "")
                     publish_time_stamp = feeds[i].get("date", 0)
-                    publish_time_str = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time_stamp))
+                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                     # 获取播放地址
                     if "videoInfo" not in feeds[i]:
                         video_url = ""
@@ -137,12 +140,23 @@ class KanyikanRecommend:
                         Common.logger(log_type, crawler).info('视频已下载\n')
                         Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
-                        cls.download_publish(log_type=log_type,
-                                             crawler=crawler,
-                                             our_uid=our_uid,
-                                             video_dict=video_dict,
-                                             rule_dict=rule_dict,
-                                             env=env)
+                        # cls.download_publish(log_type=log_type,
+                        #                      crawler=crawler,
+                        #                      our_uid=our_uid,
+                        #                      video_dict=video_dict,
+                        #                      rule_dict=rule_dict,
+                        #                      env=env)
+                        video_dict["out_user_id"] = video_dict["user_id"]
+                        video_dict["platform"] = crawler
+                        video_dict["strategy"] = log_type
+                        video_dict["out_video_id"] = video_dict["video_id"]
+                        video_dict["width"] = video_dict["video_width"]
+                        video_dict["height"] = video_dict["video_height"]
+                        video_dict["crawler_rule"] = json.dumps(rule_dict)
+                        video_dict["user_id"] = our_uid
+                        video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                        mq.send_msg(video_dict)
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
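
With `download_publish` commented out, this crawler now only maps fields and hands the dict to `topic_crawler_etl_{env}` via `mq.send_msg`. A sketch of the resulting message shape — the keys are exactly those assigned in the block above, every sample value is invented, and the dict's pre-existing fields (title, URL, counts) ride along unchanged:

```python
# Illustrative payload for mq.send_msg; values are made-up examples.
example_payload = {
    "video_id": "wxv_123",
    "out_video_id": "wxv_123",      # copied from video_id
    "out_user_id": "v2_abc",        # the source account (was user_id)
    "user_id": 12345678,            # our_uid: the internal publishing account
    "platform": "kanyikan",
    "strategy": "recommend",
    "width": 1280,                  # kanyikan still reports real dimensions
    "height": 720,
    "crawler_rule": '{"duration": {"max": 2700, "min": 20}}',
    "publish_time": "2023-06-27 12:00:00",  # now "%Y-%m-%d %H:%M:%S"
}
```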

+ 29 - 12
shipinhao/shipinhao_search/shipinhao_search.py

@@ -15,6 +15,7 @@ from appium.webdriver.webdriver import WebDriver
 from selenium.common import NoSuchElementException
 from selenium.webdriver.common.by import By
 sys.path.append(os.getcwd())
+from common.mq import MQ
 from common.feishu import Feishu
 from common.publish import Publish
 from common.common import Common
@@ -260,18 +261,20 @@ class ShipinhaoSearch:
 
     @classmethod
     def repeat_out_video_id(cls, log_type, crawler, out_video_id, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{out_video_id}"; """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{out_video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{out_video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
     def repeat_video_url(cls, log_type, crawler, video_url, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and video_url="{video_url}"; """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and video_url="{video_url}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and video_url="{video_url}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
-    def download_publish(cls, log_type, crawler, word, video_dict, our_uid, env):
+    def download_publish(cls, log_type, crawler, word, video_dict, rule_dict, our_uid, env):
         # 下载视频
         Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
 
@@ -326,7 +329,6 @@ class ShipinhaoSearch:
             except FileNotFoundError:
                 return
 
-        rule_dict = cls.rule_dict(log_type, crawler)
         insert_sql = f""" insert into crawler_video(video_id,
                                                 out_user_id,
                                                 platform,
@@ -542,9 +544,9 @@ class ShipinhaoSearch:
 
             return our_user_list
 
-
     @classmethod
     def search_video(cls, log_type, crawler, word, driver: WebDriver, our_uid, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         # 点击微信搜索框,并输入搜索词
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info("点击搜索框")
@@ -641,6 +643,7 @@ class ShipinhaoSearch:
                         "video_id": out_video_id,
                         "play_cnt": 0,
                         "duration": duration,
+                        # "duration": 60,
                         "user_name": user_name,
                         "user_id": out_user_id,
                         "avatar_url": avatar_url,
@@ -668,7 +671,7 @@ class ShipinhaoSearch:
                         video_dict["share_cnt"] = video_info_dict["share_cnt"]
                         video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
                         video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
-                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
+                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"] + " 00:00:00"
                         video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
                         Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
                         Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
@@ -676,12 +679,26 @@ class ShipinhaoSearch:
                             Common.logger(log_type, crawler).info("不满足抓取规则\n")
                             Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 word=word,
-                                                 video_dict=video_dict,
-                                                 our_uid=our_uid,
-                                                 env=env)
+                            rule_dict = cls.rule_dict(log_type, crawler)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = our_uid
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+                            mq.send_msg(video_dict)
+                            cls.download_cnt += 1
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      word=word,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      our_uid=our_uid,
+                            #                      env=env)
+
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
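
shipinhao search apparently exposes only a date for `publish_time_str`, which is why the commit appends `" 00:00:00"` to match the `%Y-%m-%d %H:%M:%S` shape used downstream. A defensive sketch of the same normalization — the helper name is hypothetical, and the date-only assumption is inferred from the `+ " 00:00:00"` edit above:

```python
# Sketch only: pad date-only strings to full timestamps, pass full ones through.
from datetime import datetime


def normalize_publish_time(publish_time_str: str) -> str:
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.strptime(publish_time_str, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    raise ValueError(f"unrecognized publish_time_str: {publish_time_str!r}")
```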

+ 2 - 2
xiaoniangao/xiaoniangao_main/run_xng_author_dev.py

@@ -8,7 +8,7 @@ from common.common import Common
 from xiaoniangao.xiaoniangao_author.xiaoniangao_author_scheduling import XiaoniangaoAuthorScheduling
 
 
-def main(log_type, crawler, env):
+def xiaoniangao_author_main(log_type, crawler, env):
     Common.logger(log_type, crawler).info(f'开始抓取:小年糕账号\n')
     Common.logging(log_type, crawler, env, "开始抓取:小年糕账号\n")
     XiaoniangaoAuthorScheduling.get_author_videos(log_type=log_type,
@@ -21,4 +21,4 @@ def main(log_type, crawler, env):
 
 
 if __name__ == "__main__":
-    main("author", "xiaoniangao", "dev")
+    xiaoniangao_author_main("author", "xiaoniangao", "dev")

+ 4 - 4
xiaoniangao/xiaoniangao_main/run_xng_play_dev.py

@@ -8,9 +8,9 @@ from common.common import Common
 from xiaoniangao.xiaoniangao_play.xiaoniangao_play_scheduling import XiaoniangaoplayScheduling
 
 
-def main(log_type, crawler, env):
-    Common.logger(log_type, crawler).info(f'开始抓取 西瓜推荐\n')
-    Common.logging(log_type, crawler, env, "开始抓取 西瓜推荐\n")
+def xiaoniangao_recommend_main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info(f'开始抓取:小年糕播放\n')
+    Common.logging(log_type, crawler, env, "开始抓取:小年糕播放\n")
     XiaoniangaoplayScheduling.get_videoList(log_type=log_type,
                                             crawler=crawler,
                                             rule_dict={"duration":{"min":40,"max":0},"play_cnt":{"min":20000,"max":0},"period":{"min":60,"max":60}},
@@ -21,4 +21,4 @@ def main(log_type, crawler, env):
 
 
 if __name__ == "__main__":
-    main("recommend", "xiaoniangao", "dev")
+    xiaoniangao_recommend_main("recommend", "xiaoniangao", "dev")