wangkun, 1 year ago
parent revision 250fda3c09

+ 1 - 0
README.MD

@@ -165,6 +165,7 @@ ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep Appium | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
 ```
 
 #### 生成 requirements.txt

+ 5 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -4,6 +4,7 @@
 import datetime
 import json
 import os
+import random
 import shutil
 import sys
 import time
@@ -325,10 +326,11 @@ class GongzhonghaoAuthor1:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 4 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -323,10 +323,11 @@ class GongzhonghaoAuthor2:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 4 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -324,10 +324,11 @@ class GongzhonghaoAuthor3:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 5 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py

@@ -168,7 +168,7 @@ class GongzhonghaoAuthor4:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
         else:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
+                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
 
         driver.implicitly_wait(10)
         driver.get(article_url)
@@ -325,10 +325,11 @@ class GongzhonghaoAuthor4:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)

+ 5 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -168,7 +168,7 @@ class GongzhonghaoAuthor5:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
         else:
             driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                '/Users/wangkun/Downloads/chromedriver/chromedriver_v111/chromedriver'))
+                '/Users/wangkun/Downloads/chromedriver/chromedriver_v113/chromedriver'))
 
         driver.implicitly_wait(10)
         driver.get(article_url)
@@ -325,10 +325,11 @@ class GongzhonghaoAuthor5:
                             video_dict["platform"] = crawler
                             video_dict["strategy"] = log_type
                             video_dict["out_video_id"] = video_dict["video_id"]
-                            video_dict["width"] = video_dict["video_width"]
-                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
                             video_dict["crawler_rule"] = json.dumps(rule_dict)
-                            video_dict["user_id"] = user_dict["uid"]
+                            # video_dict["user_id"] = user_dict["uid"]
+                            video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
                             video_dict["publish_time"] = video_dict["publish_time_str"]
 
                             mq.send_msg(video_dict)
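
The three-line change above (hard-coded zero dimensions plus a `Publish.uids` lookup) is applied identically in all five `gongzhonghao*_author.py` files. A minimal sketch of a shared helper that would keep the five copies in sync — `fill_etl_fields` is a hypothetical name, and the `Publish` import is assumed to match the other crawler modules in this commit:

```python
# Sketch only, not part of this commit; the body mirrors the edited block verbatim.
import json

from common.publish import Publish  # assumed import, as in the other crawlers


def fill_etl_fields(video_dict, crawler, log_type, rule_dict, env):
    """Populate the ETL fields exactly as the five author crawlers now do."""
    video_dict["platform"] = crawler
    video_dict["strategy"] = log_type
    video_dict["out_video_id"] = video_dict["video_id"]
    video_dict["width"] = 0   # 公众号 articles expose no pixel dimensions
    video_dict["height"] = 0
    video_dict["crawler_rule"] = json.dumps(rule_dict)
    video_dict["user_id"] = Publish.uids(crawler, "定向爬虫策略", "", env)
    video_dict["publish_time"] = video_dict["publish_time_str"]
    return video_dict
```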

+ 8 - 2
gongzhonghao/gongzhonghao_main/run_gzh_author_dev.py

@@ -6,14 +6,20 @@ import sys
 sys.path.append(os.getcwd())
 from common.common import Common
 # from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
+# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
+from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
 
 
 def gzh_main(log_type, crawler, env):
     Common.logger(log_type, crawler).info("开始抓取:公众号")
     Common.logging(log_type, crawler, env, "开始抓取:公众号")
     # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-    GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
+    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
                                        crawler=crawler,
                                        rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
                                        env=env)
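
The dev runner selects which shard to exercise by commenting imports in and out. A sketch of name-based dispatch instead, assuming all five classes expose the same `get_all_videos` signature used above (`SHARDS` and the `shard` parameter are hypothetical):

```python
# Sketch only: dispatch on a shard key instead of toggling comments.
from common.common import Common
from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5

SHARDS = {"1": GongzhonghaoAuthor1, "5": GongzhonghaoAuthor5}  # extend with 2-4


def gzh_main(log_type, crawler, env, shard="5"):
    Common.logger(log_type, crawler).info("开始抓取:公众号")
    Common.logging(log_type, crawler, env, "开始抓取:公众号")
    SHARDS[shard].get_all_videos(
        log_type=log_type,
        crawler=crawler,
        rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
        env=env,
    )
```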

+ 22 - 8
kanyikan/kanyikan_recommend/kanyikan_recommend0627.py

@@ -11,6 +11,7 @@ from hashlib import md5
 import requests
 import urllib3
 sys.path.append(os.getcwd())
+from common.mq import MQ
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
@@ -25,12 +26,14 @@ class KanyikanRecommend:
 
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and strategy="{cls.strategy}" and out_video_id="{video_id}" """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and strategy="{cls.strategy}" and out_video_id="{video_id}" """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
     def get_videoList(cls, log_type, crawler, our_uid, rule_dict, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         try:
             Common.logger(log_type, crawler).info(f"正在抓取列表页")
             Common.logging(log_type, crawler, env, f"正在抓取列表页")
@@ -91,7 +94,7 @@ class KanyikanRecommend:
                         .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
                         .replace("'", "").replace("#", "").replace("Merge", "")
                     publish_time_stamp = feeds[i].get("date", 0)
-                    publish_time_str = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time_stamp))
+                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                     # 获取播放地址
                     if "videoInfo" not in feeds[i]:
                         video_url = ""
@@ -137,12 +140,23 @@ class KanyikanRecommend:
                         Common.logger(log_type, crawler).info('视频已下载\n')
                         Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
-                        cls.download_publish(log_type=log_type,
-                                             crawler=crawler,
-                                             our_uid=our_uid,
-                                             video_dict=video_dict,
-                                             rule_dict=rule_dict,
-                                             env=env)
+                        # cls.download_publish(log_type=log_type,
+                        #                      crawler=crawler,
+                        #                      our_uid=our_uid,
+                        #                      video_dict=video_dict,
+                        #                      rule_dict=rule_dict,
+                        #                      env=env)
+                        video_dict["out_user_id"] = video_dict["user_id"]
+                        video_dict["platform"] = crawler
+                        video_dict["strategy"] = log_type
+                        video_dict["out_video_id"] = video_dict["video_id"]
+                        video_dict["width"] = video_dict["video_width"]
+                        video_dict["height"] = video_dict["video_height"]
+                        video_dict["crawler_rule"] = json.dumps(rule_dict)
+                        video_dict["user_id"] = our_uid
+                        video_dict["publish_time"] = video_dict["publish_time_str"]
+
+                        mq.send_msg(video_dict)
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
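
With `download_publish` commented out, this crawler now only maps fields and hands the dict to `topic_crawler_etl_{env}` via `mq.send_msg`. A sketch of the resulting message shape — the keys are exactly those assigned in the block above, every sample value is invented, and the dict's pre-existing fields (title, URL, counts) ride along unchanged:

```python
# Illustrative payload for mq.send_msg; values are made-up examples.
example_payload = {
    "video_id": "wxv_123",
    "out_video_id": "wxv_123",      # copied from video_id
    "out_user_id": "v2_abc",        # the source account (was user_id)
    "user_id": 12345678,            # our_uid: the internal publishing account
    "platform": "kanyikan",
    "strategy": "recommend",
    "width": 1280,                  # kanyikan still reports real dimensions
    "height": 720,
    "crawler_rule": '{"duration": {"max": 2700, "min": 20}}',
    "publish_time": "2023-06-27 12:00:00",  # now "%Y-%m-%d %H:%M:%S"
}
```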

+ 29 - 12
shipinhao/shipinhao_search/shipinhao_search.py

@@ -15,6 +15,7 @@ from appium.webdriver.webdriver import WebDriver
 from selenium.common import NoSuchElementException
 from selenium.webdriver.common.by import By
 sys.path.append(os.getcwd())
+from common.mq import MQ
 from common.feishu import Feishu
 from common.publish import Publish
 from common.common import Common
@@ -260,18 +261,20 @@ class ShipinhaoSearch:
 
     @classmethod
     def repeat_out_video_id(cls, log_type, crawler, out_video_id, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{out_video_id}"; """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{out_video_id}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{out_video_id}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
     def repeat_video_url(cls, log_type, crawler, video_url, env):
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and video_url="{video_url}"; """
+        # sql = f""" select * from crawler_video where platform="{cls.platform}" and video_url="{video_url}"; """
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and video_url="{video_url}"; """
         repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
         return len(repeat_video)
 
     @classmethod
-    def download_publish(cls, log_type, crawler, word, video_dict, our_uid, env):
+    def download_publish(cls, log_type, crawler, word, video_dict, rule_dict, our_uid, env):
         # 下载视频
         Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
 
@@ -326,7 +329,6 @@ class ShipinhaoSearch:
             except FileNotFoundError:
                 return
 
-        rule_dict = cls.rule_dict(log_type, crawler)
         insert_sql = f""" insert into crawler_video(video_id,
                                                 out_user_id,
                                                 platform,
@@ -542,9 +544,9 @@ class ShipinhaoSearch:
 
             return our_user_list
 
-
     @classmethod
     def search_video(cls, log_type, crawler, word, driver: WebDriver, our_uid, env):
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
         # 点击微信搜索框,并输入搜索词
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info("点击搜索框")
@@ -641,6 +643,7 @@ class ShipinhaoSearch:
                         "video_id": out_video_id,
                         "play_cnt": 0,
                         "duration": duration,
+                        # "duration": 60,
                         "user_name": user_name,
                         "user_id": out_user_id,
                         "avatar_url": avatar_url,
@@ -668,7 +671,7 @@ class ShipinhaoSearch:
                         video_dict["share_cnt"] = video_info_dict["share_cnt"]
                         video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
                         video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
-                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
+                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"] + " 00:00:00"
                         video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
                         Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
                         Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
@@ -676,12 +679,26 @@ class ShipinhaoSearch:
                             Common.logger(log_type, crawler).info("不满足抓取规则\n")
                             Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                         else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 word=word,
-                                                 video_dict=video_dict,
-                                                 our_uid=our_uid,
-                                                 env=env)
+                            rule_dict = cls.rule_dict(log_type, crawler)
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = 0
+                            video_dict["height"] = 0
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = our_uid
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+                            mq.send_msg(video_dict)
+                            cls.download_cnt += 1
+                            # cls.download_publish(log_type=log_type,
+                            #                      crawler=crawler,
+                            #                      word=word,
+                            #                      video_dict=video_dict,
+                            #                      rule_dict=rule_dict,
+                            #                      our_uid=our_uid,
+                            #                      env=env)
+
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
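
shipinhao search apparently exposes only a date for `publish_time_str`, which is why the commit appends `" 00:00:00"` to match the `%Y-%m-%d %H:%M:%S` shape used downstream. A defensive sketch of the same normalization — the helper name is hypothetical, and the date-only assumption is inferred from the `+ " 00:00:00"` edit above:

```python
# Sketch only: pad date-only strings to full timestamps, pass full ones through.
from datetime import datetime


def normalize_publish_time(publish_time_str: str) -> str:
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.strptime(publish_time_str, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    raise ValueError(f"unrecognized publish_time_str: {publish_time_str!r}")
```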

+ 2 - 2
xiaoniangao/xiaoniangao_main/run_xng_author_dev.py

@@ -8,7 +8,7 @@ from common.common import Common
 from xiaoniangao.xiaoniangao_author.xiaoniangao_author_scheduling import XiaoniangaoAuthorScheduling
 
 
-def main(log_type, crawler, env):
+def xiaoniangao_author_main(log_type, crawler, env):
     Common.logger(log_type, crawler).info(f'开始抓取:小年糕账号\n')
     Common.logging(log_type, crawler, env, "开始抓取:小年糕账号\n")
     XiaoniangaoAuthorScheduling.get_author_videos(log_type=log_type,
@@ -21,4 +21,4 @@ def main(log_type, crawler, env):
 
 
 if __name__ == "__main__":
-    main("author", "xiaoniangao", "dev")
+    xiaoniangao_author_main("author", "xiaoniangao", "dev")

+ 4 - 4
xiaoniangao/xiaoniangao_main/run_xng_play_dev.py

@@ -8,9 +8,9 @@ from common.common import Common
 from xiaoniangao.xiaoniangao_play.xiaoniangao_play_scheduling import XiaoniangaoplayScheduling
 
 
-def main(log_type, crawler, env):
-    Common.logger(log_type, crawler).info(f'开始抓取 西瓜推荐\n')
-    Common.logging(log_type, crawler, env, "开始抓取 西瓜推荐\n")
+def xiaoniangao_recommend_main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info(f'开始抓取:小年糕播放\n')
+    Common.logging(log_type, crawler, env, "开始抓取:小年糕播放\n")
     XiaoniangaoplayScheduling.get_videoList(log_type=log_type,
                                             crawler=crawler,
                                             rule_dict={"duration":{"min":40,"max":0},"play_cnt":{"min":20000,"max":0},"period":{"min":60,"max":60}},
@@ -21,4 +21,4 @@ def main(log_type, crawler, env):
 
 
 if __name__ == "__main__":
-    main("recommend", "xiaoniangao", "dev")
+    xiaoniangao_recommend_main("recommend", "xiaoniangao", "dev")