wangkun, 2 years ago
Parent
Current commit 937d1b0fcf
2 changed files with 72 additions and 75 deletions
  1. +60 -58  kuaishou/kuaishou_follow/kuaishou_follow.py
  2. +12 -17  scheduling/crawler_scheduling.py

+60 -58
kuaishou/kuaishou_follow/kuaishou_follow.py

@@ -301,6 +301,63 @@ class Follow:
         else:
             return video_title
 
+    @classmethod
+    def get_cookie(cls, log_type, crawler, out_uid, machine):
+        try:
+            # Enable performance logging for requests
+            ca = DesiredCapabilities.CHROME
+            ca["goog:loggingPrefs"] = {"performance": "ALL"}
+
+            # Run headless, without opening a browser window
+            chrome_options = webdriver.ChromeOptions()
+            chrome_options.add_argument("headless")
+            chrome_options.add_argument(
+                f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
+            chrome_options.add_argument("--no-sandbox")
+
+            # Initialize the driver
+            if machine == "aliyun":
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
+            elif machine == "macpro":
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
+                                          service=Service('/Users/lieyunye/Downloads/chromedriver_v107/chromedriver'))
+            elif machine == "macair":
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
+                                          service=Service('/Users/piaoquan/Downloads/chromedriver_v108/chromedriver'))
+            else:
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
+                    '/Users/wangkun/Downloads/chromedriver/chromedriver_v109/chromedriver'))
+
+            driver.implicitly_wait(10)
+            # print('Open the profile page')
+            driver.get(f'https://www.kuaishou.com/profile/{out_uid}')
+            time.sleep(1)
+
+            # print('Parse cookies')
+            logs = driver.get_log("performance")
+            # Common.logger(log_type, crawler).info('Got logs:{}\n', logs)
+            # print('Quit the browser')
+            driver.quit()
+            for line in logs:
+                msg = json.loads(line['message'])
+                # Common.logger(log_type, crawler).info(f"{msg}\n\n")
+                if 'message' not in msg:
+                    pass
+                elif 'params' not in msg['message']:
+                    pass
+                elif 'headers' not in msg['message']['params']:
+                    pass
+                elif 'Cookie' not in msg['message']['params']['headers']:
+                    pass
+                elif msg['message']['params']['headers']['Host'] != 'www.kuaishou.com':
+                    pass
+                else:
+                    cookie = msg['message']['params']['headers']['Cookie']
+                    # Common.logger(log_type, crawler).info(f"{cookie}")
+                    return cookie
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_cookie:{e}\n")
+
     @classmethod
     def get_videoList(cls, log_type, crawler, strategy, our_uid, out_uid, oss_endpoint, env, machine):
         try:
@@ -710,64 +767,9 @@ class Follow:
                               oss_endpoint=oss_endpoint,
                               env=env,
                               machine=machine)
-            time.sleep(3)
-
-    @classmethod
-    def get_cookie(cls, log_type, crawler, out_uid, machine):
-        try:
-            # Enable performance logging for requests
-            ca = DesiredCapabilities.CHROME
-            ca["goog:loggingPrefs"] = {"performance": "ALL"}
-
-            # Run headless, without opening a browser window
-            chrome_options = webdriver.ChromeOptions()
-            chrome_options.add_argument("headless")
-            chrome_options.add_argument(
-                f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
-            chrome_options.add_argument("--no-sandbox")
-
-            # Initialize the driver
-            if machine == "aliyun":
-                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
-            elif machine == "macpro":
-                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
-                                          service=Service('/Users/lieyunye/Downloads/chromedriver_v107/chromedriver'))
-            elif machine == "macair":
-                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
-                                          service=Service('/Users/piaoquan/Downloads/chromedriver_v108/chromedriver'))
-            else:
-                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(
-                    '/Users/wangkun/Downloads/chromedriver/chromedriver_v109/chromedriver'))
-
-            driver.implicitly_wait(10)
-            # print('Open the profile page')
-            driver.get(f'https://www.kuaishou.com/profile/{out_uid}')
-            time.sleep(1)
-
-            # print('Parse cookies')
-            logs = driver.get_log("performance")
-            # Common.logger(log_type, crawler).info('Got logs:{}\n', logs)
-            # print('Quit the browser')
-            driver.quit()
-            for line in logs:
-                msg = json.loads(line['message'])
-                # Common.logger(log_type, crawler).info(f"{msg}\n\n")
-                if 'message' not in msg:
-                    pass
-                elif 'params' not in msg['message']:
-                    pass
-                elif 'headers' not in msg['message']['params']:
-                    pass
-                elif 'Cookie' not in msg['message']['params']['headers']:
-                    pass
-                elif msg['message']['params']['headers']['Host'] != 'www.kuaishou.com':
-                    pass
-                else:
-                    cookie = msg['message']['params']['headers']['Cookie']
-                    # Common.logger(log_type, crawler).info(f"{cookie}")
-                    return cookie
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"get_cookie:{e}\n")
+            sleep_time = 120
+            Common.logger(log_type, crawler).info(f"Sleeping for {sleep_time} seconds")
+            time.sleep(sleep_time)
 
 
 if __name__ == "__main__":
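Note: the relocated `get_cookie` drives headless Chrome with performance logging enabled and scrapes the Cookie request header out of the CDP network log. The `desired_capabilities` keyword it passes was removed in Selenium 4.10, so the code as committed only runs on older Selenium releases. Below is a minimal standalone sketch of the same technique against the current Selenium 4 API; the driver path parameter is a placeholder, not one of the machine-specific paths from the diff.

```python
import json
import time
from typing import Optional

from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def get_cookie(out_uid: str, driver_path: str) -> Optional[str]:
    """Headlessly open a Kuaishou profile and return the Cookie header
    captured from Chrome's performance (network) log."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    # Selenium 4 replacement for the removed desired_capabilities argument
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    driver = webdriver.Chrome(options=options, service=Service(driver_path))
    try:
        driver.implicitly_wait(10)
        driver.get(f"https://www.kuaishou.com/profile/{out_uid}")
        time.sleep(1)
        logs = driver.get_log("performance")
    finally:
        driver.quit()

    # Each log entry wraps a CDP event; return the first request to
    # www.kuaishou.com whose headers carry a Cookie.
    for entry in logs:
        event = json.loads(entry["message"]).get("message", {})
        headers = event.get("params", {}).get("headers", {})
        if headers.get("Host") == "www.kuaishou.com" and "Cookie" in headers:
            return headers["Cookie"]
    return None
```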

+12 -17
scheduling/crawler_scheduling.py

@@ -82,7 +82,7 @@ class Scheduling:
 
     # Resource allocation / assembly / task scheduling
     @classmethod
-    def main(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
+    def main(cls, log_type, crawler, env, machine):
         pre_task_list = cls.get_task(log_type=log_type, crawler=crawler, env=env, machine=machine)
         if len(pre_task_list) == 0:
             Common.logger(log_type, crawler).info("No new tasks\n")
@@ -94,21 +94,16 @@ class Scheduling:
                 interval_piaoquan = pre_task_list[i]["interval_piaoquan"]
                 spider_rule = pre_task_list[i]["spider_rule"]
 
+                if machine == "hk":
+                    # Write to redis
+                    pass
+                elif machine == "aliyun":
+                    # Write to redis
+                    pass
+                else:
+                    # Write to redis
+                    pass
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## Create the argument parser
-    parser.add_argument('--log_type', type=str)  ## Add an argument, specifying its type
-    parser.add_argument('--crawler')  ## Add an argument
-    parser.add_argument('--strategy')  ## Add an argument
-    parser.add_argument('--our_uid')  ## Add an argument
-    parser.add_argument('--oss_endpoint')  ## Add an argument
-    parser.add_argument('--env')  ## Add an argument
-    parser.add_argument('--machine')  ## Add an argument
-    args = parser.parse_args()  ### Assign arguments; they can also be passed from the terminal
-    # print(args)
-    Scheduling.main(log_type=args.log_type,
-                    crawler=args.crawler,
-                    strategy=args.strategy,
-                    oss_endpoint=args.oss_endpoint,
-                    env=args.env,
-                    machine=args.machine)
+    Scheduling.main("scheduling", "scheduling", "dev", "local")
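The three `machine` branches above are still placeholders. A minimal sketch of what each "Write to redis" step might become with redis-py, assuming a simple per-machine list queue; the host, port, and key layout are assumptions, not taken from this repository.

```python
import json

import redis  # redis-py


def push_task(machine: str, task: dict) -> None:
    """Enqueue an assembled task onto the target machine's queue.
    Connection details and key naming here are hypothetical."""
    r = redis.Redis(host="127.0.0.1", port=6379, db=0)
    queue_key = f"crawler:task:{machine}"  # hypothetical key layout
    r.rpush(queue_key, json.dumps(task, ensure_ascii=False))


# A worker on the target machine would then pop tasks in FIFO order:
#   raw = redis.Redis(host="127.0.0.1").lpop("crawler:task:aliyun")
#   task = json.loads(raw) if raw else None
```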