
Merge branch 'master' into lxr_etl_20230620

ehlxr · 1 year ago
Parent commit: bbf2fa7be7

+ 92 - 90
shipinhao/shipinhao_search/shipinhao_search.py

@@ -245,15 +245,17 @@ class ShipinhaoSearch:
         webviews = driver.contexts
         Common.logger(log_type, crawler).info(f"webviews:{webviews}")
         driver.switch_to.context(webviews[1])
-        time.sleep(3)
+        Common.logger(log_type, crawler).info(driver.current_context)
+        time.sleep(1)
         windowHandles = driver.window_handles
         for handle in windowHandles:
-            driver.switch_to.window(handle)
-            time.sleep(3)
-            if driver.find_element(By.XPATH, '//div[@class="unit"]'):
+            try:
+                driver.switch_to.window(handle)
+                time.sleep(1)
+                driver.find_element(By.XPATH, '//div[@class="unit"]')
                 Common.logger(log_type, crawler).info('切换 webview 成功')
                 return "成功"
-            else:
+            except Exception:
                 Common.logger(log_type, crawler).info("切换 webview 失败")
 
     @classmethod
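
The try/except above replaces a broken if/else: Selenium's find_element raises NoSuchElementException when nothing matches, so the old else branch was unreachable. A minimal sketch of the same handle probe, with an illustrative logger standing in for Common.logger:

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def switch_to_unit_window(driver, logger):
        # Probe every window handle; find_element raises on a miss,
        # so try/except (not a truthiness test) is the right check.
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            try:
                driver.find_element(By.XPATH, '//div[@class="unit"]')
                logger.info("switched webview successfully")
                return True
            except NoSuchElementException:
                logger.info("no match in this handle, trying the next one")
        return False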
@@ -596,93 +598,93 @@ class ShipinhaoSearch:
                 return
 
             for i, video_element in enumerate(video_element_temp):
-                try:
-                    Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
-                    Common.logging(log_type, crawler, env, f"download_cnt:{cls.download_cnt}")
-                    if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
-                        Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
-                        Common.logging(log_type, crawler, env, f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
-                        cls.download_cnt = 0
-                        return
-
-                    if video_element is None:
-                        Common.logger(log_type, crawler).info('到底啦~\n')
-                        Common.logging(log_type, crawler, env, '到底啦~\n')
-                        return
-
-                    cls.i += 1
-                    cls.search_elements(driver, '//*[@class="rich-media active__absolute"]')
-
-                    Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
-                    Common.logging(log_type, crawler, env, f'拖动"视频"列表第{cls.i}个至屏幕中间')
+                # try:
+                Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
+                Common.logging(log_type, crawler, env, f"download_cnt:{cls.download_cnt}")
+                if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
+                    Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
+                    Common.logging(log_type, crawler, env, f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
+                    cls.download_cnt = 0
+                    return
+
+                if video_element is None:
+                    Common.logger(log_type, crawler).info('到底啦~\n')
+                    Common.logging(log_type, crawler, env, '到底啦~\n')
+                    return
+
+                cls.i += 1
+                cls.search_elements(driver, '//*[@class="rich-media active__absolute"]')
+
+                Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
+                Common.logging(log_type, crawler, env, f'拖动"视频"列表第{cls.i}个至屏幕中间')
+                time.sleep(3)
+                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
+                                      video_element)
+                if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
+                    Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
+                    Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
+                    return
+                video_title = video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index + i].text[:40]
+                video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
+                cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
+                cover_url = cover_url.split('url("')[-1].split('")')[0]
+                duration = video_element.find_elements(By.XPATH, '//div[@class="video-player-mask__text"]')[index+i].text
+                duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
+                user_name = video_element.find_elements(By.XPATH, '//div[@class="rich-media__source__title"]')[index+i].text
+                avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[index+i].get_attribute('style')
+                avatar_url = avatar_url.split('url("')[-1].split('")')[0]
+                out_video_id = md5(video_title.encode('utf8')).hexdigest()
+                out_user_id = md5(user_name.encode('utf8')).hexdigest()
+
+                video_dict = {
+                    "video_title": video_title,
+                    "video_id": out_video_id,
+                    "play_cnt": 0,
+                    "duration": duration,
+                    "user_name": user_name,
+                    "user_id": out_user_id,
+                    "avatar_url": avatar_url,
+                    "cover_url": cover_url,
+                    "video_url": video_url,
+                    "session": f"shipinhao-search-{int(time.time())}"
+                }
+                for k, v in video_dict.items():
+                    Common.logger(log_type, crawler).info(f"{k}:{v}")
+                Common.logging(log_type, crawler, env, f"{video_dict}")
+                if video_title is None or video_url is None:
+                    Common.logger(log_type, crawler).info("无效视频\n")
+                    Common.logging(log_type, crawler, env, "无效视频\n")
+                elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
+                    Common.logger(log_type, crawler).info('视频已下载\n')
+                    Common.logging(log_type, crawler, env, '视频已下载\n')
+                elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
+                    Common.logger(log_type, crawler).info('视频已下载\n')
+                    Common.logging(log_type, crawler, env, '视频已下载\n')
+                else:
+                    video_element.click()
                     time.sleep(3)
-                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
-                                          video_element)
-                    if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
-                        Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
-                        Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
-                        return
-                    video_title = video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index + i].text[:40]
-                    video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
-                    cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
-                    cover_url = cover_url.split('url("')[-1].split('")')[0]
-                    duration = video_element.find_elements(By.XPATH, '//div[@class="video-player-mask__text"]')[index+i].text
-                    duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
-                    user_name = video_element.find_elements(By.XPATH, '//div[@class="rich-media__source__title"]')[index+i].text
-                    avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image rich-media__source__thumb"]')[index+i].get_attribute('style')
-                    avatar_url = avatar_url.split('url("')[-1].split('")')[0]
-                    out_video_id = md5(video_title.encode('utf8')).hexdigest()
-                    out_user_id = md5(user_name.encode('utf8')).hexdigest()
-
-                    video_dict = {
-                        "video_title": video_title,
-                        "video_id": out_video_id,
-                        "play_cnt": 0,
-                        "duration": duration,
-                        "user_name": user_name,
-                        "user_id": out_user_id,
-                        "avatar_url": avatar_url,
-                        "cover_url": cover_url,
-                        "video_url": video_url,
-                        "session": f"shipinhao-search-{int(time.time())}"
-                    }
-                    for k, v in video_dict.items():
-                        Common.logger(log_type, crawler).info(f"{k}:{v}")
-                    Common.logging(log_type, crawler, env, f"{video_dict}")
-                    if video_title is None or video_url is None:
-                        Common.logger(log_type, crawler).info("无效视频\n")
-                        Common.logging(log_type, crawler, env, "无效视频\n")
-                    elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
-                        Common.logger(log_type, crawler).info('视频已下载\n')
-                        Common.logging(log_type, crawler, env, '视频已下载\n')
-                    elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
-                        Common.logger(log_type, crawler).info('视频已下载\n')
-                        Common.logging(log_type, crawler, env, '视频已下载\n')
+                    video_info_dict = cls.get_video_info(driver)
+                    video_dict["like_cnt"] = video_info_dict["like_cnt"]
+                    video_dict["share_cnt"] = video_info_dict["share_cnt"]
+                    video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
+                    video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
+                    video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
+                    video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
+                    Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
+                    Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
+                    if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
+                        Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     else:
-                        video_element.click()
-                        time.sleep(3)
-                        video_info_dict = cls.get_video_info(driver)
-                        video_dict["like_cnt"] = video_info_dict["like_cnt"]
-                        video_dict["share_cnt"] = video_info_dict["share_cnt"]
-                        video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
-                        video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
-                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
-                        video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
-                        Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
-                        Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
-                        if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
-                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
-                            Common.logging(log_type, crawler, env, "不满足抓取规则\n")
-                        else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 word=word,
-                                                 video_dict=video_dict,
-                                                 our_uid=our_uid,
-                                                 env=env)
-                except Exception as e:
-                    Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
-                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
+                        cls.download_publish(log_type=log_type,
+                                             crawler=crawler,
+                                             word=word,
+                                             video_dict=video_dict,
+                                             our_uid=our_uid,
+                                             env=env)
+                # except Exception as e:
+                #     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
+                #     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
 
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
             Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠1秒\n')
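
The flattened loop above scrapes cover_url and avatar_url out of inline CSS style attributes and converts the mm:ss duration badge with string splits. A self-contained sketch of those two parsing steps (standalone helpers, not the crawler's own API):

    import re

    def parse_style_url(style: str) -> str:
        # style looks like: background-image: url("https://.../cover.jpg")
        match = re.search(r'url\("([^"]+)"\)', style)
        return match.group(1) if match else ""

    def parse_duration(badge: str) -> int:
        # "03:25" -> 205 seconds, mirroring the split(':') arithmetic above
        parts = badge.split(":")
        minutes = int(parts[0]) if len(parts) > 1 else 0
        return minutes * 60 + int(parts[-1])

    assert parse_style_url('background-image: url("https://x.test/c.jpg")') == "https://x.test/c.jpg"
    assert parse_duration("03:25") == 205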

+ 3 - 0
xigua/photos/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/25

+ 3 - 1
xigua/xigua_main/run_xg_search_dev.py

@@ -14,7 +14,9 @@ def xigua_search_main(log_type, crawler, env):
     XiguasearchScheduling.get_search_videos(log_type=log_type,
                                             crawler=crawler,
                                             rule_dict={"play_cnt":{"min":8000,"max":0},"duration":{"min":60,"max":600},"period":{"min":365,"max":365},"videos_cnt":{"min":30,"max":0}},
-                                            user_list=[{"uid": 6267140, "source": "xigua", "link": "退休补贴", "nick_name": "西瓜搜索测试账号", "avatar_url": "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg", "mode": "search"}],
+                                            user_list=[{"uid": 6267140, "source": "xigua", "link": "健康", "nick_name": "健康", "avatar_url": "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg", "mode": "search"},
+                                                       {"uid": 6267140, "source": "xigua", "link": "瓦格纳", "nick_name": "瓦格纳", "avatar_url": "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg", "mode": "search"},
+                                                       {"uid": 6267141, "source": "xigua", "link": "高考分数线", "nick_name": "高考分数线", "avatar_url": "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg", "mode": "search"}],
                                             env=env)
     Common.del_logs(log_type, crawler)
     Common.logger(log_type, crawler).info("抓取一轮结束\n")
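
The rule_dict passed above reads as per-field min/max bounds, with "max": 0 apparently meaning "no upper bound" (play_cnt at least 8000, duration between 60 and 600 seconds). A hedged sketch of such a checker; the repo's actual download_rule lives elsewhere and may differ:

    def passes_rule(video_dict: dict, rule_dict: dict) -> bool:
        # Assumption: max == 0 is treated as unbounded.
        for field, bounds in rule_dict.items():
            value = video_dict.get(field)
            if value is None:
                continue  # fields like videos_cnt are not per-video
            if value < bounds.get("min", 0):
                return False
            if bounds.get("max", 0) and value > bounds["max"]:
                return False
        return True

    rule = {"play_cnt": {"min": 8000, "max": 0}, "duration": {"min": 60, "max": 600}}
    print(passes_rule({"play_cnt": 9000, "duration": 120}, rule))  # True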

+ 87 - 0
xigua/xigua_search/xigua_search_dev.py

@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/25
+import json
+import time
+import requests
+import urllib.parse
+from selenium.webdriver import DesiredCapabilities
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from seleniumwire import webdriver
+
+
+class SearchDev:
+    @classmethod
+    def get_videoList_requests(cls):
+        url = "https://www.ixigua.com/api/searchv2/complex/猪八戒/0?" \
+              "fss=default_search&" \
+              "order_type=publish_time&" \
+              "click_position=new&" \
+              "aid=1768&" \
+              "msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==&" \
+              "X-Bogus=DFSzswVuSIsANrq4tnr0UFm4pID1&" \
+              "_signature=_02B4Z6wo00001jeNZ4AAAIDCr-bw8w.DSLY3jWMAAOmJTnwirif4XNCUKjt3Ms0gS9-upb8jMBZJL5RSZ5dHBQm6GRMtSyn8h6D5rc1Y7tmwZL7a2nP390R3ARXFwF6tVQi97vqO5viH53M0c3"
+
+        payload = {}
+        headers = {
+            # 'authority': 'www.ixigua.com',
+            'accept': 'application/json, text/plain, */*',
+            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+            'cache-control': 'no-cache',
+            # 'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; s_v_web_id=verify_lhoket5d_0qlKZtzS_YZkf_4Uaj_82mX_j6lRT4PcYJ7A; __ac_signature=_02B4Z6wo00f01yB6eXwAAIDCWLSSerYAxYsgWn3AAKx5S2D2PsJJ92YblwdDE-9rnwnzZ87S0CUowZ3Xi8XmxMU3JHd0xfP-9VucrE9D.l9E7Vgn6y95sGbL2H6mgsddoCZX0cCgfcfKAzWgcd; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; SEARCH_CARD_MODE=7168304743566296612_1; msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==; tt_scid=rP8nVwFTm4wPZyREet0crbp-ZRgJsK.x5TE0lqU2uibGbUDAhlM.oA14pKRcGzXW0955; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1687685218%7Ca985a413a36bb156ba577dac11fbc14593e5a2a4000001f9cfc7fd72781c4cc5; ixigua-a-s=1',
+            'pragma': 'no-cache',
+            'referer': f'https://www.ixigua.com/search/{urllib.parse.quote("猪八戒")}/?logTag=e0b95015015c05e60b1b&tab_name=home&fss=default_search',
+            'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
+            # 'x-secsdk-csrf-token': '0001000000011fd0adbaee655439e86800862b81e3e34974cab6a8656af77695b76ff5c76c96176bdcbf2631eeb7'
+        }
+
+        response = requests.request("GET", url, headers=headers, data=payload)
+
+        print(response.text)
+
+
+    @classmethod
+    def get_videoList_selenium(cls):
+        # 打印请求配置
+        ca = DesiredCapabilities.CHROME
+        ca["goog:loggingPrefs"] = {"performance": "ALL"}
+        # # 不打开浏览器运行
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument(
+            f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
+        # chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--window-size=1920,1080")
+        # chrome_options.add_argument("--no-sandbox")
+        chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"
+        # driver初始化
+        driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
+        driver.implicitly_wait(10)
+        print("打开搜索页:健康")
+        driver.get(f"https://www.ixigua.com/search/健康/")
+        time.sleep(3)
+        # logs = driver.get_log("performance")
+        print("关闭登录弹框")
+        if len(driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')) != 0:  # find_elements returns a list; compare its length
+            driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
+        driver.get_screenshot_as_file("./关闭弹框.png")
+        print("点击筛选按钮")
+        driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
+        print("点击最新排序")
+        driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
+        time.sleep(3)
+        driver.get_screenshot_as_file("./最新排序.png")
+
+
+        driver.quit()
+
+
+if __name__ == "__main__":
+    # SearchDev.get_videoList_requests()
+    SearchDev.get_videoList_selenium()
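
get_videoList_selenium enables Chrome performance logging and imports the seleniumwire webdriver, but the log read is still commented out. If the goal is to capture the search API responses, one hedged option is seleniumwire's request capture (driver.requests is a seleniumwire extension, not plain Selenium):

    # After driver.get(...) has loaded the search page:
    from seleniumwire.utils import decode

    for request in driver.requests:  # seleniumwire-only attribute
        if request.response and "/api/searchv2/" in request.url:
            body = decode(request.response.body,
                          request.response.headers.get("Content-Encoding", "identity"))
            print(request.url, request.response.status_code, len(body))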

+ 35 - 20
xigua/xigua_search/xigua_search_scheduling.py

@@ -570,24 +570,23 @@ class XiguasearchScheduling:
         Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
         driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
         time.sleep(3)
-        # driver.get_screenshot_as_file(f"./{crawler}/logs/打开搜索页.jpg")
-        # if len(driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')) != 0:
-        #     driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
-        #     time.sleep(1)
-        # Common.logger(log_type, crawler).info("点击筛选")
-        # driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
-        # time.sleep(1)
-        # Common.logger(log_type, crawler).info("点击最新排序")
-        # driver.find_element(By.XPATH, '//*[@class="searchPageV2-category__wrapper"]/*[2]/*[1]').click()
-        # time.sleep(5)
-        # Common.logger(log_type, crawler).info("收回筛选")
-        # driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
-        # time.sleep(1)
-        # # 点击列表形式//div[@class="searchPageV2__header-icons"]/*[3]
-        # Common.logger(log_type, crawler).info("点击列表形式展示")
-        # driver.find_element(By.XPATH, '//div[@class="searchPageV2__header-icons"]/*[3]').click()
-        # time.sleep(3)
-        # driver.get_screenshot_as_file(f"./{crawler}/logs/已点击最新排序.jpg")
+        Common.logger(log_type, crawler).info("关闭登录弹框")
+        Common.logging(log_type, crawler, env, "关闭登录弹框")
+        if len(driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')) != 0:  # find_elements returns a list; compare its length
+            driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
+        # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-关闭登录弹框.png")
+        Common.logger(log_type, crawler).info("展开筛选按钮")
+        Common.logging(log_type, crawler, env, "展开筛选按钮")
+        driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
+        Common.logger(log_type, crawler).info("点击最新排序")
+        Common.logging(log_type, crawler, env, "点击最新排序")
+        driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
+        time.sleep(3)
+        # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-最新排序.png")
+        Common.logger(log_type, crawler).info("收起筛选按钮\n")
+        Common.logging(log_type, crawler, env, "收起筛选按钮\n")
+        driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
+        time.sleep(1)
 
         index = 0
         num = 0
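
Every step in the click sequence above is separated by a fixed time.sleep(). An alternative sketch using Selenium's explicit waits would cut the dead time (not what the commit does, just a hedged option):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def click_when_clickable(driver, xpath, timeout=10):
        # Block until the element is clickable instead of sleeping a fixed time.
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        ).click()

    # e.g. click_when_clickable(driver, '//*[@class="searchPageV2__header-icons-categories"]')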
@@ -617,8 +616,20 @@ class XiguasearchScheduling:
                     Common.logging(log_type, crawler, env, f'拖动"视频"列表第{num}个至屏幕中间')
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
                     time.sleep(3)
-                    # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
-                    item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i].get_attribute('href')
+                    # driver.get_screenshot_as_file(f"./{crawler}/photos/{user_dict['link']}-{num}.png")
+                    title = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i-1].get_attribute('title')
+                    publish_day = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard-accessories-bottomInfo__statistics"]')[index+i-1].text.split('· ')[-1]
+                    Common.logger(log_type, crawler).info(f"标题:{title}")
+                    Common.logging(log_type, crawler, env, f"标题:{title}")
+                    Common.logger(log_type, crawler).info(f"发布时间:{publish_day}")
+                    Common.logging(log_type, crawler, env, f"发布时间:{publish_day}")
+                    if "年" in publish_day:
+                        Common.logger(log_type, crawler).info("发布时间超过 1 年\n")
+                        Common.logging(log_type, crawler, env, "发布时间超过 1 年\n")
+                        driver.quit()
+                        return
+
+                    item_id = video_element.find_elements(By.XPATH, '//*[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]')[index+i-1].get_attribute('href')
                     item_id = item_id.split("com/")[-1].split("?&")[0]
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
@@ -628,10 +639,12 @@ class XiguasearchScheduling:
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
                     Common.logging(log_type, crawler, env, f"{video_dict}")
+
                     # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                     #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                     #     driver.quit()
                     #     return
+
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
                         Common.logging(log_type, crawler, env, "不满足抓取规则\n")
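
The publish_day cutoff in this hunk keys off the literal 年 in Xigua's relative timestamps (3天前, 2个月前, 1年前, ...). A small sketch of turning those strings into an approximate age in days, assuming those are the only formats the card renders:

    import re

    def approx_age_days(publish_day: str) -> int:
        # "3天前" -> 3, "2个月前" -> ~60, "1年前" -> ~365 (rough conversion)
        match = re.search(r"(\d+)\s*(天|个月|年)", publish_day)
        if match is None:
            return 0  # "刚刚" or an unrecognized label: treat as fresh
        n, unit = int(match.group(1)), match.group(2)
        return n * {"天": 1, "个月": 30, "年": 365}[unit]

    print(approx_age_days("3天前"))  # 3
    print(approx_age_days("1年前"))  # 365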
@@ -819,6 +832,8 @@ class XiguasearchScheduling:
 
     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
+        Common.logger(log_type, crawler).info(f"搜索词总数:{len(user_list)}\n")
+        Common.logging(log_type, crawler, env, f"搜索词总数:{len(user_list)}\n")
         for user_dict in user_list:
             try:
                 cls.download_cnt = 0