wangkun 1 year ago
parent
commit
4edb90c596

+ 25 - 0
benshanzhufu/benshanzhufu_main/run_bszf_recommend_dev.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/28
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from benshanzhufu.benshanzhufu_recommend.benshanzhufu_recommend_scheduling import BenshanzhufuRecommend
+
+
+def benshanzhufu_recommend_main(log_type, crawler, env):
+    """Run one dev crawl round of the Benshanzhufu recommend feed.
+
+    Emits a start message through Common.logger and Common.logging, calls
+    BenshanzhufuRecommend.get_videoList with a fixed our_uid and an empty
+    rule_dict, calls Common.del_logs, then emits an end-of-round message.
+
+    :param log_type: forwarded to the Common helpers and to the crawler
+    :param crawler: forwarded to the Common helpers and to the crawler
+    :param env: forwarded to Common.logging and to the crawler
+    """
+    start_msg = "开始抓取:本山祝福\n"
+    finish_msg = "抓取一轮结束\n"
+    crawl_kwargs = {
+        "log_type": log_type,
+        "crawler": crawler,
+        "our_uid": 6267140,
+        "rule_dict": {},
+        "env": env,
+    }
+
+    Common.logger(log_type, crawler).info(start_msg)
+    Common.logging(log_type, crawler, env, start_msg)
+    BenshanzhufuRecommend.get_videoList(**crawl_kwargs)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info(finish_msg)
+    Common.logging(log_type, crawler, env, finish_msg)
+
+
+# Dev entry point: runs one recommend crawl round for "benshanzhufu" with env "dev".
+if __name__ == "__main__":
+    benshanzhufu_recommend_main("recommend", "benshanzhufu", "dev")

+ 26 - 0
suisuiniannianyingfuqi/suisuiniannianyingfuqi_main/run_ssnnyfq_recommend_dev.py

@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/28
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from suisuiniannianyingfuqi.suisuiniannianyingfuqi_recommend.suisuiniannianyingfuqi_recommend_scheduling import \
+    SuisuiniannianyingfuqiRecommendScheduling
+
+
+def suisuiniannianyingfuqi_recommend_main(log_type, crawler, env):
+    """Run one dev crawl round of the Suisuiniannianyingfuqi recommend feed.
+
+    Emits a start message through Common.logger and Common.logging, calls
+    SuisuiniannianyingfuqiRecommendScheduling.get_videoList with a fixed
+    our_uid and an empty rule_dict, calls Common.del_logs, then emits an
+    end-of-round message.
+
+    :param log_type: forwarded to the Common helpers and to the crawler
+    :param crawler: forwarded to the Common helpers and to the crawler
+    :param env: forwarded to Common.logging and to the crawler
+    """
+    start_msg = "开始抓取:岁岁年年迎福气\n"
+    finish_msg = "抓取一轮结束\n"
+    crawl_kwargs = {
+        "log_type": log_type,
+        "crawler": crawler,
+        "our_uid": 6267140,
+        "rule_dict": {},
+        "env": env,
+    }
+
+    Common.logger(log_type, crawler).info(start_msg)
+    Common.logging(log_type, crawler, env, start_msg)
+    SuisuiniannianyingfuqiRecommendScheduling.get_videoList(**crawl_kwargs)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info(finish_msg)
+    Common.logging(log_type, crawler, env, finish_msg)
+
+
+# Dev entry point: runs one recommend crawl round for "suisuiniannianyingfuqi" with env "dev".
+if __name__ == "__main__":
+    suisuiniannianyingfuqi_recommend_main("recommend", "suisuiniannianyingfuqi", "dev")

+ 73 - 36
xigua/xigua_search/xigua_search_dev.py

@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/6/25
-import json
 import os
+import random
+import string
 import sys
 import time
 import requests
@@ -15,36 +16,64 @@ from selenium.webdriver.common.by import By
 from seleniumwire import webdriver
 sys.path.append(os.getcwd())
 from common.common import Common
+from common.userAgent import get_random_user_agent
 
 
 class SearchDev:
     @classmethod
-    def get_videoList_requests(cls, word):
+    def random_signature(cls):
+        """Build a random 26-character value for the `_signature` query parameter.
+
+        Layout: ten 'A' characters, twelve characters taken from a shuffled
+        pool of digits and ASCII letters, then the fixed suffix 'AAAB'.
+        The character at index 18 is then overwritten with a marker:
+        '8' -> 'w', '9' -> 'x', anything else -> 'y'.
+
+        :return: the 26-character signature string
+        """
+        # Pool sizes always sum to 26, with at least one character per class.
+        digits_num = random.randint(1, 6)
+        uppercase_num = random.randint(1, 26 - digits_num - 1)
+        lowercase_num = 26 - (digits_num + uppercase_num)
+        pool = (random.sample(string.digits, digits_num)
+                + random.sample(string.ascii_uppercase, uppercase_num)
+                + random.sample(string.ascii_lowercase, lowercase_num))
+        random.shuffle(pool)
+        signature = 'AAAAAAAAAA' + ''.join(pool)[10:-4] + 'AAAB'
+        # The pool holds only digits and ASCII letters, so index 18 can never
+        # be '-' or '.'; those unreachable branches are dropped and every
+        # character other than '8' / '9' maps to 'y', as before.
+        marker = {'8': 'w', '9': 'x'}.get(signature[18], 'y')
+        return signature[:18] + marker + signature[-7:]
+
+    @classmethod
+    def get_videoList_requests(cls, log_type, crawler, rule_dict, word, env):
+        offset = 0
         while True:
-            url = f"https://www.ixigua.com/api/searchv2/complex/{str(word)}/0?" \
-                  "fss=default_search&" \
+            url = f"https://www.ixigua.com/api/searchv2/complex/{str(word)}/{offset}?" \
+                  "fss=input&" \
                   "order_type=publish_time&" \
                   "click_position=new&" \
                   "aid=1768&" \
-                  "msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==&" \
-                  "X-Bogus=DFSzswVuSIsANrq4tnr0UFm4pID1&" \
-                  "_signature=_02B4Z6wo00001jeNZ4AAAIDCr-bw8w.DSLY3jWMAAOmJTnwirif4XNCUKjt3Ms0gS9-upb8jMBZJL5RSZ5dHBQm6GRMtSyn8h6D5rc1Y7tmwZL7a2nP390R3ARXFwF6tVQi97vqO5viH53M0c3"
+                  f"_signature={cls.random_signature()}"
+
             headers = {
-                'authority': 'www.ixigua.com',
-                'accept': 'application/json, text/plain, */*',
-                'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-                'cache-control': 'no-cache',
-                'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; s_v_web_id=verify_lhoket5d_0qlKZtzS_YZkf_4Uaj_82mX_j6lRT4PcYJ7A; __ac_signature=_02B4Z6wo00f01yB6eXwAAIDCWLSSerYAxYsgWn3AAKx5S2D2PsJJ92YblwdDE-9rnwnzZ87S0CUowZ3Xi8XmxMU3JHd0xfP-9VucrE9D.l9E7Vgn6y95sGbL2H6mgsddoCZX0cCgfcfKAzWgcd; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; SEARCH_CARD_MODE=7168304743566296612_1; msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==; tt_scid=rP8nVwFTm4wPZyREet0crbp-ZRgJsK.x5TE0lqU2uibGbUDAhlM.oA14pKRcGzXW0955; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1687685218%7Ca985a413a36bb156ba577dac11fbc14593e5a2a4000001f9cfc7fd72781c4cc5; ixigua-a-s=1',
-                'pragma': 'no-cache',
-                'referer': f'https://www.ixigua.com/search/{urllib.parse.quote(str(word))}/?logTag=e0b95015015c05e60b1b&tab_name=home&fss=default_search',
-                'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
-                'sec-ch-ua-mobile': '?0',
-                'sec-ch-ua-platform': '"macOS"',
-                'sec-fetch-dest': 'empty',
-                'sec-fetch-mode': 'cors',
-                'sec-fetch-site': 'same-origin',
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
-                'x-secsdk-csrf-token': '0001000000011fd0adbaee655439e86800862b81e3e34974cab6a8656af77695b76ff5c76c96176bdcbf2631eeb7'
+                # 'authority': 'www.ixigua.com',
+                # 'accept': 'application/json, text/plain, */*',
+                # 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+                # 'cache-control': 'no-cache',
+                # 'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; s_v_web_id=verify_lhoket5d_0qlKZtzS_YZkf_4Uaj_82mX_j6lRT4PcYJ7A; __ac_signature=_02B4Z6wo00f01yB6eXwAAIDCWLSSerYAxYsgWn3AAKx5S2D2PsJJ92YblwdDE-9rnwnzZ87S0CUowZ3Xi8XmxMU3JHd0xfP-9VucrE9D.l9E7Vgn6y95sGbL2H6mgsddoCZX0cCgfcfKAzWgcd; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; SEARCH_CARD_MODE=7168304743566296612_1; msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==; tt_scid=rP8nVwFTm4wPZyREet0crbp-ZRgJsK.x5TE0lqU2uibGbUDAhlM.oA14pKRcGzXW0955; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1687685218%7Ca985a413a36bb156ba577dac11fbc14593e5a2a4000001f9cfc7fd72781c4cc5; ixigua-a-s=1',
+                # 'pragma': 'no-cache',
+                'referer': f'https://www.ixigua.com/search/{urllib.parse.quote(str(word))}/?tab_name=home&fss=default_search',
+                # 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
+                # 'sec-ch-ua-mobile': '?0',
+                # 'sec-ch-ua-platform': '"macOS"',
+                # 'sec-fetch-dest': 'empty',
+                # 'sec-fetch-mode': 'cors',
+                # 'sec-fetch-site': 'same-origin',
+                # 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
+                'user-agent': get_random_user_agent("pc"),
+                # 'x-secsdk-csrf-token': '0001000000011fd0adbaee655439e86800862b81e3e34974cab6a8656af77695b76ff5c76c96176bdcbf2631eeb7'
             }
             urllib3.disable_warnings()
             s = requests.session()
@@ -53,27 +82,33 @@ class SearchDev:
             s.mount('https://', HTTPAdapter(max_retries=3))
             response = requests.get(url=url, headers=headers, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
             if response.status_code != 200 or "data" not in response.text:
-                print(f"response:{response.text}\n")
+                Common.logger(log_type, crawler).info(f"response:{response.text}\n")
                 return
             elif len(response.json()["data"]["data"]) == 0:
-                print("没有更多数据啦~")
+                Common.logger(log_type, crawler).info("没有更多数据啦~")
                 return
             else:
+                offset += 10
                 feeds = response.json()["data"]["data"]
                 for i in range(len(feeds)):
                     video_type = feeds[i].get("type", "")
                     title = feeds[i].get("data", {}).get("title", "")
                     publish_time = feeds[i].get("data", {}).get("publish_time", "")
                     item_id = feeds[i].get("data", {}).get("group_id", "")
-                    print(f"title:{title}")
-                    print(f"video_type:{video_type}")
-                    print(f"publish_time:{publish_time}")
-                    print(f"item_id:{item_id}")
-                    print("\n")
+                    Common.logger(log_type, crawler).info(f"title:{title}")
+                    Common.logger(log_type, crawler).info(f"video_type:{video_type}")
+                    Common.logger(log_type, crawler).info(f"publish_time:{publish_time}")
+                    Common.logger(log_type, crawler).info(f"item_id:{item_id}")
+                    if video_type != "video":
+                        Common.logger(log_type, crawler).info("合集,跳过\n")
+                        continue
+                    if int(time.time()) - publish_time > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
+                        Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
+                        return
 
 
     @classmethod
-    def get_videoList_selenium(cls):
+    def get_videoList_selenium(cls, log_type, crawler):
         # 打印请求配置
         ca = DesiredCapabilities.CHROME
         ca["goog:loggingPrefs"] = {"performance": "ALL"}
@@ -88,17 +123,17 @@ class SearchDev:
         # driver初始化
         driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
         driver.implicitly_wait(10)
-        print("打开搜索页:健康")
+        Common.logger(log_type, crawler).info("打开搜索页:健康")
         driver.get(f"https://www.ixigua.com/search/健康/")
         time.sleep(3)
         # logs = driver.get_log("performance")
-        print("关闭登录弹框")
+        Common.logger(log_type, crawler).info("关闭登录弹框")
         if driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]') != 0:
             driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
         driver.get_screenshot_as_file("./关闭弹框.png")
-        print("点击筛选按钮")
+        Common.logger(log_type, crawler).info("点击筛选按钮")
         driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
-        print("点击最新排序")
+        Common.logger(log_type, crawler).info("点击最新排序")
         driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
         time.sleep(3)
         driver.get_screenshot_as_file("./最新排序.png")
@@ -108,5 +143,7 @@ class SearchDev:
 
 
 if __name__ == "__main__":
-    SearchDev.get_videoList_requests("猪八戒")
-    # SearchDev.get_videoList_selenium()
+    # Dev run: search "健康" through the requests-based crawler with a 365-day period rule.
+    SearchDev.get_videoList_requests(log_type="search", crawler="xigua", rule_dict={"period": {"min":365, "max":365}}, word="健康", env="dev")
+    # SearchDev.get_videoList_selenium()
+    # print(get_random_user_agent("pc"))
+    pass